Blame view

offline_tasks/collaboration/run.sh 4.37 KB
5ab1c29c   tangwang   first commit
1
  #!/bin/bash
5b61955e   tangwang   offline tasks: me...
2
  source ~/.bash_profile
5ab1c29c   tangwang   first commit
3
  
5b61955e   tangwang   offline tasks: me...
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
  # ============================================================================
  # Configuration section - adjust to match the local setup
  # ============================================================================
  
  # Data path configuration.
  # Point this at the directory that holds the session files; the script looks
  # for session.txt.<DAY>.cpp and session.txt.<DAY> inside it.
  SESSION_DATA_DIR="../offline_tasks/output"
  
  # Swing algorithm parameters. They are passed to bin/swing as positional
  # arguments, in the order ALPHA THRESHOLD1 THRESHOLD2 THREAD_NUM, followed by
  # the output directory and SHOW_PROGRESS.
  ALPHA=0.7          # alpha parameter of the Swing algorithm
  THRESHOLD1=1       # interaction-strength threshold 1
  THRESHOLD2=3       # interaction-strength threshold 2
  THREAD_NUM=4       # number of threads
  SHOW_PROGRESS=1    # whether to show progress (0/1)
  
  # Python interpreter used for the post-processing script (set this if a
  # specific Python environment is required).
  PYTHON_CMD="python3"
  
  # ============================================================================
  # Script execution section
  # ============================================================================
  
  # Build the C++ program (the message below announces the build step).
  echo "编译Swing程序..."
5ab1c29c   tangwang   first commit
28
  make
5b61955e   tangwang   offline tasks: me...
29
30
31
32
33
34
35
36
37
  # Abort when the preceding build command failed; $? still holds its exit
  # status because no other command has run in between.
  if [[ $? -ne 0 ]]; then
      echo "编译失败,退出"
      exit 1
  fi
  
  # Processing date in YYYYMMDD form. It names the output directory and selects
  # the session file for this run.
  DAY=$(date +"%Y%m%d")
  # To process a specific date instead, uncomment the line below.
  # DAY=20241017
5ab1c29c   tangwang   first commit
38
  
5b61955e   tangwang   offline tasks: me...
39
  echo "处理日期: ${DAY}"
5ab1c29c   tangwang   first commit
40
  
5b61955e   tangwang   offline tasks: me...
41
42
43
44
  # Housekeeping. This is best effort, so find's error output is deliberately
  # discarded.
  # Remove output_* directories whose status-change time is more than 365 days
  # old. -prune keeps find from descending into a directory that is about to be
  # deleted, and "{} +" hands the paths to as few rm invocations as possible.
  find . -type d -name 'output_*' -ctime +365 -prune -exec rm -rf -- {} + 2>/dev/null
  # Make sure the log directory exists, then delete log files that have not been
  # modified for more than 180 days.
  mkdir -p logs
  find logs/ -type f -mtime +180 -exec rm -f -- {} + 2>/dev/null
5ab1c29c   tangwang   first commit
45
  
5b61955e   tangwang   offline tasks: me...
46
  # 创建输出目录
5ab1c29c   tangwang   first commit
47
  output_dir=output_${DAY}
5b61955e   tangwang   offline tasks: me...
48
  # Quoted so the directory name is always passed as exactly one argument.
  mkdir -p -- "${output_dir}"
5ab1c29c   tangwang   first commit
49
  
5b61955e   tangwang   offline tasks: me...
50
51
52
53
54
55
56
57
58
59
60
61
  # Resolve the session file for ${DAY}. The ".cpp"-suffixed variant takes
  # precedence; the plain dated file is the fallback.
  SESSION_FILE="${SESSION_DATA_DIR}/session.txt.${DAY}"
  if [[ -f "${SESSION_FILE}.cpp" ]]; then
      SESSION_FILE="${SESSION_FILE}.cpp"
  fi
  
  # Neither candidate exists: report the fallback path and stop.
  if ! [[ -f "${SESSION_FILE}" ]]; then
      echo "错误: Session文件不存在: ${SESSION_FILE}"
      echo "请先运行 generate_session.py 生成session文件"
      exit 1
  fi
5ab1c29c   tangwang   first commit
62
  
5b61955e   tangwang   offline tasks: me...
63
64
  # Log the chosen input file and the Swing parameters, one message per line.
  printf '%s\n' \
      "使用session文件: ${SESSION_FILE}" \
      "Swing参数: alpha=${ALPHA}, threshold1=${THRESHOLD1}, threshold2=${THRESHOLD2}, threads=${THREAD_NUM}"
5ab1c29c   tangwang   first commit
65
  
5b61955e   tangwang   offline tasks: me...
66
67
68
69
70
71
72
73
74
75
76
77
78
  # Run the Swing algorithm, feeding it the session data on stdin.
  # Two input layouts are handled:
  #   - "uid \t json" per line: only the second (json) column is forwarded;
  #   - plain json, one entry per line: the file is forwarded unchanged.
  # The layout is detected by searching the file for a tab character.
  # bin/swing is the last command of the pipeline / branch, so $? right after
  # this block is its exit status.
  echo "开始运行Swing算法..."
  if grep -q $'\t' -- "${SESSION_FILE}"; then
      # Tab found: extract the second column.
      echo "检测到session文件包含uid,提取json部分..."
      cut -f 2 -- "${SESSION_FILE}" \
          | bin/swing "${ALPHA}" "${THRESHOLD1}" "${THRESHOLD2}" "${THREAD_NUM}" "${output_dir}" "${SHOW_PROGRESS}"
  else
      # Plain json layout: redirect the file straight into the program.
      echo "检测到session文件为纯json格式..."
      bin/swing "${ALPHA}" "${THRESHOLD1}" "${THRESHOLD2}" "${THREAD_NUM}" "${output_dir}" "${SHOW_PROGRESS}" < "${SESSION_FILE}"
  fi
5ab1c29c   tangwang   first commit
79
  
5b61955e   tangwang   offline tasks: me...
80
  # 检查Swing算法是否成功执行
5ab1c29c   tangwang   first commit
81
  if [[ $? -eq 0 ]]; then
5b61955e   tangwang   offline tasks: me...
82
83
84
      echo "Swing算法执行成功"
      
      # 更新软链接指向最新输出
5ab1c29c   tangwang   first commit
85
86
87
88
      # Remove whatever currently occupies "output" before re-creating the link.
      # -e follows symlinks and is false for a dangling link, so -L is tested
      # too; otherwise a stale link would survive and ln -s would fail.
      if [[ -e output || -L output ]]; then
          rm -rf output
      fi
      ln -s "${output_dir}" output
5b61955e   tangwang   offline tasks: me...
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
      echo "软链接已更新为指向 ${output_dir}"
      
      # Concatenate all files matching output/sim_matrx.* into one result file.
      # A failed merge (for example, no matching files) aborts the run instead
      # of being reported as a success.
      echo "合并结果文件..."
      if ! cat output/sim_matrx.* > output/swing_similar.txt; then
          echo "错误: 合并结果文件失败"
          exit 1
      fi
      echo "结果已合并到 output/swing_similar.txt"
      
      # Generate a human-readable debug copy of the result (adds product names).
      # This step is optional: a missing script or a failed run only produces a
      # warning, and the merged result is kept.
      echo "生成可读的debug文件..."
      DEBUG_SCRIPT="../offline_tasks/scripts/add_names_to_swing.py"
      
      if [[ -f "${DEBUG_SCRIPT}" ]]; then
          # PYTHON_CMD is intentionally unquoted so that it may hold a command
          # plus arguments.
          if ${PYTHON_CMD} "${DEBUG_SCRIPT}" output/swing_similar.txt output/swing_similar_readable.txt --debug; then
              echo "Debug文件已生成: output/swing_similar_readable.txt"
          else
              echo "警告: 生成debug文件失败,但Swing结果已保存"
          fi
      else
          echo "警告: Debug脚本不存在: ${DEBUG_SCRIPT}"
          echo "跳过生成可读文件"
      fi
      
5ab1c29c   tangwang   first commit
113
  else
5b61955e   tangwang   offline tasks: me...
114
115
      echo "Swing算法执行失败,未更新软链接"
      exit 1
5ab1c29c   tangwang   first commit
116
117
  fi
  
5b61955e   tangwang   offline tasks: me...
118
119
120
  # ============================================================================
  # 用户协同过滤(UCF)- 可选
  # ============================================================================
5ab1c29c   tangwang   first commit
121
  
5b61955e   tangwang   offline tasks: me...
122
123
124
125
126
  # 如果需要运行UCF,取消下面的注释
  # echo "运行用户协同过滤..."
  # # 仅使用最新的5万条数据
  # tail -n 50000 ${SESSION_FILE} > output/ucf.input
  # python3 src/ucf.py output/ucf.input output/ucf.txt
5ab1c29c   tangwang   first commit
127
  
5b61955e   tangwang   offline tasks: me...
128
129
130
131
  # Final summary: completion notice followed by the locations of the result
  # files inside this run's output directory.
  printf '%s\n' \
      "全部完成!" \
      "结果文件:" \
      "  - Swing相似度: ${output_dir}/swing_similar.txt" \
      "  - Swing可读版: ${output_dir}/swing_similar_readable.txt"
5ab1c29c   tangwang   first commit