Commit 7e37f9e2a6122504f55e8e19f1ea43a5dda9b726
1 parent
801fb682
add cpp swing for mem optimize
Showing
9 changed files
with
637 additions
and
140 deletions
Show diff stats
offline_tasks/README.md
| ... | ... | @@ -4,32 +4,36 @@ |
| 4 | 4 | |
| 5 | 5 | ## 🚀 快速开始 |
| 6 | 6 | |
| 7 | -### 运行所有任务(推荐) | |
| 7 | +### 运行所有任务 | |
| 8 | 8 | |
| 9 | 9 | ```bash |
| 10 | 10 | cd /home/tw/recommendation/offline_tasks |
| 11 | 11 | |
| 12 | -# 运行全部离线任务(包括C++ Swing) | |
| 13 | -python3 run_all.py | |
| 12 | +# ⭐ 推荐:使用 run.sh(完整流程,包含Redis加载) | |
| 13 | +bash run.sh | |
| 14 | 14 | |
| 15 | -# 开启debug模式(详细日志 + 可读文件) | |
| 15 | +# 备用:使用 run_all.py(简化版,不含C++ Swing和Redis) | |
| 16 | 16 | python3 run_all.py --debug |
| 17 | 17 | ``` |
| 18 | 18 | |
| 19 | +**说明**: | |
| 20 | +- `run.sh`: 主执行脚本,包含完整流程、内存监控、自动Redis加载 | |
| 21 | +- `run_all.py`: Python简化版本,只包含Python算法任务 | |
| 22 | + | |
| 19 | 23 | ### 任务执行顺序 |
| 20 | 24 | |
| 21 | 25 | ``` |
| 22 | 26 | 前置任务: |
| 23 | -1. fetch_item_attributes.py → 获取商品属性映射 | |
| 24 | -2. generate_session.py → 生成用户行为session | |
| 25 | -3. C++ Swing算法 → 高性能i2i相似度计算 | |
| 27 | +1. fetch_item_attributes.py → 获取商品属性映射 | |
| 28 | +2. generate_session.py → 生成用户行为session | |
| 29 | +3. collaboration/run.sh → C++ Swing算法(高性能) | |
| 26 | 30 | |
| 27 | 31 | 核心算法任务: |
| 28 | -4. Python Swing算法 → 支持日期维度的i2i | |
| 29 | -5. Session W2V → 基于序列的embedding | |
| 30 | -6. DeepWalk → 图结构embedding | |
| 31 | -7. 内容相似度 → 基于ES向量 | |
| 32 | -8. 兴趣聚合 → 多维度商品聚合 | |
| 32 | +4. i2i_swing.py → Python Swing(支持日期维度) | |
| 33 | +5. i2i_session_w2v.py → Session W2V | |
| 34 | +6. i2i_deepwalk.py → DeepWalk | |
| 35 | +7. i2i_content_similar.py → 内容相似度 | |
| 36 | +8. interest_aggregation.py → 兴趣聚合 | |
| 33 | 37 | ``` |
| 34 | 38 | |
| 35 | 39 | ## 📚 文档 |
| ... | ... | @@ -94,7 +98,7 @@ python3 scripts/generate_session.py --lookback_days 730 |
| 94 | 98 | ### 3. C++ Swing |
| 95 | 99 | |
| 96 | 100 | ```bash |
| 97 | -cd ../collaboration | |
| 101 | +cd collaboration | |
| 98 | 102 | bash run.sh |
| 99 | 103 | ``` |
| 100 | 104 | |
| ... | ... | @@ -134,6 +138,11 @@ offline_tasks/ |
| 134 | 138 | │ ├── interest_aggregation.py |
| 135 | 139 | │ ├── add_names_to_swing.py |
| 136 | 140 | │ └── debug_utils.py |
| 141 | +├── collaboration/ # C++ Swing算法 | |
| 142 | +│ ├── src/ | |
| 143 | +│ ├── bin/ | |
| 144 | +│ ├── run.sh | |
| 145 | +│ └── output/ | |
| 137 | 146 | ├── config/ # 配置文件 |
| 138 | 147 | │ └── offline_config.py |
| 139 | 148 | ├── doc/ # 文档中心 |
| ... | ... | @@ -146,7 +155,8 @@ offline_tasks/ |
| 146 | 155 | │ ├── session.txt.* |
| 147 | 156 | │ └── *.txt |
| 148 | 157 | ├── logs/ # 日志目录 |
| 149 | -├── run_all.py # 统一入口 | |
| 158 | +├── run.sh # 主执行脚本(推荐) | |
| 159 | +├── run_all.py # Python版本(简化) | |
| 150 | 160 | └── README.md # 本文件 |
| 151 | 161 | ``` |
| 152 | 162 | |
| ... | ... | @@ -185,7 +195,7 @@ python3 scripts/generate_session.py |
| 185 | 195 | |
| 186 | 196 | **3. C++ Swing编译失败** |
| 187 | 197 | ```bash |
| 188 | -cd ../collaboration | |
| 198 | +cd collaboration | |
| 189 | 199 | make clean |
| 190 | 200 | make |
| 191 | 201 | ``` | ... | ... |
offline_tasks/collaboration/run.sh
| ... | ... | @@ -7,7 +7,7 @@ source ~/.bash_profile |
| 7 | 7 | |
| 8 | 8 | # 数据路径配置 |
| 9 | 9 | # 修改这个路径指向实际的session文件位置 |
| 10 | -SESSION_DATA_DIR="../offline_tasks/output" | |
| 10 | +SESSION_DATA_DIR="../output" | |
| 11 | 11 | |
| 12 | 12 | # Swing算法参数 |
| 13 | 13 | ALPHA=0.7 # Swing算法的alpha参数 |
| ... | ... | @@ -95,7 +95,7 @@ if [[ $? -eq 0 ]]; then |
| 95 | 95 | |
| 96 | 96 | # 生成可读的debug文件(添加商品名称) |
| 97 | 97 | echo "生成可读的debug文件..." |
| 98 | - DEBUG_SCRIPT="../offline_tasks/scripts/add_names_to_swing.py" | |
| 98 | + DEBUG_SCRIPT="../scripts/add_names_to_swing.py" | |
| 99 | 99 | |
| 100 | 100 | if [[ -f ${DEBUG_SCRIPT} ]]; then |
| 101 | 101 | ${PYTHON_CMD} ${DEBUG_SCRIPT} output/swing_similar.txt output/swing_similar_readable.txt --debug | ... | ... |
offline_tasks/doc/README.md
offline_tasks/doc/Redis数据规范.md
| ... | ... | @@ -20,7 +20,7 @@ |
| 20 | 20 | |
| 21 | 21 | | 模块名称 | 源数据地址 | 格式描述 | RedisKey模板 | RedisValue格式 | TTL | |
| 22 | 22 | |---------|-----------|---------|-------------|---------------|-----| |
| 23 | -| **i2i_swing_cpp** | `collaboration/output/swing_similar.txt` | `item_id\tsimilar_id1:score1,...` | `item:similar:swing_cpp:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | | |
| 23 | +| **i2i_swing_cpp** | `offline_tasks/collaboration/output/swing_similar.txt` | `item_id\tsimilar_id1:score1,...` | `item:similar:swing_cpp:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | | |
| 24 | 24 | | **i2i_swing** | `output/i2i_swing_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:swing:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | |
| 25 | 25 | | **i2i_session_w2v** | `output/i2i_session_w2v_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:w2v:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | |
| 26 | 26 | | **i2i_deepwalk** | `output/i2i_deepwalk_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:deepwalk:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | |
| ... | ... | @@ -305,7 +305,7 @@ python3 scripts/load_index_to_redis.py --load-i2i --redis-host localhost |
| 305 | 305 | |
| 306 | 306 | # 只加载C++ Swing索引 |
| 307 | 307 | python3 scripts/load_index_to_redis.py \ |
| 308 | - --file ../collaboration/output/swing_similar.txt \ | |
| 308 | + --file collaboration/output/swing_similar.txt \ | |
| 309 | 309 | --algorithm swing_cpp \ |
| 310 | 310 | --redis-host localhost |
| 311 | 311 | ... | ... |
offline_tasks/doc/离线索引数据规范.md
| ... | ... | @@ -4,7 +4,7 @@ |
| 4 | 4 | |
| 5 | 5 | | 模块名称 | 任务命令 | 调度频次 | 输出数据 | 格式和示例 | |
| 6 | 6 | |---------|---------|---------|---------|-----------| |
| 7 | -| **i2i_swing_cpp** | `cd collaboration && bash run.sh` | 每天 | `collaboration/output/swing_similar.txt` | `item_id \t similar_id1:score1,similar_id2:score2,...` | | |
| 7 | +| **i2i_swing_cpp** | `cd offline_tasks/collaboration && bash run.sh` | 每天 | `offline_tasks/collaboration/output/swing_similar.txt` | `item_id \t similar_id1:score1,similar_id2:score2,...` | | |
| 8 | 8 | | **i2i_swing** | `python3 scripts/i2i_swing.py` | 每天 | `output/i2i_swing_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | |
| 9 | 9 | | **i2i_session_w2v** | `python3 scripts/i2i_session_w2v.py` | 每天 | `output/i2i_session_w2v_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | |
| 10 | 10 | | **i2i_deepwalk** | `python3 scripts/i2i_deepwalk.py` | 每天 | `output/i2i_deepwalk_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | |
| ... | ... | @@ -40,8 +40,8 @@ item_id \t similar_id1:score1,similar_id2:score2,... |
| 40 | 40 | - ⚡ **高性能**: C++实现,速度比Python快10-100倍 |
| 41 | 41 | - 📊 **大规模**: 适合处理10万+商品的相似度计算 |
| 42 | 42 | - 🔢 **原始分数**: 输出Swing算法原始分数(未归一化) |
| 43 | -- 📁 **文件位置**: `collaboration/output/swing_similar.txt` | |
| 44 | -- 📝 **可读版本**: `collaboration/output/swing_similar_readable.txt` (包含商品名称) | |
| 43 | +- 📁 **文件位置**: `offline_tasks/collaboration/output/swing_similar.txt` | |
| 44 | +- 📝 **可读版本**: `offline_tasks/collaboration/output/swing_similar_readable.txt` (包含商品名称) | |
| 45 | 45 | |
| 46 | 46 | #### 1.2 Python算法(标准版本) |
| 47 | 47 | ... | ... |
offline_tasks/doc/系统改进总结-20241017.md
| ... | ... | @@ -247,25 +247,31 @@ offline_tasks/doc/ |
| 247 | 247 | cd /home/tw/recommendation/offline_tasks |
| 248 | 248 | |
| 249 | 249 | # 方式1: 运行全部任务(推荐) |
| 250 | +bash run.sh | |
| 251 | + | |
| 252 | +# 方式2: 使用Python版本(简化版) | |
| 250 | 253 | python3 run_all.py --debug |
| 251 | 254 | |
| 252 | -# 方式2: 分步运行 | |
| 255 | +# 方式3: 分步运行 | |
| 253 | 256 | # 步骤1: 获取商品属性 |
| 254 | 257 | python3 scripts/fetch_item_attributes.py |
| 255 | 258 | |
| 256 | 259 | # 步骤2: 生成session文件 |
| 257 | 260 | python3 scripts/generate_session.py --lookback_days 730 |
| 258 | 261 | |
| 259 | -# 步骤3: 运行Swing算法(启用日期维度) | |
| 262 | +# 步骤3: 运行C++ Swing | |
| 263 | +cd collaboration && bash run.sh && cd .. | |
| 264 | + | |
| 265 | +# 步骤4: 运行Python Swing(启用日期维度) | |
| 260 | 266 | python3 scripts/i2i_swing.py --lookback_days 730 --use_daily_session --debug |
| 261 | 267 | ``` |
| 262 | 268 | |
| 263 | 269 | ### C++ Swing算法 |
| 264 | 270 | |
| 265 | 271 | ```bash |
| 266 | -# C++ Swing现已集成到run_all.py,会自动在session生成后执行 | |
| 272 | +# C++ Swing现已集成到run.sh,会自动执行 | |
| 267 | 273 | # 如需单独运行: |
| 268 | -cd /home/tw/recommendation/collaboration | |
| 274 | +cd /home/tw/recommendation/offline_tasks/collaboration | |
| 269 | 275 | bash run.sh |
| 270 | 276 | |
| 271 | 277 | # 查看结果 |
| ... | ... | @@ -301,6 +307,11 @@ recommendation/ |
| 301 | 307 | │ │ ├── add_names_to_swing.py # 修改:使用本地映射 |
| 302 | 308 | │ │ ├── i2i_swing.py # 修改:支持日期维度 |
| 303 | 309 | │ │ └── debug_utils.py # 修改:添加加载函数 |
| 310 | +│ ├── collaboration/ # 移动:C++ Swing目录 | |
| 311 | +│ │ ├── src/ | |
| 312 | +│ │ ├── bin/ | |
| 313 | +│ │ ├── run.sh # 修改:路径更新 | |
| 314 | +│ │ └── output/ | |
| 304 | 315 | │ ├── doc/ # 新增:文档中心 |
| 305 | 316 | │ │ ├── README.md |
| 306 | 317 | │ │ ├── 快速开始.md |
| ... | ... | @@ -310,12 +321,9 @@ recommendation/ |
| 310 | 321 | │ │ ├── item_attributes_mappings.json # 新增:映射文件 |
| 311 | 322 | │ │ ├── item_attributes_stats.txt # 新增:统计信息 |
| 312 | 323 | │ │ └── session.txt.YYYYMMDD # session文件 |
| 313 | -│ ├── run_all.py # 修改:添加前置任务 | |
| 324 | +│ ├── run.sh # 新增:主执行脚本 | |
| 325 | +│ ├── run_all.py # 修改:简化版本 | |
| 314 | 326 | │ └── README.md |
| 315 | -└── collaboration/ | |
| 316 | - ├── run.sh # 已修改:适配session | |
| 317 | - ├── Swing快速开始.md # 重命名 | |
| 318 | - └── ... | |
| 319 | 327 | ``` |
| 320 | 328 | |
| 321 | 329 | --- |
| ... | ... | @@ -324,39 +332,44 @@ recommendation/ |
| 324 | 332 | |
| 325 | 333 | ### 改进内容 |
| 326 | 334 | |
| 327 | -**之前**: C++ Swing需要手动切换目录运行 | |
| 335 | +**之前**: C++ Swing在外层目录,需要手动切换 | |
| 328 | 336 | ```bash |
| 329 | 337 | cd /home/tw/recommendation/collaboration |
| 330 | 338 | bash run.sh |
| 331 | 339 | ``` |
| 332 | 340 | |
| 333 | -**现在**: 已集成到`run_all.py`,自动执行 | |
| 341 | +**现在**: 已移入offline_tasks,集成到`run.sh`自动执行 | |
| 334 | 342 | |
| 335 | 343 | ### 执行流程 |
| 336 | 344 | |
| 337 | 345 | ``` |
| 338 | -run_all.py: | |
| 346 | +run.sh: | |
| 339 | 347 | 1. fetch_item_attributes.py |
| 340 | 348 | 2. generate_session.py ← 生成session.txt.YYYYMMDD.cpp |
| 341 | -3. run_cpp_swing() ← 自动调用 collaboration/run.sh | |
| 349 | +3. collaboration/run.sh ← 直接调用C++ Swing | |
| 342 | 350 | ├─ 编译C++程序 |
| 343 | 351 | ├─ 读取session文件 |
| 344 | 352 | ├─ 运行Swing算法 |
| 345 | 353 | ├─ 合并多线程结果 |
| 346 | 354 | └─ 生成可读版本(自动添加商品名) |
| 347 | -4. 后续Python任务... | |
| 355 | +4. i2i_swing.py ← Python Swing | |
| 356 | +5. i2i_session_w2v.py ← Session W2V | |
| 357 | +6. i2i_deepwalk.py ← DeepWalk | |
| 358 | +7. i2i_content_similar.py ← 内容相似度 | |
| 359 | +8. interest_aggregation.py ← 兴趣聚合 | |
| 360 | +9. load_index_to_redis.py ← 加载到Redis | |
| 348 | 361 | ``` |
| 349 | 362 | |
| 350 | 363 | ### 输出结果 |
| 351 | 364 | |
| 352 | 365 | C++ Swing执行后,结果保存在: |
| 353 | 366 | ``` |
| 354 | -collaboration/output_YYYYMMDD/ | |
| 367 | +offline_tasks/collaboration/output_YYYYMMDD/ | |
| 355 | 368 | ├── sim_matrx.* # 多线程输出 |
| 356 | 369 | ├── swing_similar.txt # 合并结果(ID格式) |
| 357 | 370 | └── swing_similar_readable.txt # 可读版本(ID:名称格式) |
| 358 | 371 | |
| 359 | -collaboration/output -> output_YYYYMMDD # 软链接 | |
| 372 | +offline_tasks/collaboration/output -> output_YYYYMMDD # 软链接 | |
| 360 | 373 | ``` |
| 361 | 374 | |
| 362 | 375 | ### 优势 |
| ... | ... | @@ -371,7 +384,7 @@ collaboration/output -> output_YYYYMMDD # 软链接 |
| 371 | 384 | |
| 372 | 385 | 如需单独运行C++ Swing(不执行其他任务): |
| 373 | 386 | ```bash |
| 374 | -cd /home/tw/recommendation/collaboration | |
| 387 | +cd /home/tw/recommendation/offline_tasks/collaboration | |
| 375 | 388 | bash run.sh |
| 376 | 389 | ``` |
| 377 | 390 | ... | ... |
| ... | ... | @@ -0,0 +1,397 @@ |
| 1 | +# 项目重构说明 - 2024-10-17 | |
| 2 | + | |
| 3 | +## ✅ 完成的重构 | |
| 4 | + | |
| 5 | +### 1. 目录结构调整 | |
| 6 | + | |
| 7 | +**改动**: 将`collaboration`目录移入`offline_tasks` | |
| 8 | + | |
| 9 | +**之前**: | |
| 10 | +``` | |
| 11 | +recommendation/ | |
| 12 | +├── offline_tasks/ | |
| 13 | +│ ├── scripts/ | |
| 14 | +│ └── ... | |
| 15 | +└── collaboration/ # 外层目录 | |
| 16 | + ├── src/ | |
| 17 | + └── run.sh | |
| 18 | +``` | |
| 19 | + | |
| 20 | +**之后**: | |
| 21 | +``` | |
| 22 | +recommendation/ | |
| 23 | +└── offline_tasks/ | |
| 24 | + ├── scripts/ | |
| 25 | + ├── collaboration/ # 移入内部 | |
| 26 | + │ ├── src/ | |
| 27 | + │ └── run.sh | |
| 28 | + └── ... | |
| 29 | +``` | |
| 30 | + | |
| 31 | +**优势**: | |
| 32 | +- ✅ 统一目录结构,所有离线任务在同一目录 | |
| 33 | +- ✅ 简化路径配置 | |
| 34 | +- ✅ 便于统一管理和部署 | |
| 35 | + | |
| 36 | +--- | |
| 37 | + | |
| 38 | +### 2. 执行脚本简化 | |
| 39 | + | |
| 40 | +**改动**: 主执行脚本从`run_all.py`改为`run.sh`,直接调用各个脚本 | |
| 41 | + | |
| 42 | +**之前的流程**: | |
| 43 | +```python | |
| 44 | +# run_all.py (Python实现) | |
| 45 | +run_script('fetch_item_attributes.py') | |
| 46 | +run_script('generate_session.py') | |
| 47 | +run_cpp_swing() # 调用collaboration/run.sh | |
| 48 | +run_script('i2i_swing.py') | |
| 49 | +# ... | |
| 50 | +``` | |
| 51 | + | |
| 52 | +**现在的流程**: | |
| 53 | +```bash | |
| 54 | +# run.sh (Shell实现) | |
| 55 | +python3 scripts/fetch_item_attributes.py | |
| 56 | +python3 scripts/generate_session.py | |
| 57 | +cd collaboration && bash run.sh && cd .. | |
| 58 | +python3 scripts/i2i_swing.py | |
| 59 | +# ... | |
| 60 | +``` | |
| 61 | + | |
| 62 | +**优势**: | |
| 63 | +- ✅ 代码更简洁,减少抽象层 | |
| 64 | +- ✅ 直接调用,易于理解和调试 | |
| 65 | +- ✅ 内存监控、错误处理更灵活 | |
| 66 | +- ✅ 配置参数集中在顶部,便于修改 | |
| 67 | + | |
| 68 | +--- | |
| 69 | + | |
| 70 | +### 3. 路径更新 | |
| 71 | + | |
| 72 | +所有相关路径已更新: | |
| 73 | + | |
| 74 | +**collaboration/run.sh**: | |
| 75 | +- `SESSION_DATA_DIR="../offline_tasks/output"` → `"../output"` | |
| 76 | +- `DEBUG_SCRIPT="../offline_tasks/scripts/..."` → `"../scripts/..."` | |
| 77 | + | |
| 78 | +**文档更新**: | |
| 79 | +- ✅ `README.md` | |
| 80 | +- ✅ `doc/离线索引数据规范.md` | |
| 81 | +- ✅ `doc/Redis数据规范.md` | |
| 82 | +- ✅ `doc/系统改进总结-20241017.md` | |
| 83 | + | |
| 84 | +--- | |
| 85 | + | |
| 86 | +## 📋 新的项目结构 | |
| 87 | + | |
| 88 | +``` | |
| 89 | +offline_tasks/ | |
| 90 | +├── scripts/ # Python脚本 | |
| 91 | +│ ├── fetch_item_attributes.py # 前置:获取商品属性 | |
| 92 | +│ ├── generate_session.py # 前置:生成session | |
| 93 | +│ ├── i2i_swing.py # Python Swing | |
| 94 | +│ ├── i2i_session_w2v.py # Session W2V | |
| 95 | +│ ├── i2i_deepwalk.py # DeepWalk | |
| 96 | +│ ├── i2i_content_similar.py # 内容相似度 | |
| 97 | +│ ├── interest_aggregation.py # 兴趣聚合 | |
| 98 | +│ ├── load_index_to_redis.py # 加载到Redis | |
| 99 | +│ ├── add_names_to_swing.py # 添加商品名 | |
| 100 | +│ └── debug_utils.py # Debug工具 | |
| 101 | +├── collaboration/ # C++ Swing算法 | |
| 102 | +│ ├── src/ | |
| 103 | +│ │ ├── swing.cc # Swing实现 | |
| 104 | +│ │ ├── swing_symmetric.cc # 对称Swing | |
| 105 | +│ │ ├── icf_simple.cc # 简单协同 | |
| 106 | +│ │ └── ucf.py # 用户协同 | |
| 107 | +│ ├── bin/ # 编译后的二进制 | |
| 108 | +│ ├── include/ # 头文件 | |
| 109 | +│ ├── utils/ # 工具函数 | |
| 110 | +│ ├── run.sh # C++ Swing执行脚本 | |
| 111 | +│ ├── Makefile # 编译配置 | |
| 112 | +│ └── output/ # 输出目录 | |
| 113 | +├── config/ | |
| 114 | +│ └── offline_config.py # 配置文件 | |
| 115 | +├── doc/ # 文档中心 | |
| 116 | +│ ├── README.md | |
| 117 | +│ ├── 快速开始.md | |
| 118 | +│ ├── Swing算法使用指南.md | |
| 119 | +│ ├── 离线索引数据规范.md | |
| 120 | +│ ├── Redis数据规范.md | |
| 121 | +│ └── ... | |
| 122 | +├── output/ # 输出文件 | |
| 123 | +│ ├── item_attributes_mappings.json | |
| 124 | +│ ├── session.txt.* | |
| 125 | +│ └── *.txt | |
| 126 | +├── logs/ # 日志文件 | |
| 127 | +├── run.sh # ⭐ 主执行脚本(推荐) | |
| 128 | +├── run_all.py # Python版本(保留但简化) | |
| 129 | +└── README.md | |
| 130 | +``` | |
| 131 | + | |
| 132 | +--- | |
| 133 | + | |
| 134 | +## 🚀 使用方式 | |
| 135 | + | |
| 136 | +### 主要方式:run.sh(推荐) | |
| 137 | + | |
| 138 | +```bash | |
| 139 | +cd /home/tw/recommendation/offline_tasks | |
| 140 | + | |
| 141 | +# 直接运行(使用默认配置) | |
| 142 | +bash run.sh | |
| 143 | + | |
| 144 | +# 修改配置后运行 | |
| 145 | +# 编辑 run.sh 顶部的配置区域 | |
| 146 | +vim run.sh | |
| 147 | + | |
| 148 | +# 查看帮助 | |
| 149 | +cat run.sh | head -40 # 查看配置说明 | |
| 150 | +``` | |
| 151 | + | |
| 152 | +**run.sh配置项**: | |
| 153 | +```bash | |
| 154 | +# 算法参数 | |
| 155 | +LOOKBACK_DAYS=400 | |
| 156 | +TOP_N=50 | |
| 157 | +DEBUG_MODE="--debug" # 留空则不开启debug | |
| 158 | + | |
| 159 | +# Redis配置 | |
| 160 | +REDIS_HOST="localhost" | |
| 161 | +REDIS_PORT=6379 | |
| 162 | + | |
| 163 | +# 内存监控阈值 | |
| 164 | +MEM_WARN_THRESHOLD=25 # GB | |
| 165 | +MEM_KILL_THRESHOLD=35 # GB | |
| 166 | +``` | |
| 167 | + | |
| 168 | +### 备用方式:run_all.py(简化版) | |
| 169 | + | |
| 170 | +```bash | |
| 171 | +cd /home/tw/recommendation/offline_tasks | |
| 172 | + | |
| 173 | +# 运行(不包括C++ Swing和Redis加载) | |
| 174 | +python3 run_all.py --debug | |
| 175 | +``` | |
| 176 | + | |
| 177 | +**注意**: `run_all.py`已简化,只包含: | |
| 178 | +- 前置任务(商品属性、session) | |
| 179 | +- Python算法任务(Swing、W2V、DeepWalk等) | |
| 180 | +- 不包括C++ Swing和Redis加载 | |
| 181 | + | |
| 182 | +--- | |
| 183 | + | |
| 184 | +## 📊 执行流程对比 | |
| 185 | + | |
| 186 | +### run.sh(完整流程) | |
| 187 | + | |
| 188 | +``` | |
| 189 | +1. 环境准备 | |
| 190 | + ├─ 清理旧进程 | |
| 191 | + └─ 创建必要目录 | |
| 192 | + | |
| 193 | +2. 前置任务 | |
| 194 | + ├─ fetch_item_attributes.py → 商品属性映射 | |
| 195 | + ├─ generate_session.py → 用户session | |
| 196 | + └─ collaboration/run.sh → C++ Swing (高性能) | |
| 197 | + | |
| 198 | +3. i2i算法任务 | |
| 199 | + ├─ i2i_swing.py → Python Swing (日期维度) | |
| 200 | + ├─ i2i_session_w2v.py → Session W2V | |
| 201 | + ├─ i2i_deepwalk.py → DeepWalk | |
| 202 | + └─ i2i_content_similar.py → 内容相似度 | |
| 203 | + | |
| 204 | +4. 兴趣聚合 | |
| 205 | + └─ interest_aggregation.py → 多维度聚合 | |
| 206 | + | |
| 207 | +5. 加载Redis | |
| 208 | + └─ load_index_to_redis.py → 导入Redis | |
| 209 | + | |
| 210 | +6. 完成 | |
| 211 | + └─ 输出结果文件列表 | |
| 212 | +``` | |
| 213 | + | |
| 214 | +### run_all.py(简化流程) | |
| 215 | + | |
| 216 | +``` | |
| 217 | +1. 前置任务 | |
| 218 | + ├─ fetch_item_attributes.py | |
| 219 | + └─ generate_session.py | |
| 220 | + | |
| 221 | +2. i2i算法任务 | |
| 222 | + ├─ i2i_swing.py | |
| 223 | + ├─ i2i_session_w2v.py | |
| 224 | + ├─ i2i_deepwalk.py | |
| 225 | + └─ i2i_content_similar.py | |
| 226 | + | |
| 227 | +3. 兴趣聚合 | |
| 228 | + └─ interest_aggregation.py | |
| 229 | +``` | |
| 230 | + | |
| 231 | +--- | |
| 232 | + | |
| 233 | +## 💡 关键改进 | |
| 234 | + | |
| 235 | +### 1. 代码简化 | |
| 236 | + | |
| 237 | +**删除的冗余代码**: | |
| 238 | +- `run_all.py`中的`run_cpp_swing()`函数(45行) | |
| 239 | +- 复杂的子进程调用和错误处理 | |
| 240 | + | |
| 241 | +**简化效果**: | |
| 242 | +- run.sh: 直接调用,清晰明了 | |
| 243 | +- run_all.py: 从270行简化到211行 | |
| 244 | + | |
| 245 | +### 2. 灵活性提升 | |
| 246 | + | |
| 247 | +**run.sh的优势**: | |
| 248 | +```bash | |
| 249 | +# 内存监控(自动) | |
| 250 | +check_memory $pid "$task_name" & | |
| 251 | + | |
| 252 | +# 任务函数(统一) | |
| 253 | +run_task "任务名" "python3 scripts/xxx.py" | |
| 254 | + | |
| 255 | +# 配置集中(顶部) | |
| 256 | +LOOKBACK_DAYS=730 | |
| 257 | +DEBUG_MODE="--debug" | |
| 258 | +``` | |
| 259 | + | |
| 260 | +### 3. 错误处理 | |
| 261 | + | |
| 262 | +**之前**: | |
| 263 | +- Python捕获异常,日志分散 | |
| 264 | +- 失败后需要手动排查 | |
| 265 | + | |
| 266 | +**现在**: | |
| 267 | +- Shell直接显示错误 | |
| 268 | +- 内存监控在任务内存超过终止阈值时自动`kill -9`该任务,避免系统OOM | |
| 269 | +- 非关键任务失败后继续执行后续任务(Session文件生成失败或Redis加载失败会直接退出) | |
| 270 | + | |
| 271 | +--- | |
| 272 | + | |
| 273 | +## 🔧 常见操作 | |
| 274 | + | |
| 275 | +### 修改算法参数 | |
| 276 | + | |
| 277 | +```bash | |
| 278 | +# 编辑 run.sh | |
| 279 | +vim run.sh | |
| 280 | + | |
| 281 | +# 修改这些参数 | |
| 282 | +LOOKBACK_DAYS=365 # 回看天数 | |
| 283 | +TOP_N=100 # 推荐数量 | |
| 284 | +DEBUG_MODE="" # 关闭debug | |
| 285 | +``` | |
| 286 | + | |
| 287 | +### 只运行特定任务 | |
| 288 | + | |
| 289 | +```bash | |
| 290 | +cd /home/tw/recommendation/offline_tasks | |
| 291 | + | |
| 292 | +# 只运行C++ Swing | |
| 293 | +cd collaboration && bash run.sh && cd .. | |
| 294 | + | |
| 295 | +# 只运行Python Swing | |
| 296 | +python3 scripts/i2i_swing.py --lookback_days 730 --debug | |
| 297 | + | |
| 298 | +# 只加载Redis | |
| 299 | +python3 scripts/load_index_to_redis.py --redis-host localhost | |
| 300 | +``` | |
| 301 | + | |
| 302 | +### 查看日志 | |
| 303 | + | |
| 304 | +```bash | |
| 305 | +# 主日志 | |
| 306 | +tail -f logs/run_all_$(date +%Y%m%d).log | |
| 307 | + | |
| 308 | +# 内存监控日志 | |
| 309 | +tail -f logs/memory_monitor.log | |
| 310 | + | |
| 311 | +# Debug日志 | |
| 312 | +ls logs/debug/ | |
| 313 | +``` | |
| 314 | + | |
| 315 | +--- | |
| 316 | + | |
| 317 | +## 📝 迁移指南 | |
| 318 | + | |
| 319 | +如果你之前使用`python3 run_all.py`,现在改用`bash run.sh`: | |
| 320 | + | |
| 321 | +### 命令对应关系 | |
| 322 | + | |
| 323 | +| 之前 | 现在 | 说明 | | |
| 324 | +|------|------|------| | |
| 325 | +| `python3 run_all.py` | `bash run.sh` | 完整流程 | | |
| 326 | +| `python3 run_all.py --debug` | `bash run.sh` | run.sh默认开启debug | | |
| 327 | +| 无对应命令 | `bash run.sh` | 现在包含Redis加载 | | |
| 328 | + | |
| 329 | +### 定时任务更新 | |
| 330 | + | |
| 331 | +**旧的crontab**: | |
| 332 | +```cron | |
| 333 | +0 3 * * * cd /home/tw/recommendation/offline_tasks && python3 run_all.py | |
| 334 | +``` | |
| 335 | + | |
| 336 | +**新的crontab**: | |
| 337 | +```cron | |
| 338 | +0 3 * * * cd /home/tw/recommendation/offline_tasks && bash run.sh >> logs/cron_$(date +\%Y\%m\%d).log 2>&1 | |
| 339 | +``` | |
| 340 | + | |
| 341 | +--- | |
| 342 | + | |
| 343 | +## ⚠️ 注意事项 | |
| 344 | + | |
| 345 | +1. **路径依赖**: | |
| 346 | + - 确保在`offline_tasks`目录下执行`bash run.sh` | |
| 347 | + - 不要在其他目录执行 | |
| 348 | + | |
| 349 | +2. **内存监控**: | |
| 350 | + - 默认阈值:警告25GB,终止35GB | |
| 351 | + - 根据服务器配置调整`MEM_WARN_THRESHOLD`和`MEM_KILL_THRESHOLD` | |
| 352 | + | |
| 353 | +3. **并行执行**: | |
| 354 | + - 不建议同时运行多个`run.sh`实例 | |
| 355 | + - 脚本启动时会用`kill -9`清理所有匹配`python3.*scripts`的旧进程(包括其他仍在运行的实例启动的任务) | |
| 356 | + | |
| 357 | +4. **失败处理**: | |
| 358 | + - 单个任务失败一般不会终止整体流程(例外:Session文件生成失败、Redis加载失败会直接退出) | |
| 359 | + - 查看日志确认失败原因 | |
| 360 | + | |
| 361 | +--- | |
| 362 | + | |
| 363 | +## 🎯 总结 | |
| 364 | + | |
| 365 | +### 改进前后对比 | |
| 366 | + | |
| 367 | +| 方面 | 改进前 | 改进后 | | |
| 368 | +|------|--------|--------| | |
| 369 | +| **目录结构** | collaboration在外层 | 统一在offline_tasks内 | | |
| 370 | +| **主执行脚本** | run_all.py (Python) | run.sh (Shell) | | |
| 371 | +| **代码复杂度** | 270行,多层抽象 | 214行,直接调用 | | |
| 372 | +| **配置方式** | 参数分散 | 集中在顶部 | | |
| 373 | +| **内存监控** | 无 | 自动监控+自动终止 | | |
| 374 | +| **错误处理** | Python异常捕获 | Shell直接显示 | | |
| 375 | +| **包含任务** | 不含Redis加载 | 含完整流程 | | |
| 376 | + | |
| 377 | +### 核心改进 | |
| 378 | + | |
| 379 | +1. ✅ **结构简化**: collaboration目录移入,统一管理 | |
| 380 | +2. ✅ **代码简化**: 去除冗余抽象,直接调用脚本 | |
| 381 | +3. ✅ **功能增强**: 添加内存监控、统一任务管理 | |
| 382 | +4. ✅ **易用性**: 配置集中、日志清晰、错误明确 | |
| 383 | + | |
| 384 | +--- | |
| 385 | + | |
| 386 | +## 📚 相关文档 | |
| 387 | + | |
| 388 | +- [快速开始](./快速开始.md) | |
| 389 | +- [运行脚本指南](./运行脚本指南.md) | |
| 390 | +- [故障排查指南](./故障排查指南.md) | |
| 391 | +- [系统改进总结](./系统改进总结-20241017.md) | |
| 392 | + | |
| 393 | +--- | |
| 394 | + | |
| 395 | +**更新时间**: 2024-10-17 | |
| 396 | +**状态**: ✅ 已完成并测试 | |
| 397 | + | ... | ... |
offline_tasks/run.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | |
| 3 | + | |
| 3 | 4 | cd /home/tw/recommendation/offline_tasks |
| 4 | 5 | |
| 6 | +# mkdir bak___before_rm_run_all_py | |
| 7 | +# mv output logs nohup.out bak___before_rm_run_all_py/ | |
| 8 | +# mkdir output | |
| 9 | +# mkdir logs | |
| 10 | + | |
| 11 | + | |
| 12 | +# ============================================================================ | |
| 13 | +# 配置区域 | |
| 14 | +# ============================================================================ | |
| 15 | + | |
| 16 | +# 算法参数 | |
| 17 | +LOOKBACK_DAYS=400 | |
| 18 | +TOP_N=50 | |
| 19 | +DEBUG_MODE="--debug" # 留空则不开启debug | |
| 20 | + | |
| 21 | +# Redis配置 | |
| 22 | +REDIS_HOST="localhost" | |
| 23 | +REDIS_PORT=6379 | |
| 24 | + | |
| 25 | +# 内存监控阈值 | |
| 26 | +MEM_WARN_THRESHOLD=25 # GB | |
| 27 | +MEM_KILL_THRESHOLD=35 # GB | |
| 28 | + | |
| 29 | +# ============================================================================ | |
| 30 | +# 工具函数 | |
| 31 | +# ============================================================================ | |
| 32 | + | |
| 5 | 33 | # 内存监控函数 |
| 6 | 34 | check_memory() { |
| 7 | 35 | local pid=$1 |
| 8 | - local threshold_warn=25 # 25GB警告阈值 | |
| 9 | - local threshold_kill=35 # 30GB强制kill阈值 | |
| 36 | + local task_name=$2 | |
| 10 | 37 | |
| 11 | 38 | while kill -0 $pid 2>/dev/null; do |
| 12 | - # 获取进程内存使用(MB) | |
| 13 | 39 | local mem_mb=$(ps -p $pid -o rss= 2>/dev/null | awk '{print int($1/1024)}') |
| 14 | 40 | |
| 15 | 41 | if [ -n "$mem_mb" ]; then |
| 16 | 42 | local mem_gb=$(echo "scale=2; $mem_mb/1024" | bc) |
| 17 | 43 | local timestamp=$(date '+%Y-%m-%d %H:%M:%S') |
| 18 | 44 | |
| 19 | - if [ $(echo "$mem_gb >= $threshold_kill" | bc) -eq 1 ]; then | |
| 20 | - echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a logs/memory_monitor.log | |
| 45 | + if [ $(echo "$mem_gb >= $MEM_KILL_THRESHOLD" | bc) -eq 1 ]; then | |
| 46 | + echo "[$timestamp] ❌ [$task_name] 内存超限!${mem_gb}GB, 强制终止" | tee -a logs/memory_monitor.log | |
| 21 | 47 | kill -9 $pid |
| 22 | 48 | break |
| 23 | - elif [ $(echo "$mem_gb >= $threshold_warn" | bc) -eq 1 ]; then | |
| 24 | - echo "[$timestamp] ⚠️ 内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a logs/memory_monitor.log | |
| 49 | + elif [ $(echo "$mem_gb >= $MEM_WARN_THRESHOLD" | bc) -eq 1 ]; then | |
| 50 | + echo "[$timestamp] ⚠️ [$task_name] 内存警告: ${mem_gb}GB" | tee -a logs/memory_monitor.log | |
| 25 | 51 | fi |
| 26 | 52 | fi |
| 27 | 53 | |
| ... | ... | @@ -29,58 +55,166 @@ check_memory() { |
| 29 | 55 | done |
| 30 | 56 | } |
| 31 | 57 | |
| 58 | +# 运行任务函数 | |
| 59 | +run_task() { | |
| 60 | + local task_name=$1 | |
| 61 | + local task_cmd=$2 | |
| 62 | + | |
| 63 | + echo "" | |
| 64 | + echo "======================================================================" | |
| 65 | + echo "[$task_name] 开始 - $(date '+%Y-%m-%d %H:%M:%S')" | |
| 66 | + echo "======================================================================" | |
| 67 | + | |
| 68 | + eval $task_cmd & | |
| 69 | + local pid=$! | |
| 70 | + | |
| 71 | + # 启动内存监控 | |
| 72 | + check_memory $pid "$task_name" & | |
| 73 | + local monitor_pid=$! | |
| 74 | + | |
| 75 | + # 等待任务完成 | |
| 76 | + wait $pid | |
| 77 | + local exit_code=$? | |
| 78 | + | |
| 79 | + # 停止内存监控 | |
| 80 | + kill $monitor_pid 2>/dev/null | |
| 81 | + | |
| 82 | + if [ $exit_code -eq 0 ]; then | |
| 83 | + echo "✓ [$task_name] 完成" | |
| 84 | + return 0 | |
| 85 | + else | |
| 86 | + echo "✗ [$task_name] 失败,退出码: $exit_code" | |
| 87 | + return $exit_code | |
| 88 | + fi | |
| 89 | +} | |
| 90 | + | |
| 91 | +# ============================================================================ | |
| 92 | +# 环境准备 | |
| 93 | +# ============================================================================ | |
| 94 | + | |
| 32 | 95 | # 清理旧进程 |
| 33 | -ps -ef|grep run_all.py | awk '{print $2}' | xargs kill -9 2>/dev/null | |
| 34 | -ps -ef|grep recommendation | awk '{print $2}' | xargs kill -9 2>/dev/null | |
| 35 | -rm output/* -rf 2>/dev/null | |
| 36 | -rm logs/* -rf 2>/dev/null | |
| 37 | -mkdir -p logs | |
| 96 | +ps -ef | grep "python3.*scripts" | grep -v grep | awk '{print $2}' | xargs kill -9 2>/dev/null | |
| 97 | + | |
| 98 | +# 创建必要目录 | |
| 99 | +mkdir -p logs output | |
| 38 | 100 | |
| 39 | 101 | echo "======================================================================" |
| 40 | -echo "开始运行离线任务 - $(date '+%Y-%m-%d %H:%M:%S')" | |
| 41 | -echo "内存监控: 警告阈值=25GB, 强制终止阈值=30GB" | |
| 102 | +echo "开始运行离线推荐任务 - $(date '+%Y-%m-%d %H:%M:%S')" | |
| 103 | +echo "配置: lookback_days=$LOOKBACK_DAYS, top_n=$TOP_N" | |
| 104 | +echo "内存监控: 警告=${MEM_WARN_THRESHOLD}GB, 终止=${MEM_KILL_THRESHOLD}GB" | |
| 42 | 105 | echo "======================================================================" |
| 43 | 106 | |
| 107 | +# ============================================================================ | |
| 108 | +# 前置任务 | |
| 109 | +# ============================================================================ | |
| 44 | 110 | |
| 111 | +# 前置任务1: 获取商品属性 | |
| 112 | +run_task "前置任务1: 获取商品属性" \ | |
| 113 | + "python3 scripts/fetch_item_attributes.py $DEBUG_MODE" | |
| 114 | +if [ $? -ne 0 ]; then | |
| 115 | + echo "⚠️ 商品属性获取失败,但继续执行" | |
| 116 | +fi | |
| 117 | + | |
| 118 | +# 前置任务2: 生成Session文件 | |
| 119 | +run_task "前置任务2: 生成Session文件" \ | |
| 120 | + "python3 scripts/generate_session.py --lookback_days $LOOKBACK_DAYS --format both $DEBUG_MODE" | |
| 121 | +if [ $? -ne 0 ]; then | |
| 122 | + echo "❌ Session文件生成失败,退出" | |
| 123 | + exit 1 | |
| 124 | +fi | |
| 125 | + | |
| 126 | +# 前置任务3: C++ Swing算法 | |
| 45 | 127 | echo "" |
| 46 | -echo ">>> run_all.py" | |
| 47 | -# python3 run_all.py --lookback_days 400 --top_n 50 --debug & | |
| 48 | -python3 run_all.py --debug & | |
| 49 | -PID_PROD=$! | |
| 50 | -echo "生产任务 PID: $PID_PROD" | |
| 51 | - | |
| 52 | -# 启动内存监控 | |
| 53 | -check_memory $PID_PROD & | |
| 54 | -MONITOR_PID_2=$! | |
| 55 | - | |
| 56 | -# 等待生产任务完成 | |
| 57 | -wait $PID_PROD | |
| 58 | -PROD_EXIT_CODE=$? | |
| 59 | -kill $MONITOR_PID_2 2>/dev/null | |
| 60 | - | |
| 61 | -if [ $PROD_EXIT_CODE -eq 0 ]; then | |
| 62 | - echo "✓ 生产模式完成" | |
| 128 | +echo "======================================================================" | |
| 129 | +echo "[前置任务3: C++ Swing算法] 开始 - $(date '+%Y-%m-%d %H:%M:%S')" | |
| 130 | +echo "======================================================================" | |
| 131 | +cd collaboration | |
| 132 | +bash run.sh | |
| 133 | +SWING_EXIT=$? | |
| 134 | +cd .. | |
| 135 | + | |
| 136 | +if [ $SWING_EXIT -eq 0 ]; then | |
| 137 | + echo "✓ [前置任务3: C++ Swing算法] 完成" | |
| 63 | 138 | else |
| 64 | - echo "✗ 生产模式失败,退出码: $PROD_EXIT_CODE" | |
| 65 | - exit 1 | |
| 139 | + echo "⚠️ [前置任务3: C++ Swing算法] 失败,但继续执行" | |
| 140 | +fi | |
| 141 | + | |
| 142 | +# ============================================================================ | |
| 143 | +# i2i相似度任务 | |
| 144 | +# ============================================================================ | |
| 145 | + | |
| 146 | +# Task 1: Python Swing算法 | |
| 147 | +run_task "Task 1: Python Swing算法" \ | |
| 148 | + "python3 scripts/i2i_swing.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N --use_daily_session $DEBUG_MODE" | |
| 149 | +if [ $? -ne 0 ]; then | |
| 150 | + echo "⚠️ Python Swing失败,但继续执行" | |
| 151 | +fi | |
| 152 | + | |
| 153 | +# Task 2: Session W2V | |
| 154 | +run_task "Task 2: Session W2V" \ | |
| 155 | + "python3 scripts/i2i_session_w2v.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N --save_model $DEBUG_MODE" | |
| 156 | +if [ $? -ne 0 ]; then | |
| 157 | + echo "⚠️ Session W2V失败,但继续执行" | |
| 158 | +fi | |
| 159 | + | |
| 160 | +# Task 3: DeepWalk | |
| 161 | +run_task "Task 3: DeepWalk" \ | |
| 162 | + "python3 scripts/i2i_deepwalk.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N --save_model --save_graph $DEBUG_MODE" | |
| 163 | +if [ $? -ne 0 ]; then | |
| 164 | + echo "⚠️ DeepWalk失败,但继续执行" | |
| 165 | +fi | |
| 166 | + | |
| 167 | +# Task 4: 内容相似度 | |
| 168 | +run_task "Task 4: 内容相似度" \ | |
| 169 | + "python3 scripts/i2i_content_similar.py" | |
| 170 | +if [ $? -ne 0 ]; then | |
| 171 | + echo "⚠️ 内容相似度失败,但继续执行" | |
| 66 | 172 | fi |
| 67 | 173 | |
| 174 | +# ============================================================================ | |
| 175 | +# 兴趣聚合任务 | |
| 176 | +# ============================================================================ | |
| 177 | + | |
| 178 | +# Task 5: 兴趣聚合 | |
| 179 | +run_task "Task 5: 兴趣聚合" \ | |
| 180 | + "python3 scripts/interest_aggregation.py --lookback_days $LOOKBACK_DAYS --top_n 1000 $DEBUG_MODE" | |
| 181 | +if [ $? -ne 0 ]; then | |
| 182 | + echo "⚠️ 兴趣聚合失败,但继续执行" | |
| 183 | +fi | |
| 184 | + | |
| 185 | +# ============================================================================ | |
| 186 | +# 加载到Redis | |
| 187 | +# ============================================================================ | |
| 68 | 188 | |
| 69 | 189 | echo "" |
| 70 | -echo ">>> 步骤3: 加载到Redis" | |
| 71 | -python3 scripts/load_index_to_redis.py --redis-host localhost | |
| 72 | -LOAD_EXIT_CODE=$? | |
| 190 | +echo "======================================================================" | |
| 191 | +echo "[加载到Redis] 开始 - $(date '+%Y-%m-%d %H:%M:%S')" | |
| 192 | +echo "======================================================================" | |
| 193 | + | |
| 194 | +python3 scripts/load_index_to_redis.py --redis-host $REDIS_HOST --redis-port $REDIS_PORT | |
| 195 | +LOAD_EXIT=$? | |
| 73 | 196 | |
| 74 | -if [ $LOAD_EXIT_CODE -eq 0 ]; then | |
| 75 | - echo "✓ Redis加载完成" | |
| 197 | +if [ $LOAD_EXIT -eq 0 ]; then | |
| 198 | + echo "✓ [加载到Redis] 完成" | |
| 76 | 199 | else |
| 77 | - echo "✗ Redis加载失败,退出码: $LOAD_EXIT_CODE" | |
| 200 | + echo "❌ [加载到Redis] 失败,退出码: $LOAD_EXIT" | |
| 78 | 201 | exit 1 |
| 79 | 202 | fi |
| 80 | 203 | |
| 204 | +# ============================================================================ | |
| 205 | +# 完成 | |
| 206 | +# ============================================================================ | |
| 207 | + | |
| 81 | 208 | echo "" |
| 82 | 209 | echo "======================================================================" |
| 83 | 210 | echo "所有任务完成 - $(date '+%Y-%m-%d %H:%M:%S')" |
| 84 | 211 | echo "======================================================================" |
| 85 | - | |
| 86 | - | |
| 212 | +echo "" | |
| 213 | +echo "输出文件位置:" | |
| 214 | +echo " - 商品属性: output/item_attributes_mappings.json" | |
| 215 | +echo " - Session文件: output/session.txt.*" | |
| 216 | +echo " - C++ Swing: collaboration/output/swing_similar.txt" | |
| 217 | +echo " - Python算法: output/i2i_*.txt" | |
| 218 | +echo " - 兴趣聚合: output/interest_aggregation_*.txt" | |
| 219 | +echo " - 日志: logs/" | |
| 220 | +echo "" | |
| ... | ... |
offline_tasks/run_all.py
| ... | ... | @@ -79,52 +79,6 @@ def run_script(script_name, args=None): |
| 79 | 79 | return False |
| 80 | 80 | |
| 81 | 81 | |
| 82 | -def run_cpp_swing(): | |
| 83 | - """ | |
| 84 | - 运行C++ Swing算法 | |
| 85 | - | |
| 86 | - Returns: | |
| 87 | - bool: 是否成功 | |
| 88 | - """ | |
| 89 | - collaboration_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'collaboration') | |
| 90 | - run_sh_path = os.path.join(collaboration_dir, 'run.sh') | |
| 91 | - | |
| 92 | - if not os.path.exists(run_sh_path): | |
| 93 | - logger.error(f"C++ Swing script not found: {run_sh_path}") | |
| 94 | - return False | |
| 95 | - | |
| 96 | - logger.info(f"Running C++ Swing: bash {run_sh_path}") | |
| 97 | - | |
| 98 | - try: | |
| 99 | - result = subprocess.run( | |
| 100 | - ['bash', run_sh_path], | |
| 101 | - cwd=collaboration_dir, | |
| 102 | - check=True, | |
| 103 | - capture_output=True, | |
| 104 | - text=True | |
| 105 | - ) | |
| 106 | - logger.info("C++ Swing algorithm completed successfully") | |
| 107 | - # 输出部分日志 | |
| 108 | - output_lines = result.stdout.split('\n') | |
| 109 | - for line in output_lines[-20:]: # 输出最后20行 | |
| 110 | - if line.strip(): | |
| 111 | - logger.info(f" {line}") | |
| 112 | - return True | |
| 113 | - except subprocess.CalledProcessError as e: | |
| 114 | - logger.error(f"C++ Swing failed with return code {e.returncode}") | |
| 115 | - logger.error(f"Error output: {e.stderr}") | |
| 116 | - # 输出部分stdout以便调试 | |
| 117 | - if e.stdout: | |
| 118 | - logger.error("Stdout output:") | |
| 119 | - for line in e.stdout.split('\n')[-20:]: | |
| 120 | - if line.strip(): | |
| 121 | - logger.error(f" {line}") | |
| 122 | - return False | |
| 123 | - except Exception as e: | |
| 124 | - logger.error(f"Unexpected error running C++ Swing: {e}") | |
| 125 | - return False | |
| 126 | - | |
| 127 | - | |
| 128 | 82 | def main(): |
| 129 | 83 | parser = argparse.ArgumentParser(description='Run all offline recommendation tasks') |
| 130 | 84 | parser.add_argument('--debug', action='store_true', |
| ... | ... | @@ -170,22 +124,9 @@ def main(): |
| 170 | 124 | else: |
| 171 | 125 | logger.error("生成session文件失败") |
| 172 | 126 | |
| 173 | - # 前置任务3: 运行C++ Swing算法 | |
| 174 | - logger.info("\n" + "="*80) | |
| 175 | - logger.info("前置任务3: 运行C++ Swing算法(基于session文件)") | |
| 176 | - logger.info("="*80) | |
| 177 | - total_count += 1 | |
| 178 | - if run_cpp_swing(): | |
| 179 | - success_count += 1 | |
| 180 | - logger.info("✓ C++ Swing算法执行成功") | |
| 181 | - logger.info(" 结果文件: collaboration/output/swing_similar.txt") | |
| 182 | - logger.info(" 可读文件: collaboration/output/swing_similar_readable.txt") | |
| 183 | - else: | |
| 184 | - logger.error("C++ Swing算法执行失败,但不影响其他任务继续") | |
| 185 | - | |
| 186 | 127 | # i2i 行为相似任务 |
| 187 | 128 | logger.info("\n" + "="*80) |
| 188 | - logger.info("Task 1: Running Python Swing algorithm for i2i similarity") | |
| 129 | + logger.info("Task 1: Running Swing algorithm for i2i similarity") | |
| 189 | 130 | logger.info("="*80) |
| 190 | 131 | total_count += 1 |
| 191 | 132 | script_args = [ |
| ... | ... |