Commit a1f370ee2c21fb33066534766d8358f439342a50
1 parent
1721766b
offline tasks
Showing
6 changed files
with
714 additions
and
12 deletions
Show diff stats
| @@ -0,0 +1,186 @@ | @@ -0,0 +1,186 @@ | ||
| 1 | +# 商品名称映射问题修复 | ||
| 2 | + | ||
| 3 | +## 问题描述 | ||
| 4 | + | ||
| 5 | +在Debug模式生成的明文文件中,商品名称显示为"Unknown": | ||
| 6 | + | ||
| 7 | +``` | ||
| 8 | +[7] i2i:swing:3667845 | ||
| 9 | +-------------------------------------------------------------------------------- | ||
| 10 | + 1. ID:3588590(Unknown) - Score:0.2857 | ||
| 11 | + 2. ID:3623446(Unknown) - Score:... | ||
| 12 | +``` | ||
| 13 | + | ||
| 14 | +## 根本原因 | ||
| 15 | + | ||
| 16 | +数据类型不匹配导致名称映射失败: | ||
| 17 | + | ||
| 18 | +1. **fetch_name_mappings()** 从数据库获取的映射,key是**字符串类型**: | ||
| 19 | + ```python | ||
| 20 | + mappings['item'] = dict(zip(df['id'].astype(str), df['name'])) | ||
| 21 | + # 结果: {'12345': '香蕉干', '67890': '芒果干', ...} | ||
| 22 | + ``` | ||
| 23 | + | ||
| 24 | +2. **item_name_map** 从DataFrame创建的映射,key是**整数类型**: | ||
| 25 | + ```python | ||
| 26 | + item_name_map = dict(zip(df['item_id'].unique(), ...)) | ||
| 27 | + # 结果: {12345: '香蕉干', 67890: '芒果干', ...} | ||
| 28 | + ``` | ||
| 29 | + | ||
| 30 | +3. **更新操作失败**: | ||
| 31 | + ```python | ||
| 32 | + name_mappings['item'].update(item_name_map) | ||
| 33 | + # update本身会成功,但写入的是int类型的key,与原有的str类型key并存,不会覆盖或补充str key对应的条目 | ||
| 34 | + ``` | ||
| 35 | + | ||
| 36 | +4. **查询时找不到**: | ||
| 37 | + ```python | ||
| 38 | + name = name_mappings.get('item', {}).get(str(item_id), 'Unknown') | ||
| 39 | + # 将int的item_id转为str查询,而item_name_map写入的条目只有int类型的key;数据库映射中又没有该商品时,就会返回'Unknown' | ||
| 40 | + ``` | ||
| 41 | + | ||
| 42 | +## 修复方案 | ||
| 43 | + | ||
| 44 | +### 方法1:统一转为字符串(已采用) | ||
| 45 | + | ||
| 46 | +```python | ||
| 47 | +# 修改前 | ||
| 48 | +item_name_map = dict(zip(df['item_id'].unique(), df.groupby('item_id')['item_name'].first())) | ||
| 49 | + | ||
| 50 | +# 修改后(key转为字符串;注意unique()按出现顺序返回,而groupby().first()按item_id排序,需确认两者顺序一致,否则ID与名称会错位) | ||
| 51 | +item_name_map = dict(zip(df['item_id'].unique().astype(str), df.groupby('item_id')['item_name'].first())) | ||
| 52 | +``` | ||
| 53 | + | ||
| 54 | +### 方法2:update时转换(备选) | ||
| 55 | + | ||
| 56 | +```python | ||
| 57 | +# 转换key类型后再更新 | ||
| 58 | +name_mappings['item'].update({str(k): v for k, v in item_name_map.items()}) | ||
| 59 | +``` | ||
| 60 | + | ||
| 61 | +## 验证修复 | ||
| 62 | + | ||
| 63 | +修复后运行debug模式: | ||
| 64 | + | ||
| 65 | +```bash | ||
| 66 | +cd /home/tw/recommendation/offline_tasks | ||
| 67 | +python3 scripts/i2i_swing.py --lookback_days 7 --top_n 10 --debug | ||
| 68 | +``` | ||
| 69 | + | ||
| 70 | +检查明文文件: | ||
| 71 | + | ||
| 72 | +```bash | ||
| 73 | +more output/debug/i2i_swing_*_readable.txt | ||
| 74 | +``` | ||
| 75 | + | ||
| 76 | +应该看到: | ||
| 77 | + | ||
| 78 | +``` | ||
| 79 | +[7] i2i:swing:3667845(商品名称) | ||
| 80 | +-------------------------------------------------------------------------------- | ||
| 81 | + 1. ID:3588590(香蕉干) - Score:0.2857 | ||
| 82 | + 2. ID:3623446(芒果干) - Score:0.2143 | ||
| 83 | +``` | ||
| 84 | + | ||
| 85 | +## 数据库字段确认 | ||
| 86 | + | ||
| 87 | +正确的查询关系: | ||
| 88 | + | ||
| 89 | +```sql | ||
| 90 | +SELECT | ||
| 91 | + pgs.id as 'sku_id', | ||
| 92 | + pgs.name as '商品名称', | ||
| 93 | + ss.name as '供应商名称', | ||
| 94 | + pc_1.name as '一级类目', | ||
| 95 | + pc_2.name as '二级类目' | ||
| 96 | +FROM prd_goods_sku pgs | ||
| 97 | +LEFT JOIN sup_supplier ss ON pgs.supplier_id = ss.id | ||
| 98 | +LEFT JOIN prd_category pc_1 ON ... | ||
| 99 | +``` | ||
| 100 | + | ||
| 101 | +## 其他脚本 | ||
| 102 | + | ||
| 103 | +这个问题也可能存在于其他脚本,需要同样修复: | ||
| 104 | +- ✅ i2i_swing.py (已修复) | ||
| 105 | +- ⚠️ i2i_session_w2v.py (如果有debug功能需要检查) | ||
| 106 | +- ⚠️ i2i_deepwalk.py (如果有debug功能需要检查) | ||
| 107 | +- ⚠️ i2i_content_similar.py (如果有debug功能需要检查) | ||
| 108 | +- ⚠️ interest_aggregation.py (如果有debug功能需要检查) | ||
| 109 | + | ||
| 110 | +## 预防措施 | ||
| 111 | + | ||
| 112 | +为避免类似问题,建议: | ||
| 113 | + | ||
| 114 | +1. **统一数据类型约定**: | ||
| 115 | + - 所有ID映射的key统一使用字符串类型 | ||
| 116 | + - 在debug_utils.py中明确文档说明 | ||
| 117 | + | ||
| 118 | +2. **添加类型检查**: | ||
| 119 | + ```python | ||
| 120 | + def safe_update_mapping(target_dict, source_dict): | ||
| 121 | + """安全更新映射,自动转换key类型""" | ||
| 122 | + for k, v in source_dict.items(): | ||
| 123 | + target_dict[str(k)] = v | ||
| 124 | + ``` | ||
| 125 | + | ||
| 126 | +3. **添加调试日志**: | ||
| 127 | + ```python | ||
| 128 | + if debug: | ||
| 129 | + logger.debug(f"更新前: {len(name_mappings['item'])} 个名称") | ||
| 130 | + name_mappings['item'].update(item_name_map) | ||
| 131 | + logger.debug(f"更新后: {len(name_mappings['item'])} 个名称") | ||
| 132 | + ``` | ||
| 133 | + | ||
| 134 | +--- | ||
| 135 | + | ||
| 136 | +**状态**: ✅ 已修复 | ||
| 137 | +**影响范围**: i2i_swing.py | ||
| 138 | +**修复时间**: 2025-10-16 | ||
| 139 | + | ||
| 140 | +## 补充修复 - 主输出文件 | ||
| 141 | + | ||
| 142 | +问题同样存在于主输出文件中: | ||
| 143 | + | ||
| 144 | +``` | ||
| 145 | +1070176 Unknown 2786217:0.4000 | ||
| 146 | +2786217 Unknown 1070176:0.4000 | ||
| 147 | +``` | ||
| 148 | + | ||
| 149 | +### 原因 | ||
| 150 | + | ||
| 151 | +主输出代码中使用整数item_id作为key查询: | ||
| 152 | + | ||
| 153 | +```python | ||
| 154 | +for item_id, sims in result.items(): | ||
| 155 | + item_name = item_name_map.get(item_id, 'Unknown') # item_id是int,但map的key是str | ||
| 156 | +``` | ||
| 157 | + | ||
| 158 | +### 修复 | ||
| 159 | + | ||
| 160 | +统一转换为字符串: | ||
| 161 | + | ||
| 162 | +```python | ||
| 163 | +for item_id, sims in result.items(): | ||
| 164 | + item_name = item_name_map.get(str(item_id), 'Unknown') # 转换为字符串查询 | ||
| 165 | +``` | ||
| 166 | + | ||
| 167 | +### 验证 | ||
| 168 | + | ||
| 169 | +```bash | ||
| 170 | +# 重新运行 | ||
| 171 | +python3 scripts/i2i_swing.py --lookback_days 7 --top_n 10 | ||
| 172 | + | ||
| 173 | +# 检查输出 | ||
| 174 | +more output/i2i_swing_20251016.txt | ||
| 175 | +``` | ||
| 176 | + | ||
| 177 | +应该看到: | ||
| 178 | +``` | ||
| 179 | +1070176 商品名称A 2786217:0.4000 | ||
| 180 | +2786217 商品名称B 1070176:0.4000 | ||
| 181 | +``` | ||
| 182 | + | ||
| 183 | +--- | ||
| 184 | + | ||
| 185 | +**更新时间**: 2025-10-16 20:30 | ||
| 186 | +**状态**: ✅ 完全修复 |
| @@ -0,0 +1,197 @@ | @@ -0,0 +1,197 @@ | ||
| 1 | +# 离线索引产出规范 | ||
| 2 | + | ||
| 3 | +## 📋 索引任务列表 | ||
| 4 | + | ||
| 5 | +| 模块名称 | 任务命令 | 调度频次 | 输出数据 | 格式和示例 | | ||
| 6 | +|---------|---------|---------|---------|-----------| | ||
| 7 | +| **i2i_swing** | `python3 scripts/i2i_swing.py` | 每天 | `output/i2i_swing_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | | ||
| 8 | +| **i2i_session_w2v** | `python3 scripts/i2i_session_w2v.py` | 每天 | `output/i2i_session_w2v_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | | ||
| 9 | +| **i2i_deepwalk** | `python3 scripts/i2i_deepwalk.py` | 每天 | `output/i2i_deepwalk_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | | ||
| 10 | +| **i2i_content** | `python3 scripts/i2i_content_similar.py` | 每周 | `output/i2i_content_hybrid_YYYYMMDD.txt` | `item_id \t item_name \t similar_id1:score1,similar_id2:score2,...` | | ||
| 11 | +| **interest_hot** | `python3 scripts/interest_aggregation.py` | 每天 | `output/interest_aggregation_hot_YYYYMMDD.txt` | `dimension_key \t item_id1,item_id2,item_id3,...` | | ||
| 12 | +| **interest_cart** | `python3 scripts/interest_aggregation.py` | 每天 | `output/interest_aggregation_cart_YYYYMMDD.txt` | `dimension_key \t item_id1,item_id2,item_id3,...` | | ||
| 13 | +| **interest_new** | `python3 scripts/interest_aggregation.py` | 每天 | `output/interest_aggregation_new_YYYYMMDD.txt` | `dimension_key \t item_id1,item_id2,item_id3,...` | | ||
| 14 | +| **interest_global** | `python3 scripts/interest_aggregation.py` | 每天 | `output/interest_aggregation_global_YYYYMMDD.txt` | `dimension_key \t item_id1,item_id2,item_id3,...` | | ||
| 15 | + | ||
| 16 | +## 📊 详细格式说明 | ||
| 17 | + | ||
| 18 | +### 1. i2i相似度索引 | ||
| 19 | + | ||
| 20 | +#### 输出格式 | ||
| 21 | +``` | ||
| 22 | +item_id \t item_name \t similar_id1:score1,similar_id2:score2,... | ||
| 23 | +``` | ||
| 24 | + | ||
| 25 | +#### 示例 | ||
| 26 | +``` | ||
| 27 | +12345 香蕉干 67890:0.8567,11223:0.7234,44556:0.6891 | ||
| 28 | +67890 芒果干 12345:0.8567,22334:0.7123,55667:0.6543 | ||
| 29 | +``` | ||
| 30 | + | ||
| 31 | +#### 字段说明 | ||
| 32 | +- `item_id`: 商品SKU ID | ||
| 33 | +- `item_name`: 商品名称 | ||
| 34 | +- `similar_id`: 相似商品ID | ||
| 35 | +- `score`: 相似度分数(0-1之间,越大越相似) | ||
| 36 | + | ||
| 37 | +#### 算法差异 | ||
| 38 | +| 算法 | 特点 | 适用场景 | | ||
| 39 | +|------|------|---------| | ||
| 40 | +| **Swing** | 基于用户共同行为,发现购买关联 | 详情页"大家都在看" | | ||
| 41 | +| **Session W2V** | 基于会话序列,捕捉浏览顺序 | 详情页"看了又看" | | ||
| 42 | +| **DeepWalk** | 基于图结构,发现深层关系 | 详情页"相关推荐" | | ||
| 43 | +| **Content** | 基于商品属性,类目相似 | 冷启动商品推荐 | | ||
| 44 | + | ||
| 45 | +### 2. 兴趣点聚合索引 | ||
| 46 | + | ||
| 47 | +#### 输出格式 | ||
| 48 | +``` | ||
| 49 | +dimension_key \t item_id1,item_id2,item_id3,... | ||
| 50 | +``` | ||
| 51 | + | ||
| 52 | +#### 示例 | ||
| 53 | +``` | ||
| 54 | +platform:pc 12345,67890,11223,44556,22334 | ||
| 55 | +category_level2:200 67890,12345,22334,55667,11223 | ||
| 56 | +platform_category2:pc_200 12345,67890,22334,11223,55667 | ||
| 57 | +supplier:10001 12345,44556,22334,67890,11223 | ||
| 58 | +``` | ||
| 59 | + | ||
| 60 | +#### 维度说明 | ||
| 61 | + | ||
| 62 | +**单维度(7个)** | ||
| 63 | +- `platform:{platform_id}` - 业务平台(pc, h5, app等) | ||
| 64 | +- `client_platform:{client}` - 客户端平台(iOS, Android, Web等) | ||
| 65 | +- `supplier:{supplier_id}` - 供应商 | ||
| 66 | +- `category_level1:{cat_id}` - 一级分类 | ||
| 67 | +- `category_level2:{cat_id}` - 二级分类 | ||
| 68 | +- `category_level3:{cat_id}` - 三级分类 | ||
| 69 | +- `category_level4:{cat_id}` - 四级分类 | ||
| 70 | + | ||
| 71 | +**组合维度(4个)** | ||
| 72 | +- `platform_client:{platform}_{client}` - 平台+客户端 | ||
| 73 | +- `platform_category2:{platform}_{cat_id}` - 平台+二级分类 | ||
| 74 | +- `platform_category3:{platform}_{cat_id}` - 平台+三级分类 | ||
| 75 | +- `client_category2:{client}_{cat_id}` - 客户端+二级分类 | ||
| 76 | + | ||
| 77 | +#### 列表类型说明 | ||
| 78 | + | ||
| 79 | +| 类型 | 文件名 | 计算逻辑 | 适用场景 | | ||
| 80 | +|------|--------|---------|---------| | ||
| 81 | +| **hot** | `interest_aggregation_hot_YYYYMMDD.txt` | 最近N天的高频交互商品 | 首页"热门推荐" | | ||
| 82 | +| **cart** | `interest_aggregation_cart_YYYYMMDD.txt` | 高加购率商品 | 首页"热门加购" | | ||
| 83 | +| **new** | `interest_aggregation_new_YYYYMMDD.txt` | 最近上架的新品 | 首页"新品推荐" | | ||
| 84 | +| **global** | `interest_aggregation_global_YYYYMMDD.txt` | 全局热门商品 | 首页"猜你喜欢" | | ||
| 85 | + | ||
| 86 | +## 🔄 调度建议 | ||
| 87 | + | ||
| 88 | +### 每日调度(数据量大,变化快) | ||
| 89 | +```bash | ||
| 90 | +# 每天凌晨3点执行 | ||
| 91 | +0 3 * * * cd /home/tw/recommendation/offline_tasks && python3 run_all.py --lookback_days 730 --top_n 50 | ||
| 92 | +``` | ||
| 93 | + | ||
| 94 | +### 每周调度(数据量小,变化慢) | ||
| 95 | +```bash | ||
| 96 | +# 每周日凌晨4点执行 | ||
| 97 | +0 4 * * 0 cd /home/tw/recommendation/offline_tasks && python3 scripts/i2i_content_similar.py --top_n 50 | ||
| 98 | +``` | ||
| 99 | + | ||
| 100 | +## 📁 文件命名规范 | ||
| 101 | + | ||
| 102 | +### 标准格式 | ||
| 103 | +``` | ||
| 104 | +{algorithm_name}_{date}.txt | ||
| 105 | +``` | ||
| 106 | + | ||
| 107 | +### 示例 | ||
| 108 | +``` | ||
| 109 | +i2i_swing_20251016.txt | ||
| 110 | +i2i_session_w2v_20251016.txt | ||
| 111 | +interest_aggregation_hot_20251016.txt | ||
| 112 | +``` | ||
| 113 | + | ||
| 114 | +### Debug文件(开发调试用) | ||
| 115 | +``` | ||
| 116 | +output/debug/{algorithm_name}_{date}_readable.txt | ||
| 117 | +logs/debug/{algorithm_name}_{date}_{time}.log | ||
| 118 | +``` | ||
| 119 | + | ||
| 120 | +## 📈 数据量估算 | ||
| 121 | + | ||
| 122 | +| 索引类型 | 索引数量 | 单条大小 | 总大小 | 更新频率 | | ||
| 123 | +|---------|---------|---------|--------|---------| | ||
| 124 | +| i2i_swing | ~50,000 | ~500B | ~25MB | 每天 | | ||
| 125 | +| i2i_session_w2v | ~50,000 | ~500B | ~25MB | 每天 | | ||
| 126 | +| i2i_deepwalk | ~50,000 | ~500B | ~25MB | 每天 | | ||
| 127 | +| i2i_content | ~50,000 | ~500B | ~25MB | 每周 | | ||
| 128 | +| interest_hot | ~10,000 | ~1KB | ~10MB | 每天 | | ||
| 129 | +| interest_cart | ~10,000 | ~1KB | ~10MB | 每天 | | ||
| 130 | +| interest_new | ~5,000 | ~1KB | ~5MB | 每天 | | ||
| 131 | +| interest_global | ~10,000 | ~1KB | ~10MB | 每天 | | ||
| 132 | +| **总计** | **~235,000** | - | **~135MB** | - | | ||
| 133 | + | ||
| 134 | +## 🎯 质量检查 | ||
| 135 | + | ||
| 136 | +### 数据完整性检查 | ||
| 137 | +```bash | ||
| 138 | +# 检查文件是否生成 | ||
| 139 | +ls -lh output/*_$(date +%Y%m%d).txt | ||
| 140 | + | ||
| 141 | +# 检查行数 | ||
| 142 | +wc -l output/*_$(date +%Y%m%d).txt | ||
| 143 | + | ||
| 144 | +# 检查格式 | ||
| 145 | +head -5 output/i2i_swing_$(date +%Y%m%d).txt | ||
| 146 | +``` | ||
| 147 | + | ||
| 148 | +### 数据质量指标 | ||
| 149 | + | ||
| 150 | +**i2i索引质量** | ||
| 151 | +- 覆盖率:有推荐的商品数 / 总商品数 > 80% | ||
| 152 | +- 推荐数量:每个商品推荐10-50个相似商品 | ||
| 153 | +- 分数范围:相似度分数在0.01-1.0之间 | ||
| 154 | + | ||
| 155 | +**兴趣聚合质量** | ||
| 156 | +- 覆盖率:有数据的维度数 / 总维度数 > 60% | ||
| 157 | +- 推荐数量:每个维度推荐50-1000个商品 | ||
| 158 | +- 商品去重:同一商品在列表中只出现一次 | ||
| 159 | + | ||
| 160 | +## 🔍 查询示例 | ||
| 161 | + | ||
| 162 | +### 查看特定商品的相似推荐 | ||
| 163 | +```bash | ||
| 164 | +# 查看商品12345的相似商品 | ||
| 165 | +grep -P "^12345\t" output/i2i_swing_20251016.txt | ||
| 166 | +``` | ||
| 167 | + | ||
| 168 | +### 查看特定维度的热门商品 | ||
| 169 | +```bash | ||
| 170 | +# 查看PC平台的热门商品 | ||
| 171 | +grep -P "^platform:pc\t" output/interest_aggregation_hot_20251016.txt | ||
| 172 | +``` | ||
| 173 | + | ||
| 174 | +### 统计索引数量 | ||
| 175 | +```bash | ||
| 176 | +# 统计各类型索引数量 | ||
| 177 | +for file in output/*_20251016.txt; do | ||
| 178 | + echo "$file: $(wc -l < $file) 条" | ||
| 179 | +done | ||
| 180 | +``` | ||
| 181 | + | ||
| 182 | +## ⚠️ 注意事项 | ||
| 183 | + | ||
| 184 | +1. **文件编码**: 所有文件使用UTF-8编码 | ||
| 185 | +2. **分隔符**: 使用Tab(\t)分隔字段 | ||
| 186 | +3. **商品ID**: 使用数字类型,不带引号 | ||
| 187 | +4. **分数精度**: 相似度分数保留4位小数 | ||
| 188 | +5. **排序规则**: 相似商品按分数降序排列 | ||
| 189 | +6. **去重**: 确保推荐列表中没有重复商品 | ||
| 190 | +7. **有效性**: 推荐的商品必须是在售状态 | ||
| 191 | + | ||
| 192 | +## 🔗 相关文档 | ||
| 193 | + | ||
| 194 | +- **Redis数据规范**: `REDIS_DATA_SPEC.md` | ||
| 195 | +- **API接口文档**: `RECOMMENDATION_API.md` | ||
| 196 | +- **Debug指南**: `DEBUG_GUIDE.md` | ||
| 197 | +- **配置说明**: `UPDATE_CONFIG_GUIDE.md` |
| @@ -0,0 +1,304 @@ | @@ -0,0 +1,304 @@ | ||
| 1 | +# Redis数据灌入规范 | ||
| 2 | + | ||
| 3 | +## 📋 数据灌入概述 | ||
| 4 | + | ||
| 5 | +将离线生成的推荐索引加载到Redis,供在线系统实时查询使用。 | ||
| 6 | + | ||
| 7 | +## 🔑 Redis Key规范 | ||
| 8 | + | ||
| 9 | +### 通用规则 | ||
| 10 | +``` | ||
| 11 | +{namespace}:{function}:{algorithm}:{identifier} | ||
| 12 | +``` | ||
| 13 | + | ||
| 14 | +- `namespace`: 业务命名空间(item, user, interest等) | ||
| 15 | +- `function`: 功能类型(similar, feature, hot等) | ||
| 16 | +- `algorithm`: 算法名称(swing, w2v, deepwalk等) | ||
| 17 | +- `identifier`: 具体标识(item_id, dimension_key等) | ||
| 18 | + | ||
| 19 | +## 📊 数据灌入规范表 | ||
| 20 | + | ||
| 21 | +| 模块名称 | 源数据地址 | 格式描述 | RedisKey模板 | RedisValue格式 | TTL | | ||
| 22 | +|---------|-----------|---------|-------------|---------------|-----| | ||
| 23 | +| **i2i_swing** | `output/i2i_swing_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:swing:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | | ||
| 24 | +| **i2i_session_w2v** | `output/i2i_session_w2v_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:w2v:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | | ||
| 25 | +| **i2i_deepwalk** | `output/i2i_deepwalk_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:deepwalk:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 7天 | | ||
| 26 | +| **i2i_content** | `output/i2i_content_hybrid_YYYYMMDD.txt` | `item_id\titem_name\tsimilar_id1:score1,...` | `item:similar:content:{item_id}` | `[[similar_id1,score1],[similar_id2,score2],...]` | 30天 | | ||
| 27 | +| **interest_hot** | `output/interest_aggregation_hot_YYYYMMDD.txt` | `dimension_key\titem_id1,item_id2,...` | `interest:hot:{dimension_key}` | `[item_id1,item_id2,item_id3,...]` | 3天 | | ||
| 28 | +| **interest_cart** | `output/interest_aggregation_cart_YYYYMMDD.txt` | `dimension_key\titem_id1,item_id2,...` | `interest:cart:{dimension_key}` | `[item_id1,item_id2,item_id3,...]` | 3天 | | ||
| 29 | +| **interest_new** | `output/interest_aggregation_new_YYYYMMDD.txt` | `dimension_key\titem_id1,item_id2,...` | `interest:new:{dimension_key}` | `[item_id1,item_id2,item_id3,...]` | 3天 | | ||
| 30 | +| **interest_global** | `output/interest_aggregation_global_YYYYMMDD.txt` | `dimension_key\titem_id1,item_id2,...` | `interest:global:{dimension_key}` | `[item_id1,item_id2,item_id3,...]` | 7天 | | ||
| 31 | + | ||
| 32 | +## 📝 详细说明 | ||
| 33 | + | ||
| 34 | +### 1. i2i相似度索引 | ||
| 35 | + | ||
| 36 | +#### 源数据格式 | ||
| 37 | +``` | ||
| 38 | +12345 香蕉干 67890:0.8567,11223:0.7234,44556:0.6891 | ||
| 39 | +``` | ||
| 40 | + | ||
| 41 | +#### Redis存储 | ||
| 42 | + | ||
| 43 | +**Key**: `item:similar:swing:12345` | ||
| 44 | + | ||
| 45 | +**Value** (JSON格式): | ||
| 46 | +```json | ||
| 47 | +[[67890, 0.8567], [11223, 0.7234], [44556, 0.6891]] | ||
| 48 | +``` | ||
| 49 | + | ||
| 50 | +**Value** (序列化后): | ||
| 51 | +```python | ||
| 52 | +import json | ||
| 53 | +value = json.dumps([[67890, 0.8567], [11223, 0.7234], [44556, 0.6891]]) | ||
| 54 | +# 存储: "[[67890,0.8567],[11223,0.7234],[44556,0.6891]]" | ||
| 55 | +``` | ||
| 56 | + | ||
| 57 | +#### 查询示例 | ||
| 58 | +```python | ||
| 59 | +import redis | ||
| 60 | +import json | ||
| 61 | + | ||
| 62 | +r = redis.Redis(host='localhost', port=6379, db=0) | ||
| 63 | + | ||
| 64 | +# 获取商品12345的相似商品(Swing算法) | ||
| 65 | +similar_items = json.loads(r.get('item:similar:swing:12345')) | ||
| 66 | +# 返回: [[67890, 0.8567], [11223, 0.7234], [44556, 0.6891]] | ||
| 67 | + | ||
| 68 | +# 获取Top5相似商品 | ||
| 69 | +top_5 = similar_items[:5] | ||
| 70 | +``` | ||
| 71 | + | ||
| 72 | +### 2. 兴趣点聚合索引 | ||
| 73 | + | ||
| 74 | +#### 源数据格式 | ||
| 75 | +``` | ||
| 76 | +platform:pc 12345,67890,11223,44556,22334 | ||
| 77 | +category_level2:200 67890,12345,22334,55667,11223 | ||
| 78 | +``` | ||
| 79 | + | ||
| 80 | +#### Redis存储 | ||
| 81 | + | ||
| 82 | +**Key**: `interest:hot:platform:pc` | ||
| 83 | + | ||
| 84 | +**Value** (JSON格式): | ||
| 85 | +```json | ||
| 86 | +[12345, 67890, 11223, 44556, 22334] | ||
| 87 | +``` | ||
| 88 | + | ||
| 89 | +**Value** (序列化后): | ||
| 90 | +```python | ||
| 91 | +import json | ||
| 92 | +value = json.dumps([12345, 67890, 11223, 44556, 22334]) | ||
| 93 | +# 存储: "[12345,67890,11223,44556,22334]" | ||
| 94 | +``` | ||
| 95 | + | ||
| 96 | +#### 查询示例 | ||
| 97 | +```python | ||
| 98 | +import redis | ||
| 99 | +import json | ||
| 100 | + | ||
| 101 | +r = redis.Redis(host='localhost', port=6379, db=0) | ||
| 102 | + | ||
| 103 | +# 获取PC平台的热门商品 | ||
| 104 | +hot_items = json.loads(r.get('interest:hot:platform:pc')) | ||
| 105 | +# 返回: [12345, 67890, 11223, 44556, 22334] | ||
| 106 | + | ||
| 107 | +# 获取Top10热门商品 | ||
| 108 | +top_10 = hot_items[:10] | ||
| 109 | +``` | ||
| 110 | + | ||
| 111 | +## 🔄 数据加载流程 | ||
| 112 | + | ||
| 113 | +### 1. 加载i2i索引 | ||
| 114 | + | ||
| 115 | +```python | ||
| 116 | +def load_i2i_index(file_path, algorithm_name, redis_client, expire_seconds=604800): | ||
| 117 | + """ | ||
| 118 | + 加载i2i相似度索引到Redis | ||
| 119 | + | ||
| 120 | + Args: | ||
| 121 | + file_path: 索引文件路径 | ||
| 122 | + algorithm_name: 算法名称(swing, w2v, deepwalk, content) | ||
| 123 | + redis_client: Redis客户端 | ||
| 124 | + expire_seconds: 过期时间(秒),默认7天 | ||
| 125 | + """ | ||
| 126 | + import json | ||
| 127 | + | ||
| 128 | + count = 0 | ||
| 129 | + with open(file_path, 'r', encoding='utf-8') as f: | ||
| 130 | + for line in f: | ||
| 131 | + parts = line.strip().split('\t') | ||
| 132 | + if len(parts) < 3: | ||
| 133 | + continue | ||
| 134 | + | ||
| 135 | + item_id = parts[0] | ||
| 136 | + similar_str = parts[2] # similar_id1:score1,similar_id2:score2,... | ||
| 137 | + | ||
| 138 | + # 解析相似商品 | ||
| 139 | + similar_items = [] | ||
| 140 | + for pair in similar_str.split(','): | ||
| 141 | + if ':' in pair: | ||
| 142 | + sim_id, score = pair.split(':') | ||
| 143 | + similar_items.append([int(sim_id), float(score)]) | ||
| 144 | + | ||
| 145 | + # 存储到Redis | ||
| 146 | + redis_key = f"item:similar:{algorithm_name}:{item_id}" | ||
| 147 | + redis_value = json.dumps(similar_items) | ||
| 148 | + | ||
| 149 | + redis_client.set(redis_key, redis_value) | ||
| 150 | + redis_client.expire(redis_key, expire_seconds) | ||
| 151 | + | ||
| 152 | + count += 1 | ||
| 153 | + | ||
| 154 | + return count | ||
| 155 | +``` | ||
| 156 | + | ||
| 157 | +### 2. 加载兴趣聚合索引 | ||
| 158 | + | ||
| 159 | +```python | ||
| 160 | +def load_interest_index(file_path, list_type, redis_client, expire_seconds=259200): | ||
| 161 | + """ | ||
| 162 | + 加载兴趣点聚合索引到Redis | ||
| 163 | + | ||
| 164 | + Args: | ||
| 165 | + file_path: 索引文件路径 | ||
| 166 | + list_type: 列表类型(hot, cart, new, global) | ||
| 167 | + redis_client: Redis客户端 | ||
| 168 | + expire_seconds: 过期时间(秒),默认3天 | ||
| 169 | + """ | ||
| 170 | + import json | ||
| 171 | + | ||
| 172 | + count = 0 | ||
| 173 | + with open(file_path, 'r', encoding='utf-8') as f: | ||
| 174 | + for line in f: | ||
| 175 | + parts = line.strip().split('\t') | ||
| 176 | + if len(parts) != 2: | ||
| 177 | + continue | ||
| 178 | + | ||
| 179 | + dimension_key = parts[0] # platform:pc | ||
| 180 | + item_ids_str = parts[1] # 12345,67890,11223,... | ||
| 181 | + | ||
| 182 | + # 解析商品ID列表 | ||
| 183 | + item_ids = [int(item_id) for item_id in item_ids_str.split(',')] | ||
| 184 | + | ||
| 185 | + # 存储到Redis | ||
| 186 | + redis_key = f"interest:{list_type}:{dimension_key}" | ||
| 187 | + redis_value = json.dumps(item_ids) | ||
| 188 | + | ||
| 189 | + redis_client.set(redis_key, redis_value) | ||
| 190 | + redis_client.expire(redis_key, expire_seconds) | ||
| 191 | + | ||
| 192 | + count += 1 | ||
| 193 | + | ||
| 194 | + return count | ||
| 195 | +``` | ||
| 196 | + | ||
| 197 | +## 🚀 快速加载命令 | ||
| 198 | + | ||
| 199 | +### 加载所有索引 | ||
| 200 | +```bash | ||
| 201 | +cd /home/tw/recommendation/offline_tasks | ||
| 202 | + | ||
| 203 | +# 加载所有索引(使用今天的数据) | ||
| 204 | +python3 scripts/load_index_to_redis.py --redis-host localhost --redis-port 6379 | ||
| 205 | + | ||
| 206 | +# 加载指定日期的索引 | ||
| 207 | +python3 scripts/load_index_to_redis.py --date 20251016 --redis-host localhost | ||
| 208 | + | ||
| 209 | +# 只加载i2i索引 | ||
| 210 | +python3 scripts/load_index_to_redis.py --load-i2i --redis-host localhost | ||
| 211 | + | ||
| 212 | +# 只加载兴趣聚合索引 | ||
| 213 | +python3 scripts/load_index_to_redis.py --load-interest --redis-host localhost | ||
| 214 | +``` | ||
| 215 | + | ||
| 216 | +### 验证数据 | ||
| 217 | +```bash | ||
| 218 | +# 连接Redis | ||
| 219 | +redis-cli | ||
| 220 | + | ||
| 221 | +# 检查key数量 | ||
| 222 | +DBSIZE | ||
| 223 | + | ||
| 224 | +# 查看某个商品的相似推荐 | ||
| 225 | +GET item:similar:swing:12345 | ||
| 226 | + | ||
| 227 | +# 查看平台热门商品 | ||
| 228 | +GET interest:hot:platform:pc | ||
| 229 | + | ||
| 230 | +# 查看所有i2i相关的key | ||
| 231 | +KEYS item:similar:* | ||
| 232 | + | ||
| 233 | +# 查看所有interest相关的key | ||
| 234 | +KEYS interest:* | ||
| 235 | + | ||
| 236 | +# 检查key的过期时间 | ||
| 237 | +TTL item:similar:swing:12345 | ||
| 238 | +``` | ||
| 239 | + | ||
| 240 | +## 📊 数据统计 | ||
| 241 | + | ||
| 242 | +### Redis内存占用估算 | ||
| 243 | + | ||
| 244 | +| 索引类型 | Key数量 | 单条Value大小 | 总内存 | | ||
| 245 | +|---------|--------|-------------|--------| | ||
| 246 | +| i2i_swing | 50,000 | ~500B | ~25MB | | ||
| 247 | +| i2i_w2v | 50,000 | ~500B | ~25MB | | ||
| 248 | +| i2i_deepwalk | 50,000 | ~500B | ~25MB | | ||
| 249 | +| i2i_content | 50,000 | ~500B | ~25MB | | ||
| 250 | +| interest_hot | 10,000 | ~1KB | ~10MB | | ||
| 251 | +| interest_cart | 10,000 | ~1KB | ~10MB | | ||
| 252 | +| interest_new | 5,000 | ~1KB | ~5MB | | ||
| 253 | +| interest_global | 10,000 | ~1KB | ~10MB | | ||
| 254 | +| **总计** | **235,000** | - | **~135MB** | | ||
| 255 | + | ||
| 256 | +### 过期策略 | ||
| 257 | + | ||
| 258 | +| 索引类型 | TTL | 原因 | | ||
| 259 | +|---------|-----|------| | ||
| 260 | +| i2i行为相似 | 7天 | 用户行为变化快,需要频繁更新 | | ||
| 261 | +| i2i内容相似 | 30天 | 商品属性变化慢,可以保留更久 | | ||
| 262 | +| 热门/加购 | 3天 | 热度变化快,需要及时更新 | | ||
| 263 | +| 新品 | 3天 | 新品概念有时效性 | | ||
| 264 | +| 全局热门 | 7天 | 相对稳定,可以保留更久 | | ||
| 265 | + | ||
| 266 | +## ⚠️ 注意事项 | ||
| 267 | + | ||
| 268 | +1. **批量写入**: 使用Pipeline批量写入,提高性能(Pipeline本身不保证原子性,需要原子性时请使用MULTI/EXEC事务) | ||
| 269 | +2. **过期时间**: 合理设置TTL(应大于调度周期),避免旧数据长期残留,也避免key在下次灌入前过期 | ||
| 270 | +3. **内存管理**: 定期清理过期key,监控内存使用 | ||
| 271 | +4. **数据版本**: 使用日期标记,支持数据回滚 | ||
| 272 | +5. **容错处理**: 加载失败时不影响线上服务 | ||
| 273 | +6. **监控告警**: 监控加载成功率、Redis内存、查询延迟 | ||
| 274 | + | ||
| 275 | +## 🔍 监控指标 | ||
| 276 | + | ||
| 277 | +### 数据质量指标 | ||
| 278 | +```python | ||
| 279 | +# 检查加载成功率 | ||
| 280 | +total_keys = redis_client.dbsize() | ||
| 281 | +expected_keys = 235000 | ||
| 282 | +success_rate = total_keys / expected_keys * 100 | ||
| 283 | + | ||
| 284 | +# 检查数据完整性 | ||
| 285 | +sample_keys = [ | ||
| 286 | + 'item:similar:swing:12345', | ||
| 287 | + 'interest:hot:platform:pc' | ||
| 288 | +] | ||
| 289 | +for key in sample_keys: | ||
| 290 | + if not redis_client.exists(key): | ||
| 291 | + print(f"Missing key: {key}") | ||
| 292 | +``` | ||
| 293 | + | ||
| 294 | +### 性能指标 | ||
| 295 | +- 加载耗时: < 5分钟 | ||
| 296 | +- 内存占用: < 200MB | ||
| 297 | +- 查询延迟: < 1ms | ||
| 298 | +- 成功率: > 99% | ||
| 299 | + | ||
| 300 | +## 🔗 相关文档 | ||
| 301 | + | ||
| 302 | +- **离线索引规范**: `OFFLINE_INDEX_SPEC.md` | ||
| 303 | +- **API接口文档**: `RECOMMENDATION_API.md` | ||
| 304 | +- **运维手册**: `OPERATIONS.md` |
offline_tasks/run.sh
| 1 | cd /home/tw/recommendation/offline_tasks | 1 | cd /home/tw/recommendation/offline_tasks |
| 2 | 2 | ||
| 3 | -# 查看配置指南 | ||
| 4 | -cat UPDATE_CONFIG_GUIDE.md | 3 | +# # 查看配置指南 |
| 4 | +# cat UPDATE_CONFIG_GUIDE.md | ||
| 5 | 5 | ||
| 6 | -# 查看优化总结 | ||
| 7 | -cat ../CONFIG_CHANGES_SUMMARY.md | 6 | +# 2. 测试连接 |
| 7 | +python3 test_connection.py | ||
| 8 | 8 | ||
| 9 | -python3 run_all.py --lookback_days 7 --top_n 10 --debug > log.runall | ||
| 10 | \ No newline at end of file | 9 | \ No newline at end of file |
| 10 | +# 3. 调试模式运行(小数据量) | ||
| 11 | +python3 run_all.py --lookback_days 7 --top_n 10 --debug | ||
| 12 | + | ||
| 13 | +mv output output_debug | ||
| 14 | +mkdir output | ||
| 15 | + | ||
| 16 | +# 4. 生产模式运行(大数据量) | ||
| 17 | +python3 run_all.py --lookback_days 730 --top_n 50 | ||
| 18 | + | ||
| 19 | +# 5. 加载到Redis | ||
| 20 | +python3 scripts/load_index_to_redis.py --redis-host localhost |
offline_tasks/scripts/debug_utils.py
| @@ -278,7 +278,7 @@ def fetch_name_mappings(engine, debug=False): | @@ -278,7 +278,7 @@ def fetch_name_mappings(engine, debug=False): | ||
| 278 | 278 | ||
| 279 | try: | 279 | try: |
| 280 | # 获取商品名称 | 280 | # 获取商品名称 |
| 281 | - query = "SELECT id, name FROM prd_goods_sku WHERE status IN (2,4,5) LIMIT 100000" | 281 | + query = "SELECT id, name FROM prd_goods_sku WHERE status IN (2,4,5) LIMIT 5000000" |
| 282 | df = pd.read_sql(query, engine) | 282 | df = pd.read_sql(query, engine) |
| 283 | mappings['item'] = dict(zip(df['id'].astype(str), df['name'])) | 283 | mappings['item'] = dict(zip(df['id'].astype(str), df['name'])) |
| 284 | if debug: | 284 | if debug: |
| @@ -289,7 +289,7 @@ def fetch_name_mappings(engine, debug=False): | @@ -289,7 +289,7 @@ def fetch_name_mappings(engine, debug=False): | ||
| 289 | 289 | ||
| 290 | try: | 290 | try: |
| 291 | # 获取分类名称 | 291 | # 获取分类名称 |
| 292 | - query = "SELECT id, name FROM prd_category LIMIT 10000" | 292 | + query = "SELECT id, name FROM prd_category LIMIT 100000" |
| 293 | df = pd.read_sql(query, engine) | 293 | df = pd.read_sql(query, engine) |
| 294 | mappings['category'] = dict(zip(df['id'].astype(str), df['name'])) | 294 | mappings['category'] = dict(zip(df['id'].astype(str), df['name'])) |
| 295 | if debug: | 295 | if debug: |
| @@ -300,7 +300,7 @@ def fetch_name_mappings(engine, debug=False): | @@ -300,7 +300,7 @@ def fetch_name_mappings(engine, debug=False): | ||
| 300 | 300 | ||
| 301 | try: | 301 | try: |
| 302 | # 获取供应商名称 | 302 | # 获取供应商名称 |
| 303 | - query = "SELECT id, name FROM sup_supplier LIMIT 10000" | 303 | + query = "SELECT id, name FROM sup_supplier LIMIT 100000" |
| 304 | df = pd.read_sql(query, engine) | 304 | df = pd.read_sql(query, engine) |
| 305 | mappings['supplier'] = dict(zip(df['id'].astype(str), df['name'])) | 305 | mappings['supplier'] = dict(zip(df['id'].astype(str), df['name'])) |
| 306 | if debug: | 306 | if debug: |
offline_tasks/scripts/i2i_swing.py
| @@ -303,8 +303,8 @@ def main(): | @@ -303,8 +303,8 @@ def main(): | ||
| 303 | debug=args.debug | 303 | debug=args.debug |
| 304 | ) | 304 | ) |
| 305 | 305 | ||
| 306 | - # 创建item_id到name的映射 | ||
| 307 | - item_name_map = dict(zip(df['item_id'].unique(), df.groupby('item_id')['item_name'].first())) | 306 | + # 创建item_id到name的映射(key转为字符串,与name_mappings一致) |
| 307 | + item_name_map = dict(zip(df['item_id'].unique().astype(str), df.groupby('item_id')['item_name'].first())) | ||
| 308 | 308 | ||
| 309 | # 输出结果 | 309 | # 输出结果 |
| 310 | output_file = args.output or os.path.join(OUTPUT_DIR, f'i2i_swing_{datetime.now().strftime("%Y%m%d")}.txt') | 310 | output_file = args.output or os.path.join(OUTPUT_DIR, f'i2i_swing_{datetime.now().strftime("%Y%m%d")}.txt') |
| @@ -313,7 +313,8 @@ def main(): | @@ -313,7 +313,8 @@ def main(): | ||
| 313 | output_count = 0 | 313 | output_count = 0 |
| 314 | with open(output_file, 'w', encoding='utf-8') as f: | 314 | with open(output_file, 'w', encoding='utf-8') as f: |
| 315 | for item_id, sims in result.items(): | 315 | for item_id, sims in result.items(): |
| 316 | - item_name = item_name_map.get(item_id, 'Unknown') | 316 | + # item_name_map的key是字符串,需要转换 |
| 317 | + item_name = item_name_map.get(str(item_id), 'Unknown') | ||
| 317 | 318 | ||
| 318 | # 只取前N个最相似的商品 | 319 | # 只取前N个最相似的商品 |
| 319 | top_sims = sims[:args.top_n] | 320 | top_sims = sims[:args.top_n] |
| @@ -336,9 +337,13 @@ def main(): | @@ -336,9 +337,13 @@ def main(): | ||
| 336 | logger.debug("获取ID到名称的映射...") | 337 | logger.debug("获取ID到名称的映射...") |
| 337 | name_mappings = fetch_name_mappings(engine, debug=True) | 338 | name_mappings = fetch_name_mappings(engine, debug=True) |
| 338 | 339 | ||
| 339 | - # 准备索引数据(使用已有的item_name_map) | 340 | + # 准备索引数据(合并已有的item_name_map) |
| 341 | + # item_name_map的key已经是str类型,可以直接更新 | ||
| 340 | name_mappings['item'].update(item_name_map) | 342 | name_mappings['item'].update(item_name_map) |
| 341 | 343 | ||
| 344 | + if debug: | ||
| 345 | + logger.debug(f"name_mappings['item']共有 {len(name_mappings['item'])} 个商品名称") | ||
| 346 | + | ||
| 342 | index_data = {} | 347 | index_data = {} |
| 343 | for item_id, sims in result.items(): | 348 | for item_id, sims in result.items(): |
| 344 | top_sims = sims[:args.top_n] | 349 | top_sims = sims[:args.top_n] |