fix

tangwang
1 parent 0d5f0a82
Showing 3 changed files with 164 additions and 21 deletions Show diff stats
offline_tasks/CHANGES_SUMMARY.md
offline_tasks/doc/详细设计文档.md
offline_tasks/scripts/tag_category_similar.py
@@ -0,0 +1,102 @@
+# 更新总结 - 2025-10-22
+
+## 1. 修改 tag_category_similar.py 脚本
+
+### 主要变更
+- **SQL查询修改**：从只查询分类名称改为同时查询分类ID和名称
+  - 修改前：`` GROUP_CONCAT(pc_1.name) AS `商品信息` ``
+  - 修改后：`` GROUP_CONCAT(DISTINCT CONCAT(pc_1.id, ':', pc_1.name)) AS `商品信息` ``
+
+- **数据处理修改**：解析 `ID:名称` 对（按第一个 `:` 拆分，并去除首尾空白），建立ID到名称的映射；每个订单内按分类ID去重
+  ```python
+  cat_id_to_name = {}
+  for cat_pair in categories:
+      if ':' in cat_pair:
+          cat_id, cat_name = cat_pair.split(':', 1)
+          cat_id_to_name[cat_id] = cat_name
+          unique_cats.add(cat_id)
+  ```
+
+- **输出格式修改**：
+  - **主输出文件**（`output/tag_category_similar_YYYYMMDD.txt`）：只包含ID
+    - 格式：`category_id \t similar_id1:score1,similar_id2:score2,...`
+  - **Debug文件**（`output/debug/tag_category_similar_YYYYMMDD_readable.txt`）：包含ID+名称
+    - 格式：`category_id:category_name \t similar_id1:similar_name1:score1,...`
+
+### 输出示例
+
+**主文件（用于Redis加载）：**
+```
+123	456:0.8123,789:0.7654,234:0.6543
+```
+
+**Debug文件（便于人工查看）：**
+```
+================================================================================
+明文索引文件
+生成时间: 2025-10-22 HH:MM:SS
+描述: tag_category_similar (分类相似度)
+总索引数: 708
+================================================================================
+
+123:BB夹/一字夹	456:横夹:0.8123,789:橡皮筋/发圈:0.7654,234:抓夹:0.6543
+```
+
+## 2. 更新详细设计文档
+
+### 修改位置
+文件：`offline_tasks/doc/详细设计文档.md`
+章节：📊 数据量统计
+
+### 变更内容
+- 基于真实输出文件统计了实际数据量
+- 更新了所有索引类型的准确数量和大小
+- 添加了 `tag_category_similar` 索引统计
+- 更新了Redis内存占用预估（从180MB更新为400MB）
+
+### 实际数据统计（2025-10-22）
+
+| 索引类型 | 索引数量 | 单条平均大小 | 总大小 | 更新频率 |
+|---------|---------|---------|--------|---------|
+| i2i_deepwalk | 48,376 | ~780B | 36MB | 每天 |
+| i2i_session_w2v | 50,990 | ~840B | 41MB | 每天 |
+| i2i_content_name | 127,720 | ~830B | 101MB | 每周 |
+| i2i_content_pic | 0 | - | 0 | 每周 |
+| i2i_item_behavior | 178,775 | ~750B | 128MB | 每天 |
+| interest_hot | 14,001 | ~520B | 6.9MB | 每天 |
+| interest_cart | 15,563 | ~670B | 10MB | 每天 |
+| interest_new | 6,463 | ~500B | 3.1MB | 每天 |
+| interest_global | 17,533 | ~660B | 11MB | 每天 |
+| tag_category_similar | 708 | ~930B | 630KB | 每周 |
+| **总计** | **~460,000** | - | **~338MB** | - |
+
+## 3. 需要执行的操作
+
+要应用这些更改，需要重新运行脚本：
+
+```bash
+cd /home/tw/recommendation/offline_tasks
+python scripts/tag_category_similar.py --debug
+```
+
+这将生成（文件名中的日期取脚本运行当天，以下以 2025-10-22 为例；可读文件的写入不依赖 `--debug` 参数）：
+- `output/tag_category_similar_20251022.txt` - ID格式的主文件
+- `output/debug/tag_category_similar_20251022_readable.txt` - 可读格式的调试文件
+
+## 4. 影响分析
+
+### 向后兼容性
+- ⚠️ **破坏性变更**：主输出文件格式从名称改为ID
+- 需要更新所有使用该文件的下游系统（特别是Redis加载脚本）
+
+### 优势
+1. ✅ ID格式更稳定，不受名称变更影响
+2. ✅ 数据更准确，避免不同分类因名称相同而被合并统计的问题
+3. ✅ 保留可读版本便于调试和验证
+4. ✅ 与其他索引文件格式保持一致
+
+### 建议
+- 更新Redis加载脚本，使其能够处理category ID
+- 在API层做ID到名称的转换（如需要）
+- 保持debug文件生成，便于问题排查
+
@@ -615,24 +615,27 @@ crontab -e
 ## 📊 数据量统计
-### 索引数量估算
+### 索引数量统计（基于真实数据）
-| 索引类型 | 索引数量 | 单条大小 | 总大小 | 更新频率 |
+| 索引类型 | 索引数量 | 单条平均大小 | 总大小 | 更新频率 |
 |---------|---------|---------|--------|---------|
-| i2i_swing_cpp | ~50,000 | ~400B | ~20MB | 每天 |
-| i2i_swing | ~50,000 | ~500B | ~25MB | 每天 |
-| i2i_session_w2v | ~50,000 | ~500B | ~25MB | 每天 |
-| i2i_deepwalk | ~50,000 | ~500B | ~25MB | 每天 |
-| i2i_content | ~50,000 | ~500B | ~25MB | 每周 |
-| interest_hot | ~10,000 | ~1KB | ~10MB | 每天 |
-| interest_cart | ~10,000 | ~1KB | ~10MB | 每天 |
-| interest_new | ~5,000 | ~1KB | ~5MB | 每天 |
-| interest_global | ~10,000 | ~1KB | ~10MB | 每天 |
-| **总计** | **~295,000** | - | **~155MB** | - |
+| i2i_deepwalk | 48,376 | ~780B | 36MB | 每天 |
+| i2i_session_w2v | 50,990 | ~840B | 41MB | 每天 |
+| i2i_content_name | 127,720 | ~830B | 101MB | 每周 |
+| i2i_content_pic | 0 | - | 0 | 每周 |
+| i2i_item_behavior | 178,775 | ~750B | 128MB | 每天 |
+| interest_hot | 14,001 | ~520B | 6.9MB | 每天 |
+| interest_cart | 15,563 | ~670B | 10MB | 每天 |
+| interest_new | 6,463 | ~500B | 3.1MB | 每天 |
+| interest_global | 17,533 | ~660B | 11MB | 每天 |
+| tag_category_similar | 708 | ~930B | 630KB | 每周 |
+| **总计** | **~460,000** | - | **~338MB** | - |
+
+> 注：统计数据基于 2025-10-22 的实际输出文件
 ### Redis内存占用
-加载到Redis后的内存占用约 **180MB**（包含key开销）
+加载到Redis后的内存占用约 **400MB**（包含key开销和Redis数据结构开销）
 ---
@@ -35,7 +35,7 @@ SELECT
   sp.code AS `PO单号`,
   psm.name AS `区域`,
   bb.code AS `客户编码`,
-  GROUP_CONCAT(pc_1.name) AS `商品信息`,
+  GROUP_CONCAT(DISTINCT CONCAT(pc_1.id, ':', pc_1.name)) AS `商品信息`,
   MIN(spi.order_time) AS `下单货时间`
 FROM sale_po sp
 INNER JOIN sale_po_item spi ON sp.id = spi.po_id
@@ -62,6 +62,8 @@ if args.debug:
     print(f"[DEBUG] 查询完成，共 {len(df)} 条订单记录")
 # 处理商品信息，分割并去重
+# 构建ID到名称的映射
+cat_id_to_name = {}
 cooccur = defaultdict(lambda: defaultdict(int))
 freq = defaultdict(int)
@@ -70,7 +72,15 @@ for _, row in df.iterrows():
     if pd.isna(row['商品信息']):
         continue
     categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]
-    unique_cats = set(categories)
+    unique_cats = set()
+    for cat_pair in categories:
+        if ':' in cat_pair:
+            cat_id, cat_name = cat_pair.split(':', 1)
+            cat_id = cat_id.strip()
+            cat_name = cat_name.strip()
+            cat_id_to_name[cat_id] = cat_name
+            unique_cats.add(cat_id)
+    
     for c1 in unique_cats:
         freq[c1] += 1
         for c2 in unique_cats:
@@ -105,22 +115,50 @@ if args.debug:
 # 准备输出
 date_str = datetime.now().strftime('%Y%m%d')
 output_dir = 'output'
+debug_dir = os.path.join(output_dir, 'debug')
 os.makedirs(output_dir, exist_ok=True)
+os.makedirs(debug_dir, exist_ok=True)
 output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt')
+debug_file = os.path.join(debug_dir, f'tag_category_similar_{date_str}_readable.txt')
-# 输出相似分类到文件
+# 输出相似分类到文件（ID格式）
 if args.debug:
     print(f"[DEBUG] 开始写入文件: {output_file}")
 with open(output_file, 'w', encoding='utf-8') as f:
-    for cat, sims in sorted(result.items()):
-        cat_clean = clean_text_field(cat)
-        # 格式: category_name \t similar_cat1:score1,similar_cat2:score2,...
-        sim_str = ','.join([f'{clean_text_field(sim_cat)}:{score:.4f}' for sim_cat, score in sims])
-        f.write(f'{cat_clean}\t{sim_str}\n')
+    for cat_id, sims in sorted(result.items()):
+        # 格式: category_id \t similar_id1:score1,similar_id2:score2,...
+        sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
+        f.write(f'{cat_id}\t{sim_str}\n')
+
+# 输出可读版本到debug目录（ID+名称格式）
+if args.debug:
+    print(f"[DEBUG] 开始写入可读文件: {debug_file}")
+
+with open(debug_file, 'w', encoding='utf-8') as f:
+    # 写入文件头信息
+    f.write('='*80 + '\n')
+    f.write('明文索引文件\n')
+    f.write(f'生成时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')
+    f.write(f'描述: tag_category_similar (分类相似度)\n')
+    f.write(f'总索引数: {len(result)}\n')
+    f.write('='*80 + '\n\n')
+    
+    for cat_id, sims in sorted(result.items()):
+        cat_name = cat_id_to_name.get(cat_id, 'Unknown')
+        cat_clean = clean_text_field(cat_name)
+        # 格式: category_id:category_name \t similar_id1:similar_name1:score1,...
+        sim_parts = []
+        for sim_id, score in sims:
+            sim_name = cat_id_to_name.get(sim_id, 'Unknown')
+            sim_clean = clean_text_field(sim_name)
+            sim_parts.append(f'{sim_id}:{sim_clean}:{score:.4f}')
+        sim_str = ','.join(sim_parts)
+        f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n')
 print(f"✓ Tag相似度计算完成")
 print(f"  - 输出文件: {output_file}")
+print(f"  - 可读文件: {debug_file}")
 print(f"  - 分类数: {len(result)}")
 if result:
     avg_sims = sum(len(sims) for sims in result.values()) / len(result)