Commit 29ce002b505d4e28c2e0883b215c193aadcd3651 (1 parent: 1557b026)
add logs
Showing 1 changed file with 75 additions and 5 deletions
Show diff stats
offline_tasks/scripts/tag_category_similar.py
| ... | ... | @@ -6,6 +6,7 @@ from sqlalchemy import create_engine |
| 6 | 6 | from db_service import create_db_connection |
| 7 | 7 | import argparse |
| 8 | 8 | from datetime import datetime |
| 9 | +import json | |
| 9 | 10 | |
| 10 | 11 | def clean_text_field(text): |
| 11 | 12 | if pd.isna(text): |
| ... | ... | @@ -64,6 +65,19 @@ if 'category_id' in df.columns: |
| 64 | 65 | if 'supplier_id' in df.columns: |
| 65 | 66 | df['supplier_id'] = df['supplier_id'].astype(int) |
| 66 | 67 | |
| 68 | +# 统计信息收集 | |
| 69 | +stats = { | |
| 70 | + 'start_time': datetime.now(), | |
| 71 | + 'total_orders': len(df), | |
| 72 | + 'unique_regions': 0, | |
| 73 | + 'unique_customers': 0, | |
| 74 | + 'total_categories': 0, | |
| 75 | + 'categories_with_similarities': 0, | |
| 76 | + 'total_similarity_pairs': 0, | |
| 77 | + 'avg_similarities_per_category': 0, | |
| 78 | + 'file_stats': {} | |
| 79 | +} | |
| 80 | + | |
| 67 | 81 | if args.debug: |
| 68 | 82 | print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") |
| 69 | 83 | |
| ... | ... | @@ -73,6 +87,10 @@ cat_id_to_name = {} |
| 73 | 87 | cooccur = defaultdict(lambda: defaultdict(int)) |
| 74 | 88 | freq = defaultdict(int) |
| 75 | 89 | |
| 90 | +# 统计唯一区域和客户 | |
| 91 | +stats['unique_regions'] = df['区域'].nunique() if '区域' in df.columns else 0 | |
| 92 | +stats['unique_customers'] = df['客户编码'].nunique() if '客户编码' in df.columns else 0 | |
| 93 | + | |
| 76 | 94 | for _, row in df.iterrows(): |
| 77 | 95 | # Handle None values in 商品信息 |
| 78 | 96 | if pd.isna(row['商品信息']): |
| ... | ... | @@ -93,11 +111,15 @@ for _, row in df.iterrows(): |
| 93 | 111 | if c1 != c2: |
| 94 | 112 | cooccur[c1][c2] += 1 |
| 95 | 113 | |
| 114 | +# 更新统计信息 | |
| 115 | +stats['total_categories'] = len(freq) | |
| 116 | + | |
| 96 | 117 | # 计算余弦相似度 |
| 97 | 118 | if args.debug: |
| 98 | 119 | print(f"[DEBUG] 开始计算分类相似度...") |
| 99 | 120 | |
| 100 | 121 | result = {} |
| 122 | +total_similarity_pairs = 0 | |
| 101 | 123 | for c1 in cooccur: |
| 102 | 124 | sim_scores = [] |
| 103 | 125 | for c2 in cooccur[c1]: |
| ... | ... | @@ -109,6 +131,12 @@ for c1 in cooccur: |
| 109 | 131 | sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 |
| 110 | 132 | # 只保留top_n个相似分类 |
| 111 | 133 | result[c1] = sim_scores[:args.top_n] |
| 134 | + total_similarity_pairs += len(sim_scores[:args.top_n]) | |
| 135 | + | |
| 136 | +# 更新统计信息 | |
| 137 | +stats['categories_with_similarities'] = len(result) | |
| 138 | +stats['total_similarity_pairs'] = total_similarity_pairs | |
| 139 | +stats['avg_similarities_per_category'] = total_similarity_pairs / len(result) if result else 0 | |
| 112 | 140 | |
| 113 | 141 | if args.debug: |
| 114 | 142 | print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") |
| ... | ... | @@ -137,6 +165,15 @@ with open(output_file, 'w', encoding='utf-8') as f: |
| 137 | 165 | sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) |
| 138 | 166 | f.write(f'{cat_id}\t{sim_str}\n') |
| 139 | 167 | |
| 168 | +# 获取文件统计信息 | |
| 169 | +if os.path.exists(output_file): | |
| 170 | + file_size = os.path.getsize(output_file) | |
| 171 | + stats['file_stats']['output_file'] = { | |
| 172 | + 'path': output_file, | |
| 173 | + 'size_bytes': file_size, | |
| 174 | + 'size_mb': round(file_size / (1024 * 1024), 2) | |
| 175 | + } | |
| 176 | + | |
| 140 | 177 | # 输出可读版本到debug目录(ID+名称格式) |
| 141 | 178 | if args.debug: |
| 142 | 179 | print(f"[DEBUG] 开始写入可读文件: {debug_file}") |
| ... | ... | @@ -162,10 +199,43 @@ with open(debug_file, 'w', encoding='utf-8') as f: |
| 162 | 199 | sim_str = ','.join(sim_parts) |
| 163 | 200 | f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') |
| 164 | 201 | |
| 165 | -print(f"✓ Tag相似度计算完成") | |
| 202 | +# 获取debug文件统计信息 | |
| 203 | +if os.path.exists(debug_file): | |
| 204 | + file_size = os.path.getsize(debug_file) | |
| 205 | + stats['file_stats']['debug_file'] = { | |
| 206 | + 'path': debug_file, | |
| 207 | + 'size_bytes': file_size, | |
| 208 | + 'size_mb': round(file_size / (1024 * 1024), 2) | |
| 209 | + } | |
| 210 | + | |
| 211 | +# 计算处理时间 | |
| 212 | +stats['end_time'] = datetime.now() | |
| 213 | +stats['processing_time_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds() | |
| 214 | + | |
| 215 | +# 输出统计信息 | |
| 216 | +print("\n" + "="*80) | |
| 217 | +print("Tag分类相似度计算 - 关键统计信息") | |
| 218 | +print("="*80) | |
| 219 | +print(f"📊 数据概览:") | |
| 220 | +print(f" - 总订单数: {stats['total_orders']:,}") | |
| 221 | +print(f" - 唯一区域数: {stats['unique_regions']:,}") | |
| 222 | +print(f" - 唯一客户数: {stats['unique_customers']:,}") | |
| 223 | +print(f" - 总分类数: {stats['total_categories']:,}") | |
| 224 | +print(f" - 有相似度的分类数: {stats['categories_with_similarities']:,}") | |
| 225 | +print(f" - 总相似度对数量: {stats['total_similarity_pairs']:,}") | |
| 226 | +print(f" - 平均每分类相似数: {stats['avg_similarities_per_category']:.1f}") | |
| 227 | + | |
| 228 | +print(f"\n📁 输出文件:") | |
| 229 | +for file_type, file_info in stats['file_stats'].items(): | |
| 230 | + print(f" - {file_type}: {file_info['path']}") | |
| 231 | + print(f" 大小: {file_info['size_mb']} MB ({file_info['size_bytes']:,} bytes)") | |
| 232 | + | |
| 233 | +print(f"\n⏱️ 处理时间:") | |
| 234 | +print(f" - 开始时间: {stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}") | |
| 235 | +print(f" - 结束时间: {stats['end_time'].strftime('%Y-%m-%d %H:%M:%S')}") | |
| 236 | +print(f" - 总耗时: {stats['processing_time_seconds']:.2f} 秒") | |
| 237 | + | |
| 238 | +print(f"\n✅ Tag相似度计算完成") | |
| 166 | 239 | print(f" - 输出文件: {output_file}") |
| 167 | 240 | print(f" - 可读文件: {debug_file}") |
| 168 | -print(f" - 分类数: {len(result)}") | |
| 169 | -if result: | |
| 170 | - avg_sims = sum(len(sims) for sims in result.values()) / len(result) | |
| 171 | - print(f" - 平均相似分类数: {avg_sims:.1f}") | |
| 241 | +print("="*80) | ... | ... |