Commit 29ce002b505d4e28c2e0883b215c193aadcd3651
1 parent: 1557b026
add logs
Showing 1 changed file with 75 additions and 5 deletions
Show diff stats
offline_tasks/scripts/tag_category_similar.py
| @@ -6,6 +6,7 @@ from sqlalchemy import create_engine | @@ -6,6 +6,7 @@ from sqlalchemy import create_engine | ||
| 6 | from db_service import create_db_connection | 6 | from db_service import create_db_connection |
| 7 | import argparse | 7 | import argparse |
| 8 | from datetime import datetime | 8 | from datetime import datetime |
| 9 | +import json | ||
| 9 | 10 | ||
| 10 | def clean_text_field(text): | 11 | def clean_text_field(text): |
| 11 | if pd.isna(text): | 12 | if pd.isna(text): |
| @@ -64,6 +65,19 @@ if 'category_id' in df.columns: | @@ -64,6 +65,19 @@ if 'category_id' in df.columns: | ||
| 64 | if 'supplier_id' in df.columns: | 65 | if 'supplier_id' in df.columns: |
| 65 | df['supplier_id'] = df['supplier_id'].astype(int) | 66 | df['supplier_id'] = df['supplier_id'].astype(int) |
| 66 | 67 | ||
| 68 | +# 统计信息收集 | ||
| 69 | +stats = { | ||
| 70 | + 'start_time': datetime.now(), | ||
| 71 | + 'total_orders': len(df), | ||
| 72 | + 'unique_regions': 0, | ||
| 73 | + 'unique_customers': 0, | ||
| 74 | + 'total_categories': 0, | ||
| 75 | + 'categories_with_similarities': 0, | ||
| 76 | + 'total_similarity_pairs': 0, | ||
| 77 | + 'avg_similarities_per_category': 0, | ||
| 78 | + 'file_stats': {} | ||
| 79 | +} | ||
| 80 | + | ||
| 67 | if args.debug: | 81 | if args.debug: |
| 68 | print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") | 82 | print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") |
| 69 | 83 | ||
| @@ -73,6 +87,10 @@ cat_id_to_name = {} | @@ -73,6 +87,10 @@ cat_id_to_name = {} | ||
| 73 | cooccur = defaultdict(lambda: defaultdict(int)) | 87 | cooccur = defaultdict(lambda: defaultdict(int)) |
| 74 | freq = defaultdict(int) | 88 | freq = defaultdict(int) |
| 75 | 89 | ||
| 90 | +# 统计唯一区域和客户 | ||
| 91 | +stats['unique_regions'] = df['区域'].nunique() if '区域' in df.columns else 0 | ||
| 92 | +stats['unique_customers'] = df['客户编码'].nunique() if '客户编码' in df.columns else 0 | ||
| 93 | + | ||
| 76 | for _, row in df.iterrows(): | 94 | for _, row in df.iterrows(): |
| 77 | # Handle None values in 商品信息 | 95 | # Handle None values in 商品信息 |
| 78 | if pd.isna(row['商品信息']): | 96 | if pd.isna(row['商品信息']): |
| @@ -93,11 +111,15 @@ for _, row in df.iterrows(): | @@ -93,11 +111,15 @@ for _, row in df.iterrows(): | ||
| 93 | if c1 != c2: | 111 | if c1 != c2: |
| 94 | cooccur[c1][c2] += 1 | 112 | cooccur[c1][c2] += 1 |
| 95 | 113 | ||
| 114 | +# 更新统计信息 | ||
| 115 | +stats['total_categories'] = len(freq) | ||
| 116 | + | ||
| 96 | # 计算余弦相似度 | 117 | # 计算余弦相似度 |
| 97 | if args.debug: | 118 | if args.debug: |
| 98 | print(f"[DEBUG] 开始计算分类相似度...") | 119 | print(f"[DEBUG] 开始计算分类相似度...") |
| 99 | 120 | ||
| 100 | result = {} | 121 | result = {} |
| 122 | +total_similarity_pairs = 0 | ||
| 101 | for c1 in cooccur: | 123 | for c1 in cooccur: |
| 102 | sim_scores = [] | 124 | sim_scores = [] |
| 103 | for c2 in cooccur[c1]: | 125 | for c2 in cooccur[c1]: |
| @@ -109,6 +131,12 @@ for c1 in cooccur: | @@ -109,6 +131,12 @@ for c1 in cooccur: | ||
| 109 | sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 | 131 | sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 |
| 110 | # 只保留top_n个相似分类 | 132 | # 只保留top_n个相似分类 |
| 111 | result[c1] = sim_scores[:args.top_n] | 133 | result[c1] = sim_scores[:args.top_n] |
| 134 | + total_similarity_pairs += len(sim_scores[:args.top_n]) | ||
| 135 | + | ||
| 136 | +# 更新统计信息 | ||
| 137 | +stats['categories_with_similarities'] = len(result) | ||
| 138 | +stats['total_similarity_pairs'] = total_similarity_pairs | ||
| 139 | +stats['avg_similarities_per_category'] = total_similarity_pairs / len(result) if result else 0 | ||
| 112 | 140 | ||
| 113 | if args.debug: | 141 | if args.debug: |
| 114 | print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") | 142 | print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") |
| @@ -137,6 +165,15 @@ with open(output_file, 'w', encoding='utf-8') as f: | @@ -137,6 +165,15 @@ with open(output_file, 'w', encoding='utf-8') as f: | ||
| 137 | sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) | 165 | sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) |
| 138 | f.write(f'{cat_id}\t{sim_str}\n') | 166 | f.write(f'{cat_id}\t{sim_str}\n') |
| 139 | 167 | ||
| 168 | +# 获取文件统计信息 | ||
| 169 | +if os.path.exists(output_file): | ||
| 170 | + file_size = os.path.getsize(output_file) | ||
| 171 | + stats['file_stats']['output_file'] = { | ||
| 172 | + 'path': output_file, | ||
| 173 | + 'size_bytes': file_size, | ||
| 174 | + 'size_mb': round(file_size / (1024 * 1024), 2) | ||
| 175 | + } | ||
| 176 | + | ||
| 140 | # 输出可读版本到debug目录(ID+名称格式) | 177 | # 输出可读版本到debug目录(ID+名称格式) |
| 141 | if args.debug: | 178 | if args.debug: |
| 142 | print(f"[DEBUG] 开始写入可读文件: {debug_file}") | 179 | print(f"[DEBUG] 开始写入可读文件: {debug_file}") |
| @@ -162,10 +199,43 @@ with open(debug_file, 'w', encoding='utf-8') as f: | @@ -162,10 +199,43 @@ with open(debug_file, 'w', encoding='utf-8') as f: | ||
| 162 | sim_str = ','.join(sim_parts) | 199 | sim_str = ','.join(sim_parts) |
| 163 | f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') | 200 | f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') |
| 164 | 201 | ||
| 165 | -print(f"✓ Tag相似度计算完成") | 202 | +# 获取debug文件统计信息 |
| 203 | +if os.path.exists(debug_file): | ||
| 204 | + file_size = os.path.getsize(debug_file) | ||
| 205 | + stats['file_stats']['debug_file'] = { | ||
| 206 | + 'path': debug_file, | ||
| 207 | + 'size_bytes': file_size, | ||
| 208 | + 'size_mb': round(file_size / (1024 * 1024), 2) | ||
| 209 | + } | ||
| 210 | + | ||
| 211 | +# 计算处理时间 | ||
| 212 | +stats['end_time'] = datetime.now() | ||
| 213 | +stats['processing_time_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds() | ||
| 214 | + | ||
| 215 | +# 输出统计信息 | ||
| 216 | +print("\n" + "="*80) | ||
| 217 | +print("Tag分类相似度计算 - 关键统计信息") | ||
| 218 | +print("="*80) | ||
| 219 | +print(f"📊 数据概览:") | ||
| 220 | +print(f" - 总订单数: {stats['total_orders']:,}") | ||
| 221 | +print(f" - 唯一区域数: {stats['unique_regions']:,}") | ||
| 222 | +print(f" - 唯一客户数: {stats['unique_customers']:,}") | ||
| 223 | +print(f" - 总分类数: {stats['total_categories']:,}") | ||
| 224 | +print(f" - 有相似度的分类数: {stats['categories_with_similarities']:,}") | ||
| 225 | +print(f" - 总相似度对数量: {stats['total_similarity_pairs']:,}") | ||
| 226 | +print(f" - 平均每分类相似数: {stats['avg_similarities_per_category']:.1f}") | ||
| 227 | + | ||
| 228 | +print(f"\n📁 输出文件:") | ||
| 229 | +for file_type, file_info in stats['file_stats'].items(): | ||
| 230 | + print(f" - {file_type}: {file_info['path']}") | ||
| 231 | + print(f" 大小: {file_info['size_mb']} MB ({file_info['size_bytes']:,} bytes)") | ||
| 232 | + | ||
| 233 | +print(f"\n⏱️ 处理时间:") | ||
| 234 | +print(f" - 开始时间: {stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 235 | +print(f" - 结束时间: {stats['end_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 236 | +print(f" - 总耗时: {stats['processing_time_seconds']:.2f} 秒") | ||
| 237 | + | ||
| 238 | +print(f"\n✅ Tag相似度计算完成") | ||
| 166 | print(f" - 输出文件: {output_file}") | 239 | print(f" - 输出文件: {output_file}") |
| 167 | print(f" - 可读文件: {debug_file}") | 240 | print(f" - 可读文件: {debug_file}") |
| 168 | -print(f" - 分类数: {len(result)}") | ||
| 169 | -if result: | ||
| 170 | - avg_sims = sum(len(sims) for sims in result.values()) / len(result) | ||
| 171 | - print(f" - 平均相似分类数: {avg_sims:.1f}") | 241 | +print("="*80) |