diff --git a/offline_tasks/scripts/tag_category_similar.py b/offline_tasks/scripts/tag_category_similar.py index 2f40f9f..58bbe93 100644 --- a/offline_tasks/scripts/tag_category_similar.py +++ b/offline_tasks/scripts/tag_category_similar.py @@ -6,6 +6,7 @@ from sqlalchemy import create_engine from db_service import create_db_connection import argparse from datetime import datetime +import json def clean_text_field(text): if pd.isna(text): @@ -64,6 +65,19 @@ if 'category_id' in df.columns: if 'supplier_id' in df.columns: df['supplier_id'] = df['supplier_id'].astype(int) +# 统计信息收集 +stats = { + 'start_time': datetime.now(), + 'total_orders': len(df), + 'unique_regions': 0, + 'unique_customers': 0, + 'total_categories': 0, + 'categories_with_similarities': 0, + 'total_similarity_pairs': 0, + 'avg_similarities_per_category': 0, + 'file_stats': {} +} + if args.debug: print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") @@ -73,6 +87,10 @@ cat_id_to_name = {} cooccur = defaultdict(lambda: defaultdict(int)) freq = defaultdict(int) +# 统计唯一区域和客户 +stats['unique_regions'] = df['区域'].nunique() if '区域' in df.columns else 0 +stats['unique_customers'] = df['客户编码'].nunique() if '客户编码' in df.columns else 0 + for _, row in df.iterrows(): # Handle None values in 商品信息 if pd.isna(row['商品信息']): @@ -93,11 +111,15 @@ for _, row in df.iterrows(): if c1 != c2: cooccur[c1][c2] += 1 +# 更新统计信息 +stats['total_categories'] = len(freq) + # 计算余弦相似度 if args.debug: print(f"[DEBUG] 开始计算分类相似度...") result = {} +total_similarity_pairs = 0 for c1 in cooccur: sim_scores = [] for c2 in cooccur[c1]: @@ -109,6 +131,12 @@ for c1 in cooccur: sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 # 只保留top_n个相似分类 result[c1] = sim_scores[:args.top_n] + total_similarity_pairs += len(sim_scores[:args.top_n]) + +# 更新统计信息 +stats['categories_with_similarities'] = len(result) +stats['total_similarity_pairs'] = total_similarity_pairs +stats['avg_similarities_per_category'] = total_similarity_pairs / len(result) if result else 0 if args.debug: print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") @@ -137,6 +165,15 @@ with open(output_file, 'w', encoding='utf-8') as f: sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) f.write(f'{cat_id}\t{sim_str}\n') +# 获取文件统计信息 +if os.path.exists(output_file): + file_size = os.path.getsize(output_file) + stats['file_stats']['output_file'] = { + 'path': output_file, + 'size_bytes': file_size, + 'size_mb': round(file_size / (1024 * 1024), 2) + } + # 输出可读版本到debug目录(ID+名称格式) if args.debug: print(f"[DEBUG] 开始写入可读文件: {debug_file}") @@ -162,10 +199,43 @@ with open(debug_file, 'w', encoding='utf-8') as f: sim_str = ','.join(sim_parts) f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') -print(f"✓ Tag相似度计算完成") +# 获取debug文件统计信息 +if os.path.exists(debug_file): + file_size = os.path.getsize(debug_file) + stats['file_stats']['debug_file'] = { + 'path': debug_file, + 'size_bytes': file_size, + 'size_mb': round(file_size / (1024 * 1024), 2) + } + +# 计算处理时间 +stats['end_time'] = datetime.now() +stats['processing_time_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds() + +# 输出统计信息 +print("\n" + "="*80) +print("Tag分类相似度计算 - 关键统计信息") +print("="*80) +print(f"📊 数据概览:") +print(f" - 总订单数: {stats['total_orders']:,}") +print(f" - 唯一区域数: {stats['unique_regions']:,}") +print(f" - 唯一客户数: {stats['unique_customers']:,}") +print(f" - 总分类数: {stats['total_categories']:,}") +print(f" - 有相似度的分类数: {stats['categories_with_similarities']:,}") +print(f" - 总相似度对数量: {stats['total_similarity_pairs']:,}") +print(f" - 平均每分类相似数: {stats['avg_similarities_per_category']:.1f}") + +print(f"\n📁 输出文件:") +for file_type, file_info in stats['file_stats'].items(): + print(f" - {file_type}: {file_info['path']}") + print(f" 大小: {file_info['size_mb']} MB ({file_info['size_bytes']:,} bytes)") + +print(f"\n⏱️ 处理时间:") +print(f" - 开始时间: {stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}") +print(f" - 结束时间: {stats['end_time'].strftime('%Y-%m-%d %H:%M:%S')}") +print(f" - 总耗时: {stats['processing_time_seconds']:.2f} 秒") + +print(f"\n✅ Tag相似度计算完成") print(f" - 输出文件: {output_file}") print(f" - 可读文件: {debug_file}") -print(f" - 分类数: {len(result)}") -if result: - avg_sims = sum(len(sims) for sims in result.values()) / len(result) - print(f" - 平均相似分类数: {avg_sims:.1f}") +print("="*80) -- libgit2 0.21.2