Commit 789edb145a135790338384fc1ca6eff6bb5c727c
1 parent: 29ce002b
Commit message: add logs
Showing 4 changed files with 126 additions and 39 deletions
Show diff stats
config.py
offline_tasks/scripts/i2i_item_behavior.py
| @@ -7,6 +7,41 @@ from db_service import create_db_connection | @@ -7,6 +7,41 @@ from db_service import create_db_connection | ||
| 7 | import argparse | 7 | import argparse |
| 8 | from datetime import datetime | 8 | from datetime import datetime |
| 9 | from scripts.debug_utils import save_readable_index | 9 | from scripts.debug_utils import save_readable_index |
| 10 | +import logging | ||
| 11 | + | ||
| 12 | +def setup_logger(): | ||
| 13 | + """设置logger配置""" | ||
| 14 | + # 创建logs目录 | ||
| 15 | + logs_dir = 'logs' | ||
| 16 | + os.makedirs(logs_dir, exist_ok=True) | ||
| 17 | + | ||
| 18 | + # 创建logger | ||
| 19 | + logger = logging.getLogger('i2i_item_behavior') | ||
| 20 | + logger.setLevel(logging.INFO) | ||
| 21 | + | ||
| 22 | + # 避免重复添加handler | ||
| 23 | + if logger.handlers: | ||
| 24 | + return logger | ||
| 25 | + | ||
| 26 | + # 创建文件handler | ||
| 27 | + log_file = os.path.join(logs_dir, f'i2i_item_behavior_{datetime.now().strftime("%Y%m%d")}.log') | ||
| 28 | + file_handler = logging.FileHandler(log_file, encoding='utf-8') | ||
| 29 | + file_handler.setLevel(logging.INFO) | ||
| 30 | + | ||
| 31 | + # 创建控制台handler | ||
| 32 | + console_handler = logging.StreamHandler() | ||
| 33 | + console_handler.setLevel(logging.INFO) | ||
| 34 | + | ||
| 35 | + # 创建formatter | ||
| 36 | + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | ||
| 37 | + file_handler.setFormatter(formatter) | ||
| 38 | + console_handler.setFormatter(formatter) | ||
| 39 | + | ||
| 40 | + # 添加handler到logger | ||
| 41 | + logger.addHandler(file_handler) | ||
| 42 | + logger.addHandler(console_handler) | ||
| 43 | + | ||
| 44 | + return logger | ||
| 10 | 45 | ||
| 11 | def clean_text_field(text): | 46 | def clean_text_field(text): |
| 12 | if pd.isna(text): | 47 | if pd.isna(text): |
offline_tasks/scripts/tag_category_similar.py
| @@ -7,6 +7,41 @@ from db_service import create_db_connection | @@ -7,6 +7,41 @@ from db_service import create_db_connection | ||
| 7 | import argparse | 7 | import argparse |
| 8 | from datetime import datetime | 8 | from datetime import datetime |
| 9 | import json | 9 | import json |
| 10 | +import logging | ||
| 11 | + | ||
| 12 | +def setup_logger(): | ||
| 13 | + """设置logger配置""" | ||
| 14 | + # 创建logs目录 | ||
| 15 | + logs_dir = 'logs' | ||
| 16 | + os.makedirs(logs_dir, exist_ok=True) | ||
| 17 | + | ||
| 18 | + # 创建logger | ||
| 19 | + logger = logging.getLogger('tag_category_similar') | ||
| 20 | + logger.setLevel(logging.INFO) | ||
| 21 | + | ||
| 22 | + # 避免重复添加handler | ||
| 23 | + if logger.handlers: | ||
| 24 | + return logger | ||
| 25 | + | ||
| 26 | + # 创建文件handler | ||
| 27 | + log_file = os.path.join(logs_dir, f'tag_category_similar_{datetime.now().strftime("%Y%m%d")}.log') | ||
| 28 | + file_handler = logging.FileHandler(log_file, encoding='utf-8') | ||
| 29 | + file_handler.setLevel(logging.INFO) | ||
| 30 | + | ||
| 31 | + # 创建控制台handler | ||
| 32 | + console_handler = logging.StreamHandler() | ||
| 33 | + console_handler.setLevel(logging.INFO) | ||
| 34 | + | ||
| 35 | + # 创建formatter | ||
| 36 | + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | ||
| 37 | + file_handler.setFormatter(formatter) | ||
| 38 | + console_handler.setFormatter(formatter) | ||
| 39 | + | ||
| 40 | + # 添加handler到logger | ||
| 41 | + logger.addHandler(file_handler) | ||
| 42 | + logger.addHandler(console_handler) | ||
| 43 | + | ||
| 44 | + return logger | ||
| 10 | 45 | ||
| 11 | def clean_text_field(text): | 46 | def clean_text_field(text): |
| 12 | if pd.isna(text): | 47 | if pd.isna(text): |
| @@ -21,6 +56,9 @@ parser.add_argument('--top_n', type=int, default=50, help='每个分类保留的 | @@ -21,6 +56,9 @@ parser.add_argument('--top_n', type=int, default=50, help='每个分类保留的 | ||
| 21 | parser.add_argument('--debug', action='store_true', help='开启debug模式') | 56 | parser.add_argument('--debug', action='store_true', help='开启debug模式') |
| 22 | args = parser.parse_args() | 57 | args = parser.parse_args() |
| 23 | 58 | ||
| 59 | +# 初始化logger | ||
| 60 | +logger = setup_logger() | ||
| 61 | + | ||
| 24 | bpms_host = '120.76.244.158' | 62 | bpms_host = '120.76.244.158' |
| 25 | bpms_port = '3325' | 63 | bpms_port = '3325' |
| 26 | bpms_database = 'bpms' | 64 | bpms_database = 'bpms' |
| @@ -52,9 +90,14 @@ WHERE spi.quantity > 0 | @@ -52,9 +90,14 @@ WHERE spi.quantity > 0 | ||
| 52 | GROUP BY sp.code, psm.name, bb.code; | 90 | GROUP BY sp.code, psm.name, bb.code; |
| 53 | """ | 91 | """ |
| 54 | 92 | ||
| 93 | +logger.info("="*80) | ||
| 94 | +logger.info("Tag分类相似度计算开始") | ||
| 95 | +logger.info("="*80) | ||
| 96 | +logger.info(f"参数配置: lookback_days={args.lookback_days}, top_n={args.top_n}") | ||
| 97 | + | ||
| 55 | if args.debug: | 98 | if args.debug: |
| 56 | - print(f"[DEBUG] 参数配置: lookback_days={args.lookback_days}, top_n={args.top_n}") | ||
| 57 | - print(f"[DEBUG] 开始查询数据库...") | 99 | + logger.debug(f"[DEBUG] 参数配置: lookback_days={args.lookback_days}, top_n={args.top_n}") |
| 100 | + logger.debug(f"[DEBUG] 开始查询数据库...") | ||
| 58 | 101 | ||
| 59 | # 执行 SQL 查询并将结果加载到 pandas DataFrame | 102 | # 执行 SQL 查询并将结果加载到 pandas DataFrame |
| 60 | df = pd.read_sql(sql_query, engine) | 103 | df = pd.read_sql(sql_query, engine) |
| @@ -78,8 +121,10 @@ stats = { | @@ -78,8 +121,10 @@ stats = { | ||
| 78 | 'file_stats': {} | 121 | 'file_stats': {} |
| 79 | } | 122 | } |
| 80 | 123 | ||
| 124 | +logger.info(f"数据库查询完成,共 {len(df)} 条订单记录") | ||
| 125 | + | ||
| 81 | if args.debug: | 126 | if args.debug: |
| 82 | - print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") | 127 | + logger.debug(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录") |
| 83 | 128 | ||
| 84 | # 处理商品信息,分割并去重 | 129 | # 处理商品信息,分割并去重 |
| 85 | # 构建ID到名称的映射 | 130 | # 构建ID到名称的映射 |
| @@ -115,8 +160,10 @@ for _, row in df.iterrows(): | @@ -115,8 +160,10 @@ for _, row in df.iterrows(): | ||
| 115 | stats['total_categories'] = len(freq) | 160 | stats['total_categories'] = len(freq) |
| 116 | 161 | ||
| 117 | # 计算余弦相似度 | 162 | # 计算余弦相似度 |
| 163 | +logger.info("开始计算分类相似度...") | ||
| 164 | + | ||
| 118 | if args.debug: | 165 | if args.debug: |
| 119 | - print(f"[DEBUG] 开始计算分类相似度...") | 166 | + logger.debug(f"[DEBUG] 开始计算分类相似度...") |
| 120 | 167 | ||
| 121 | result = {} | 168 | result = {} |
| 122 | total_similarity_pairs = 0 | 169 | total_similarity_pairs = 0 |
| @@ -138,13 +185,15 @@ stats['categories_with_similarities'] = len(result) | @@ -138,13 +185,15 @@ stats['categories_with_similarities'] = len(result) | ||
| 138 | stats['total_similarity_pairs'] = total_similarity_pairs | 185 | stats['total_similarity_pairs'] = total_similarity_pairs |
| 139 | stats['avg_similarities_per_category'] = total_similarity_pairs / len(result) if result else 0 | 186 | stats['avg_similarities_per_category'] = total_similarity_pairs / len(result) if result else 0 |
| 140 | 187 | ||
| 188 | +logger.info(f"相似度计算完成,共 {len(result)} 个分类有相似推荐") | ||
| 189 | + | ||
| 141 | if args.debug: | 190 | if args.debug: |
| 142 | - print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") | 191 | + logger.debug(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐") |
| 143 | unique_cats = set() | 192 | unique_cats = set() |
| 144 | for cats in result.values(): | 193 | for cats in result.values(): |
| 145 | for cat, _ in cats: | 194 | for cat, _ in cats: |
| 146 | unique_cats.add(cat) | 195 | unique_cats.add(cat) |
| 147 | - print(f"[DEBUG] 唯一分类数: {len(unique_cats)}") | 196 | + logger.debug(f"[DEBUG] 唯一分类数: {len(unique_cats)}") |
| 148 | 197 | ||
| 149 | # 准备输出 | 198 | # 准备输出 |
| 150 | date_str = datetime.now().strftime('%Y%m%d') | 199 | date_str = datetime.now().strftime('%Y%m%d') |
| @@ -156,8 +205,10 @@ output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt') | @@ -156,8 +205,10 @@ output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt') | ||
| 156 | debug_file = os.path.join(debug_dir, f'tag_category_similar_{date_str}_readable.txt') | 205 | debug_file = os.path.join(debug_dir, f'tag_category_similar_{date_str}_readable.txt') |
| 157 | 206 | ||
| 158 | # 输出相似分类到文件(ID格式) | 207 | # 输出相似分类到文件(ID格式) |
| 208 | +logger.info(f"开始写入输出文件: {output_file}") | ||
| 209 | + | ||
| 159 | if args.debug: | 210 | if args.debug: |
| 160 | - print(f"[DEBUG] 开始写入文件: {output_file}") | 211 | + logger.debug(f"[DEBUG] 开始写入文件: {output_file}") |
| 161 | 212 | ||
| 162 | with open(output_file, 'w', encoding='utf-8') as f: | 213 | with open(output_file, 'w', encoding='utf-8') as f: |
| 163 | for cat_id, sims in sorted(result.items()): | 214 | for cat_id, sims in sorted(result.items()): |
| @@ -165,6 +216,8 @@ with open(output_file, 'w', encoding='utf-8') as f: | @@ -165,6 +216,8 @@ with open(output_file, 'w', encoding='utf-8') as f: | ||
| 165 | sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) | 216 | sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims]) |
| 166 | f.write(f'{cat_id}\t{sim_str}\n') | 217 | f.write(f'{cat_id}\t{sim_str}\n') |
| 167 | 218 | ||
| 219 | +logger.info(f"输出文件写入完成: {output_file}") | ||
| 220 | + | ||
| 168 | # 获取文件统计信息 | 221 | # 获取文件统计信息 |
| 169 | if os.path.exists(output_file): | 222 | if os.path.exists(output_file): |
| 170 | file_size = os.path.getsize(output_file) | 223 | file_size = os.path.getsize(output_file) |
| @@ -175,8 +228,10 @@ if os.path.exists(output_file): | @@ -175,8 +228,10 @@ if os.path.exists(output_file): | ||
| 175 | } | 228 | } |
| 176 | 229 | ||
| 177 | # 输出可读版本到debug目录(ID+名称格式) | 230 | # 输出可读版本到debug目录(ID+名称格式) |
| 231 | +logger.info(f"开始写入可读文件: {debug_file}") | ||
| 232 | + | ||
| 178 | if args.debug: | 233 | if args.debug: |
| 179 | - print(f"[DEBUG] 开始写入可读文件: {debug_file}") | 234 | + logger.debug(f"[DEBUG] 开始写入可读文件: {debug_file}") |
| 180 | 235 | ||
| 181 | with open(debug_file, 'w', encoding='utf-8') as f: | 236 | with open(debug_file, 'w', encoding='utf-8') as f: |
| 182 | # 写入文件头信息 | 237 | # 写入文件头信息 |
| @@ -199,6 +254,8 @@ with open(debug_file, 'w', encoding='utf-8') as f: | @@ -199,6 +254,8 @@ with open(debug_file, 'w', encoding='utf-8') as f: | ||
| 199 | sim_str = ','.join(sim_parts) | 254 | sim_str = ','.join(sim_parts) |
| 200 | f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') | 255 | f.write(f'{cat_id}:{cat_clean}\t{sim_str}\n') |
| 201 | 256 | ||
| 257 | +logger.info(f"可读文件写入完成: {debug_file}") | ||
| 258 | + | ||
| 202 | # 获取debug文件统计信息 | 259 | # 获取debug文件统计信息 |
| 203 | if os.path.exists(debug_file): | 260 | if os.path.exists(debug_file): |
| 204 | file_size = os.path.getsize(debug_file) | 261 | file_size = os.path.getsize(debug_file) |
| @@ -213,29 +270,29 @@ stats['end_time'] = datetime.now() | @@ -213,29 +270,29 @@ stats['end_time'] = datetime.now() | ||
| 213 | stats['processing_time_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds() | 270 | stats['processing_time_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds() |
| 214 | 271 | ||
| 215 | # 输出统计信息 | 272 | # 输出统计信息 |
| 216 | -print("\n" + "="*80) | ||
| 217 | -print("Tag分类相似度计算 - 关键统计信息") | ||
| 218 | -print("="*80) | ||
| 219 | -print(f"📊 数据概览:") | ||
| 220 | -print(f" - 总订单数: {stats['total_orders']:,}") | ||
| 221 | -print(f" - 唯一区域数: {stats['unique_regions']:,}") | ||
| 222 | -print(f" - 唯一客户数: {stats['unique_customers']:,}") | ||
| 223 | -print(f" - 总分类数: {stats['total_categories']:,}") | ||
| 224 | -print(f" - 有相似度的分类数: {stats['categories_with_similarities']:,}") | ||
| 225 | -print(f" - 总相似度对数量: {stats['total_similarity_pairs']:,}") | ||
| 226 | -print(f" - 平均每分类相似数: {stats['avg_similarities_per_category']:.1f}") | ||
| 227 | - | ||
| 228 | -print(f"\n📁 输出文件:") | 273 | +logger.info("="*80) |
| 274 | +logger.info("Tag分类相似度计算 - 关键统计信息") | ||
| 275 | +logger.info("="*80) | ||
| 276 | +logger.info(f"📊 数据概览:") | ||
| 277 | +logger.info(f" - 总订单数: {stats['total_orders']:,}") | ||
| 278 | +logger.info(f" - 唯一区域数: {stats['unique_regions']:,}") | ||
| 279 | +logger.info(f" - 唯一客户数: {stats['unique_customers']:,}") | ||
| 280 | +logger.info(f" - 总分类数: {stats['total_categories']:,}") | ||
| 281 | +logger.info(f" - 有相似度的分类数: {stats['categories_with_similarities']:,}") | ||
| 282 | +logger.info(f" - 总相似度对数量: {stats['total_similarity_pairs']:,}") | ||
| 283 | +logger.info(f" - 平均每分类相似数: {stats['avg_similarities_per_category']:.1f}") | ||
| 284 | + | ||
| 285 | +logger.info(f"📁 输出文件:") | ||
| 229 | for file_type, file_info in stats['file_stats'].items(): | 286 | for file_type, file_info in stats['file_stats'].items(): |
| 230 | - print(f" - {file_type}: {file_info['path']}") | ||
| 231 | - print(f" 大小: {file_info['size_mb']} MB ({file_info['size_bytes']:,} bytes)") | ||
| 232 | - | ||
| 233 | -print(f"\n⏱️ 处理时间:") | ||
| 234 | -print(f" - 开始时间: {stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 235 | -print(f" - 结束时间: {stats['end_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 236 | -print(f" - 总耗时: {stats['processing_time_seconds']:.2f} 秒") | ||
| 237 | - | ||
| 238 | -print(f"\n✅ Tag相似度计算完成") | ||
| 239 | -print(f" - 输出文件: {output_file}") | ||
| 240 | -print(f" - 可读文件: {debug_file}") | ||
| 241 | -print("="*80) | 287 | + logger.info(f" - {file_type}: {file_info['path']}") |
| 288 | + logger.info(f" 大小: {file_info['size_mb']} MB ({file_info['size_bytes']:,} bytes)") | ||
| 289 | + | ||
| 290 | +logger.info(f"⏱️ 处理时间:") | ||
| 291 | +logger.info(f" - 开始时间: {stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 292 | +logger.info(f" - 结束时间: {stats['end_time'].strftime('%Y-%m-%d %H:%M:%S')}") | ||
| 293 | +logger.info(f" - 总耗时: {stats['processing_time_seconds']:.2f} 秒") | ||
| 294 | + | ||
| 295 | +logger.info(f"✅ Tag相似度计算完成") | ||
| 296 | +logger.info(f" - 输出文件: {output_file}") | ||
| 297 | +logger.info(f" - 可读文件: {debug_file}") | ||
| 298 | +logger.info("="*80) |
requirements.txt