Commit 14f3dcbe9706dff7b8ae96eb45daf3c8f0845a21

Authored by tangwang
1 parent 8cc6477b

offline tasks

offline_tasks/run.sh
... ... @@ -4,7 +4,7 @@ cd /home/tw/recommendation/offline_tasks
4 4 # cat UPDATE_CONFIG_GUIDE.md
5 5  
6 6 # 2. Test connection
7   -python3 test_connection.py
  7 +# python3 test_connection.py
8 8  
9 9 # 3. Debug-mode run (small data volume)
10 10 python3 run_all.py --lookback_days 7 --top_n 10 --debug
... ... @@ -13,7 +13,9 @@ mv output output_debug
13 13 mkdir output
14 14  
15 15 # # 4. Production-mode run (large data volume)
16   -python3 run_all.py --lookback_days 730 --top_n 50
  16 +python3 run_all.py --lookback_days 730 --top_n 50 --debug
17 17  
18 18 # 5. Load into Redis
19 19 python3 scripts/load_index_to_redis.py --redis-host localhost
  20 +
  21 +
... ...
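
Note: the index files loaded in step 5 are tab-separated lines of item_id, item_name, and comma-joined "sim_id:score" pairs (see the f.write calls in the scripts below). A minimal loader sketch, assuming redis-py; the actual logic lives in scripts/load_index_to_redis.py, and the "i2i:<item_id>" key scheme here is a hypothetical placeholder:

import redis

def load_index(path, redis_host='localhost'):
    # Connect to Redis (default port assumed)
    r = redis.Redis(host=redis_host, port=6379)
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Each line: item_id <TAB> item_name <TAB> sim_id1:score1,sim_id2:score2,...
            item_id, _item_name, sim_str = line.rstrip('\n').split('\t')
            # Store the similar-item list verbatim; consumers parse the "id:score" pairs
            r.set(f'i2i:{item_id}', sim_str)
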
offline_tasks/scripts/i2i_content_similar.py
... ... @@ -17,6 +17,11 @@ from db_service import create_db_connection
17 17 from offline_tasks.config.offline_config import (
18 18 DB_CONFIG, OUTPUT_DIR, DEFAULT_I2I_TOP_N
19 19 )
  20 +from offline_tasks.scripts.debug_utils import (
  21 + setup_debug_logger, log_dataframe_info, log_dict_stats,
  22 + save_readable_index, fetch_name_mappings, log_algorithm_params,
  23 + log_processing_step
  24 +)
20 25  
21 26  
22 27 def fetch_product_features(engine):
... ... @@ -221,8 +226,19 @@ def main():
221 226  
222 227 args = parser.parse_args()
223 228  
  229 + # Set up the logger
  230 + logger = setup_debug_logger('i2i_content_similar', debug=args.debug)
  231 +
  232 + # Log algorithm parameters
  233 + params = {
  234 + 'top_n': args.top_n,
  235 + 'method': args.method,
  236 + 'debug': args.debug
  237 + }
  238 + log_algorithm_params(logger, params)
  239 +
224 240 # Create database connection
225   - print("Connecting to database...")
  241 + logger.info("Connecting to database...")
226 242 engine = create_db_connection(
227 243 DB_CONFIG['host'],
228 244 DB_CONFIG['port'],
... ... @@ -232,34 +248,47 @@ def main():
232 248 )
233 249  
234 250 # Fetch product features
  251 + log_processing_step(logger, "Fetching product features")
235 252 df = fetch_product_features(engine)
  253 + logger.info(f"Fetched features for {len(df)} products")
  254 + log_dataframe_info(logger, df, "product feature data")
236 255  
237 256 # Compute similarities
  257 + log_processing_step(logger, f"Computing similarities (method: {args.method})")
238 258 if args.method == 'tfidf':
239   - print("\nUsing TF-IDF method...")
  259 + logger.info("Using TF-IDF method...")
240 260 result = calculate_content_similarity(df, args.top_n)
241 261 elif args.method == 'category':
242   - print("\nUsing category-based method...")
  262 + logger.info("Using category-based method...")
243 263 result = calculate_category_based_similarity(df)
244 264 else: # hybrid
245   - print("\nUsing hybrid method...")
  265 + logger.info("Using hybrid method (TF-IDF 70% + category 30%)...")
246 266 tfidf_sim = calculate_content_similarity(df, args.top_n)
247 267 category_sim = calculate_category_based_similarity(df)
248 268 result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3)
249 269  
250   - # Build item_id -> name mapping
251   - item_name_map = dict(zip(df['item_id'], df['item_name']))
  270 + logger.info(f"Generated similarities for {len(result)} items")
252 271  
253 272 # Write results
  273 + log_processing_step(logger, "Saving results")
254 274 output_file = args.output or os.path.join(
255 275 OUTPUT_DIR,
256 276 f'i2i_content_{args.method}_{datetime.now().strftime("%Y%m%d")}.txt'
257 277 )
258 278  
259   - print(f"\nWriting results to {output_file}...")
  279 + # Fetch name mappings
  280 + name_mappings = {}
  281 + if args.debug:
  282 + logger.info("Fetching item name mappings...")
  283 + name_mappings = fetch_name_mappings(engine, debug=True)
  284 +
  285 + logger.info(f"Writing results to {output_file}...")
260 286 with open(output_file, 'w', encoding='utf-8') as f:
261 287 for item_id, sims in result.items():
262   - item_name = item_name_map.get(item_id, 'Unknown')
  288 + # Look up the name via name_mappings
  289 + item_name = name_mappings.get(item_id, 'Unknown')
  290 + if item_name == 'Unknown' and 'item_name' in df.columns:
  291 + item_name = df[df['item_id'] == item_id]['item_name'].iloc[0] if len(df[df['item_id'] == item_id]) > 0 else 'Unknown'
263 292  
264 293 if not sims:
265 294 continue
... ... @@ -268,8 +297,19 @@ def main():
268 297 sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
269 298 f.write(f'{item_id}\t{item_name}\t{sim_str}\n')
270 299  
271   - print(f"Done! Generated content-based similarities for {len(result)} items")
272   - print(f"Output saved to: {output_file}")
  300 + logger.info(f"Done! Generated content-based similarities for {len(result)} items")
  301 + logger.info(f"Output saved to: {output_file}")
  302 +
  303 + # If debug mode is enabled, also save a human-readable format
  304 + if args.debug:
  305 + log_processing_step(logger, "Saving debug-readable format")
  306 + save_readable_index(
  307 + output_file,
  308 + result,
  309 + name_mappings,
  310 + index_type=f'i2i:content:{args.method}',
  311 + logger=logger
  312 + )
273 313  
274 314  
275 315 if __name__ == '__main__':
... ...
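
Note: offline_tasks/scripts/debug_utils.py is imported by every script in this commit but is not itself part of the diff. A minimal sketch of what these helpers plausibly look like, reconstructed from the call sites; names match the imports, but signatures and behavior are assumptions:

import logging
import pandas as pd

def setup_debug_logger(name, debug=False):
    """Console logger at DEBUG level when --debug is set, INFO otherwise."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    if not logger.handlers:  # avoid duplicate handlers on repeated setup
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s [%(name)s] %(levelname)s %(message)s'))
        logger.addHandler(handler)
    return logger

def log_algorithm_params(logger, params):
    """Log each algorithm parameter on its own line."""
    logger.info("Algorithm parameters:")
    for key, value in params.items():
        logger.info(f"  {key} = {value}")

def log_processing_step(logger, step):
    """Mark the start of a pipeline step."""
    logger.info(f"===== {step} =====")

def log_dataframe_info(logger, df, name):
    """Log a DataFrame's shape and columns at DEBUG level."""
    logger.debug(f"{name}: shape={df.shape}, columns={list(df.columns)}")

def log_dict_stats(logger, d, name):
    """Log basic size statistics for a dict of item lists."""
    sizes = [len(v) for v in d.values()]
    if sizes:
        logger.debug(f"{name}: {len(d)} keys, avg list length {sum(sizes)/len(sizes):.2f}")
    else:
        logger.debug(f"{name}: empty")

def fetch_name_mappings(engine, debug=False):
    """id -> name lookup used for readable output; the table and columns here are hypothetical."""
    df = pd.read_sql("SELECT id, name FROM product", engine)  # hypothetical query
    return dict(zip(df['id'], df['name']))

def save_readable_index(output_file, result, name_mappings, index_type, logger=None):
    """Write a human-readable sibling file next to the raw index file."""
    readable_file = output_file.replace('.txt', '.readable.txt')
    with open(readable_file, 'w', encoding='utf-8') as f:
        f.write(f'# index_type: {index_type}\n')
        for item_id, sims in result.items():
            f.write(f'{item_id}\t{name_mappings.get(item_id, "Unknown")}\n')
            for sim_id, score in sims:
                f.write(f'  -> {sim_id}\t{name_mappings.get(sim_id, "Unknown")}\t{score:.4f}\n')
    if logger:
        logger.info(f"Readable index saved to {readable_file}")
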
offline_tasks/scripts/i2i_deepwalk.py
... ... @@ -17,6 +17,11 @@ from offline_tasks.config.offline_config import (
17 17 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
18 18 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
19 19 )
  20 +from offline_tasks.scripts.debug_utils import (
  21 + setup_debug_logger, log_dataframe_info, log_dict_stats,
  22 + save_readable_index, fetch_name_mappings, log_algorithm_params,
  23 + log_processing_step
  24 +)
20 25  
21 26  
22 27 def build_item_graph(df, behavior_weights):
... ... @@ -223,8 +228,26 @@ def main():
223 228  
224 229 args = parser.parse_args()
225 230  
  231 + # Set up the logger
  232 + logger = setup_debug_logger('i2i_deepwalk', debug=args.debug)
  233 +
  234 + # Log algorithm parameters
  235 + params = {
  236 + 'num_walks': args.num_walks,
  237 + 'walk_length': args.walk_length,
  238 + 'window_size': args.window_size,
  239 + 'vector_size': args.vector_size,
  240 + 'min_count': args.min_count,
  241 + 'workers': args.workers,
  242 + 'epochs': args.epochs,
  243 + 'top_n': args.top_n,
  244 + 'lookback_days': args.lookback_days,
  245 + 'debug': args.debug
  246 + }
  247 + log_algorithm_params(logger, params)
  248 +
226 249 # Create database connection
227   - print("Connecting to database...")
  250 + logger.info("Connecting to database...")
228 251 engine = create_db_connection(
229 252 DB_CONFIG['host'],
230 253 DB_CONFIG['port'],
... ... @@ -235,7 +258,7 @@ def main():
235 258  
236 259 # Get time range
237 260 start_date, end_date = get_time_range(args.lookback_days)
238   - print(f"Fetching data from {start_date} to {end_date}...")
  261 + logger.info(f"Fetching data from {start_date} to {end_date}")
239 262  
240 263 # SQL query - fetch user behavior data
241 264 sql_query = f"""
... ... @@ -255,9 +278,12 @@ def main():
255 278 AND se.anonymous_id IS NOT NULL
256 279 """
257 280  
258   - print("Executing SQL query...")
  281 + logger.info("Executing SQL query...")
259 282 df = pd.read_sql(sql_query, engine)
260   - print(f"Fetched {len(df)} records")
  283 + logger.info(f"Fetched {len(df)} records")
  284 +
  285 + # Log data info
  286 + log_dataframe_info(logger, df, "user behavior data")
261 287  
262 288 # Define behavior weights
263 289 behavior_weights = {
... ... @@ -267,23 +293,26 @@ def main():
267 293 'addToCart': 3.0,
268 294 'purchase': 10.0
269 295 }
  296 + logger.debug(f"Behavior weights: {behavior_weights}")
270 297  
271 298 # Build item graph
272   - print("Building item graph...")
  299 + log_processing_step(logger, "Building item graph")
273 300 graph = build_item_graph(df, behavior_weights)
274   - print(f"Graph built with {len(graph)} nodes")
  301 + logger.info(f"Item graph built with {len(graph)} nodes")
275 302  
276 303 # Save edge file (optional)
277 304 if args.save_graph:
278 305 edge_file = os.path.join(OUTPUT_DIR, f'item_graph_{datetime.now().strftime("%Y%m%d")}.txt')
279 306 save_edge_file(graph, edge_file)
  307 + logger.info(f"Graph edge file saved to {edge_file}")
280 308  
281 309 # Generate random walks
282   - print("Generating random walks...")
  310 + log_processing_step(logger, "Generating random walks")
283 311 walks = generate_walks(graph, args.num_walks, args.walk_length)
284   - print(f"Generated {len(walks)} walks")
  312 + logger.info(f"Generated {len(walks)} walks")
285 313  
286 314 # Train Word2Vec model
  315 + log_processing_step(logger, "Training Word2Vec model")
287 316 w2v_config = {
288 317 'vector_size': args.vector_size,
289 318 'window_size': args.window_size,
... ... @@ -292,29 +321,39 @@ def main():
292 321 'epochs': args.epochs,
293 322 'sg': 1
294 323 }
  324 + logger.debug(f"Word2Vec config: {w2v_config}")
295 325  
296 326 model = train_word2vec(walks, w2v_config)
  327 + logger.info(f"Training completed. Vocabulary size: {len(model.wv)}")
297 328  
298 329 # Save model (optional)
299 330 if args.save_model:
300 331 model_path = os.path.join(OUTPUT_DIR, f'deepwalk_model_{datetime.now().strftime("%Y%m%d")}.model')
301 332 model.save(model_path)
302   - print(f"Model saved to {model_path}")
  333 + logger.info(f"Model saved to {model_path}")
303 334  
304 335 # Generate similarities
305   - print("Generating similarities...")
  336 + log_processing_step(logger, "Generating similarities")
306 337 result = generate_similarities(model, top_n=args.top_n)
307   -
308   - # Build item_id -> name mapping
309   - item_name_map = dict(zip(df['item_id'].astype(str), df.groupby('item_id')['item_name'].first()))
  338 + logger.info(f"Generated similarities for {len(result)} items")
310 339  
311 340 # Write results
  341 + log_processing_step(logger, "Saving results")
312 342 output_file = args.output or os.path.join(OUTPUT_DIR, f'i2i_deepwalk_{datetime.now().strftime("%Y%m%d")}.txt')
313 343  
314   - print(f"Writing results to {output_file}...")
  344 + # Fetch name mappings
  345 + name_mappings = {}
  346 + if args.debug:
  347 + logger.info("Fetching item name mappings...")
  348 + name_mappings = fetch_name_mappings(engine, debug=True)
  349 +
  350 + logger.info(f"Writing results to {output_file}...")
315 351 with open(output_file, 'w', encoding='utf-8') as f:
316 352 for item_id, sims in result.items():
317   - item_name = item_name_map.get(item_id, 'Unknown')
  353 + # Look up the name via name_mappings
  354 + item_name = name_mappings.get(int(item_id), 'Unknown') if item_id.isdigit() else 'Unknown'
  355 + if item_name == 'Unknown' and 'item_name' in df.columns:
  356 + item_name = df[df['item_id'].astype(str) == item_id]['item_name'].iloc[0] if len(df[df['item_id'].astype(str) == item_id]) > 0 else 'Unknown'
318 357  
319 358 if not sims:
320 359 continue
... ... @@ -323,8 +362,19 @@ def main():
323 362 sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
324 363 f.write(f'{item_id}\t{item_name}\t{sim_str}\n')
325 364  
326   - print(f"Done! Generated i2i similarities for {len(result)} items")
327   - print(f"Output saved to: {output_file}")
  365 + logger.info(f"Done! Generated i2i similarities for {len(result)} items")
  366 + logger.info(f"Output saved to: {output_file}")
  367 +
  368 + # If debug mode is enabled, also save a human-readable format
  369 + if args.debug:
  370 + log_processing_step(logger, "Saving debug-readable format")
  371 + save_readable_index(
  372 + output_file,
  373 + result,
  374 + name_mappings,
  375 + index_type='i2i:deepwalk',
  376 + logger=logger
  377 + )
328 378  
329 379  
330 380 if __name__ == '__main__':
... ...
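
For context, the bodies of build_item_graph and generate_walks fall outside the hunks above. A minimal sketch of a weighted random-walk generator consistent with the calls shown, assuming the graph is a dict mapping each node to a {neighbor: weight} dict:

import random

def generate_walks(graph, num_walks, walk_length):
    """Generate num_walks weighted random walks of up to walk_length nodes per start node."""
    walks = []
    nodes = list(graph.keys())
    for _ in range(num_walks):
        random.shuffle(nodes)  # vary visit order between passes
        for start in nodes:
            walk = [start]
            while len(walk) < walk_length:
                neighbors = graph.get(walk[-1])
                if not neighbors:
                    break  # dead end: stop this walk early
                items = list(neighbors.keys())
                weights = list(neighbors.values())
                # transition probability proportional to co-occurrence weight
                walk.append(random.choices(items, weights=weights, k=1)[0])
            walks.append([str(n) for n in walk])  # gensim expects string tokens
    return walks
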
offline_tasks/scripts/i2i_session_w2v.py
... ... @@ -18,21 +18,30 @@ from offline_tasks.config.offline_config import (
18 18 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
19 19 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
20 20 )
  21 +from offline_tasks.scripts.debug_utils import (
  22 + setup_debug_logger, log_dataframe_info, log_dict_stats,
  23 + save_readable_index, fetch_name_mappings, log_algorithm_params,
  24 + log_processing_step
  25 +)
21 26  
22 27  
23   -def prepare_session_data(df, session_gap_minutes=30):
  28 +def prepare_session_data(df, session_gap_minutes=30, logger=None):
24 29 """
25 30 Prepare session data
26 31  
27 32 Args:
28 33 df: DataFrame with columns: user_id, item_id, create_time
29 34 session_gap_minutes: session gap (minutes)
  35 + logger: Logger instance for debugging
30 36  
31 37 Returns:
32 38 List of sessions, each session is a list of item_ids
33 39 """
34 40 sessions = []
35 41  
  42 + if logger:
  43 + logger.debug(f"Preparing session data, session gap: {session_gap_minutes} minutes")
  44 +
36 45 # Sort by user and time
37 46 df = df.sort_values(['user_id', 'create_time'])
38 47  
... ... @@ -65,21 +74,33 @@ def prepare_session_data(df, session_gap_minutes=30):
65 74 # Filter out sessions with fewer than 2 items
66 75 sessions = [s for s in sessions if len(s) >= 2]
67 76  
  77 + if logger:
  78 + session_lengths = [len(s) for s in sessions]
  79 + logger.debug(f"Generated {len(sessions)} sessions")
  80 + logger.debug(f"Session length stats: min={min(session_lengths)}, max={max(session_lengths)}, "
  81 + f"avg={sum(session_lengths)/len(session_lengths):.2f}")
  82 +
68 83 return sessions
69 84  
70 85  
71   -def train_word2vec(sessions, config):
  86 +def train_word2vec(sessions, config, logger=None):
72 87 """
73 88 Train a Word2Vec model
74 89  
75 90 Args:
76 91 sessions: List of sessions
77 92 config: Word2Vec configuration
  93 + logger: Logger instance for debugging
78 94  
79 95 Returns:
80 96 Word2Vec model
81 97 """
82   - print(f"Training Word2Vec with {len(sessions)} sessions...")
  98 + if logger:
  99 + logger.info(f"Training Word2Vec on {len(sessions)} sessions")
  100 + logger.debug(f"Model params: vector_size={config['vector_size']}, window={config['window_size']}, "
  101 + f"min_count={config['min_count']}, epochs={config['epochs']}")
  102 + else:
  103 + print(f"Training Word2Vec with {len(sessions)} sessions...")
83 104  
84 105 model = Word2Vec(
85 106 sentences=sessions,
... ... @@ -92,23 +113,30 @@ def train_word2vec(sessions, config):
92 113 seed=42
93 114 )
94 115  
95   - print(f"Training completed. Vocabulary size: {len(model.wv)}")
  116 + if logger:
  117 + logger.info(f"Training completed. Vocabulary size: {len(model.wv)}")
  118 + else:
  119 + print(f"Training completed. Vocabulary size: {len(model.wv)}")
96 120 return model
97 121  
98 122  
99   -def generate_similarities(model, top_n=50):
  123 +def generate_similarities(model, top_n=50, logger=None):
100 124 """
101 125 Generate item similarities
102 126  
103 127 Args:
104 128 model: Word2Vec model
105 129 top_n: Top N similar items
  130 + logger: Logger instance for debugging
106 131  
107 132 Returns:
108 133 Dict[item_id, List[Tuple(similar_item_id, score)]]
109 134 """
110 135 result = {}
111 136  
  137 + if logger:
  138 + logger.info(f"Generating top {top_n} similar items")
  139 +
112 140 for item_id in model.wv.index_to_key:
113 141 try:
114 142 similar_items = model.wv.most_similar(item_id, topn=top_n)
... ... @@ -116,6 +144,9 @@ def generate_similarities(model, top_n=50):
116 144 except KeyError:
117 145 continue
118 146  
  147 + if logger:
  148 + logger.info(f"Generated similarities for {len(result)} items")
  149 +
119 150 return result
120 151  
121 152  
... ... @@ -146,8 +177,25 @@ def main():
146 177  
147 178 args = parser.parse_args()
148 179  
  180 + # Set up the logger
  181 + logger = setup_debug_logger('i2i_session_w2v', debug=args.debug)
  182 +
  183 + # Log algorithm parameters
  184 + params = {
  185 + 'window_size': args.window_size,
  186 + 'vector_size': args.vector_size,
  187 + 'min_count': args.min_count,
  188 + 'workers': args.workers,
  189 + 'epochs': args.epochs,
  190 + 'top_n': args.top_n,
  191 + 'lookback_days': args.lookback_days,
  192 + 'session_gap_minutes': args.session_gap,
  193 + 'debug': args.debug
  194 + }
  195 + log_algorithm_params(logger, params)
  196 +
149 197 # Create database connection
150   - print("Connecting to database...")
  198 + logger.info("Connecting to database...")
151 199 engine = create_db_connection(
152 200 DB_CONFIG['host'],
153 201 DB_CONFIG['port'],
... ... @@ -158,7 +206,7 @@ def main():
158 206  
159 207 # Get time range
160 208 start_date, end_date = get_time_range(args.lookback_days)
161   - print(f"Fetching data from {start_date} to {end_date}...")
  209 + logger.info(f"Fetching data from {start_date} to {end_date}")
162 210  
163 211 # SQL query - fetch user behavior sequences
164 212 sql_query = f"""
... ... @@ -181,19 +229,23 @@ def main():
181 229 se.create_time
182 230 """
183 231  
184   - print("Executing SQL query...")
  232 + logger.info("Executing SQL query...")
185 233 df = pd.read_sql(sql_query, engine)
186   - print(f"Fetched {len(df)} records")
  234 + logger.info(f"Fetched {len(df)} records")
  235 +
  236 + # Log data info
  237 + log_dataframe_info(logger, df, "user behavior data")
187 238  
188 239 # Convert create_time to datetime
189 240 df['create_time'] = pd.to_datetime(df['create_time'])
190 241  
191 242 # Prepare session data
192   - print("Preparing session data...")
193   - sessions = prepare_session_data(df, session_gap_minutes=args.session_gap)
194   - print(f"Generated {len(sessions)} sessions")
  243 + log_processing_step(logger, "Preparing session data")
  244 + sessions = prepare_session_data(df, session_gap_minutes=args.session_gap, logger=logger)
  245 + logger.info(f"Generated {len(sessions)} sessions")
195 246  
196 247 # Train Word2Vec model
  248 + log_processing_step(logger, "Training Word2Vec model")
197 249 w2v_config = {
198 250 'vector_size': args.vector_size,
199 251 'window_size': args.window_size,
... ... @@ -203,28 +255,35 @@ def main():
203 255 'sg': 1
204 256 }
205 257  
206   - model = train_word2vec(sessions, w2v_config)
  258 + model = train_word2vec(sessions, w2v_config, logger=logger)
207 259  
208 260 # Save model (optional)
209 261 if args.save_model:
210 262 model_path = os.path.join(OUTPUT_DIR, f'session_w2v_model_{datetime.now().strftime("%Y%m%d")}.model')
211 263 model.save(model_path)
212   - print(f"Model saved to {model_path}")
  264 + logger.info(f"Model saved to {model_path}")
213 265  
214 266 # Generate similarities
215   - print("Generating similarities...")
216   - result = generate_similarities(model, top_n=args.top_n)
217   -
218   - # Build item_id -> name mapping
219   - item_name_map = dict(zip(df['item_id'].astype(str), df.groupby('item_id')['item_name'].first()))
  267 + log_processing_step(logger, "Generating similarities")
  268 + result = generate_similarities(model, top_n=args.top_n, logger=logger)
220 269  
221 270 # Write results
  271 + log_processing_step(logger, "Saving results")
222 272 output_file = args.output or os.path.join(OUTPUT_DIR, f'i2i_session_w2v_{datetime.now().strftime("%Y%m%d")}.txt')
223 273  
224   - print(f"Writing results to {output_file}...")
  274 + # Fetch name mappings for the standard output format
  275 + name_mappings = {}
  276 + if args.debug:
  277 + logger.info("Fetching item name mappings...")
  278 + name_mappings = fetch_name_mappings(engine, debug=True)
  279 +
  280 + logger.info(f"Writing results to {output_file}...")
225 281 with open(output_file, 'w', encoding='utf-8') as f:
226 282 for item_id, sims in result.items():
227   - item_name = item_name_map.get(item_id, 'Unknown')
  283 + # Look up the name via name_mappings, fall back to df if missing
  284 + item_name = name_mappings.get(int(item_id), 'Unknown') if item_id.isdigit() else 'Unknown'
  285 + if item_name == 'Unknown' and 'item_name' in df.columns:
  286 + item_name = df[df['item_id'].astype(str) == item_id]['item_name'].iloc[0] if len(df[df['item_id'].astype(str) == item_id]) > 0 else 'Unknown'
228 287  
229 288 if not sims:
230 289 continue
... ... @@ -233,8 +292,19 @@ def main():
233 292 sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
234 293 f.write(f'{item_id}\t{item_name}\t{sim_str}\n')
235 294  
236   - print(f"Done! Generated i2i similarities for {len(result)} items")
237   - print(f"Output saved to: {output_file}")
  295 + logger.info(f"Done! Generated i2i similarities for {len(result)} items")
  296 + logger.info(f"Output saved to: {output_file}")
  297 +
  298 + # If debug mode is enabled, also save a human-readable format
  299 + if args.debug:
  300 + log_processing_step(logger, "Saving debug-readable format")
  301 + save_readable_index(
  302 + output_file,
  303 + result,
  304 + name_mappings,
  305 + index_type='i2i:session_w2v',
  306 + logger=logger
  307 + )
238 308  
239 309  
240 310 if __name__ == '__main__':
... ...
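
The loop body of prepare_session_data between the user/time sort and the length filter falls outside the hunks above. A minimal reconstruction of the gap-based sessionization, as an assumption (it folds the >= 2 length filter into the loop):

from datetime import timedelta

def split_into_sessions(df, session_gap_minutes=30):
    """Split each user's event stream into sessions at gaps larger than session_gap_minutes."""
    sessions = []
    df = df.sort_values(['user_id', 'create_time'])
    gap = timedelta(minutes=session_gap_minutes)
    for _, group in df.groupby('user_id'):
        current = []
        prev_time = None
        for row in group.itertuples():
            # start a new session when the gap since the previous event is too large
            if prev_time is not None and row.create_time - prev_time > gap:
                if len(current) >= 2:
                    sessions.append(current)
                current = []
            current.append(str(row.item_id))
            prev_time = row.create_time
        if len(current) >= 2:
            sessions.append(current)
    return sessions
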
offline_tasks/scripts/interest_aggregation.py
... ... @@ -17,6 +17,11 @@ from offline_tasks.config.offline_config import (
17 17 DB_CONFIG, OUTPUT_DIR, INTEREST_AGGREGATION_CONFIG, get_time_range,
18 18 DEFAULT_LOOKBACK_DAYS, DEFAULT_RECENT_DAYS, DEFAULT_INTEREST_TOP_N
19 19 )
  20 +from offline_tasks.scripts.debug_utils import (
  21 + setup_debug_logger, log_dataframe_info, log_dict_stats,
  22 + save_readable_index, fetch_name_mappings, log_algorithm_params,
  23 + log_processing_step
  24 +)
20 25  
21 26  
22 27 def calculate_time_weight(event_time, reference_time, decay_factor=0.95, days_unit=30):
... ... @@ -227,8 +232,22 @@ def main():
227 232  
228 233 args = parser.parse_args()
229 234  
  235 + # Set up the logger
  236 + logger = setup_debug_logger('interest_aggregation', debug=args.debug)
  237 +
  238 + # Log algorithm parameters
  239 + params = {
  240 + 'top_n': args.top_n,
  241 + 'lookback_days': args.lookback_days,
  242 + 'recent_days': args.recent_days,
  243 + 'new_days': args.new_days,
  244 + 'decay_factor': args.decay_factor,
  245 + 'debug': args.debug
  246 + }
  247 + log_algorithm_params(logger, params)
  248 +
230 249 # Create database connection
231   - print("Connecting to database...")
  250 + logger.info("Connecting to database...")
232 251 engine = create_db_connection(
233 252 DB_CONFIG['host'],
234 253 DB_CONFIG['port'],
... ... @@ -242,7 +261,9 @@ def main():
242 261 recent_start_date, _ = get_time_range(args.recent_days)
243 262 new_start_date, _ = get_time_range(args.new_days)
244 263  
245   - print(f"Fetching data from {start_date} to {end_date}...")
  264 + logger.info(f"Fetching data from {start_date} to {end_date}")
  265 + logger.debug(f"Hot-items start date: {recent_start_date}")
  266 + logger.debug(f"New-items start date: {new_start_date}")
246 267  
247 268 # SQL query - fetch user behavior data (with user features and product categories)
248 269 sql_query = f"""
... ... @@ -279,9 +300,12 @@ def main():
279 300 se.create_time
280 301 """
281 302  
282   - print("Executing SQL query...")
  303 + logger.info("Executing SQL query...")
283 304 df = pd.read_sql(sql_query, engine)
284   - print(f"Fetched {len(df)} records")
  305 + logger.info(f"Fetched {len(df)} records")
  306 +
  307 + # Log data info
  308 + log_dataframe_info(logger, df, "user behavior data")
285 309  
286 310 # Convert time columns
287 311 df['create_time'] = pd.to_datetime(df['create_time'])
... ... @@ -289,37 +313,70 @@ def main():
289 313  
290 314 # Define behavior weights
291 315 behavior_weights = INTEREST_AGGREGATION_CONFIG['behavior_weights']
  316 + logger.debug(f"Behavior weights: {behavior_weights}")
292 317  
293 318 # Prepare the different dataset types
  319 + log_processing_step(logger, "Preparing dataset types")
294 320  
295 321 # 1. Hot items: high-interaction items in the last N days
296 322 df_hot = df[df['create_time'] >= recent_start_date].copy()
  323 + logger.info(f"Hot-items dataset: {len(df_hot)} records")
297 324  
298 325 # 2. Cart items: add-to-cart behavior
299 326 df_cart = df[df['event_type'].isin(['addToCart', 'addToPool'])].copy()
  327 + logger.info(f"Cart-items dataset: {len(df_cart)} records")
300 328  
301 329 # 3. New items: items created within the last N days
302 330 df_new = df[df['item_create_time'] >= new_start_date].copy()
  331 + logger.info(f"New-items dataset: {len(df_new)} records")
303 332  
304 333 # Generate indices per list type
305   - print("\n=== Generating indices ===")
  334 + log_processing_step(logger, "Generating indices per list type")
306 335 list_type_indices = generate_list_type_indices(
307 336 df_hot, df_cart, df_new, behavior_weights
308 337 )
  338 + logger.info(f"Generated indices for {len(list_type_indices)} list types")
  339 +
  340 + # Fetch name mappings for debug output
  341 + name_mappings = {}
  342 + if args.debug:
  343 + logger.info("Fetching item name mappings...")
  344 + name_mappings = fetch_name_mappings(engine, debug=True)
309 345  
310 346 # Write indices
  347 + log_processing_step(logger, "Saving index files")
311 348 for list_type, aggregations in list_type_indices.items():
312 349 output_prefix = f'{args.output_prefix}_{list_type}'
  350 + logger.info(f"Saving {list_type} indices...")
313 351 output_indices(aggregations, output_prefix, top_n=args.top_n)
  352 +
  353 + # If debug mode is enabled, save a human-readable format
  354 + if args.debug and aggregations:
  355 + for dim_key, items in aggregations.items():
  356 + if items:
  357 + # Generate a readable index per dimension
  358 + result_dict = {dim_key: items[:args.top_n]}
  359 + output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt')
  360 + if os.path.exists(output_file):
  361 + save_readable_index(
  362 + output_file,
  363 + result_dict,
  364 + name_mappings,
  365 + index_type=f'interest:{list_type}:{dim_key}',
  366 + logger=logger
  367 + )
314 368  
315 369 # Generate global indices (all data)
316   - print("\nGenerating global indices...")
  370 + log_processing_step(logger, "Generating global indices")
317 371 global_aggregations = aggregate_by_dimensions(
318 372 df, behavior_weights, time_decay=True, decay_factor=args.decay_factor
319 373 )
  374 + logger.info("Saving global indices...")
320 375 output_indices(global_aggregations, f'{args.output_prefix}_global', top_n=args.top_n)
321 376  
322   - print("\n=== All indices generated successfully! ===")
  377 + logger.info("="*80)
  378 + logger.info("All indices generated successfully!")
  379 + logger.info("="*80)
323 380  
324 381  
325 382 if __name__ == '__main__':
... ...
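
The body of calculate_time_weight (its signature is visible at the top of this file's diff) is not shown in the hunks. A plausible exponential-decay implementation, as an assumption consistent with the decay_factor and days_unit parameters:

def calculate_time_weight(event_time, reference_time, decay_factor=0.95, days_unit=30):
    """Weight decays by a factor of `decay_factor` for every `days_unit` days of event age."""
    days_elapsed = max((reference_time - event_time).days, 0)  # clamp future events to 0 age
    return decay_factor ** (days_elapsed / days_unit)

# e.g. with the defaults, an event 60 days old gets weight 0.95 ** 2 = 0.9025
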