Commit 40442baf1b796509110529e7ce705b24450e5da6

Authored by tangwang
1 parent 9832fef6

offline tasks: fix bugs of i2i swing / hot / sessionw2v

offline_tasks/run.sh
@@ -6,6 +6,12 @@ cd /home/tw/recommendation/offline_tasks
 # 2. 测试连接
 # python3 test_connection.py
 
+ps -ef|grep run_all.py | awk '{print $2}' | xargs kill -9
+ps -ef|grep recommendation | awk '{print $2}' | xargs kill -9
+rm output/* -rf
+rm logs/* -rf
+
+
 # 3. 调试模式运行(小数据量)
 python3 run_all.py --lookback_days 7 --top_n 10 --debug
 
offline_tasks/scripts/i2i_content_similar.py
@@ -107,24 +107,37 @@ def build_feature_text(row):
     return ' '.join(features)
 
 
-def calculate_content_similarity(df, top_n=50):
+def calculate_content_similarity(df, top_n=50, logger=None):
     """
-    基于内容计算相似度
+    基于内容计算相似度(内存优化版)
     """
-    print("Building feature texts...")
+
+    if logger:
+        logger.info("构建特征文本...")
+    else:
+        print("Building feature texts...")
     df['feature_text'] = df.apply(build_feature_text, axis=1)
 
-    print("Calculating TF-IDF...")
+    if logger:
+        logger.info("计算 TF-IDF...")
+    else:
+        print("Calculating TF-IDF...")
     vectorizer = TfidfVectorizer(max_features=1000)
     tfidf_matrix = vectorizer.fit_transform(df['feature_text'])
 
-    print("Calculating cosine similarity...")
-    # 分批计算相似度以节省内存
+    if logger:
+        logger.info(f"TF-IDF 矩阵形状: {tfidf_matrix.shape}")
+        logger.info("开始计算余弦相似度(内存优化模式)...")
+    else:
+        print("Calculating cosine similarity...")
+
     batch_size = 1000
     result = {}
 
     for i in range(0, len(df), batch_size):
         end_i = min(i + batch_size, len(df))
+
+        # 分批计算相似度
         batch_similarity = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix)
 
         for j, idx in enumerate(range(i, end_i)):
@@ -144,8 +157,11 @@ def calculate_content_similarity(df, top_n=50):
 
             if similar_items:
                 result[item_id] = similar_items
-
-        print(f"Processed {end_i}/{len(df)} products...")
+
+        if logger:
+            logger.info(f"已处理 {end_i}/{len(df)} 个商品...")
+        else:
+            print(f"Processed {end_i}/{len(df)} products...")
 
     return result
 
@@ -257,14 +273,16 @@ def main():
     log_processing_step(logger, f"计算相似度 (方法: {args.method})")
     if args.method == 'tfidf':
         logger.info("使用 TF-IDF 方法...")
-        result = calculate_content_similarity(df, args.top_n)
+        result = calculate_content_similarity(df, args.top_n, logger=logger)
     elif args.method == 'category':
         logger.info("使用基于分类的方法...")
         result = calculate_category_based_similarity(df)
     else: # hybrid
         logger.info("使用混合方法 (TF-IDF 70% + 分类 30%)...")
-        tfidf_sim = calculate_content_similarity(df, args.top_n)
+        tfidf_sim = calculate_content_similarity(df, args.top_n, logger=logger)
+        logger.info("计算基于分类的相似度...")
         category_sim = calculate_category_based_similarity(df)
+        logger.info("合并相似度...")
         result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3)
 
     logger.info(f"为 {len(result)} 个物品生成了相似度")
@@ -307,8 +325,7 @@ def main():
         output_file,
         result,
         name_mappings,
-        index_type=f'i2i:content:{args.method}',
-        logger=logger
+        description=f'i2i:content:{args.method}'
     )
 
314 331
offline_tasks/scripts/i2i_deepwalk.py
@@ -372,8 +372,7 @@ def main():
         output_file,
         result,
         name_mappings,
-        index_type='i2i:deepwalk',
-        logger=logger
+        description='i2i:deepwalk'
     )
 
 
offline_tasks/scripts/i2i_session_w2v.py
@@ -185,7 +185,8 @@ def main():
         'epochs': args.epochs,
         'top_n': args.top_n,
         'lookback_days': args.lookback_days,
-        'session_gap_minutes': args.session_gap,
+        'max_session_length': args.max_session_length,
+        'min_session_length': args.min_session_length,
         'debug': args.debug
     }
     log_algorithm_params(logger, params)
@@ -303,8 +304,7 @@ def main():
         output_file,
         result,
         name_mappings,
-        index_type='i2i:session_w2v',
-        logger=logger
+        description='i2i:session_w2v'
    )
 
 
offline_tasks/scripts/interest_aggregation.py
@@ -356,16 +356,16 @@ def main():
     if args.debug and aggregations:
         for dim_key, items in aggregations.items():
             if items:
-                # 为每个维度生成可读索引
-                result_dict = {dim_key: items[:args.top_n]}
+                # 为每个维度生成可读索引 - 先排序再取前N个
+                sorted_items = sorted(items.items(), key=lambda x: -x[1])[:args.top_n]
+                result_dict = {dim_key: sorted_items}
                 output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt')
                 if os.path.exists(output_file):
                     save_readable_index(
                         output_file,
                         result_dict,
                         name_mappings,
-                        index_type=f'interest:{list_type}:{dim_key}',
-                        logger=logger
+                        description=f'interest:{list_type}:{dim_key}'
                     )
 
     # 生成全局索引(所有数据)