Commit 40442baf1b796509110529e7ce705b24450e5da6

Authored by tangwang
1 parent 9832fef6

offline tasks: fix bugs in i2i swing / hot / session w2v

offline_tasks/run.sh
... ... @@ -6,6 +6,12 @@ cd /home/tw/recommendation/offline_tasks
6 6 # 2. Test the connection
7 7 # python3 test_connection.py
8 8  
  9 +ps -ef | grep run_all.py | grep -v grep | awk '{print $2}' | xargs -r kill -9
  10 +ps -ef | grep recommendation | grep -v grep | awk '{print $2}' | xargs -r kill -9
  11 +rm -rf output/*
  12 +rm -rf logs/*
  13 +
  14 +
9 15 # 3. Run in debug mode (small data volume)
10 16 python3 run_all.py --lookback_days 7 --top_n 10 --debug
11 17  
... ...
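
The cleanup lines added above kill any previous run by pattern-matching `ps` output; the `grep -v grep` keeps the pipeline from killing its own grep. A minimal Python sketch of the same cleanup using psutil (a third-party package, assumed available; not part of this commit), which can skip the current process by PID instead:

import os
import psutil

def kill_matching(pattern):
    # Iterate all processes, kill any whose command line contains `pattern`,
    # excluding this script's own process.
    for proc in psutil.process_iter(['pid', 'cmdline']):
        cmdline = ' '.join(proc.info['cmdline'] or [])
        if pattern in cmdline and proc.info['pid'] != os.getpid():
            try:
                proc.kill()  # SIGKILL, mirroring `kill -9`
            except psutil.NoSuchProcess:
                pass  # process exited between listing and kill

kill_matching('run_all.py')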
offline_tasks/scripts/i2i_content_similar.py
... ... @@ -107,24 +107,37 @@ def build_feature_text(row):
107 107 return ' '.join(features)
108 108  
109 109  
110   -def calculate_content_similarity(df, top_n=50):
  110 +def calculate_content_similarity(df, top_n=50, logger=None):
111 111 """
112   - Compute content-based similarity
  112 + Compute content-based similarity (memory-optimized version)
113 113 """
114   - print("Building feature texts...")
  114 +
  115 + if logger:
  116 + logger.info("Building feature texts...")
  117 + else:
  118 + print("Building feature texts...")
115 119 df['feature_text'] = df.apply(build_feature_text, axis=1)
116 120  
117   - print("Calculating TF-IDF...")
  121 + if logger:
  122 + logger.info("Calculating TF-IDF...")
  123 + else:
  124 + print("Calculating TF-IDF...")
118 125 vectorizer = TfidfVectorizer(max_features=1000)
119 126 tfidf_matrix = vectorizer.fit_transform(df['feature_text'])
120 127  
121   - print("Calculating cosine similarity...")
122   - # Compute similarity in batches to save memory
  128 + if logger:
  129 + logger.info(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
  130 + logger.info("Calculating cosine similarity (memory-optimized mode)...")
  131 + else:
  132 + print("Calculating cosine similarity...")
  133 +
123 134 batch_size = 1000
124 135 result = {}
125 136  
126 137 for i in range(0, len(df), batch_size):
127 138 end_i = min(i + batch_size, len(df))
  139 +
  140 + # Compute similarity in batches
128 141 batch_similarity = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix)
129 142  
130 143 for j, idx in enumerate(range(i, end_i)):
... ... @@ -144,8 +157,11 @@ def calculate_content_similarity(df, top_n=50):
144 157  
145 158 if similar_items:
146 159 result[item_id] = similar_items
147   -
148   - print(f"Processed {end_i}/{len(df)} products...")
  160 +
  161 + if logger:
  162 + logger.info(f"Processed {end_i}/{len(df)} products...")
  163 + else:
  164 + print(f"Processed {end_i}/{len(df)} products...")
149 165  
150 166 return result
151 167  
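
The loop above bounds memory by materializing only a (batch_size x n) slice of the similarity matrix at a time rather than the full n x n. A self-contained sketch of the same pattern, assuming a DataFrame with illustrative `feature_text` and `item_id` columns (names are assumptions, not the repo's exact schema):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def batched_top_n(df, top_n=50, batch_size=1000):
    tfidf = TfidfVectorizer(max_features=1000).fit_transform(df['feature_text'])
    result = {}
    for i in range(0, tfidf.shape[0], batch_size):
        end_i = min(i + batch_size, tfidf.shape[0])
        # Only a (batch_size x n) dense block exists at any time.
        sims = cosine_similarity(tfidf[i:end_i], tfidf)
        for j, idx in enumerate(range(i, end_i)):
            row = sims[j]
            row[idx] = -1.0  # mask self-similarity
            top = np.argsort(-row)[:top_n]
            result[df['item_id'].iloc[idx]] = [
                (df['item_id'].iloc[k], float(row[k])) for k in top if row[k] > 0
            ]
    return result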
... ... @@ -257,14 +273,16 @@ def main():
257 273 log_processing_step(logger, f"Calculating similarity (method: {args.method})")
258 274 if args.method == 'tfidf':
259 275 logger.info("Using the TF-IDF method...")
260   - result = calculate_content_similarity(df, args.top_n)
  276 + result = calculate_content_similarity(df, args.top_n, logger=logger)
261 277 elif args.method == 'category':
262 278 logger.info("Using the category-based method...")
263 279 result = calculate_category_based_similarity(df)
264 280 else: # hybrid
265 281 logger.info("Using the hybrid method (TF-IDF 70% + category 30%)...")
266   - tfidf_sim = calculate_content_similarity(df, args.top_n)
  282 + tfidf_sim = calculate_content_similarity(df, args.top_n, logger=logger)
  283 + logger.info("Calculating category-based similarity...")
267 284 category_sim = calculate_category_based_similarity(df)
  285 + logger.info("Merging similarities...")
268 286 result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3)
269 287  
270 288 logger.info(f"Generated similarity lists for {len(result)} items")
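
`merge_similarities` itself is not shown in this diff. A hypothetical shape consistent with the call above, assuming both inputs map item_id to a list of (item_id, score) pairs:

def merge_similarities(sim1, sim2, weight1=0.7, weight2=0.3):
    # Weighted union of two similarity dicts; items present in only one
    # source contribute only that source's weighted score.
    merged = {}
    for item_id in set(sim1) | set(sim2):
        scores = {}
        for sims, w in ((sim1.get(item_id, []), weight1),
                        (sim2.get(item_id, []), weight2)):
            for other, score in sims:
                scores[other] = scores.get(other, 0.0) + w * score
        merged[item_id] = sorted(scores.items(), key=lambda x: -x[1])
    return merged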
... ... @@ -307,8 +325,7 @@ def main():
307 325 output_file,
308 326 result,
309 327 name_mappings,
310   - index_type=f'i2i:content:{args.method}',
311   - logger=logger
  328 + description=f'i2i:content:{args.method}'
312 329 )
313 330  
314 331  
... ...
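
The `index_type`/`logger` to `description` change recurs across the call sites in this commit; the shared helper's new signature is implied rather than shown. A hypothetical sketch consistent with those calls (append mode is an assumption, suggested by the `os.path.exists` guard in interest_aggregation.py; the real utils module is not in this diff):

def save_readable_index(output_file, result, name_mappings, description=''):
    # Append a human-readable index section: a description header, then one
    # line per item mapping its display name to its neighbor list.
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(f'# {description}\n')
        for item_id, neighbors in result.items():
            name = name_mappings.get(item_id, str(item_id))
            f.write(f'{name}\t{neighbors}\n')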
offline_tasks/scripts/i2i_deepwalk.py
... ... @@ -372,8 +372,7 @@ def main():
372 372 output_file,
373 373 result,
374 374 name_mappings,
375   - index_type='i2i:deepwalk',
376   - logger=logger
  375 + description='i2i:deepwalk'
377 376 )
378 377  
379 378  
... ...
offline_tasks/scripts/i2i_session_w2v.py
... ... @@ -185,7 +185,8 @@ def main():
185 185 'epochs': args.epochs,
186 186 'top_n': args.top_n,
187 187 'lookback_days': args.lookback_days,
188   - 'session_gap_minutes': args.session_gap,
  188 + 'max_session_length': args.max_session_length,
  189 + 'min_session_length': args.min_session_length,
189 190 'debug': args.debug
190 191 }
191 192 log_algorithm_params(logger, params)
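
The logged parameters switch from a session gap to explicit session length bounds. A hedged sketch of how such bounds might be applied to word2vec training sessions; the helper name and its placement are assumptions, only the two parameter names come from the diff:

def clip_sessions(sessions, min_session_length=2, max_session_length=50):
    # Truncate overly long sessions and drop ones too short to train on.
    clipped = []
    for s in sessions:
        s = s[:max_session_length]
        if len(s) >= min_session_length:
            clipped.append(s)
    return clipped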
... ... @@ -303,8 +304,7 @@ def main():
303 304 output_file,
304 305 result,
305 306 name_mappings,
306   - index_type='i2i:session_w2v',
307   - logger=logger
  307 + description='i2i:session_w2v'
308 308 )
309 309  
310 310  
... ...
offline_tasks/scripts/interest_aggregation.py
... ... @@ -356,16 +356,16 @@ def main():
356 356 if args.debug and aggregations:
357 357 for dim_key, items in aggregations.items():
358 358 if items:
359   - # Generate a readable index for each dimension
360   - result_dict = {dim_key: items[:args.top_n]}
  359 + # Generate a readable index for each dimension - sort first, then take top N
  360 + sorted_items = sorted(items.items(), key=lambda x: -x[1])[:args.top_n]
  361 + result_dict = {dim_key: sorted_items}
361 362 output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt')
362 363 if os.path.exists(output_file):
363 364 save_readable_index(
364 365 output_file,
365 366 result_dict,
366 367 name_mappings,
367   - index_type=f'interest:{list_type}:{dim_key}',
368   - logger=logger
  368 + description=f'interest:{list_type}:{dim_key}'
369 369 )
370 370  
371 371 # Generate the global index (all data)
... ...
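
The sort-before-slice fix above matters because the old `items[:args.top_n]` took an arbitrary, unranked prefix, and would raise a TypeError outright if `items` is a dict, as the new `items.items()` call suggests. A small worked example with illustrative scores:

# Pick the N highest-scoring entries, not the first N in dict order.
items = {'a': 0.2, 'b': 0.9, 'c': 0.5}
top_n = 2
sorted_items = sorted(items.items(), key=lambda x: -x[1])[:top_n]
print(sorted_items)  # [('b', 0.9), ('c', 0.5)]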