Commit 40442baf1b796509110529e7ce705b24450e5da6
1 parent: 9832fef6
offline tasks: fix bugs in i2i swing / hot / session_w2v
Showing 5 changed files with 43 additions and 21 deletions. Show diff stats
offline_tasks/run.sh
| ... | ... | @@ -6,6 +6,12 @@ cd /home/tw/recommendation/offline_tasks |
| 6 | 6 | # 2. 测试连接 |
| 7 | 7 | # python3 test_connection.py |
| 8 | 8 | |
| 9 | +ps -ef|grep run_all.py | awk '{print $2}' | xargs kill -9 | |
| 10 | +ps -ef|grep recommendation | awk '{print $2}' | xargs kill -9 | |
| 11 | +rm output/* -rf | |
| 12 | +rm logs/* -rf | |
| 13 | + | |
| 14 | + | |
| 9 | 15 | # 3. 调试模式运行(小数据量) |
| 10 | 16 | python3 run_all.py --lookback_days 7 --top_n 10 --debug |
| 11 | 17 | ... | ... |
offline_tasks/scripts/i2i_content_similar.py
| ... | ... | @@ -107,24 +107,37 @@ def build_feature_text(row): |
| 107 | 107 | return ' '.join(features) |
| 108 | 108 | |
| 109 | 109 | |
| 110 | -def calculate_content_similarity(df, top_n=50): | |
| 110 | +def calculate_content_similarity(df, top_n=50, logger=None): | |
| 111 | 111 | """ |
| 112 | - 基于内容计算相似度 | |
| 112 | + 基于内容计算相似度(内存优化版) | |
| 113 | 113 | """ |
| 114 | - print("Building feature texts...") | |
| 114 | + | |
| 115 | + if logger: | |
| 116 | + logger.info("构建特征文本...") | |
| 117 | + else: | |
| 118 | + print("Building feature texts...") | |
| 115 | 119 | df['feature_text'] = df.apply(build_feature_text, axis=1) |
| 116 | 120 | |
| 117 | - print("Calculating TF-IDF...") | |
| 121 | + if logger: | |
| 122 | + logger.info("计算 TF-IDF...") | |
| 123 | + else: | |
| 124 | + print("Calculating TF-IDF...") | |
| 118 | 125 | vectorizer = TfidfVectorizer(max_features=1000) |
| 119 | 126 | tfidf_matrix = vectorizer.fit_transform(df['feature_text']) |
| 120 | 127 | |
| 121 | - print("Calculating cosine similarity...") | |
| 122 | - # 分批计算相似度以节省内存 | |
| 128 | + if logger: | |
| 129 | + logger.info(f"TF-IDF 矩阵形状: {tfidf_matrix.shape}") | |
| 130 | + logger.info("开始计算余弦相似度(内存优化模式)...") | |
| 131 | + else: | |
| 132 | + print("Calculating cosine similarity...") | |
| 133 | + | |
| 123 | 134 | batch_size = 1000 |
| 124 | 135 | result = {} |
| 125 | 136 | |
| 126 | 137 | for i in range(0, len(df), batch_size): |
| 127 | 138 | end_i = min(i + batch_size, len(df)) |
| 139 | + | |
| 140 | + # 分批计算相似度 | |
| 128 | 141 | batch_similarity = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix) |
| 129 | 142 | |
| 130 | 143 | for j, idx in enumerate(range(i, end_i)): |
| ... | ... | @@ -144,8 +157,11 @@ def calculate_content_similarity(df, top_n=50): |
| 144 | 157 | |
| 145 | 158 | if similar_items: |
| 146 | 159 | result[item_id] = similar_items |
| 147 | - | |
| 148 | - print(f"Processed {end_i}/{len(df)} products...") | |
| 160 | + | |
| 161 | + if logger: | |
| 162 | + logger.info(f"已处理 {end_i}/{len(df)} 个商品...") | |
| 163 | + else: | |
| 164 | + print(f"Processed {end_i}/{len(df)} products...") | |
| 149 | 165 | |
| 150 | 166 | return result |
| 151 | 167 | |
| ... | ... | @@ -257,14 +273,16 @@ def main(): |
| 257 | 273 | log_processing_step(logger, f"计算相似度 (方法: {args.method})") |
| 258 | 274 | if args.method == 'tfidf': |
| 259 | 275 | logger.info("使用 TF-IDF 方法...") |
| 260 | - result = calculate_content_similarity(df, args.top_n) | |
| 276 | + result = calculate_content_similarity(df, args.top_n, logger=logger) | |
| 261 | 277 | elif args.method == 'category': |
| 262 | 278 | logger.info("使用基于分类的方法...") |
| 263 | 279 | result = calculate_category_based_similarity(df) |
| 264 | 280 | else: # hybrid |
| 265 | 281 | logger.info("使用混合方法 (TF-IDF 70% + 分类 30%)...") |
| 266 | - tfidf_sim = calculate_content_similarity(df, args.top_n) | |
| 282 | + tfidf_sim = calculate_content_similarity(df, args.top_n, logger=logger) | |
| 283 | + logger.info("计算基于分类的相似度...") | |
| 267 | 284 | category_sim = calculate_category_based_similarity(df) |
| 285 | + logger.info("合并相似度...") | |
| 268 | 286 | result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3) |
| 269 | 287 | |
| 270 | 288 | logger.info(f"为 {len(result)} 个物品生成了相似度") |
| ... | ... | @@ -307,8 +325,7 @@ def main(): |
| 307 | 325 | output_file, |
| 308 | 326 | result, |
| 309 | 327 | name_mappings, |
| 310 | - index_type=f'i2i:content:{args.method}', | |
| 311 | - logger=logger | |
| 328 | + description=f'i2i:content:{args.method}' | |
| 312 | 329 | ) |
| 313 | 330 | |
| 314 | 331 | ... | ... |
offline_tasks/scripts/i2i_deepwalk.py
offline_tasks/scripts/i2i_session_w2v.py
| ... | ... | @@ -185,7 +185,8 @@ def main(): |
| 185 | 185 | 'epochs': args.epochs, |
| 186 | 186 | 'top_n': args.top_n, |
| 187 | 187 | 'lookback_days': args.lookback_days, |
| 188 | - 'session_gap_minutes': args.session_gap, | |
| 188 | + 'max_session_length': args.max_session_length, | |
| 189 | + 'min_session_length': args.min_session_length, | |
| 189 | 190 | 'debug': args.debug |
| 190 | 191 | } |
| 191 | 192 | log_algorithm_params(logger, params) |
| ... | ... | @@ -303,8 +304,7 @@ def main(): |
| 303 | 304 | output_file, |
| 304 | 305 | result, |
| 305 | 306 | name_mappings, |
| 306 | - index_type='i2i:session_w2v', | |
| 307 | - logger=logger | |
| 307 | + description='i2i:session_w2v' | |
| 308 | 308 | ) |
| 309 | 309 | |
| 310 | 310 | ... | ... |
offline_tasks/scripts/interest_aggregation.py
| ... | ... | @@ -356,16 +356,16 @@ def main(): |
| 356 | 356 | if args.debug and aggregations: |
| 357 | 357 | for dim_key, items in aggregations.items(): |
| 358 | 358 | if items: |
| 359 | - # 为每个维度生成可读索引 | |
| 360 | - result_dict = {dim_key: items[:args.top_n]} | |
| 359 | + # 为每个维度生成可读索引 - 先排序再取前N个 | |
| 360 | + sorted_items = sorted(items.items(), key=lambda x: -x[1])[:args.top_n] | |
| 361 | + result_dict = {dim_key: sorted_items} | |
| 361 | 362 | output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt') |
| 362 | 363 | if os.path.exists(output_file): |
| 363 | 364 | save_readable_index( |
| 364 | 365 | output_file, |
| 365 | 366 | result_dict, |
| 366 | 367 | name_mappings, |
| 367 | - index_type=f'interest:{list_type}:{dim_key}', | |
| 368 | - logger=logger | |
| 368 | + description=f'interest:{list_type}:{dim_key}' | |
| 369 | 369 | ) |
| 370 | 370 | |
| 371 | 371 | # 生成全局索引(所有数据) | ... | ... |