Commit 40442baf1b796509110529e7ce705b24450e5da6
1 parent: 9832fef6
offline tasks: fix bugs in i2i swing / hot / session_w2v
Showing 5 changed files with 43 additions and 21 deletions.
Show diff stats
offline_tasks/run.sh
| @@ -6,6 +6,12 @@ cd /home/tw/recommendation/offline_tasks | @@ -6,6 +6,12 @@ cd /home/tw/recommendation/offline_tasks | ||
| 6 | # 2. 测试连接 | 6 | # 2. 测试连接 |
| 7 | # python3 test_connection.py | 7 | # python3 test_connection.py |
| 8 | 8 | ||
| 9 | +ps -ef|grep run_all.py | awk '{print $2}' | xargs kill -9 | ||
| 10 | +ps -ef|grep recommendation | awk '{print $2}' | xargs kill -9 | ||
| 11 | +rm output/* -rf | ||
| 12 | +rm logs/* -rf | ||
| 13 | + | ||
| 14 | + | ||
| 9 | # 3. 调试模式运行(小数据量) | 15 | # 3. 调试模式运行(小数据量) |
| 10 | python3 run_all.py --lookback_days 7 --top_n 10 --debug | 16 | python3 run_all.py --lookback_days 7 --top_n 10 --debug |
| 11 | 17 |
offline_tasks/scripts/i2i_content_similar.py
| @@ -107,24 +107,37 @@ def build_feature_text(row): | @@ -107,24 +107,37 @@ def build_feature_text(row): | ||
| 107 | return ' '.join(features) | 107 | return ' '.join(features) |
| 108 | 108 | ||
| 109 | 109 | ||
| 110 | -def calculate_content_similarity(df, top_n=50): | 110 | +def calculate_content_similarity(df, top_n=50, logger=None): |
| 111 | """ | 111 | """ |
| 112 | - 基于内容计算相似度 | 112 | + 基于内容计算相似度(内存优化版) |
| 113 | """ | 113 | """ |
| 114 | - print("Building feature texts...") | 114 | + |
| 115 | + if logger: | ||
| 116 | + logger.info("构建特征文本...") | ||
| 117 | + else: | ||
| 118 | + print("Building feature texts...") | ||
| 115 | df['feature_text'] = df.apply(build_feature_text, axis=1) | 119 | df['feature_text'] = df.apply(build_feature_text, axis=1) |
| 116 | 120 | ||
| 117 | - print("Calculating TF-IDF...") | 121 | + if logger: |
| 122 | + logger.info("计算 TF-IDF...") | ||
| 123 | + else: | ||
| 124 | + print("Calculating TF-IDF...") | ||
| 118 | vectorizer = TfidfVectorizer(max_features=1000) | 125 | vectorizer = TfidfVectorizer(max_features=1000) |
| 119 | tfidf_matrix = vectorizer.fit_transform(df['feature_text']) | 126 | tfidf_matrix = vectorizer.fit_transform(df['feature_text']) |
| 120 | 127 | ||
| 121 | - print("Calculating cosine similarity...") | ||
| 122 | - # 分批计算相似度以节省内存 | 128 | + if logger: |
| 129 | + logger.info(f"TF-IDF 矩阵形状: {tfidf_matrix.shape}") | ||
| 130 | + logger.info("开始计算余弦相似度(内存优化模式)...") | ||
| 131 | + else: | ||
| 132 | + print("Calculating cosine similarity...") | ||
| 133 | + | ||
| 123 | batch_size = 1000 | 134 | batch_size = 1000 |
| 124 | result = {} | 135 | result = {} |
| 125 | 136 | ||
| 126 | for i in range(0, len(df), batch_size): | 137 | for i in range(0, len(df), batch_size): |
| 127 | end_i = min(i + batch_size, len(df)) | 138 | end_i = min(i + batch_size, len(df)) |
| 139 | + | ||
| 140 | + # 分批计算相似度 | ||
| 128 | batch_similarity = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix) | 141 | batch_similarity = cosine_similarity(tfidf_matrix[i:end_i], tfidf_matrix) |
| 129 | 142 | ||
| 130 | for j, idx in enumerate(range(i, end_i)): | 143 | for j, idx in enumerate(range(i, end_i)): |
| @@ -144,8 +157,11 @@ def calculate_content_similarity(df, top_n=50): | @@ -144,8 +157,11 @@ def calculate_content_similarity(df, top_n=50): | ||
| 144 | 157 | ||
| 145 | if similar_items: | 158 | if similar_items: |
| 146 | result[item_id] = similar_items | 159 | result[item_id] = similar_items |
| 147 | - | ||
| 148 | - print(f"Processed {end_i}/{len(df)} products...") | 160 | + |
| 161 | + if logger: | ||
| 162 | + logger.info(f"已处理 {end_i}/{len(df)} 个商品...") | ||
| 163 | + else: | ||
| 164 | + print(f"Processed {end_i}/{len(df)} products...") | ||
| 149 | 165 | ||
| 150 | return result | 166 | return result |
| 151 | 167 | ||
| @@ -257,14 +273,16 @@ def main(): | @@ -257,14 +273,16 @@ def main(): | ||
| 257 | log_processing_step(logger, f"计算相似度 (方法: {args.method})") | 273 | log_processing_step(logger, f"计算相似度 (方法: {args.method})") |
| 258 | if args.method == 'tfidf': | 274 | if args.method == 'tfidf': |
| 259 | logger.info("使用 TF-IDF 方法...") | 275 | logger.info("使用 TF-IDF 方法...") |
| 260 | - result = calculate_content_similarity(df, args.top_n) | 276 | + result = calculate_content_similarity(df, args.top_n, logger=logger) |
| 261 | elif args.method == 'category': | 277 | elif args.method == 'category': |
| 262 | logger.info("使用基于分类的方法...") | 278 | logger.info("使用基于分类的方法...") |
| 263 | result = calculate_category_based_similarity(df) | 279 | result = calculate_category_based_similarity(df) |
| 264 | else: # hybrid | 280 | else: # hybrid |
| 265 | logger.info("使用混合方法 (TF-IDF 70% + 分类 30%)...") | 281 | logger.info("使用混合方法 (TF-IDF 70% + 分类 30%)...") |
| 266 | - tfidf_sim = calculate_content_similarity(df, args.top_n) | 282 | + tfidf_sim = calculate_content_similarity(df, args.top_n, logger=logger) |
| 283 | + logger.info("计算基于分类的相似度...") | ||
| 267 | category_sim = calculate_category_based_similarity(df) | 284 | category_sim = calculate_category_based_similarity(df) |
| 285 | + logger.info("合并相似度...") | ||
| 268 | result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3) | 286 | result = merge_similarities(tfidf_sim, category_sim, weight1=0.7, weight2=0.3) |
| 269 | 287 | ||
| 270 | logger.info(f"为 {len(result)} 个物品生成了相似度") | 288 | logger.info(f"为 {len(result)} 个物品生成了相似度") |
| @@ -307,8 +325,7 @@ def main(): | @@ -307,8 +325,7 @@ def main(): | ||
| 307 | output_file, | 325 | output_file, |
| 308 | result, | 326 | result, |
| 309 | name_mappings, | 327 | name_mappings, |
| 310 | - index_type=f'i2i:content:{args.method}', | ||
| 311 | - logger=logger | 328 | + description=f'i2i:content:{args.method}' |
| 312 | ) | 329 | ) |
| 313 | 330 | ||
| 314 | 331 |
offline_tasks/scripts/i2i_deepwalk.py
offline_tasks/scripts/i2i_session_w2v.py
| @@ -185,7 +185,8 @@ def main(): | @@ -185,7 +185,8 @@ def main(): | ||
| 185 | 'epochs': args.epochs, | 185 | 'epochs': args.epochs, |
| 186 | 'top_n': args.top_n, | 186 | 'top_n': args.top_n, |
| 187 | 'lookback_days': args.lookback_days, | 187 | 'lookback_days': args.lookback_days, |
| 188 | - 'session_gap_minutes': args.session_gap, | 188 | + 'max_session_length': args.max_session_length, |
| 189 | + 'min_session_length': args.min_session_length, | ||
| 189 | 'debug': args.debug | 190 | 'debug': args.debug |
| 190 | } | 191 | } |
| 191 | log_algorithm_params(logger, params) | 192 | log_algorithm_params(logger, params) |
| @@ -303,8 +304,7 @@ def main(): | @@ -303,8 +304,7 @@ def main(): | ||
| 303 | output_file, | 304 | output_file, |
| 304 | result, | 305 | result, |
| 305 | name_mappings, | 306 | name_mappings, |
| 306 | - index_type='i2i:session_w2v', | ||
| 307 | - logger=logger | 307 | + description='i2i:session_w2v' |
| 308 | ) | 308 | ) |
| 309 | 309 | ||
| 310 | 310 |
offline_tasks/scripts/interest_aggregation.py
| @@ -356,16 +356,16 @@ def main(): | @@ -356,16 +356,16 @@ def main(): | ||
| 356 | if args.debug and aggregations: | 356 | if args.debug and aggregations: |
| 357 | for dim_key, items in aggregations.items(): | 357 | for dim_key, items in aggregations.items(): |
| 358 | if items: | 358 | if items: |
| 359 | - # 为每个维度生成可读索引 | ||
| 360 | - result_dict = {dim_key: items[:args.top_n]} | 359 | + # 为每个维度生成可读索引 - 先排序再取前N个 |
| 360 | + sorted_items = sorted(items.items(), key=lambda x: -x[1])[:args.top_n] | ||
| 361 | + result_dict = {dim_key: sorted_items} | ||
| 361 | output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt') | 362 | output_file = os.path.join(OUTPUT_DIR, f'{output_prefix}_{dim_key}_{datetime.now().strftime("%Y%m%d")}.txt') |
| 362 | if os.path.exists(output_file): | 363 | if os.path.exists(output_file): |
| 363 | save_readable_index( | 364 | save_readable_index( |
| 364 | output_file, | 365 | output_file, |
| 365 | result_dict, | 366 | result_dict, |
| 366 | name_mappings, | 367 | name_mappings, |
| 367 | - index_type=f'interest:{list_type}:{dim_key}', | ||
| 368 | - logger=logger | 368 | + description=f'interest:{list_type}:{dim_key}' |
| 369 | ) | 369 | ) |
| 370 | 370 | ||
| 371 | # 生成全局索引(所有数据) | 371 | # 生成全局索引(所有数据) |