Commit 5b954396cc1527673ee950a0b3ba24748942ab52
1 parent: d9914b07

add cos sim

Showing 3 changed files with 274 additions and 2 deletions.
offline_tasks/run.sh

@@ -171,12 +171,26 @@ if [ $? -ne 0 ]; then
     echo "⚠️ Content similarity failed, but continuing"
 fi
 
+# Task 5: Item behavior similarity
+run_task "Task 5: Item behavior similarity" \
+    "python3 scripts/i2i_item_behavior.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
+if [ $? -ne 0 ]; then
+    echo "⚠️ Item behavior similarity failed, but continuing"
+fi
+
+# Task 6: Tag category similarity
+run_task "Task 6: Tag category similarity" \
+    "python3 scripts/tag_category_similar.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
+if [ $? -ne 0 ]; then
+    echo "⚠️ Tag category similarity failed, but continuing"
+fi
+
 # ============================================================================
 # Interest aggregation tasks
 # ============================================================================
 
-# Task 5: Interest aggregation
-run_task "Task 5: Interest aggregation" \
+# Task 7: Interest aggregation
+run_task "Task 7: Interest aggregation" \
     "python3 scripts/interest_aggregation.py --lookback_days $LOOKBACK_DAYS --top_n 1000 $DEBUG_MODE"
 if [ $? -ne 0 ]; then
     echo "⚠️ Interest aggregation failed, but continuing"

@@ -215,6 +229,8 @@ echo " - Item attributes: output/item_attributes_mappings.json"
 echo " - Session files: output/session.txt.*"
 echo " - C++ Swing: collaboration/output/swing_similar.txt"
 echo " - Python algorithms: output/i2i_*.txt"
+echo " - Item behavior similarity: output/i2i_item_behavior_*.txt"
+echo " - Tag category similarity: output/tag_category_similar_*.txt"
 echo " - Interest aggregation: output/interest_aggregation_*.txt"
 echo " - Logs: logs/"
 echo ""
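Both new scripts below score pairs with the same co-occurrence cosine rule: sim(i, j) = cooccur(i, j) / sqrt(freq(i) * freq(j)), where freq counts the sessions containing an item (user-day click groups in the item script, PO orders in the tag script) and cooccur counts sessions containing both. A minimal standalone sketch of that rule, with toy data (the session contents and the expected value are illustrative, not from this repo):

import math
from collections import defaultdict

# Toy sessions: each set is the distinct items one session touched.
sessions = [{"A", "B"}, {"A", "B", "C"}, {"A", "C"}]

freq = defaultdict(int)                          # sessions containing each item
cooccur = defaultdict(lambda: defaultdict(int))  # sessions containing each pair

for items in sessions:
    for i in items:
        freq[i] += 1
        for j in items:
            if i != j:
                cooccur[i][j] += 1

# Cosine similarity over binary session-membership vectors
sim_ab = cooccur["A"]["B"] / math.sqrt(freq["A"] * freq["B"])
print(f"sim(A, B) = {sim_ab:.4f}")  # 2 / sqrt(3 * 2) ≈ 0.8165

One difference between the two scripts: i2i_item_behavior.py builds pairs from the raw click list, so repeated clicks on a pair within one day add extra co-occurrence weight, while tag_category_similar.py deduplicates categories per order first (as this sketch does).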
offline_tasks/scripts/i2i_item_behavior.py (new file)

@@ -0,0 +1,128 @@
+import pandas as pd
+import math
+from collections import defaultdict
+from db_service import create_db_connection
+import argparse
+from datetime import datetime
+import os
+
+def clean_text_field(text):
+    if pd.isna(text):
+        return ''
+    # Strip CR/LF/tab and escape quotes so names cannot break the tab-separated output
+    return str(text).replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace('"', '""').strip()
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description='Compute item similarity from user behavior (Item Similarity)')
+parser.add_argument('--lookback_days', type=int, default=180, help='Lookback window in days, default 180')
+parser.add_argument('--top_n', type=int, default=50, help='Number of similar items kept per item, default 50')
+parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+args = parser.parse_args()
+
+# Database connection settings
+host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com'
+port = '9030'
+database = 'datacenter'
+username = 'readonly'
+password = 'essa1234'
+
+# Create the database connection
+engine = create_db_connection(host, port, database, username, password)
+
+# SQL query: fetch user click sequences
+sql_query = f"""
+SELECT
+    DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,
+    se.anonymous_id AS user_id,
+    se.item_id,
+    pgs.name AS item_name
+FROM
+    sensors_events se
+LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id
+WHERE
+    se.event IN ('contactFactory', 'addToPool', 'addToCart')
+    AND se.create_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
+ORDER BY
+    se.anonymous_id,
+    se.create_time;
+"""
+
+if args.debug:
+    print(f"[DEBUG] Parameters: lookback_days={args.lookback_days}, top_n={args.top_n}")
+    print("[DEBUG] Querying the database...")
+
+# Run the SQL query and load the result into a pandas DataFrame
+df = pd.read_sql(sql_query, engine)
+
+if args.debug:
+    print(f"[DEBUG] Query finished, {len(df)} rows")
+    print(f"[DEBUG] Unique users: {df['user_id'].nunique()}")
+    print(f"[DEBUG] Unique items: {df['item_id'].nunique()}")
+
+# Walk the click sequences and accumulate co-occurrence counts
+cooccur = defaultdict(lambda: defaultdict(int))
+freq = defaultdict(int)
+
+# Group clicks into per-user, per-day sessions
+for (user_id, date), group in df.groupby(['user_id', 'date']):
+    items = group['item_id'].tolist()
+    unique_items = set(items)
+
+    # Update per-item session frequencies
+    for item in unique_items:
+        freq[item] += 1
+
+    # Update pairwise co-occurrence counts
+    for i in range(len(items)):
+        for j in range(i + 1, len(items)):
+            item1, item2 = items[i], items[j]
+            if item1 != item2:
+                cooccur[item1][item2] += 1
+                cooccur[item2][item1] += 1
+
+# Compute cosine similarity
+if args.debug:
+    print("[DEBUG] Computing similarities...")
+
+result = {}
+for item1 in cooccur:
+    sim_scores = []
+    for item2 in cooccur[item1]:
+        numerator = cooccur[item1][item2]
+        denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2])
+        if denominator != 0:
+            score = numerator / denominator
+            sim_scores.append((item2, score))
+    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
+    # Keep only the top_n similar items
+    result[item1] = sim_scores[:args.top_n]
+
+if args.debug:
+    print(f"[DEBUG] Similarity computation finished; {len(result)} items have recommendations")
+
+# Build an item_id -> name mapping
+item_name_map = dict(zip(df['item_id'], df['item_name']))
+
+# Prepare the output path
+date_str = datetime.now().strftime('%Y%m%d')
+output_dir = 'output'
+os.makedirs(output_dir, exist_ok=True)
+output_file = os.path.join(output_dir, f'i2i_item_behavior_{date_str}.txt')
+
+# Write the similar items to file
+if args.debug:
+    print(f"[DEBUG] Writing file: {output_file}")
+
+with open(output_file, 'w', encoding='utf-8') as f:
+    for item_id, sims in sorted(result.items()):
+        item_name = clean_text_field(item_name_map.get(item_id, 'Unknown'))
+        # Format: item_id \t item_name \t similar_id1:score1,similar_id2:score2,...
+        sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
+        f.write(f'{item_id}\t{item_name}\t{sim_str}\n')
+
+print("✓ Item similarity computation complete")
+print(f"  - Output file: {output_file}")
+print(f"  - Items: {len(result)}")
+if result:
+    avg_sims = sum(len(sims) for sims in result.values()) / len(result)
+    print(f"  - Average similar items per item: {avg_sims:.1f}")
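The output is one line per item: item_id, item_name, and a comma-joined list of id:score pairs, all tab-separated. A minimal reader sketch for downstream consumers, assuming exactly the format written above (load_i2i and the date suffix in the usage comment are hypothetical, not part of this repo):

def load_i2i(path):
    """Parse one i2i_item_behavior_*.txt file into {item_id: (item_name, [(sim_id, score), ...])}."""
    sims = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            item_id, item_name, sim_str = line.rstrip('\n').split('\t')
            pairs = []
            for tok in sim_str.split(','):
                if tok:
                    sim_id, score = tok.rsplit(':', 1)  # split on the last colon
                    pairs.append((sim_id, float(score)))
            sims[item_id] = (item_name, pairs)
    return sims

# Usage (hypothetical date suffix):
# table = load_i2i('output/i2i_item_behavior_20250101.txt')

rsplit(':', 1) splits on the last colon, so an id that itself contained ':' would still parse; the score side never does.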
offline_tasks/scripts/tag_category_similar.py (new file)

@@ -0,0 +1,127 @@
+import pandas as pd
+import math
+from collections import defaultdict
+from db_service import create_db_connection
+import argparse
+from datetime import datetime
+import os
+
+def clean_text_field(text):
+    if pd.isna(text):
+        return ''
+    # Strip CR/LF/tab and escape quotes so names cannot break the tab-separated output
+    return str(text).replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace('"', '""').strip()
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description='Compute order-based category similarity (Tag Similarity)')
+parser.add_argument('--lookback_days', type=int, default=180, help='Lookback window in days, default 180')
+parser.add_argument('--top_n', type=int, default=50, help='Number of similar categories kept per category, default 50')
+parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+args = parser.parse_args()
+
+# BPMS database connection settings
+bpms_host = '120.76.244.158'
+bpms_port = '3325'
+bpms_database = 'bpms'
+bpms_username = 'PRD_M1_190311'
+bpms_password = 'WTF)xdbqtW!4gwA7'
+
+# Create the database connection
+engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)
+
+# SQL query: one row per PO, with its concatenated category names
+sql_query = f"""
+SELECT
+    sp.code AS `PO单号`,
+    psm.name AS `区域`,
+    bb.code AS `客户编码`,
+    GROUP_CONCAT(pc_1.name) AS `商品信息`,
+    MIN(spi.order_time) AS `下单货时间`
+FROM sale_po sp
+INNER JOIN sale_po_item spi ON sp.id = spi.po_id
+LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id
+LEFT JOIN prd_goods pg ON pg.id = spi.spu_id
+LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1) -- second level of the dotted category_id path
+LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries
+LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id
+WHERE spi.quantity > 0
+    AND spi.is_delete = 0
+    AND bb.is_delete = 0
+    AND spi.order_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
+GROUP BY sp.code, psm.name, bb.code;
+"""
+
+if args.debug:
+    print(f"[DEBUG] Parameters: lookback_days={args.lookback_days}, top_n={args.top_n}")
+    print("[DEBUG] Querying the database...")
+
+# Run the SQL query and load the result into a pandas DataFrame
+df = pd.read_sql(sql_query, engine)
+
+if args.debug:
+    print(f"[DEBUG] Query finished, {len(df)} order rows")
+
+# Split and deduplicate the category lists, then accumulate co-occurrence counts
+cooccur = defaultdict(lambda: defaultdict(int))
+freq = defaultdict(int)
+
+for _, row in df.iterrows():
+    # Skip orders whose 商品信息 (category list) is NULL
+    if pd.isna(row['商品信息']):
+        continue
+    categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]
+    unique_cats = set(categories)
+    for c1 in unique_cats:
+        freq[c1] += 1
+        for c2 in unique_cats:
+            if c1 != c2:
+                cooccur[c1][c2] += 1
+
+# Compute cosine similarity
+if args.debug:
+    print("[DEBUG] Computing category similarities...")
+
+result = {}
+for c1 in cooccur:
+    sim_scores = []
+    for c2 in cooccur[c1]:
+        numerator = cooccur[c1][c2]
+        denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2])
+        if denominator != 0:
+            score = numerator / denominator
+            sim_scores.append((c2, score))
+    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
+    # Keep only the top_n similar categories
+    result[c1] = sim_scores[:args.top_n]
+
+if args.debug:
+    print(f"[DEBUG] Similarity computation finished; {len(result)} categories have recommendations")
+    unique_cats = set()
+    for cats in result.values():
+        for cat, _ in cats:
+            unique_cats.add(cat)
+    print(f"[DEBUG] Unique categories: {len(unique_cats)}")
+
+# Prepare the output path
+date_str = datetime.now().strftime('%Y%m%d')
+output_dir = 'output'
+os.makedirs(output_dir, exist_ok=True)
+output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt')
+
+# Write the similar categories to file
+if args.debug:
+    print(f"[DEBUG] Writing file: {output_file}")
+
+with open(output_file, 'w', encoding='utf-8') as f:
+    for cat, sims in sorted(result.items()):
+        cat_clean = clean_text_field(cat)
+        # Format: category_name \t similar_cat1:score1,similar_cat2:score2,...
+        sim_str = ','.join([f'{clean_text_field(sim_cat)}:{score:.4f}' for sim_cat, score in sims])
+        f.write(f'{cat_clean}\t{sim_str}\n')
+
+print("✓ Tag similarity computation complete")
+print(f"  - Output file: {output_file}")
+print(f"  - Categories: {len(result)}")
+if result:
+    avg_sims = sum(len(sims) for sims in result.values()) / len(result)
+    print(f"  - Average similar categories: {avg_sims:.1f}")
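One caveat in the basket-building step above: GROUP_CONCAT's default separator is the same comma the script later splits on, so a category name that itself contains a comma would be split into fragments. A minimal sketch of that extraction step with a toy DataFrame (the category names are illustrative, not from the database):

import pandas as pd
from collections import defaultdict

# Toy stand-in for the query result: one row per PO, with 商品信息
# holding the comma-joined category names; one row is NULL.
df = pd.DataFrame({'商品信息': ['Chairs,Tables', 'Chairs,Tables,Lamps', None]})

freq = defaultdict(int)
cooccur = defaultdict(lambda: defaultdict(int))

for _, row in df.iterrows():
    if pd.isna(row['商品信息']):
        continue  # orders with no resolvable category are skipped, as in the script
    cats = {c.strip() for c in str(row['商品信息']).split(',') if c.strip()}
    for c1 in cats:
        freq[c1] += 1
        for c2 in cats:
            if c1 != c2:
                cooccur[c1][c2] += 1

print(freq['Chairs'], cooccur['Chairs']['Tables'])  # 2 2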