Commit 5b954396cc1527673ee950a0b3ba24748942ab52

Authored by tangwang
1 parent d9914b07

add cos sim

offline_tasks/run.sh
@@ -171,12 +171,26 @@ if [ $? -ne 0 ]; then @@ -171,12 +171,26 @@ if [ $? -ne 0 ]; then
171 echo "⚠️ 内容相似度失败,但继续执行" 171 echo "⚠️ 内容相似度失败,但继续执行"
172 fi 172 fi
173 173
  174 +# Task 5: Item行为相似度
  175 +run_task "Task 5: Item行为相似度" \
  176 + "python3 scripts/i2i_item_behavior.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
  177 +if [ $? -ne 0 ]; then
  178 + echo "⚠️ Item行为相似度失败,但继续执行"
  179 +fi
  180 +
  181 +# Task 6: Tag分类相似度
  182 +run_task "Task 6: Tag分类相似度" \
  183 + "python3 scripts/tag_category_similar.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
  184 +if [ $? -ne 0 ]; then
  185 + echo "⚠️ Tag分类相似度失败,但继续执行"
  186 +fi
  187 +
174 # ============================================================================ 188 # ============================================================================
175 # 兴趣聚合任务 189 # 兴趣聚合任务
176 # ============================================================================ 190 # ============================================================================
177 191
178 -# Task 5: 兴趣聚合  
179 -run_task "Task 5: 兴趣聚合" \ 192 +# Task 7: 兴趣聚合
  193 +run_task "Task 7: 兴趣聚合" \
180 "python3 scripts/interest_aggregation.py --lookback_days $LOOKBACK_DAYS --top_n 1000 $DEBUG_MODE" 194 "python3 scripts/interest_aggregation.py --lookback_days $LOOKBACK_DAYS --top_n 1000 $DEBUG_MODE"
181 if [ $? -ne 0 ]; then 195 if [ $? -ne 0 ]; then
182 echo "⚠️ 兴趣聚合失败,但继续执行" 196 echo "⚠️ 兴趣聚合失败,但继续执行"
@@ -215,6 +229,8 @@ echo " - 商品属性: output/item_attributes_mappings.json" @@ -215,6 +229,8 @@ echo " - 商品属性: output/item_attributes_mappings.json"
215 echo " - Session文件: output/session.txt.*" 229 echo " - Session文件: output/session.txt.*"
216 echo " - C++ Swing: collaboration/output/swing_similar.txt" 230 echo " - C++ Swing: collaboration/output/swing_similar.txt"
217 echo " - Python算法: output/i2i_*.txt" 231 echo " - Python算法: output/i2i_*.txt"
  232 +echo " - Item行为相似度: output/i2i_item_behavior_*.txt"
  233 +echo " - Tag分类相似度: output/tag_category_similar_*.txt"
218 echo " - 兴趣聚合: output/interest_aggregation_*.txt" 234 echo " - 兴趣聚合: output/interest_aggregation_*.txt"
219 echo " - 日志: logs/" 235 echo " - 日志: logs/"
220 echo "" 236 echo ""
offline_tasks/scripts/i2i_item_behavior.py 0 → 100644
@@ -0,0 +1,129 @@ @@ -0,0 +1,129 @@
  1 +import pandas as pd
  2 +import math
  3 +from collections import defaultdict
  4 +from sqlalchemy import create_engine
  5 +from db_service import create_db_connection
  6 +import argparse
  7 +from datetime import datetime
  8 +import os
  9 +
def clean_text_field(text):
    """Normalise a free-text DB field for the tab-separated output file.

    - None/NaN -> '' (pd.isna covers both).
    - CR, LF and TAB are replaced with spaces so one record can never span
      multiple lines or spill across TSV columns.  (The previous version did
      not replace TAB, which corrupted the column layout of the output file.)
    - Double quotes are doubled, CSV-style, for downstream consumers.
    """
    if pd.isna(text):
        return ''
    return (
        str(text)
        .replace('\r', ' ')
        .replace('\n', ' ')
        .replace('\t', ' ')   # TSV output: a literal tab would shift columns
        .replace('"', '""')
        .strip()
    )
  15 +
# ----------------------------------------------------------------------------
# Command-line arguments.
# ----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='计算基于用户行为的商品相似度(Item Similarity)')
parser.add_argument('--lookback_days', type=int, default=180, help='回溯天数,默认180天')
parser.add_argument('--top_n', type=int, default=50, help='每个商品保留的相似商品数量,默认50')
parser.add_argument('--debug', action='store_true', help='开启debug模式')
args = parser.parse_args()

# ----------------------------------------------------------------------------
# Database connection.
# FIXME(security): these credentials were hard-coded in source control.  They
# remain only as fallbacks for backward compatibility — set the SELECTDB_*
# environment variables in deployment and rotate the committed values.
# ----------------------------------------------------------------------------
host = os.environ.get('SELECTDB_HOST', 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com')
port = os.environ.get('SELECTDB_PORT', '9030')
database = os.environ.get('SELECTDB_DATABASE', 'datacenter')
username = os.environ.get('SELECTDB_USERNAME', 'readonly')
password = os.environ.get('SELECTDB_PASSWORD', 'essa1234')

engine = create_db_connection(host, port, database, username, password)

# User intent-event sequences (contact/pool/cart), ordered per anonymous user.
# Notes:
# - {args.lookback_days} comes from argparse with type=int, so this f-string
#   interpolation is not an SQL-injection vector.
# - '%%' escapes '%' for the DB-API paramstyle used by pandas/SQLAlchemy.
sql_query = f"""
SELECT
    DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,
    se.anonymous_id AS user_id,
    se.item_id,
    pgs.name AS item_name
FROM
    sensors_events se
LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id
WHERE
    se.event IN ('contactFactory', 'addToPool', 'addToCart')
    AND se.create_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
ORDER BY
    se.anonymous_id,
    se.create_time;
"""

if args.debug:
    print(f"[DEBUG] 参数配置: lookback_days={args.lookback_days}, top_n={args.top_n}")
    print(f"[DEBUG] 开始查询数据库...")

# Load the full result set into memory.  Fine at current volumes; revisit with
# read_sql(..., chunksize=...) if the event table grows substantially.
df = pd.read_sql(sql_query, engine)

if args.debug:
    print(f"[DEBUG] 查询完成,共 {len(df)} 条记录")
    print(f"[DEBUG] 唯一用户数: {df['user_id'].nunique()}")
    print(f"[DEBUG] 唯一商品数: {df['item_id'].nunique()}")
  62 +
# ----------------------------------------------------------------------------
# Build per-session co-occurrence statistics.
#
# A "session" is one (user_id, date) group of events.  freq[i] is the number
# of sessions containing item i at least once; cooccur[i][j] is the number of
# sessions containing both i and j.  Both use *set* semantics so the cosine
# below is a true cosine over session-membership vectors, bounded by 1.
# (The previous version paired raw click positions while freq used sets, so
# repeated clicks inflated the numerator — e.g. clicks A,B,A,B in one day
# produced a "cosine" of 4.0.)
# ----------------------------------------------------------------------------
cooccur = defaultdict(lambda: defaultdict(int))
freq = defaultdict(int)

for (user_id, date), group in df.groupby(['user_id', 'date']):
    unique_items = list(set(group['item_id']))

    # Session frequency per item (counted once per session).
    for item in unique_items:
        freq[item] += 1

    # Each unordered pair counted once per session, stored symmetrically.
    for i, item1 in enumerate(unique_items):
        for item2 in unique_items[i + 1:]:
            cooccur[item1][item2] += 1
            cooccur[item2][item1] += 1

# Cosine similarity over session membership:
#   sim(i, j) = |S(i) ∩ S(j)| / sqrt(|S(i)| * |S(j)|)
if args.debug:
    print(f"[DEBUG] 开始计算相似度...")

result = {}
for item1, neighbors in cooccur.items():
    sim_scores = []
    for item2, co_count in neighbors.items():
        denominator = math.sqrt(freq[item1] * freq[item2])
        if denominator != 0:
            sim_scores.append((item2, co_count / denominator))
    sim_scores.sort(key=lambda x: -x[1])  # highest similarity first
    result[item1] = sim_scores[:args.top_n]  # keep only the top_n neighbours

if args.debug:
    print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个商品有相似推荐")
  103 +
# item_id -> display name.  dict(zip(...)) keeps the *last* value seen for a
# duplicated id, which is acceptable here (names are per-SKU and stable).
item_name_map = dict(zip(df['item_id'], df['item_name']))

# Output target: output/i2i_item_behavior_YYYYMMDD.txt
date_str = datetime.now().strftime('%Y%m%d')
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f'i2i_item_behavior_{date_str}.txt')

if args.debug:
    print(f"[DEBUG] 开始写入文件: {output_file}")

# One record per item:
#   item_id \t item_name \t similar_id1:score1,similar_id2:score2,...
with open(output_file, 'w', encoding='utf-8') as f:
    for item_id in sorted(result):
        neighbours = result[item_id]
        item_name = clean_text_field(item_name_map.get(item_id, 'Unknown'))
        sim_str = ','.join(f'{sim_id}:{score:.4f}' for sim_id, score in neighbours)
        f.write(f'{item_id}\t{item_name}\t{sim_str}\n')

print(f"✓ Item相似度计算完成")
print(f" - 输出文件: {output_file}")
print(f" - 商品数: {len(result)}")
if result:
    avg_sims = sum(map(len, result.values())) / len(result)
    print(f" - 平均相似商品数: {avg_sims:.1f}")
offline_tasks/scripts/tag_category_similar.py 0 → 100644
@@ -0,0 +1,127 @@ @@ -0,0 +1,127 @@
  1 +import pandas as pd
  2 +import math
  3 +from collections import defaultdict
  4 +from sqlalchemy import create_engine
  5 +from db_service import create_db_connection
  6 +import argparse
  7 +from datetime import datetime
  8 +import os
  9 +
def clean_text_field(text):
    """Normalise a free-text DB field for the tab-separated output file.

    - None/NaN -> '' (pd.isna covers both).
    - CR, LF and TAB become spaces so a record can neither span lines nor
      shift TSV columns.  (The previous version left TAB untouched, which
      could corrupt the column layout of the output file.)
    - Double quotes are doubled, CSV-style, for downstream consumers.
    """
    if pd.isna(text):
        return ''
    return (
        str(text)
        .replace('\r', ' ')
        .replace('\n', ' ')
        .replace('\t', ' ')   # TSV output: a literal tab would shift columns
        .replace('"', '""')
        .strip()
    )
  15 +
# ----------------------------------------------------------------------------
# Command-line arguments.
# ----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='计算基于订单的分类相似度(Tag Similarity)')
parser.add_argument('--lookback_days', type=int, default=180, help='回溯天数,默认180天')
parser.add_argument('--top_n', type=int, default=50, help='每个分类保留的相似分类数量,默认50')
parser.add_argument('--debug', action='store_true', help='开启debug模式')
args = parser.parse_args()

# ----------------------------------------------------------------------------
# BPMS database connection.
# FIXME(security): these credentials were hard-coded in source control.  They
# remain only as fallbacks for backward compatibility — set the BPMS_*
# environment variables in deployment and rotate the committed values.
# ----------------------------------------------------------------------------
bpms_host = os.environ.get('BPMS_HOST', '120.76.244.158')
bpms_port = os.environ.get('BPMS_PORT', '3325')
bpms_database = os.environ.get('BPMS_DATABASE', 'bpms')
bpms_username = os.environ.get('BPMS_USERNAME', 'PRD_M1_190311')
bpms_password = os.environ.get('BPMS_PASSWORD', 'WTF)xdbqtW!4gwA7')

engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)

# One row per PO: categories of all its line items, comma-joined.
# Notes:
# - {args.lookback_days} is an argparse int, so this interpolation is not an
#   SQL-injection vector.
# - The nested SUBSTRING_INDEX extracts the second segment of the dotted
#   pg.category_id path (i.e. the level-2 category id).
# - NOTE(review): GROUP_CONCAT joins with ',' and is silently truncated at
#   group_concat_max_len — verify server setting for very large orders.
sql_query = f"""
SELECT
    sp.code AS `PO单号`,
    psm.name AS `区域`,
    bb.code AS `客户编码`,
    GROUP_CONCAT(pc_1.name) AS `商品信息`,
    MIN(spi.order_time) AS `下单货时间`
FROM sale_po sp
INNER JOIN sale_po_item spi ON sp.id = spi.po_id
LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id
LEFT JOIN prd_goods pg ON pg.id = spi.spu_id
LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1)
LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries
LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id
WHERE spi.quantity > 0
  AND spi.is_delete = 0
  AND bb.is_delete = 0
  AND spi.order_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
GROUP BY sp.code, psm.name, bb.code;
"""

if args.debug:
    print(f"[DEBUG] 参数配置: lookback_days={args.lookback_days}, top_n={args.top_n}")
    print(f"[DEBUG] 开始查询数据库...")

# Load the full result set into memory (one row per PO; modest volume).
df = pd.read_sql(sql_query, engine)

if args.debug:
    print(f"[DEBUG] 查询完成,共 {len(df)} 条订单记录")
# Per-order category sets.  freq[c] counts orders containing category c;
# cooccur[c1][c2] counts orders containing both (stored symmetrically).
# NOTE(review): categories arrive comma-joined from GROUP_CONCAT — a category
# name that itself contains a comma would be split here; verify upstream data.
cooccur = defaultdict(lambda: defaultdict(int))
freq = defaultdict(int)

for _, row in df.iterrows():
    raw = row['商品信息']
    # A PO whose items matched no category yields NULL -> NaN in pandas.
    if pd.isna(raw):
        continue
    order_cats = {part.strip() for part in str(raw).split(',') if part.strip()}
    for c1 in order_cats:
        freq[c1] += 1
        for c2 in order_cats:
            if c1 != c2:
                cooccur[c1][c2] += 1

# Cosine similarity over order membership:
#   sim(c1, c2) = |orders(c1) ∩ orders(c2)| / sqrt(|orders(c1)| * |orders(c2)|)
if args.debug:
    print(f"[DEBUG] 开始计算分类相似度...")

result = {}
for c1, neighbours in cooccur.items():
    scored = []
    for c2, co_count in neighbours.items():
        denom = math.sqrt(freq[c1]) * math.sqrt(freq[c2])
        if denom != 0:
            scored.append((c2, co_count / denom))
    scored.sort(key=lambda pair: pair[1], reverse=True)  # best match first
    result[c1] = scored[:args.top_n]  # keep only the top_n neighbours

if args.debug:
    print(f"[DEBUG] 相似度计算完成,共 {len(result)} 个分类有相似推荐")
    unique_cats = {cat for sims in result.values() for cat, _ in sims}
    print(f"[DEBUG] 唯一分类数: {len(unique_cats)}")
  104 +
# Output target: output/tag_category_similar_YYYYMMDD.txt
date_str = datetime.now().strftime('%Y%m%d')
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt')

if args.debug:
    print(f"[DEBUG] 开始写入文件: {output_file}")

# One record per category:
#   category_name \t similar_cat1:score1,similar_cat2:score2,...
with open(output_file, 'w', encoding='utf-8') as f:
    for cat in sorted(result):
        cleaned_cat = clean_text_field(cat)
        pairs = ','.join(
            f'{clean_text_field(sim_cat)}:{score:.4f}' for sim_cat, score in result[cat]
        )
        f.write(f'{cleaned_cat}\t{pairs}\n')

print(f"✓ Tag相似度计算完成")
print(f" - 输出文件: {output_file}")
print(f" - 分类数: {len(result)}")
if result:
    avg_sims = sum(map(len, result.values())) / len(result)
    print(f" - 平均相似分类数: {avg_sims:.1f}")