From 5b954396cc1527673ee950a0b3ba24748942ab52 Mon Sep 17 00:00:00 2001
From: tangwang
Date: Mon, 20 Oct 2025 22:39:58 +0800
Subject: [PATCH] add cos sim

---
 offline_tasks/run.sh                          |  20 ++++++++++++++++++--
 offline_tasks/scripts/i2i_item_behavior.py    | 129 +++++++++++++++++++++++++++++++++++++++++
 offline_tasks/scripts/tag_category_similar.py | 127 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 offline_tasks/scripts/i2i_item_behavior.py
 create mode 100644 offline_tasks/scripts/tag_category_similar.py

diff --git a/offline_tasks/run.sh b/offline_tasks/run.sh
index 6233c41..02c34f2 100755
--- a/offline_tasks/run.sh
+++ b/offline_tasks/run.sh
@@ -171,12 +171,26 @@ if [ $? -ne 0 ]; then
     echo "⚠️ 内容相似度失败,但继续执行"
 fi
+# Task 5: Item行为相似度
+run_task "Task 5: Item行为相似度" \
+    "python3 scripts/i2i_item_behavior.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
+if [ $? -ne 0 ]; then
+    echo "⚠️ Item行为相似度失败,但继续执行"
+fi
+
+# Task 6: Tag分类相似度
+run_task "Task 6: Tag分类相似度" \
+    "python3 scripts/tag_category_similar.py --lookback_days $LOOKBACK_DAYS --top_n $TOP_N $DEBUG_MODE"
+if [ $? -ne 0 ]; then
+    echo "⚠️ Tag分类相似度失败,但继续执行"
+fi
+
 # ============================================================================
 # 兴趣聚合任务
 # ============================================================================
 
-# Task 5: 兴趣聚合
-run_task "Task 5: 兴趣聚合" \
+# Task 7: 兴趣聚合
+run_task "Task 7: 兴趣聚合" \
     "python3 scripts/interest_aggregation.py --lookback_days $LOOKBACK_DAYS --top_n 1000 $DEBUG_MODE"
 if [ $? -ne 0 ]; then
     echo "⚠️ 兴趣聚合失败,但继续执行"
 fi
@@ -215,6 +229,8 @@ echo " - 商品属性: output/item_attributes_mappings.json"
 echo " - Session文件: output/session.txt.*"
 echo " - C++ Swing: collaboration/output/swing_similar.txt"
 echo " - Python算法: output/i2i_*.txt"
+echo " - Item行为相似度: output/i2i_item_behavior_*.txt"
+echo " - Tag分类相似度: output/tag_category_similar_*.txt"
 echo " - 兴趣聚合: output/interest_aggregation_*.txt"
 echo " - 日志: logs/"
 echo ""
diff --git a/offline_tasks/scripts/i2i_item_behavior.py b/offline_tasks/scripts/i2i_item_behavior.py
new file mode 100644
index 0000000..75476df
--- /dev/null
+++ b/offline_tasks/scripts/i2i_item_behavior.py
@@ -0,0 +1,129 @@
+import pandas as pd
+import math
+from collections import defaultdict
+from sqlalchemy import create_engine
+from db_service import create_db_connection
+import argparse
+from datetime import datetime
+import os
+
+def clean_text_field(text):
+    if pd.isna(text):
+        return ''
+    # Strip newlines and carriage returns, and escape characters that could break the CSV/TSV format
+    return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description='Compute item-to-item similarity from user behavior (Item Similarity)')
+parser.add_argument('--lookback_days', type=int, default=180, help='Lookback window in days, default 180')
+parser.add_argument('--top_n', type=int, default=50, help='Number of similar items kept per item, default 50')
+parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+args = parser.parse_args()
+
+# Database connection settings
+host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com'
+port = '9030'
+database = 'datacenter'
+username = 'readonly'
+password = 'essa1234'
+
+# Create the database connection
+engine = create_db_connection(host, port, database, username, password)
+
+# SQL query - fetch user click sequences
+sql_query = f"""
+SELECT
+    DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,
+    se.anonymous_id AS user_id,
+    se.item_id,
+    pgs.name AS item_name
+FROM
+    sensors_events se
+LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id
+WHERE
+    se.event IN ('contactFactory', 'addToPool', 'addToCart')
+    AND se.create_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
+ORDER BY
+    se.anonymous_id,
+    se.create_time;
+"""
+
+if args.debug:
+    print(f"[DEBUG] Config: lookback_days={args.lookback_days}, top_n={args.top_n}")
+    print(f"[DEBUG] Querying database...")
+
+# Run the SQL query and load the result into a pandas DataFrame
+df = pd.read_sql(sql_query, engine)
+
+if args.debug:
+    print(f"[DEBUG] Query finished, {len(df)} rows")
+    print(f"[DEBUG] Unique users: {df['user_id'].nunique()}")
+    print(f"[DEBUG] Unique items: {df['item_id'].nunique()}")
+
+# Process the click sequences and build co-occurrence counts
+cooccur = defaultdict(lambda: defaultdict(int))
+freq = defaultdict(int)
+
+# Group the click sequences by user and date
+for (user_id, date), group in df.groupby(['user_id', 'date']):
+    items = group['item_id'].tolist()
+    unique_items = set(items)
+
+    # Update frequency counts
+    for item in unique_items:
+        freq[item] += 1
+
+    # Update co-occurrence counts
+    for i in range(len(items)):
+        for j in range(i + 1, len(items)):
+            item1, item2 = items[i], items[j]
+            if item1 != item2:
+                cooccur[item1][item2] += 1
+                cooccur[item2][item1] += 1
+
+# Compute cosine similarity
+if args.debug:
+    print(f"[DEBUG] Computing similarities...")
+
+result = {}
+for item1 in cooccur:
+    sim_scores = []
+    for item2 in cooccur[item1]:
+        numerator = cooccur[item1][item2]
+        denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2])
+        if denominator != 0:
+            score = numerator / denominator
+            sim_scores.append((item2, score))
+    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
+    # Keep only the top_n similar items
+    result[item1] = sim_scores[:args.top_n]
+
+if args.debug:
+    print(f"[DEBUG] Similarity computation done, {len(result)} items have similar-item lists")
+
+# Build the item_id -> name mapping
+item_name_map = dict(zip(df['item_id'], df['item_name']))
+
+# Prepare output
+date_str = datetime.now().strftime('%Y%m%d')
+output_dir = 'output'
+os.makedirs(output_dir, exist_ok=True)
+output_file = os.path.join(output_dir, f'i2i_item_behavior_{date_str}.txt')
+
+# Write similar items to file
+if args.debug:
+    print(f"[DEBUG] Writing file: {output_file}")
+
+with open(output_file, 'w', encoding='utf-8') as f:
+    for item_id, sims in sorted(result.items()):
+        item_name = clean_text_field(item_name_map.get(item_id, 'Unknown'))
+        # Format: item_id \t item_name \t similar_id1:score1,similar_id2:score2,...
+        sim_str = ','.join([f'{sim_id}:{score:.4f}' for sim_id, score in sims])
+        f.write(f'{item_id}\t{item_name}\t{sim_str}\n')
+
+print(f"✓ Item similarity computation finished")
+print(f" - Output file: {output_file}")
+print(f" - Items: {len(result)}")
+if result:
+    avg_sims = sum(len(sims) for sims in result.values()) / len(result)
+    print(f" - Average similar items: {avg_sims:.1f}")
diff --git a/offline_tasks/scripts/tag_category_similar.py b/offline_tasks/scripts/tag_category_similar.py
new file mode 100644
index 0000000..d201114
--- /dev/null
+++ b/offline_tasks/scripts/tag_category_similar.py
@@ -0,0 +1,127 @@
+import pandas as pd
+import math
+from collections import defaultdict
+from sqlalchemy import create_engine
+from db_service import create_db_connection
+import argparse
+from datetime import datetime
+import os
+
+def clean_text_field(text):
+    if pd.isna(text):
+        return ''
+    # Strip newlines and carriage returns, and escape characters that could break the CSV/TSV format
+    return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description='Compute order-based category similarity (Tag Similarity)')
+parser.add_argument('--lookback_days', type=int, default=180, help='Lookback window in days, default 180')
+parser.add_argument('--top_n', type=int, default=50, help='Number of similar categories kept per category, default 50')
+parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+args = parser.parse_args()
+
+bpms_host = '120.76.244.158'
+bpms_port = '3325'
+bpms_database = 'bpms'
+bpms_username = 'PRD_M1_190311'
+bpms_password = 'WTF)xdbqtW!4gwA7'
+
+# Create the database connection
+engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)
+
+# SQL query
+sql_query = f"""
+SELECT
+    sp.code AS `PO单号`,
+    psm.name AS `区域`,
+    bb.code AS `客户编码`,
+    GROUP_CONCAT(pc_1.name) AS `商品信息`,
+    MIN(spi.order_time) AS `下单货时间`
+FROM sale_po sp
+INNER JOIN sale_po_item spi ON sp.id = spi.po_id
+LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id
+LEFT JOIN prd_goods pg ON pg.id = spi.spu_id
+LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1)
+LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries
+LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id
+WHERE spi.quantity > 0
+  AND spi.is_delete = 0
+  AND bb.is_delete = 0
+  AND spi.order_time >= DATE_SUB(NOW(), INTERVAL {args.lookback_days} DAY)
+GROUP BY sp.code, psm.name, bb.code;
+"""
+
+if args.debug:
+    print(f"[DEBUG] Config: lookback_days={args.lookback_days}, top_n={args.top_n}")
+    print(f"[DEBUG] Querying database...")
+
+# Run the SQL query and load the result into a pandas DataFrame
+df = pd.read_sql(sql_query, engine)
+
+if args.debug:
+    print(f"[DEBUG] Query finished, {len(df)} order rows")
+
+# Split the 商品信息 field and deduplicate categories per order
+cooccur = defaultdict(lambda: defaultdict(int))
+freq = defaultdict(int)
+
+for _, row in df.iterrows():
+    # Handle None values in 商品信息
+    if pd.isna(row['商品信息']):
+        continue
+    categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]
+    unique_cats = set(categories)
+    for c1 in unique_cats:
+        freq[c1] += 1
+        for c2 in unique_cats:
+            if c1 != c2:
+                cooccur[c1][c2] += 1
+
+# Compute cosine similarity
+if args.debug:
+    print(f"[DEBUG] Computing category similarities...")
+
+result = {}
+for c1 in cooccur:
+    sim_scores = []
+    for c2 in cooccur[c1]:
+        numerator = cooccur[c1][c2]
+        denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2])
+        if denominator != 0:
+            score = numerator / denominator
+            sim_scores.append((c2, score))
+    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
+    # Keep only the top_n similar categories
+    result[c1] = sim_scores[:args.top_n]
+
+if args.debug:
+    print(f"[DEBUG] Similarity computation done, {len(result)} categories have similar-category lists")
+    unique_cats = set()
+    for cats in result.values():
+        for cat, _ in cats:
+            unique_cats.add(cat)
+    print(f"[DEBUG] Unique categories: {len(unique_cats)}")
+
+# Prepare output
+date_str = datetime.now().strftime('%Y%m%d')
+output_dir = 'output'
+os.makedirs(output_dir, exist_ok=True)
+output_file = os.path.join(output_dir, f'tag_category_similar_{date_str}.txt')
+
+# Write similar categories to file
+if args.debug:
+    print(f"[DEBUG] Writing file: {output_file}")
+
+with open(output_file, 'w', encoding='utf-8') as f:
+    for cat, sims in sorted(result.items()):
+        cat_clean = clean_text_field(cat)
+        # Format: category_name \t similar_cat1:score1,similar_cat2:score2,...
+        sim_str = ','.join([f'{clean_text_field(sim_cat)}:{score:.4f}' for sim_cat, score in sims])
+        f.write(f'{cat_clean}\t{sim_str}\n')
+
+print(f"✓ Tag similarity computation finished")
+print(f" - Output file: {output_file}")
+print(f" - Categories: {len(result)}")
+if result:
+    avg_sims = sum(len(sims) for sims in result.values()) / len(result)
+    print(f" - Average similar categories: {avg_sims:.1f}")
-- 
libgit2 0.21.2
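
Review note, not part of the patch: both new scripts score a pair as cooccur(a, b) / (sqrt(freq(a)) * sqrt(freq(b))) over per-session (item script) or per-order (tag script) co-occurrence counts, and both write one tab-separated line per key in the form key \t [name \t] neighbor1:score1,neighbor2:score2,... . The sketch below shows how a downstream consumer might read the i2i_item_behavior_<date>.txt output; the helper name load_i2i_similarity and its return shape are illustrative assumptions, not an existing API in this repo.

    from typing import Dict, List, Tuple

    def load_i2i_similarity(path: str) -> Dict[str, List[Tuple[str, float]]]:
        # Parse lines of the form: item_id \t item_name \t sim_id1:score1,sim_id2:score2,...
        sims: Dict[str, List[Tuple[str, float]]] = {}
        with open(path, encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t')
                if len(parts) != 3:
                    continue  # skip malformed lines
                item_id, _item_name, pair_str = parts
                neighbors: List[Tuple[str, float]] = []
                for pair in pair_str.split(','):
                    sim_id, sep, score = pair.rpartition(':')
                    if not sep:
                        continue  # no ':' separator; skip
                    neighbors.append((sim_id, float(score)))
                sims[item_id] = neighbors
        return sims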