import pandas as pd import math from collections import defaultdict from sqlalchemy import create_engine from db_service import create_db_connection import argparse def clean_text_field(text): if pd.isna(text): return '' # 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符 return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip() # 数据库连接配置 host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com' port = '9030' database = 'datacenter' username = 'readonly' password = 'essa1234' # 创建数据库连接 engine = create_db_connection(host, port, database, username, password) # SQL 查询 - 获取用户点击序列 sql_query = """ SELECT DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date, se.anonymous_id AS user_id, se.item_id, pgs.name AS item_name FROM sensors_events se LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id WHERE se.event IN ('contactFactory', 'addToPool', 'addToCart') AND se.create_time >= '2025-04-01' ORDER BY se.anonymous_id, se.create_time; """ # 执行 SQL 查询并将结果加载到 pandas DataFrame df = pd.read_sql(sql_query, engine) # 处理点击序列,计算共现关系 cooccur = defaultdict(lambda: defaultdict(int)) freq = defaultdict(int) # 按用户和日期分组处理点击序列 for (user_id, date), group in df.groupby(['user_id', 'date']): items = group['item_id'].tolist() unique_items = set(items) # 更新频率统计 for item in unique_items: freq[item] += 1 # 更新共现关系 for i in range(len(items)): for j in range(i + 1, len(items)): item1, item2 = items[i], items[j] if item1 != item2: cooccur[item1][item2] += 1 cooccur[item2][item1] += 1 # 计算余弦相似度 result = {} for item1 in cooccur: sim_scores = [] for item2 in cooccur[item1]: numerator = cooccur[item1][item2] denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2]) if denominator != 0: score = numerator / denominator sim_scores.append((item2, score)) sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 result[item1] = sim_scores # 创建item_id到name的映射 item_name_map = dict(zip(df['item_id'], df['item_name'])) # 输出相似商品 for item_id, sims in result.items(): item_name = item_name_map.get(item_id, 'Unknown') # 只取前8个最相似的商品 top_sims = sims[:8] sim_str = ','.join([f'{item_name_map.get(sim_id, "Unknown")}:{score:.4f}' for sim_id, score in top_sims]) print(f'{item_name}\t{sim_str}')