item_sim.py
2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import math
from collections import defaultdict
from sqlalchemy import create_engine
from db_service import create_db_connection
import argparse
def clean_text_field(text):
if pd.isna(text):
return ''
# 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符
return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
# 数据库连接配置
host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com'
port = '9030'
database = 'datacenter'
username = 'readonly'
password = 'essa1234'
# 创建数据库连接
engine = create_db_connection(host, port, database, username, password)
# SQL 查询 - 获取用户点击序列
sql_query = """
SELECT
DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,
se.anonymous_id AS user_id,
se.item_id,
pgs.name AS item_name
FROM
sensors_events se
LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id
WHERE
se.event IN ('contactFactory', 'addToPool', 'addToCart')
AND se.create_time >= '2025-04-01'
ORDER BY
se.anonymous_id,
se.create_time;
"""
# 执行 SQL 查询并将结果加载到 pandas DataFrame
df = pd.read_sql(sql_query, engine)
# 处理点击序列,计算共现关系
cooccur = defaultdict(lambda: defaultdict(int))
freq = defaultdict(int)
# 按用户和日期分组处理点击序列
for (user_id, date), group in df.groupby(['user_id', 'date']):
items = group['item_id'].tolist()
unique_items = set(items)
# 更新频率统计
for item in unique_items:
freq[item] += 1
# 更新共现关系
for i in range(len(items)):
for j in range(i + 1, len(items)):
item1, item2 = items[i], items[j]
if item1 != item2:
cooccur[item1][item2] += 1
cooccur[item2][item1] += 1
# 计算余弦相似度
result = {}
for item1 in cooccur:
sim_scores = []
for item2 in cooccur[item1]:
numerator = cooccur[item1][item2]
denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2])
if denominator != 0:
score = numerator / denominator
sim_scores.append((item2, score))
sim_scores.sort(key=lambda x: -x[1]) # 按分数排序
result[item1] = sim_scores
# 创建item_id到name的映射
item_name_map = dict(zip(df['item_id'], df['item_name']))
# 输出相似商品
for item_id, sims in result.items():
item_name = item_name_map.get(item_id, 'Unknown')
# 只取前8个最相似的商品
top_sims = sims[:8]
sim_str = ','.join([f'{item_name_map.get(sim_id, "Unknown")}:{score:.4f}' for sim_id, score in top_sims])
print(f'{item_name}\t{sim_str}')