Blame view

item_sim.py 2.72 KB
5ab1c29c   tangwang   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  import pandas as pd

  import math

  from collections import defaultdict

  from sqlalchemy import create_engine

  from db_service import create_db_connection

  import argparse

  

  def clean_text_field(text):

      if pd.isna(text):

          return ''

      # 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符

      return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()

  

  # 数据库连接配置

  host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com'

  port = '9030'

  database = 'datacenter'

  username = 'readonly'

  password = 'essa1234'

  

  # 创建数据库连接

  engine = create_db_connection(host, port, database, username, password)

  

  # SQL 查询 - 获取用户点击序列

  sql_query = """

  SELECT 

      DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,

      se.anonymous_id AS user_id,

      se.item_id,

      pgs.name AS item_name

  FROM 

      sensors_events se

  LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id

  WHERE 

      se.event IN ('contactFactory', 'addToPool', 'addToCart')

      AND se.create_time >= '2025-04-01'

  ORDER BY 

      se.anonymous_id,

      se.create_time;

  """

  

  # 执行 SQL 查询并将结果加载到 pandas DataFrame

  df = pd.read_sql(sql_query, engine)

  

  # 处理点击序列,计算共现关系

  cooccur = defaultdict(lambda: defaultdict(int))

  freq = defaultdict(int)

  

  # 按用户和日期分组处理点击序列

  for (user_id, date), group in df.groupby(['user_id', 'date']):

      items = group['item_id'].tolist()

      unique_items = set(items)

      

      # 更新频率统计

      for item in unique_items:

          freq[item] += 1

      

      # 更新共现关系

      for i in range(len(items)):

          for j in range(i + 1, len(items)):

              item1, item2 = items[i], items[j]

              if item1 != item2:

                  cooccur[item1][item2] += 1

                  cooccur[item2][item1] += 1

  

  # 计算余弦相似度

  result = {}

  for item1 in cooccur:

      sim_scores = []

      for item2 in cooccur[item1]:

          numerator = cooccur[item1][item2]

          denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2])

          if denominator != 0:

              score = numerator / denominator

              sim_scores.append((item2, score))

      sim_scores.sort(key=lambda x: -x[1])  # 按分数排序

      result[item1] = sim_scores

  

  # 创建item_id到name的映射

  item_name_map = dict(zip(df['item_id'], df['item_name']))

  

  # 输出相似商品

  for item_id, sims in result.items():

      item_name = item_name_map.get(item_id, 'Unknown')

      # 只取前8个最相似的商品

      top_sims = sims[:8]

      sim_str = ','.join([f'{item_name_map.get(sim_id, "Unknown")}:{score:.4f}' for sim_id, score in top_sims])

      print(f'{item_name}\t{sim_str}')