Blame view

tag_sim.py 2.64 KB
5ab1c29c   tangwang   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  import pandas as pd

  import math

  from collections import defaultdict

  from sqlalchemy import create_engine

  from db_service import create_db_connection

  import argparse

  

  def clean_text_field(text):

      if pd.isna(text):

          return ''

      # 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符

      return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()

  

  bpms_host = '120.76.244.158'

  bpms_port = '3325'

  bpms_database = 'bpms'

  bpms_username = 'PRD_M1_190311'

  bpms_password = 'WTF)xdbqtW!4gwA7'

  

  # 创建数据库连接

  engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)

  

  # SQL 查询

  sql_query = """

  SELECT

    sp.code AS `PO单号`,

    psm.name AS `区域`,

    bb.code AS `客户编码`,

    GROUP_CONCAT(pc_1.name) AS `商品信息`,

    MIN(spi.order_time) AS `下单货时间`

  FROM sale_po sp

  INNER JOIN sale_po_item spi ON sp.id = spi.po_id

  LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id

  LEFT JOIN prd_goods pg ON pg.id = spi.spu_id

  LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1)

  LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries

  LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id

  WHERE spi.quantity > 0

    AND spi.is_delete = 0

    AND bb.is_delete = 0

  GROUP BY sp.code, psm.name, bb.code;

  """

  

  # 执行 SQL 查询并将结果加载到 pandas DataFrame

  df = pd.read_sql(sql_query, engine)

  

  # 处理商品信息,分割并去重

  cooccur = defaultdict(lambda: defaultdict(int))

  freq = defaultdict(int)

  

  for _, row in df.iterrows():

      # Handle None values in 商品信息

      if pd.isna(row['商品信息']):

          continue

      categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]

      unique_cats = set(categories)

      for c1 in unique_cats:

          freq[c1] += 1

          for c2 in unique_cats:

              if c1 != c2:

                  cooccur[c1][c2] += 1

  

  # 计算余弦相似度

  result = {}

  for c1 in cooccur:

      sim_scores = []

      for c2 in cooccur[c1]:

          numerator = cooccur[c1][c2]

          denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2])

          if denominator != 0:

              score = numerator / denominator

              sim_scores.append((c2, score))

      sim_scores.sort(key=lambda x: -x[1])  # 按分数排序

      result[c1] = sim_scores

  

  # 输出相似分类

  for cat, sims in result.items():

      # 只取前8个最相似的分类

      top_sims = sims[:8]

      sim_str = ','.join([f'{sim_cat}:{score:.4f}' for sim_cat, score in top_sims])

      print(f'{cat}\t{sim_str}')