5ab1c29c
tangwang
first commit
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
import pandas as pd
import math
from collections import defaultdict
from sqlalchemy import create_engine
from db_service import create_db_connection
import argparse
def clean_text_field(text):
if pd.isna(text):
return ''
# 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符
return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
bpms_host = '120.76.244.158'
bpms_port = '3325'
bpms_database = 'bpms'
bpms_username = 'PRD_M1_190311'
bpms_password = 'WTF)xdbqtW!4gwA7'
# 创建数据库连接
engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)
# SQL 查询
sql_query = """
SELECT
sp.code AS `PO单号`,
psm.name AS `区域`,
bb.code AS `客户编码`,
GROUP_CONCAT(pc_1.name) AS `商品信息`,
MIN(spi.order_time) AS `下单货时间`
FROM sale_po sp
INNER JOIN sale_po_item spi ON sp.id = spi.po_id
LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id
LEFT JOIN prd_goods pg ON pg.id = spi.spu_id
LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1)
LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries
LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id
WHERE spi.quantity > 0
AND spi.is_delete = 0
AND bb.is_delete = 0
GROUP BY sp.code, psm.name, bb.code;
"""
# 执行 SQL 查询并将结果加载到 pandas DataFrame
df = pd.read_sql(sql_query, engine)
# 处理商品信息,分割并去重
cooccur = defaultdict(lambda: defaultdict(int))
freq = defaultdict(int)
for _, row in df.iterrows():
# Handle None values in 商品信息
if pd.isna(row['商品信息']):
continue
categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]
unique_cats = set(categories)
for c1 in unique_cats:
freq[c1] += 1
for c2 in unique_cats:
if c1 != c2:
cooccur[c1][c2] += 1
# 计算余弦相似度
result = {}
for c1 in cooccur:
sim_scores = []
for c2 in cooccur[c1]:
numerator = cooccur[c1][c2]
denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2])
if denominator != 0:
score = numerator / denominator
sim_scores.append((c2, score))
sim_scores.sort(key=lambda x: -x[1]) # 按分数排序
result[c1] = sim_scores
# 输出相似分类
for cat, sims in result.items():
# 只取前8个最相似的分类
top_sims = sims[:8]
sim_str = ','.join([f'{sim_cat}:{score:.4f}' for sim_cat, score in top_sims])
print(f'{cat}\t{sim_str}')
|