import pandas as pd import math from collections import defaultdict from sqlalchemy import create_engine from db_service import create_db_connection import argparse def clean_text_field(text): if pd.isna(text): return '' # 移除换行符、回车符,并替换其他可能导致CSV格式问题的字符 return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip() bpms_host = '120.76.244.158' bpms_port = '3325' bpms_database = 'bpms' bpms_username = 'PRD_M1_190311' bpms_password = 'WTF)xdbqtW!4gwA7' # 创建数据库连接 engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password) # SQL 查询 sql_query = """ SELECT sp.code AS `PO单号`, psm.name AS `区域`, bb.code AS `客户编码`, GROUP_CONCAT(pc_1.name) AS `商品信息`, MIN(spi.order_time) AS `下单货时间` FROM sale_po sp INNER JOIN sale_po_item spi ON sp.id = spi.po_id LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id LEFT JOIN prd_goods pg ON pg.id = spi.spu_id LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1) LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id WHERE spi.quantity > 0 AND spi.is_delete = 0 AND bb.is_delete = 0 GROUP BY sp.code, psm.name, bb.code; """ # 执行 SQL 查询并将结果加载到 pandas DataFrame df = pd.read_sql(sql_query, engine) # 处理商品信息,分割并去重 cooccur = defaultdict(lambda: defaultdict(int)) freq = defaultdict(int) for _, row in df.iterrows(): # Handle None values in 商品信息 if pd.isna(row['商品信息']): continue categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()] unique_cats = set(categories) for c1 in unique_cats: freq[c1] += 1 for c2 in unique_cats: if c1 != c2: cooccur[c1][c2] += 1 # 计算余弦相似度 result = {} for c1 in cooccur: sim_scores = [] for c2 in cooccur[c1]: numerator = cooccur[c1][c2] denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2]) if denominator != 0: score = numerator / denominator sim_scores.append((c2, score)) sim_scores.sort(key=lambda x: -x[1]) # 按分数排序 result[c1] = sim_scores # 输出相似分类 for cat, sims in result.items(): # 只取前8个最相似的分类 top_sims = sims[:8] sim_str = ','.join([f'{sim_cat}:{score:.4f}' for sim_cat, score in top_sims]) print(f'{cat}\t{sim_str}')