Commit 8095cb00634aae042fa78f01cd2dda629e6f288c
1 parent 5b954396
add cos sim
Showing 5 changed files with 0 additions and 522 deletions.
hot/README.md deleted
@@ -1,85 +0,0 @@
-# Hot Book Index Generation Project
-
-## Overview
-This project generates hot-book indexes from organization reading-behavior data (reading_time tracking events) by computing unique-visitor (UV) statistics along several dimensions. Statistics and rankings are supported per organization (tenant), per organization industry (tenant_type), and per book tag (tag, where category1 and category2 are treated the same as tags), producing hot-book lists. Self-updating symlinks are maintained for convenient external access.
-
-
-## File Structure
-- `index_generation.py`: main program; covers data loading, UV processing, booklist generation, and output.
-- `logs/`: directory for log files.
-- `output/`: directory for the generated booklists.
-
-## Input Data
-### 1. Book attributes (`all_books.json`)
-- **Path**: `CONFIG['books_path']`
-- **Content**: one JSON object per line describing a book; key fields:
-  - `id`: book ID.
-  - `merged_tags`: comma-separated list of tags for the book.
-
-### 2. Tenant industry data (`tenants.json`)
-- **Path**: `CONFIG['tenants_path']`
-- **Content**: one JSON object per line describing a tenant; key fields:
-  - `id`: tenant ID.
-  - `tenant_type`: the tenant's industry type.
-
-### 3. Reading behavior data (`reading_time.json`)
-- **Path**: files under `CONFIG['base_dir']`, named `reading_time.json.YYYYMMDD`.
-- **Content**: one JSON object per line describing a reading event; key fields:
-  - `user_id`: user ID.
-  - `book_id`: book ID.
-  - `tenant_id`: tenant ID.
-
-## Output Data
-The output is a set of hot-book lists; each file holds the ranked top `N` books for one dimension:
-- Output path: `CONFIG['output_dir']`
-- File name format: `<prefix>_<current_date>.txt`, with a symlink maintained at `<prefix>.txt`.
-- Example line (key and booklist are separated by a tab): `tenant_id  book_id1:uv_count1,book_id2:uv_count2,...`
-
-### Output Files
-1. `tenant_booklist.txt`: hot books per organization (tenant).
-2. `tenant_type_booklist.txt`: hot books per organization industry (tenant_type).
-3. `tag_booklist.txt`: hot books per tag.
-
-## Configuration
-### `CONFIG` keys
-- `base_dir`: directory containing the reading data files.
-- `books_path`: path to the book attributes file.
-- `tenants_path`: path to the tenant industry file.
-- `output_dir`: output directory.
-- `days`: select data files from the most recent `days` days.
-- `top_n`: number of top books to generate.
-- `tenant_type_ratio`: weight for blending in industry data when a tenant has too little data of its own.
-- `use_simple_uv_processing`:
-  - `True`: sum the daily UV counts.
-  - `False`: count UV once over the whole data window.
-
-## Computation Logic
-1. **Data loading**
-   - `load_books_data()` and `load_tenants_data()` load book and tenant metadata, converting every ID to a string.
-   - `get_recent_files()` collects the reading data files for the most recent `days` days.
-
-2. **UV processing**
-   - `process_reading_data()`: simple UV counting; every record increments the counter.
-   - `process_reading_data_by_uv()`: deduplicated UV counting; counts the distinct users a book reaches.
-   - `CONFIG['use_simple_uv_processing']` decides whether the simple additive logic is used.
-
-3. **Data blending**
-   - `merge_tenant_uv_with_type_uv()` blends a tenant's UV counts with the UV counts of its industry at a configurable ratio, reducing the statistical bias of tenants with little data (see the worked example after this file).
-
-4. **Booklist generation**
-   - `generate_top_booklist()` turns the UV counts into a top-`N` hot-book list for the given dimension.
-   - Separate booklist files are written for the tenant, tenant-industry, and tag dimensions.
-
-5. **Output and symlinks**
-   - `write_output()` writes each booklist to its file and repoints the symlink to the latest file.
-
-## Logging
-All log messages go to `logs/index_generation.log`, recording successes and errors during data loading, file processing, UV counting, and file writing, to support tracing and troubleshooting.
-
-## How to Run
-Run the main program from a terminal:
-```bash
-python main.py
-# or
-sh run.sh
-```
\ No newline at end of file
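For the blending step described under "Computation Logic" (item 3), the merge implemented by `merge_tenant_uv_with_type_uv()` in hot/main.py below works out to a simple weighted sum. A minimal worked example, with made-up numbers purely for illustration:

```python
# Merged UV for one (tenant, book) pair, as computed by merge_tenant_uv_with_type_uv():
#   merged = tenant_uv + int(tenant_type_ratio * tenant_type_uv)
# The numbers below are illustrative only, not taken from any real tenant.
tenant_uv = 3             # readers this tenant itself contributed for the book
tenant_type_uv = 500      # readers of the same book across the tenant's whole industry
tenant_type_ratio = 0.01  # default CONFIG['tenant_type_ratio']

merged = tenant_uv + int(tenant_type_ratio * tenant_type_uv)
print(merged)  # 3 + 5 = 8: the industry signal gently lifts books that are popular industry-wide
```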
hot/main.py deleted
@@ -1,261 +0,0 @@
-import os
-import json
-import glob
-import logging
-from collections import defaultdict, Counter
-from datetime import datetime, timedelta
-import shutil
-
-# Logging configuration
-logging.basicConfig(
-    filename='logs/index_generation.log',
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-
-# Configuration
-CONFIG = {
-    'base_dir': '../fetch_data/data/',
-    'books_path': '../fetch_data/meta_data/all_books.json',
-    'tenants_path': '../fetch_data/meta_data/tenants.json',
-    'output_dir': './output',
-    'days': 30,  # number of recent days of data files to load
-    'top_n': 1000,  # number of top books per booklist
-    'tenant_type_ratio': 0.01,  # blending ratio between a tenant and its industry; addresses the tenant cold-start problem: the less behavior data a tenant has, the more it is influenced by its industry.
-    'use_simple_uv_processing': True  # whether to use the simple UV processing logic
-    # True: a book's read UV is the sum of its daily UV counts.
-    # False: a book's read UV is the distinct-user count over the whole window; this is more strongly affected by operator-configured exposure.
-    # Default: True
-}
-
-def load_json_files(path_pattern):
-    """Load JSON lines from all files matching a glob pattern."""
-    files = glob.glob(path_pattern)
-    data = []
-    for file in files:
-        with open(file, 'r', encoding='utf-8') as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    data.append(json.loads(line))
-                except json.JSONDecodeError:
-                    logging.error(f"Failed to parse JSON line in {file}: {line}")
-    return data
-
-def load_books_data(books_path):
-    """Load the book attribute dictionary, converting all IDs to strings."""
-    books_data = {}
-    with open(books_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            book = json.loads(line)
-
-            tags = book.get('merged_tags', '')
-            category1 = book.get('category1', '')
-            category2 = book.get('category2', '')
-            combined_tags = ','.join(filter(lambda x: x not in [None, ''], [tags, category1, category2]))
-            books_data[str(book['id'])] = combined_tags  # convert book['id'] to a string
-
-    logging.info(f"Loaded {len(books_data)} books from {books_path}")
-    return books_data
-
-def load_tenants_data(tenants_path):
-    """Load the tenant industry dictionary, converting all IDs to strings."""
-    tenants_data = {}
-    with open(tenants_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            tenant = json.loads(line)
-            tenant_type = tenant.get('tenant_type', '')
-            if not tenant_type:
-                tenant_type = ''
-            tenants_data[str(tenant['id'])] = tenant_type  # convert tenant['id'] to a string
-    logging.info(f"Loaded {len(tenants_data)} tenants from {tenants_path}")
-    return tenants_data
-
-def get_recent_files(base_dir, days=30):
-    """Collect the reading data files for the most recent `days` days."""
-    today = datetime.today()
-    recent_files = []
-    for i in range(days):
-        date_str = (today - timedelta(days=i)).strftime('%Y%m%d')
-        path_pattern = os.path.join(base_dir, f'reading_time.json.{date_str}')
-        recent_files.extend(glob.glob(path_pattern))
-    logging.info(f"Found {len(recent_files)} files for the last {days} days")
-    return recent_files
-
-def process_reading_data_by_uv(reading_files, books_data, tenants_data):
-    """Process reading data with per-user UV deduplication."""
-    tenant_uv = defaultdict(lambda: defaultdict(set))  # sets deduplicate users
-    tenant_type_uv = defaultdict(lambda: defaultdict(set))  # sets deduplicate users
-    tag_uv = defaultdict(lambda: defaultdict(set))  # sets deduplicate users
-
-    for file in reading_files:
-        with open(file, 'r', encoding='utf-8') as f:
-            for line in f:
-                try:
-                    record = json.loads(line.strip())
-                    user_id = str(record.get('user_id', ''))  # convert user_id to a string
-                    book_id = str(record.get('book_id', ''))  # convert book_id to a string
-                    tenant_id = str(record.get('tenant_id', ''))  # convert tenant_id to a string
-
-                    if not book_id or not tenant_id or not user_id:
-                        continue
-
-                    tenant_uv[tenant_id][book_id].add(user_id)
-                    tenant_type = tenants_data.get(tenant_id, '')  # tenant_id is already a string
-                    tenant_type_uv[tenant_type][book_id].add(user_id)
-
-                    tags = books_data.get(book_id, '').split(',')
-                    for tag in tags:
-                        if tag:
-                            tag_uv[tag][book_id].add(user_id)
-
-                except json.JSONDecodeError:
-                    logging.error(f"Failed to parse JSON line in {file}: {line}")
-
-    # Convert to UV counts, i.e. the number of user_ids in each set
-    tenant_uv_count = {tenant: Counter({book: len(users) for book, users in books.items()})
-                       for tenant, books in tenant_uv.items()}
-    tenant_type_uv_count = {tenant_type: Counter({book: len(users) for book, users in books.items()})
-                            for tenant_type, books in tenant_type_uv.items()}
-    tag_uv_count = {tag: Counter({book: len(users) for book, users in books.items()})
-                    for tag, books in tag_uv.items()}
-
-    logging.info(f"Processed reading data, total tenants: {len(tenant_uv_count)}, tenant types: {len(tenant_type_uv_count)}, tags: {len(tag_uv_count)}")
-
-    return tenant_uv_count, tenant_type_uv_count, tag_uv_count
-
-def process_reading_data(reading_files, books_data, tenants_data):
-    """Process reading data with the simple additive UV logic."""
-    tenant_uv = defaultdict(Counter)
-    tenant_type_uv = defaultdict(Counter)
-    tag_uv = defaultdict(Counter)
-
-    for file in reading_files:
-        with open(file, 'r', encoding='utf-8') as f:
-            for line in f:
-                try:
-                    record = json.loads(line.strip())
-                    user_id = str(record.get('user_id', ''))  # convert user_id to a string
-                    book_id = str(record.get('book_id', ''))  # convert book_id to a string
-                    tenant_id = str(record.get('tenant_id', ''))  # convert tenant_id to a string
-
-                    if not book_id or not tenant_id:
-                        continue
-
-                    tenant_uv[tenant_id][book_id] += 1
-                    tenant_type = tenants_data.get(tenant_id, '')  # tenant_id is already a string
-                    tenant_type_uv[tenant_type][book_id] += 1
-
-                    tags = books_data.get(book_id, '').split(',')
-                    for tag in tags:
-                        if tag:
-                            tag_uv[tag][book_id] += 1
-
-                except json.JSONDecodeError:
-                    logging.error(f"Failed to parse JSON line in {file}: {line}")
-
-    logging.info(f"Processed reading data, total tenants: {len(tenant_uv)}, tenant types: {len(tenant_type_uv)}, tags: {len(tag_uv)}")
-
-    return tenant_uv, tenant_type_uv, tag_uv
-
-def generate_top_booklist(counter_dict, top_n=1000):
-    """Generate the sorted top-`top_n` booklist for each key."""
-    result = {}
-    for key, counter in counter_dict.items():
-        top_books = counter.most_common(top_n)
-        if not key or len(top_books) == 0:
-            continue
-        result[key] = ','.join([f'{bid}:{uv}' for bid, uv in top_books])
-    return result
-
-def write_output(data, output_dir, prefix, current_date):
-    """Write the output file and point a symlink at it in the output directory."""
-    try:
-        output_file_path = os.path.join(output_dir, f'{prefix}_{current_date}.txt')
-        output_file_link = os.path.join(output_dir, f'{prefix}.txt')
-
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-
-        with open(output_file_path, 'w', encoding='utf-8') as f:
-            for key, booklist in data.items():
-                key = key.replace('\t', ' ')  # assign the result so tabs in the key cannot break the tab-separated output
-                if not key or not booklist:
-                    continue
-                f.write(f"{key}\t{booklist}\n")
-
-        logging.info(f"Output written to {output_file_path}")
-
-        if os.path.islink(output_file_link) or os.path.exists(output_file_link):
-            os.remove(output_file_link)
-
-        os.symlink(os.path.basename(output_file_path), output_file_link)
-        logging.info(f"Symlink created at {output_file_link} pointing to {output_file_path}")
-
-    except Exception as e:
-        logging.error(f"Error writing output or creating symlink: {str(e)}")
-
-def merge_tenant_uv_with_type_uv(tenant_uv, tenant_type_uv, tenants_data, ratio=CONFIG['tenant_type_ratio']):
-    """Merge each tenant's UV counts with the UV counts of its tenant_type.
-
-    Purpose: blending in the industry's UV data smooths over tenants with little data of their own, giving them more industry-level weight and avoiding the statistical bias caused by small samples.
-
-    The ratio parameter controls the weight of the industry UV counts in the blend. A higher ratio gives the industry data more influence; a lower ratio lets the individual tenant's own data dominate.
-    """
-    merged_tenant_uv = defaultdict(Counter)
-
-    for tenant_id, books_counter in tenant_uv.items():
-        # Look up this tenant's tenant_type
-        tenant_type = tenants_data.get(tenant_id, '')
-
-        # UV counts for that tenant_type
-        tenant_type_counter = tenant_type_uv.get(tenant_type, Counter())
-
-        # Combine the tenant's own UV counts with the tenant_type's UV counts (scaled by the ratio)
-        for book_id, uv_count in books_counter.items():
-            tenant_type_uv_adjusted = int(tenant_type_counter.get(book_id, 0) * ratio)
-            merged_tenant_uv[tenant_id][book_id] = uv_count + tenant_type_uv_adjusted
-
-    logging.info(f"Merged tenant UV with tenant type UV using ratio {ratio}")
-    return merged_tenant_uv
-
-def main():
-    # Current date
-    current_date = datetime.today().strftime('%Y%m%d')
-
-    # Load book and tenant metadata
-    books_data = load_books_data(CONFIG['books_path'])
-    tenants_data = load_tenants_data(CONFIG['tenants_path'])
-
-    # Collect the reading data files for the configured number of recent days
-    reading_files = get_recent_files(CONFIG['base_dir'], days=CONFIG['days'])
-
-    # Choose the UV processing logic according to the configuration
-    if CONFIG['use_simple_uv_processing']:
-        tenant_uv, tenant_type_uv, tag_uv = process_reading_data(reading_files, books_data, tenants_data)
-    else:
-        tenant_uv, tenant_type_uv, tag_uv = process_reading_data_by_uv(reading_files, books_data, tenants_data)
-
-    # Blend tenant UV with tenant_type UV (at the configured ratio)
-    merged_tenant_uv = merge_tenant_uv_with_type_uv(tenant_uv, tenant_type_uv, tenants_data, ratio=CONFIG['tenant_type_ratio'])
-
-    # Generate the top-N booklists
-    tenant_booklist = generate_top_booklist(merged_tenant_uv, top_n=CONFIG['top_n'])
-    tenant_type_booklist = generate_top_booklist(tenant_type_uv, top_n=CONFIG['top_n'])
-    tag_booklist = generate_top_booklist(tag_uv, top_n=CONFIG['top_n'])
-
-    # Write the output files and refresh the symlinks
-    write_output(tenant_booklist, CONFIG['output_dir'], 'tenant_booklist', current_date)
-    write_output(tenant_type_booklist, CONFIG['output_dir'], 'tenant_type_booklist', current_date)
-    write_output(tag_booklist, CONFIG['output_dir'], 'tag_booklist', current_date)
-
-if __name__ == '__main__':
-    main()
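As a usage note on the files that `write_output()` produces: each line is a key (tenant ID, tenant type, or tag), a tab, then a comma-separated `book_id:uv` list. A minimal consumer sketch, assuming a hypothetical downstream reader and the illustrative path `output/tenant_booklist.txt` (neither is part of this repository):

```python
# Sketch: parse one booklist file written by write_output().
# Format assumed: "key<TAB>book_id1:uv1,book_id2:uv2,..." per line.
def load_booklist(path):
    booklists = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            key, _, payload = line.rstrip('\n').partition('\t')
            if not key or not payload:
                continue
            pairs = (entry.split(':', 1) for entry in payload.split(',') if entry)
            booklists[key] = [(book_id, int(uv)) for book_id, uv in pairs]
    return booklists

# Hypothetical usage: ranked (book_id, uv) pairs for tenant "42"
# ranking = load_booklist('output/tenant_booklist.txt').get('42', [])
```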
hot/run.sh deleted
item_sim.py deleted
@@ -1,88 +0,0 @@
-import pandas as pd
-import math
-from collections import defaultdict
-from sqlalchemy import create_engine
-from db_service import create_db_connection
-import argparse
-
-def clean_text_field(text):
-    if pd.isna(text):
-        return ''
-    # Remove newlines and carriage returns, and escape characters that could break the CSV format
-    return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
-
-# Database connection configuration
-host = 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com'
-port = '9030'
-database = 'datacenter'
-username = 'readonly'
-password = 'essa1234'
-
-# Create the database connection
-engine = create_db_connection(host, port, database, username, password)
-
-# SQL query: fetch user click sequences
-sql_query = """
-SELECT
-    DATE_FORMAT(se.create_time, '%%Y-%%m-%%d') AS date,
-    se.anonymous_id AS user_id,
-    se.item_id,
-    pgs.name AS item_name
-FROM
-    sensors_events se
-LEFT JOIN prd_goods_sku pgs ON se.item_id = pgs.id
-WHERE
-    se.event IN ('contactFactory', 'addToPool', 'addToCart')
-    AND se.create_time >= '2025-04-01'
-ORDER BY
-    se.anonymous_id,
-    se.create_time;
-"""
-
-# Run the SQL query and load the result into a pandas DataFrame
-df = pd.read_sql(sql_query, engine)
-
-# Build co-occurrence counts from the click sequences
-cooccur = defaultdict(lambda: defaultdict(int))
-freq = defaultdict(int)
-
-# Process the click sequences grouped by user and date
-for (user_id, date), group in df.groupby(['user_id', 'date']):
-    items = group['item_id'].tolist()
-    unique_items = set(items)
-
-    # Update item frequencies
-    for item in unique_items:
-        freq[item] += 1
-
-    # Update co-occurrence counts
-    for i in range(len(items)):
-        for j in range(i + 1, len(items)):
-            item1, item2 = items[i], items[j]
-            if item1 != item2:
-                cooccur[item1][item2] += 1
-                cooccur[item2][item1] += 1
-
-# Compute cosine similarity
-result = {}
-for item1 in cooccur:
-    sim_scores = []
-    for item2 in cooccur[item1]:
-        numerator = cooccur[item1][item2]
-        denominator = math.sqrt(freq[item1]) * math.sqrt(freq[item2])
-        if denominator != 0:
-            score = numerator / denominator
-            sim_scores.append((item2, score))
-    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
-    result[item1] = sim_scores
-
-# Build an item_id -> item_name mapping
-item_name_map = dict(zip(df['item_id'], df['item_name']))
-
-# Print the similar items
-for item_id, sims in result.items():
-    item_name = item_name_map.get(item_id, 'Unknown')
-    # Keep only the 8 most similar items
-    top_sims = sims[:8]
-    sim_str = ','.join([f'{item_name_map.get(sim_id, "Unknown")}:{score:.4f}' for sim_id, score in top_sims])
-    print(f'{item_name}\t{sim_str}')
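The score printed by item_sim.py, and by tag_sim.py below with the same arithmetic, is a co-occurrence cosine. Writing C_ij for `cooccur[i][j]` and N_i for `freq[i]`:

$$\mathrm{sim}(i,j) = \frac{C_{ij}}{\sqrt{N_i}\,\sqrt{N_j}}$$

Here N_i is the number of user-day click sessions containing item i (in tag_sim.py, the number of purchase orders containing category i), and C_ij counts how often i and j are paired inside the same session (order). With made-up numbers: two items paired in 4 sessions, appearing in 16 and 25 sessions respectively, score 4 / (4 · 5) = 0.2.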
tag_sim.py deleted
@@ -1,81 +0,0 @@
-import pandas as pd
-import math
-from collections import defaultdict
-from sqlalchemy import create_engine
-from db_service import create_db_connection
-import argparse
-
-def clean_text_field(text):
-    if pd.isna(text):
-        return ''
-    # Remove newlines and carriage returns, and escape characters that could break the CSV format
-    return str(text).replace('\r', ' ').replace('\n', ' ').replace('"', '""').strip()
-
-bpms_host = '120.76.244.158'
-bpms_port = '3325'
-bpms_database = 'bpms'
-bpms_username = 'PRD_M1_190311'
-bpms_password = 'WTF)xdbqtW!4gwA7'
-
-# Create the database connection
-engine = create_db_connection(bpms_host, bpms_port, bpms_database, bpms_username, bpms_password)
-
-# SQL query (column aliases: PO单号 = PO number, 区域 = region, 客户编码 = customer code, 商品信息 = product categories, 下单货时间 = order time)
-sql_query = """
-SELECT
-    sp.code AS `PO单号`,
-    psm.name AS `区域`,
-    bb.code AS `客户编码`,
-    GROUP_CONCAT(pc_1.name) AS `商品信息`,
-    MIN(spi.order_time) AS `下单货时间`
-FROM sale_po sp
-INNER JOIN sale_po_item spi ON sp.id = spi.po_id
-LEFT JOIN buy_buyer bb ON bb.id = sp.buyer_id
-LEFT JOIN prd_goods pg ON pg.id = spi.spu_id
-LEFT JOIN prd_category AS pc_1 ON pc_1.id = SUBSTRING_INDEX(SUBSTRING_INDEX(pg.category_id, '.', 2), '.', -1)
-LEFT JOIN pub_sale_market_setting psms ON psms.country_code = bb.countries
-LEFT JOIN pub_sale_market psm ON psms.sale_market_id = psm.id
-WHERE spi.quantity > 0
-    AND spi.is_delete = 0
-    AND bb.is_delete = 0
-GROUP BY sp.code, psm.name, bb.code;
-"""
-
-# Run the SQL query and load the result into a pandas DataFrame
-df = pd.read_sql(sql_query, engine)
-
-# Split and deduplicate the product categories, then build co-occurrence counts
-cooccur = defaultdict(lambda: defaultdict(int))
-freq = defaultdict(int)
-
-for _, row in df.iterrows():
-    # Handle None values in 商品信息
-    if pd.isna(row['商品信息']):
-        continue
-    categories = [cat.strip() for cat in str(row['商品信息']).split(',') if cat.strip()]
-    unique_cats = set(categories)
-    for c1 in unique_cats:
-        freq[c1] += 1
-        for c2 in unique_cats:
-            if c1 != c2:
-                cooccur[c1][c2] += 1
-
-# Compute cosine similarity
-result = {}
-for c1 in cooccur:
-    sim_scores = []
-    for c2 in cooccur[c1]:
-        numerator = cooccur[c1][c2]
-        denominator = math.sqrt(freq[c1]) * math.sqrt(freq[c2])
-        if denominator != 0:
-            score = numerator / denominator
-            sim_scores.append((c2, score))
-    sim_scores.sort(key=lambda x: -x[1])  # sort by score, descending
-    result[c1] = sim_scores
-
-# Print the similar categories
-for cat, sims in result.items():
-    # Keep only the 8 most similar categories
-    top_sims = sims[:8]
-    sim_str = ','.join([f'{sim_cat}:{score:.4f}' for sim_cat, score in top_sims])
-    print(f'{cat}\t{sim_str}')