Commit c9f77c8fefe8972664c0db61f43d7df321d4db76
1 parent
06cb25fa
deepwalk refactor for memory saving and performance optimization
Showing
12 changed files
with
17 additions
and
175 deletions
Show diff stats
offline_tasks/config.py deleted
| ... | ... | @@ -1,26 +0,0 @@ |
| 1 | -import os # Add for environment variable reading | |
| 2 | - | |
| 3 | - | |
| 4 | -ES_CONFIG = { | |
| 5 | - 'host': 'http://localhost:9200', | |
| 6 | - # default index name will be overwritten below based on APP_ENV | |
| 7 | - 'index_name': 'spu', | |
| 8 | - 'username': 'essa', | |
| 9 | - 'password': '4hOaLaf41y2VuI8y' | |
| 10 | -} | |
| 11 | - | |
| 12 | - | |
| 13 | -# Redis Cache Configuration | |
| 14 | -REDIS_CONFIG = { | |
| 15 | - # 'host': '120.76.41.98', | |
| 16 | - 'host': 'localhost', | |
| 17 | - 'port': 6479, | |
| 18 | - 'snapshot_db': 0, | |
| 19 | - 'password': 'BMfv5aI31kgHWtlx', | |
| 20 | - 'socket_timeout': 1, | |
| 21 | - 'socket_connect_timeout': 1, | |
| 22 | - 'retry_on_timeout': False, | |
| 23 | - 'cache_expire_days': 180, # 6 months | |
| 24 | - 'translation_cache_expire_days': 360, | |
| 25 | - 'translation_cache_prefix': 'trans' | |
| 26 | -} |
offline_tasks/scripts/db_service.py renamed to offline_tasks/db_service.py
offline_tasks/scripts/add_names_to_swing.py
| ... | ... | @@ -5,7 +5,7 @@ |
| 5 | 5 | """ |
| 6 | 6 | import argparse |
| 7 | 7 | from datetime import datetime |
| 8 | -from debug_utils import setup_debug_logger, load_name_mappings_from_file | |
| 8 | +from scripts.debug_utils import setup_debug_logger, load_name_mappings_from_file | |
| 9 | 9 | |
| 10 | 10 | |
| 11 | 11 | def add_names_to_swing_result(input_file, output_file, name_mappings, logger=None, debug=False): | ... | ... |
offline_tasks/scripts/config.py deleted
| ... | ... | @@ -1,130 +0,0 @@ |
| 1 | -""" | |
| 2 | -离线任务配置文件 | |
| 3 | -包含数据库连接、路径、参数等配置 | |
| 4 | -""" | |
| 5 | -import os | |
| 6 | -from datetime import datetime, timedelta | |
| 7 | - | |
| 8 | -# 数据库配置 | |
| 9 | -DB_CONFIG = { | |
| 10 | - 'host': 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com', | |
| 11 | - 'port': '9030', | |
| 12 | - 'database': 'datacenter', | |
| 13 | - 'username': 'readonly', | |
| 14 | - 'password': 'essa1234' | |
| 15 | -} | |
| 16 | - | |
| 17 | -# 路径配置 | |
| 18 | -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| 19 | -OUTPUT_DIR = os.path.join(BASE_DIR, 'output') | |
| 20 | -LOG_DIR = os.path.join(BASE_DIR, 'logs') | |
| 21 | - | |
| 22 | -# 确保目录存在 | |
| 23 | -os.makedirs(OUTPUT_DIR, exist_ok=True) | |
| 24 | -os.makedirs(LOG_DIR, exist_ok=True) | |
| 25 | - | |
| 26 | -# ============================================================================ | |
| 27 | -# 默认参数配置(用于调试和生产) | |
| 28 | -# ============================================================================ | |
| 29 | - | |
| 30 | -# 时间配置(建议先用小数值调试,确认无误后再改为大数值) | |
| 31 | -DEFAULT_LOOKBACK_DAYS = 400 # 默认回看天数(调试用30天,生产可改为730天) | |
| 32 | -DEFAULT_RECENT_DAYS = 180 # 默认最近天数(调试用7天,生产可改为180天) | |
| 33 | - | |
| 34 | -# i2i算法默认参数 | |
| 35 | -DEFAULT_I2I_TOP_N = 50 # 默认返回Top N个相似商品 | |
| 36 | - | |
| 37 | -# 兴趣聚合默认参数 | |
| 38 | -DEFAULT_INTEREST_TOP_N = 1000 # 默认每个key返回Top N个商品 | |
| 39 | - | |
| 40 | -# 获取时间范围 | |
| 41 | -def get_time_range(days=DEFAULT_LOOKBACK_DAYS): | |
| 42 | - """获取时间范围""" | |
| 43 | - end_date = datetime.now() | |
| 44 | - start_date = end_date - timedelta(days=days) | |
| 45 | - return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d') | |
| 46 | - | |
| 47 | -# i2i 行为相似算法配置 | |
| 48 | -I2I_CONFIG = { | |
| 49 | - # Swing 算法配置 | |
| 50 | - 'swing': { | |
| 51 | - 'alpha': 0.5, # swing算法的alpha参数 | |
| 52 | - 'threshold1': 0.5, # 交互强度阈值1 | |
| 53 | - 'threshold2': 0.5, # 交互强度阈值2 | |
| 54 | - 'max_sim_list_len': 300, # 最大相似列表长度 | |
| 55 | - 'top_n': 50, # 输出top N个相似商品 | |
| 56 | - 'thread_num': 10, # 线程数(如果使用C++版本) | |
| 57 | - }, | |
| 58 | - | |
| 59 | - # Session W2V 配置 | |
| 60 | - 'session_w2v': { | |
| 61 | - 'max_sentence_length': 100, # 最大句子长度 | |
| 62 | - 'window_size': 5, # 窗口大小 | |
| 63 | - 'vector_size': 128, # 向量维度 | |
| 64 | - 'min_count': 2, # 最小词频 | |
| 65 | - 'workers': 10, # 训练线程数 | |
| 66 | - 'epochs': 10, # 训练轮数 | |
| 67 | - 'sg': 1, # 使用skip-gram | |
| 68 | - }, | |
| 69 | - | |
| 70 | - # DeepWalk 配置 | |
| 71 | - 'deepwalk': { | |
| 72 | - 'num_walks': 10, # 每个节点的游走次数 | |
| 73 | - 'walk_length': 40, # 游走长度 | |
| 74 | - 'window_size': 5, # 窗口大小 | |
| 75 | - 'vector_size': 128, # 向量维度 | |
| 76 | - 'min_count': 2, # 最小词频 | |
| 77 | - 'workers': 10, # 训练线程数 | |
| 78 | - 'epochs': 10, # 训练轮数 | |
| 79 | - 'sg': 1, # 使用skip-gram | |
| 80 | - 'use_softmax': True, # 使用softmax | |
| 81 | - 'temperature': 1.0, # softmax温度 | |
| 82 | - 'p_tag_walk': 0.2, # 通过标签游走的概率 | |
| 83 | - } | |
| 84 | -} | |
| 85 | - | |
| 86 | -# 兴趣点聚合配置 | |
| 87 | -INTEREST_AGGREGATION_CONFIG = { | |
| 88 | - 'top_n': 1000, # 每个key生成前N个商品 | |
| 89 | - 'time_decay_factor': 0.95, # 时间衰减因子(每30天) | |
| 90 | - 'min_interaction_count': 2, # 最小交互次数 | |
| 91 | - | |
| 92 | - # 行为权重 | |
| 93 | - 'behavior_weights': { | |
| 94 | - 'click': 1.0, | |
| 95 | - 'addToCart': 3.0, | |
| 96 | - 'addToPool': 2.0, | |
| 97 | - 'contactFactory': 5.0, | |
| 98 | - 'purchase': 10.0, | |
| 99 | - }, | |
| 100 | - | |
| 101 | - # 类型配置 | |
| 102 | - 'list_types': ['hot', 'cart', 'new'], # 热门、加购、新品 | |
| 103 | -} | |
| 104 | - | |
| 105 | -# Redis配置(用于存储索引) | |
| 106 | -REDIS_CONFIG = { | |
| 107 | - 'host': 'localhost', | |
| 108 | - 'port': 6379, | |
| 109 | - 'db': 0, | |
| 110 | - 'password': None, | |
| 111 | - 'decode_responses': False | |
| 112 | -} | |
| 113 | - | |
| 114 | -# 日志配置 | |
| 115 | -LOG_CONFIG = { | |
| 116 | - 'level': 'INFO', | |
| 117 | - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', | |
| 118 | - 'date_format': '%Y-%m-%d %H:%M:%S' | |
| 119 | -} | |
| 120 | - | |
| 121 | -# Debug配置 | |
| 122 | -DEBUG_CONFIG = { | |
| 123 | - 'enabled': False, # 是否开启debug模式 | |
| 124 | - 'log_level': 'DEBUG', # debug日志级别 | |
| 125 | - 'sample_size': 5, # 数据采样大小 | |
| 126 | - 'save_readable': True, # 是否保存可读明文文件 | |
| 127 | - 'log_dataframe_info': True, # 是否记录DataFrame详细信息 | |
| 128 | - 'log_intermediate': True, # 是否记录中间结果 | |
| 129 | -} | |
| 130 | - |
offline_tasks/scripts/fetch_item_attributes.py
| ... | ... | @@ -8,8 +8,8 @@ import json |
| 8 | 8 | import argparse |
| 9 | 9 | from datetime import datetime |
| 10 | 10 | from db_service import create_db_connection |
| 11 | -from config import DB_CONFIG, OUTPUT_DIR | |
| 12 | -from debug_utils import setup_debug_logger | |
| 11 | +from config.offline_config import DB_CONFIG, OUTPUT_DIR | |
| 12 | +from scripts.debug_utils import setup_debug_logger | |
| 13 | 13 | |
| 14 | 14 | |
| 15 | 15 | def fetch_and_save_mappings(engine, output_dir, logger=None, debug=False): | ... | ... |
offline_tasks/scripts/generate_session.py
| ... | ... | @@ -9,11 +9,11 @@ from collections import defaultdict |
| 9 | 9 | import argparse |
| 10 | 10 | from datetime import datetime, timedelta |
| 11 | 11 | from db_service import create_db_connection |
| 12 | -from config import ( | |
| 12 | +from config.offline_config import ( | |
| 13 | 13 | DB_CONFIG, OUTPUT_DIR, get_time_range, |
| 14 | 14 | DEFAULT_LOOKBACK_DAYS |
| 15 | 15 | ) |
| 16 | -from debug_utils import setup_debug_logger, log_dataframe_info | |
| 16 | +from scripts.debug_utils import setup_debug_logger, log_dataframe_info | |
| 17 | 17 | |
| 18 | 18 | |
| 19 | 19 | def aggregate_user_sessions(df, behavior_weights, logger=None, debug=False): | ... | ... |
offline_tasks/scripts/i2i_content_similar.py
| ... | ... | @@ -9,8 +9,8 @@ import pandas as pd |
| 9 | 9 | from datetime import datetime, timedelta |
| 10 | 10 | from elasticsearch import Elasticsearch |
| 11 | 11 | from db_service import create_db_connection |
| 12 | -from config import DB_CONFIG, OUTPUT_DIR | |
| 13 | -from debug_utils import setup_debug_logger, log_processing_step | |
| 12 | +from config.offline_config import DB_CONFIG, OUTPUT_DIR | |
| 13 | +from scripts.debug_utils import setup_debug_logger, log_processing_step | |
| 14 | 14 | |
| 15 | 15 | # ES配置 |
| 16 | 16 | ES_CONFIG = { | ... | ... |
offline_tasks/scripts/i2i_deepwalk.py
| ... | ... | @@ -6,24 +6,22 @@ i2i - DeepWalk算法实现 |
| 6 | 6 | import pandas as pd |
| 7 | 7 | import argparse |
| 8 | 8 | import os |
| 9 | -import sys | |
| 10 | 9 | from datetime import datetime |
| 11 | 10 | from collections import defaultdict |
| 12 | 11 | from gensim.models import Word2Vec |
| 13 | 12 | from db_service import create_db_connection |
| 14 | -from config import ( | |
| 13 | +from config.offline_config import ( | |
| 15 | 14 | DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, |
| 16 | 15 | DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N |
| 17 | 16 | ) |
| 18 | -from debug_utils import ( | |
| 17 | +from scripts.debug_utils import ( | |
| 19 | 18 | setup_debug_logger, log_dataframe_info, |
| 20 | 19 | save_readable_index, fetch_name_mappings, log_algorithm_params, |
| 21 | 20 | log_processing_step |
| 22 | 21 | ) |
| 23 | 22 | |
| 24 | 23 | # 导入 DeepWalk 实现 |
| 25 | -sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deepwalk')) | |
| 26 | -from deepwalk import DeepWalk | |
| 24 | +from deepwalk.deepwalk import DeepWalk | |
| 27 | 25 | |
| 28 | 26 | |
| 29 | 27 | def build_edge_file_from_db(df, behavior_weights, output_path, logger): | ... | ... |
offline_tasks/scripts/i2i_session_w2v.py
| ... | ... | @@ -10,11 +10,11 @@ from collections import defaultdict |
| 10 | 10 | from gensim.models import Word2Vec |
| 11 | 11 | import numpy as np |
| 12 | 12 | from db_service import create_db_connection |
| 13 | -from config import ( | |
| 13 | +from config.offline_config import ( | |
| 14 | 14 | DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, |
| 15 | 15 | DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N |
| 16 | 16 | ) |
| 17 | -from debug_utils import ( | |
| 17 | +from scripts.debug_utils import ( | |
| 18 | 18 | setup_debug_logger, log_dataframe_info, log_dict_stats, |
| 19 | 19 | save_readable_index, fetch_name_mappings, log_algorithm_params, |
| 20 | 20 | log_processing_step | ... | ... |
offline_tasks/scripts/i2i_swing.py
| ... | ... | @@ -10,11 +10,11 @@ import argparse |
| 10 | 10 | import json |
| 11 | 11 | from datetime import datetime, timedelta |
| 12 | 12 | from db_service import create_db_connection |
| 13 | -from config import ( | |
| 13 | +from config.offline_config import ( | |
| 14 | 14 | DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, |
| 15 | 15 | DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N |
| 16 | 16 | ) |
| 17 | -from debug_utils import ( | |
| 17 | +from scripts.debug_utils import ( | |
| 18 | 18 | setup_debug_logger, log_dataframe_info, log_dict_stats, |
| 19 | 19 | save_readable_index, load_name_mappings_from_file, log_algorithm_params, |
| 20 | 20 | log_processing_step | ... | ... |
offline_tasks/scripts/interest_aggregation.py
| ... | ... | @@ -9,11 +9,11 @@ import json |
| 9 | 9 | from datetime import datetime, timedelta |
| 10 | 10 | from collections import defaultdict, Counter |
| 11 | 11 | from db_service import create_db_connection |
| 12 | -from config import ( | |
| 12 | +from config.offline_config import ( | |
| 13 | 13 | DB_CONFIG, OUTPUT_DIR, INTEREST_AGGREGATION_CONFIG, get_time_range, |
| 14 | 14 | DEFAULT_LOOKBACK_DAYS, DEFAULT_RECENT_DAYS, DEFAULT_INTEREST_TOP_N |
| 15 | 15 | ) |
| 16 | -from debug_utils import ( | |
| 16 | +from scripts.debug_utils import ( | |
| 17 | 17 | setup_debug_logger, log_dataframe_info, log_dict_stats, |
| 18 | 18 | save_readable_index, fetch_name_mappings, log_algorithm_params, |
| 19 | 19 | log_processing_step | ... | ... |
offline_tasks/scripts/load_index_to_redis.py