Commit c9f77c8fefe8972664c0db61f43d7df321d4db76

Authored by tangwang
1 parent 06cb25fa

deepwalk refactor for memory saving and performance optimization

offline_tasks/config.py deleted
@@ -1,26 +0,0 @@ @@ -1,26 +0,0 @@
1 -import os # Add for environment variable reading  
2 -  
3 -  
4 -ES_CONFIG = {  
5 - 'host': 'http://localhost:9200',  
6 - # default index name will be overwritten below based on APP_ENV  
7 - 'index_name': 'spu',  
8 - 'username': 'essa',  
9 - 'password': '4hOaLaf41y2VuI8y'  
10 -}  
11 -  
12 -  
13 -# Redis Cache Configuration  
14 -REDIS_CONFIG = {  
15 - # 'host': '120.76.41.98',  
16 - 'host': 'localhost',  
17 - 'port': 6479,  
18 - 'snapshot_db': 0,  
19 - 'password': 'BMfv5aI31kgHWtlx',  
20 - 'socket_timeout': 1,  
21 - 'socket_connect_timeout': 1,  
22 - 'retry_on_timeout': False,  
23 - 'cache_expire_days': 180, # 6 months  
24 - 'translation_cache_expire_days': 360,  
25 - 'translation_cache_prefix': 'trans'  
26 -}  
offline_tasks/scripts/db_service.py renamed to offline_tasks/db_service.py
offline_tasks/scripts/add_names_to_swing.py
@@ -5,7 +5,7 @@ @@ -5,7 +5,7 @@
5 """ 5 """
6 import argparse 6 import argparse
7 from datetime import datetime 7 from datetime import datetime
8 -from debug_utils import setup_debug_logger, load_name_mappings_from_file 8 +from scripts.debug_utils import setup_debug_logger, load_name_mappings_from_file
9 9
10 10
11 def add_names_to_swing_result(input_file, output_file, name_mappings, logger=None, debug=False): 11 def add_names_to_swing_result(input_file, output_file, name_mappings, logger=None, debug=False):
offline_tasks/scripts/config.py deleted
@@ -1,130 +0,0 @@ @@ -1,130 +0,0 @@
1 -"""  
2 -离线任务配置文件  
3 -包含数据库连接、路径、参数等配置  
4 -"""  
5 -import os  
6 -from datetime import datetime, timedelta  
7 -  
8 -# 数据库配置  
9 -DB_CONFIG = {  
10 - 'host': 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com',  
11 - 'port': '9030',  
12 - 'database': 'datacenter',  
13 - 'username': 'readonly',  
14 - 'password': 'essa1234'  
15 -}  
16 -  
17 -# 路径配置  
18 -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  
19 -OUTPUT_DIR = os.path.join(BASE_DIR, 'output')  
20 -LOG_DIR = os.path.join(BASE_DIR, 'logs')  
21 -  
22 -# 确保目录存在  
23 -os.makedirs(OUTPUT_DIR, exist_ok=True)  
24 -os.makedirs(LOG_DIR, exist_ok=True)  
25 -  
26 -# ============================================================================  
27 -# 默认参数配置(用于调试和生产)  
28 -# ============================================================================  
29 -  
30 -# 时间配置(建议先用小数值调试,确认无误后再改为大数值)  
31 -DEFAULT_LOOKBACK_DAYS = 400 # 默认回看天数(调试用30天,生产可改为730天)  
32 -DEFAULT_RECENT_DAYS = 180 # 默认最近天数(调试用7天,生产可改为180天)  
33 -  
34 -# i2i算法默认参数  
35 -DEFAULT_I2I_TOP_N = 50 # 默认返回Top N个相似商品  
36 -  
37 -# 兴趣聚合默认参数  
38 -DEFAULT_INTEREST_TOP_N = 1000 # 默认每个key返回Top N个商品  
39 -  
40 -# 获取时间范围  
41 -def get_time_range(days=DEFAULT_LOOKBACK_DAYS):  
42 - """获取时间范围"""  
43 - end_date = datetime.now()  
44 - start_date = end_date - timedelta(days=days)  
45 - return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')  
46 -  
47 -# i2i 行为相似算法配置  
48 -I2I_CONFIG = {  
49 - # Swing 算法配置  
50 - 'swing': {  
51 - 'alpha': 0.5, # swing算法的alpha参数  
52 - 'threshold1': 0.5, # 交互强度阈值1  
53 - 'threshold2': 0.5, # 交互强度阈值2  
54 - 'max_sim_list_len': 300, # 最大相似列表长度  
55 - 'top_n': 50, # 输出top N个相似商品  
56 - 'thread_num': 10, # 线程数(如果使用C++版本)  
57 - },  
58 -  
59 - # Session W2V 配置  
60 - 'session_w2v': {  
61 - 'max_sentence_length': 100, # 最大句子长度  
62 - 'window_size': 5, # 窗口大小  
63 - 'vector_size': 128, # 向量维度  
64 - 'min_count': 2, # 最小词频  
65 - 'workers': 10, # 训练线程数  
66 - 'epochs': 10, # 训练轮数  
67 - 'sg': 1, # 使用skip-gram  
68 - },  
69 -  
70 - # DeepWalk 配置  
71 - 'deepwalk': {  
72 - 'num_walks': 10, # 每个节点的游走次数  
73 - 'walk_length': 40, # 游走长度  
74 - 'window_size': 5, # 窗口大小  
75 - 'vector_size': 128, # 向量维度  
76 - 'min_count': 2, # 最小词频  
77 - 'workers': 10, # 训练线程数  
78 - 'epochs': 10, # 训练轮数  
79 - 'sg': 1, # 使用skip-gram  
80 - 'use_softmax': True, # 使用softmax  
81 - 'temperature': 1.0, # softmax温度  
82 - 'p_tag_walk': 0.2, # 通过标签游走的概率  
83 - }  
84 -}  
85 -  
86 -# 兴趣点聚合配置  
87 -INTEREST_AGGREGATION_CONFIG = {  
88 - 'top_n': 1000, # 每个key生成前N个商品  
89 - 'time_decay_factor': 0.95, # 时间衰减因子(每30天)  
90 - 'min_interaction_count': 2, # 最小交互次数  
91 -  
92 - # 行为权重  
93 - 'behavior_weights': {  
94 - 'click': 1.0,  
95 - 'addToCart': 3.0,  
96 - 'addToPool': 2.0,  
97 - 'contactFactory': 5.0,  
98 - 'purchase': 10.0,  
99 - },  
100 -  
101 - # 类型配置  
102 - 'list_types': ['hot', 'cart', 'new'], # 热门、加购、新品  
103 -}  
104 -  
105 -# Redis配置(用于存储索引)  
106 -REDIS_CONFIG = {  
107 - 'host': 'localhost',  
108 - 'port': 6379,  
109 - 'db': 0,  
110 - 'password': None,  
111 - 'decode_responses': False  
112 -}  
113 -  
114 -# 日志配置  
115 -LOG_CONFIG = {  
116 - 'level': 'INFO',  
117 - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
118 - 'date_format': '%Y-%m-%d %H:%M:%S'  
119 -}  
120 -  
121 -# Debug配置  
122 -DEBUG_CONFIG = {  
123 - 'enabled': False, # 是否开启debug模式  
124 - 'log_level': 'DEBUG', # debug日志级别  
125 - 'sample_size': 5, # 数据采样大小  
126 - 'save_readable': True, # 是否保存可读明文文件  
127 - 'log_dataframe_info': True, # 是否记录DataFrame详细信息  
128 - 'log_intermediate': True, # 是否记录中间结果  
129 -}  
130 -  
offline_tasks/scripts/fetch_item_attributes.py
@@ -8,8 +8,8 @@ import json @@ -8,8 +8,8 @@ import json
8 import argparse 8 import argparse
9 from datetime import datetime 9 from datetime import datetime
10 from db_service import create_db_connection 10 from db_service import create_db_connection
11 -from config import DB_CONFIG, OUTPUT_DIR  
12 -from debug_utils import setup_debug_logger 11 +from config.offline_config import DB_CONFIG, OUTPUT_DIR
  12 +from scripts.debug_utils import setup_debug_logger
13 13
14 14
15 def fetch_and_save_mappings(engine, output_dir, logger=None, debug=False): 15 def fetch_and_save_mappings(engine, output_dir, logger=None, debug=False):
offline_tasks/scripts/generate_session.py
@@ -9,11 +9,11 @@ from collections import defaultdict @@ -9,11 +9,11 @@ from collections import defaultdict
9 import argparse 9 import argparse
10 from datetime import datetime, timedelta 10 from datetime import datetime, timedelta
11 from db_service import create_db_connection 11 from db_service import create_db_connection
12 -from config import ( 12 +from config.offline_config import (
13 DB_CONFIG, OUTPUT_DIR, get_time_range, 13 DB_CONFIG, OUTPUT_DIR, get_time_range,
14 DEFAULT_LOOKBACK_DAYS 14 DEFAULT_LOOKBACK_DAYS
15 ) 15 )
16 -from debug_utils import setup_debug_logger, log_dataframe_info 16 +from scripts.debug_utils import setup_debug_logger, log_dataframe_info
17 17
18 18
19 def aggregate_user_sessions(df, behavior_weights, logger=None, debug=False): 19 def aggregate_user_sessions(df, behavior_weights, logger=None, debug=False):
offline_tasks/scripts/i2i_content_similar.py
@@ -9,8 +9,8 @@ import pandas as pd @@ -9,8 +9,8 @@ import pandas as pd
9 from datetime import datetime, timedelta 9 from datetime import datetime, timedelta
10 from elasticsearch import Elasticsearch 10 from elasticsearch import Elasticsearch
11 from db_service import create_db_connection 11 from db_service import create_db_connection
12 -from config import DB_CONFIG, OUTPUT_DIR  
13 -from debug_utils import setup_debug_logger, log_processing_step 12 +from config.offline_config import DB_CONFIG, OUTPUT_DIR
  13 +from scripts.debug_utils import setup_debug_logger, log_processing_step
14 14
15 # ES配置 15 # ES配置
16 ES_CONFIG = { 16 ES_CONFIG = {
offline_tasks/scripts/i2i_deepwalk.py
@@ -6,24 +6,22 @@ i2i - DeepWalk算法实现 @@ -6,24 +6,22 @@ i2i - DeepWalk算法实现
6 import pandas as pd 6 import pandas as pd
7 import argparse 7 import argparse
8 import os 8 import os
9 -import sys  
10 from datetime import datetime 9 from datetime import datetime
11 from collections import defaultdict 10 from collections import defaultdict
12 from gensim.models import Word2Vec 11 from gensim.models import Word2Vec
13 from db_service import create_db_connection 12 from db_service import create_db_connection
14 -from config import ( 13 +from config.offline_config import (
15 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
16 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
17 ) 16 )
18 -from debug_utils import ( 17 +from scripts.debug_utils import (
19 setup_debug_logger, log_dataframe_info, 18 setup_debug_logger, log_dataframe_info,
20 save_readable_index, fetch_name_mappings, log_algorithm_params, 19 save_readable_index, fetch_name_mappings, log_algorithm_params,
21 log_processing_step 20 log_processing_step
22 ) 21 )
23 22
24 # 导入 DeepWalk 实现 23 # 导入 DeepWalk 实现
25 -sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deepwalk'))  
26 -from deepwalk import DeepWalk 24 +from deepwalk.deepwalk import DeepWalk
27 25
28 26
29 def build_edge_file_from_db(df, behavior_weights, output_path, logger): 27 def build_edge_file_from_db(df, behavior_weights, output_path, logger):
offline_tasks/scripts/i2i_session_w2v.py
@@ -10,11 +10,11 @@ from collections import defaultdict @@ -10,11 +10,11 @@ from collections import defaultdict
10 from gensim.models import Word2Vec 10 from gensim.models import Word2Vec
11 import numpy as np 11 import numpy as np
12 from db_service import create_db_connection 12 from db_service import create_db_connection
13 -from config import ( 13 +from config.offline_config import (
14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
16 ) 16 )
17 -from debug_utils import ( 17 +from scripts.debug_utils import (
18 setup_debug_logger, log_dataframe_info, log_dict_stats, 18 setup_debug_logger, log_dataframe_info, log_dict_stats,
19 save_readable_index, fetch_name_mappings, log_algorithm_params, 19 save_readable_index, fetch_name_mappings, log_algorithm_params,
20 log_processing_step 20 log_processing_step
offline_tasks/scripts/i2i_swing.py
@@ -10,11 +10,11 @@ import argparse @@ -10,11 +10,11 @@ import argparse
10 import json 10 import json
11 from datetime import datetime, timedelta 11 from datetime import datetime, timedelta
12 from db_service import create_db_connection 12 from db_service import create_db_connection
13 -from config import ( 13 +from config.offline_config import (
14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range, 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
16 ) 16 )
17 -from debug_utils import ( 17 +from scripts.debug_utils import (
18 setup_debug_logger, log_dataframe_info, log_dict_stats, 18 setup_debug_logger, log_dataframe_info, log_dict_stats,
19 save_readable_index, load_name_mappings_from_file, log_algorithm_params, 19 save_readable_index, load_name_mappings_from_file, log_algorithm_params,
20 log_processing_step 20 log_processing_step
offline_tasks/scripts/interest_aggregation.py
@@ -9,11 +9,11 @@ import json @@ -9,11 +9,11 @@ import json
9 from datetime import datetime, timedelta 9 from datetime import datetime, timedelta
10 from collections import defaultdict, Counter 10 from collections import defaultdict, Counter
11 from db_service import create_db_connection 11 from db_service import create_db_connection
12 -from config import ( 12 +from config.offline_config import (
13 DB_CONFIG, OUTPUT_DIR, INTEREST_AGGREGATION_CONFIG, get_time_range, 13 DB_CONFIG, OUTPUT_DIR, INTEREST_AGGREGATION_CONFIG, get_time_range,
14 DEFAULT_LOOKBACK_DAYS, DEFAULT_RECENT_DAYS, DEFAULT_INTEREST_TOP_N 14 DEFAULT_LOOKBACK_DAYS, DEFAULT_RECENT_DAYS, DEFAULT_INTEREST_TOP_N
15 ) 15 )
16 -from debug_utils import ( 16 +from scripts.debug_utils import (
17 setup_debug_logger, log_dataframe_info, log_dict_stats, 17 setup_debug_logger, log_dataframe_info, log_dict_stats,
18 save_readable_index, fetch_name_mappings, log_algorithm_params, 18 save_readable_index, fetch_name_mappings, log_algorithm_params,
19 log_processing_step 19 log_processing_step
offline_tasks/scripts/load_index_to_redis.py
@@ -6,7 +6,7 @@ import redis @@ -6,7 +6,7 @@ import redis
6 import argparse 6 import argparse
7 import logging 7 import logging
8 from datetime import datetime 8 from datetime import datetime
9 -from config import REDIS_CONFIG, OUTPUT_DIR 9 +from config.offline_config import REDIS_CONFIG, OUTPUT_DIR
10 10
11 logging.basicConfig( 11 logging.basicConfig(
12 level=logging.INFO, 12 level=logging.INFO,