Commit c9f77c8fefe8972664c0db61f43d7df321d4db76

Authored by tangwang
1 parent 06cb25fa

deepwalk refactor for memory savings and performance optimization

offline_tasks/config.py deleted
... ... @@ -1,26 +0,0 @@
1   -import os # Add for environment variable reading
2   -
3   -
4   -ES_CONFIG = {
5   - 'host': 'http://localhost:9200',
6   - # default index name will be overwritten below based on APP_ENV
7   - 'index_name': 'spu',
8   - 'username': 'essa',
9   - 'password': '4hOaLaf41y2VuI8y'
10   -}
11   -
12   -
13   -# Redis Cache Configuration
14   -REDIS_CONFIG = {
15   - # 'host': '120.76.41.98',
16   - 'host': 'localhost',
17   - 'port': 6479,
18   - 'snapshot_db': 0,
19   - 'password': 'BMfv5aI31kgHWtlx',
20   - 'socket_timeout': 1,
21   - 'socket_connect_timeout': 1,
22   - 'retry_on_timeout': False,
23   - 'cache_expire_days': 180, # 6 months
24   - 'translation_cache_expire_days': 360,
25   - 'translation_cache_prefix': 'trans'
26   -}
offline_tasks/scripts/db_service.py renamed to offline_tasks/db_service.py
offline_tasks/scripts/add_names_to_swing.py
... ... @@ -5,7 +5,7 @@
5 5 """
6 6 import argparse
7 7 from datetime import datetime
8   -from debug_utils import setup_debug_logger, load_name_mappings_from_file
  8 +from scripts.debug_utils import setup_debug_logger, load_name_mappings_from_file
9 9  
10 10  
11 11 def add_names_to_swing_result(input_file, output_file, name_mappings, logger=None, debug=False):
... ...
offline_tasks/scripts/config.py deleted
... ... @@ -1,130 +0,0 @@
1   -"""
2   -离线任务配置文件
3   -包含数据库连接、路径、参数等配置
4   -"""
5   -import os
6   -from datetime import datetime, timedelta
7   -
8   -# 数据库配置
9   -DB_CONFIG = {
10   - 'host': 'selectdb-cn-wuf3vsokg05-public.selectdbfe.rds.aliyuncs.com',
11   - 'port': '9030',
12   - 'database': 'datacenter',
13   - 'username': 'readonly',
14   - 'password': 'essa1234'
15   -}
16   -
17   -# 路径配置
18   -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19   -OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
20   -LOG_DIR = os.path.join(BASE_DIR, 'logs')
21   -
22   -# 确保目录存在
23   -os.makedirs(OUTPUT_DIR, exist_ok=True)
24   -os.makedirs(LOG_DIR, exist_ok=True)
25   -
26   -# ============================================================================
27   -# 默认参数配置(用于调试和生产)
28   -# ============================================================================
29   -
30   -# 时间配置(建议先用小数值调试,确认无误后再改为大数值)
31   -DEFAULT_LOOKBACK_DAYS = 400 # 默认回看天数(调试用30天,生产可改为730天)
32   -DEFAULT_RECENT_DAYS = 180 # 默认最近天数(调试用7天,生产可改为180天)
33   -
34   -# i2i算法默认参数
35   -DEFAULT_I2I_TOP_N = 50 # 默认返回Top N个相似商品
36   -
37   -# 兴趣聚合默认参数
38   -DEFAULT_INTEREST_TOP_N = 1000 # 默认每个key返回Top N个商品
39   -
40   -# 获取时间范围
41   -def get_time_range(days=DEFAULT_LOOKBACK_DAYS):
42   - """获取时间范围"""
43   - end_date = datetime.now()
44   - start_date = end_date - timedelta(days=days)
45   - return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
46   -
47   -# i2i 行为相似算法配置
48   -I2I_CONFIG = {
49   - # Swing 算法配置
50   - 'swing': {
51   - 'alpha': 0.5, # swing算法的alpha参数
52   - 'threshold1': 0.5, # 交互强度阈值1
53   - 'threshold2': 0.5, # 交互强度阈值2
54   - 'max_sim_list_len': 300, # 最大相似列表长度
55   - 'top_n': 50, # 输出top N个相似商品
56   - 'thread_num': 10, # 线程数(如果使用C++版本)
57   - },
58   -
59   - # Session W2V 配置
60   - 'session_w2v': {
61   - 'max_sentence_length': 100, # 最大句子长度
62   - 'window_size': 5, # 窗口大小
63   - 'vector_size': 128, # 向量维度
64   - 'min_count': 2, # 最小词频
65   - 'workers': 10, # 训练线程数
66   - 'epochs': 10, # 训练轮数
67   - 'sg': 1, # 使用skip-gram
68   - },
69   -
70   - # DeepWalk 配置
71   - 'deepwalk': {
72   - 'num_walks': 10, # 每个节点的游走次数
73   - 'walk_length': 40, # 游走长度
74   - 'window_size': 5, # 窗口大小
75   - 'vector_size': 128, # 向量维度
76   - 'min_count': 2, # 最小词频
77   - 'workers': 10, # 训练线程数
78   - 'epochs': 10, # 训练轮数
79   - 'sg': 1, # 使用skip-gram
80   - 'use_softmax': True, # 使用softmax
81   - 'temperature': 1.0, # softmax温度
82   - 'p_tag_walk': 0.2, # 通过标签游走的概率
83   - }
84   -}
85   -
86   -# 兴趣点聚合配置
87   -INTEREST_AGGREGATION_CONFIG = {
88   - 'top_n': 1000, # 每个key生成前N个商品
89   - 'time_decay_factor': 0.95, # 时间衰减因子(每30天)
90   - 'min_interaction_count': 2, # 最小交互次数
91   -
92   - # 行为权重
93   - 'behavior_weights': {
94   - 'click': 1.0,
95   - 'addToCart': 3.0,
96   - 'addToPool': 2.0,
97   - 'contactFactory': 5.0,
98   - 'purchase': 10.0,
99   - },
100   -
101   - # 类型配置
102   - 'list_types': ['hot', 'cart', 'new'], # 热门、加购、新品
103   -}
104   -
105   -# Redis配置(用于存储索引)
106   -REDIS_CONFIG = {
107   - 'host': 'localhost',
108   - 'port': 6379,
109   - 'db': 0,
110   - 'password': None,
111   - 'decode_responses': False
112   -}
113   -
114   -# 日志配置
115   -LOG_CONFIG = {
116   - 'level': 'INFO',
117   - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
118   - 'date_format': '%Y-%m-%d %H:%M:%S'
119   -}
120   -
121   -# Debug配置
122   -DEBUG_CONFIG = {
123   - 'enabled': False, # 是否开启debug模式
124   - 'log_level': 'DEBUG', # debug日志级别
125   - 'sample_size': 5, # 数据采样大小
126   - 'save_readable': True, # 是否保存可读明文文件
127   - 'log_dataframe_info': True, # 是否记录DataFrame详细信息
128   - 'log_intermediate': True, # 是否记录中间结果
129   -}
130   -
offline_tasks/scripts/fetch_item_attributes.py
... ... @@ -8,8 +8,8 @@ import json
8 8 import argparse
9 9 from datetime import datetime
10 10 from db_service import create_db_connection
11   -from config import DB_CONFIG, OUTPUT_DIR
12   -from debug_utils import setup_debug_logger
  11 +from config.offline_config import DB_CONFIG, OUTPUT_DIR
  12 +from scripts.debug_utils import setup_debug_logger
13 13  
14 14  
15 15 def fetch_and_save_mappings(engine, output_dir, logger=None, debug=False):
... ...
offline_tasks/scripts/generate_session.py
... ... @@ -9,11 +9,11 @@ from collections import defaultdict
9 9 import argparse
10 10 from datetime import datetime, timedelta
11 11 from db_service import create_db_connection
12   -from config import (
  12 +from config.offline_config import (
13 13 DB_CONFIG, OUTPUT_DIR, get_time_range,
14 14 DEFAULT_LOOKBACK_DAYS
15 15 )
16   -from debug_utils import setup_debug_logger, log_dataframe_info
  16 +from scripts.debug_utils import setup_debug_logger, log_dataframe_info
17 17  
18 18  
19 19 def aggregate_user_sessions(df, behavior_weights, logger=None, debug=False):
... ...
offline_tasks/scripts/i2i_content_similar.py
... ... @@ -9,8 +9,8 @@ import pandas as pd
9 9 from datetime import datetime, timedelta
10 10 from elasticsearch import Elasticsearch
11 11 from db_service import create_db_connection
12   -from config import DB_CONFIG, OUTPUT_DIR
13   -from debug_utils import setup_debug_logger, log_processing_step
  12 +from config.offline_config import DB_CONFIG, OUTPUT_DIR
  13 +from scripts.debug_utils import setup_debug_logger, log_processing_step
14 14  
15 15 # ES配置
16 16 ES_CONFIG = {
... ...
offline_tasks/scripts/i2i_deepwalk.py
... ... @@ -6,24 +6,22 @@ i2i - DeepWalk算法实现
6 6 import pandas as pd
7 7 import argparse
8 8 import os
9   -import sys
10 9 from datetime import datetime
11 10 from collections import defaultdict
12 11 from gensim.models import Word2Vec
13 12 from db_service import create_db_connection
14   -from config import (
  13 +from config.offline_config import (
15 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
16 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
17 16 )
18   -from debug_utils import (
  17 +from scripts.debug_utils import (
19 18 setup_debug_logger, log_dataframe_info,
20 19 save_readable_index, fetch_name_mappings, log_algorithm_params,
21 20 log_processing_step
22 21 )
23 22  
24 23 # 导入 DeepWalk 实现
25   -sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deepwalk'))
26   -from deepwalk import DeepWalk
  24 +from deepwalk.deepwalk import DeepWalk
27 25  
28 26  
29 27 def build_edge_file_from_db(df, behavior_weights, output_path, logger):
... ...
offline_tasks/scripts/i2i_session_w2v.py
... ... @@ -10,11 +10,11 @@ from collections import defaultdict
10 10 from gensim.models import Word2Vec
11 11 import numpy as np
12 12 from db_service import create_db_connection
13   -from config import (
  13 +from config.offline_config import (
14 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
15 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
16 16 )
17   -from debug_utils import (
  17 +from scripts.debug_utils import (
18 18 setup_debug_logger, log_dataframe_info, log_dict_stats,
19 19 save_readable_index, fetch_name_mappings, log_algorithm_params,
20 20 log_processing_step
... ...
offline_tasks/scripts/i2i_swing.py
... ... @@ -10,11 +10,11 @@ import argparse
10 10 import json
11 11 from datetime import datetime, timedelta
12 12 from db_service import create_db_connection
13   -from config import (
  13 +from config.offline_config import (
14 14 DB_CONFIG, OUTPUT_DIR, I2I_CONFIG, get_time_range,
15 15 DEFAULT_LOOKBACK_DAYS, DEFAULT_I2I_TOP_N
16 16 )
17   -from debug_utils import (
  17 +from scripts.debug_utils import (
18 18 setup_debug_logger, log_dataframe_info, log_dict_stats,
19 19 save_readable_index, load_name_mappings_from_file, log_algorithm_params,
20 20 log_processing_step
... ...
offline_tasks/scripts/interest_aggregation.py
... ... @@ -9,11 +9,11 @@ import json
9 9 from datetime import datetime, timedelta
10 10 from collections import defaultdict, Counter
11 11 from db_service import create_db_connection
12   -from config import (
  12 +from config.offline_config import (
13 13 DB_CONFIG, OUTPUT_DIR, INTEREST_AGGREGATION_CONFIG, get_time_range,
14 14 DEFAULT_LOOKBACK_DAYS, DEFAULT_RECENT_DAYS, DEFAULT_INTEREST_TOP_N
15 15 )
16   -from debug_utils import (
  16 +from scripts.debug_utils import (
17 17 setup_debug_logger, log_dataframe_info, log_dict_stats,
18 18 save_readable_index, fetch_name_mappings, log_algorithm_params,
19 19 log_processing_step
... ...
offline_tasks/scripts/load_index_to_redis.py
... ... @@ -6,7 +6,7 @@ import redis
6 6 import argparse
7 7 import logging
8 8 from datetime import datetime
9   -from config import REDIS_CONFIG, OUTPUT_DIR
  9 +from config.offline_config import REDIS_CONFIG, OUTPUT_DIR
10 10  
11 11 logging.basicConfig(
12 12 level=logging.INFO,
... ...