indexing_utils.py
5.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
索引工具函数。
提取公共逻辑,避免代码重复。
"""
import logging
from typing import Dict, Any, Optional
from sqlalchemy import Engine, text
from config import ConfigLoader
from config.tenant_config_loader import get_tenant_config_loader
from indexer.document_transformer import SPUDocumentTransformer
logger = logging.getLogger(__name__)
def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
    """
    Load the global category-ID -> category-name mapping.

    The mapping is read from ``shoplazza_product_spu`` (rows with
    ``deleted = 0`` and a non-NULL ``category_id``); the query applies no
    tenant filter, so the result is shared by all tenants.

    Args:
        db_engine: SQLAlchemy database engine used to run the query.

    Returns:
        Dictionary mapping ``category_id`` (normalized to a decimal integer
        string) to the category name as stored. Rows whose name is empty or
        whitespace-only, or whose ``category_id`` cannot be converted to an
        integer, are skipped with a warning. If connecting or querying fails,
        the error is logged and whatever was collected so far (possibly an
        empty dict) is returned; no exception is propagated.
    """
    query = text("""
        SELECT DISTINCT
            category_id,
            category
        FROM shoplazza_product_spu
        WHERE deleted = 0 AND category_id IS NOT NULL
    """)
    mapping: Dict[str, str] = {}
    try:
        with db_engine.connect() as conn:
            for row in conn.execute(query):
                raw_id = row.category_id
                # Normalize per row: one malformed ID must not abort the whole
                # load and leave callers with a silently truncated mapping.
                try:
                    category_id = str(int(raw_id))
                except (TypeError, ValueError, OverflowError):
                    logger.warning(f"Category ID {raw_id!r} is not a valid integer, skipping")
                    continue
                category_name = row.category
                if not category_name or not category_name.strip():
                    logger.warning(f"Category ID {category_id} has empty name, skipping")
                    continue
                # If the same ID appears with several names, the last row wins.
                mapping[category_id] = category_name
    except Exception as e:
        # Best-effort: a failed load is logged and the (partial) mapping returned.
        logger.error(f"Failed to load category mapping: {e}", exc_info=True)
    return mapping
def create_document_transformer(
    category_id_to_name: Dict[str, str],
    tenant_id: str,
    searchable_option_dimensions: Optional[list] = None,
    translator: Optional[Any] = None,
    translation_prompts: Optional[Dict[str, str]] = None,
    encoder: Optional[Any] = None,
    enable_title_embedding: bool = True
) -> SPUDocumentTransformer:
    """
    Create an SPUDocumentTransformer with unified initialization logic.

    Any dependency passed as ``None`` is resolved from the search
    configuration (and, for translation, from the tenant configuration).

    Args:
        category_id_to_name: Mapping from category ID to category name.
        tenant_id: Tenant ID used to look up the tenant configuration.
        searchable_option_dimensions: Searchable option dimensions. If None,
            taken from ``config.spu_config.searchable_option_dimensions``;
            if the configuration cannot be loaded, falls back to
            ``['option1', 'option2', 'option3']``.
        translator: Translator instance. If None, a ``Translator`` is created
            only when the tenant configuration enables ``translate_to_en`` or
            ``translate_to_zh``. A failure to create it is logged and the
            transformer is built without a translator.
        translation_prompts: Translation prompt configuration. If None, taken
            from ``config.query_config.translation_prompts``; if the
            configuration cannot be loaded, falls back to ``{}``.
        encoder: Text encoder instance. If None, a ``BgeEncoder`` is created
            when ``enable_title_embedding`` is True and
            ``config.query_config.enable_text_embedding`` is truthy.
        enable_title_embedding: Whether title embedding is enabled (default
            True). Forced to False if ``BgeEncoder`` initialization fails.

    Returns:
        The configured SPUDocumentTransformer instance.
    """
    # Load the tenant configuration.
    tenant_config_loader = get_tenant_config_loader()
    tenant_config = tenant_config_loader.get_tenant_config(tenant_id)

    # The search configuration is only needed when something was left unset.
    needs_config = (
        searchable_option_dimensions is None
        or translator is None
        or translation_prompts is None
        or (encoder is None and enable_title_embedding)
    )
    if needs_config:
        try:
            config_loader = ConfigLoader()
            config = config_loader.load_config()

            if searchable_option_dimensions is None:
                searchable_option_dimensions = config.spu_config.searchable_option_dimensions

            # The tenant configuration decides whether translation is needed:
            # the translator is initialized if either direction is enabled.
            translate_to_en = bool(tenant_config.get("translate_to_en"))
            translate_to_zh = bool(tenant_config.get("translate_to_zh"))
            if translator is None and (translate_to_en or translate_to_zh):
                # Isolated like the encoder below, so that a translator
                # failure does not discard the configured prompts or skip
                # encoder initialization.
                try:
                    from query.translator import Translator
                    translator = Translator(
                        api_key=config.query_config.translation_api_key,
                        use_cache=True,
                        glossary_id=config.query_config.translation_glossary_id,
                        translation_context=config.query_config.translation_context
                    )
                except Exception as e:
                    logger.warning(f"Failed to initialize Translator: {e}, continuing without a translator")

            if translation_prompts is None:
                translation_prompts = config.query_config.translation_prompts

            # Initialize the encoder when title embedding is enabled and no
            # encoder was supplied.
            if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
                try:
                    from embeddings.text_encoder import BgeEncoder
                    encoder = BgeEncoder()
                    logger.info("BgeEncoder initialized for title embedding")
                except Exception as e:
                    logger.warning(f"Failed to initialize BgeEncoder: {e}, title embedding will be disabled")
                    enable_title_embedding = False
        except Exception as e:
            # Best-effort: fall back to built-in defaults for anything still unset.
            logger.warning(f"Failed to load config, using defaults: {e}")
            if searchable_option_dimensions is None:
                searchable_option_dimensions = ['option1', 'option2', 'option3']
            if translation_prompts is None:
                translation_prompts = {}

    return SPUDocumentTransformer(
        category_id_to_name=category_id_to_name,
        searchable_option_dimensions=searchable_option_dimensions,
        tenant_config=tenant_config,
        translator=translator,
        translation_prompts=translation_prompts,
        encoder=encoder,
        enable_title_embedding=enable_title_embedding
    )