indexing_utils.py
4.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
索引工具函数。
提取公共逻辑,避免代码重复。
"""
import logging
from typing import Dict, Any, Optional
from sqlalchemy import Engine, text
from config import ConfigLoader
from config.tenant_config_loader import get_tenant_config_loader
from indexer.document_transformer import SPUDocumentTransformer
logger = logging.getLogger(__name__)
def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
    """
    Load the category-ID -> category-name lookup table.

    The query is not filtered by tenant, so the resulting mapping is global
    and shared by every tenant.

    Args:
        db_engine: SQLAlchemy database engine used to open the connection.

    Returns:
        Dictionary keyed by the category ID rendered as a decimal string
        (``str(int(category_id))``) with the category name as the value.
        Rows whose name is empty or whitespace-only are skipped (a warning
        is logged for each).

    Raises:
        RuntimeError: If anything fails while querying or processing rows;
            the underlying exception is chained as the cause.
    """
    query = text("""
        SELECT DISTINCT
            category_id,
            category
        FROM shoplazza_product_spu
        WHERE deleted = 0 AND category_id IS NOT NULL
    """)

    id_to_name: Dict[str, str] = {}
    try:
        with db_engine.connect() as connection:
            for record in connection.execute(query):
                # Normalise the key first: numeric IDs always end up as a
                # plain integer string, whatever numeric type the driver returns.
                category_id = str(int(record.category_id))
                category_name = record.category
                if category_name and category_name.strip():
                    id_to_name[category_id] = category_name
                else:
                    logger.warning(f"Category ID {category_id} has empty name, skipping")
    except Exception as e:
        # Log the full traceback here, then surface a single well-known
        # exception type to callers.
        logger.error(f"Failed to load category mapping: {e}", exc_info=True)
        raise RuntimeError("Failed to load category mapping from MySQL") from e

    return id_to_name
def create_document_transformer(
    category_id_to_name: Dict[str, str],
    tenant_id: str,
    searchable_option_dimensions: Optional[list] = None,
    translator: Optional[Any] = None,
    encoder: Optional[Any] = None,
    enable_title_embedding: bool = True,
    image_encoder: Optional[Any] = None,
    enable_image_embedding: bool = False,
    config: Optional[Any] = None,
) -> SPUDocumentTransformer:
    """
    Create an SPUDocumentTransformer with unified initialisation logic.

    Collaborators that the caller does not supply are derived from the
    search configuration. That configuration is only loaded when at least
    one missing collaborator actually needs it.

    Args:
        category_id_to_name: Mapping from category ID to category name.
        tenant_id: Tenant ID; used to look up the tenant configuration.
        searchable_option_dimensions: Searchable option dimensions. When
            None, taken from ``config.spu_config.searchable_option_dimensions``.
        translator: Translator instance. When None, one is created from
            ``config.query_config`` only if the tenant configuration lists
            more than one entry under ``"index_languages"``; otherwise it
            stays None.
        encoder: Text encoder instance. When None and
            ``enable_title_embedding`` is True, a ``TextEmbeddingEncoder``
            is created only if ``config.query_config.enable_text_embedding``
            is truthy; otherwise it stays None.
        enable_title_embedding: Whether title embedding is enabled
            (default True). Passed through to the transformer unchanged.
        image_encoder: Optional image encoder, passed through unchanged
            (expected to implement ``encode_image_urls(urls)``).
        enable_image_embedding: Whether to populate image embeddings
            (default False). Passed through unchanged.
        config: Already-loaded search configuration. When None, it is
            loaded via ``ConfigLoader().load_config()`` only if it is
            needed to derive one of the values above. It is not passed on
            to the transformer.

    Returns:
        A configured SPUDocumentTransformer instance.
    """
    # Tenant configuration is always required by the transformer.
    tenant_config = get_tenant_config_loader().get_tenant_config(tenant_id)

    # Work out which collaborators have to be derived before touching the
    # (potentially expensive) search configuration.
    need_dimensions = searchable_option_dimensions is None

    need_translator = False
    if translator is None:
        # A translator is only useful when the tenant indexes more than one language.
        index_langs = tenant_config.get("index_languages") or []
        need_translator = len(index_langs) > 1

    need_encoder_check = encoder is None and enable_title_embedding

    # Load the search configuration lazily: skip it entirely when the caller
    # supplied everything that would otherwise be read from it.
    if config is None and (need_dimensions or need_translator or need_encoder_check):
        config = ConfigLoader().load_config()

    if need_dimensions:
        searchable_option_dimensions = config.spu_config.searchable_option_dimensions

    if need_translator:
        # Imported locally, as in the original code, so the provider module
        # is only loaded when a translator is actually required.
        from providers import create_translation_provider
        translator = create_translation_provider(config.query_config)

    # Initialise the text encoder only when title embedding is requested,
    # none was supplied, and text embedding is enabled in the configuration.
    if need_encoder_check and config.query_config.enable_text_embedding:
        from embeddings.text_encoder import TextEmbeddingEncoder
        encoder = TextEmbeddingEncoder()
        logger.info("TextEmbeddingEncoder initialized for title embedding")

    return SPUDocumentTransformer(
        category_id_to_name=category_id_to_name,
        searchable_option_dimensions=searchable_option_dimensions,
        tenant_config=tenant_config,
        translator=translator,
        encoder=encoder,
        enable_title_embedding=enable_title_embedding,
        image_encoder=image_encoder,
        enable_image_embedding=enable_image_embedding,
    )