be52af70
tangwang
first commit
|
1
|
"""
Configuration loader and validator for search engine configurations.

This module handles loading, parsing, and validating YAML configuration files
that define how search should be executed (NOT how data should be indexed).
The index structure is defined by mappings/search_products.json.
This configuration only defines search behavior: field boosts, search domains,
query strategies, and so on.
"""
import yaml
|
be52af70
tangwang
first commit
|
12
13
14
15
|
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
from pathlib import Path
|
be52af70
tangwang
first commit
|
16
17
18
19
20
21
|
@dataclass
class IndexConfig:
    """Configuration for an index domain (e.g., default, title, brand).

    ``name``, ``label`` and ``fields`` are required; ``boost`` and ``example``
    are optional.
    """

    # Identifier of the search domain.
    name: str
    # Human-readable label for the domain.
    label: str
    # List of field names to include in this search domain.
    fields: List[str]
    # Boost applied to this domain.
    boost: float = 1.0
    # Optional example associated with this domain.
    example: Optional[str] = None
|
be52af70
tangwang
first commit
|
26
27
28
29
30
|
@dataclass
class QueryConfig:
    """Configuration for query processing.

    Every field has a default, so ``QueryConfig()`` yields a usable object.
    Mutable defaults are created per instance through ``default_factory``.
    """

    supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"])
    default_language: str = "en"

    # Feature flags
    enable_text_embedding: bool = True
    enable_query_rewrite: bool = True

    # Query rewrite dictionary (loaded from external file)
    rewrite_dictionary: Dict[str, str] = field(default_factory=dict)

    # Translation settings (provider/URL in services.translation)
    translation_service: str = "deepl"
    translation_api_key: Optional[str] = None
    translation_glossary_id: Optional[str] = None
    translation_context: str = "e-commerce product search"
    translation_prompts: Dict[str, str] = field(default_factory=dict)

    # Embedding field names
    text_embedding_field: Optional[str] = "title_embedding"
    image_embedding_field: Optional[str] = None

    # Embedding disable thresholds (disable vector search for short queries)
    embedding_disable_chinese_char_limit: int = 4
    embedding_disable_english_word_limit: int = 3

    # Source fields configuration
    source_fields: Optional[List[str]] = None

    # KNN boost configuration
    knn_boost: float = 0.25  # Boost value for KNN (embedding recall)

    # Dynamic text fields for multi-language retrieval
    multilingual_fields: List[str] = field(
        default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"]
    )
    shared_fields: List[str] = field(
        default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"]
    )
    core_multilingual_fields: List[str] = field(
        default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
    )

    # Unified text strategy tuning
    base_minimum_should_match: str = "75%"
    translation_minimum_should_match: str = "75%"
    translation_boost: float = 0.4
    translation_boost_when_source_missing: float = 1.0
    source_boost_when_missing: float = 0.6
    keywords_boost: float = 0.1
    enable_phrase_query: bool = True
    tie_breaker_base_query: float = 0.9
    tie_breaker_keywords: float = 0.9
|
13377199
tangwang
接口优化
|
82
|
|
be52af70
tangwang
first commit
|
83
84
85
86
87
|
@dataclass
class SPUConfig:
    """Configuration for SPU aggregation."""

    enabled: bool = False
    spu_field: Optional[str] = None
    inner_hits_size: int = 3
    # Which option dimensions take part in retrieval (both at indexing time
    # and in online search).
    searchable_option_dimensions: List[str] = field(
        default_factory=lambda: ['option1', 'option2', 'option3']
    )
|
be52af70
tangwang
first commit
|
92
93
94
|
@dataclass
class FunctionScoreConfig:
    """Function Score configuration (scoring rules applied at the ES layer)."""

    score_mode: str = "sum"
    boost_mode: str = "multiply"
    # Raw function definitions; each entry is kept as a plain dictionary.
    functions: List[Dict[str, Any]] = field(default_factory=list)
@dataclass
class RankingConfig:
    """Configuration for ranking expressions."""

    # Ranking expression string.
    expression: str = "bm25()"
    # Human-readable description of the expression.
    description: str = "Default BM25 ranking"
@dataclass
class RerankConfig:
    """Rerank configuration (provider/URL live in services.rerank)."""

    rerank_window: int = 1000
    timeout_sec: float = 15.0
    weight_es: float = 0.4
    weight_ai: float = 0.6
    # Templates containing ``{query}`` / ``{title}`` placeholders.
    rerank_query_template: str = "{query}"
    rerank_doc_template: str = "{title}"
|
a00c3672
tangwang
feat: Function Sc...
|
118
119
120
|
@dataclass
class SearchConfig:
    """Complete configuration for search engine (multi-tenant)."""

    # Field boost configuration (used for search)
    field_boosts: Dict[str, float]

    # Legacy index domains (deprecated; kept for compatibility)
    indexes: List[IndexConfig]

    # Query processing
    query_config: QueryConfig

    # Ranking configuration
    ranking: RankingConfig

    # Function Score configuration (ES-layer scoring)
    function_score: FunctionScoreConfig

    # Rerank configuration (local reranking)
    rerank: RerankConfig

    # SPU configuration
    spu_config: SPUConfig

    # ES index settings
    es_index_name: str

    # Tenant configuration
    tenant_config: Dict[str, Any] = field(default_factory=dict)

    # ES settings
    es_settings: Dict[str, Any] = field(default_factory=dict)

    # Extensible service/provider registry (translation/embedding/rerank/...)
    services: Dict[str, Any] = field(default_factory=dict)
|
be52af70
tangwang
first commit
|
155
156
157
158
159
160
161
162
|
class ConfigurationError(Exception):
    """Raised when configuration validation fails."""
class ConfigLoader:
    """Loads and validates unified search engine configuration from YAML file."""

    def __init__(self, config_file: Optional[Path] = None):
        """
        Initialize config loader.

        Args:
            config_file: Path to config YAML file (defaults to ``config.yaml``
                in the directory that contains this module)
        """
        if config_file is None:
            config_file = Path(__file__).parent / "config.yaml"
        self.config_file = Path(config_file)

    @staticmethod
    def _get_mapping(data: Dict[str, Any], key: str, path: str) -> Dict[str, Any]:
        """Return ``data[key]`` as a dict, treating a missing or null value as empty.

        Args:
            data: Parent mapping to read from
            key: Key of the sub-section
            path: Dotted name used in the error message

        Raises:
            ConfigurationError: If the value is present but is not a dictionary
        """
        value = data.get(key)
        if value is None:
            # A key written as "section:" with no body parses to None in YAML.
            return {}
        if not isinstance(value, dict):
            raise ConfigurationError(f"{path} must be a dictionary")
        return value

    def _load_rewrite_dictionary(self) -> Dict[str, str]:
        """Load query rewrite dictionary from external file.

        Reads ``rewrite_dictionary.txt`` from this module's directory. Each
        useful line is ``original<TAB>replacement``; blank lines and lines
        starting with ``#`` are skipped. A missing file yields an empty dict.

        Returns:
            Mapping from original term to replacement term
        """
        rewrite_file = Path(__file__).parent / "rewrite_dictionary.txt"
        rewrite_dict: Dict[str, str] = {}

        if not rewrite_file.exists():
            return rewrite_dict

        try:
            with open(rewrite_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    parts = line.split('\t')
                    if len(parts) >= 2:
                        original = parts[0].strip()
                        replacement = parts[1].strip()
                        if original and replacement:
                            rewrite_dict[original] = replacement
        except Exception as e:
            # Deliberately best-effort: a broken dictionary file must not
            # prevent the configuration from loading. Entries read before the
            # failure are kept.
            print(f"Warning: Failed to load rewrite dictionary: {e}")

        return rewrite_dict

    def load_config(self, validate: bool = True) -> SearchConfig:
        """
        Load unified configuration from YAML file.

        Args:
            validate: Whether to validate configuration after loading

        Returns:
            SearchConfig object

        Raises:
            ConfigurationError: If config file not found, invalid, or validation fails
        """
        if not self.config_file.exists():
            raise ConfigurationError(f"Configuration file not found: {self.config_file}")

        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config_data = yaml.safe_load(f)
        except yaml.YAMLError as e:
            raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") from e

        # safe_load yields None for an empty document and may yield a list or
        # scalar; everything below expects a mapping at the top level.
        if not isinstance(config_data, dict):
            raise ConfigurationError(
                f"Configuration root in {self.config_file} must be a mapping, "
                f"got {type(config_data).__name__}"
            )

        config = self._parse_config(config_data)

        # Auto-validate configuration
        if validate:
            errors = self.validate_config(config)
            if errors:
                error_msg = "Configuration validation failed:\n" + "\n".join(f"  - {err}" for err in errors)
                raise ConfigurationError(error_msg)

        return config

    def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
        """Parse configuration dictionary into SearchConfig object.

        Args:
            config_data: Top-level mapping read from the YAML file

        Returns:
            SearchConfig built from the mapping, with defaults for absent keys

        Raises:
            ConfigurationError: If ``field_boosts`` or one of the parsed
                sections is present but is not a dictionary, or an index entry
                is malformed
        """
        # Parse field_boosts
        field_boosts = config_data.get("field_boosts", {})
        if not isinstance(field_boosts, dict):
            raise ConfigurationError("field_boosts must be a dictionary")

        # Parse indexes (deprecated; compatibility only)
        indexes = [
            self._parse_index_config(index_data)
            for index_data in (config_data.get("indexes") or [])
        ]

        # Parse query config
        query_config_data = self._get_mapping(config_data, "query_config", "query_config")
        services_raw = config_data.get("services", {})
        services_data = services_raw if isinstance(services_raw, dict) else {}
        rewrite_dictionary = self._load_rewrite_dictionary()
        embedding_thresholds = self._get_mapping(
            query_config_data, "embedding_disable_thresholds", "query_config.embedding_disable_thresholds"
        )
        search_fields_cfg = self._get_mapping(
            query_config_data, "search_fields", "query_config.search_fields"
        )
        text_strategy_cfg = self._get_mapping(
            query_config_data, "text_query_strategy", "query_config.text_query_strategy"
        )

        query_config = QueryConfig(
            supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
            default_language=query_config_data.get("default_language") or "en",
            enable_text_embedding=query_config_data.get("enable_text_embedding", True),
            enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
            rewrite_dictionary=rewrite_dictionary,
            translation_api_key=query_config_data.get("translation_api_key"),
            translation_service=query_config_data.get("translation_service") or "deepl",
            translation_glossary_id=query_config_data.get("translation_glossary_id"),
            translation_context=query_config_data.get("translation_context") or "e-commerce product search",
            translation_prompts=query_config_data.get("translation_prompts", {}),
            text_embedding_field=query_config_data.get("text_embedding_field"),
            image_embedding_field=query_config_data.get("image_embedding_field"),
            embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
            embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
            source_fields=query_config_data.get("source_fields"),
            knn_boost=query_config_data.get("knn_boost", 0.25),
            multilingual_fields=search_fields_cfg.get(
                "multilingual_fields",
                ["title", "brief", "description", "vendor", "category_path", "category_name_text"],
            ),
            shared_fields=search_fields_cfg.get(
                "shared_fields",
                ["tags", "option1_values", "option2_values", "option3_values"],
            ),
            core_multilingual_fields=search_fields_cfg.get(
                "core_multilingual_fields",
                ["title", "brief", "vendor", "category_name_text"],
            ),
            base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")),
            translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")),
            translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)),
            translation_boost_when_source_missing=float(
                text_strategy_cfg.get("translation_boost_when_source_missing", 1.0)
            ),
            source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)),
            keywords_boost=float(text_strategy_cfg.get("keywords_boost", 0.1)),
            enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)),
            tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)),
            tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)),
        )

        # Parse ranking config
        ranking_data = self._get_mapping(config_data, "ranking", "ranking")
        ranking = RankingConfig(
            expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
            description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
        )

        # Parse Function Score configuration
        fs_data = self._get_mapping(config_data, "function_score", "function_score")
        function_score = FunctionScoreConfig(
            score_mode=fs_data.get("score_mode") or "sum",
            boost_mode=fs_data.get("boost_mode") or "multiply",
            functions=fs_data.get("functions") or []
        )

        # Parse Rerank (provider/URL in services.rerank)
        rerank_data = self._get_mapping(config_data, "rerank", "rerank")
        rerank = RerankConfig(
            rerank_window=int(rerank_data.get("rerank_window", 1000)),
            timeout_sec=float(rerank_data.get("timeout_sec", 15.0)),
            weight_es=float(rerank_data.get("weight_es", 0.4)),
            weight_ai=float(rerank_data.get("weight_ai", 0.6)),
            rerank_query_template=str(rerank_data.get("rerank_query_template") or "{query}"),
            rerank_doc_template=str(rerank_data.get("rerank_doc_template") or "{title}"),
        )

        # Parse SPU config
        spu_data = self._get_mapping(config_data, "spu_config", "spu_config")
        spu_config = SPUConfig(
            enabled=spu_data.get("enabled", False),
            spu_field=spu_data.get("spu_field"),
            inner_hits_size=spu_data.get("inner_hits_size", 3),
            searchable_option_dimensions=spu_data.get("searchable_option_dimensions", ['option1', 'option2', 'option3'])
        )

        # Parse tenant config (shape is checked later by validate_config, so
        # the raw value is passed through unchanged)
        tenant_config_data = config_data.get("tenant_config", {})

        return SearchConfig(
            field_boosts=field_boosts,
            indexes=indexes,
            query_config=query_config,
            ranking=ranking,
            function_score=function_score,
            rerank=rerank,
            spu_config=spu_config,
            tenant_config=tenant_config_data,
            es_index_name=config_data.get("es_index_name", "search_products"),
            es_settings=config_data.get("es_settings", {}),
            services=services_data
        )

    def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
        """Parse index configuration from dictionary.

        Args:
            index_data: One entry of the ``indexes`` list

        Returns:
            IndexConfig; ``label`` falls back to ``name`` when absent

        Raises:
            ConfigurationError: If the entry is not a dictionary or has no ``name``
        """
        if not isinstance(index_data, dict):
            raise ConfigurationError("Each entry in indexes must be a dictionary")
        if "name" not in index_data:
            raise ConfigurationError("Each entry in indexes must define 'name'")

        return IndexConfig(
            name=index_data["name"],
            label=index_data.get("label", index_data["name"]),
            fields=index_data.get("fields", []),
            boost=index_data.get("boost", 1.0),
            example=index_data.get("example")
        )

    def validate_config(self, config: SearchConfig) -> List[str]:
        """
        Validate configuration for common errors.

        Args:
            config: SearchConfig to validate

        Returns:
            List of error messages (empty if valid)
        """
        errors = []

        # Validate es_index_name
        if not config.es_index_name:
            errors.append("es_index_name is required")

        # Validate field_boosts
        if not config.field_boosts:
            errors.append("field_boosts is empty")
        for field_name, boost in config.field_boosts.items():
            if not isinstance(boost, (int, float)):
                errors.append(f"field_boosts['{field_name}']: boost must be a number, got {type(boost).__name__}")
            elif boost < 0:
                errors.append(f"field_boosts['{field_name}']: boost must be non-negative")

        # Validate indexes (deprecated, optional)
        index_names = set()
        for index in config.indexes:
            # Check for duplicate index names
            if index.name in index_names:
                errors.append(f"Duplicate index name: {index.name}")
            index_names.add(index.name)

            # Validate fields in index
            if not index.fields:
                errors.append(f"Index '{index.name}': fields list is empty")

        # Validate SPU config
        if config.spu_config.enabled:
            if not config.spu_config.spu_field:
                errors.append("SPU aggregation enabled but no spu_field specified")

        # Validate query config
        if not config.query_config.supported_languages:
            errors.append("At least one supported language must be specified")
        if config.query_config.default_language not in config.query_config.supported_languages:
            errors.append(
                f"Default language '{config.query_config.default_language}' "
                f"not in supported languages: {config.query_config.supported_languages}"
            )

        # Validate dynamic search fields
        def _validate_str_list(name: str, values: List[str]) -> None:
            if not isinstance(values, list) or not values:
                errors.append(f"query_config.{name} must be a non-empty list[str]")
                return
            for i, val in enumerate(values):
                if not isinstance(val, str) or not val.strip():
                    errors.append(f"query_config.{name}[{i}] must be a non-empty string")

        _validate_str_list("multilingual_fields", config.query_config.multilingual_fields)
        _validate_str_list("shared_fields", config.query_config.shared_fields)
        _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields)

        core_fields = config.query_config.core_multilingual_fields
        multi_fields = config.query_config.multilingual_fields
        # Only compare when both are lists: a wrong type was already reported
        # above and must not crash validation. List membership is used rather
        # than set() so unhashable entries cannot raise either.
        if isinstance(core_fields, list) and isinstance(multi_fields, list):
            if any(name not in multi_fields for name in core_fields):
                errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields")

        # Validate text query strategy numbers
        for name in (
            "translation_boost",
            "translation_boost_when_source_missing",
            "source_boost_when_missing",
            "keywords_boost",
            "tie_breaker_base_query",
            "tie_breaker_keywords",
        ):
            value = getattr(config.query_config, name, None)
            if not isinstance(value, (int, float)):
                errors.append(f"query_config.{name} must be a number")
            elif value < 0:
                errors.append(f"query_config.{name} must be non-negative")

        # Validate source_fields tri-state semantics (None, empty list, or list of names)
        source_fields = config.query_config.source_fields
        if source_fields is not None:
            if not isinstance(source_fields, list):
                errors.append("query_config.source_fields must be null or list[str]")
            else:
                for idx, field_name in enumerate(source_fields):
                    if not isinstance(field_name, str) or not field_name.strip():
                        errors.append(
                            f"query_config.source_fields[{idx}] must be a non-empty string"
                        )

        # Validate tenant config shape (default must exist in config)
        tenant_cfg = config.tenant_config
        if not isinstance(tenant_cfg, dict):
            errors.append("tenant_config must be an object")
        else:
            default_cfg = tenant_cfg.get("default")
            if not isinstance(default_cfg, dict):
                errors.append("tenant_config.default must be configured")
            else:
                index_languages = default_cfg.get("index_languages")
                if not isinstance(index_languages, list) or len(index_languages) == 0:
                    errors.append("tenant_config.default.index_languages must be a non-empty list")

        return errors

    def to_dict(self, config: SearchConfig) -> Dict[str, Any]:
        """Convert SearchConfig to dictionary representation.

        The result contains ``es_index_name``, ``es_settings``, ``field_boosts``,
        ``indexes``, ``query_config``, ``ranking``, ``function_score``,
        ``rerank``, ``spu_config`` and ``services``. ``tenant_config`` and some
        query settings (for example the translation API key and the rewrite
        dictionary) are not included.
        """
        # Build query_config dict
        query_config_dict = {
            "supported_languages": config.query_config.supported_languages,
            "default_language": config.query_config.default_language,
            "enable_text_embedding": config.query_config.enable_text_embedding,
            "enable_query_rewrite": config.query_config.enable_query_rewrite,
            "translation_service": config.query_config.translation_service,
            "text_embedding_field": config.query_config.text_embedding_field,
            "image_embedding_field": config.query_config.image_embedding_field,
            "embedding_disable_thresholds": {
                "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
                "english_word_limit": config.query_config.embedding_disable_english_word_limit
            },
            "source_fields": config.query_config.source_fields,
            "search_fields": {
                "multilingual_fields": config.query_config.multilingual_fields,
                "shared_fields": config.query_config.shared_fields,
                "core_multilingual_fields": config.query_config.core_multilingual_fields,
            },
            "text_query_strategy": {
                "base_minimum_should_match": config.query_config.base_minimum_should_match,
                "translation_minimum_should_match": config.query_config.translation_minimum_should_match,
                "translation_boost": config.query_config.translation_boost,
                "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing,
                "source_boost_when_missing": config.query_config.source_boost_when_missing,
                "keywords_boost": config.query_config.keywords_boost,
                "enable_phrase_query": config.query_config.enable_phrase_query,
                "tie_breaker_base_query": config.query_config.tie_breaker_base_query,
                "tie_breaker_keywords": config.query_config.tie_breaker_keywords,
            }
        }

        return {
            "es_index_name": config.es_index_name,
            "es_settings": config.es_settings,
            "field_boosts": config.field_boosts,
            "indexes": [self._index_to_dict(index) for index in config.indexes],
            "query_config": query_config_dict,
            "ranking": {
                "expression": config.ranking.expression,
                "description": config.ranking.description
            },
            "function_score": {
                "score_mode": config.function_score.score_mode,
                "boost_mode": config.function_score.boost_mode,
                "functions": config.function_score.functions
            },
            "rerank": {
                "rerank_window": config.rerank.rerank_window,
                "timeout_sec": config.rerank.timeout_sec,
                "weight_es": config.rerank.weight_es,
                "weight_ai": config.rerank.weight_ai,
                "rerank_query_template": config.rerank.rerank_query_template,
                "rerank_doc_template": config.rerank.rerank_doc_template,
            },
            "spu_config": {
                "enabled": config.spu_config.enabled,
                "spu_field": config.spu_config.spu_field,
                "inner_hits_size": config.spu_config.inner_hits_size,
                "searchable_option_dimensions": config.spu_config.searchable_option_dimensions
            },
            "services": config.services,
        }

    def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
        """Convert IndexConfig to dictionary.

        ``example`` is emitted only when it is truthy.
        """
        result = {
            "name": index.name,
            "label": index.label,
            "fields": index.fields,
            "boost": index.boost
        }

        if index.example:
            result["example"] = index.example

        return result
|