Blame view

config/config_loader.py 21.6 KB
be52af70   tangwang   first commit
1
  """
9cb7528e   tangwang   店匠体系数据的搜索:mock da...
2
  Configuration loader and validator for search engine configurations.

  This module handles loading, parsing, and validating YAML configuration files
  that define how search should be executed (NOT how data should be indexed).

  The index structure is defined by mappings/search_products.json.
  This configuration only defines search behavior: field weights, search
  domains, query strategies, and so on.
be52af70   tangwang   first commit
9
10
11
  """
  
  import yaml
be52af70   tangwang   first commit
12
13
14
15
  from typing import Dict, Any, List, Optional
  from dataclasses import dataclass, field
  from pathlib import Path
  
be52af70   tangwang   first commit
16
17
18
19
20
21
  
  @dataclass
  class IndexConfig:
      """Configuration for an index domain (e.g., default, title, brand).

      ``name``, ``label`` and ``fields`` are required; ``boost`` and
      ``example`` are optional.
      """
      # Name of the domain.
      name: str
      # Label of the domain.
      label: str
      # List of field names to include in this search domain.
      fields: List[str]
      # Boost applied to the domain.
      boost: float = 1.0
      # Optional example string for the domain.
      example: Optional[str] = None
  
be52af70   tangwang   first commit
26
27
28
29
30
  
  @dataclass
  class QueryConfig:
      """Configuration for query processing.

      Every field has a default, so ``QueryConfig()`` yields a usable
      configuration. List and dict defaults are built per instance via
      ``default_factory`` and are therefore never shared between instances.
      """
      supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"])
      default_language: str = "en"

      # Feature flags
      enable_text_embedding: bool = True
      enable_query_rewrite: bool = True

      # Query rewrite dictionary (loaded from external file)
      rewrite_dictionary: Dict[str, str] = field(default_factory=dict)

      # Embedding field names
      text_embedding_field: Optional[str] = "title_embedding"
      image_embedding_field: Optional[str] = None

      # Source fields configuration (None or a list of field names)
      source_fields: Optional[List[str]] = None

      # KNN boost configuration
      knn_boost: float = 0.25  # Boost value for KNN (embedding recall)

      # Dynamic text fields for multi-language retrieval
      multilingual_fields: List[str] = field(
          default_factory=lambda: [
              "title",
              "brief",
              "description",
              "vendor",
              "category_path",
              "category_name_text",
          ]
      )
      shared_fields: List[str] = field(
          default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"]
      )
      core_multilingual_fields: List[str] = field(
          default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
      )

      # Unified text strategy tuning
      base_minimum_should_match: str = "75%"
      translation_minimum_should_match: str = "75%"
      translation_boost: float = 0.4
      translation_boost_when_source_missing: float = 1.0
      source_boost_when_missing: float = 0.6
      original_query_fallback_boost_when_translation_missing: float = 0.2
      tie_breaker_base_query: float = 0.9

      # Query-time translation model selection (configurable)
      # - zh_to_en_model: model for zh -> en
      # - en_to_zh_model: model for en -> zh
      # - default_translation_model: fallback model for all other language pairs
      zh_to_en_model: str = "opus-mt-zh-en"
      en_to_zh_model: str = "opus-mt-en-zh"
      default_translation_model: str = "nllb-200-distilled-600m"
  
be52af70   tangwang   first commit
78
79
80
81
82
  
  @dataclass
  class SPUConfig:
      """Configuration for SPU aggregation."""
      # Whether SPU aggregation is enabled.
      enabled: bool = False
      # Field name used for SPU aggregation; optional at construction time.
      spu_field: Optional[str] = None
      inner_hits_size: int = 3
      # Which option dimensions take part in retrieval (both at indexing time
      # and in online search).
      searchable_option_dimensions: List[str] = field(
          default_factory=lambda: ['option1', 'option2', 'option3']
      )
be52af70   tangwang   first commit
87
88
89
  
  
  @dataclass
  class FunctionScoreConfig:
      """Function Score configuration (ES-layer scoring rules)."""
      score_mode: str = "sum"
      boost_mode: str = "multiply"
      # Function definitions as plain dictionaries; a fresh empty list is
      # created for each instance.
      functions: List[Dict[str, Any]] = field(default_factory=list)
  
  
  @dataclass
  class RerankConfig:
      """Rerank configuration (provider/URL are configured under services.rerank)."""
      enabled: bool = True
      rerank_window: int = 384
      timeout_sec: float = 15.0
      # Weights for the ES and AI parts of the rerank configuration.
      weight_es: float = 0.4
      weight_ai: float = 0.6
      # Templates for the rerank query text and document text.
      rerank_query_template: str = "{query}"
      rerank_doc_template: str = "{title}"
a00c3672   tangwang   feat: Function Sc...
107
108
109
  
  
  @dataclass
  class SearchConfig:
      """Complete configuration for search engine (multi-tenant).

      The first seven fields are required and must be supplied in declaration
      order when passed positionally; the remaining dictionary fields default
      to a fresh empty dict per instance.
      """

      # Field weight configuration (used for search)
      field_boosts: Dict[str, float]

      # Legacy index domains (deprecated; kept for compatibility)
      indexes: List[IndexConfig]

      # Query processing
      query_config: QueryConfig

      # Function Score configuration (ES-layer scoring)
      function_score: FunctionScoreConfig

      # Rerank configuration (local reranking)
      rerank: RerankConfig

      # SPU configuration
      spu_config: SPUConfig

      # ES index settings
      es_index_name: str

      # Tenant configuration
      tenant_config: Dict[str, Any] = field(default_factory=dict)

      # ES settings
      es_settings: Dict[str, Any] = field(default_factory=dict)

      # Extensible service/provider registry (translation/embedding/rerank/...)
      services: Dict[str, Any] = field(default_factory=dict)
be52af70   tangwang   first commit
141
142
143
144
145
146
147
148
  
  
  class ConfigurationError(Exception):
      """Signals that a configuration is missing, malformed, or failed validation."""
  
  
  class ConfigLoader:
      """Loads and validates unified search engine configuration from YAML file."""

      def __init__(self, config_file: Optional[Path] = None):
          """
          Initialize config loader.

          Args:
              config_file: Path to the config YAML file. When omitted, the
                  ``config.yaml`` located in this module's directory is used.
          """
          if config_file is None:
              config_file = Path(__file__).parent / "config.yaml"
          self.config_file = Path(config_file)

      def _load_rewrite_dictionary(self) -> Dict[str, str]:
          """
          Load query rewrite dictionary from external file.

          The file ``rewrite_dictionary.txt`` is looked up in this module's
          directory. Each useful line is ``original<TAB>replacement``; blank
          lines, lines starting with ``#``, lines with fewer than two columns
          and lines with an empty original or replacement are skipped. Columns
          beyond the second are ignored.

          Returns:
              Mapping of original term to replacement. Empty when the file does
              not exist. When reading fails, a warning is printed and whatever
              was collected up to that point is returned.
          """
          rewrite_file = Path(__file__).parent / "rewrite_dictionary.txt"
          rewrite_dict: Dict[str, str] = {}

          if not rewrite_file.exists():
              return rewrite_dict

          try:
              with open(rewrite_file, 'r', encoding='utf-8') as f:
                  for line in f:
                      line = line.strip()
                      if not line or line.startswith('#'):
                          continue

                      parts = line.split('\t')
                      if len(parts) < 2:
                          continue

                      original = parts[0].strip()
                      replacement = parts[1].strip()
                      if original and replacement:
                          rewrite_dict[original] = replacement
          except (OSError, UnicodeError) as e:
              # Best effort: the rewrite dictionary is optional, so an unreadable
              # or badly encoded file only produces a warning.
              print(f"Warning: Failed to load rewrite dictionary: {e}")

          return rewrite_dict

      def load_config(self, validate: bool = True) -> SearchConfig:
          """
          Load unified configuration from YAML file.

          Args:
              validate: Whether to validate configuration after loading

          Returns:
              SearchConfig object

          Raises:
              ConfigurationError: If config file not found, invalid, or validation fails
          """
          if not self.config_file.exists():
              raise ConfigurationError(f"Configuration file not found: {self.config_file}")

          try:
              with open(self.config_file, 'r', encoding='utf-8') as f:
                  config_data = yaml.safe_load(f)
          except yaml.YAMLError as e:
              raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") from e

          # An empty YAML document loads as None; treat it as an empty mapping so
          # parsing applies defaults and validation reports what is missing.
          if config_data is None:
              config_data = {}
          elif not isinstance(config_data, dict):
              raise ConfigurationError(
                  f"Top level of {self.config_file} must be a mapping, "
                  f"got {type(config_data).__name__}"
              )

          config = self._parse_config(config_data)

          # Auto-validate configuration
          if validate:
              errors = self.validate_config(config)
              if errors:
                  error_msg = "Configuration validation failed:\n" + "\n".join(f"  - {err}" for err in errors)
                  raise ConfigurationError(error_msg)

          return config

      def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
          """
          Parse configuration dictionary into SearchConfig object.

          Optional sections that are absent or null fall back to their
          defaults.

          Args:
              config_data: Top-level mapping loaded from the YAML file.

          Returns:
              The populated SearchConfig (not yet validated).

          Raises:
              ConfigurationError: If ``field_boosts`` or any optional section is
                  present but is not a dictionary.
          """

          def _section(parent: Dict[str, Any], key: str, label: str) -> Dict[str, Any]:
              # Absent, null and other falsy values all mean "use defaults";
              # anything else must be a mapping.
              value = parent.get(key) or {}
              if not isinstance(value, dict):
                  raise ConfigurationError(f"{label} must be a dictionary if provided")
              return value

          # Parse field_boosts
          field_boosts = config_data.get("field_boosts", {})
          if not isinstance(field_boosts, dict):
              raise ConfigurationError("field_boosts must be a dictionary")

          # Parse indexes (deprecated; compatibility only)
          indexes = [
              self._parse_index_config(index_data)
              for index_data in (config_data.get("indexes") or [])
          ]

          # Parse query config
          query_config_data = _section(config_data, "query_config", "query_config")
          rewrite_dictionary = self._load_rewrite_dictionary()
          search_fields_cfg = _section(
              query_config_data, "search_fields", "query_config.search_fields"
          )
          text_strategy_cfg = _section(
              query_config_data, "text_query_strategy", "query_config.text_query_strategy"
          )

          query_config = QueryConfig(
              supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
              default_language=query_config_data.get("default_language") or "en",
              enable_text_embedding=query_config_data.get("enable_text_embedding", True),
              enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
              rewrite_dictionary=rewrite_dictionary,
              # NOTE(review): an absent key yields None here, whereas the
              # QueryConfig dataclass default is "title_embedding" - confirm
              # which one is intended.
              text_embedding_field=query_config_data.get("text_embedding_field"),
              image_embedding_field=query_config_data.get("image_embedding_field"),
              source_fields=query_config_data.get("source_fields"),
              knn_boost=query_config_data.get("knn_boost", 0.25),
              multilingual_fields=search_fields_cfg.get(
                  "multilingual_fields",
                  ["title", "brief", "description", "vendor", "category_path", "category_name_text"],
              ),
              shared_fields=search_fields_cfg.get(
                  "shared_fields",
                  ["tags", "option1_values", "option2_values", "option3_values"],
              ),
              core_multilingual_fields=search_fields_cfg.get(
                  "core_multilingual_fields",
                  ["title", "brief", "vendor", "category_name_text"],
              ),
              base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")),
              translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")),
              translation_boost=float(text_strategy_cfg.get("translation_boost", 0.4)),
              translation_boost_when_source_missing=float(
                  text_strategy_cfg.get("translation_boost_when_source_missing", 1.0)
              ),
              source_boost_when_missing=float(text_strategy_cfg.get("source_boost_when_missing", 0.6)),
              original_query_fallback_boost_when_translation_missing=float(
                  text_strategy_cfg.get("original_query_fallback_boost_when_translation_missing", 0.2)
              ),
              tie_breaker_base_query=float(text_strategy_cfg.get("tie_breaker_base_query", 0.9)),
              zh_to_en_model=str(query_config_data.get("zh_to_en_model") or "opus-mt-zh-en"),
              en_to_zh_model=str(query_config_data.get("en_to_zh_model") or "opus-mt-en-zh"),
              default_translation_model=str(
                  query_config_data.get("default_translation_model") or "nllb-200-distilled-600m"
              ),
          )

          # Parse Function Score configuration
          fs_data = _section(config_data, "function_score", "function_score")
          function_score = FunctionScoreConfig(
              score_mode=fs_data.get("score_mode") or "sum",
              boost_mode=fs_data.get("boost_mode") or "multiply",
              functions=fs_data.get("functions") or [],
          )

          # Parse Rerank (provider/URL in services.rerank)
          rerank_data = _section(config_data, "rerank", "rerank")
          rerank = RerankConfig(
              enabled=bool(rerank_data.get("enabled", True)),
              rerank_window=int(rerank_data.get("rerank_window", 384)),
              timeout_sec=float(rerank_data.get("timeout_sec", 15.0)),
              weight_es=float(rerank_data.get("weight_es", 0.4)),
              weight_ai=float(rerank_data.get("weight_ai", 0.6)),
              rerank_query_template=str(rerank_data.get("rerank_query_template") or "{query}"),
              rerank_doc_template=str(rerank_data.get("rerank_doc_template") or "{title}"),
          )

          # Parse SPU config
          spu_data = _section(config_data, "spu_config", "spu_config")
          spu_config = SPUConfig(
              enabled=spu_data.get("enabled", False),
              spu_field=spu_data.get("spu_field"),
              inner_hits_size=spu_data.get("inner_hits_size", 3),
              searchable_option_dimensions=spu_data.get(
                  "searchable_option_dimensions", ['option1', 'option2', 'option3']
              ),
          )

          # Parse tenant config (its shape is checked by validate_config)
          tenant_config_data = config_data.get("tenant_config", {})

          # Parse extensible services/provider registry
          services_data = _section(config_data, "services", "services")

          return SearchConfig(
              field_boosts=field_boosts,
              indexes=indexes,
              query_config=query_config,
              function_score=function_score,
              rerank=rerank,
              spu_config=spu_config,
              tenant_config=tenant_config_data,
              es_index_name=config_data.get("es_index_name", "search_products"),
              es_settings=config_data.get("es_settings", {}),
              services=services_data,
          )

      def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
          """
          Parse index configuration from dictionary.

          Args:
              index_data: One entry of the ``indexes`` list. ``name`` is
                  required; ``label`` defaults to the name, ``fields`` to an
                  empty list, ``boost`` to 1.0 and ``example`` to None.

          Returns:
              The corresponding IndexConfig.

          Raises:
              ConfigurationError: If the entry is not a dictionary or has no
                  ``name`` key.
          """
          if not isinstance(index_data, dict) or "name" not in index_data:
              raise ConfigurationError("Each entry in indexes must be a dictionary with a 'name'")

          name = index_data["name"]
          return IndexConfig(
              name=name,
              label=index_data.get("label", name),
              fields=index_data.get("fields", []),
              boost=index_data.get("boost", 1.0),
              example=index_data.get("example"),
          )

      def validate_config(self, config: SearchConfig) -> List[str]:
          """
          Validate configuration for common errors.

          All problems are collected rather than stopping at the first one.

          Args:
              config: SearchConfig to validate

          Returns:
              List of error messages (empty if valid)
          """
          errors: List[str] = []

          # Validate es_index_name
          if not config.es_index_name:
              errors.append("es_index_name is required")

          # Validate field_boosts
          if not config.field_boosts:
              errors.append("field_boosts is empty")

          for field_name, boost in config.field_boosts.items():
              if not isinstance(boost, (int, float)):
                  errors.append(f"field_boosts['{field_name}']: boost must be a number, got {type(boost).__name__}")
              elif boost < 0:
                  errors.append(f"field_boosts['{field_name}']: boost must be non-negative")

          # Validate indexes (deprecated, optional)
          index_names = set()
          for index in config.indexes:
              # Check for duplicate index names
              if index.name in index_names:
                  errors.append(f"Duplicate index name: {index.name}")
              index_names.add(index.name)

              # Validate fields in index
              if not index.fields:
                  errors.append(f"Index '{index.name}': fields list is empty")

          # Validate SPU config
          if config.spu_config.enabled:
              if not config.spu_config.spu_field:
                  errors.append("SPU aggregation enabled but no spu_field specified")

          # Validate query config
          if not config.query_config.supported_languages:
              errors.append("At least one supported language must be specified")

          if config.query_config.default_language not in config.query_config.supported_languages:
              errors.append(
                  f"Default language '{config.query_config.default_language}' "
                  f"not in supported languages: {config.query_config.supported_languages}"
              )

          # Validate dynamic search fields
          def _validate_str_list(name: str, values: List[str]) -> bool:
              """Record problems with a field list; return True when it is valid."""
              if not isinstance(values, list) or not values:
                  errors.append(f"query_config.{name} must be a non-empty list[str]")
                  return False
              valid = True
              for i, val in enumerate(values):
                  if not isinstance(val, str) or not val.strip():
                      errors.append(f"query_config.{name}[{i}] must be a non-empty string")
                      valid = False
              return valid

          multi_ok = _validate_str_list("multilingual_fields", config.query_config.multilingual_fields)
          _validate_str_list("shared_fields", config.query_config.shared_fields)
          core_ok = _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields)

          # The subset check only makes sense (and is only safe: set() raises on
          # None or unhashable items) once both lists are known to be valid.
          if multi_ok and core_ok:
              core_set = set(config.query_config.core_multilingual_fields)
              multi_set = set(config.query_config.multilingual_fields)
              if not core_set.issubset(multi_set):
                  errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields")

          # Validate text query strategy numbers
          for name in (
              "translation_boost",
              "translation_boost_when_source_missing",
              "source_boost_when_missing",
              "original_query_fallback_boost_when_translation_missing",
              "tie_breaker_base_query",
          ):
              value = getattr(config.query_config, name, None)
              if not isinstance(value, (int, float)):
                  errors.append(f"query_config.{name} must be a number")
              elif value < 0:
                  errors.append(f"query_config.{name} must be non-negative")

          # Validate source_fields tri-state semantics (None, empty list, or
          # a list of names are all accepted)
          source_fields = config.query_config.source_fields
          if source_fields is not None:
              if not isinstance(source_fields, list):
                  errors.append("query_config.source_fields must be null or list[str]")
              else:
                  for idx, field_name in enumerate(source_fields):
                      if not isinstance(field_name, str) or not field_name.strip():
                          errors.append(
                              f"query_config.source_fields[{idx}] must be a non-empty string"
                          )

          # Validate tenant config shape (default must exist in config)
          tenant_cfg = config.tenant_config
          if not isinstance(tenant_cfg, dict):
              errors.append("tenant_config must be an object")
          else:
              default_cfg = tenant_cfg.get("default")
              if not isinstance(default_cfg, dict):
                  errors.append("tenant_config.default must be configured")
              else:
                  index_languages = default_cfg.get("index_languages")
                  if not isinstance(index_languages, list) or len(index_languages) == 0:
                      errors.append("tenant_config.default.index_languages must be a non-empty list")

          return errors

      def to_dict(self, config: SearchConfig) -> Dict[str, Any]:
          """
          Convert SearchConfig to dictionary representation.

          The layout mirrors what ``_parse_config`` reads, so the result can be
          parsed again. The rewrite dictionary is not included because it is
          loaded from its own file rather than from the YAML configuration.

          Args:
              config: SearchConfig to convert.

          Returns:
              Nested dictionary of plain values. Containers are the same objects
              held by ``config`` (not copies).
          """
          qc = config.query_config

          # Build query_config dict
          query_config_dict = {
              "supported_languages": qc.supported_languages,
              "default_language": qc.default_language,
              "enable_text_embedding": qc.enable_text_embedding,
              "enable_query_rewrite": qc.enable_query_rewrite,
              "text_embedding_field": qc.text_embedding_field,
              "image_embedding_field": qc.image_embedding_field,
              "source_fields": qc.source_fields,
              "knn_boost": qc.knn_boost,
              "search_fields": {
                  "multilingual_fields": qc.multilingual_fields,
                  "shared_fields": qc.shared_fields,
                  "core_multilingual_fields": qc.core_multilingual_fields,
              },
              "text_query_strategy": {
                  "base_minimum_should_match": qc.base_minimum_should_match,
                  "translation_minimum_should_match": qc.translation_minimum_should_match,
                  "translation_boost": qc.translation_boost,
                  "translation_boost_when_source_missing": qc.translation_boost_when_source_missing,
                  "source_boost_when_missing": qc.source_boost_when_missing,
                  "original_query_fallback_boost_when_translation_missing": (
                      qc.original_query_fallback_boost_when_translation_missing
                  ),
                  "tie_breaker_base_query": qc.tie_breaker_base_query,
              },
              "zh_to_en_model": qc.zh_to_en_model,
              "en_to_zh_model": qc.en_to_zh_model,
              "default_translation_model": qc.default_translation_model,
          }

          return {
              "es_index_name": config.es_index_name,
              "es_settings": config.es_settings,
              "field_boosts": config.field_boosts,
              "indexes": [self._index_to_dict(index) for index in config.indexes],
              "query_config": query_config_dict,
              "function_score": {
                  "score_mode": config.function_score.score_mode,
                  "boost_mode": config.function_score.boost_mode,
                  "functions": config.function_score.functions,
              },
              "rerank": {
                  "enabled": config.rerank.enabled,
                  "rerank_window": config.rerank.rerank_window,
                  "timeout_sec": config.rerank.timeout_sec,
                  "weight_es": config.rerank.weight_es,
                  "weight_ai": config.rerank.weight_ai,
                  "rerank_query_template": config.rerank.rerank_query_template,
                  "rerank_doc_template": config.rerank.rerank_doc_template,
              },
              "spu_config": {
                  "enabled": config.spu_config.enabled,
                  "spu_field": config.spu_config.spu_field,
                  "inner_hits_size": config.spu_config.inner_hits_size,
                  "searchable_option_dimensions": config.spu_config.searchable_option_dimensions,
              },
              "tenant_config": config.tenant_config,
              "services": config.services,
          }

      def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
          """
          Convert IndexConfig to dictionary.

          The ``example`` key is only emitted when the index has a truthy
          example value.
          """
          result = {
              "name": index.name,
              "label": index.label,
              "fields": index.fields,
              "boost": index.boost,
          }

          if index.example:
              result["example"] = index.example

          return result