Blame view

config/config_loader.py 23.1 KB
be52af70   tangwang   first commit
1
  """
9cb7528e   tangwang   店匠体系数据的搜索:mock da...
2
  Configuration loader and validator for search engine configurations.
be52af70   tangwang   first commit
3
4
  
  This module handles loading, parsing, and validating YAML configuration files
33839b37   tangwang   属性值参与搜索:
5
6
7
8
  that define how search should be executed (NOT how data should be indexed).
  
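  The index structure is defined by mappings/search_products.json.
  This configuration only defines search behavior: field boosts, search domains, query strategies, etc.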
be52af70   tangwang   first commit
9
10
11
  """
  
  import yaml
be52af70   tangwang   first commit
12
13
14
15
  from typing import Dict, Any, List, Optional
  from dataclasses import dataclass, field
  from pathlib import Path
  
be52af70   tangwang   first commit
16
17
18
19
20
21
  
  @dataclass
  class IndexConfig:
      """Configuration for an index domain (e.g., default, title, brand)."""
      # Identifier of the index domain.
      name: str
      # Display label for the domain.
      label: str
      # List of field names to include in this search domain.
      fields: List[str]
      # Boost applied to this domain; 1.0 unless overridden.
      boost: float = 1.0
      # Optional example string for the domain.
      example: Optional[str] = None
  
be52af70   tangwang   first commit
26
27
28
29
30
  
  @dataclass
  class QueryConfig:
      """Configuration for query processing.

      Every field has a default, so ``QueryConfig()`` is a usable baseline.
      Mutable defaults use ``default_factory`` so instances never share lists
      or dictionaries.
      """
      supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"])
      default_language: str = "en"

      # Feature flags
      enable_text_embedding: bool = True
      enable_query_rewrite: bool = True

      # Query rewrite dictionary (loaded from external file)
      rewrite_dictionary: Dict[str, str] = field(default_factory=dict)

      # Translation settings (provider/URL in services.translation)
      translation_service: str = "deepl"
      translation_api_key: Optional[str] = None
      translation_glossary_id: Optional[str] = None
      translation_context: str = "e-commerce product search"
      translation_prompts: Dict[str, str] = field(default_factory=dict)

      # Embedding field names
      text_embedding_field: Optional[str] = "title_embedding"
      image_embedding_field: Optional[str] = None

      # Embedding disable thresholds (disable vector search for short queries)
      embedding_disable_chinese_char_limit: int = 4
      embedding_disable_english_word_limit: int = 3

      # Source fields configuration (None means "not configured")
      source_fields: Optional[List[str]] = None

      # KNN boost configuration
      knn_boost: float = 0.25  # Boost value for KNN (embedding recall)

      # Dynamic text fields for multi-language retrieval
      multilingual_fields: List[str] = field(
          default_factory=lambda: ["title", "brief", "description", "vendor", "category_path", "category_name_text"]
      )
      shared_fields: List[str] = field(
          default_factory=lambda: ["tags", "option1_values", "option2_values", "option3_values"]
      )
      core_multilingual_fields: List[str] = field(
          default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
      )

      # Unified text strategy tuning
      base_minimum_should_match: str = "75%"
      translation_minimum_should_match: str = "75%"
      translation_boost: float = 0.4
      translation_boost_when_source_missing: float = 1.0
      source_boost_when_missing: float = 0.6
      keywords_boost: float = 0.1
      enable_phrase_query: bool = True
      tie_breaker_base_query: float = 0.9
      tie_breaker_keywords: float = 0.9
13377199   tangwang   接口优化
82
  
be52af70   tangwang   first commit
83
84
85
86
87
  
  @dataclass
  class SPUConfig:
      """Configuration for SPU aggregation."""
      enabled: bool = False
      # Name of the SPU field; None when not configured.
      spu_field: Optional[str] = None
      inner_hits_size: int = 3
      # Which option dimensions take part in retrieval (both indexing and online search).
      searchable_option_dimensions: List[str] = field(default_factory=lambda: ['option1', 'option2', 'option3'])
be52af70   tangwang   first commit
92
93
94
  
  
  @dataclass
  class FunctionScoreConfig:
      """Function score configuration (scoring rules applied at the ES layer)."""
      score_mode: str = "sum"
      boost_mode: str = "multiply"
      # Function definitions, kept as plain dictionaries; empty by default.
      functions: List[Dict[str, Any]] = field(default_factory=list)
  
  
  @dataclass
  class RankingConfig:
      """Configuration for ranking expressions."""
      # Ranking expression string.
      expression: str = "bm25()"
      # Human-readable description of the expression.
      description: str = "Default BM25 ranking"
  
  
  @dataclass
  class RerankConfig:
      """Rerank configuration (provider/URL live in services.rerank)."""
      rerank_window: int = 1000
      timeout_sec: float = 15.0
      weight_es: float = 0.4
      weight_ai: float = 0.6
      # Templates containing ``{query}`` / ``{title}`` placeholders.
      rerank_query_template: str = "{query}"
      rerank_doc_template: str = "{title}"
a00c3672   tangwang   feat: Function Sc...
118
119
120
  
  
  @dataclass
  class SearchConfig:
      """Complete configuration for search engine (multi-tenant)."""

      # Field boost weights (used for search)
      field_boosts: Dict[str, float]

      # Legacy index domains (deprecated; kept for compatibility)
      indexes: List[IndexConfig]

      # Query processing
      query_config: QueryConfig

      # Ranking configuration
      ranking: RankingConfig

      # Function Score configuration (ES-layer scoring)
      function_score: FunctionScoreConfig

      # Rerank configuration (local rerank)
      rerank: RerankConfig

      # SPU configuration
      spu_config: SPUConfig

      # ES index settings
      es_index_name: str

      # Tenant configuration
      tenant_config: Dict[str, Any] = field(default_factory=dict)

      # ES settings
      es_settings: Dict[str, Any] = field(default_factory=dict)

      # Extensible service/provider registry (translation/embedding/rerank/...)
      services: Dict[str, Any] = field(default_factory=dict)
be52af70   tangwang   first commit
155
156
157
158
159
160
161
162
  
  
  class ConfigurationError(Exception):
      """Signals that a configuration could not be loaded or failed validation."""
  
  
  class ConfigLoader:
4d824a77   tangwang   所有租户共用一套统一配置.tena...
163
      """Loads and validates unified search engine configuration from YAML file."""
a77693fe   tangwang   调整配置目录结构
164
      
33839b37   tangwang   属性值参与搜索:
165
      def __init__(self, config_file: Optional[Path] = None):
a77693fe   tangwang   调整配置目录结构
166
          """
33839b37   tangwang   属性值参与搜索:
167
          Initialize config loader.
a77693fe   tangwang   调整配置目录结构
168
          
33839b37   tangwang   属性值参与搜索:
169
170
          Args:
              config_file: Path to config YAML file (defaults to config/config.yaml)
a77693fe   tangwang   调整配置目录结构
171
          """
33839b37   tangwang   属性值参与搜索:
172
173
174
175
176
177
178
179
          if config_file is None:
              config_file = Path(__file__).parent / "config.yaml"
          self.config_file = Path(config_file)
      
      def _load_rewrite_dictionary(self) -> Dict[str, str]:
          """Load query rewrite dictionary from external file."""
          rewrite_file = Path(__file__).parent / "rewrite_dictionary.txt"
          rewrite_dict = {}
a77693fe   tangwang   调整配置目录结构
180
          
33839b37   tangwang   属性值参与搜索:
181
182
          if not rewrite_file.exists():
              return rewrite_dict
a77693fe   tangwang   调整配置目录结构
183
          
a77693fe   tangwang   调整配置目录结构
184
          try:
33839b37   tangwang   属性值参与搜索:
185
186
              with open(rewrite_file, 'r', encoding='utf-8') as f:
                  for line in f:
a77693fe   tangwang   调整配置目录结构
187
                      line = line.strip()
a77693fe   tangwang   调整配置目录结构
188
189
190
                      if not line or line.startswith('#'):
                          continue
                      
a77693fe   tangwang   调整配置目录结构
191
                      parts = line.split('\t')
33839b37   tangwang   属性值参与搜索:
192
193
194
195
196
                      if len(parts) >= 2:
                          original = parts[0].strip()
                          replacement = parts[1].strip()
                          if original and replacement:
                              rewrite_dict[original] = replacement
a77693fe   tangwang   调整配置目录结构
197
          except Exception as e:
33839b37   tangwang   属性值参与搜索:
198
              print(f"Warning: Failed to load rewrite dictionary: {e}")
a77693fe   tangwang   调整配置目录结构
199
200
          
          return rewrite_dict
33839b37   tangwang   属性值参与搜索:
201
      
9f96d6f3   tangwang   短query不用语义搜索
202
      def load_config(self, validate: bool = True) -> SearchConfig:
          """
          Load unified configuration from YAML file.

          Args:
              validate: Whether to validate configuration after loading

          Returns:
              SearchConfig object

          Raises:
              ConfigurationError: If config file not found, invalid (malformed
                  YAML, or a document whose top level is not a mapping, such as
                  an empty file), or validation fails
          """
          if not self.config_file.exists():
              raise ConfigurationError(f"Configuration file not found: {self.config_file}")

          try:
              with open(self.config_file, 'r', encoding='utf-8') as f:
                  config_data = yaml.safe_load(f)
          except yaml.YAMLError as e:
              raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") from e

          # safe_load yields None for an empty document and may yield a list or
          # scalar; only a mapping can be parsed into a SearchConfig.
          if not isinstance(config_data, dict):
              raise ConfigurationError(
                  f"Invalid configuration in {self.config_file}: top level must be a mapping, "
                  f"got {type(config_data).__name__}"
              )

          config = self._parse_config(config_data)

          # Auto-validate configuration
          if validate:
              errors = self.validate_config(config)
              if errors:
                  error_msg = "Configuration validation failed:\n" + "\n".join(f"  - {err}" for err in errors)
                  raise ConfigurationError(error_msg)

          return config
33839b37   tangwang   属性值参与搜索:
234
      
9cb7528e   tangwang   店匠体系数据的搜索:mock da...
235
236
      def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
          """Parse configuration dictionary into SearchConfig object.

          Args:
              config_data: Top-level mapping loaded from the YAML file.

          Returns:
              SearchConfig built from ``config_data``. Keys that are absent fall
              back to the defaults written out in this method.

          Raises:
              ConfigurationError: If ``config_data`` or one of its sections is
                  not a mapping, ``indexes`` is not a list, or a numeric
                  setting cannot be converted to a number.
          """
          if not isinstance(config_data, dict):
              raise ConfigurationError("Configuration root must be a dictionary")

          def section(parent: Dict[str, Any], key: str, path: str) -> Dict[str, Any]:
              """Return ``parent[key]`` as a mapping; missing or null means empty."""
              value = parent.get(key)
              if value is None:
                  return {}
              if not isinstance(value, dict):
                  raise ConfigurationError(f"{path} must be a dictionary")
              return value

          def number(cast, parent: Dict[str, Any], key: str, default, path: str):
              """Convert ``parent[key]`` (or ``default``) with ``cast``."""
              raw = parent.get(key, default)
              try:
                  return cast(raw)
              except (TypeError, ValueError) as exc:
                  raise ConfigurationError(f"{path}.{key} must be a number, got {raw!r}") from exc

          # Parse field_boosts
          field_boosts = config_data.get("field_boosts", {})
          if not isinstance(field_boosts, dict):
              raise ConfigurationError("field_boosts must be a dictionary")

          # Parse indexes (deprecated; compatibility only)
          raw_indexes = config_data.get("indexes") or []
          if not isinstance(raw_indexes, list):
              raise ConfigurationError("indexes must be a list")
          indexes = [self._parse_index_config(index_data) for index_data in raw_indexes]

          # Parse query config
          query_config_data = section(config_data, "query_config", "query_config")
          # A services entry that is not a mapping is ignored rather than rejected.
          services_data = config_data.get("services", {})
          if not isinstance(services_data, dict):
              services_data = {}
          rewrite_dictionary = self._load_rewrite_dictionary()
          embedding_thresholds = section(
              query_config_data, "embedding_disable_thresholds", "query_config.embedding_disable_thresholds"
          )
          search_fields_cfg = section(query_config_data, "search_fields", "query_config.search_fields")
          text_strategy_cfg = section(
              query_config_data, "text_query_strategy", "query_config.text_query_strategy"
          )
          strategy_path = "query_config.text_query_strategy"

          query_config = QueryConfig(
              supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
              default_language=query_config_data.get("default_language") or "en",
              enable_text_embedding=query_config_data.get("enable_text_embedding", True),
              enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
              rewrite_dictionary=rewrite_dictionary,
              translation_api_key=query_config_data.get("translation_api_key"),
              translation_service=query_config_data.get("translation_service") or "deepl",
              translation_glossary_id=query_config_data.get("translation_glossary_id"),
              translation_context=query_config_data.get("translation_context") or "e-commerce product search",
              translation_prompts=query_config_data.get("translation_prompts") or {},
              # No fallback here: an omitted key yields None, not the QueryConfig
              # dataclass default. NOTE(review): confirm this is intended.
              text_embedding_field=query_config_data.get("text_embedding_field"),
              image_embedding_field=query_config_data.get("image_embedding_field"),
              embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
              embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
              source_fields=query_config_data.get("source_fields"),
              knn_boost=query_config_data.get("knn_boost", 0.25),
              multilingual_fields=search_fields_cfg.get(
                  "multilingual_fields",
                  ["title", "brief", "description", "vendor", "category_path", "category_name_text"],
              ),
              shared_fields=search_fields_cfg.get(
                  "shared_fields",
                  ["tags", "option1_values", "option2_values", "option3_values"],
              ),
              core_multilingual_fields=search_fields_cfg.get(
                  "core_multilingual_fields",
                  ["title", "brief", "vendor", "category_name_text"],
              ),
              base_minimum_should_match=str(text_strategy_cfg.get("base_minimum_should_match", "75%")),
              translation_minimum_should_match=str(text_strategy_cfg.get("translation_minimum_should_match", "75%")),
              translation_boost=number(float, text_strategy_cfg, "translation_boost", 0.4, strategy_path),
              translation_boost_when_source_missing=number(
                  float, text_strategy_cfg, "translation_boost_when_source_missing", 1.0, strategy_path
              ),
              source_boost_when_missing=number(
                  float, text_strategy_cfg, "source_boost_when_missing", 0.6, strategy_path
              ),
              keywords_boost=number(float, text_strategy_cfg, "keywords_boost", 0.1, strategy_path),
              enable_phrase_query=bool(text_strategy_cfg.get("enable_phrase_query", True)),
              tie_breaker_base_query=number(float, text_strategy_cfg, "tie_breaker_base_query", 0.9, strategy_path),
              tie_breaker_keywords=number(float, text_strategy_cfg, "tie_breaker_keywords", 0.9, strategy_path),
          )

          # Parse ranking config
          ranking_data = section(config_data, "ranking", "ranking")
          ranking = RankingConfig(
              expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
              description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
          )

          # Parse Function Score configuration
          fs_data = section(config_data, "function_score", "function_score")
          function_score = FunctionScoreConfig(
              score_mode=fs_data.get("score_mode") or "sum",
              boost_mode=fs_data.get("boost_mode") or "multiply",
              functions=fs_data.get("functions") or []
          )

          # Parse Rerank (provider/URL in services.rerank)
          rerank_data = section(config_data, "rerank", "rerank")
          rerank = RerankConfig(
              rerank_window=number(int, rerank_data, "rerank_window", 1000, "rerank"),
              timeout_sec=number(float, rerank_data, "timeout_sec", 15.0, "rerank"),
              weight_es=number(float, rerank_data, "weight_es", 0.4, "rerank"),
              weight_ai=number(float, rerank_data, "weight_ai", 0.6, "rerank"),
              rerank_query_template=str(rerank_data.get("rerank_query_template") or "{query}"),
              rerank_doc_template=str(rerank_data.get("rerank_doc_template") or "{title}"),
          )

          # Parse SPU config
          spu_data = section(config_data, "spu_config", "spu_config")
          spu_config = SPUConfig(
              enabled=spu_data.get("enabled", False),
              spu_field=spu_data.get("spu_field"),
              inner_hits_size=spu_data.get("inner_hits_size", 3),
              searchable_option_dimensions=spu_data.get("searchable_option_dimensions", ['option1', 'option2', 'option3'])
          )

          # Parse tenant config (passed through as-is; its shape is checked by validate_config)
          tenant_config_data = config_data.get("tenant_config", {})

          return SearchConfig(
              field_boosts=field_boosts,
              indexes=indexes,
              query_config=query_config,
              ranking=ranking,
              function_score=function_score,
              rerank=rerank,
              spu_config=spu_config,
              tenant_config=tenant_config_data,
              es_index_name=config_data.get("es_index_name", "search_products"),
              es_settings=config_data.get("es_settings") or {},
              services=services_data
          )
33839b37   tangwang   属性值参与搜索:
349
      
be52af70   tangwang   first commit
350
351
      def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
          """Parse index configuration from dictionary.

          Args:
              index_data: One entry of the ``indexes`` list. ``name`` is
                  required; ``label`` defaults to the name, ``fields`` to an
                  empty list, ``boost`` to 1.0 and ``example`` to None.

          Returns:
              IndexConfig built from ``index_data``.

          Raises:
              ConfigurationError: If the entry is not a dictionary or has no
                  ``name`` key.
          """
          if not isinstance(index_data, dict) or "name" not in index_data:
              raise ConfigurationError(
                  f"Each entry in indexes must be a dictionary with a 'name' key, got {index_data!r}"
              )

          name = index_data["name"]
          return IndexConfig(
              name=name,
              label=index_data.get("label", name),
              fields=index_data.get("fields", []),
              boost=index_data.get("boost", 1.0),
              example=index_data.get("example")
          )
33839b37   tangwang   属性值参与搜索:
359
      
9cb7528e   tangwang   店匠体系数据的搜索:mock da...
360
      def validate_config(self, config: SearchConfig) -> List[str]:
          """
          Validate configuration for common errors.

          All problems are collected rather than raised, so a malformed value
          never aborts validation: the method always returns the error list.

          Args:
              config: SearchConfig to validate

          Returns:
              List of error messages (empty if valid)
          """
          errors = []

          # Validate es_index_name
          if not config.es_index_name:
              errors.append("es_index_name is required")

          # Validate field_boosts
          if not config.field_boosts:
              errors.append("field_boosts is empty")

          for field_name, boost in config.field_boosts.items():
              if not isinstance(boost, (int, float)):
                  errors.append(f"field_boosts['{field_name}']: boost must be a number, got {type(boost).__name__}")
              elif boost < 0:
                  errors.append(f"field_boosts['{field_name}']: boost must be non-negative")

          # Validate indexes (deprecated, optional)
          index_names = set()
          for index in config.indexes:
              # Check for duplicate index names
              if index.name in index_names:
                  errors.append(f"Duplicate index name: {index.name}")
              index_names.add(index.name)

              # Validate fields in index
              if not index.fields:
                  errors.append(f"Index '{index.name}': fields list is empty")

          # Validate SPU config
          if config.spu_config.enabled:
              if not config.spu_config.spu_field:
                  errors.append("SPU aggregation enabled but no spu_field specified")

          # Validate query config
          supported_languages = config.query_config.supported_languages
          if not supported_languages:
              errors.append("At least one supported language must be specified")

          # ``or ()`` keeps the membership test safe when the value is None.
          if config.query_config.default_language not in (supported_languages or ()):
              errors.append(
                  f"Default language '{config.query_config.default_language}' "
                  f"not in supported languages: {config.query_config.supported_languages}"
              )

          # Validate dynamic search fields
          def _validate_str_list(name: str, values: List[str]) -> None:
              if not isinstance(values, list) or not values:
                  errors.append(f"query_config.{name} must be a non-empty list[str]")
                  return
              for i, val in enumerate(values):
                  if not isinstance(val, str) or not val.strip():
                      errors.append(f"query_config.{name}[{i}] must be a non-empty string")

          _validate_str_list("multilingual_fields", config.query_config.multilingual_fields)
          _validate_str_list("shared_fields", config.query_config.shared_fields)
          _validate_str_list("core_multilingual_fields", config.query_config.core_multilingual_fields)

          try:
              core_set = set(config.query_config.core_multilingual_fields)
              multi_set = set(config.query_config.multilingual_fields)
          except TypeError:
              # Non-iterable value or unhashable element: _validate_str_list has
              # already reported it, so the subset check is skipped.
              pass
          else:
              if not core_set.issubset(multi_set):
                  errors.append("query_config.core_multilingual_fields must be subset of multilingual_fields")

          # Validate text query strategy numbers
          for name in (
              "translation_boost",
              "translation_boost_when_source_missing",
              "source_boost_when_missing",
              "keywords_boost",
              "tie_breaker_base_query",
              "tie_breaker_keywords",
          ):
              value = getattr(config.query_config, name, None)
              if not isinstance(value, (int, float)):
                  errors.append(f"query_config.{name} must be a number")
              elif value < 0:
                  errors.append(f"query_config.{name} must be non-negative")

          # Validate source_fields tri-state semantics (None, empty list, or list of names)
          source_fields = config.query_config.source_fields
          if source_fields is not None:
              if not isinstance(source_fields, list):
                  errors.append("query_config.source_fields must be null or list[str]")
              else:
                  for idx, field_name in enumerate(source_fields):
                      if not isinstance(field_name, str) or not field_name.strip():
                          errors.append(
                              f"query_config.source_fields[{idx}] must be a non-empty string"
                          )

          # Validate tenant config shape (default must exist in config)
          tenant_cfg = config.tenant_config
          if not isinstance(tenant_cfg, dict):
              errors.append("tenant_config must be an object")
          else:
              default_cfg = tenant_cfg.get("default")
              if not isinstance(default_cfg, dict):
                  errors.append("tenant_config.default must be configured")
              else:
                  index_languages = default_cfg.get("index_languages")
                  if not isinstance(index_languages, list) or len(index_languages) == 0:
                      errors.append("tenant_config.default.index_languages must be a non-empty list")

          return errors
33839b37   tangwang   属性值参与搜索:
472
473
474
      
      def to_dict(self, config: SearchConfig) -> Dict[str, Any]:
          """Convert SearchConfig to dictionary representation."""
a77693fe   tangwang   调整配置目录结构
475
          
33839b37   tangwang   属性值参与搜索:
476
          # Build query_config dict
9f96d6f3   tangwang   短query不用语义搜索
477
478
479
          query_config_dict = {
              "supported_languages": config.query_config.supported_languages,
              "default_language": config.query_config.default_language,
9f96d6f3   tangwang   短query不用语义搜索
480
481
482
              "enable_text_embedding": config.query_config.enable_text_embedding,
              "enable_query_rewrite": config.query_config.enable_query_rewrite,
              "translation_service": config.query_config.translation_service,
33839b37   tangwang   属性值参与搜索:
483
484
485
              "text_embedding_field": config.query_config.text_embedding_field,
              "image_embedding_field": config.query_config.image_embedding_field,
              "embedding_disable_thresholds": {
9f96d6f3   tangwang   短query不用语义搜索
486
487
                  "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
                  "english_word_limit": config.query_config.embedding_disable_english_word_limit
33839b37   tangwang   属性值参与搜索:
488
              },
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
              "source_fields": config.query_config.source_fields,
              "search_fields": {
                  "multilingual_fields": config.query_config.multilingual_fields,
                  "shared_fields": config.query_config.shared_fields,
                  "core_multilingual_fields": config.query_config.core_multilingual_fields,
              },
              "text_query_strategy": {
                  "base_minimum_should_match": config.query_config.base_minimum_should_match,
                  "translation_minimum_should_match": config.query_config.translation_minimum_should_match,
                  "translation_boost": config.query_config.translation_boost,
                  "translation_boost_when_source_missing": config.query_config.translation_boost_when_source_missing,
                  "source_boost_when_missing": config.query_config.source_boost_when_missing,
                  "keywords_boost": config.query_config.keywords_boost,
                  "enable_phrase_query": config.query_config.enable_phrase_query,
                  "tie_breaker_base_query": config.query_config.tie_breaker_base_query,
                  "tie_breaker_keywords": config.query_config.tie_breaker_keywords,
              }
33839b37   tangwang   属性值参与搜索:
506
          }
9f96d6f3   tangwang   短query不用语义搜索
507
          
33839b37   tangwang   属性值参与搜索:
508
          return {
be52af70   tangwang   first commit
509
510
              "es_index_name": config.es_index_name,
              "es_settings": config.es_settings,
33839b37   tangwang   属性值参与搜索:
511
              "field_boosts": config.field_boosts,
be52af70   tangwang   first commit
512
              "indexes": [self._index_to_dict(index) for index in config.indexes],
9f96d6f3   tangwang   短query不用语义搜索
513
              "query_config": query_config_dict,
be52af70   tangwang   first commit
514
515
516
517
              "ranking": {
                  "expression": config.ranking.expression,
                  "description": config.ranking.description
              },
1f6d15fa   tangwang   重构:SPU级别索引、统一索引架构...
518
519
520
521
522
523
              "function_score": {
                  "score_mode": config.function_score.score_mode,
                  "boost_mode": config.function_score.boost_mode,
                  "functions": config.function_score.functions
              },
              "rerank": {
506c39b7   tangwang   feat(search): 统一重...
524
                  "rerank_window": config.rerank.rerank_window,
506c39b7   tangwang   feat(search): 统一重...
525
526
527
                  "timeout_sec": config.rerank.timeout_sec,
                  "weight_es": config.rerank.weight_es,
                  "weight_ai": config.rerank.weight_ai,
ff32d894   tangwang   rerank
528
529
                  "rerank_query_template": config.rerank.rerank_query_template,
                  "rerank_doc_template": config.rerank.rerank_doc_template,
1f6d15fa   tangwang   重构:SPU级别索引、统一索引架构...
530
              },
be52af70   tangwang   first commit
531
532
533
              "spu_config": {
                  "enabled": config.spu_config.enabled,
                  "spu_field": config.spu_config.spu_field,
33839b37   tangwang   属性值参与搜索:
534
535
                  "inner_hits_size": config.spu_config.inner_hits_size,
                  "searchable_option_dimensions": config.spu_config.searchable_option_dimensions
42e3aea6   tangwang   tidy
536
537
              },
              "services": config.services,
be52af70   tangwang   first commit
538
          }
a77693fe   tangwang   调整配置目录结构
539
      
be52af70   tangwang   first commit
540
      def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
33839b37   tangwang   属性值参与搜索:
541
          """Convert IndexConfig to dictionary."""
b926f678   tangwang   多语言查询
542
          result = {
be52af70   tangwang   first commit
543
544
545
              "name": index.name,
              "label": index.label,
              "fields": index.fields,
33839b37   tangwang   属性值参与搜索:
546
              "boost": index.boost
b926f678   tangwang   多语言查询
547
          }
9f96d6f3   tangwang   短query不用语义搜索
548
          
9f96d6f3   tangwang   短query不用语义搜索
549
550
          if index.example:
              result["example"] = index.example
33839b37   tangwang   属性值参与搜索:
551
552
          
          return result