Blame view

config/config_loader.py 15.8 KB
be52af70   tangwang   first commit
1
  """
  Configuration loader and validator for search engine configurations.

  This module handles loading, parsing, and validating YAML configuration files
  that define how search should be executed (NOT how data should be indexed).

  The index structure is defined by mappings/search_products.json.
  This configuration only defines search behavior: field weights, search
  domains, query strategy, etc.
  """
  
  import yaml
  import os
  from typing import Dict, Any, List, Optional
  from dataclasses import dataclass, field
  from pathlib import Path
  
be52af70   tangwang   first commit
17
18
19
20
21
22
  
  @dataclass
  class IndexConfig:
      """One search domain (for example: default, title, brand).

      A domain groups the fields that are queried together under one name.
      """

      # Name of the domain.
      name: str
      # Label of the domain.
      label: str
      # Names of the fields included in this search domain.
      fields: List[str]
      # Boost applied to the domain; defaults to 1.0.
      boost: float = field(default=1.0)
      # Optional example for the domain.
      example: Optional[str] = field(default=None)
  
be52af70   tangwang   first commit
27
28
29
30
31
32
  
  @dataclass
  class QueryConfig:
      """Settings that control how an incoming query is processed."""

      # --- Languages ---
      supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"])
      default_language: str = "zh"

      # --- Feature flags ---
      enable_translation: bool = True
      enable_text_embedding: bool = True
      enable_query_rewrite: bool = True
      # Multi-language search using translations.
      enable_multilang_search: bool = True

      # --- Query rewrite ---
      # Rewrite dictionary; its content is loaded from an external file.
      rewrite_dictionary: Dict[str, str] = field(default_factory=dict)

      # --- Translation ---
      translation_service: str = "deepl"
      translation_api_key: Optional[str] = None
      translation_glossary_id: Optional[str] = None
      translation_context: str = "e-commerce product search"

      # --- Embedding field names ---
      text_embedding_field: Optional[str] = "title_embedding"
      image_embedding_field: Optional[str] = None

      # --- Embedding disable thresholds ---
      # Used to disable vector search for short queries.
      embedding_disable_chinese_char_limit: int = 4
      embedding_disable_english_word_limit: int = 3

      # --- Source fields ---
      source_fields: Optional[List[str]] = None
13377199   tangwang   接口优化
59
  
be52af70   tangwang   first commit
60
61
62
63
64
  
  @dataclass
  class SPUConfig:
      """Settings for SPU aggregation."""

      # SPU aggregation is off unless explicitly enabled.
      enabled: bool = False
      spu_field: Optional[str] = None
      inner_hits_size: int = 3
      # Which option dimensions take part in retrieval
      # (both when indexing and during online search).
      searchable_option_dimensions: List[str] = field(
          default_factory=lambda: ['option1', 'option2', 'option3']
      )
be52af70   tangwang   first commit
69
70
71
  
  
  @dataclass
  class FunctionScoreConfig:
      """Function score configuration (scoring rules applied at the ES layer)."""

      score_mode: str = "sum"
      boost_mode: str = "multiply"
      # Function definitions; each entry is a dictionary.
      functions: List[Dict[str, Any]] = field(default_factory=list)
  
  
  @dataclass
  class RankingConfig:
      """Ranking expression together with a description of it."""

      # Ranking expression; plain BM25 unless overridden.
      expression: str = "bm25()"
      description: str = "Default BM25 ranking"
  
  
  @dataclass
  class RerankConfig:
      """Local rerank configuration (currently disabled)."""

      # Reranking is off by default.
      enabled: bool = False
      expression: str = ""
      description: str = ""
  
  
  @dataclass
  class SearchConfig:
      """Top-level search engine configuration (multi-tenant).

      Bundles every configuration section into one object.
      """

      # Field weights used for search.
      field_boosts: Dict[str, float]

      # Index structure: the query domains.
      indexes: List[IndexConfig]

      # Query processing settings.
      query_config: QueryConfig

      # Ranking settings.
      ranking: RankingConfig

      # Function score settings (ES-layer scoring).
      function_score: FunctionScoreConfig

      # Rerank settings (local rerank).
      rerank: RerankConfig

      # SPU settings.
      spu_config: SPUConfig

      # ES index name and index settings.
      es_index_name: str
      es_settings: Dict[str, Any] = field(default_factory=dict)
  
  
  class ConfigurationError(Exception):
      """Error raised when the configuration fails validation."""
  
  
  class ConfigLoader:
      """Loads and validates unified search engine configuration from YAML file."""

      def __init__(self, config_file: Optional[Path] = None):
          """
          Initialize config loader.

          Args:
              config_file: Path to config YAML file. Defaults to ``config.yaml``
                  in the directory that contains this module.
          """
          if config_file is None:
              config_file = Path(__file__).parent / "config.yaml"
          self.config_file = Path(config_file)

      def _load_rewrite_dictionary(self) -> Dict[str, str]:
          """
          Load query rewrite dictionary from external file.

          The file is ``rewrite_dictionary.txt`` next to this module. Each
          useful line is ``original<TAB>replacement``; blank lines and lines
          starting with ``#`` are skipped, as are lines with fewer than two
          tab-separated columns or with an empty column.

          Returns:
              Mapping of original term to replacement. Empty if the file does
              not exist. Loading is best effort: on a read or decode error a
              warning is printed and the entries collected so far are returned.
          """
          rewrite_file = Path(__file__).parent / "rewrite_dictionary.txt"
          rewrite_dict: Dict[str, str] = {}

          if not rewrite_file.exists():
              return rewrite_dict

          try:
              with open(rewrite_file, 'r', encoding='utf-8') as f:
                  for line in f:
                      line = line.strip()
                      if not line or line.startswith('#'):
                          continue

                      parts = line.split('\t')
                      if len(parts) >= 2:
                          original = parts[0].strip()
                          replacement = parts[1].strip()
                          if original and replacement:
                              rewrite_dict[original] = replacement
          except (OSError, UnicodeError) as e:
              # Only I/O and decoding can fail here; anything else is a bug
              # and must not be hidden behind a warning.
              print(f"Warning: Failed to load rewrite dictionary: {e}")

          return rewrite_dict

      def load_config(self, validate: bool = True) -> SearchConfig:
          """
          Load unified configuration from YAML file.

          Args:
              validate: Whether to validate configuration after loading

          Returns:
              SearchConfig object

          Raises:
              ConfigurationError: If config file not found, unreadable, invalid,
                  or validation fails
          """
          if not self.config_file.exists():
              raise ConfigurationError(f"Configuration file not found: {self.config_file}")

          try:
              with open(self.config_file, 'r', encoding='utf-8') as f:
                  config_data = yaml.safe_load(f)
          except yaml.YAMLError as e:
              raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") from e
          except (OSError, UnicodeError) as e:
              # Keep the documented contract: callers only need to catch
              # ConfigurationError for any load failure.
              raise ConfigurationError(f"Cannot read {self.config_file}: {e}") from e

          config = self._parse_config(config_data)

          # Auto-validate configuration
          if validate:
              errors = self.validate_config(config)
              if errors:
                  error_msg = "Configuration validation failed:\n" + "\n".join(f"  - {err}" for err in errors)
                  raise ConfigurationError(error_msg)

          return config

      @staticmethod
      def _get_section(data: Dict[str, Any], key: str) -> Dict[str, Any]:
          """
          Return the mapping stored under ``key``.

          A missing key and an explicit null (a YAML key with no value) both
          yield an empty dict, so callers can use ``.get`` on the result.

          Raises:
              ConfigurationError: If the value is present but not a dictionary
          """
          section = data.get(key)
          if section is None:
              return {}
          if not isinstance(section, dict):
              raise ConfigurationError(f"{key} must be a dictionary")
          return section

      def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
          """
          Parse configuration dictionary into SearchConfig object.

          Args:
              config_data: Parsed YAML document. ``None`` (what an empty YAML
                  file loads as) is treated as an empty configuration.

          Returns:
              SearchConfig built from the data, with defaults for missing keys

          Raises:
              ConfigurationError: If the document or one of its sections has
                  the wrong type
          """
          # An empty YAML document loads as None; treat it as "no settings" so
          # that validation can report what is missing.
          if config_data is None:
              config_data = {}
          if not isinstance(config_data, dict):
              raise ConfigurationError(
                  f"Top-level configuration must be a dictionary, got {type(config_data).__name__}"
              )

          # Parse field_boosts
          field_boosts = config_data.get("field_boosts", {})
          if not isinstance(field_boosts, dict):
              raise ConfigurationError("field_boosts must be a dictionary")

          # Parse indexes
          indexes_data = config_data.get("indexes") or []
          if not isinstance(indexes_data, list):
              raise ConfigurationError("indexes must be a list")
          indexes = [self._parse_index_config(index_data) for index_data in indexes_data]

          # Parse query config
          query_config_data = self._get_section(config_data, "query_config")

          # Load rewrite dictionary from external file
          rewrite_dictionary = self._load_rewrite_dictionary()

          # Parse embedding disable thresholds
          embedding_thresholds = self._get_section(query_config_data, "embedding_disable_thresholds")

          query_config = QueryConfig(
              supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
              default_language=query_config_data.get("default_language") or "zh",
              enable_translation=query_config_data.get("enable_translation", True),
              enable_text_embedding=query_config_data.get("enable_text_embedding", True),
              enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
              # This flag exists on QueryConfig; read it so it can be configured.
              enable_multilang_search=query_config_data.get("enable_multilang_search", True),
              rewrite_dictionary=rewrite_dictionary,
              translation_api_key=query_config_data.get("translation_api_key"),
              translation_service=query_config_data.get("translation_service") or "deepl",
              translation_glossary_id=query_config_data.get("translation_glossary_id"),
              translation_context=query_config_data.get("translation_context") or "e-commerce product search",
              text_embedding_field=query_config_data.get("text_embedding_field"),
              image_embedding_field=query_config_data.get("image_embedding_field"),
              embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
              embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
              source_fields=query_config_data.get("source_fields")
          )

          # Parse ranking config
          ranking_data = self._get_section(config_data, "ranking")
          ranking = RankingConfig(
              expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
              description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
          )

          # Parse Function Score configuration
          fs_data = self._get_section(config_data, "function_score")
          function_score = FunctionScoreConfig(
              score_mode=fs_data.get("score_mode") or "sum",
              boost_mode=fs_data.get("boost_mode") or "multiply",
              functions=fs_data.get("functions") or []
          )

          # Parse Rerank configuration
          rerank_data = self._get_section(config_data, "rerank")
          rerank = RerankConfig(
              enabled=rerank_data.get("enabled", False),
              expression=rerank_data.get("expression") or "",
              description=rerank_data.get("description") or ""
          )

          # Parse SPU config
          spu_data = self._get_section(config_data, "spu_config")
          spu_config = SPUConfig(
              enabled=spu_data.get("enabled", False),
              spu_field=spu_data.get("spu_field"),
              inner_hits_size=spu_data.get("inner_hits_size", 3),
              searchable_option_dimensions=spu_data.get("searchable_option_dimensions", ['option1', 'option2', 'option3'])
          )

          return SearchConfig(
              field_boosts=field_boosts,
              indexes=indexes,
              query_config=query_config,
              ranking=ranking,
              function_score=function_score,
              rerank=rerank,
              spu_config=spu_config,
              es_index_name=config_data.get("es_index_name", "search_products"),
              es_settings=config_data.get("es_settings") or {}
          )

      def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
          """
          Parse index configuration from dictionary.

          Args:
              index_data: One entry of the ``indexes`` list

          Returns:
              IndexConfig; ``label`` falls back to ``name`` when not given

          Raises:
              ConfigurationError: If the entry is not a dictionary or has no
                  ``name`` key
          """
          if not isinstance(index_data, dict):
              raise ConfigurationError(
                  f"Index definition must be a dictionary, got {type(index_data).__name__}"
              )
          if "name" not in index_data:
              raise ConfigurationError("Index definition is missing required key 'name'")

          name = index_data["name"]
          return IndexConfig(
              name=name,
              label=index_data.get("label", name),
              # A null "fields" becomes an empty list; validation reports it.
              fields=index_data.get("fields") or [],
              boost=index_data.get("boost", 1.0),
              example=index_data.get("example")
          )

      def validate_config(self, config: SearchConfig) -> List[str]:
          """
          Validate configuration for common errors.

          Args:
              config: SearchConfig to validate

          Returns:
              List of error messages (empty if valid)
          """
          errors = []

          # Validate es_index_name
          if not config.es_index_name:
              errors.append("es_index_name is required")

          # Validate field_boosts
          if not config.field_boosts:
              errors.append("field_boosts is empty")

          for field_name, boost in config.field_boosts.items():
              # bool is a subclass of int, so reject it explicitly: a YAML
              # true/false is not a meaningful boost value.
              if isinstance(boost, bool) or not isinstance(boost, (int, float)):
                  errors.append(f"field_boosts['{field_name}']: boost must be a number, got {type(boost).__name__}")
              elif boost < 0:
                  errors.append(f"field_boosts['{field_name}']: boost must be non-negative")

          # Validate indexes
          if not config.indexes:
              errors.append("At least one index domain must be defined")

          index_names = set()
          for index in config.indexes:
              # Check for duplicate index names
              if index.name in index_names:
                  errors.append(f"Duplicate index name: {index.name}")
              index_names.add(index.name)

              # Validate fields in index
              if not index.fields:
                  errors.append(f"Index '{index.name}': fields list is empty")

          # Validate SPU config
          if config.spu_config.enabled:
              if not config.spu_config.spu_field:
                  errors.append("SPU aggregation enabled but no spu_field specified")

          # Validate query config
          if not config.query_config.supported_languages:
              errors.append("At least one supported language must be specified")

          if config.query_config.default_language not in config.query_config.supported_languages:
              errors.append(
                  f"Default language '{config.query_config.default_language}' "
                  f"not in supported languages: {config.query_config.supported_languages}"
              )

          return errors

      def to_dict(self, config: SearchConfig) -> Dict[str, Any]:
          """
          Convert SearchConfig to dictionary representation.

          The translation API key, glossary id, translation context and the
          rewrite dictionary are not included in the output.

          Args:
              config: SearchConfig to convert

          Returns:
              Nested dictionary with one entry per configuration section
          """
          # Build query_config dict
          query_config_dict = {
              "supported_languages": config.query_config.supported_languages,
              "default_language": config.query_config.default_language,
              "enable_translation": config.query_config.enable_translation,
              "enable_text_embedding": config.query_config.enable_text_embedding,
              "enable_query_rewrite": config.query_config.enable_query_rewrite,
              "enable_multilang_search": config.query_config.enable_multilang_search,
              "translation_service": config.query_config.translation_service,
              "text_embedding_field": config.query_config.text_embedding_field,
              "image_embedding_field": config.query_config.image_embedding_field,
              "embedding_disable_thresholds": {
                  "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
                  "english_word_limit": config.query_config.embedding_disable_english_word_limit
              },
              "source_fields": config.query_config.source_fields
          }

          return {
              "es_index_name": config.es_index_name,
              "es_settings": config.es_settings,
              "field_boosts": config.field_boosts,
              "indexes": [self._index_to_dict(index) for index in config.indexes],
              "query_config": query_config_dict,
              "ranking": {
                  "expression": config.ranking.expression,
                  "description": config.ranking.description
              },
              "function_score": {
                  "score_mode": config.function_score.score_mode,
                  "boost_mode": config.function_score.boost_mode,
                  "functions": config.function_score.functions
              },
              "rerank": {
                  "enabled": config.rerank.enabled,
                  "expression": config.rerank.expression,
                  "description": config.rerank.description
              },
              "spu_config": {
                  "enabled": config.spu_config.enabled,
                  "spu_field": config.spu_config.spu_field,
                  "inner_hits_size": config.spu_config.inner_hits_size,
                  "searchable_option_dimensions": config.spu_config.searchable_option_dimensions
              }
          }

      def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
          """
          Convert IndexConfig to dictionary.

          The ``example`` key is only present when the index has a non-empty
          example.
          """
          result = {
              "name": index.name,
              "label": index.label,
              "fields": index.fields,
              "boost": index.boost
          }

          if index.example:
              result["example"] = index.example

          return result
  
b926f678   tangwang   多语言查询
416
  
33839b37   tangwang   属性值参与搜索:
417
418
419
420
421
422
423
424
425
426
427
428
  def load_tenant_config(tenant_id: Optional[str] = None) -> SearchConfig:
      """
      Return the unified search configuration (backward compatibility wrapper).

      Args:
          tenant_id: Accepted for backward compatibility; not used.

      Returns:
          SearchConfig produced by a ConfigLoader created with its default
          config file.
      """
      return ConfigLoader().load_config()