Blame view

config/config_loader.py 15.7 KB
be52af70   tangwang   first commit
1
  """
9cb7528e   tangwang   店匠体系数据的搜索:mock da...
2
  Configuration loader and validator for search engine configurations.
be52af70   tangwang   first commit
3
4
  
  This module handles loading, parsing, and validating YAML configuration files
33839b37   tangwang   属性值参与搜索:
5
6
7
8
  that define how search should be executed (NOT how data should be indexed).
  
  索引结构由 mappings/search_products.json 定义。
  此配置只定义搜索行为:字段权重、搜索域、查询策略等。
be52af70   tangwang   first commit
9
10
11
12
13
14
15
16
  """
  
  import yaml
  import os
  from typing import Dict, Any, List, Optional
  from dataclasses import dataclass, field
  from pathlib import Path
  
be52af70   tangwang   first commit
17
18
19
20
21
22
  
  @dataclass
  class IndexConfig:
      """Configuration for an index domain (e.g., default, title, brand).

      ``name``, ``label`` and ``fields`` are required; ``boost`` and
      ``example`` are optional.
      """
      # Identifier of the search domain.
      name: str
      # Display label for the domain.
      label: str
      # List of field names to include in this search domain.
      fields: List[str]
      # Boost for this domain; defaults to 1.0.
      boost: float = 1.0
      # Optional example string for the domain.
      example: Optional[str] = None
  
be52af70   tangwang   first commit
27
28
29
30
31
32
  
  @dataclass
  class QueryConfig:
      """Configuration for query processing.

      Every field has a default, so ``QueryConfig()`` is a valid instance.
      """
      supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"])
      default_language: str = "zh"

      # Feature flags
      enable_translation: bool = True
      enable_text_embedding: bool = True
      enable_query_rewrite: bool = True

      # Query rewrite dictionary (loaded from external file)
      rewrite_dictionary: Dict[str, str] = field(default_factory=dict)

      # Translation settings
      translation_service: str = "deepl"
      translation_api_key: Optional[str] = None
      translation_glossary_id: Optional[str] = None
      translation_context: str = "e-commerce product search"

      # Embedding field names
      text_embedding_field: Optional[str] = "title_embedding"
      image_embedding_field: Optional[str] = None

      # Embedding disable thresholds (disable vector search for short queries)
      embedding_disable_chinese_char_limit: int = 4
      embedding_disable_english_word_limit: int = 3

      # Source fields configuration
      source_fields: Optional[List[str]] = None
13377199   tangwang   接口优化
58
  
be52af70   tangwang   first commit
59
60
61
62
63
  
  @dataclass
  class SPUConfig:
      """Configuration for SPU aggregation."""
      enabled: bool = False
      spu_field: Optional[str] = None
      inner_hits_size: int = 3
      # Which option dimensions participate in retrieval (both when indexing
      # and in online search).
      searchable_option_dimensions: List[str] = field(default_factory=lambda: ['option1', 'option2', 'option3'])
be52af70   tangwang   first commit
68
69
70
  
  
  @dataclass
  class FunctionScoreConfig:
      """Function score configuration (ES-level scoring rules)."""
      score_mode: str = "sum"
      boost_mode: str = "multiply"
      # Function definitions, each kept as a plain dictionary.
      functions: List[Dict[str, Any]] = field(default_factory=list)
  
  
  @dataclass
  class RankingConfig:
      """Configuration for ranking expressions."""
      # Ranking expression string; defaults to plain BM25.
      expression: str = "bm25()"
      # Human-readable description of the expression.
      description: str = "Default BM25 ranking"
  
  
  @dataclass
  class RerankConfig:
      """Local rerank configuration (currently disabled)."""
      enabled: bool = False
      expression: str = ""
      description: str = ""
  
  
  @dataclass
  class SearchConfig:
      """Complete configuration for search engine (multi-tenant).

      All fields except ``es_settings`` are required.
      """

      # Field boost configuration (used for search)
      field_boosts: Dict[str, float]

      # Index structure (query domains)
      indexes: List[IndexConfig]

      # Query processing
      query_config: QueryConfig

      # Ranking configuration
      ranking: RankingConfig

      # Function Score configuration (ES-level scoring)
      function_score: FunctionScoreConfig

      # Rerank configuration (local rerank)
      rerank: RerankConfig

      # SPU configuration
      spu_config: SPUConfig

      # ES index settings
      es_index_name: str
      es_settings: Dict[str, Any] = field(default_factory=dict)
  
  
  class ConfigurationError(Exception):
      """Error type signalling that configuration validation failed."""
  
  
  class ConfigLoader:
      """Loads and validates unified search engine configuration from YAML file.

      ``load_config()`` reads the YAML file, builds a SearchConfig from it and,
      unless told otherwise, validates the result. ``to_dict()`` converts a
      SearchConfig back into plain dictionaries.
      """

      def __init__(self, config_file: Optional[Path] = None):
          """
          Initialize config loader.

          Args:
              config_file: Path to config YAML file (defaults to ``config.yaml``
                  in the directory containing this module)
          """
          if config_file is None:
              config_file = Path(__file__).parent / "config.yaml"
          self.config_file = Path(config_file)

      def _load_rewrite_dictionary(self) -> Dict[str, str]:
          """
          Load query rewrite dictionary from external file.

          Reads ``rewrite_dictionary.txt`` from the directory containing this
          module. Each useful line has the form ``original<TAB>replacement``;
          blank lines, lines starting with ``#``, lines without a tab and lines
          with an empty side are skipped. Columns after the second are ignored.

          Loading is best-effort: a missing file yields an empty dictionary, and
          a failure while reading prints a warning and returns whatever entries
          were collected before the failure.

          Returns:
              Mapping from original text to its replacement.
          """
          rewrite_file = Path(__file__).parent / "rewrite_dictionary.txt"
          rewrite_dict: Dict[str, str] = {}

          if not rewrite_file.exists():
              return rewrite_dict

          try:
              with open(rewrite_file, 'r', encoding='utf-8') as f:
                  for line in f:
                      line = line.strip()
                      if not line or line.startswith('#'):
                          continue

                      parts = line.split('\t')
                      if len(parts) >= 2:
                          original = parts[0].strip()
                          replacement = parts[1].strip()
                          if original and replacement:
                              rewrite_dict[original] = replacement
          except Exception as e:
              # Deliberately non-fatal: the rewrite dictionary is optional.
              print(f"Warning: Failed to load rewrite dictionary: {e}")

          return rewrite_dict

      def load_config(self, validate: bool = True) -> SearchConfig:
          """
          Load unified configuration from YAML file.

          Args:
              validate: Whether to validate configuration after loading

          Returns:
              SearchConfig object

          Raises:
              ConfigurationError: If config file not found, invalid, or validation fails
          """
          if not self.config_file.exists():
              raise ConfigurationError(f"Configuration file not found: {self.config_file}")

          try:
              with open(self.config_file, 'r', encoding='utf-8') as f:
                  config_data = yaml.safe_load(f)
          except yaml.YAMLError as e:
              raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") from e

          # yaml.safe_load returns None for an empty document and may return a
          # list or scalar; only a mapping can be parsed into a SearchConfig.
          if not isinstance(config_data, dict):
              raise ConfigurationError(
                  f"Invalid configuration in {self.config_file}: top-level content must be a "
                  f"mapping, got {type(config_data).__name__}"
              )

          config = self._parse_config(config_data)

          # Auto-validate configuration
          if validate:
              errors = self.validate_config(config)
              if errors:
                  error_msg = "Configuration validation failed:\n" + "\n".join(f"  - {err}" for err in errors)
                  raise ConfigurationError(error_msg)

          return config

      @staticmethod
      def _get_section(data: Dict[str, Any], key: str) -> Dict[str, Any]:
          """
          Return the mapping stored under ``key``, or an empty dict.

          A key that is absent or explicitly null (e.g. ``query_config:`` with
          no value in YAML) is treated as an empty section.

          Raises:
              ConfigurationError: If the value is present but not a dictionary
          """
          value = data.get(key)
          if value is None:
              return {}
          if not isinstance(value, dict):
              raise ConfigurationError(f"{key} must be a dictionary")
          return value

      def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
          """
          Parse configuration dictionary into SearchConfig object.

          Args:
              config_data: Top-level mapping loaded from the YAML file

          Returns:
              SearchConfig built from ``config_data``, with defaults filled in
              for missing values

          Raises:
              ConfigurationError: If a section has the wrong type
          """
          # Parse field_boosts
          field_boosts = config_data.get("field_boosts", {})
          if not isinstance(field_boosts, dict):
              raise ConfigurationError("field_boosts must be a dictionary")

          # Parse indexes
          indexes_data = config_data.get("indexes") or []
          if not isinstance(indexes_data, list):
              raise ConfigurationError("indexes must be a list")
          indexes = [self._parse_index_config(index_data) for index_data in indexes_data]

          # Parse query config
          query_config_data = self._get_section(config_data, "query_config")

          # Load rewrite dictionary from external file
          rewrite_dictionary = self._load_rewrite_dictionary()

          # Parse embedding disable thresholds
          embedding_thresholds = self._get_section(query_config_data, "embedding_disable_thresholds")

          query_config = QueryConfig(
              supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
              default_language=query_config_data.get("default_language") or "zh",
              enable_translation=query_config_data.get("enable_translation", True),
              enable_text_embedding=query_config_data.get("enable_text_embedding", True),
              enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
              rewrite_dictionary=rewrite_dictionary,
              translation_api_key=query_config_data.get("translation_api_key"),
              translation_service=query_config_data.get("translation_service") or "deepl",
              translation_glossary_id=query_config_data.get("translation_glossary_id"),
              translation_context=query_config_data.get("translation_context") or "e-commerce product search",
              # No fallback here: a missing key yields None rather than the
              # dataclass default.
              text_embedding_field=query_config_data.get("text_embedding_field"),
              image_embedding_field=query_config_data.get("image_embedding_field"),
              embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
              embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
              source_fields=query_config_data.get("source_fields")
          )

          # Parse ranking config
          ranking_data = self._get_section(config_data, "ranking")
          ranking = RankingConfig(
              expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
              description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
          )

          # Parse Function Score configuration
          fs_data = self._get_section(config_data, "function_score")
          function_score = FunctionScoreConfig(
              score_mode=fs_data.get("score_mode") or "sum",
              boost_mode=fs_data.get("boost_mode") or "multiply",
              functions=fs_data.get("functions") or []
          )

          # Parse Rerank configuration
          rerank_data = self._get_section(config_data, "rerank")
          rerank = RerankConfig(
              enabled=rerank_data.get("enabled", False),
              expression=rerank_data.get("expression") or "",
              description=rerank_data.get("description") or ""
          )

          # Parse SPU config
          spu_data = self._get_section(config_data, "spu_config")
          # Only a missing/null value falls back to the default; an explicit
          # empty list is kept as-is.
          option_dimensions = spu_data.get("searchable_option_dimensions")
          if option_dimensions is None:
              option_dimensions = ['option1', 'option2', 'option3']
          spu_config = SPUConfig(
              enabled=spu_data.get("enabled", False),
              spu_field=spu_data.get("spu_field"),
              inner_hits_size=spu_data.get("inner_hits_size", 3),
              searchable_option_dimensions=option_dimensions
          )

          return SearchConfig(
              field_boosts=field_boosts,
              indexes=indexes,
              query_config=query_config,
              ranking=ranking,
              function_score=function_score,
              rerank=rerank,
              spu_config=spu_config,
              es_index_name=config_data.get("es_index_name", "search_products"),
              es_settings=config_data.get("es_settings") or {}
          )

      def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
          """
          Parse index configuration from dictionary.

          ``label`` defaults to the index name, ``fields`` to an empty list and
          ``boost`` to 1.0.

          Raises:
              ConfigurationError: If the entry is not a mapping or has no ``name``
          """
          if not isinstance(index_data, dict) or "name" not in index_data:
              raise ConfigurationError(
                  f"Each index entry must be a mapping with a 'name' key, got: {index_data!r}"
              )

          return IndexConfig(
              name=index_data["name"],
              label=index_data.get("label", index_data["name"]),
              fields=index_data.get("fields", []),
              boost=index_data.get("boost", 1.0),
              example=index_data.get("example")
          )

      def validate_config(self, config: SearchConfig) -> List[str]:
          """
          Validate configuration for common errors.

          Args:
              config: SearchConfig to validate

          Returns:
              List of error messages (empty if valid)
          """
          errors = []

          # Validate es_index_name
          if not config.es_index_name:
              errors.append("es_index_name is required")

          # Validate field_boosts
          if not config.field_boosts:
              errors.append("field_boosts is empty")

          for field_name, boost in config.field_boosts.items():
              # bool is a subclass of int, so it must be rejected explicitly.
              if isinstance(boost, bool) or not isinstance(boost, (int, float)):
                  errors.append(f"field_boosts['{field_name}']: boost must be a number, got {type(boost).__name__}")
              elif boost < 0:
                  errors.append(f"field_boosts['{field_name}']: boost must be non-negative")

          # Validate indexes
          if not config.indexes:
              errors.append("At least one index domain must be defined")

          index_names = set()
          for index in config.indexes:
              # Check for duplicate index names
              if index.name in index_names:
                  errors.append(f"Duplicate index name: {index.name}")
              index_names.add(index.name)

              # Validate fields in index
              if not index.fields:
                  errors.append(f"Index '{index.name}': fields list is empty")

          # Validate SPU config
          if config.spu_config.enabled:
              if not config.spu_config.spu_field:
                  errors.append("SPU aggregation enabled but no spu_field specified")

          # Validate query config
          if not config.query_config.supported_languages:
              errors.append("At least one supported language must be specified")

          if config.query_config.default_language not in config.query_config.supported_languages:
              errors.append(
                  f"Default language '{config.query_config.default_language}' "
                  f"not in supported languages: {config.query_config.supported_languages}"
              )

          return errors

      def to_dict(self, config: SearchConfig) -> Dict[str, Any]:
          """
          Convert SearchConfig to dictionary representation.

          The query_config section leaves out ``rewrite_dictionary``,
          ``translation_api_key``, ``translation_glossary_id`` and
          ``translation_context``.
          NOTE(review): presumably to keep secrets and bulky data out of the
          output - confirm before relying on a round trip.
          """
          # Build query_config dict
          query_config_dict = {
              "supported_languages": config.query_config.supported_languages,
              "default_language": config.query_config.default_language,
              "enable_translation": config.query_config.enable_translation,
              "enable_text_embedding": config.query_config.enable_text_embedding,
              "enable_query_rewrite": config.query_config.enable_query_rewrite,
              "translation_service": config.query_config.translation_service,
              "text_embedding_field": config.query_config.text_embedding_field,
              "image_embedding_field": config.query_config.image_embedding_field,
              "embedding_disable_thresholds": {
                  "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
                  "english_word_limit": config.query_config.embedding_disable_english_word_limit
              },
              "source_fields": config.query_config.source_fields
          }

          return {
              "es_index_name": config.es_index_name,
              "es_settings": config.es_settings,
              "field_boosts": config.field_boosts,
              "indexes": [self._index_to_dict(index) for index in config.indexes],
              "query_config": query_config_dict,
              "ranking": {
                  "expression": config.ranking.expression,
                  "description": config.ranking.description
              },
              "function_score": {
                  "score_mode": config.function_score.score_mode,
                  "boost_mode": config.function_score.boost_mode,
                  "functions": config.function_score.functions
              },
              "rerank": {
                  "enabled": config.rerank.enabled,
                  "expression": config.rerank.expression,
                  "description": config.rerank.description
              },
              "spu_config": {
                  "enabled": config.spu_config.enabled,
                  "spu_field": config.spu_config.spu_field,
                  "inner_hits_size": config.spu_config.inner_hits_size,
                  "searchable_option_dimensions": config.spu_config.searchable_option_dimensions
              }
          }

      def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
          """
          Convert IndexConfig to dictionary.

          ``example`` is included only when it is set to a non-empty value.
          """
          result = {
              "name": index.name,
              "label": index.label,
              "fields": index.fields,
              "boost": index.boost
          }

          if index.example:
              result["example"] = index.example

          return result
  
b926f678   tangwang   多语言查询
415
  
33839b37   tangwang   属性值参与搜索:
416
417
418
419
420
421
422
423
424
425
426
427
  def load_tenant_config(tenant_id: Optional[str] = None) -> SearchConfig:
      """
      Backward-compatible entry point for loading the search configuration.

      Args:
          tenant_id: Accepted for compatibility with older callers; not used.

      Returns:
          SearchConfig produced by a ConfigLoader created with its default
          config file.
      """
      return ConfigLoader().load_config()