Commit 9f96d6f3fa501c7f8ca045cca97ee7573120d8a7

Authored by tangwang
1 parent 99bea633

短query不用语义搜索

query config/ranking config优化
@@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address) @@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address)
41 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 41 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
42 42
43 from config.env_config import ES_CONFIG 43 from config.env_config import ES_CONFIG
  44 +from config import ConfigLoader
44 from utils import ESClient 45 from utils import ESClient
45 from search import Searcher 46 from search import Searcher
46 -from search.query_config import DEFAULT_INDEX_NAME  
47 from query import QueryParser 47 from query import QueryParser
48 48
49 # Global instances 49 # Global instances
50 _es_client: Optional[ESClient] = None 50 _es_client: Optional[ESClient] = None
51 _searcher: Optional[Searcher] = None 51 _searcher: Optional[Searcher] = None
52 _query_parser: Optional[QueryParser] = None 52 _query_parser: Optional[QueryParser] = None
  53 +_config = None
53 54
54 55
55 def init_service(es_host: str = "http://localhost:9200"): 56 def init_service(es_host: str = "http://localhost:9200"):
@@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"):
59 Args: 60 Args:
60 es_host: Elasticsearch host URL 61 es_host: Elasticsearch host URL
61 """ 62 """
62 - global _es_client, _searcher, _query_parser 63 + global _es_client, _searcher, _query_parser, _config
63 64
64 start_time = time.time() 65 start_time = time.time()
65 logger.info("Initializing search service (multi-tenant)") 66 logger.info("Initializing search service (multi-tenant)")
66 67
  68 + # Load configuration
  69 + logger.info("Loading configuration...")
  70 + config_loader = ConfigLoader("config/config.yaml")
  71 + _config = config_loader.load_config()
  72 + logger.info("Configuration loaded")
  73 +
67 # Get ES credentials 74 # Get ES credentials
68 es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username') 75 es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username')
69 es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password') 76 es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password')
@@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"):
81 88
82 # Initialize components 89 # Initialize components
83 logger.info("Initializing query parser...") 90 logger.info("Initializing query parser...")
84 - _query_parser = QueryParser() 91 + _query_parser = QueryParser(_config)
85 92
86 logger.info("Initializing searcher...") 93 logger.info("Initializing searcher...")
87 - _searcher = Searcher(_es_client, _query_parser, index_name=DEFAULT_INDEX_NAME) 94 + _searcher = Searcher(_es_client, _config, _query_parser)
88 95
89 elapsed = time.time() - start_time 96 elapsed = time.time() - start_time
90 - logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {DEFAULT_INDEX_NAME}") 97 + logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}")
91 98
92 99
93 100
@@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser: @@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser:
113 return _query_parser 120 return _query_parser
114 121
115 122
  123 +def get_config():
  124 + """Get global config instance."""
  125 + if _config is None:
  126 + raise RuntimeError("Service not initialized")
  127 + return _config
  128 +
  129 +
116 # Create FastAPI app with enhanced configuration 130 # Create FastAPI app with enhanced configuration
117 app = FastAPI( 131 app = FastAPI(
118 title="E-Commerce Search API", 132 title="E-Commerce Search API",
config/__init__.py
@@ -23,6 +23,10 @@ from .config_loader import ( @@ -23,6 +23,10 @@ from .config_loader import (
23 RerankConfig, 23 RerankConfig,
24 ConfigurationError 24 ConfigurationError
25 ) 25 )
  26 +from .utils import (
  27 + get_match_fields_for_index,
  28 + get_domain_fields
  29 +)
26 30
27 __all__ = [ 31 __all__ = [
28 # Field types 32 # Field types
@@ -46,4 +50,6 @@ __all__ = [ @@ -46,4 +50,6 @@ __all__ = [
46 'FunctionScoreConfig', 50 'FunctionScoreConfig',
47 'RerankConfig', 51 'RerankConfig',
48 'ConfigurationError', 52 'ConfigurationError',
  53 + 'get_match_fields_for_index',
  54 + 'get_domain_fields',
49 ] 55 ]
config/config.yaml
@@ -412,6 +412,11 @@ query_config: @@ -412,6 +412,11 @@ query_config:
412 text_embedding_field: "title_embedding" # Field name for text embeddings 412 text_embedding_field: "title_embedding" # Field name for text embeddings
413 image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect) 413 image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect)
414 414
  415 + # Embedding disable thresholds (disable vector search for short queries)
  416 + embedding_disable_thresholds:
  417 + chinese_char_limit: 4 # Disable embedding for Chinese queries with <= 4 characters
  418 + english_word_limit: 3 # Disable embedding for non-Chinese queries (word-counted, e.g. English) with <= 3 words
  419 +
415 # Translation API (DeepL) 420 # Translation API (DeepL)
416 translation_service: "deepl" 421 translation_service: "deepl"
417 translation_api_key: null # Set via environment variable 422 translation_api_key: null # Set via environment variable
config/config_loader.py
@@ -58,6 +58,10 @@ class QueryConfig: @@ -58,6 +58,10 @@ class QueryConfig:
58 text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding") 58 text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding")
59 image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding") 59 image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding")
60 60
  61 + # Embedding disable thresholds (disable vector search for short queries)
  62 + embedding_disable_chinese_char_limit: int = 4 # Disable embedding for Chinese queries with <= this many characters
  63 + embedding_disable_english_word_limit: int = 3 # Disable embedding for non-Chinese queries (word-counted, e.g. English) with <= this many words
  64 +
61 # ES source fields configuration - fields to return in search results 65 # ES source fields configuration - fields to return in search results
62 # If None, auto-collect from field configs (fields with return_in_source=True) 66 # If None, auto-collect from field configs (fields with return_in_source=True)
63 # If empty list, return all fields. Otherwise, only return specified fields. 67 # If empty list, return all fields. Otherwise, only return specified fields.
@@ -165,15 +169,18 @@ class ConfigLoader: @@ -165,15 +169,18 @@ class ConfigLoader:
165 169
166 return rewrite_dict 170 return rewrite_dict
167 171
168 - def load_config(self) -> SearchConfig: 172 + def load_config(self, validate: bool = True) -> SearchConfig:
169 """ 173 """
170 Load unified configuration from YAML file. 174 Load unified configuration from YAML file.
171 175
  176 + Args:
  177 + validate: Whether to validate configuration after loading (default: True)
  178 +
172 Returns: 179 Returns:
173 SearchConfig object 180 SearchConfig object
174 181
175 Raises: 182 Raises:
176 - ConfigurationError: If config file not found or invalid 183 + ConfigurationError: If config file not found, invalid, or validation fails
177 """ 184 """
178 if not self.config_file.exists(): 185 if not self.config_file.exists():
179 raise ConfigurationError(f"Configuration file not found: {self.config_file}") 186 raise ConfigurationError(f"Configuration file not found: {self.config_file}")
@@ -184,7 +191,16 @@ class ConfigLoader: @@ -184,7 +191,16 @@ class ConfigLoader:
184 except yaml.YAMLError as e: 191 except yaml.YAMLError as e:
185 raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") 192 raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}")
186 193
187 - return self._parse_config(config_data) 194 + config = self._parse_config(config_data)
  195 +
  196 + # Auto-validate configuration
  197 + if validate:
  198 + errors = self.validate_config(config)
  199 + if errors:
  200 + error_msg = "Configuration validation failed:\n" + "\n".join(f" - {err}" for err in errors)
  201 + raise ConfigurationError(error_msg)
  202 +
  203 + return config
188 204
189 def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig: 205 def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
190 """Parse configuration dictionary into SearchConfig object.""" 206 """Parse configuration dictionary into SearchConfig object."""
@@ -214,43 +230,48 @@ class ConfigLoader: @@ -214,43 +230,48 @@ class ConfigLoader:
214 if field.return_in_source 230 if field.return_in_source
215 ] 231 ]
216 232
  233 + # Parse embedding disable thresholds
  234 + embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {})
  235 +
217 query_config = QueryConfig( 236 query_config = QueryConfig(
218 - supported_languages=query_config_data.get("supported_languages", ["zh", "en"]),  
219 - default_language=query_config_data.get("default_language", "zh"), 237 + supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
  238 + default_language=query_config_data.get("default_language") or "zh",
220 enable_translation=query_config_data.get("enable_translation", True), 239 enable_translation=query_config_data.get("enable_translation", True),
221 enable_text_embedding=query_config_data.get("enable_text_embedding", True), 240 enable_text_embedding=query_config_data.get("enable_text_embedding", True),
222 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), 241 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
223 rewrite_dictionary=rewrite_dictionary, 242 rewrite_dictionary=rewrite_dictionary,
224 translation_api_key=query_config_data.get("translation_api_key"), 243 translation_api_key=query_config_data.get("translation_api_key"),
225 - translation_service=query_config_data.get("translation_service", "deepl"), 244 + translation_service=query_config_data.get("translation_service") or "deepl",
226 translation_glossary_id=query_config_data.get("translation_glossary_id"), 245 translation_glossary_id=query_config_data.get("translation_glossary_id"),
227 - translation_context=query_config_data.get("translation_context", "e-commerce product search"), 246 + translation_context=query_config_data.get("translation_context") or "e-commerce product search",
228 text_embedding_field=query_config_data.get("text_embedding_field"), 247 text_embedding_field=query_config_data.get("text_embedding_field"),
229 image_embedding_field=query_config_data.get("image_embedding_field"), 248 image_embedding_field=query_config_data.get("image_embedding_field"),
  249 + embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
  250 + embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
230 source_fields=source_fields 251 source_fields=source_fields
231 ) 252 )
232 253
233 # Parse ranking config 254 # Parse ranking config
234 ranking_data = config_data.get("ranking", {}) 255 ranking_data = config_data.get("ranking", {})
235 ranking = RankingConfig( 256 ranking = RankingConfig(
236 - expression=ranking_data.get("expression", "bm25() + 0.2*text_embedding_relevance()"),  
237 - description=ranking_data.get("description", "Default BM25 + text embedding ranking") 257 + expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
  258 + description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
238 ) 259 )
239 260
240 # Parse Function Score configuration 261 # Parse Function Score configuration
241 fs_data = config_data.get("function_score", {}) 262 fs_data = config_data.get("function_score", {})
242 function_score = FunctionScoreConfig( 263 function_score = FunctionScoreConfig(
243 - score_mode=fs_data.get("score_mode", "sum"),  
244 - boost_mode=fs_data.get("boost_mode", "multiply"),  
245 - functions=fs_data.get("functions", []) 264 + score_mode=fs_data.get("score_mode") or "sum",
  265 + boost_mode=fs_data.get("boost_mode") or "multiply",
  266 + functions=fs_data.get("functions") or []
246 ) 267 )
247 268
248 # Parse Rerank configuration 269 # Parse Rerank configuration
249 rerank_data = config_data.get("rerank", {}) 270 rerank_data = config_data.get("rerank", {})
250 rerank = RerankConfig( 271 rerank = RerankConfig(
251 enabled=rerank_data.get("enabled", False), 272 enabled=rerank_data.get("enabled", False),
252 - expression=rerank_data.get("expression", ""),  
253 - description=rerank_data.get("description", "") 273 + expression=rerank_data.get("expression") or "",
  274 + description=rerank_data.get("description") or ""
254 ) 275 )
255 276
256 # Parse SPU config 277 # Parse SPU config
@@ -447,21 +468,43 @@ class ConfigLoader: @@ -447,21 +468,43 @@ class ConfigLoader:
447 output_path = Path(output_path) 468 output_path = Path(output_path)
448 469
449 # Convert config back to dictionary format 470 # Convert config back to dictionary format
  471 + query_config_dict = {
  472 + "supported_languages": config.query_config.supported_languages,
  473 + "default_language": config.query_config.default_language,
  474 + "enable_translation": config.query_config.enable_translation,
  475 + "enable_text_embedding": config.query_config.enable_text_embedding,
  476 + "enable_query_rewrite": config.query_config.enable_query_rewrite,
  477 + "translation_service": config.query_config.translation_service,
  478 + }
  479 +
  480 + # Add optional fields only if they are set
  481 + if config.query_config.translation_api_key:
  482 + query_config_dict["translation_api_key"] = config.query_config.translation_api_key
  483 + if config.query_config.translation_glossary_id:
  484 + query_config_dict["translation_glossary_id"] = config.query_config.translation_glossary_id
  485 + if config.query_config.translation_context:
  486 + query_config_dict["translation_context"] = config.query_config.translation_context
  487 + if config.query_config.text_embedding_field:
  488 + query_config_dict["text_embedding_field"] = config.query_config.text_embedding_field
  489 + if config.query_config.image_embedding_field:
  490 + query_config_dict["image_embedding_field"] = config.query_config.image_embedding_field
  491 + if config.query_config.source_fields:
  492 + query_config_dict["source_fields"] = config.query_config.source_fields
  493 +
  494 + # Add embedding disable thresholds
  495 + if (config.query_config.embedding_disable_chinese_char_limit != 4 or
  496 + config.query_config.embedding_disable_english_word_limit != 3):
  497 + query_config_dict["embedding_disable_thresholds"] = {
  498 + "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
  499 + "english_word_limit": config.query_config.embedding_disable_english_word_limit
  500 + }
  501 +
450 config_dict = { 502 config_dict = {
451 "es_index_name": config.es_index_name, 503 "es_index_name": config.es_index_name,
452 "es_settings": config.es_settings, 504 "es_settings": config.es_settings,
453 "fields": [self._field_to_dict(field) for field in config.fields], 505 "fields": [self._field_to_dict(field) for field in config.fields],
454 "indexes": [self._index_to_dict(index) for index in config.indexes], 506 "indexes": [self._index_to_dict(index) for index in config.indexes],
455 - "query_config": {  
456 - "supported_languages": config.query_config.supported_languages,  
457 - "default_language": config.query_config.default_language,  
458 - "enable_translation": config.query_config.enable_translation,  
459 - "enable_text_embedding": config.query_config.enable_text_embedding,  
460 - "enable_query_rewrite": config.query_config.enable_query_rewrite,  
461 - # rewrite_dictionary is stored in separate file, not in config  
462 - "translation_api_key": config.query_config.translation_api_key,  
463 - "translation_service": config.query_config.translation_service,  
464 - }, 507 + "query_config": query_config_dict,
465 "ranking": { 508 "ranking": {
466 "expression": config.ranking.expression, 509 "expression": config.ranking.expression,
467 "description": config.ranking.description 510 "description": config.ranking.description
@@ -505,7 +548,7 @@ class ConfigLoader: @@ -505,7 +548,7 @@ class ConfigLoader:
505 f.write(f"{key}\t{value}\n") 548 f.write(f"{key}\t{value}\n")
506 549
507 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: 550 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]:
508 - """Convert FieldConfig to dictionary.""" 551 + """Convert FieldConfig to dictionary, preserving all fields."""
509 result = { 552 result = {
510 "name": field.name, 553 "name": field.name,
511 "type": field.field_type.value, 554 "type": field.field_type.value,
@@ -513,36 +556,49 @@ class ConfigLoader: @@ -513,36 +556,49 @@ class ConfigLoader:
513 "boost": field.boost, 556 "boost": field.boost,
514 "store": field.store, 557 "store": field.store,
515 "index": field.index, 558 "index": field.index,
  559 + "return_in_source": field.return_in_source,
516 } 560 }
517 561
  562 + # Add optional fields only if they differ from defaults or are set
518 if field.analyzer: 563 if field.analyzer:
519 result["analyzer"] = field.analyzer.value 564 result["analyzer"] = field.analyzer.value
520 if field.search_analyzer: 565 if field.search_analyzer:
521 result["search_analyzer"] = field.search_analyzer.value 566 result["search_analyzer"] = field.search_analyzer.value
522 if field.multi_language: 567 if field.multi_language:
523 result["multi_language"] = field.multi_language 568 result["multi_language"] = field.multi_language
524 - result["languages"] = field.languages 569 + if field.languages:
  570 + result["languages"] = field.languages
525 if field.embedding_dims != 1024: 571 if field.embedding_dims != 1024:
526 result["embedding_dims"] = field.embedding_dims 572 result["embedding_dims"] = field.embedding_dims
527 if field.embedding_similarity != "dot_product": 573 if field.embedding_similarity != "dot_product":
528 result["embedding_similarity"] = field.embedding_similarity 574 result["embedding_similarity"] = field.embedding_similarity
529 if field.nested: 575 if field.nested:
530 result["nested"] = field.nested 576 result["nested"] = field.nested
531 - result["nested_properties"] = field.nested_properties 577 + if field.nested_properties:
  578 + result["nested_properties"] = field.nested_properties
  579 + if field.keyword_subfield:
  580 + result["keyword_subfield"] = field.keyword_subfield
  581 + if field.keyword_ignore_above != 256:
  582 + result["keyword_ignore_above"] = field.keyword_ignore_above
  583 + if field.keyword_normalizer:
  584 + result["keyword_normalizer"] = field.keyword_normalizer
532 585
533 return result 586 return result
534 587
535 def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: 588 def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
536 - """Convert IndexConfig to dictionary.""" 589 + """Convert IndexConfig to dictionary, preserving all fields."""
537 result = { 590 result = {
538 "name": index.name, 591 "name": index.name,
539 "label": index.label, 592 "label": index.label,
540 "fields": index.fields, 593 "fields": index.fields,
541 "analyzer": index.analyzer.value, 594 "analyzer": index.analyzer.value,
542 - "boost": index.boost,  
543 - "example": index.example  
544 } 595 }
545 - 596 +
  597 + # Add optional fields only if they differ from defaults or are set
  598 + if index.boost != 1.0:
  599 + result["boost"] = index.boost
  600 + if index.example:
  601 + result["example"] = index.example
546 if index.language_field_mapping: 602 if index.language_field_mapping:
547 result["language_field_mapping"] = index.language_field_mapping 603 result["language_field_mapping"] = index.language_field_mapping
548 604
config/utils.py 0 → 100644
@@ -0,0 +1,70 @@ @@ -0,0 +1,70 @@
  1 +"""
  2 +Configuration utility functions.
  3 +
  4 +Helper functions for working with SearchConfig objects.
  5 +"""
  6 +
  7 +from typing import Dict, List
  8 +from .config_loader import SearchConfig
  9 +
  10 +
  11 +def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]:
  12 + """
  13 + Generate match fields list with boost from IndexConfig and FieldConfig.
  14 +
  15 + Args:
  16 + config: SearchConfig instance
  17 + index_name: Name of the index domain (default: "default")
  18 +
  19 + Returns:
  20 + List of field names with boost, e.g., ["title_zh^3.0", "brief_zh^1.5"]
  21 + """
  22 + # Find the index config
  23 + index_config = None
  24 + for idx in config.indexes:
  25 + if idx.name == index_name:
  26 + index_config = idx
  27 + break
  28 +
  29 + if not index_config:
  30 + return []
  31 +
  32 + # Create a field name to FieldConfig mapping
  33 + field_map = {field.name: field for field in config.fields}
  34 +
  35 + # Generate match fields with boost
  36 + match_fields = []
  37 + for field_name in index_config.fields:
  38 + field_config = field_map.get(field_name)
  39 + if field_config:
  40 + # Combine index boost and field boost
  41 + total_boost = index_config.boost * field_config.boost
  42 + if total_boost != 1.0:
  43 + match_fields.append(f"{field_name}^{total_boost}")
  44 + else:
  45 + match_fields.append(field_name)
  46 + else:
  47 + # Field not found in config, use index boost only
  48 + if index_config.boost != 1.0:
  49 + match_fields.append(f"{field_name}^{index_config.boost}")
  50 + else:
  51 + match_fields.append(field_name)
  52 +
  53 + return match_fields
  54 +
  55 +
  56 +def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]:
  57 + """
  58 + Generate domain-specific match fields from all index configs.
  59 +
  60 + Args:
  61 + config: SearchConfig instance
  62 +
  63 + Returns:
  64 + Dictionary mapping domain name to list of match fields
  65 + """
  66 + domain_fields = {}
  67 + for index_config in config.indexes:
  68 + domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name)
  69 + return domain_fields
  70 +
frontend/index.html
@@ -100,9 +100,10 @@ @@ -100,9 +100,10 @@
100 100
101 <div class="sort-right"> 101 <div class="sort-right">
102 <select id="resultSize" onchange="performSearch()"> 102 <select id="resultSize" onchange="performSearch()">
103 - <option value="10">10 per page</option>  
104 - <option value="20" selected>20 per page</option>  
105 - <option value="50">50 per page</option> 103 + <option value="20">20 per page</option>
  104 + <option value="50" selected>50 per page</option>
  105 + <option value="100">100 per page</option>
  106 + <option value="200">200 per page</option>
106 </select> 107 </select>
107 </div> 108 </div>
108 </div> 109 </div>
@@ -93,7 +93,7 @@ def cmd_search(args): @@ -93,7 +93,7 @@ def cmd_search(args):
93 93
94 from query import QueryParser 94 from query import QueryParser
95 query_parser = QueryParser(config) 95 query_parser = QueryParser(config)
96 - searcher = Searcher(config, es_client, query_parser) 96 + searcher = Searcher(es_client, config, query_parser)
97 97
98 # Execute search 98 # Execute search
99 print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})") 99 print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})")
query/query_parser.py
@@ -9,13 +9,7 @@ import numpy as np @@ -9,13 +9,7 @@ import numpy as np
9 import logging 9 import logging
10 10
11 from embeddings import BgeEncoder 11 from embeddings import BgeEncoder
12 -from search.query_config import (  
13 - ENABLE_TEXT_EMBEDDING,  
14 - ENABLE_TRANSLATION,  
15 - REWRITE_DICTIONARY,  
16 - TRANSLATION_API_KEY,  
17 - TRANSLATION_SERVICE  
18 -) 12 +from config import SearchConfig
19 from .language_detector import LanguageDetector 13 from .language_detector import LanguageDetector
20 from .translator import Translator 14 from .translator import Translator
21 from .query_rewriter import QueryRewriter, QueryNormalizer 15 from .query_rewriter import QueryRewriter, QueryNormalizer
@@ -70,6 +64,7 @@ class QueryParser: @@ -70,6 +64,7 @@ class QueryParser:
70 64
71 def __init__( 65 def __init__(
72 self, 66 self,
  67 + config: SearchConfig,
73 text_encoder: Optional[BgeEncoder] = None, 68 text_encoder: Optional[BgeEncoder] = None,
74 translator: Optional[Translator] = None 69 translator: Optional[Translator] = None
75 ): 70 ):
@@ -77,21 +72,23 @@ class QueryParser: @@ -77,21 +72,23 @@ class QueryParser:
77 Initialize query parser. 72 Initialize query parser.
78 73
79 Args: 74 Args:
  75 + config: SearchConfig instance
80 text_encoder: Text embedding encoder (lazy loaded if not provided) 76 text_encoder: Text embedding encoder (lazy loaded if not provided)
81 translator: Translator instance (lazy loaded if not provided) 77 translator: Translator instance (lazy loaded if not provided)
82 """ 78 """
  79 + self.config = config
83 self._text_encoder = text_encoder 80 self._text_encoder = text_encoder
84 self._translator = translator 81 self._translator = translator
85 82
86 # Initialize components 83 # Initialize components
87 self.normalizer = QueryNormalizer() 84 self.normalizer = QueryNormalizer()
88 self.language_detector = LanguageDetector() 85 self.language_detector = LanguageDetector()
89 - self.rewriter = QueryRewriter(REWRITE_DICTIONARY) 86 + self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
90 87
91 @property 88 @property
92 def text_encoder(self) -> BgeEncoder: 89 def text_encoder(self) -> BgeEncoder:
93 """Lazy load text encoder.""" 90 """Lazy load text encoder."""
94 - if self._text_encoder is None and ENABLE_TEXT_EMBEDDING: 91 + if self._text_encoder is None and self.config.query_config.enable_text_embedding:
95 logger.info("Initializing text encoder (lazy load)...") 92 logger.info("Initializing text encoder (lazy load)...")
96 self._text_encoder = BgeEncoder() 93 self._text_encoder = BgeEncoder()
97 return self._text_encoder 94 return self._text_encoder
@@ -99,13 +96,13 @@ class QueryParser: @@ -99,13 +96,13 @@ class QueryParser:
99 @property 96 @property
100 def translator(self) -> Translator: 97 def translator(self) -> Translator:
101 """Lazy load translator.""" 98 """Lazy load translator."""
102 - if self._translator is None and ENABLE_TRANSLATION: 99 + if self._translator is None and self.config.query_config.enable_translation:
103 logger.info("Initializing translator (lazy load)...") 100 logger.info("Initializing translator (lazy load)...")
104 self._translator = Translator( 101 self._translator = Translator(
105 - api_key=TRANSLATION_API_KEY, 102 + api_key=self.config.query_config.translation_api_key,
106 use_cache=True, 103 use_cache=True,
107 - glossary_id=None, # Can be added to query_config if needed  
108 - translation_context='e-commerce product search' 104 + glossary_id=self.config.query_config.translation_glossary_id,
  105 + translation_context=self.config.query_config.translation_context
109 ) 106 )
110 return self._translator 107 return self._translator
111 108
@@ -156,7 +153,7 @@ class QueryParser: @@ -156,7 +153,7 @@ class QueryParser:
156 153
157 # Stage 2: Query rewriting 154 # Stage 2: Query rewriting
158 rewritten = None 155 rewritten = None
159 - if REWRITE_DICTIONARY: # Enable rewrite if dictionary exists 156 + if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
160 rewritten = self.rewriter.rewrite(query_text) 157 rewritten = self.rewriter.rewrite(query_text)
161 if rewritten != query_text: 158 if rewritten != query_text:
162 log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") 159 log_info(f"查询重写 | '{query_text}' -> '{rewritten}'")
@@ -173,7 +170,7 @@ class QueryParser: @@ -173,7 +170,7 @@ class QueryParser:
173 170
174 # Stage 4: Translation 171 # Stage 4: Translation
175 translations = {} 172 translations = {}
176 - if ENABLE_TRANSLATION: 173 + if self.config.query_config.enable_translation:
177 try: 174 try:
178 # Determine target languages for translation 175 # Determine target languages for translation
179 # Simplified: always translate to Chinese and English 176 # Simplified: always translate to Chinese and English
@@ -210,19 +207,47 @@ class QueryParser: @@ -210,19 +207,47 @@ class QueryParser:
210 # Stage 5: Text embedding 207 # Stage 5: Text embedding
211 query_vector = None 208 query_vector = None
212 if (generate_vector and 209 if (generate_vector and
213 - ENABLE_TEXT_EMBEDDING and 210 + self.config.query_config.enable_text_embedding and
214 domain == "default"): # Only generate vector for default domain 211 domain == "default"): # Only generate vector for default domain
215 - try:  
216 - log_debug("开始生成查询向量")  
217 - query_vector = self.text_encoder.encode([query_text])[0]  
218 - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}")  
219 - if context:  
220 - context.store_intermediate_result('query_vector_shape', query_vector.shape)  
221 - except Exception as e:  
222 - error_msg = f"查询向量生成失败 | 错误: {str(e)}"  
223 - log_info(error_msg)  
224 - if context:  
225 - context.add_warning(error_msg) 212 + # Get thresholds from config
  213 + chinese_limit = self.config.query_config.embedding_disable_chinese_char_limit
  214 + english_limit = self.config.query_config.embedding_disable_english_word_limit
  215 +
  216 + # Check if embedding should be disabled for short queries
  217 + should_disable_embedding = False
  218 + disable_reason = None
  219 +
  220 + if detected_lang == 'zh':
  221 + # For Chinese: disable embedding if character count <= threshold
  222 + char_count = len(query_text.strip())
  223 + if char_count <= chinese_limit:
  224 + should_disable_embedding = True
  225 + disable_reason = f"中文查询字数({char_count}) <= {chinese_limit},禁用向量搜索"
  226 + log_info(disable_reason)
  227 + if context:
  228 + context.store_intermediate_result('embedding_disabled_reason', disable_reason)
  229 + else:
  230 + # For all non-Chinese queries (counted by whitespace-separated words, e.g. English): disable embedding if word count <= threshold
  231 + word_count = len(query_text.strip().split())
  232 + if word_count <= english_limit:
  233 + should_disable_embedding = True
  234 + disable_reason = f"英文查询单词数({word_count}) <= {english_limit},禁用向量搜索"
  235 + log_info(disable_reason)
  236 + if context:
  237 + context.store_intermediate_result('embedding_disabled_reason', disable_reason)
  238 +
  239 + if not should_disable_embedding:
  240 + try:
  241 + log_debug("开始生成查询向量")
  242 + query_vector = self.text_encoder.encode([query_text])[0]
  243 + log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}")
  244 + if context:
  245 + context.store_intermediate_result('query_vector_shape', query_vector.shape)
  246 + except Exception as e:
  247 + error_msg = f"查询向量生成失败 | 错误: {str(e)}"
  248 + log_info(error_msg)
  249 + if context:
  250 + context.add_warning(error_msg)
226 251
227 # Build result 252 # Build result
228 result = ParsedQuery( 253 result = ParsedQuery(
search/es_query_builder.py
@@ -11,7 +11,7 @@ Simplified architecture: @@ -11,7 +11,7 @@ Simplified architecture:
11 from typing import Dict, Any, List, Optional, Union 11 from typing import Dict, Any, List, Optional, Union
12 import numpy as np 12 import numpy as np
13 from .boolean_parser import QueryNode 13 from .boolean_parser import QueryNode
14 -from .query_config import FUNCTION_SCORE_CONFIG 14 +from config import FunctionScoreConfig
15 15
16 16
17 class ESQueryBuilder: 17 class ESQueryBuilder:
@@ -23,7 +23,8 @@ class ESQueryBuilder: @@ -23,7 +23,8 @@ class ESQueryBuilder:
23 match_fields: List[str], 23 match_fields: List[str],
24 text_embedding_field: Optional[str] = None, 24 text_embedding_field: Optional[str] = None,
25 image_embedding_field: Optional[str] = None, 25 image_embedding_field: Optional[str] = None,
26 - source_fields: Optional[List[str]] = None 26 + source_fields: Optional[List[str]] = None,
  27 + function_score_config: Optional[FunctionScoreConfig] = None
27 ): 28 ):
28 """ 29 """
29 Initialize query builder. 30 Initialize query builder.
@@ -34,12 +35,14 @@ class ESQueryBuilder: @@ -34,12 +35,14 @@ class ESQueryBuilder:
34 text_embedding_field: Field name for text embeddings 35 text_embedding_field: Field name for text embeddings
35 image_embedding_field: Field name for image embeddings 36 image_embedding_field: Field name for image embeddings
36 source_fields: Fields to return in search results (_source includes) 37 source_fields: Fields to return in search results (_source includes)
  38 + function_score_config: Function score configuration
37 """ 39 """
38 self.index_name = index_name 40 self.index_name = index_name
39 self.match_fields = match_fields 41 self.match_fields = match_fields
40 self.text_embedding_field = text_embedding_field 42 self.text_embedding_field = text_embedding_field
41 self.image_embedding_field = image_embedding_field 43 self.image_embedding_field = image_embedding_field
42 self.source_fields = source_fields 44 self.source_fields = source_fields
  45 + self.function_score_config = function_score_config
43 46
44 def build_query( 47 def build_query(
45 self, 48 self,
@@ -182,12 +185,15 @@ class ESQueryBuilder: @@ -182,12 +185,15 @@ class ESQueryBuilder:
182 return query 185 return query
183 186
184 # Build function_score query 187 # Build function_score query
  188 + score_mode = self.function_score_config.score_mode if self.function_score_config else "sum"
  189 + boost_mode = self.function_score_config.boost_mode if self.function_score_config else "multiply"
  190 +
185 function_score_query = { 191 function_score_query = {
186 "function_score": { 192 "function_score": {
187 "query": query, 193 "query": query,
188 "functions": functions, 194 "functions": functions,
189 - "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"),  
190 - "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") 195 + "score_mode": score_mode,
  196 + "boost_mode": boost_mode
191 } 197 }
192 } 198 }
193 199
@@ -201,7 +207,10 @@ class ESQueryBuilder: @@ -201,7 +207,10 @@ class ESQueryBuilder:
201 List of function score functions 207 List of function score functions
202 """ 208 """
203 functions = [] 209 functions = []
204 - config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) 210 + if not self.function_score_config:
  211 + return functions
  212 +
  213 + config_functions = self.function_score_config.functions or []
205 214
206 for func_config in config_functions: 215 for func_config in config_functions:
207 func_type = func_config.get("type") 216 func_type = func_config.get("type")
search/query_config.py deleted
@@ -1,150 +0,0 @@ @@ -1,150 +0,0 @@
1 -"""  
2 -Query configuration constants.  
3 -  
4 -Since all tenants share the same ES mapping, we can hardcode field lists here.  
5 -"""  
6 -  
7 -import os  
8 -from typing import Dict, List  
9 -  
10 -# Default index name  
11 -DEFAULT_INDEX_NAME = "search_products"  
12 -  
13 -# Text embedding field  
14 -TEXT_EMBEDDING_FIELD = "title_embedding"  
15 -  
16 -# Image embedding field  
17 -IMAGE_EMBEDDING_FIELD = "image_embedding"  
18 -  
19 -# Default match fields for text search (with boost)  
20 -# 文本召回:同时搜索中英文字段,两者相互补充  
21 -DEFAULT_MATCH_FIELDS = [  
22 - # 中文字段  
23 - "title_zh^3.0",  
24 - "brief_zh^1.5",  
25 - "description_zh^1.0",  
26 - "vendor_zh^1.5",  
27 - "category_path_zh^1.5",  
28 - "category_name_zh^1.5",  
29 - # 英文字段  
30 - "title_en^3.0",  
31 - "brief_en^1.5",  
32 - "description_en^1.0",  
33 - "vendor_en^1.5",  
34 - "category_path_en^1.5",  
35 - "category_name_en^1.5",  
36 - # 语言无关字段  
37 - "tags^1.0",  
38 -]  
39 -  
40 -# Domain-specific match fields  
41 -DOMAIN_FIELDS: Dict[str, List[str]] = {  
42 - "default": DEFAULT_MATCH_FIELDS,  
43 - "title": ["title_zh^2.0"],  
44 - "vendor": ["vendor_zh^1.5"],  
45 - "category": ["category_path_zh^1.5", "category_name_zh^1.5"],  
46 - "tags": ["tags^1.0"]  
47 -}  
48 -  
49 -# Source fields to return in search results  
50 -# 注意:为了在后端做多语言选择,_zh / _en 字段仍然需要从 ES 取出,  
51 -# 但不会原样透出给前端,而是统一映射到 title / description / vendor 等字段。  
52 -SOURCE_FIELDS = [  
53 - # 基本标识  
54 - "tenant_id",  
55 - "spu_id",  
56 - "create_time",  
57 - "update_time",  
58 -  
59 - # 多语言文本字段(仅用于后端选择,不直接返回给前端)  
60 - "title_zh",  
61 - "title_en",  
62 - "brief_zh",  
63 - "brief_en",  
64 - "description_zh",  
65 - "description_en",  
66 - "vendor_zh",  
67 - "vendor_en",  
68 - "category_path_zh",  
69 - "category_path_en",  
70 - "category_name_zh",  
71 - "category_name_en",  
72 -  
73 - # 语言无关字段(直接返回给前端)  
74 - "tags",  
75 - "image_url",  
76 - "category_id",  
77 - "category_name",  
78 - "category_level",  
79 - "category1_name",  
80 - "category2_name",  
81 - "category3_name",  
82 - "option1_name",  
83 - "option2_name",  
84 - "option3_name",  
85 - "min_price",  
86 - "max_price",  
87 - "compare_at_price",  
88 - "sku_prices",  
89 - "sku_weights",  
90 - "sku_weight_units",  
91 - "total_inventory",  
92 - "skus",  
93 - "specifications",  
94 -]  
95 -  
96 -# Query processing settings  
97 -ENABLE_TRANSLATION = os.environ.get("ENABLE_TRANSLATION", "true").lower() == "true"  
98 -ENABLE_TEXT_EMBEDDING = os.environ.get("ENABLE_TEXT_EMBEDDING", "true").lower() == "true"  
99 -TRANSLATION_API_KEY = os.environ.get("DEEPL_API_KEY")  
100 -TRANSLATION_SERVICE = "deepl"  
101 -  
102 -# Ranking expression (currently disabled)  
103 -RANKING_EXPRESSION = "bm25() + 0.2*text_embedding_relevance()"  
104 -  
105 -# Function score config  
106 -FUNCTION_SCORE_CONFIG = {  
107 - "score_mode": "sum",  
108 - "boost_mode": "multiply",  
109 - "functions": []  
110 -}  
111 -  
112 -# Load rewrite dictionary from file if exists  
113 -def load_rewrite_dictionary() -> Dict[str, str]:  
114 - """Load query rewrite dictionary from file."""  
115 - rewrite_file = os.path.join(  
116 - os.path.dirname(os.path.dirname(__file__)),  
117 - "config",  
118 - "query_rewrite.dict"  
119 - )  
120 -  
121 - if not os.path.exists(rewrite_file):  
122 - return {}  
123 -  
124 - rewrite_dict = {}  
125 - try:  
126 - with open(rewrite_file, 'r', encoding='utf-8') as f:  
127 - for line in f:  
128 - line = line.strip()  
129 - if not line or line.startswith('#'):  
130 - continue  
131 - parts = line.split('\t')  
132 - if len(parts) == 2:  
133 - rewrite_dict[parts[0].strip()] = parts[1].strip()  
134 - except Exception as e:  
135 - print(f"Warning: Failed to load rewrite dictionary: {e}")  
136 -  
137 - return rewrite_dict  
138 -  
139 -REWRITE_DICTIONARY = load_rewrite_dictionary()  
140 -  
141 -# Default facets for faceted search  
142 -# 分类分面:使用category1_name, category2_name, category3_name  
143 -# specifications分面:使用嵌套聚合,按name分组,然后按value聚合  
144 -DEFAULT_FACETS = [  
145 - "category1_name", # 一级分类  
146 - "category2_name", # 二级分类  
147 - "category3_name", # 三级分类  
148 - "specifications" # 规格分面(特殊处理:嵌套聚合)  
149 -]  
150 -  
search/searcher.py
@@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder @@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder
14 from .boolean_parser import BooleanParser, QueryNode 14 from .boolean_parser import BooleanParser, QueryNode
15 from .es_query_builder import ESQueryBuilder 15 from .es_query_builder import ESQueryBuilder
16 from .rerank_engine import RerankEngine 16 from .rerank_engine import RerankEngine
17 -from .query_config import (  
18 - DEFAULT_INDEX_NAME,  
19 - DEFAULT_MATCH_FIELDS,  
20 - TEXT_EMBEDDING_FIELD,  
21 - IMAGE_EMBEDDING_FIELD,  
22 - SOURCE_FIELDS,  
23 - ENABLE_TRANSLATION,  
24 - ENABLE_TEXT_EMBEDDING,  
25 - RANKING_EXPRESSION  
26 -) 17 +from config import SearchConfig
  18 +from config.utils import get_match_fields_for_index
27 from context.request_context import RequestContext, RequestContextStage, create_request_context 19 from context.request_context import RequestContext, RequestContextStage, create_request_context
28 from api.models import FacetResult, FacetValue 20 from api.models import FacetResult, FacetValue
29 from api.result_formatter import ResultFormatter 21 from api.result_formatter import ResultFormatter
@@ -87,37 +79,40 @@ class Searcher: @@ -87,37 +79,40 @@ class Searcher:
87 def __init__( 79 def __init__(
88 self, 80 self,
89 es_client: ESClient, 81 es_client: ESClient,
90 - query_parser: Optional[QueryParser] = None,  
91 - index_name: str = DEFAULT_INDEX_NAME 82 + config: SearchConfig,
  83 + query_parser: Optional[QueryParser] = None
92 ): 84 ):
93 """ 85 """
94 Initialize searcher. 86 Initialize searcher.
95 87
96 Args: 88 Args:
97 es_client: Elasticsearch client 89 es_client: Elasticsearch client
  90 + config: SearchConfig instance
98 query_parser: Query parser (created if not provided) 91 query_parser: Query parser (created if not provided)
99 - index_name: ES index name (default: search_products)  
100 """ 92 """
101 self.es_client = es_client 93 self.es_client = es_client
102 - self.index_name = index_name  
103 - self.query_parser = query_parser or QueryParser() 94 + self.config = config
  95 + self.index_name = config.es_index_name
  96 + self.query_parser = query_parser or QueryParser(config)
104 97
105 # Initialize components 98 # Initialize components
106 self.boolean_parser = BooleanParser() 99 self.boolean_parser = BooleanParser()
107 - self.rerank_engine = RerankEngine(RANKING_EXPRESSION, enabled=False) 100 + self.rerank_engine = RerankEngine(config.ranking.expression, enabled=False)
108 101
109 - # Use constants from query_config  
110 - self.match_fields = DEFAULT_MATCH_FIELDS  
111 - self.text_embedding_field = TEXT_EMBEDDING_FIELD  
112 - self.image_embedding_field = IMAGE_EMBEDDING_FIELD 102 + # Get match fields from config
  103 + self.match_fields = get_match_fields_for_index(config, "default")
  104 + self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding"
  105 + self.image_embedding_field = config.query_config.image_embedding_field or "image_embedding"
  106 + self.source_fields = config.query_config.source_fields or []
113 107
114 # Query builder - simplified single-layer architecture 108 # Query builder - simplified single-layer architecture
115 self.query_builder = ESQueryBuilder( 109 self.query_builder = ESQueryBuilder(
116 - index_name=index_name, 110 + index_name=self.index_name,
117 match_fields=self.match_fields, 111 match_fields=self.match_fields,
118 text_embedding_field=self.text_embedding_field, 112 text_embedding_field=self.text_embedding_field,
119 image_embedding_field=self.image_embedding_field, 113 image_embedding_field=self.image_embedding_field,
120 - source_fields=SOURCE_FIELDS 114 + source_fields=self.source_fields,
  115 + function_score_config=self.config.function_score
121 ) 116 )
122 117
123 def search( 118 def search(
@@ -162,8 +157,8 @@ class Searcher: @@ -162,8 +157,8 @@ class Searcher:
162 context = create_request_context() 157 context = create_request_context()
163 158
164 # Always use config defaults (these are backend configuration, not user parameters) 159 # Always use config defaults (these are backend configuration, not user parameters)
165 - enable_translation = ENABLE_TRANSLATION  
166 - enable_embedding = ENABLE_TEXT_EMBEDDING 160 + enable_translation = self.config.query_config.enable_translation
  161 + enable_embedding = self.config.query_config.enable_text_embedding
167 enable_rerank = False # Temporarily disabled 162 enable_rerank = False # Temporarily disabled
168 163
169 # Start timing 164 # Start timing
@@ -508,9 +503,9 @@ class Searcher: @@ -508,9 +503,9 @@ class Searcher:
508 } 503 }
509 504
510 # Add _source filtering if source_fields are configured 505 # Add _source filtering if source_fields are configured
511 - if SOURCE_FIELDS: 506 + if self.source_fields:
512 es_query["_source"] = { 507 es_query["_source"] = {
513 - "includes": SOURCE_FIELDS 508 + "includes": self.source_fields
514 } 509 }
515 510
516 if filters or range_filters: 511 if filters or range_filters:
@@ -137,8 +137,8 @@ def mock_es_client() -> Mock: @@ -137,8 +137,8 @@ def mock_es_client() -> Mock:
137 def test_searcher(sample_search_config, mock_es_client) -> Searcher: 137 def test_searcher(sample_search_config, mock_es_client) -> Searcher:
138 """测试用Searcher实例""" 138 """测试用Searcher实例"""
139 return Searcher( 139 return Searcher(
140 - config=sample_search_config,  
141 - es_client=mock_es_client 140 + es_client=mock_es_client,
  141 + config=sample_search_config
142 ) 142 )
143 143
144 144