Commit 9f96d6f3fa501c7f8ca045cca97ee7573120d8a7
1 parent
99bea633
短query不用语义搜索
query config/ranking config优化
Showing 12 changed files with 281 additions and 250 deletions
Show diff stats
api/app.py
| @@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address) | @@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address) | ||
| 41 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 41 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 42 | 42 | ||
| 43 | from config.env_config import ES_CONFIG | 43 | from config.env_config import ES_CONFIG |
| 44 | +from config import ConfigLoader | ||
| 44 | from utils import ESClient | 45 | from utils import ESClient |
| 45 | from search import Searcher | 46 | from search import Searcher |
| 46 | -from search.query_config import DEFAULT_INDEX_NAME | ||
| 47 | from query import QueryParser | 47 | from query import QueryParser |
| 48 | 48 | ||
| 49 | # Global instances | 49 | # Global instances |
| 50 | _es_client: Optional[ESClient] = None | 50 | _es_client: Optional[ESClient] = None |
| 51 | _searcher: Optional[Searcher] = None | 51 | _searcher: Optional[Searcher] = None |
| 52 | _query_parser: Optional[QueryParser] = None | 52 | _query_parser: Optional[QueryParser] = None |
| 53 | +_config = None | ||
| 53 | 54 | ||
| 54 | 55 | ||
| 55 | def init_service(es_host: str = "http://localhost:9200"): | 56 | def init_service(es_host: str = "http://localhost:9200"): |
| @@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"): | @@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"): | ||
| 59 | Args: | 60 | Args: |
| 60 | es_host: Elasticsearch host URL | 61 | es_host: Elasticsearch host URL |
| 61 | """ | 62 | """ |
| 62 | - global _es_client, _searcher, _query_parser | 63 | + global _es_client, _searcher, _query_parser, _config |
| 63 | 64 | ||
| 64 | start_time = time.time() | 65 | start_time = time.time() |
| 65 | logger.info("Initializing search service (multi-tenant)") | 66 | logger.info("Initializing search service (multi-tenant)") |
| 66 | 67 | ||
| 68 | + # Load configuration | ||
| 69 | + logger.info("Loading configuration...") | ||
| 70 | + config_loader = ConfigLoader("config/config.yaml") | ||
| 71 | + _config = config_loader.load_config() | ||
| 72 | + logger.info("Configuration loaded") | ||
| 73 | + | ||
| 67 | # Get ES credentials | 74 | # Get ES credentials |
| 68 | es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username') | 75 | es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username') |
| 69 | es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password') | 76 | es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password') |
| @@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"): | @@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"): | ||
| 81 | 88 | ||
| 82 | # Initialize components | 89 | # Initialize components |
| 83 | logger.info("Initializing query parser...") | 90 | logger.info("Initializing query parser...") |
| 84 | - _query_parser = QueryParser() | 91 | + _query_parser = QueryParser(_config) |
| 85 | 92 | ||
| 86 | logger.info("Initializing searcher...") | 93 | logger.info("Initializing searcher...") |
| 87 | - _searcher = Searcher(_es_client, _query_parser, index_name=DEFAULT_INDEX_NAME) | 94 | + _searcher = Searcher(_es_client, _config, _query_parser) |
| 88 | 95 | ||
| 89 | elapsed = time.time() - start_time | 96 | elapsed = time.time() - start_time |
| 90 | - logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {DEFAULT_INDEX_NAME}") | 97 | + logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}") |
| 91 | 98 | ||
| 92 | 99 | ||
| 93 | 100 | ||
| @@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser: | @@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser: | ||
| 113 | return _query_parser | 120 | return _query_parser |
| 114 | 121 | ||
| 115 | 122 | ||
| 123 | +def get_config(): | ||
| 124 | + """Get global config instance.""" | ||
| 125 | + if _config is None: | ||
| 126 | + raise RuntimeError("Service not initialized") | ||
| 127 | + return _config | ||
| 128 | + | ||
| 129 | + | ||
| 116 | # Create FastAPI app with enhanced configuration | 130 | # Create FastAPI app with enhanced configuration |
| 117 | app = FastAPI( | 131 | app = FastAPI( |
| 118 | title="E-Commerce Search API", | 132 | title="E-Commerce Search API", |
config/__init__.py
| @@ -23,6 +23,10 @@ from .config_loader import ( | @@ -23,6 +23,10 @@ from .config_loader import ( | ||
| 23 | RerankConfig, | 23 | RerankConfig, |
| 24 | ConfigurationError | 24 | ConfigurationError |
| 25 | ) | 25 | ) |
| 26 | +from .utils import ( | ||
| 27 | + get_match_fields_for_index, | ||
| 28 | + get_domain_fields | ||
| 29 | +) | ||
| 26 | 30 | ||
| 27 | __all__ = [ | 31 | __all__ = [ |
| 28 | # Field types | 32 | # Field types |
| @@ -46,4 +50,6 @@ __all__ = [ | @@ -46,4 +50,6 @@ __all__ = [ | ||
| 46 | 'FunctionScoreConfig', | 50 | 'FunctionScoreConfig', |
| 47 | 'RerankConfig', | 51 | 'RerankConfig', |
| 48 | 'ConfigurationError', | 52 | 'ConfigurationError', |
| 53 | + 'get_match_fields_for_index', | ||
| 54 | + 'get_domain_fields', | ||
| 49 | ] | 55 | ] |
config/config.yaml
| @@ -412,6 +412,11 @@ query_config: | @@ -412,6 +412,11 @@ query_config: | ||
| 412 | text_embedding_field: "title_embedding" # Field name for text embeddings | 412 | text_embedding_field: "title_embedding" # Field name for text embeddings |
| 413 | image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect) | 413 | image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect) |
| 414 | 414 | ||
| 415 | + # Embedding disable thresholds (disable vector search for short queries) | ||
| 416 | + embedding_disable_thresholds: | ||
| 417 | + chinese_char_limit: 4 # Disable embedding for Chinese queries with <= 4 characters | ||
| 418 | + english_word_limit: 3 # Disable embedding for English queries with <= 3 words | ||
| 419 | + | ||
| 415 | # Translation API (DeepL) | 420 | # Translation API (DeepL) |
| 416 | translation_service: "deepl" | 421 | translation_service: "deepl" |
| 417 | translation_api_key: null # Set via environment variable | 422 | translation_api_key: null # Set via environment variable |
config/config_loader.py
| @@ -58,6 +58,10 @@ class QueryConfig: | @@ -58,6 +58,10 @@ class QueryConfig: | ||
| 58 | text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding") | 58 | text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding") |
| 59 | image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding") | 59 | image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding") |
| 60 | 60 | ||
| 61 | + # Embedding disable thresholds (disable vector search for short queries) | ||
| 62 | + embedding_disable_chinese_char_limit: int = 4 # Disable embedding for Chinese queries with <= this many characters | ||
| 63 | + embedding_disable_english_word_limit: int = 3 # Disable embedding for English queries with <= this many words | ||
| 64 | + | ||
| 61 | # ES source fields configuration - fields to return in search results | 65 | # ES source fields configuration - fields to return in search results |
| 62 | # If None, auto-collect from field configs (fields with return_in_source=True) | 66 | # If None, auto-collect from field configs (fields with return_in_source=True) |
| 63 | # If empty list, return all fields. Otherwise, only return specified fields. | 67 | # If empty list, return all fields. Otherwise, only return specified fields. |
| @@ -165,15 +169,18 @@ class ConfigLoader: | @@ -165,15 +169,18 @@ class ConfigLoader: | ||
| 165 | 169 | ||
| 166 | return rewrite_dict | 170 | return rewrite_dict |
| 167 | 171 | ||
| 168 | - def load_config(self) -> SearchConfig: | 172 | + def load_config(self, validate: bool = True) -> SearchConfig: |
| 169 | """ | 173 | """ |
| 170 | Load unified configuration from YAML file. | 174 | Load unified configuration from YAML file. |
| 171 | 175 | ||
| 176 | + Args: | ||
| 177 | + validate: Whether to validate configuration after loading (default: True) | ||
| 178 | + | ||
| 172 | Returns: | 179 | Returns: |
| 173 | SearchConfig object | 180 | SearchConfig object |
| 174 | 181 | ||
| 175 | Raises: | 182 | Raises: |
| 176 | - ConfigurationError: If config file not found or invalid | 183 | + ConfigurationError: If config file not found, invalid, or validation fails |
| 177 | """ | 184 | """ |
| 178 | if not self.config_file.exists(): | 185 | if not self.config_file.exists(): |
| 179 | raise ConfigurationError(f"Configuration file not found: {self.config_file}") | 186 | raise ConfigurationError(f"Configuration file not found: {self.config_file}") |
| @@ -184,7 +191,16 @@ class ConfigLoader: | @@ -184,7 +191,16 @@ class ConfigLoader: | ||
| 184 | except yaml.YAMLError as e: | 191 | except yaml.YAMLError as e: |
| 185 | raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") | 192 | raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") |
| 186 | 193 | ||
| 187 | - return self._parse_config(config_data) | 194 | + config = self._parse_config(config_data) |
| 195 | + | ||
| 196 | + # Auto-validate configuration | ||
| 197 | + if validate: | ||
| 198 | + errors = self.validate_config(config) | ||
| 199 | + if errors: | ||
| 200 | + error_msg = "Configuration validation failed:\n" + "\n".join(f" - {err}" for err in errors) | ||
| 201 | + raise ConfigurationError(error_msg) | ||
| 202 | + | ||
| 203 | + return config | ||
| 188 | 204 | ||
| 189 | def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig: | 205 | def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig: |
| 190 | """Parse configuration dictionary into SearchConfig object.""" | 206 | """Parse configuration dictionary into SearchConfig object.""" |
| @@ -214,43 +230,48 @@ class ConfigLoader: | @@ -214,43 +230,48 @@ class ConfigLoader: | ||
| 214 | if field.return_in_source | 230 | if field.return_in_source |
| 215 | ] | 231 | ] |
| 216 | 232 | ||
| 233 | + # Parse embedding disable thresholds | ||
| 234 | + embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {}) | ||
| 235 | + | ||
| 217 | query_config = QueryConfig( | 236 | query_config = QueryConfig( |
| 218 | - supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), | ||
| 219 | - default_language=query_config_data.get("default_language", "zh"), | 237 | + supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], |
| 238 | + default_language=query_config_data.get("default_language") or "zh", | ||
| 220 | enable_translation=query_config_data.get("enable_translation", True), | 239 | enable_translation=query_config_data.get("enable_translation", True), |
| 221 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), | 240 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), |
| 222 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), | 241 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), |
| 223 | rewrite_dictionary=rewrite_dictionary, | 242 | rewrite_dictionary=rewrite_dictionary, |
| 224 | translation_api_key=query_config_data.get("translation_api_key"), | 243 | translation_api_key=query_config_data.get("translation_api_key"), |
| 225 | - translation_service=query_config_data.get("translation_service", "deepl"), | 244 | + translation_service=query_config_data.get("translation_service") or "deepl", |
| 226 | translation_glossary_id=query_config_data.get("translation_glossary_id"), | 245 | translation_glossary_id=query_config_data.get("translation_glossary_id"), |
| 227 | - translation_context=query_config_data.get("translation_context", "e-commerce product search"), | 246 | + translation_context=query_config_data.get("translation_context") or "e-commerce product search", |
| 228 | text_embedding_field=query_config_data.get("text_embedding_field"), | 247 | text_embedding_field=query_config_data.get("text_embedding_field"), |
| 229 | image_embedding_field=query_config_data.get("image_embedding_field"), | 248 | image_embedding_field=query_config_data.get("image_embedding_field"), |
| 249 | + embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), | ||
| 250 | + embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), | ||
| 230 | source_fields=source_fields | 251 | source_fields=source_fields |
| 231 | ) | 252 | ) |
| 232 | 253 | ||
| 233 | # Parse ranking config | 254 | # Parse ranking config |
| 234 | ranking_data = config_data.get("ranking", {}) | 255 | ranking_data = config_data.get("ranking", {}) |
| 235 | ranking = RankingConfig( | 256 | ranking = RankingConfig( |
| 236 | - expression=ranking_data.get("expression", "bm25() + 0.2*text_embedding_relevance()"), | ||
| 237 | - description=ranking_data.get("description", "Default BM25 + text embedding ranking") | 257 | + expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()", |
| 258 | + description=ranking_data.get("description") or "Default BM25 + text embedding ranking" | ||
| 238 | ) | 259 | ) |
| 239 | 260 | ||
| 240 | # Parse Function Score configuration | 261 | # Parse Function Score configuration |
| 241 | fs_data = config_data.get("function_score", {}) | 262 | fs_data = config_data.get("function_score", {}) |
| 242 | function_score = FunctionScoreConfig( | 263 | function_score = FunctionScoreConfig( |
| 243 | - score_mode=fs_data.get("score_mode", "sum"), | ||
| 244 | - boost_mode=fs_data.get("boost_mode", "multiply"), | ||
| 245 | - functions=fs_data.get("functions", []) | 264 | + score_mode=fs_data.get("score_mode") or "sum", |
| 265 | + boost_mode=fs_data.get("boost_mode") or "multiply", | ||
| 266 | + functions=fs_data.get("functions") or [] | ||
| 246 | ) | 267 | ) |
| 247 | 268 | ||
| 248 | # Parse Rerank configuration | 269 | # Parse Rerank configuration |
| 249 | rerank_data = config_data.get("rerank", {}) | 270 | rerank_data = config_data.get("rerank", {}) |
| 250 | rerank = RerankConfig( | 271 | rerank = RerankConfig( |
| 251 | enabled=rerank_data.get("enabled", False), | 272 | enabled=rerank_data.get("enabled", False), |
| 252 | - expression=rerank_data.get("expression", ""), | ||
| 253 | - description=rerank_data.get("description", "") | 273 | + expression=rerank_data.get("expression") or "", |
| 274 | + description=rerank_data.get("description") or "" | ||
| 254 | ) | 275 | ) |
| 255 | 276 | ||
| 256 | # Parse SPU config | 277 | # Parse SPU config |
| @@ -447,21 +468,43 @@ class ConfigLoader: | @@ -447,21 +468,43 @@ class ConfigLoader: | ||
| 447 | output_path = Path(output_path) | 468 | output_path = Path(output_path) |
| 448 | 469 | ||
| 449 | # Convert config back to dictionary format | 470 | # Convert config back to dictionary format |
| 471 | + query_config_dict = { | ||
| 472 | + "supported_languages": config.query_config.supported_languages, | ||
| 473 | + "default_language": config.query_config.default_language, | ||
| 474 | + "enable_translation": config.query_config.enable_translation, | ||
| 475 | + "enable_text_embedding": config.query_config.enable_text_embedding, | ||
| 476 | + "enable_query_rewrite": config.query_config.enable_query_rewrite, | ||
| 477 | + "translation_service": config.query_config.translation_service, | ||
| 478 | + } | ||
| 479 | + | ||
| 480 | + # Add optional fields only if they are set | ||
| 481 | + if config.query_config.translation_api_key: | ||
| 482 | + query_config_dict["translation_api_key"] = config.query_config.translation_api_key | ||
| 483 | + if config.query_config.translation_glossary_id: | ||
| 484 | + query_config_dict["translation_glossary_id"] = config.query_config.translation_glossary_id | ||
| 485 | + if config.query_config.translation_context: | ||
| 486 | + query_config_dict["translation_context"] = config.query_config.translation_context | ||
| 487 | + if config.query_config.text_embedding_field: | ||
| 488 | + query_config_dict["text_embedding_field"] = config.query_config.text_embedding_field | ||
| 489 | + if config.query_config.image_embedding_field: | ||
| 490 | + query_config_dict["image_embedding_field"] = config.query_config.image_embedding_field | ||
| 491 | + if config.query_config.source_fields: | ||
| 492 | + query_config_dict["source_fields"] = config.query_config.source_fields | ||
| 493 | + | ||
| 494 | + # Add embedding disable thresholds | ||
| 495 | + if (config.query_config.embedding_disable_chinese_char_limit != 4 or | ||
| 496 | + config.query_config.embedding_disable_english_word_limit != 3): | ||
| 497 | + query_config_dict["embedding_disable_thresholds"] = { | ||
| 498 | + "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit, | ||
| 499 | + "english_word_limit": config.query_config.embedding_disable_english_word_limit | ||
| 500 | + } | ||
| 501 | + | ||
| 450 | config_dict = { | 502 | config_dict = { |
| 451 | "es_index_name": config.es_index_name, | 503 | "es_index_name": config.es_index_name, |
| 452 | "es_settings": config.es_settings, | 504 | "es_settings": config.es_settings, |
| 453 | "fields": [self._field_to_dict(field) for field in config.fields], | 505 | "fields": [self._field_to_dict(field) for field in config.fields], |
| 454 | "indexes": [self._index_to_dict(index) for index in config.indexes], | 506 | "indexes": [self._index_to_dict(index) for index in config.indexes], |
| 455 | - "query_config": { | ||
| 456 | - "supported_languages": config.query_config.supported_languages, | ||
| 457 | - "default_language": config.query_config.default_language, | ||
| 458 | - "enable_translation": config.query_config.enable_translation, | ||
| 459 | - "enable_text_embedding": config.query_config.enable_text_embedding, | ||
| 460 | - "enable_query_rewrite": config.query_config.enable_query_rewrite, | ||
| 461 | - # rewrite_dictionary is stored in separate file, not in config | ||
| 462 | - "translation_api_key": config.query_config.translation_api_key, | ||
| 463 | - "translation_service": config.query_config.translation_service, | ||
| 464 | - }, | 507 | + "query_config": query_config_dict, |
| 465 | "ranking": { | 508 | "ranking": { |
| 466 | "expression": config.ranking.expression, | 509 | "expression": config.ranking.expression, |
| 467 | "description": config.ranking.description | 510 | "description": config.ranking.description |
| @@ -505,7 +548,7 @@ class ConfigLoader: | @@ -505,7 +548,7 @@ class ConfigLoader: | ||
| 505 | f.write(f"{key}\t{value}\n") | 548 | f.write(f"{key}\t{value}\n") |
| 506 | 549 | ||
| 507 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: | 550 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: |
| 508 | - """Convert FieldConfig to dictionary.""" | 551 | + """Convert FieldConfig to dictionary, preserving all fields.""" |
| 509 | result = { | 552 | result = { |
| 510 | "name": field.name, | 553 | "name": field.name, |
| 511 | "type": field.field_type.value, | 554 | "type": field.field_type.value, |
| @@ -513,36 +556,49 @@ class ConfigLoader: | @@ -513,36 +556,49 @@ class ConfigLoader: | ||
| 513 | "boost": field.boost, | 556 | "boost": field.boost, |
| 514 | "store": field.store, | 557 | "store": field.store, |
| 515 | "index": field.index, | 558 | "index": field.index, |
| 559 | + "return_in_source": field.return_in_source, | ||
| 516 | } | 560 | } |
| 517 | 561 | ||
| 562 | + # Add optional fields only if they differ from defaults or are set | ||
| 518 | if field.analyzer: | 563 | if field.analyzer: |
| 519 | result["analyzer"] = field.analyzer.value | 564 | result["analyzer"] = field.analyzer.value |
| 520 | if field.search_analyzer: | 565 | if field.search_analyzer: |
| 521 | result["search_analyzer"] = field.search_analyzer.value | 566 | result["search_analyzer"] = field.search_analyzer.value |
| 522 | if field.multi_language: | 567 | if field.multi_language: |
| 523 | result["multi_language"] = field.multi_language | 568 | result["multi_language"] = field.multi_language |
| 524 | - result["languages"] = field.languages | 569 | + if field.languages: |
| 570 | + result["languages"] = field.languages | ||
| 525 | if field.embedding_dims != 1024: | 571 | if field.embedding_dims != 1024: |
| 526 | result["embedding_dims"] = field.embedding_dims | 572 | result["embedding_dims"] = field.embedding_dims |
| 527 | if field.embedding_similarity != "dot_product": | 573 | if field.embedding_similarity != "dot_product": |
| 528 | result["embedding_similarity"] = field.embedding_similarity | 574 | result["embedding_similarity"] = field.embedding_similarity |
| 529 | if field.nested: | 575 | if field.nested: |
| 530 | result["nested"] = field.nested | 576 | result["nested"] = field.nested |
| 531 | - result["nested_properties"] = field.nested_properties | 577 | + if field.nested_properties: |
| 578 | + result["nested_properties"] = field.nested_properties | ||
| 579 | + if field.keyword_subfield: | ||
| 580 | + result["keyword_subfield"] = field.keyword_subfield | ||
| 581 | + if field.keyword_ignore_above != 256: | ||
| 582 | + result["keyword_ignore_above"] = field.keyword_ignore_above | ||
| 583 | + if field.keyword_normalizer: | ||
| 584 | + result["keyword_normalizer"] = field.keyword_normalizer | ||
| 532 | 585 | ||
| 533 | return result | 586 | return result |
| 534 | 587 | ||
| 535 | def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: | 588 | def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: |
| 536 | - """Convert IndexConfig to dictionary.""" | 589 | + """Convert IndexConfig to dictionary, preserving all fields.""" |
| 537 | result = { | 590 | result = { |
| 538 | "name": index.name, | 591 | "name": index.name, |
| 539 | "label": index.label, | 592 | "label": index.label, |
| 540 | "fields": index.fields, | 593 | "fields": index.fields, |
| 541 | "analyzer": index.analyzer.value, | 594 | "analyzer": index.analyzer.value, |
| 542 | - "boost": index.boost, | ||
| 543 | - "example": index.example | ||
| 544 | } | 595 | } |
| 545 | - | 596 | + |
| 597 | + # Add optional fields only if they differ from defaults or are set | ||
| 598 | + if index.boost != 1.0: | ||
| 599 | + result["boost"] = index.boost | ||
| 600 | + if index.example: | ||
| 601 | + result["example"] = index.example | ||
| 546 | if index.language_field_mapping: | 602 | if index.language_field_mapping: |
| 547 | result["language_field_mapping"] = index.language_field_mapping | 603 | result["language_field_mapping"] = index.language_field_mapping |
| 548 | 604 |
config/utils.py
| @@ -0,0 +1,70 @@ | @@ -0,0 +1,70 @@ | ||
| 1 | +""" | ||
| 2 | +Configuration utility functions. | ||
| 3 | + | ||
| 4 | +Helper functions for working with SearchConfig objects. | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from typing import Dict, List | ||
| 8 | +from .config_loader import SearchConfig | ||
| 9 | + | ||
| 10 | + | ||
| 11 | +def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]: | ||
| 12 | + """ | ||
| 13 | + Generate match fields list with boost from IndexConfig and FieldConfig. | ||
| 14 | + | ||
| 15 | + Args: | ||
| 16 | + config: SearchConfig instance | ||
| 17 | + index_name: Name of the index domain (default: "default") | ||
| 18 | + | ||
| 19 | + Returns: | ||
| 20 | + List of field names with boost, e.g., ["title_zh^3.0", "brief_zh^1.5"] | ||
| 21 | + """ | ||
| 22 | + # Find the index config | ||
| 23 | + index_config = None | ||
| 24 | + for idx in config.indexes: | ||
| 25 | + if idx.name == index_name: | ||
| 26 | + index_config = idx | ||
| 27 | + break | ||
| 28 | + | ||
| 29 | + if not index_config: | ||
| 30 | + return [] | ||
| 31 | + | ||
| 32 | + # Create a field name to FieldConfig mapping | ||
| 33 | + field_map = {field.name: field for field in config.fields} | ||
| 34 | + | ||
| 35 | + # Generate match fields with boost | ||
| 36 | + match_fields = [] | ||
| 37 | + for field_name in index_config.fields: | ||
| 38 | + field_config = field_map.get(field_name) | ||
| 39 | + if field_config: | ||
| 40 | + # Combine index boost and field boost | ||
| 41 | + total_boost = index_config.boost * field_config.boost | ||
| 42 | + if total_boost != 1.0: | ||
| 43 | + match_fields.append(f"{field_name}^{total_boost}") | ||
| 44 | + else: | ||
| 45 | + match_fields.append(field_name) | ||
| 46 | + else: | ||
| 47 | + # Field not found in config, use index boost only | ||
| 48 | + if index_config.boost != 1.0: | ||
| 49 | + match_fields.append(f"{field_name}^{index_config.boost}") | ||
| 50 | + else: | ||
| 51 | + match_fields.append(field_name) | ||
| 52 | + | ||
| 53 | + return match_fields | ||
| 54 | + | ||
| 55 | + | ||
| 56 | +def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]: | ||
| 57 | + """ | ||
| 58 | + Generate domain-specific match fields from all index configs. | ||
| 59 | + | ||
| 60 | + Args: | ||
| 61 | + config: SearchConfig instance | ||
| 62 | + | ||
| 63 | + Returns: | ||
| 64 | + Dictionary mapping domain name to list of match fields | ||
| 65 | + """ | ||
| 66 | + domain_fields = {} | ||
| 67 | + for index_config in config.indexes: | ||
| 68 | + domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name) | ||
| 69 | + return domain_fields | ||
| 70 | + |
frontend/index.html
| @@ -100,9 +100,10 @@ | @@ -100,9 +100,10 @@ | ||
| 100 | 100 | ||
| 101 | <div class="sort-right"> | 101 | <div class="sort-right"> |
| 102 | <select id="resultSize" onchange="performSearch()"> | 102 | <select id="resultSize" onchange="performSearch()"> |
| 103 | - <option value="10">10 per page</option> | ||
| 104 | - <option value="20" selected>20 per page</option> | ||
| 105 | - <option value="50">50 per page</option> | 103 | + <option value="20">20 per page</option> |
| 104 | + <option value="50" selected>50 per page</option> | ||
| | <option value="100">100 per page</option> | 105 | + <option value="100">100 per page</option> | ||
| | <option value="200">200 per page</option> | 106 | + <option value="200">200 per page</option> | ||
| 106 | </select> | 107 | </select> |
| 107 | </div> | 108 | </div> |
| 108 | </div> | 109 | </div> |
main.py
| @@ -93,7 +93,7 @@ def cmd_search(args): | @@ -93,7 +93,7 @@ def cmd_search(args): | ||
| 93 | 93 | ||
| 94 | from query import QueryParser | 94 | from query import QueryParser |
| 95 | query_parser = QueryParser(config) | 95 | query_parser = QueryParser(config) |
| 96 | - searcher = Searcher(config, es_client, query_parser) | 96 | + searcher = Searcher(es_client, config, query_parser) |
| 97 | 97 | ||
| 98 | # Execute search | 98 | # Execute search |
| 99 | print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})") | 99 | print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})") |
query/query_parser.py
| @@ -9,13 +9,7 @@ import numpy as np | @@ -9,13 +9,7 @@ import numpy as np | ||
| 9 | import logging | 9 | import logging |
| 10 | 10 | ||
| 11 | from embeddings import BgeEncoder | 11 | from embeddings import BgeEncoder |
| 12 | -from search.query_config import ( | ||
| 13 | - ENABLE_TEXT_EMBEDDING, | ||
| 14 | - ENABLE_TRANSLATION, | ||
| 15 | - REWRITE_DICTIONARY, | ||
| 16 | - TRANSLATION_API_KEY, | ||
| 17 | - TRANSLATION_SERVICE | ||
| 18 | -) | 12 | +from config import SearchConfig |
| 19 | from .language_detector import LanguageDetector | 13 | from .language_detector import LanguageDetector |
| 20 | from .translator import Translator | 14 | from .translator import Translator |
| 21 | from .query_rewriter import QueryRewriter, QueryNormalizer | 15 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| @@ -70,6 +64,7 @@ class QueryParser: | @@ -70,6 +64,7 @@ class QueryParser: | ||
| 70 | 64 | ||
| 71 | def __init__( | 65 | def __init__( |
| 72 | self, | 66 | self, |
| 67 | + config: SearchConfig, | ||
| 73 | text_encoder: Optional[BgeEncoder] = None, | 68 | text_encoder: Optional[BgeEncoder] = None, |
| 74 | translator: Optional[Translator] = None | 69 | translator: Optional[Translator] = None |
| 75 | ): | 70 | ): |
| @@ -77,21 +72,23 @@ class QueryParser: | @@ -77,21 +72,23 @@ class QueryParser: | ||
| 77 | Initialize query parser. | 72 | Initialize query parser. |
| 78 | 73 | ||
| 79 | Args: | 74 | Args: |
| 75 | + config: SearchConfig instance | ||
| 80 | text_encoder: Text embedding encoder (lazy loaded if not provided) | 76 | text_encoder: Text embedding encoder (lazy loaded if not provided) |
| 81 | translator: Translator instance (lazy loaded if not provided) | 77 | translator: Translator instance (lazy loaded if not provided) |
| 82 | """ | 78 | """ |
| 79 | + self.config = config | ||
| 83 | self._text_encoder = text_encoder | 80 | self._text_encoder = text_encoder |
| 84 | self._translator = translator | 81 | self._translator = translator |
| 85 | 82 | ||
| 86 | # Initialize components | 83 | # Initialize components |
| 87 | self.normalizer = QueryNormalizer() | 84 | self.normalizer = QueryNormalizer() |
| 88 | self.language_detector = LanguageDetector() | 85 | self.language_detector = LanguageDetector() |
| 89 | - self.rewriter = QueryRewriter(REWRITE_DICTIONARY) | 86 | + self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) |
| 90 | 87 | ||
| 91 | @property | 88 | @property |
| 92 | def text_encoder(self) -> BgeEncoder: | 89 | def text_encoder(self) -> BgeEncoder: |
| 93 | """Lazy load text encoder.""" | 90 | """Lazy load text encoder.""" |
| 94 | - if self._text_encoder is None and ENABLE_TEXT_EMBEDDING: | 91 | + if self._text_encoder is None and self.config.query_config.enable_text_embedding: |
| 95 | logger.info("Initializing text encoder (lazy load)...") | 92 | logger.info("Initializing text encoder (lazy load)...") |
| 96 | self._text_encoder = BgeEncoder() | 93 | self._text_encoder = BgeEncoder() |
| 97 | return self._text_encoder | 94 | return self._text_encoder |
| @@ -99,13 +96,13 @@ class QueryParser: | @@ -99,13 +96,13 @@ class QueryParser: | ||
| 99 | @property | 96 | @property |
| 100 | def translator(self) -> Translator: | 97 | def translator(self) -> Translator: |
| 101 | """Lazy load translator.""" | 98 | """Lazy load translator.""" |
| 102 | - if self._translator is None and ENABLE_TRANSLATION: | 99 | + if self._translator is None and self.config.query_config.enable_translation: |
| 103 | logger.info("Initializing translator (lazy load)...") | 100 | logger.info("Initializing translator (lazy load)...") |
| 104 | self._translator = Translator( | 101 | self._translator = Translator( |
| 105 | - api_key=TRANSLATION_API_KEY, | 102 | + api_key=self.config.query_config.translation_api_key, |
| 106 | use_cache=True, | 103 | use_cache=True, |
| 107 | - glossary_id=None, # Can be added to query_config if needed | ||
| 108 | - translation_context='e-commerce product search' | 104 | + glossary_id=self.config.query_config.translation_glossary_id, |
| 105 | + translation_context=self.config.query_config.translation_context | ||
| 109 | ) | 106 | ) |
| 110 | return self._translator | 107 | return self._translator |
| 111 | 108 | ||
| @@ -156,7 +153,7 @@ class QueryParser: | @@ -156,7 +153,7 @@ class QueryParser: | ||
| 156 | 153 | ||
| 157 | # Stage 2: Query rewriting | 154 | # Stage 2: Query rewriting |
| 158 | rewritten = None | 155 | rewritten = None |
| 159 | - if REWRITE_DICTIONARY: # Enable rewrite if dictionary exists | 156 | + if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists |
| 160 | rewritten = self.rewriter.rewrite(query_text) | 157 | rewritten = self.rewriter.rewrite(query_text) |
| 161 | if rewritten != query_text: | 158 | if rewritten != query_text: |
| 162 | log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") | 159 | log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") |
| @@ -173,7 +170,7 @@ class QueryParser: | @@ -173,7 +170,7 @@ class QueryParser: | ||
| 173 | 170 | ||
| 174 | # Stage 4: Translation | 171 | # Stage 4: Translation |
| 175 | translations = {} | 172 | translations = {} |
| 176 | - if ENABLE_TRANSLATION: | 173 | + if self.config.query_config.enable_translation: |
| 177 | try: | 174 | try: |
| 178 | # Determine target languages for translation | 175 | # Determine target languages for translation |
| 179 | # Simplified: always translate to Chinese and English | 176 | # Simplified: always translate to Chinese and English |
| @@ -210,19 +207,47 @@ class QueryParser: | @@ -210,19 +207,47 @@ class QueryParser: | ||
| 210 | # Stage 5: Text embedding | 207 | # Stage 5: Text embedding |
| 211 | query_vector = None | 208 | query_vector = None |
| 212 | if (generate_vector and | 209 | if (generate_vector and |
| 213 | - ENABLE_TEXT_EMBEDDING and | 210 | + self.config.query_config.enable_text_embedding and |
| 214 | domain == "default"): # Only generate vector for default domain | 211 | domain == "default"): # Only generate vector for default domain |
| 215 | - try: | ||
| 216 | - log_debug("开始生成查询向量") | ||
| 217 | - query_vector = self.text_encoder.encode([query_text])[0] | ||
| 218 | - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") | ||
| 219 | - if context: | ||
| 220 | - context.store_intermediate_result('query_vector_shape', query_vector.shape) | ||
| 221 | - except Exception as e: | ||
| 222 | - error_msg = f"查询向量生成失败 | 错误: {str(e)}" | ||
| 223 | - log_info(error_msg) | ||
| 224 | - if context: | ||
| 225 | - context.add_warning(error_msg) | 212 | + # Get thresholds from config |
| 213 | + chinese_limit = self.config.query_config.embedding_disable_chinese_char_limit | ||
| 214 | + english_limit = self.config.query_config.embedding_disable_english_word_limit | ||
| 215 | + | ||
| 216 | + # Check if embedding should be disabled for short queries | ||
| 217 | + should_disable_embedding = False | ||
| 218 | + disable_reason = None | ||
| 219 | + | ||
| 220 | + if detected_lang == 'zh': | ||
| 221 | + # For Chinese: disable embedding if character count <= threshold | ||
| 222 | + char_count = len(query_text.strip()) | ||
| 223 | + if char_count <= chinese_limit: | ||
| 224 | + should_disable_embedding = True | ||
| 225 | + disable_reason = f"中文查询字数({char_count}) <= {chinese_limit},禁用向量搜索" | ||
| 226 | + log_info(disable_reason) | ||
| 227 | + if context: | ||
| 228 | + context.store_intermediate_result('embedding_disabled_reason', disable_reason) | ||
| 229 | + else: | ||
| 230 | + # For English: disable embedding if word count <= threshold | ||
| 231 | + word_count = len(query_text.strip().split()) | ||
| 232 | + if word_count <= english_limit: | ||
| 233 | + should_disable_embedding = True | ||
| 234 | + disable_reason = f"英文查询单词数({word_count}) <= {english_limit},禁用向量搜索" | ||
| 235 | + log_info(disable_reason) | ||
| 236 | + if context: | ||
| 237 | + context.store_intermediate_result('embedding_disabled_reason', disable_reason) | ||
| 238 | + | ||
| 239 | + if not should_disable_embedding: | ||
| 240 | + try: | ||
| 241 | + log_debug("开始生成查询向量") | ||
| 242 | + query_vector = self.text_encoder.encode([query_text])[0] | ||
| 243 | + log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") | ||
| 244 | + if context: | ||
| 245 | + context.store_intermediate_result('query_vector_shape', query_vector.shape) | ||
| 246 | + except Exception as e: | ||
| 247 | + error_msg = f"查询向量生成失败 | 错误: {str(e)}" | ||
| 248 | + log_info(error_msg) | ||
| 249 | + if context: | ||
| 250 | + context.add_warning(error_msg) | ||
| 226 | 251 | ||
| 227 | # Build result | 252 | # Build result |
| 228 | result = ParsedQuery( | 253 | result = ParsedQuery( |
search/es_query_builder.py
| @@ -11,7 +11,7 @@ Simplified architecture: | @@ -11,7 +11,7 @@ Simplified architecture: | ||
| 11 | from typing import Dict, Any, List, Optional, Union | 11 | from typing import Dict, Any, List, Optional, Union |
| 12 | import numpy as np | 12 | import numpy as np |
| 13 | from .boolean_parser import QueryNode | 13 | from .boolean_parser import QueryNode |
| 14 | -from .query_config import FUNCTION_SCORE_CONFIG | 14 | +from config import FunctionScoreConfig |
| 15 | 15 | ||
| 16 | 16 | ||
| 17 | class ESQueryBuilder: | 17 | class ESQueryBuilder: |
| @@ -23,7 +23,8 @@ class ESQueryBuilder: | @@ -23,7 +23,8 @@ class ESQueryBuilder: | ||
| 23 | match_fields: List[str], | 23 | match_fields: List[str], |
| 24 | text_embedding_field: Optional[str] = None, | 24 | text_embedding_field: Optional[str] = None, |
| 25 | image_embedding_field: Optional[str] = None, | 25 | image_embedding_field: Optional[str] = None, |
| 26 | - source_fields: Optional[List[str]] = None | 26 | + source_fields: Optional[List[str]] = None, |
| 27 | + function_score_config: Optional[FunctionScoreConfig] = None | ||
| 27 | ): | 28 | ): |
| 28 | """ | 29 | """ |
| 29 | Initialize query builder. | 30 | Initialize query builder. |
| @@ -34,12 +35,14 @@ class ESQueryBuilder: | @@ -34,12 +35,14 @@ class ESQueryBuilder: | ||
| 34 | text_embedding_field: Field name for text embeddings | 35 | text_embedding_field: Field name for text embeddings |
| 35 | image_embedding_field: Field name for image embeddings | 36 | image_embedding_field: Field name for image embeddings |
| 36 | source_fields: Fields to return in search results (_source includes) | 37 | source_fields: Fields to return in search results (_source includes) |
| 38 | + function_score_config: Function score configuration | ||
| 37 | """ | 39 | """ |
| 38 | self.index_name = index_name | 40 | self.index_name = index_name |
| 39 | self.match_fields = match_fields | 41 | self.match_fields = match_fields |
| 40 | self.text_embedding_field = text_embedding_field | 42 | self.text_embedding_field = text_embedding_field |
| 41 | self.image_embedding_field = image_embedding_field | 43 | self.image_embedding_field = image_embedding_field |
| 42 | self.source_fields = source_fields | 44 | self.source_fields = source_fields |
| 45 | + self.function_score_config = function_score_config | ||
| 43 | 46 | ||
| 44 | def build_query( | 47 | def build_query( |
| 45 | self, | 48 | self, |
| @@ -182,12 +185,15 @@ class ESQueryBuilder: | @@ -182,12 +185,15 @@ class ESQueryBuilder: | ||
| 182 | return query | 185 | return query |
| 183 | 186 | ||
| 184 | # Build function_score query | 187 | # Build function_score query |
| 188 | + score_mode = self.function_score_config.score_mode if self.function_score_config else "sum" | ||
| 189 | + boost_mode = self.function_score_config.boost_mode if self.function_score_config else "multiply" | ||
| 190 | + | ||
| 185 | function_score_query = { | 191 | function_score_query = { |
| 186 | "function_score": { | 192 | "function_score": { |
| 187 | "query": query, | 193 | "query": query, |
| 188 | "functions": functions, | 194 | "functions": functions, |
| 189 | - "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"), | ||
| 190 | - "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") | 195 | + "score_mode": score_mode, |
| 196 | + "boost_mode": boost_mode | ||
| 191 | } | 197 | } |
| 192 | } | 198 | } |
| 193 | 199 | ||
| @@ -201,7 +207,10 @@ class ESQueryBuilder: | @@ -201,7 +207,10 @@ class ESQueryBuilder: | ||
| 201 | List of function score functions | 207 | List of function score functions |
| 202 | """ | 208 | """ |
| 203 | functions = [] | 209 | functions = [] |
| 204 | - config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) | 210 | + if not self.function_score_config: |
| 211 | + return functions | ||
| 212 | + | ||
| 213 | + config_functions = self.function_score_config.functions or [] | ||
| 205 | 214 | ||
| 206 | for func_config in config_functions: | 215 | for func_config in config_functions: |
| 207 | func_type = func_config.get("type") | 216 | func_type = func_config.get("type") |
search/query_config.py deleted
| @@ -1,150 +0,0 @@ | @@ -1,150 +0,0 @@ | ||
| 1 | -""" | ||
| 2 | -Query configuration constants. | ||
| 3 | - | ||
| 4 | -Since all tenants share the same ES mapping, we can hardcode field lists here. | ||
| 5 | -""" | ||
| 6 | - | ||
| 7 | -import os | ||
| 8 | -from typing import Dict, List | ||
| 9 | - | ||
| 10 | -# Default index name | ||
| 11 | -DEFAULT_INDEX_NAME = "search_products" | ||
| 12 | - | ||
| 13 | -# Text embedding field | ||
| 14 | -TEXT_EMBEDDING_FIELD = "title_embedding" | ||
| 15 | - | ||
| 16 | -# Image embedding field | ||
| 17 | -IMAGE_EMBEDDING_FIELD = "image_embedding" | ||
| 18 | - | ||
| 19 | -# Default match fields for text search (with boost) | ||
| 20 | -# 文本召回:同时搜索中英文字段,两者相互补充 | ||
| 21 | -DEFAULT_MATCH_FIELDS = [ | ||
| 22 | - # 中文字段 | ||
| 23 | - "title_zh^3.0", | ||
| 24 | - "brief_zh^1.5", | ||
| 25 | - "description_zh^1.0", | ||
| 26 | - "vendor_zh^1.5", | ||
| 27 | - "category_path_zh^1.5", | ||
| 28 | - "category_name_zh^1.5", | ||
| 29 | - # 英文字段 | ||
| 30 | - "title_en^3.0", | ||
| 31 | - "brief_en^1.5", | ||
| 32 | - "description_en^1.0", | ||
| 33 | - "vendor_en^1.5", | ||
| 34 | - "category_path_en^1.5", | ||
| 35 | - "category_name_en^1.5", | ||
| 36 | - # 语言无关字段 | ||
| 37 | - "tags^1.0", | ||
| 38 | -] | ||
| 39 | - | ||
| 40 | -# Domain-specific match fields | ||
| 41 | -DOMAIN_FIELDS: Dict[str, List[str]] = { | ||
| 42 | - "default": DEFAULT_MATCH_FIELDS, | ||
| 43 | - "title": ["title_zh^2.0"], | ||
| 44 | - "vendor": ["vendor_zh^1.5"], | ||
| 45 | - "category": ["category_path_zh^1.5", "category_name_zh^1.5"], | ||
| 46 | - "tags": ["tags^1.0"] | ||
| 47 | -} | ||
| 48 | - | ||
| 49 | -# Source fields to return in search results | ||
| 50 | -# 注意:为了在后端做多语言选择,_zh / _en 字段仍然需要从 ES 取出, | ||
| 51 | -# 但不会原样透出给前端,而是统一映射到 title / description / vendor 等字段。 | ||
| 52 | -SOURCE_FIELDS = [ | ||
| 53 | - # 基本标识 | ||
| 54 | - "tenant_id", | ||
| 55 | - "spu_id", | ||
| 56 | - "create_time", | ||
| 57 | - "update_time", | ||
| 58 | - | ||
| 59 | - # 多语言文本字段(仅用于后端选择,不直接返回给前端) | ||
| 60 | - "title_zh", | ||
| 61 | - "title_en", | ||
| 62 | - "brief_zh", | ||
| 63 | - "brief_en", | ||
| 64 | - "description_zh", | ||
| 65 | - "description_en", | ||
| 66 | - "vendor_zh", | ||
| 67 | - "vendor_en", | ||
| 68 | - "category_path_zh", | ||
| 69 | - "category_path_en", | ||
| 70 | - "category_name_zh", | ||
| 71 | - "category_name_en", | ||
| 72 | - | ||
| 73 | - # 语言无关字段(直接返回给前端) | ||
| 74 | - "tags", | ||
| 75 | - "image_url", | ||
| 76 | - "category_id", | ||
| 77 | - "category_name", | ||
| 78 | - "category_level", | ||
| 79 | - "category1_name", | ||
| 80 | - "category2_name", | ||
| 81 | - "category3_name", | ||
| 82 | - "option1_name", | ||
| 83 | - "option2_name", | ||
| 84 | - "option3_name", | ||
| 85 | - "min_price", | ||
| 86 | - "max_price", | ||
| 87 | - "compare_at_price", | ||
| 88 | - "sku_prices", | ||
| 89 | - "sku_weights", | ||
| 90 | - "sku_weight_units", | ||
| 91 | - "total_inventory", | ||
| 92 | - "skus", | ||
| 93 | - "specifications", | ||
| 94 | -] | ||
| 95 | - | ||
| 96 | -# Query processing settings | ||
| 97 | -ENABLE_TRANSLATION = os.environ.get("ENABLE_TRANSLATION", "true").lower() == "true" | ||
| 98 | -ENABLE_TEXT_EMBEDDING = os.environ.get("ENABLE_TEXT_EMBEDDING", "true").lower() == "true" | ||
| 99 | -TRANSLATION_API_KEY = os.environ.get("DEEPL_API_KEY") | ||
| 100 | -TRANSLATION_SERVICE = "deepl" | ||
| 101 | - | ||
| 102 | -# Ranking expression (currently disabled) | ||
| 103 | -RANKING_EXPRESSION = "bm25() + 0.2*text_embedding_relevance()" | ||
| 104 | - | ||
| 105 | -# Function score config | ||
| 106 | -FUNCTION_SCORE_CONFIG = { | ||
| 107 | - "score_mode": "sum", | ||
| 108 | - "boost_mode": "multiply", | ||
| 109 | - "functions": [] | ||
| 110 | -} | ||
| 111 | - | ||
| 112 | -# Load rewrite dictionary from file if exists | ||
| 113 | -def load_rewrite_dictionary() -> Dict[str, str]: | ||
| 114 | - """Load query rewrite dictionary from file.""" | ||
| 115 | - rewrite_file = os.path.join( | ||
| 116 | - os.path.dirname(os.path.dirname(__file__)), | ||
| 117 | - "config", | ||
| 118 | - "query_rewrite.dict" | ||
| 119 | - ) | ||
| 120 | - | ||
| 121 | - if not os.path.exists(rewrite_file): | ||
| 122 | - return {} | ||
| 123 | - | ||
| 124 | - rewrite_dict = {} | ||
| 125 | - try: | ||
| 126 | - with open(rewrite_file, 'r', encoding='utf-8') as f: | ||
| 127 | - for line in f: | ||
| 128 | - line = line.strip() | ||
| 129 | - if not line or line.startswith('#'): | ||
| 130 | - continue | ||
| 131 | - parts = line.split('\t') | ||
| 132 | - if len(parts) == 2: | ||
| 133 | - rewrite_dict[parts[0].strip()] = parts[1].strip() | ||
| 134 | - except Exception as e: | ||
| 135 | - print(f"Warning: Failed to load rewrite dictionary: {e}") | ||
| 136 | - | ||
| 137 | - return rewrite_dict | ||
| 138 | - | ||
| 139 | -REWRITE_DICTIONARY = load_rewrite_dictionary() | ||
| 140 | - | ||
| 141 | -# Default facets for faceted search | ||
| 142 | -# 分类分面:使用category1_name, category2_name, category3_name | ||
| 143 | -# specifications分面:使用嵌套聚合,按name分组,然后按value聚合 | ||
| 144 | -DEFAULT_FACETS = [ | ||
| 145 | - "category1_name", # 一级分类 | ||
| 146 | - "category2_name", # 二级分类 | ||
| 147 | - "category3_name", # 三级分类 | ||
| 148 | - "specifications" # 规格分面(特殊处理:嵌套聚合) | ||
| 149 | -] | ||
| 150 | - |
search/searcher.py
| @@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder | @@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder | ||
| 14 | from .boolean_parser import BooleanParser, QueryNode | 14 | from .boolean_parser import BooleanParser, QueryNode |
| 15 | from .es_query_builder import ESQueryBuilder | 15 | from .es_query_builder import ESQueryBuilder |
| 16 | from .rerank_engine import RerankEngine | 16 | from .rerank_engine import RerankEngine |
| 17 | -from .query_config import ( | ||
| 18 | - DEFAULT_INDEX_NAME, | ||
| 19 | - DEFAULT_MATCH_FIELDS, | ||
| 20 | - TEXT_EMBEDDING_FIELD, | ||
| 21 | - IMAGE_EMBEDDING_FIELD, | ||
| 22 | - SOURCE_FIELDS, | ||
| 23 | - ENABLE_TRANSLATION, | ||
| 24 | - ENABLE_TEXT_EMBEDDING, | ||
| 25 | - RANKING_EXPRESSION | ||
| 26 | -) | 17 | +from config import SearchConfig |
| 18 | +from config.utils import get_match_fields_for_index | ||
| 27 | from context.request_context import RequestContext, RequestContextStage, create_request_context | 19 | from context.request_context import RequestContext, RequestContextStage, create_request_context |
| 28 | from api.models import FacetResult, FacetValue | 20 | from api.models import FacetResult, FacetValue |
| 29 | from api.result_formatter import ResultFormatter | 21 | from api.result_formatter import ResultFormatter |
| @@ -87,37 +79,40 @@ class Searcher: | @@ -87,37 +79,40 @@ class Searcher: | ||
| 87 | def __init__( | 79 | def __init__( |
| 88 | self, | 80 | self, |
| 89 | es_client: ESClient, | 81 | es_client: ESClient, |
| 90 | - query_parser: Optional[QueryParser] = None, | ||
| 91 | - index_name: str = DEFAULT_INDEX_NAME | 82 | + config: SearchConfig, |
| 83 | + query_parser: Optional[QueryParser] = None | ||
| 92 | ): | 84 | ): |
| 93 | """ | 85 | """ |
| 94 | Initialize searcher. | 86 | Initialize searcher. |
| 95 | 87 | ||
| 96 | Args: | 88 | Args: |
| 97 | es_client: Elasticsearch client | 89 | es_client: Elasticsearch client |
| 90 | + config: SearchConfig instance | ||
| 98 | query_parser: Query parser (created if not provided) | 91 | query_parser: Query parser (created if not provided) |
| 99 | - index_name: ES index name (default: search_products) | ||
| 100 | """ | 92 | """ |
| 101 | self.es_client = es_client | 93 | self.es_client = es_client |
| 102 | - self.index_name = index_name | ||
| 103 | - self.query_parser = query_parser or QueryParser() | 94 | + self.config = config |
| 95 | + self.index_name = config.es_index_name | ||
| 96 | + self.query_parser = query_parser or QueryParser(config) | ||
| 104 | 97 | ||
| 105 | # Initialize components | 98 | # Initialize components |
| 106 | self.boolean_parser = BooleanParser() | 99 | self.boolean_parser = BooleanParser() |
| 107 | - self.rerank_engine = RerankEngine(RANKING_EXPRESSION, enabled=False) | 100 | + self.rerank_engine = RerankEngine(config.ranking.expression, enabled=False) |
| 108 | 101 | ||
| 109 | - # Use constants from query_config | ||
| 110 | - self.match_fields = DEFAULT_MATCH_FIELDS | ||
| 111 | - self.text_embedding_field = TEXT_EMBEDDING_FIELD | ||
| 112 | - self.image_embedding_field = IMAGE_EMBEDDING_FIELD | 102 | + # Get match fields from config |
| 103 | + self.match_fields = get_match_fields_for_index(config, "default") | ||
| 104 | + self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding" | ||
| 105 | + self.image_embedding_field = config.query_config.image_embedding_field or "image_embedding" | ||
| 106 | + self.source_fields = config.query_config.source_fields or [] | ||
| 113 | 107 | ||
| 114 | # Query builder - simplified single-layer architecture | 108 | # Query builder - simplified single-layer architecture |
| 115 | self.query_builder = ESQueryBuilder( | 109 | self.query_builder = ESQueryBuilder( |
| 116 | - index_name=index_name, | 110 | + index_name=self.index_name, |
| 117 | match_fields=self.match_fields, | 111 | match_fields=self.match_fields, |
| 118 | text_embedding_field=self.text_embedding_field, | 112 | text_embedding_field=self.text_embedding_field, |
| 119 | image_embedding_field=self.image_embedding_field, | 113 | image_embedding_field=self.image_embedding_field, |
| 120 | - source_fields=SOURCE_FIELDS | 114 | + source_fields=self.source_fields, |
| 115 | + function_score_config=self.config.function_score | ||
| 121 | ) | 116 | ) |
| 122 | 117 | ||
| 123 | def search( | 118 | def search( |
| @@ -162,8 +157,8 @@ class Searcher: | @@ -162,8 +157,8 @@ class Searcher: | ||
| 162 | context = create_request_context() | 157 | context = create_request_context() |
| 163 | 158 | ||
| 164 | # Always use config defaults (these are backend configuration, not user parameters) | 159 | # Always use config defaults (these are backend configuration, not user parameters) |
| 165 | - enable_translation = ENABLE_TRANSLATION | ||
| 166 | - enable_embedding = ENABLE_TEXT_EMBEDDING | 160 | + enable_translation = self.config.query_config.enable_translation |
| 161 | + enable_embedding = self.config.query_config.enable_text_embedding | ||
| 167 | enable_rerank = False # Temporarily disabled | 162 | enable_rerank = False # Temporarily disabled |
| 168 | 163 | ||
| 169 | # Start timing | 164 | # Start timing |
| @@ -508,9 +503,9 @@ class Searcher: | @@ -508,9 +503,9 @@ class Searcher: | ||
| 508 | } | 503 | } |
| 509 | 504 | ||
| 510 | # Add _source filtering if source_fields are configured | 505 | # Add _source filtering if source_fields are configured |
| 511 | - if SOURCE_FIELDS: | 506 | + if self.source_fields: |
| 512 | es_query["_source"] = { | 507 | es_query["_source"] = { |
| 513 | - "includes": SOURCE_FIELDS | 508 | + "includes": self.source_fields |
| 514 | } | 509 | } |
| 515 | 510 | ||
| 516 | if filters or range_filters: | 511 | if filters or range_filters: |
tests/conftest.py
| @@ -137,8 +137,8 @@ def mock_es_client() -> Mock: | @@ -137,8 +137,8 @@ def mock_es_client() -> Mock: | ||
| 137 | def test_searcher(sample_search_config, mock_es_client) -> Searcher: | 137 | def test_searcher(sample_search_config, mock_es_client) -> Searcher: |
| 138 | """测试用Searcher实例""" | 138 | """测试用Searcher实例""" |
| 139 | return Searcher( | 139 | return Searcher( |
| 140 | - config=sample_search_config, | ||
| 141 | - es_client=mock_es_client | 140 | + es_client=mock_es_client, |
| 141 | + config=sample_search_config | ||
| 142 | ) | 142 | ) |
| 143 | 143 | ||
| 144 | 144 |