Compare View

switch
from
...
to
 
Commits (4)
  • tangwang
     
  • query config/ranking config优化
    tangwang
     
  • sku_filter_dimension=color
    sku_filter_dimension=option1 / option2 / option3
    以上两种方式都可以
    tangwang
     
  • 后端请求模型变更(api/models.py)
    SearchRequest.sku_filter_dimension 从 Optional[str] 改为 Optional[List[str]]。
    语义:列表表示一个或多个“维度标签”,例如:
    单维度:["color"]、["option1"]
    多维度:["color", "size"]、["option1", "option2"]
    描述更新为:对维度组合进行分组,每个组合只保留一个 SKU。
    结果格式化与去重逻辑(api/result_formatter.py)
    ResultFormatter.format_search_results(..., sku_filter_dimension: Optional[List[str]] = None),调用处已同步更新。
    单维度旧逻辑升级为多维度逻辑:
    新方法:_filter_skus_by_dimensions(skus, dimensions, option1_name, option2_name, option3_name, specifications)。
    维度解析规则(按顺序处理,并去重):
    若维度是 option1 / option2 / option3 → 对应 option1_value / option2_value / option3_value。
    否则,将维度字符串转小写后,分别与 option1_name / option2_name / option3_name 对比,相等则映射到对应的 option*_value。
    未能映射到任何字段的维度会被忽略。
    对每个 SKU:
    按解析出的字段列表(例如 ["option1_value", "option2_value"])取值,组成 key,如 ("red", "L");None 用空串 ""。
    按 key 分组,每个 key 只保留遇到的第一个 SKU。
    若列表为空,则不做过滤,返回原始 skus;若列表非空但所有维度都无法解析,则返回空列表(不返回任何子 SKU)。
    Searcher 参数类型同步(search/searcher.py)
    Searcher.search(...) 中 sku_filter_dimension 参数类型从 Optional[str] 改为 Optional[List[str]]。
    传给 ResultFormatter.format_search_results 时,直接传该列表。
    前端参数格式调整(frontend/static/js/app.js)
    输入框 #skuFilterDimension 依旧是一个文本框,但解析方式改为:
    函数 getSkuFilterDimension():
    读取文本,如:"color" 或 "color,size" 或 "option1, color"。
    用逗号 , 拆分,trim() 后过滤空串,返回字符串数组,例如:
    "color" → ["color"]
    "color,size" → ["color", "size"]
    若最终数组为空,则返回 null。
    搜索请求体中仍使用字段名 sku_filter_dimension,但现在值是 string[] 或 null:
        body: JSON.stringify({
          // ...
          sku_filter_dimension: skuFilterDimension,  // 例如 ["color", "size"]
          debug: state.debug
        })
    文档更新(docs/搜索API对接指南.md)
    请求体示例中的类型由:
    "sku_filter_dimension": "string"
    改为:
    "sku_filter_dimension": ["string"]
    参数表中:
    从 string 改为 array[string],说明为“维度列表,按组合分组,每个组合保留一个 SKU”。
    功能说明章节“SKU筛选维度 (sku_filter_dimension)”已调整为列表语义 + 组合去重,并补充了示例:
    单维度:
          {
            "query": "芭比娃娃",
            "sku_filter_dimension": ["color"]
          }
    多维度组合:
          {
            "query": "芭比娃娃",
            "sku_filter_dimension": ["color", "size"]
          }
    使用方式总结
    单维度去重(保持旧行为的等价写法)
    旧:"sku_filter_dimension": "color"
    新:"sku_filter_dimension": ["color"]
    多维度组合去重(你新提的需求)
    例如希望“每个 SPU 下,同一颜色+尺码组合只保留一个 SKU”:
        {
          "query": "芭比娃娃",
          "sku_filter_dimension": ["color", "size"]
        }
    tangwang
     
@@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address) @@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address)
41 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 41 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
42 42
43 from config.env_config import ES_CONFIG 43 from config.env_config import ES_CONFIG
  44 +from config import ConfigLoader
44 from utils import ESClient 45 from utils import ESClient
45 from search import Searcher 46 from search import Searcher
46 -from search.query_config import DEFAULT_INDEX_NAME  
47 from query import QueryParser 47 from query import QueryParser
48 48
49 # Global instances 49 # Global instances
50 _es_client: Optional[ESClient] = None 50 _es_client: Optional[ESClient] = None
51 _searcher: Optional[Searcher] = None 51 _searcher: Optional[Searcher] = None
52 _query_parser: Optional[QueryParser] = None 52 _query_parser: Optional[QueryParser] = None
  53 +_config = None
53 54
54 55
55 def init_service(es_host: str = "http://localhost:9200"): 56 def init_service(es_host: str = "http://localhost:9200"):
@@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"):
59 Args: 60 Args:
60 es_host: Elasticsearch host URL 61 es_host: Elasticsearch host URL
61 """ 62 """
62 - global _es_client, _searcher, _query_parser 63 + global _es_client, _searcher, _query_parser, _config
63 64
64 start_time = time.time() 65 start_time = time.time()
65 logger.info("Initializing search service (multi-tenant)") 66 logger.info("Initializing search service (multi-tenant)")
66 67
  68 + # Load configuration
  69 + logger.info("Loading configuration...")
  70 + config_loader = ConfigLoader("config/config.yaml")
  71 + _config = config_loader.load_config()
  72 + logger.info("Configuration loaded")
  73 +
67 # Get ES credentials 74 # Get ES credentials
68 es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username') 75 es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username')
69 es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password') 76 es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password')
@@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"):
81 88
82 # Initialize components 89 # Initialize components
83 logger.info("Initializing query parser...") 90 logger.info("Initializing query parser...")
84 - _query_parser = QueryParser() 91 + _query_parser = QueryParser(_config)
85 92
86 logger.info("Initializing searcher...") 93 logger.info("Initializing searcher...")
87 - _searcher = Searcher(_es_client, _query_parser, index_name=DEFAULT_INDEX_NAME) 94 + _searcher = Searcher(_es_client, _config, _query_parser)
88 95
89 elapsed = time.time() - start_time 96 elapsed = time.time() - start_time
90 - logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {DEFAULT_INDEX_NAME}") 97 + logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}")
91 98
92 99
93 100
@@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser: @@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser:
113 return _query_parser 120 return _query_parser
114 121
115 122
  123 +def get_config():
  124 + """Get global config instance."""
  125 + if _config is None:
  126 + raise RuntimeError("Service not initialized")
  127 + return _config
  128 +
  129 +
116 # Create FastAPI app with enhanced configuration 130 # Create FastAPI app with enhanced configuration
117 app = FastAPI( 131 app = FastAPI(
118 title="E-Commerce Search API", 132 title="E-Commerce Search API",
@@ -146,9 +146,14 @@ class SearchRequest(BaseModel): @@ -146,9 +146,14 @@ class SearchRequest(BaseModel):
146 debug: bool = Field(False, description="是否返回调试信息") 146 debug: bool = Field(False, description="是否返回调试信息")
147 147
148 # SKU筛选参数 148 # SKU筛选参数
149 - sku_filter_dimension: Optional[str] = Field( 149 + sku_filter_dimension: Optional[List[str]] = Field(
150 None, 150 None,
151 - description="子SKU筛选维度(店铺配置)。指定后,每个SPU下的SKU将按该维度分组,每组选择第一个SKU返回。例如:'color'表示按颜色分组,每种颜色选一款。支持的值:'option1'、'option2'、'option3'或specifications中的name(如'color'、'size')" 151 + description=(
  152 + "子SKU筛选维度(店铺配置),为字符串列表。"
  153 + "指定后,每个SPU下的SKU将按这些维度的组合进行分组,每个维度组合只保留一个SKU返回。"
  154 + "例如:['color'] 表示按颜色分组,每种颜色选一款;['color', 'size'] 表示按颜色+尺码组合分组。"
  155 + "支持的值:'option1'、'option2'、'option3' 或选项名称(如 'color'、'size',将通过 option1_name/2_name/3_name 匹配)。"
  156 + )
152 ) 157 )
153 158
154 # 个性化参数(预留) 159 # 个性化参数(预留)
api/result_formatter.py
@@ -14,7 +14,7 @@ class ResultFormatter: @@ -14,7 +14,7 @@ class ResultFormatter:
14 es_hits: List[Dict[str, Any]], 14 es_hits: List[Dict[str, Any]],
15 max_score: float = 1.0, 15 max_score: float = 1.0,
16 language: str = "zh", 16 language: str = "zh",
17 - sku_filter_dimension: Optional[str] = None 17 + sku_filter_dimension: Optional[List[str]] = None
18 ) -> List[SpuResult]: 18 ) -> List[SpuResult]:
19 """ 19 """
20 Convert ES hits to SpuResult list. 20 Convert ES hits to SpuResult list.
@@ -85,10 +85,10 @@ class ResultFormatter: @@ -85,10 +85,10 @@ class ResultFormatter:
85 ) 85 )
86 skus.append(sku) 86 skus.append(sku)
87 87
88 - # Apply SKU filtering if dimension is specified 88 + # Apply SKU filtering if dimension list is specified
89 if sku_filter_dimension and skus: 89 if sku_filter_dimension and skus:
90 - skus = ResultFormatter._filter_skus_by_dimension(  
91 - skus, 90 + skus = ResultFormatter._filter_skus_by_dimensions(
  91 + skus,
92 sku_filter_dimension, 92 sku_filter_dimension,
93 source.get('option1_name'), 93 source.get('option1_name'),
94 source.get('option2_name'), 94 source.get('option2_name'),
@@ -138,22 +138,22 @@ class ResultFormatter: @@ -138,22 +138,22 @@ class ResultFormatter:
138 return results 138 return results
139 139
140 @staticmethod 140 @staticmethod
141 - def _filter_skus_by_dimension( 141 + def _filter_skus_by_dimensions(
142 skus: List[SkuResult], 142 skus: List[SkuResult],
143 - dimension: str, 143 + dimensions: List[str],
144 option1_name: Optional[str] = None, 144 option1_name: Optional[str] = None,
145 option2_name: Optional[str] = None, 145 option2_name: Optional[str] = None,
146 option3_name: Optional[str] = None, 146 option3_name: Optional[str] = None,
147 specifications: Optional[List[Dict[str, Any]]] = None 147 specifications: Optional[List[Dict[str, Any]]] = None
148 ) -> List[SkuResult]: 148 ) -> List[SkuResult]:
149 """ 149 """
150 - Filter SKUs by dimension, keeping only one SKU per dimension value. 150 + Filter SKUs by one or more dimensions, keeping only one SKU per dimension value combination.
151 151
152 Args: 152 Args:
153 skus: List of SKU results to filter 153 skus: List of SKU results to filter
154 - dimension: Filter dimension, can be: 154 + dimensions: Filter dimensions, each dimension can be:
155 - 'option1', 'option2', 'option3': Direct option field 155 - 'option1', 'option2', 'option3': Direct option field
156 - - A specification name (e.g., 'color', 'size'): Match by option name 156 + - A specification/option name (e.g., 'color', 'size'): Match by option name
157 option1_name: Name of option1 (e.g., 'color') 157 option1_name: Name of option1 (e.g., 'color')
158 option2_name: Name of option2 (e.g., 'size') 158 option2_name: Name of option2 (e.g., 'size')
159 option3_name: Name of option3 159 option3_name: Name of option3
@@ -162,54 +162,59 @@ class ResultFormatter: @@ -162,54 +162,59 @@ class ResultFormatter:
162 Returns: 162 Returns:
163 Filtered list of SKUs (one per dimension value) 163 Filtered list of SKUs (one per dimension value)
164 """ 164 """
165 - if not skus: 165 + if not skus or not dimensions:
166 return skus 166 return skus
167 -  
168 - # Determine which field to use for filtering  
169 - filter_field = None  
170 -  
171 - # Direct option field (option1, option2, option3)  
172 - if dimension.lower() == 'option1':  
173 - filter_field = 'option1_value'  
174 - elif dimension.lower() == 'option2':  
175 - filter_field = 'option2_value'  
176 - elif dimension.lower() == 'option3':  
177 - filter_field = 'option3_value'  
178 - else:  
179 - # Try to match by option name  
180 - dimension_lower = dimension.lower()  
181 - if option1_name and option1_name.lower() == dimension_lower:  
182 - filter_field = 'option1_value'  
183 - elif option2_name and option2_name.lower() == dimension_lower:  
184 - filter_field = 'option2_value'  
185 - elif option3_name and option3_name.lower() == dimension_lower:  
186 - filter_field = 'option3_value'  
187 -  
188 - # If no matching field found, return all SKUs (no filtering)  
189 - if not filter_field:  
190 - return skus  
191 -  
192 - # Group SKUs by dimension value and select first one from each group  
193 - dimension_groups: Dict[str, SkuResult] = {}  
194 - 167 +
  168 + # Resolve each dimension to an underlying SKU field (option1_value / option2_value / option3_value)
  169 + filter_fields: List[str] = []
  170 +
  171 + for dim in dimensions:
  172 + if not dim:
  173 + continue
  174 + dim_lower = dim.lower()
  175 +
  176 + field_name: Optional[str] = None
  177 + # Direct option field (option1, option2, option3)
  178 + if dim_lower == 'option1':
  179 + field_name = 'option1_value'
  180 + elif dim_lower == 'option2':
  181 + field_name = 'option2_value'
  182 + elif dim_lower == 'option3':
  183 + field_name = 'option3_value'
  184 + else:
  185 + # Try to match by option name
  186 + if option1_name and option1_name.lower() == dim_lower:
  187 + field_name = 'option1_value'
  188 + elif option2_name and option2_name.lower() == dim_lower:
  189 + field_name = 'option2_value'
  190 + elif option3_name and option3_name.lower() == dim_lower:
  191 + field_name = 'option3_value'
  192 +
  193 + if field_name and field_name not in filter_fields:
  194 + filter_fields.append(field_name)
  195 +
  196 + # If no matching field found for all dimensions, do not return any child SKUs
  197 + if not filter_fields:
  198 + return []
  199 +
  200 + # Group SKUs by dimension value combination and select first one from each group
  201 + dimension_groups: Dict[tuple, SkuResult] = {}
  202 +
195 for sku in skus: 203 for sku in skus:
196 - # Get dimension value from the determined field  
197 - dimension_value = None  
198 - if filter_field == 'option1_value':  
199 - dimension_value = sku.option1_value  
200 - elif filter_field == 'option2_value':  
201 - dimension_value = sku.option2_value  
202 - elif filter_field == 'option3_value':  
203 - dimension_value = sku.option3_value  
204 -  
205 - # Use empty string as key for None values  
206 - key = str(dimension_value) if dimension_value is not None else ''  
207 -  
208 - # Keep first SKU for each dimension value 204 + # Build key as combination of all dimension values
  205 + key_values: List[str] = []
  206 + for field in filter_fields:
  207 + dimension_value = getattr(sku, field, None)
  208 + # Use empty string as key part for None values
  209 + key_values.append(str(dimension_value) if dimension_value is not None else '')
  210 +
  211 + key = tuple(key_values)
  212 +
  213 + # Keep first SKU for each dimension combination
209 if key not in dimension_groups: 214 if key not in dimension_groups:
210 dimension_groups[key] = sku 215 dimension_groups[key] = sku
211 -  
212 - # Return filtered SKUs (one per dimension value) 216 +
  217 + # Return filtered SKUs (one per dimension combination)
213 return list(dimension_groups.values()) 218 return list(dimension_groups.values())
214 219
215 @staticmethod 220 @staticmethod
api/routes/search.py
@@ -24,8 +24,8 @@ def extract_request_info(request: Request) -> tuple[str, str]: @@ -24,8 +24,8 @@ def extract_request_info(request: Request) -> tuple[str, str]:
24 # Try to get request ID from headers 24 # Try to get request ID from headers
25 reqid = request.headers.get('X-Request-ID') or str(uuid.uuid4())[:8] 25 reqid = request.headers.get('X-Request-ID') or str(uuid.uuid4())[:8]
26 26
27 - # Try to get user ID from headers or default to anonymous  
28 - uid = request.headers.get('X-User-ID') or request.headers.get('User-ID') or 'anonymous' 27 + # Try to get user ID from headers; if not found, use "-1" for correlation
  28 + uid = request.headers.get('X-User-ID') or request.headers.get('User-ID') or "-1"
29 29
30 return reqid, uid 30 return reqid, uid
31 31
@@ -70,10 +70,24 @@ async def search(request: SearchRequest, http_request: Request): @@ -70,10 +70,24 @@ async def search(request: SearchRequest, http_request: Request):
70 set_current_request_context(context) 70 set_current_request_context(context)
71 71
72 try: 72 try:
73 - # Log request start 73 + # Log request start (English logs, with key search parameters)
  74 + client_ip = http_request.client.host if http_request.client else "unknown"
  75 + user_agent = http_request.headers.get("User-Agent", "unknown")[:200]
74 context.logger.info( 76 context.logger.info(
75 - f"收到搜索请求 | Tenant: {tenant_id} | IP: {http_request.client.host if http_request.client else 'unknown'} | "  
76 - f"用户代理: {http_request.headers.get('User-Agent', 'unknown')[:100]}", 77 + "Received search request | "
  78 + f"Tenant: {tenant_id} | "
  79 + f"Query: {request.query} | "
  80 + f"IP: {client_ip} | "
  81 + f"User agent: {user_agent} | "
  82 + f"size: {request.size} | from: {request.from_} | "
  83 + f"sort_by: {request.sort_by} | sort_order: {request.sort_order} | "
  84 + f"min_score: {request.min_score} | "
  85 + f"language: {request.language} | "
  86 + f"debug: {request.debug} | "
  87 + f"sku_filter_dimension: {request.sku_filter_dimension} | "
  88 + f"filters: {request.filters} | "
  89 + f"range_filters: {request.range_filters} | "
  90 + f"facets: {request.facets}",
77 extra={'reqid': context.reqid, 'uid': context.uid} 91 extra={'reqid': context.reqid, 'uid': context.uid}
78 ) 92 )
79 93
@@ -121,7 +135,7 @@ async def search(request: SearchRequest, http_request: Request): @@ -121,7 +135,7 @@ async def search(request: SearchRequest, http_request: Request):
121 if context: 135 if context:
122 context.set_error(e) 136 context.set_error(e)
123 context.logger.error( 137 context.logger.error(
124 - f"搜索请求失败 | 错误: {str(e)}", 138 + f"Search request failed | error: {str(e)}",
125 extra={'reqid': context.reqid, 'uid': context.uid} 139 extra={'reqid': context.reqid, 'uid': context.uid}
126 ) 140 )
127 raise HTTPException(status_code=500, detail=str(e)) 141 raise HTTPException(status_code=500, detail=str(e))
@@ -164,10 +178,13 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): @@ -164,10 +178,13 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
164 set_current_request_context(context) 178 set_current_request_context(context)
165 179
166 try: 180 try:
167 - # Log request start 181 + # Log request start for image search (English)
  182 + client_ip = http_request.client.host if http_request.client else "unknown"
168 context.logger.info( 183 context.logger.info(
169 - f"收到图片搜索请求 | Tenant: {tenant_id} | 图片URL: {request.image_url} | "  
170 - f"IP: {http_request.client.host if http_request.client else 'unknown'}", 184 + "Received image search request | "
  185 + f"Tenant: {tenant_id} | "
  186 + f"Image URL: {request.image_url} | "
  187 + f"IP: {client_ip}",
171 extra={'reqid': context.reqid, 'uid': context.uid} 188 extra={'reqid': context.reqid, 'uid': context.uid}
172 ) 189 )
173 190
@@ -202,7 +219,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): @@ -202,7 +219,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
202 if context: 219 if context:
203 context.set_error(e) 220 context.set_error(e)
204 context.logger.error( 221 context.logger.error(
205 - f"图片搜索请求参数错误 | 错误: {str(e)}", 222 + f"Image search request parameter error | error: {str(e)}",
206 extra={'reqid': context.reqid, 'uid': context.uid} 223 extra={'reqid': context.reqid, 'uid': context.uid}
207 ) 224 )
208 raise HTTPException(status_code=400, detail=str(e)) 225 raise HTTPException(status_code=400, detail=str(e))
@@ -210,7 +227,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): @@ -210,7 +227,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
210 if context: 227 if context:
211 context.set_error(e) 228 context.set_error(e)
212 context.logger.error( 229 context.logger.error(
213 - f"图片搜索请求失败 | 错误: {str(e)}", 230 + f"Image search request failed | error: {str(e)}",
214 extra={'reqid': context.reqid, 'uid': context.uid} 231 extra={'reqid': context.reqid, 'uid': context.uid}
215 ) 232 )
216 raise HTTPException(status_code=500, detail=str(e)) 233 raise HTTPException(status_code=500, detail=str(e))
config/__init__.py
@@ -23,6 +23,10 @@ from .config_loader import ( @@ -23,6 +23,10 @@ from .config_loader import (
23 RerankConfig, 23 RerankConfig,
24 ConfigurationError 24 ConfigurationError
25 ) 25 )
  26 +from .utils import (
  27 + get_match_fields_for_index,
  28 + get_domain_fields
  29 +)
26 30
27 __all__ = [ 31 __all__ = [
28 # Field types 32 # Field types
@@ -46,4 +50,6 @@ __all__ = [ @@ -46,4 +50,6 @@ __all__ = [
46 'FunctionScoreConfig', 50 'FunctionScoreConfig',
47 'RerankConfig', 51 'RerankConfig',
48 'ConfigurationError', 52 'ConfigurationError',
  53 + 'get_match_fields_for_index',
  54 + 'get_domain_fields',
49 ] 55 ]
config/config.yaml
@@ -412,6 +412,11 @@ query_config: @@ -412,6 +412,11 @@ query_config:
412 text_embedding_field: "title_embedding" # Field name for text embeddings 412 text_embedding_field: "title_embedding" # Field name for text embeddings
413 image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect) 413 image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect)
414 414
  415 + # Embedding disable thresholds (disable vector search for short queries)
  416 + embedding_disable_thresholds:
  417 + chinese_char_limit: 4 # Disable embedding for Chinese queries with <= 4 characters
  418 + english_word_limit: 3 # Disable embedding for English queries with <= 3 words
  419 +
415 # Translation API (DeepL) 420 # Translation API (DeepL)
416 translation_service: "deepl" 421 translation_service: "deepl"
417 translation_api_key: null # Set via environment variable 422 translation_api_key: null # Set via environment variable
config/config_loader.py
@@ -58,6 +58,10 @@ class QueryConfig: @@ -58,6 +58,10 @@ class QueryConfig:
58 text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding") 58 text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding")
59 image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding") 59 image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding")
60 60
  61 + # Embedding disable thresholds (disable vector search for short queries)
  62 + embedding_disable_chinese_char_limit: int = 4 # Disable embedding for Chinese queries with <= this many characters
  63 + embedding_disable_english_word_limit: int = 3 # Disable embedding for English queries with <= this many words
  64 +
61 # ES source fields configuration - fields to return in search results 65 # ES source fields configuration - fields to return in search results
62 # If None, auto-collect from field configs (fields with return_in_source=True) 66 # If None, auto-collect from field configs (fields with return_in_source=True)
63 # If empty list, return all fields. Otherwise, only return specified fields. 67 # If empty list, return all fields. Otherwise, only return specified fields.
@@ -165,15 +169,18 @@ class ConfigLoader: @@ -165,15 +169,18 @@ class ConfigLoader:
165 169
166 return rewrite_dict 170 return rewrite_dict
167 171
168 - def load_config(self) -> SearchConfig: 172 + def load_config(self, validate: bool = True) -> SearchConfig:
169 """ 173 """
170 Load unified configuration from YAML file. 174 Load unified configuration from YAML file.
171 175
  176 + Args:
  177 + validate: Whether to validate configuration after loading (default: True)
  178 +
172 Returns: 179 Returns:
173 SearchConfig object 180 SearchConfig object
174 181
175 Raises: 182 Raises:
176 - ConfigurationError: If config file not found or invalid 183 + ConfigurationError: If config file not found, invalid, or validation fails
177 """ 184 """
178 if not self.config_file.exists(): 185 if not self.config_file.exists():
179 raise ConfigurationError(f"Configuration file not found: {self.config_file}") 186 raise ConfigurationError(f"Configuration file not found: {self.config_file}")
@@ -184,7 +191,16 @@ class ConfigLoader: @@ -184,7 +191,16 @@ class ConfigLoader:
184 except yaml.YAMLError as e: 191 except yaml.YAMLError as e:
185 raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") 192 raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}")
186 193
187 - return self._parse_config(config_data) 194 + config = self._parse_config(config_data)
  195 +
  196 + # Auto-validate configuration
  197 + if validate:
  198 + errors = self.validate_config(config)
  199 + if errors:
  200 + error_msg = "Configuration validation failed:\n" + "\n".join(f" - {err}" for err in errors)
  201 + raise ConfigurationError(error_msg)
  202 +
  203 + return config
188 204
189 def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig: 205 def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig:
190 """Parse configuration dictionary into SearchConfig object.""" 206 """Parse configuration dictionary into SearchConfig object."""
@@ -214,43 +230,48 @@ class ConfigLoader: @@ -214,43 +230,48 @@ class ConfigLoader:
214 if field.return_in_source 230 if field.return_in_source
215 ] 231 ]
216 232
  233 + # Parse embedding disable thresholds
  234 + embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {})
  235 +
217 query_config = QueryConfig( 236 query_config = QueryConfig(
218 - supported_languages=query_config_data.get("supported_languages", ["zh", "en"]),  
219 - default_language=query_config_data.get("default_language", "zh"), 237 + supported_languages=query_config_data.get("supported_languages") or ["zh", "en"],
  238 + default_language=query_config_data.get("default_language") or "zh",
220 enable_translation=query_config_data.get("enable_translation", True), 239 enable_translation=query_config_data.get("enable_translation", True),
221 enable_text_embedding=query_config_data.get("enable_text_embedding", True), 240 enable_text_embedding=query_config_data.get("enable_text_embedding", True),
222 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), 241 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
223 rewrite_dictionary=rewrite_dictionary, 242 rewrite_dictionary=rewrite_dictionary,
224 translation_api_key=query_config_data.get("translation_api_key"), 243 translation_api_key=query_config_data.get("translation_api_key"),
225 - translation_service=query_config_data.get("translation_service", "deepl"), 244 + translation_service=query_config_data.get("translation_service") or "deepl",
226 translation_glossary_id=query_config_data.get("translation_glossary_id"), 245 translation_glossary_id=query_config_data.get("translation_glossary_id"),
227 - translation_context=query_config_data.get("translation_context", "e-commerce product search"), 246 + translation_context=query_config_data.get("translation_context") or "e-commerce product search",
228 text_embedding_field=query_config_data.get("text_embedding_field"), 247 text_embedding_field=query_config_data.get("text_embedding_field"),
229 image_embedding_field=query_config_data.get("image_embedding_field"), 248 image_embedding_field=query_config_data.get("image_embedding_field"),
  249 + embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
  250 + embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
230 source_fields=source_fields 251 source_fields=source_fields
231 ) 252 )
232 253
233 # Parse ranking config 254 # Parse ranking config
234 ranking_data = config_data.get("ranking", {}) 255 ranking_data = config_data.get("ranking", {})
235 ranking = RankingConfig( 256 ranking = RankingConfig(
236 - expression=ranking_data.get("expression", "bm25() + 0.2*text_embedding_relevance()"),  
237 - description=ranking_data.get("description", "Default BM25 + text embedding ranking") 257 + expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
  258 + description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
238 ) 259 )
239 260
240 # Parse Function Score configuration 261 # Parse Function Score configuration
241 fs_data = config_data.get("function_score", {}) 262 fs_data = config_data.get("function_score", {})
242 function_score = FunctionScoreConfig( 263 function_score = FunctionScoreConfig(
243 - score_mode=fs_data.get("score_mode", "sum"),  
244 - boost_mode=fs_data.get("boost_mode", "multiply"),  
245 - functions=fs_data.get("functions", []) 264 + score_mode=fs_data.get("score_mode") or "sum",
  265 + boost_mode=fs_data.get("boost_mode") or "multiply",
  266 + functions=fs_data.get("functions") or []
246 ) 267 )
247 268
248 # Parse Rerank configuration 269 # Parse Rerank configuration
249 rerank_data = config_data.get("rerank", {}) 270 rerank_data = config_data.get("rerank", {})
250 rerank = RerankConfig( 271 rerank = RerankConfig(
251 enabled=rerank_data.get("enabled", False), 272 enabled=rerank_data.get("enabled", False),
252 - expression=rerank_data.get("expression", ""),  
253 - description=rerank_data.get("description", "") 273 + expression=rerank_data.get("expression") or "",
  274 + description=rerank_data.get("description") or ""
254 ) 275 )
255 276
256 # Parse SPU config 277 # Parse SPU config
@@ -447,21 +468,43 @@ class ConfigLoader: @@ -447,21 +468,43 @@ class ConfigLoader:
447 output_path = Path(output_path) 468 output_path = Path(output_path)
448 469
449 # Convert config back to dictionary format 470 # Convert config back to dictionary format
  471 + query_config_dict = {
  472 + "supported_languages": config.query_config.supported_languages,
  473 + "default_language": config.query_config.default_language,
  474 + "enable_translation": config.query_config.enable_translation,
  475 + "enable_text_embedding": config.query_config.enable_text_embedding,
  476 + "enable_query_rewrite": config.query_config.enable_query_rewrite,
  477 + "translation_service": config.query_config.translation_service,
  478 + }
  479 +
  480 + # Add optional fields only if they are set
  481 + if config.query_config.translation_api_key:
  482 + query_config_dict["translation_api_key"] = config.query_config.translation_api_key
  483 + if config.query_config.translation_glossary_id:
  484 + query_config_dict["translation_glossary_id"] = config.query_config.translation_glossary_id
  485 + if config.query_config.translation_context:
  486 + query_config_dict["translation_context"] = config.query_config.translation_context
  487 + if config.query_config.text_embedding_field:
  488 + query_config_dict["text_embedding_field"] = config.query_config.text_embedding_field
  489 + if config.query_config.image_embedding_field:
  490 + query_config_dict["image_embedding_field"] = config.query_config.image_embedding_field
  491 + if config.query_config.source_fields:
  492 + query_config_dict["source_fields"] = config.query_config.source_fields
  493 +
  494 + # Add embedding disable thresholds
  495 + if (config.query_config.embedding_disable_chinese_char_limit != 4 or
  496 + config.query_config.embedding_disable_english_word_limit != 3):
  497 + query_config_dict["embedding_disable_thresholds"] = {
  498 + "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit,
  499 + "english_word_limit": config.query_config.embedding_disable_english_word_limit
  500 + }
  501 +
450 config_dict = { 502 config_dict = {
451 "es_index_name": config.es_index_name, 503 "es_index_name": config.es_index_name,
452 "es_settings": config.es_settings, 504 "es_settings": config.es_settings,
453 "fields": [self._field_to_dict(field) for field in config.fields], 505 "fields": [self._field_to_dict(field) for field in config.fields],
454 "indexes": [self._index_to_dict(index) for index in config.indexes], 506 "indexes": [self._index_to_dict(index) for index in config.indexes],
455 - "query_config": {  
456 - "supported_languages": config.query_config.supported_languages,  
457 - "default_language": config.query_config.default_language,  
458 - "enable_translation": config.query_config.enable_translation,  
459 - "enable_text_embedding": config.query_config.enable_text_embedding,  
460 - "enable_query_rewrite": config.query_config.enable_query_rewrite,  
461 - # rewrite_dictionary is stored in separate file, not in config  
462 - "translation_api_key": config.query_config.translation_api_key,  
463 - "translation_service": config.query_config.translation_service,  
464 - }, 507 + "query_config": query_config_dict,
465 "ranking": { 508 "ranking": {
466 "expression": config.ranking.expression, 509 "expression": config.ranking.expression,
467 "description": config.ranking.description 510 "description": config.ranking.description
@@ -505,7 +548,7 @@ class ConfigLoader: @@ -505,7 +548,7 @@ class ConfigLoader:
505 f.write(f"{key}\t{value}\n") 548 f.write(f"{key}\t{value}\n")
506 549
507 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: 550 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]:
508 - """Convert FieldConfig to dictionary.""" 551 + """Convert FieldConfig to dictionary, preserving all fields."""
509 result = { 552 result = {
510 "name": field.name, 553 "name": field.name,
511 "type": field.field_type.value, 554 "type": field.field_type.value,
@@ -513,36 +556,49 @@ class ConfigLoader: @@ -513,36 +556,49 @@ class ConfigLoader:
513 "boost": field.boost, 556 "boost": field.boost,
514 "store": field.store, 557 "store": field.store,
515 "index": field.index, 558 "index": field.index,
  559 + "return_in_source": field.return_in_source,
516 } 560 }
517 561
  562 + # Add optional fields only if they differ from defaults or are set
518 if field.analyzer: 563 if field.analyzer:
519 result["analyzer"] = field.analyzer.value 564 result["analyzer"] = field.analyzer.value
520 if field.search_analyzer: 565 if field.search_analyzer:
521 result["search_analyzer"] = field.search_analyzer.value 566 result["search_analyzer"] = field.search_analyzer.value
522 if field.multi_language: 567 if field.multi_language:
523 result["multi_language"] = field.multi_language 568 result["multi_language"] = field.multi_language
524 - result["languages"] = field.languages 569 + if field.languages:
  570 + result["languages"] = field.languages
525 if field.embedding_dims != 1024: 571 if field.embedding_dims != 1024:
526 result["embedding_dims"] = field.embedding_dims 572 result["embedding_dims"] = field.embedding_dims
527 if field.embedding_similarity != "dot_product": 573 if field.embedding_similarity != "dot_product":
528 result["embedding_similarity"] = field.embedding_similarity 574 result["embedding_similarity"] = field.embedding_similarity
529 if field.nested: 575 if field.nested:
530 result["nested"] = field.nested 576 result["nested"] = field.nested
531 - result["nested_properties"] = field.nested_properties 577 + if field.nested_properties:
  578 + result["nested_properties"] = field.nested_properties
  579 + if field.keyword_subfield:
  580 + result["keyword_subfield"] = field.keyword_subfield
  581 + if field.keyword_ignore_above != 256:
  582 + result["keyword_ignore_above"] = field.keyword_ignore_above
  583 + if field.keyword_normalizer:
  584 + result["keyword_normalizer"] = field.keyword_normalizer
532 585
533 return result 586 return result
534 587
535 def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: 588 def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
536 - """Convert IndexConfig to dictionary.""" 589 + """Convert IndexConfig to dictionary, preserving all fields."""
537 result = { 590 result = {
538 "name": index.name, 591 "name": index.name,
539 "label": index.label, 592 "label": index.label,
540 "fields": index.fields, 593 "fields": index.fields,
541 "analyzer": index.analyzer.value, 594 "analyzer": index.analyzer.value,
542 - "boost": index.boost,  
543 - "example": index.example  
544 } 595 }
545 - 596 +
  597 + # Add optional fields only if they differ from defaults or are set
  598 + if index.boost != 1.0:
  599 + result["boost"] = index.boost
  600 + if index.example:
  601 + result["example"] = index.example
546 if index.language_field_mapping: 602 if index.language_field_mapping:
547 result["language_field_mapping"] = index.language_field_mapping 603 result["language_field_mapping"] = index.language_field_mapping
548 604
config/utils.py 0 → 100644
@@ -0,0 +1,70 @@ @@ -0,0 +1,70 @@
  1 +"""
  2 +Configuration utility functions.
  3 +
  4 +Helper functions for working with SearchConfig objects.
  5 +"""
  6 +
  7 +from typing import Dict, List
  8 +from .config_loader import SearchConfig
  9 +
  10 +
  11 +def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]:
  12 + """
  13 + Generate match fields list with boost from IndexConfig and FieldConfig.
  14 +
  15 + Args:
  16 + config: SearchConfig instance
  17 + index_name: Name of the index domain (default: "default")
  18 +
  19 + Returns:
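  20 + List of field names with boost, e.g., ["title_zh^3.0", "brief_zh^1.5"]; the "^boost" suffix is omitted when the combined boost equals 1.0, and an empty list is returned when no index named index_name exists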
  21 + """
  22 + # Find the index config
  23 + index_config = None
  24 + for idx in config.indexes:
  25 + if idx.name == index_name:
  26 + index_config = idx
  27 + break
  28 +
  29 + if not index_config:
  30 + return []
  31 +
  32 + # Create a field name to FieldConfig mapping
  33 + field_map = {field.name: field for field in config.fields}
  34 +
  35 + # Generate match fields with boost
  36 + match_fields = []
  37 + for field_name in index_config.fields:
  38 + field_config = field_map.get(field_name)
  39 + if field_config:
  40 + # Combine index boost and field boost
  41 + total_boost = index_config.boost * field_config.boost
  42 + if total_boost != 1.0:
  43 + match_fields.append(f"{field_name}^{total_boost}")
  44 + else:
  45 + match_fields.append(field_name)
  46 + else:
  47 + # Field not found in config, use index boost only
  48 + if index_config.boost != 1.0:
  49 + match_fields.append(f"{field_name}^{index_config.boost}")
  50 + else:
  51 + match_fields.append(field_name)
  52 +
  53 + return match_fields
  54 +
  55 +
  56 +def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]:
  57 + """
  58 + Generate domain-specific match fields from all index configs.
  59 +
  60 + Args:
  61 + config: SearchConfig instance
  62 +
  63 + Returns:
  64 + Dictionary mapping domain name to list of match fields
  65 + """
  66 + domain_fields = {}
  67 + for index_config in config.indexes:
  68 + domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name)
  69 + return domain_fields
  70 +
context/request_context.py
@@ -59,9 +59,10 @@ class RequestContext: @@ -59,9 +59,10 @@ class RequestContext:
59 """ 59 """
60 60
61 def __init__(self, reqid: str = None, uid: str = None): 61 def __init__(self, reqid: str = None, uid: str = None):
62 - # 生成唯一请求ID 62 + # 生成唯一请求ID;如果外部未提供,则自动生成
  63 + # 如果无法获取到 uid,则使用 "-1" 作为占位,用于日志关联
63 self.reqid = reqid or str(uuid.uuid4())[:8] 64 self.reqid = reqid or str(uuid.uuid4())[:8]
64 - self.uid = uid or 'anonymous' 65 + self.uid = uid or "-1"
65 66
66 # 查询分析结果 67 # 查询分析结果
67 self.query_analysis = QueryAnalysisResult() 68 self.query_analysis = QueryAnalysisResult()
@@ -111,7 +112,10 @@ class RequestContext: @@ -111,7 +112,10 @@ class RequestContext:
111 """ 112 """
112 start_time = time.time() 113 start_time = time.time()
113 self.performance_metrics.stage_start_times[stage.value] = start_time 114 self.performance_metrics.stage_start_times[stage.value] = start_time
114 - self.logger.debug(f"开始阶段 | {stage.value}", extra={'reqid': self.reqid, 'uid': self.uid}) 115 + self.logger.debug(
  116 + f"Start stage | {stage.value}",
  117 + extra={'reqid': self.reqid, 'uid': self.uid}
  118 + )
115 return start_time 119 return start_time
116 120
117 def end_stage(self, stage: RequestContextStage) -> float: 121 def end_stage(self, stage: RequestContextStage) -> float:
@@ -125,7 +129,10 @@ class RequestContext: @@ -125,7 +129,10 @@ class RequestContext:
125 阶段耗时(毫秒) 129 阶段耗时(毫秒)
126 """ 130 """
127 if stage.value not in self.performance_metrics.stage_start_times: 131 if stage.value not in self.performance_metrics.stage_start_times:
128 - self.logger.warning(f"阶段未开始计时 | {stage.value}", extra={'reqid': self.reqid, 'uid': self.uid}) 132 + self.logger.warning(
  133 + f"Stage not started | {stage.value}",
  134 + extra={'reqid': self.reqid, 'uid': self.uid}
  135 + )
129 return 0.0 136 return 0.0
130 137
131 start_time = self.performance_metrics.stage_start_times[stage.value] 138 start_time = self.performance_metrics.stage_start_times[stage.value]
@@ -133,7 +140,7 @@ class RequestContext: @@ -133,7 +140,7 @@ class RequestContext:
133 self.performance_metrics.stage_timings[stage.value] = duration_ms 140 self.performance_metrics.stage_timings[stage.value] = duration_ms
134 141
135 self.logger.debug( 142 self.logger.debug(
136 - f"结束阶段 | {stage.value} | 耗时: {duration_ms:.2f}ms", 143 + f"End stage | {stage.value} | duration: {duration_ms:.2f}ms",
137 extra={'reqid': self.reqid, 'uid': self.uid} 144 extra={'reqid': self.reqid, 'uid': self.uid}
138 ) 145 )
139 return duration_ms 146 return duration_ms
@@ -162,7 +169,7 @@ class RequestContext: @@ -162,7 +169,7 @@ class RequestContext:
162 setattr(self.query_analysis, key, value) 169 setattr(self.query_analysis, key, value)
163 else: 170 else:
164 self.logger.warning( 171 self.logger.warning(
165 - f"未知的查询分析字段 | {key}", 172 + f"Unknown query analysis field | {key}",
166 extra={'reqid': self.reqid, 'uid': self.uid} 173 extra={'reqid': self.reqid, 'uid': self.uid}
167 ) 174 )
168 175
@@ -175,7 +182,10 @@ class RequestContext: @@ -175,7 +182,10 @@ class RequestContext:
175 value: 结果值 182 value: 结果值
176 """ 183 """
177 self.intermediate_results[key] = value 184 self.intermediate_results[key] = value
178 - self.logger.debug(f"存储中间结果 | {key}", extra={'reqid': self.reqid, 'uid': self.uid}) 185 + self.logger.debug(
  186 + f"Store intermediate result | {key}",
  187 + extra={'reqid': self.reqid, 'uid': self.uid}
  188 + )
179 189
180 def get_intermediate_result(self, key: str, default: Any = None) -> Any: 190 def get_intermediate_result(self, key: str, default: Any = None) -> Any:
181 """ 191 """
@@ -213,7 +223,7 @@ class RequestContext: @@ -213,7 +223,7 @@ class RequestContext:
213 'details': {} 223 'details': {}
214 } 224 }
215 self.logger.error( 225 self.logger.error(
216 - f"设置错误信息 | {type(error).__name__}: {str(error)}", 226 + f"Set error info | {type(error).__name__}: {str(error)}",
217 extra={'reqid': self.reqid, 'uid': self.uid} 227 extra={'reqid': self.reqid, 'uid': self.uid}
218 ) 228 )
219 229
@@ -286,13 +296,13 @@ class RequestContext: @@ -286,13 +296,13 @@ class RequestContext:
286 296
287 # 构建详细的日志消息 297 # 构建详细的日志消息
288 msg_parts = [ 298 msg_parts = [
289 - f"搜索请求性能摘要 | reqid: {self.reqid}",  
290 - f"总耗时: {summary['performance']['total_duration_ms']:.2f}ms" 299 + f"Search request performance summary | reqid: {self.reqid}",
  300 + f"Total duration: {summary['performance']['total_duration_ms']:.2f}ms"
291 ] 301 ]
292 302
293 # 添加各阶段耗时 303 # 添加各阶段耗时
294 if summary['performance']['stage_timings_ms']: 304 if summary['performance']['stage_timings_ms']:
295 - msg_parts.append("阶段耗时:") 305 + msg_parts.append("Stage durations:")
296 for stage, duration in summary['performance']['stage_timings_ms'].items(): 306 for stage, duration in summary['performance']['stage_timings_ms'].items():
297 percentage = summary['performance']['stage_percentages'].get(stage, 0) 307 percentage = summary['performance']['stage_percentages'].get(stage, 0)
298 msg_parts.append(f" - {stage}: {duration:.2f}ms ({percentage}%)") 308 msg_parts.append(f" - {stage}: {duration:.2f}ms ({percentage}%)")
@@ -300,25 +310,26 @@ class RequestContext: @@ -300,25 +310,26 @@ class RequestContext:
300 # 添加查询信息 310 # 添加查询信息
301 if summary['query_analysis']['original_query']: 311 if summary['query_analysis']['original_query']:
302 msg_parts.append( 312 msg_parts.append(
303 - f"查询: '{summary['query_analysis']['original_query']}' " 313 + "Query: "
  314 + f"'{summary['query_analysis']['original_query']}' "
304 f"-> '{summary['query_analysis']['rewritten_query']}' " 315 f"-> '{summary['query_analysis']['rewritten_query']}' "
305 f"({summary['query_analysis']['detected_language']})" 316 f"({summary['query_analysis']['detected_language']})"
306 ) 317 )
307 318
308 # 添加结果统计 319 # 添加结果统计
309 msg_parts.append( 320 msg_parts.append(
310 - f"结果: {summary['results']['total_hits']} hits "  
311 - f"ES查询: {summary['results']['es_query_size']} chars" 321 + f"Results: {summary['results']['total_hits']} hits "
  322 + f"ES query size: {summary['results']['es_query_size']} chars"
312 ) 323 )
313 324
314 # 添加错误信息(如果有) 325 # 添加错误信息(如果有)
315 if summary['request_info']['has_error']: 326 if summary['request_info']['has_error']:
316 error_info = self.metadata['error_info'] 327 error_info = self.metadata['error_info']
317 - msg_parts.append(f"错误: {error_info['type']}: {error_info['message']}") 328 + msg_parts.append(f"Error: {error_info['type']}: {error_info['message']}")
318 329
319 # 添加警告信息(如果有) 330 # 添加警告信息(如果有)
320 if summary['request_info']['warnings_count'] > 0: 331 if summary['request_info']['warnings_count'] > 0:
321 - msg_parts.append(f"警告: {summary['request_info']['warnings_count']} 个") 332 + msg_parts.append(f"Warnings: {summary['request_info']['warnings_count']}")
322 333
323 log_message = " | ".join(msg_parts) 334 log_message = " | ".join(msg_parts)
324 335
docs/常用查询 - ES.md
@@ -17,4 +17,19 @@ curl -u &#39;essa:4hOaLaf41y2VuI8y&#39; -X GET &#39;http://localhost:9200/search_products/ @@ -17,4 +17,19 @@ curl -u &#39;essa:4hOaLaf41y2VuI8y&#39; -X GET &#39;http://localhost:9200/search_products/
17 ] 17 ]
18 } 18 }
19 } 19 }
20 - }'  
21 \ No newline at end of file 20 \ No newline at end of file
  21 + }'
  22 +
  23 +
  24 +
  25 +curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
  26 + "size": 5,
  27 + "query": {
  28 + "bool": {
  29 + "filter": [
  30 + { "term": { "spu_id": "74123" } }
  31 + ]
  32 + }
  33 + }
  34 + }'
  35 +
  36 +
docs/搜索API对接指南.md
@@ -104,7 +104,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -104,7 +104,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
104 "sort_by": "string", 104 "sort_by": "string",
105 "sort_order": "desc", 105 "sort_order": "desc",
106 "min_score": 0.0, 106 "min_score": 0.0,
107 - "sku_filter_dimension": "string", 107 + "sku_filter_dimension": ["string"],
108 "debug": false, 108 "debug": false,
109 "user_id": "string", 109 "user_id": "string",
110 "session_id": "string" 110 "session_id": "string"
@@ -127,7 +127,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -127,7 +127,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
127 | `sort_by` | string | N | null | 排序字段名(如 `min_price`, `max_price`) | 127 | `sort_by` | string | N | null | 排序字段名(如 `min_price`, `max_price`) |
128 | `sort_order` | string | N | "desc" | 排序方向:`asc`(升序)或 `desc`(降序) | 128 | `sort_order` | string | N | "desc" | 排序方向:`asc`(升序)或 `desc`(降序) |
129 | `min_score` | float | N | null | 最小相关性分数阈值 | 129 | `min_score` | float | N | null | 最小相关性分数阈值 |
130 -| `sku_filter_dimension` | string | N | null | 子SKU筛选维度(店铺配置)。指定后,每个SPU下的SKU将按该维度分组,每组选择第一个SKU返回。支持的值:`option1`、`option2`、`option3` 或 specifications 中的 name(如 `color`、`size`)。详见下文说明 | 130 +| `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(店铺配置)。指定后,每个SPU下的SKU将按这些维度的组合进行分组,每个组合只返回第一个SKU。支持的值:`option1`、`option2`、`option3` 或选项名称(如 `color`、`size`)。详见下文说明 |
131 | `debug` | boolean | N | false | 是否返回调试信息 | 131 | `debug` | boolean | N | false | 是否返回调试信息 |
132 | `user_id` | string | N | null | 用户ID(用于个性化,预留) | 132 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
133 | `session_id` | string | N | null | 会话ID(用于分析,预留) | 133 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
@@ -349,7 +349,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -349,7 +349,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
349 ### SKU筛选维度 (sku_filter_dimension) 349 ### SKU筛选维度 (sku_filter_dimension)
350 350
351 **功能说明**: 351 **功能说明**:
352 -`sku_filter_dimension` 用于控制每个SPU下返回的SKU数量。当指定此参数后,系统会按指定维度对SKU进行分组,每个分组只返回第一个SKU(从简实现,选择该维度下的第一款)。 352 +`sku_filter_dimension` 用于控制每个SPU下返回的SKU数量,为字符串列表。当指定此参数后,系统会按指定维度**组合**对SKU进行分组,每个维度组合只返回第一个SKU(从简实现,选择该组合下的第一款)。
353 353
354 **使用场景**: 354 **使用场景**:
355 - 店铺配置了SKU筛选维度(如 `color`),希望每个SPU下每种颜色只显示一个SKU 355 - 店铺配置了SKU筛选维度(如 `color`),希望每个SPU下每种颜色只显示一个SKU
@@ -360,8 +360,8 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -360,8 +360,8 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
360 1. **直接选项字段**: `option1`、`option2`、`option3` 360 1. **直接选项字段**: `option1`、`option2`、`option3`
361 - 直接使用对应的 `option1_value`、`option2_value`、`option3_value` 字段进行分组 361 - 直接使用对应的 `option1_value`、`option2_value`、`option3_value` 字段进行分组
362 362
363 -2. **规格名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配  
364 - - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: "color"` 来按颜色分组 363 +2. **规格/选项名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配
  364 + - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: ["color"]` 来按颜色分组
365 365
366 **示例**: 366 **示例**:
367 367
@@ -369,7 +369,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -369,7 +369,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
369 ```json 369 ```json
370 { 370 {
371 "query": "芭比娃娃", 371 "query": "芭比娃娃",
372 - "sku_filter_dimension": "color" 372 + "sku_filter_dimension": ["color"]
373 } 373 }
374 ``` 374 ```
375 375
@@ -377,7 +377,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -377,7 +377,7 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
377 ```json 377 ```json
378 { 378 {
379 "query": "芭比娃娃", 379 "query": "芭比娃娃",
380 - "sku_filter_dimension": "option1" 380 + "sku_filter_dimension": ["option1"]
381 } 381 }
382 ``` 382 ```
383 383
@@ -385,7 +385,15 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \ @@ -385,7 +385,15 @@ curl -X POST &quot;http://120.76.41.98:6002/search/&quot; \
385 ```json 385 ```json
386 { 386 {
387 "query": "芭比娃娃", 387 "query": "芭比娃娃",
388 - "sku_filter_dimension": "option2" 388 + "sku_filter_dimension": ["option2"]
  389 +}
  390 +```
  391 +
  392 +**按颜色 + 尺寸组合筛选(假设 option1_name = "color", option2_name = "size")**:
  393 +```json
  394 +{
  395 + "query": "芭比娃娃",
  396 + "sku_filter_dimension": ["color", "size"]
389 } 397 }
390 ``` 398 ```
391 399
frontend/index.html
@@ -25,6 +25,10 @@ @@ -25,6 +25,10 @@
25 <label for="tenantInput">tenant ID:</label> 25 <label for="tenantInput">tenant ID:</label>
26 <input type="text" id="tenantInput" placeholder="请输入租户ID" value="1"> 26 <input type="text" id="tenantInput" placeholder="请输入租户ID" value="1">
27 </div> 27 </div>
  28 + <div class="tenant-input-wrapper">
  29 + <label for="skuFilterDimension">sku_filter_dimension:</label>
  30 + <input type="text" id="skuFilterDimension" placeholder="SKU筛选维度" value="color">
  31 + </div>
28 <input type="text" id="searchInput" placeholder="输入搜索关键词... (支持中文、英文、俄文)" 32 <input type="text" id="searchInput" placeholder="输入搜索关键词... (支持中文、英文、俄文)"
29 onkeypress="handleKeyPress(event)"> 33 onkeypress="handleKeyPress(event)">
30 <button onclick="performSearch()" class="search-btn">Search</button> 34 <button onclick="performSearch()" class="search-btn">Search</button>
@@ -100,9 +104,10 @@ @@ -100,9 +104,10 @@
100 104
101 <div class="sort-right"> 105 <div class="sort-right">
102 <select id="resultSize" onchange="performSearch()"> 106 <select id="resultSize" onchange="performSearch()">
103 - <option value="10">10 per page</option>  
104 - <option value="20" selected>20 per page</option>  
105 - <option value="50">50 per page</option> 107 + <option value="20">20 per page</option>
  108 + <option value="50" selected>50 per page</option>
  109 + <option value="100">100 per page</option>
  110 + <option value="200">200 per page</option>
106 </select> 111 </select>
107 </div> 112 </div>
108 </div> 113 </div>
@@ -130,6 +135,6 @@ @@ -130,6 +135,6 @@
130 <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p> 135 <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p>
131 </footer> 136 </footer>
132 137
133 - <script src="/static/js/app.js?v=3.2"></script> 138 + <script src="/static/js/app.js?v=3.4"></script>
134 </body> 139 </body>
135 </html> 140 </html>
frontend/static/css/style.css
@@ -84,7 +84,8 @@ body { @@ -84,7 +84,8 @@ body {
84 white-space: nowrap; 84 white-space: nowrap;
85 } 85 }
86 86
87 -#tenantInput { 87 +#tenantInput,
  88 +#skuFilterDimension {
88 width: 120px; 89 width: 120px;
89 padding: 10px 15px; 90 padding: 10px 15px;
90 font-size: 14px; 91 font-size: 14px;
@@ -93,7 +94,8 @@ body { @@ -93,7 +94,8 @@ body {
93 outline: none; 94 outline: none;
94 } 95 }
95 96
96 -#tenantInput:focus { 97 +#tenantInput:focus,
  98 +#skuFilterDimension:focus {
97 border-color: #e74c3c; 99 border-color: #e74c3c;
98 } 100 }
99 101
frontend/static/js/app.js
@@ -14,6 +14,21 @@ function getTenantId() { @@ -14,6 +14,21 @@ function getTenantId() {
14 return '1'; // Default fallback 14 return '1'; // Default fallback
15 } 15 }
16 16
  17 +// Get sku_filter_dimension (as list) from input
  18 +function getSkuFilterDimension() {
  19 + const skuFilterInput = document.getElementById('skuFilterDimension');
  20 + if (skuFilterInput) {
  21 + const value = skuFilterInput.value.trim();
  22 + if (!value.length) {
  23 + return null;
  24 + }
  25 + // 支持用逗号分隔多个维度,例如:color,size 或 option1,color
  26 + const parts = value.split(',').map(v => v.trim()).filter(v => v.length > 0);
  27 + return parts.length > 0 ? parts : null;
  28 + }
  29 + return null;
  30 +}
  31 +
17 // State Management 32 // State Management
18 let state = { 33 let state = {
19 query: '', 34 query: '',
@@ -51,6 +66,7 @@ function toggleFilters() { @@ -51,6 +66,7 @@ function toggleFilters() {
51 async function performSearch(page = 1) { 66 async function performSearch(page = 1) {
52 const query = document.getElementById('searchInput').value.trim(); 67 const query = document.getElementById('searchInput').value.trim();
53 const tenantId = getTenantId(); 68 const tenantId = getTenantId();
  69 + const skuFilterDimension = getSkuFilterDimension();
54 70
55 if (!query) { 71 if (!query) {
56 alert('Please enter search keywords'); 72 alert('Please enter search keywords');
@@ -96,6 +112,7 @@ async function performSearch(page = 1) { @@ -96,6 +112,7 @@ async function performSearch(page = 1) {
96 facets: facets, 112 facets: facets,
97 sort_by: state.sortBy || null, 113 sort_by: state.sortBy || null,
98 sort_order: state.sortOrder, 114 sort_order: state.sortOrder,
  115 + sku_filter_dimension: skuFilterDimension,
99 debug: state.debug 116 debug: state.debug
100 }) 117 })
101 }); 118 });
@@ -93,7 +93,7 @@ def cmd_search(args): @@ -93,7 +93,7 @@ def cmd_search(args):
93 93
94 from query import QueryParser 94 from query import QueryParser
95 query_parser = QueryParser(config) 95 query_parser = QueryParser(config)
96 - searcher = Searcher(config, es_client, query_parser) 96 + searcher = Searcher(es_client, config, query_parser)
97 97
98 # Execute search 98 # Execute search
99 print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})") 99 print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})")
query/query_parser.py
@@ -9,13 +9,7 @@ import numpy as np @@ -9,13 +9,7 @@ import numpy as np
9 import logging 9 import logging
10 10
11 from embeddings import BgeEncoder 11 from embeddings import BgeEncoder
12 -from search.query_config import (  
13 - ENABLE_TEXT_EMBEDDING,  
14 - ENABLE_TRANSLATION,  
15 - REWRITE_DICTIONARY,  
16 - TRANSLATION_API_KEY,  
17 - TRANSLATION_SERVICE  
18 -) 12 +from config import SearchConfig
19 from .language_detector import LanguageDetector 13 from .language_detector import LanguageDetector
20 from .translator import Translator 14 from .translator import Translator
21 from .query_rewriter import QueryRewriter, QueryNormalizer 15 from .query_rewriter import QueryRewriter, QueryNormalizer
@@ -70,6 +64,7 @@ class QueryParser: @@ -70,6 +64,7 @@ class QueryParser:
70 64
71 def __init__( 65 def __init__(
72 self, 66 self,
  67 + config: SearchConfig,
73 text_encoder: Optional[BgeEncoder] = None, 68 text_encoder: Optional[BgeEncoder] = None,
74 translator: Optional[Translator] = None 69 translator: Optional[Translator] = None
75 ): 70 ):
@@ -77,21 +72,23 @@ class QueryParser: @@ -77,21 +72,23 @@ class QueryParser:
77 Initialize query parser. 72 Initialize query parser.
78 73
79 Args: 74 Args:
  75 + config: SearchConfig instance
80 text_encoder: Text embedding encoder (lazy loaded if not provided) 76 text_encoder: Text embedding encoder (lazy loaded if not provided)
81 translator: Translator instance (lazy loaded if not provided) 77 translator: Translator instance (lazy loaded if not provided)
82 """ 78 """
  79 + self.config = config
83 self._text_encoder = text_encoder 80 self._text_encoder = text_encoder
84 self._translator = translator 81 self._translator = translator
85 82
86 # Initialize components 83 # Initialize components
87 self.normalizer = QueryNormalizer() 84 self.normalizer = QueryNormalizer()
88 self.language_detector = LanguageDetector() 85 self.language_detector = LanguageDetector()
89 - self.rewriter = QueryRewriter(REWRITE_DICTIONARY) 86 + self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
90 87
91 @property 88 @property
92 def text_encoder(self) -> BgeEncoder: 89 def text_encoder(self) -> BgeEncoder:
93 """Lazy load text encoder.""" 90 """Lazy load text encoder."""
94 - if self._text_encoder is None and ENABLE_TEXT_EMBEDDING: 91 + if self._text_encoder is None and self.config.query_config.enable_text_embedding:
95 logger.info("Initializing text encoder (lazy load)...") 92 logger.info("Initializing text encoder (lazy load)...")
96 self._text_encoder = BgeEncoder() 93 self._text_encoder = BgeEncoder()
97 return self._text_encoder 94 return self._text_encoder
@@ -99,13 +96,13 @@ class QueryParser: @@ -99,13 +96,13 @@ class QueryParser:
99 @property 96 @property
100 def translator(self) -> Translator: 97 def translator(self) -> Translator:
101 """Lazy load translator.""" 98 """Lazy load translator."""
102 - if self._translator is None and ENABLE_TRANSLATION: 99 + if self._translator is None and self.config.query_config.enable_translation:
103 logger.info("Initializing translator (lazy load)...") 100 logger.info("Initializing translator (lazy load)...")
104 self._translator = Translator( 101 self._translator = Translator(
105 - api_key=TRANSLATION_API_KEY, 102 + api_key=self.config.query_config.translation_api_key,
106 use_cache=True, 103 use_cache=True,
107 - glossary_id=None, # Can be added to query_config if needed  
108 - translation_context='e-commerce product search' 104 + glossary_id=self.config.query_config.translation_glossary_id,
  105 + translation_context=self.config.query_config.translation_context
109 ) 106 )
110 return self._translator 107 return self._translator
111 108
@@ -156,7 +153,7 @@ class QueryParser: @@ -156,7 +153,7 @@ class QueryParser:
156 153
157 # Stage 2: Query rewriting 154 # Stage 2: Query rewriting
158 rewritten = None 155 rewritten = None
159 - if REWRITE_DICTIONARY: # Enable rewrite if dictionary exists 156 + if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
160 rewritten = self.rewriter.rewrite(query_text) 157 rewritten = self.rewriter.rewrite(query_text)
161 if rewritten != query_text: 158 if rewritten != query_text:
162 log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") 159 log_info(f"查询重写 | '{query_text}' -> '{rewritten}'")
@@ -173,7 +170,7 @@ class QueryParser: @@ -173,7 +170,7 @@ class QueryParser:
173 170
174 # Stage 4: Translation 171 # Stage 4: Translation
175 translations = {} 172 translations = {}
176 - if ENABLE_TRANSLATION: 173 + if self.config.query_config.enable_translation:
177 try: 174 try:
178 # Determine target languages for translation 175 # Determine target languages for translation
179 # Simplified: always translate to Chinese and English 176 # Simplified: always translate to Chinese and English
@@ -210,19 +207,47 @@ class QueryParser: @@ -210,19 +207,47 @@ class QueryParser:
210 # Stage 5: Text embedding 207 # Stage 5: Text embedding
211 query_vector = None 208 query_vector = None
212 if (generate_vector and 209 if (generate_vector and
213 - ENABLE_TEXT_EMBEDDING and 210 + self.config.query_config.enable_text_embedding and
214 domain == "default"): # Only generate vector for default domain 211 domain == "default"): # Only generate vector for default domain
215 - try:  
216 - log_debug("开始生成查询向量")  
217 - query_vector = self.text_encoder.encode([query_text])[0]  
218 - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}")  
219 - if context:  
220 - context.store_intermediate_result('query_vector_shape', query_vector.shape)  
221 - except Exception as e:  
222 - error_msg = f"查询向量生成失败 | 错误: {str(e)}"  
223 - log_info(error_msg)  
224 - if context:  
225 - context.add_warning(error_msg) 212 + # Get thresholds from config
  213 + chinese_limit = self.config.query_config.embedding_disable_chinese_char_limit
  214 + english_limit = self.config.query_config.embedding_disable_english_word_limit
  215 +
  216 + # Check if embedding should be disabled for short queries
  217 + should_disable_embedding = False
  218 + disable_reason = None
  219 +
  220 + if detected_lang == 'zh':
  221 + # For Chinese: disable embedding if character count <= threshold
  222 + char_count = len(query_text.strip())
  223 + if char_count <= chinese_limit:
  224 + should_disable_embedding = True
  225 + disable_reason = f"中文查询字数({char_count}) <= {chinese_limit},禁用向量搜索"
  226 + log_info(disable_reason)
  227 + if context:
  228 + context.store_intermediate_result('embedding_disabled_reason', disable_reason)
  229 + else:
  230 + # For all non-Chinese queries (English and any other detected language): disable embedding if word count <= threshold
  231 + word_count = len(query_text.strip().split())
  232 + if word_count <= english_limit:
  233 + should_disable_embedding = True
  234 + disable_reason = f"英文查询单词数({word_count}) <= {english_limit},禁用向量搜索"
  235 + log_info(disable_reason)
  236 + if context:
  237 + context.store_intermediate_result('embedding_disabled_reason', disable_reason)
  238 +
  239 + if not should_disable_embedding:
  240 + try:
  241 + log_debug("开始生成查询向量")
  242 + query_vector = self.text_encoder.encode([query_text])[0]
  243 + log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}")
  244 + if context:
  245 + context.store_intermediate_result('query_vector_shape', query_vector.shape)
  246 + except Exception as e:
  247 + error_msg = f"查询向量生成失败 | 错误: {str(e)}"
  248 + log_info(error_msg)
  249 + if context:
  250 + context.add_warning(error_msg)
226 251
227 # Build result 252 # Build result
228 result = ParsedQuery( 253 result = ParsedQuery(
search/es_query_builder.py
@@ -11,7 +11,7 @@ Simplified architecture: @@ -11,7 +11,7 @@ Simplified architecture:
11 from typing import Dict, Any, List, Optional, Union 11 from typing import Dict, Any, List, Optional, Union
12 import numpy as np 12 import numpy as np
13 from .boolean_parser import QueryNode 13 from .boolean_parser import QueryNode
14 -from .query_config import FUNCTION_SCORE_CONFIG 14 +from config import FunctionScoreConfig
15 15
16 16
17 class ESQueryBuilder: 17 class ESQueryBuilder:
@@ -23,7 +23,8 @@ class ESQueryBuilder: @@ -23,7 +23,8 @@ class ESQueryBuilder:
23 match_fields: List[str], 23 match_fields: List[str],
24 text_embedding_field: Optional[str] = None, 24 text_embedding_field: Optional[str] = None,
25 image_embedding_field: Optional[str] = None, 25 image_embedding_field: Optional[str] = None,
26 - source_fields: Optional[List[str]] = None 26 + source_fields: Optional[List[str]] = None,
  27 + function_score_config: Optional[FunctionScoreConfig] = None
27 ): 28 ):
28 """ 29 """
29 Initialize query builder. 30 Initialize query builder.
@@ -34,12 +35,14 @@ class ESQueryBuilder: @@ -34,12 +35,14 @@ class ESQueryBuilder:
34 text_embedding_field: Field name for text embeddings 35 text_embedding_field: Field name for text embeddings
35 image_embedding_field: Field name for image embeddings 36 image_embedding_field: Field name for image embeddings
36 source_fields: Fields to return in search results (_source includes) 37 source_fields: Fields to return in search results (_source includes)
  38 + function_score_config: Function score configuration
37 """ 39 """
38 self.index_name = index_name 40 self.index_name = index_name
39 self.match_fields = match_fields 41 self.match_fields = match_fields
40 self.text_embedding_field = text_embedding_field 42 self.text_embedding_field = text_embedding_field
41 self.image_embedding_field = image_embedding_field 43 self.image_embedding_field = image_embedding_field
42 self.source_fields = source_fields 44 self.source_fields = source_fields
  45 + self.function_score_config = function_score_config
43 46
44 def build_query( 47 def build_query(
45 self, 48 self,
@@ -182,12 +185,15 @@ class ESQueryBuilder: @@ -182,12 +185,15 @@ class ESQueryBuilder:
182 return query 185 return query
183 186
184 # Build function_score query 187 # Build function_score query
  188 + score_mode = self.function_score_config.score_mode if self.function_score_config else "sum"
  189 + boost_mode = self.function_score_config.boost_mode if self.function_score_config else "multiply"
  190 +
185 function_score_query = { 191 function_score_query = {
186 "function_score": { 192 "function_score": {
187 "query": query, 193 "query": query,
188 "functions": functions, 194 "functions": functions,
189 - "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"),  
190 - "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") 195 + "score_mode": score_mode,
  196 + "boost_mode": boost_mode
191 } 197 }
192 } 198 }
193 199
@@ -201,7 +207,10 @@ class ESQueryBuilder: @@ -201,7 +207,10 @@ class ESQueryBuilder:
201 List of function score functions 207 List of function score functions
202 """ 208 """
203 functions = [] 209 functions = []
204 - config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) 210 + if not self.function_score_config:
  211 + return functions
  212 +
  213 + config_functions = self.function_score_config.functions or []
205 214
206 for func_config in config_functions: 215 for func_config in config_functions:
207 func_type = func_config.get("type") 216 func_type = func_config.get("type")
search/searcher.py
@@ -5,7 +5,7 @@ Handles query parsing, boolean expressions, ranking, and result formatting. @@ -5,7 +5,7 @@ Handles query parsing, boolean expressions, ranking, and result formatting.
5 """ 5 """
6 6
7 from typing import Dict, Any, List, Optional, Union 7 from typing import Dict, Any, List, Optional, Union
8 -import time 8 +import time, json
9 import logging 9 import logging
10 10
11 from utils.es_client import ESClient 11 from utils.es_client import ESClient
@@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder @@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder
14 from .boolean_parser import BooleanParser, QueryNode 14 from .boolean_parser import BooleanParser, QueryNode
15 from .es_query_builder import ESQueryBuilder 15 from .es_query_builder import ESQueryBuilder
16 from .rerank_engine import RerankEngine 16 from .rerank_engine import RerankEngine
17 -from .query_config import (  
18 - DEFAULT_INDEX_NAME,  
19 - DEFAULT_MATCH_FIELDS,  
20 - TEXT_EMBEDDING_FIELD,  
21 - IMAGE_EMBEDDING_FIELD,  
22 - SOURCE_FIELDS,  
23 - ENABLE_TRANSLATION,  
24 - ENABLE_TEXT_EMBEDDING,  
25 - RANKING_EXPRESSION  
26 -) 17 +from config import SearchConfig
  18 +from config.utils import get_match_fields_for_index
27 from context.request_context import RequestContext, RequestContextStage, create_request_context 19 from context.request_context import RequestContext, RequestContextStage, create_request_context
28 from api.models import FacetResult, FacetValue 20 from api.models import FacetResult, FacetValue
29 from api.result_formatter import ResultFormatter 21 from api.result_formatter import ResultFormatter
@@ -87,37 +79,40 @@ class Searcher: @@ -87,37 +79,40 @@ class Searcher:
87 def __init__( 79 def __init__(
88 self, 80 self,
89 es_client: ESClient, 81 es_client: ESClient,
90 - query_parser: Optional[QueryParser] = None,  
91 - index_name: str = DEFAULT_INDEX_NAME 82 + config: SearchConfig,
  83 + query_parser: Optional[QueryParser] = None
92 ): 84 ):
93 """ 85 """
94 Initialize searcher. 86 Initialize searcher.
95 87
96 Args: 88 Args:
97 es_client: Elasticsearch client 89 es_client: Elasticsearch client
  90 + config: SearchConfig instance
98 query_parser: Query parser (created if not provided) 91 query_parser: Query parser (created if not provided)
99 - index_name: ES index name (default: search_products)  
100 """ 92 """
101 self.es_client = es_client 93 self.es_client = es_client
102 - self.index_name = index_name  
103 - self.query_parser = query_parser or QueryParser() 94 + self.config = config
  95 + self.index_name = config.es_index_name
  96 + self.query_parser = query_parser or QueryParser(config)
104 97
105 # Initialize components 98 # Initialize components
106 self.boolean_parser = BooleanParser() 99 self.boolean_parser = BooleanParser()
107 - self.rerank_engine = RerankEngine(RANKING_EXPRESSION, enabled=False) 100 + self.rerank_engine = RerankEngine(config.ranking.expression, enabled=False)
108 101
109 - # Use constants from query_config  
110 - self.match_fields = DEFAULT_MATCH_FIELDS  
111 - self.text_embedding_field = TEXT_EMBEDDING_FIELD  
112 - self.image_embedding_field = IMAGE_EMBEDDING_FIELD 102 + # Get match fields from config
  103 + self.match_fields = get_match_fields_for_index(config, "default")
  104 + self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding"
  105 + self.image_embedding_field = config.query_config.image_embedding_field or "image_embedding"
  106 + self.source_fields = config.query_config.source_fields or []
113 107
114 # Query builder - simplified single-layer architecture 108 # Query builder - simplified single-layer architecture
115 self.query_builder = ESQueryBuilder( 109 self.query_builder = ESQueryBuilder(
116 - index_name=index_name, 110 + index_name=self.index_name,
117 match_fields=self.match_fields, 111 match_fields=self.match_fields,
118 text_embedding_field=self.text_embedding_field, 112 text_embedding_field=self.text_embedding_field,
119 image_embedding_field=self.image_embedding_field, 113 image_embedding_field=self.image_embedding_field,
120 - source_fields=SOURCE_FIELDS 114 + source_fields=self.source_fields,
  115 + function_score_config=self.config.function_score
121 ) 116 )
122 117
123 def search( 118 def search(
@@ -135,7 +130,7 @@ class Searcher: @@ -135,7 +130,7 @@ class Searcher:
135 sort_order: Optional[str] = "desc", 130 sort_order: Optional[str] = "desc",
136 debug: bool = False, 131 debug: bool = False,
137 language: str = "zh", 132 language: str = "zh",
138 - sku_filter_dimension: Optional[str] = None, 133 + sku_filter_dimension: Optional[List[str]] = None,
139 ) -> SearchResult: 134 ) -> SearchResult:
140 """ 135 """
141 Execute search query (外部友好格式). 136 Execute search query (外部友好格式).
@@ -162,8 +157,8 @@ class Searcher: @@ -162,8 +157,8 @@ class Searcher:
162 context = create_request_context() 157 context = create_request_context()
163 158
164 # Always use config defaults (these are backend configuration, not user parameters) 159 # Always use config defaults (these are backend configuration, not user parameters)
165 - enable_translation = ENABLE_TRANSLATION  
166 - enable_embedding = ENABLE_TEXT_EMBEDDING 160 + enable_translation = self.config.query_config.enable_translation
  161 + enable_embedding = self.config.query_config.enable_text_embedding
167 enable_rerank = False # Temporarily disabled 162 enable_rerank = False # Temporarily disabled
168 163
169 # Start timing 164 # Start timing
@@ -305,14 +300,14 @@ class Searcher: @@ -305,14 +300,14 @@ class Searcher:
305 context.store_intermediate_result('es_query', es_query) 300 context.store_intermediate_result('es_query', es_query)
306 context.store_intermediate_result('es_body_for_search', body_for_es) 301 context.store_intermediate_result('es_body_for_search', body_for_es)
307 302
  303 + # Serialize ES query as a compact JSON string (no spaces or newlines)
  304 + es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(',', ':'))
  305 +
308 context.logger.info( 306 context.logger.info(
309 - f"ES查询构建完成 | 大小: {len(str(es_query))}字符 | "  
310 - f"KNN: {'是' if enable_embedding and parsed_query.query_vector is not None else '否'} | "  
311 - f"分面: {'是' if facets else '否'}",  
312 - extra={'reqid': context.reqid, 'uid': context.uid}  
313 - )  
314 - context.logger.debug(  
315 - f"ES查询详情: {es_query}", 307 + f"ES query built | size: {len(es_query_compact)} chars | "
  308 + f"KNN: {'yes' if enable_embedding and parsed_query.query_vector is not None else 'no'} | "
  309 + f"facets: {'yes' if facets else 'no'} | "
  310 + f"query: {es_query_compact}",
316 extra={'reqid': context.reqid, 'uid': context.uid} 311 extra={'reqid': context.reqid, 'uid': context.uid}
317 ) 312 )
318 except Exception as e: 313 except Exception as e:
@@ -508,9 +503,9 @@ class Searcher: @@ -508,9 +503,9 @@ class Searcher:
508 } 503 }
509 504
510 # Add _source filtering if source_fields are configured 505 # Add _source filtering if source_fields are configured
511 - if SOURCE_FIELDS: 506 + if self.source_fields:
512 es_query["_source"] = { 507 es_query["_source"] = {
513 - "includes": SOURCE_FIELDS 508 + "includes": self.source_fields
514 } 509 }
515 510
516 if filters or range_filters: 511 if filters or range_filters:
@@ -137,8 +137,8 @@ def mock_es_client() -> Mock: @@ -137,8 +137,8 @@ def mock_es_client() -> Mock:
137 def test_searcher(sample_search_config, mock_es_client) -> Searcher: 137 def test_searcher(sample_search_config, mock_es_client) -> Searcher:
138 """测试用Searcher实例""" 138 """测试用Searcher实例"""
139 return Searcher( 139 return Searcher(
140 - config=sample_search_config,  
141 - es_client=mock_es_client 140 + es_client=mock_es_client,
  141 + config=sample_search_config
142 ) 142 )
143 143
144 144
@@ -38,10 +38,11 @@ class StructuredFormatter(logging.Formatter): @@ -38,10 +38,11 @@ class StructuredFormatter(logging.Formatter):
38 # Add request context if available 38 # Add request context if available
39 reqid = getattr(record, 'reqid', None) 39 reqid = getattr(record, 'reqid', None)
40 uid = getattr(record, 'uid', None) 40 uid = getattr(record, 'uid', None)
41 - if reqid or uid: 41 + if reqid is not None or uid is not None:
  42 + # Normalize missing values to "-1" for easier correlation
42 log_entry['request_context'] = { 43 log_entry['request_context'] = {
43 - 'reqid': reqid,  
44 - 'uid': uid 44 + 'reqid': reqid if reqid is not None else "-1",
  45 + 'uid': uid if uid is not None else "-1"
45 } 46 }
46 47
47 # Add extra data if available 48 # Add extra data if available
@@ -98,13 +99,31 @@ class RequestContextFilter(logging.Filter): @@ -98,13 +99,31 @@ class RequestContextFilter(logging.Filter):
98 from context.request_context import get_current_request_context 99 from context.request_context import get_current_request_context
99 context = get_current_request_context() 100 context = get_current_request_context()
100 if context: 101 if context:
101 - record.reqid = context.reqid  
102 - record.uid = context.uid 102 + # Ensure every request-scoped log record carries reqid/uid.
  103 + # If they are missing in the context, fall back to "-1".
  104 + record.reqid = getattr(context, "reqid", None) or "-1"
  105 + record.uid = getattr(context, "uid", None) or "-1"
103 except (ImportError, AttributeError): 106 except (ImportError, AttributeError):
104 pass 107 pass
105 return True 108 return True
106 109
107 110
  111 +class ContextAwareConsoleFormatter(logging.Formatter):
  112 + """
  113 + Console formatter that injects reqid/uid into the log line.
  114 +
  115 + For non-request logs (no context), reqid/uid will be "-1".
  116 + """
  117 +
  118 + def format(self, record: logging.LogRecord) -> str:
  119 + # Provide safe defaults so format string never fails
  120 + if not hasattr(record, "reqid"):
  121 + record.reqid = "-1"
  122 + if not hasattr(record, "uid"):
  123 + record.uid = "-1"
  124 + return super().format(record)
  125 +
  126 +
108 def setup_logging( 127 def setup_logging(
109 log_level: str = "INFO", 128 log_level: str = "INFO",
110 log_dir: str = "logs", 129 log_dir: str = "logs",
@@ -137,8 +156,8 @@ def setup_logging( @@ -137,8 +156,8 @@ def setup_logging(
137 156
138 # Create formatters 157 # Create formatters
139 structured_formatter = StructuredFormatter() 158 structured_formatter = StructuredFormatter()
140 - console_formatter = logging.Formatter(  
141 - '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s' 159 + console_formatter = ContextAwareConsoleFormatter(
  160 + '%(asctime)s | reqid:%(reqid)s | uid:%(uid)s | %(levelname)-8s | %(name)-15s | %(message)s'
142 ) 161 )
143 162
144 # Add console handler 163 # Add console handler