Compare View
Commits (4)
-
query config/ranking config优化
-
sku_filter_dimension=color 或 sku_filter_dimension=option1 / option2 / option3,以上两种方式都可以
-
后端请求模型变更(api/models.py) SearchRequest.sku_filter_dimension 从 Optional[str] 改为 Optional[List[str]]。 语义:列表表示一个或多个“维度标签”,例如: 单维度:["color"]、["option1"] 多维度:["color", "size"]、["option1", "option2"] 描述更新为:对维度组合进行分组,每个组合只保留一个 SKU。 结果格式化与去重逻辑(api/result_formatter.py) ResultFormatter.format_search_results(..., sku_filter_dimension: Optional[List[str]] = None),调用处已同步更新。 单维度旧逻辑升级为多维度逻辑: 新方法:_filter_skus_by_dimensions(skus, dimensions, option1_name, option2_name, option3_name, specifications)。 维度解析规则(按顺序处理,并去重): 若维度是 option1 / option2 / option3 → 对应 option1_value / option2_value / option3_value。 否则,将维度字符串转小写后,分别与(同样转小写的)option1_name / option2_name / option3_name 对比,相等则映射到对应的 option*_value。 未能映射到任何字段的维度会被忽略。 对每个 SKU: 按解析出的字段列表(例如 ["option1_value", "option2_value"])取值,组成 key,如 ("red", "L");None 用空串 ""。 按 key 分组,每个 key 只保留遇到的第一个 SKU。 若列表为空,则不做过滤,返回原始 skus;若列表非空但所有维度都无法解析,则返回空列表(不返回任何子 SKU)。 Searcher 参数类型同步(search/searcher.py) Searcher.search(...) 中 sku_filter_dimension 参数类型从 Optional[str] 改为 Optional[List[str]]。 传给 ResultFormatter.format_search_results 时,直接传该列表。 前端参数格式调整(frontend/static/js/app.js) 输入框 #skuFilterDimension 依旧是一个文本框,但解析方式改为: 函数 getSkuFilterDimension(): 读取文本,如:"color" 或 "color,size" 或 "option1, color"。 用逗号 , 拆分,trim() 后过滤空串,返回字符串数组,例如: "color" → ["color"] "color,size" → ["color", "size"] 若最终数组为空,则返回 null。 搜索请求体中仍使用字段名 sku_filter_dimension,但现在值是 string[] 或 null: body: JSON.stringify({ // ... 
sku_filter_dimension: skuFilterDimension, // 例如 ["color", "size"] debug: state.debug }) 文档更新(docs/搜索API对接指南.md) 请求体示例中的类型由: "sku_filter_dimension": "string" 改为: "sku_filter_dimension": ["string"] 参数表中: 从 string 改为 array[string],说明为“维度列表,按组合分组,每个组合保留一个 SKU”。 功能说明章节“SKU筛选维度 (sku_filter_dimension)”已调整为 列表语义 + 组合去重,并补充了示例: 单维度: { "query": "芭比娃娃", "sku_filter_dimension": ["color"] } 多维度组合: { "query": "芭比娃娃", "sku_filter_dimension": ["color", "size"] } 使用方式总结 单维度去重(保持旧行为的等价写法) 旧:"sku_filter_dimension": "color" 新:"sku_filter_dimension": ["color"] 多维度组合去重(你新提的需求) 例如希望“每个 SPU 下,同一颜色+尺码组合只保留一个 SKU”: { "query": "芭比娃娃", "sku_filter_dimension": ["color", "size"] }
Showing
21 changed files
Show diff stats
api/app.py
| ... | ... | @@ -41,15 +41,16 @@ limiter = Limiter(key_func=get_remote_address) |
| 41 | 41 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 42 | 42 | |
| 43 | 43 | from config.env_config import ES_CONFIG |
| 44 | +from config import ConfigLoader | |
| 44 | 45 | from utils import ESClient |
| 45 | 46 | from search import Searcher |
| 46 | -from search.query_config import DEFAULT_INDEX_NAME | |
| 47 | 47 | from query import QueryParser |
| 48 | 48 | |
| 49 | 49 | # Global instances |
| 50 | 50 | _es_client: Optional[ESClient] = None |
| 51 | 51 | _searcher: Optional[Searcher] = None |
| 52 | 52 | _query_parser: Optional[QueryParser] = None |
| 53 | +_config = None | |
| 53 | 54 | |
| 54 | 55 | |
| 55 | 56 | def init_service(es_host: str = "http://localhost:9200"): |
| ... | ... | @@ -59,11 +60,17 @@ def init_service(es_host: str = "http://localhost:9200"): |
| 59 | 60 | Args: |
| 60 | 61 | es_host: Elasticsearch host URL |
| 61 | 62 | """ |
| 62 | - global _es_client, _searcher, _query_parser | |
| 63 | + global _es_client, _searcher, _query_parser, _config | |
| 63 | 64 | |
| 64 | 65 | start_time = time.time() |
| 65 | 66 | logger.info("Initializing search service (multi-tenant)") |
| 66 | 67 | |
| 68 | + # Load configuration | |
| 69 | + logger.info("Loading configuration...") | |
| 70 | + config_loader = ConfigLoader("config/config.yaml") | |
| 71 | + _config = config_loader.load_config() | |
| 72 | + logger.info("Configuration loaded") | |
| 73 | + | |
| 67 | 74 | # Get ES credentials |
| 68 | 75 | es_username = os.getenv('ES_USERNAME') or ES_CONFIG.get('username') |
| 69 | 76 | es_password = os.getenv('ES_PASSWORD') or ES_CONFIG.get('password') |
| ... | ... | @@ -81,13 +88,13 @@ def init_service(es_host: str = "http://localhost:9200"): |
| 81 | 88 | |
| 82 | 89 | # Initialize components |
| 83 | 90 | logger.info("Initializing query parser...") |
| 84 | - _query_parser = QueryParser() | |
| 91 | + _query_parser = QueryParser(_config) | |
| 85 | 92 | |
| 86 | 93 | logger.info("Initializing searcher...") |
| 87 | - _searcher = Searcher(_es_client, _query_parser, index_name=DEFAULT_INDEX_NAME) | |
| 94 | + _searcher = Searcher(_es_client, _config, _query_parser) | |
| 88 | 95 | |
| 89 | 96 | elapsed = time.time() - start_time |
| 90 | - logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {DEFAULT_INDEX_NAME}") | |
| 97 | + logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}") | |
| 91 | 98 | |
| 92 | 99 | |
| 93 | 100 | |
| ... | ... | @@ -113,6 +120,13 @@ def get_query_parser() -> QueryParser: |
| 113 | 120 | return _query_parser |
| 114 | 121 | |
| 115 | 122 | |
| 123 | +def get_config(): | |
| 124 | + """Get global config instance.""" | |
| 125 | + if _config is None: | |
| 126 | + raise RuntimeError("Service not initialized") | |
| 127 | + return _config | |
| 128 | + | |
| 129 | + | |
| 116 | 130 | # Create FastAPI app with enhanced configuration |
| 117 | 131 | app = FastAPI( |
| 118 | 132 | title="E-Commerce Search API", | ... | ... |
api/models.py
| ... | ... | @@ -146,9 +146,14 @@ class SearchRequest(BaseModel): |
| 146 | 146 | debug: bool = Field(False, description="是否返回调试信息") |
| 147 | 147 | |
| 148 | 148 | # SKU筛选参数 |
| 149 | - sku_filter_dimension: Optional[str] = Field( | |
| 149 | + sku_filter_dimension: Optional[List[str]] = Field( | |
| 150 | 150 | None, |
| 151 | - description="子SKU筛选维度(店铺配置)。指定后,每个SPU下的SKU将按该维度分组,每组选择第一个SKU返回。例如:'color'表示按颜色分组,每种颜色选一款。支持的值:'option1'、'option2'、'option3'或specifications中的name(如'color'、'size')" | |
| 151 | + description=( | |
| 152 | + "子SKU筛选维度(店铺配置),为字符串列表。" | |
| 153 | + "指定后,每个SPU下的SKU将按这些维度的组合进行分组,每个维度组合只保留一个SKU返回。" | |
| 154 | + "例如:['color'] 表示按颜色分组,每种颜色选一款;['color', 'size'] 表示按颜色+尺码组合分组。" | |
| 155 | + "支持的值:'option1'、'option2'、'option3' 或选项名称(如 'color'、'size',将通过 option1_name/2_name/3_name 匹配)。" | |
| 156 | + ) | |
| 152 | 157 | ) |
| 153 | 158 | |
| 154 | 159 | # 个性化参数(预留) | ... | ... |
api/result_formatter.py
| ... | ... | @@ -14,7 +14,7 @@ class ResultFormatter: |
| 14 | 14 | es_hits: List[Dict[str, Any]], |
| 15 | 15 | max_score: float = 1.0, |
| 16 | 16 | language: str = "zh", |
| 17 | - sku_filter_dimension: Optional[str] = None | |
| 17 | + sku_filter_dimension: Optional[List[str]] = None | |
| 18 | 18 | ) -> List[SpuResult]: |
| 19 | 19 | """ |
| 20 | 20 | Convert ES hits to SpuResult list. |
| ... | ... | @@ -85,10 +85,10 @@ class ResultFormatter: |
| 85 | 85 | ) |
| 86 | 86 | skus.append(sku) |
| 87 | 87 | |
| 88 | - # Apply SKU filtering if dimension is specified | |
| 88 | + # Apply SKU filtering if dimension list is specified | |
| 89 | 89 | if sku_filter_dimension and skus: |
| 90 | - skus = ResultFormatter._filter_skus_by_dimension( | |
| 91 | - skus, | |
| 90 | + skus = ResultFormatter._filter_skus_by_dimensions( | |
| 91 | + skus, | |
| 92 | 92 | sku_filter_dimension, |
| 93 | 93 | source.get('option1_name'), |
| 94 | 94 | source.get('option2_name'), |
| ... | ... | @@ -138,22 +138,22 @@ class ResultFormatter: |
| 138 | 138 | return results |
| 139 | 139 | |
| 140 | 140 | @staticmethod |
| 141 | - def _filter_skus_by_dimension( | |
| 141 | + def _filter_skus_by_dimensions( | |
| 142 | 142 | skus: List[SkuResult], |
| 143 | - dimension: str, | |
| 143 | + dimensions: List[str], | |
| 144 | 144 | option1_name: Optional[str] = None, |
| 145 | 145 | option2_name: Optional[str] = None, |
| 146 | 146 | option3_name: Optional[str] = None, |
| 147 | 147 | specifications: Optional[List[Dict[str, Any]]] = None |
| 148 | 148 | ) -> List[SkuResult]: |
| 149 | 149 | """ |
| 150 | - Filter SKUs by dimension, keeping only one SKU per dimension value. | |
| 150 | + Filter SKUs by one or more dimensions, keeping only one SKU per dimension value combination. | |
| 151 | 151 | |
| 152 | 152 | Args: |
| 153 | 153 | skus: List of SKU results to filter |
| 154 | - dimension: Filter dimension, can be: | |
| 154 | + dimensions: Filter dimensions, each dimension can be: | |
| 155 | 155 | - 'option1', 'option2', 'option3': Direct option field |
| 156 | - - A specification name (e.g., 'color', 'size'): Match by option name | |
| 156 | + - A specification/option name (e.g., 'color', 'size'): Match by option name | |
| 157 | 157 | option1_name: Name of option1 (e.g., 'color') |
| 158 | 158 | option2_name: Name of option2 (e.g., 'size') |
| 159 | 159 | option3_name: Name of option3 |
| ... | ... | @@ -162,54 +162,59 @@ class ResultFormatter: |
| 162 | 162 | Returns: |
| 163 | 163 | Filtered list of SKUs (one per dimension value) |
| 164 | 164 | """ |
| 165 | - if not skus: | |
| 165 | + if not skus or not dimensions: | |
| 166 | 166 | return skus |
| 167 | - | |
| 168 | - # Determine which field to use for filtering | |
| 169 | - filter_field = None | |
| 170 | - | |
| 171 | - # Direct option field (option1, option2, option3) | |
| 172 | - if dimension.lower() == 'option1': | |
| 173 | - filter_field = 'option1_value' | |
| 174 | - elif dimension.lower() == 'option2': | |
| 175 | - filter_field = 'option2_value' | |
| 176 | - elif dimension.lower() == 'option3': | |
| 177 | - filter_field = 'option3_value' | |
| 178 | - else: | |
| 179 | - # Try to match by option name | |
| 180 | - dimension_lower = dimension.lower() | |
| 181 | - if option1_name and option1_name.lower() == dimension_lower: | |
| 182 | - filter_field = 'option1_value' | |
| 183 | - elif option2_name and option2_name.lower() == dimension_lower: | |
| 184 | - filter_field = 'option2_value' | |
| 185 | - elif option3_name and option3_name.lower() == dimension_lower: | |
| 186 | - filter_field = 'option3_value' | |
| 187 | - | |
| 188 | - # If no matching field found, return all SKUs (no filtering) | |
| 189 | - if not filter_field: | |
| 190 | - return skus | |
| 191 | - | |
| 192 | - # Group SKUs by dimension value and select first one from each group | |
| 193 | - dimension_groups: Dict[str, SkuResult] = {} | |
| 194 | - | |
| 167 | + | |
| 168 | + # Resolve each dimension to an underlying SKU field (option1_value / option2_value / option3_value) | |
| 169 | + filter_fields: List[str] = [] | |
| 170 | + | |
| 171 | + for dim in dimensions: | |
| 172 | + if not dim: | |
| 173 | + continue | |
| 174 | + dim_lower = dim.lower() | |
| 175 | + | |
| 176 | + field_name: Optional[str] = None | |
| 177 | + # Direct option field (option1, option2, option3) | |
| 178 | + if dim_lower == 'option1': | |
| 179 | + field_name = 'option1_value' | |
| 180 | + elif dim_lower == 'option2': | |
| 181 | + field_name = 'option2_value' | |
| 182 | + elif dim_lower == 'option3': | |
| 183 | + field_name = 'option3_value' | |
| 184 | + else: | |
| 185 | + # Try to match by option name | |
| 186 | + if option1_name and option1_name.lower() == dim_lower: | |
| 187 | + field_name = 'option1_value' | |
| 188 | + elif option2_name and option2_name.lower() == dim_lower: | |
| 189 | + field_name = 'option2_value' | |
| 190 | + elif option3_name and option3_name.lower() == dim_lower: | |
| 191 | + field_name = 'option3_value' | |
| 192 | + | |
| 193 | + if field_name and field_name not in filter_fields: | |
| 194 | + filter_fields.append(field_name) | |
| 195 | + | |
| 196 | + # If no matching field found for all dimensions, do not return any child SKUs | |
| 197 | + if not filter_fields: | |
| 198 | + return [] | |
| 199 | + | |
| 200 | + # Group SKUs by dimension value combination and select first one from each group | |
| 201 | + dimension_groups: Dict[tuple, SkuResult] = {} | |
| 202 | + | |
| 195 | 203 | for sku in skus: |
| 196 | - # Get dimension value from the determined field | |
| 197 | - dimension_value = None | |
| 198 | - if filter_field == 'option1_value': | |
| 199 | - dimension_value = sku.option1_value | |
| 200 | - elif filter_field == 'option2_value': | |
| 201 | - dimension_value = sku.option2_value | |
| 202 | - elif filter_field == 'option3_value': | |
| 203 | - dimension_value = sku.option3_value | |
| 204 | - | |
| 205 | - # Use empty string as key for None values | |
| 206 | - key = str(dimension_value) if dimension_value is not None else '' | |
| 207 | - | |
| 208 | - # Keep first SKU for each dimension value | |
| 204 | + # Build key as combination of all dimension values | |
| 205 | + key_values: List[str] = [] | |
| 206 | + for field in filter_fields: | |
| 207 | + dimension_value = getattr(sku, field, None) | |
| 208 | + # Use empty string as key part for None values | |
| 209 | + key_values.append(str(dimension_value) if dimension_value is not None else '') | |
| 210 | + | |
| 211 | + key = tuple(key_values) | |
| 212 | + | |
| 213 | + # Keep first SKU for each dimension combination | |
| 209 | 214 | if key not in dimension_groups: |
| 210 | 215 | dimension_groups[key] = sku |
| 211 | - | |
| 212 | - # Return filtered SKUs (one per dimension value) | |
| 216 | + | |
| 217 | + # Return filtered SKUs (one per dimension combination) | |
| 213 | 218 | return list(dimension_groups.values()) |
| 214 | 219 | |
| 215 | 220 | @staticmethod | ... | ... |
api/routes/search.py
| ... | ... | @@ -24,8 +24,8 @@ def extract_request_info(request: Request) -> tuple[str, str]: |
| 24 | 24 | # Try to get request ID from headers |
| 25 | 25 | reqid = request.headers.get('X-Request-ID') or str(uuid.uuid4())[:8] |
| 26 | 26 | |
| 27 | - # Try to get user ID from headers or default to anonymous | |
| 28 | - uid = request.headers.get('X-User-ID') or request.headers.get('User-ID') or 'anonymous' | |
| 27 | + # Try to get user ID from headers; if not found, use "-1" for correlation | |
| 28 | + uid = request.headers.get('X-User-ID') or request.headers.get('User-ID') or "-1" | |
| 29 | 29 | |
| 30 | 30 | return reqid, uid |
| 31 | 31 | |
| ... | ... | @@ -70,10 +70,24 @@ async def search(request: SearchRequest, http_request: Request): |
| 70 | 70 | set_current_request_context(context) |
| 71 | 71 | |
| 72 | 72 | try: |
| 73 | - # Log request start | |
| 73 | + # Log request start (English logs, with key search parameters) | |
| 74 | + client_ip = http_request.client.host if http_request.client else "unknown" | |
| 75 | + user_agent = http_request.headers.get("User-Agent", "unknown")[:200] | |
| 74 | 76 | context.logger.info( |
| 75 | - f"收到搜索请求 | Tenant: {tenant_id} | IP: {http_request.client.host if http_request.client else 'unknown'} | " | |
| 76 | - f"用户代理: {http_request.headers.get('User-Agent', 'unknown')[:100]}", | |
| 77 | + "Received search request | " | |
| 78 | + f"Tenant: {tenant_id} | " | |
| 79 | + f"Query: {request.query} | " | |
| 80 | + f"IP: {client_ip} | " | |
| 81 | + f"User agent: {user_agent} | " | |
| 82 | + f"size: {request.size} | from: {request.from_} | " | |
| 83 | + f"sort_by: {request.sort_by} | sort_order: {request.sort_order} | " | |
| 84 | + f"min_score: {request.min_score} | " | |
| 85 | + f"language: {request.language} | " | |
| 86 | + f"debug: {request.debug} | " | |
| 87 | + f"sku_filter_dimension: {request.sku_filter_dimension} | " | |
| 88 | + f"filters: {request.filters} | " | |
| 89 | + f"range_filters: {request.range_filters} | " | |
| 90 | + f"facets: {request.facets}", | |
| 77 | 91 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 78 | 92 | ) |
| 79 | 93 | |
| ... | ... | @@ -121,7 +135,7 @@ async def search(request: SearchRequest, http_request: Request): |
| 121 | 135 | if context: |
| 122 | 136 | context.set_error(e) |
| 123 | 137 | context.logger.error( |
| 124 | - f"搜索请求失败 | 错误: {str(e)}", | |
| 138 | + f"Search request failed | error: {str(e)}", | |
| 125 | 139 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 126 | 140 | ) |
| 127 | 141 | raise HTTPException(status_code=500, detail=str(e)) |
| ... | ... | @@ -164,10 +178,13 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): |
| 164 | 178 | set_current_request_context(context) |
| 165 | 179 | |
| 166 | 180 | try: |
| 167 | - # Log request start | |
| 181 | + # Log request start for image search (English) | |
| 182 | + client_ip = http_request.client.host if http_request.client else "unknown" | |
| 168 | 183 | context.logger.info( |
| 169 | - f"收到图片搜索请求 | Tenant: {tenant_id} | 图片URL: {request.image_url} | " | |
| 170 | - f"IP: {http_request.client.host if http_request.client else 'unknown'}", | |
| 184 | + "Received image search request | " | |
| 185 | + f"Tenant: {tenant_id} | " | |
| 186 | + f"Image URL: {request.image_url} | " | |
| 187 | + f"IP: {client_ip}", | |
| 171 | 188 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 172 | 189 | ) |
| 173 | 190 | |
| ... | ... | @@ -202,7 +219,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): |
| 202 | 219 | if context: |
| 203 | 220 | context.set_error(e) |
| 204 | 221 | context.logger.error( |
| 205 | - f"图片搜索请求参数错误 | 错误: {str(e)}", | |
| 222 | + f"Image search request parameter error | error: {str(e)}", | |
| 206 | 223 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 207 | 224 | ) |
| 208 | 225 | raise HTTPException(status_code=400, detail=str(e)) |
| ... | ... | @@ -210,7 +227,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request): |
| 210 | 227 | if context: |
| 211 | 228 | context.set_error(e) |
| 212 | 229 | context.logger.error( |
| 213 | - f"图片搜索请求失败 | 错误: {str(e)}", | |
| 230 | + f"Image search request failed | error: {str(e)}", | |
| 214 | 231 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 215 | 232 | ) |
| 216 | 233 | raise HTTPException(status_code=500, detail=str(e)) | ... | ... |
config/__init__.py
| ... | ... | @@ -23,6 +23,10 @@ from .config_loader import ( |
| 23 | 23 | RerankConfig, |
| 24 | 24 | ConfigurationError |
| 25 | 25 | ) |
| 26 | +from .utils import ( | |
| 27 | + get_match_fields_for_index, | |
| 28 | + get_domain_fields | |
| 29 | +) | |
| 26 | 30 | |
| 27 | 31 | __all__ = [ |
| 28 | 32 | # Field types |
| ... | ... | @@ -46,4 +50,6 @@ __all__ = [ |
| 46 | 50 | 'FunctionScoreConfig', |
| 47 | 51 | 'RerankConfig', |
| 48 | 52 | 'ConfigurationError', |
| 53 | + 'get_match_fields_for_index', | |
| 54 | + 'get_domain_fields', | |
| 49 | 55 | ] | ... | ... |
config/config.yaml
| ... | ... | @@ -412,6 +412,11 @@ query_config: |
| 412 | 412 | text_embedding_field: "title_embedding" # Field name for text embeddings |
| 413 | 413 | image_embedding_field: null # Field name for image embeddings (if not set, will auto-detect) |
| 414 | 414 | |
| 415 | + # Embedding disable thresholds (disable vector search for short queries) | |
| 416 | + embedding_disable_thresholds: | |
| 417 | + chinese_char_limit: 4 # Disable embedding for Chinese queries with <= 4 characters | |
| 418 | + english_word_limit: 3 # Disable embedding for English queries with <= 3 words | |
| 419 | + | |
| 415 | 420 | # Translation API (DeepL) |
| 416 | 421 | translation_service: "deepl" |
| 417 | 422 | translation_api_key: null # Set via environment variable | ... | ... |
config/config_loader.py
| ... | ... | @@ -58,6 +58,10 @@ class QueryConfig: |
| 58 | 58 | text_embedding_field: Optional[str] = None # Field name for text embeddings (e.g., "title_embedding") |
| 59 | 59 | image_embedding_field: Optional[str] = None # Field name for image embeddings (e.g., "image_embedding") |
| 60 | 60 | |
| 61 | + # Embedding disable thresholds (disable vector search for short queries) | |
| 62 | + embedding_disable_chinese_char_limit: int = 4 # Disable embedding for Chinese queries with <= this many characters | |
| 63 | + embedding_disable_english_word_limit: int = 3 # Disable embedding for English queries with <= this many words | |
| 64 | + | |
| 61 | 65 | # ES source fields configuration - fields to return in search results |
| 62 | 66 | # If None, auto-collect from field configs (fields with return_in_source=True) |
| 63 | 67 | # If empty list, return all fields. Otherwise, only return specified fields. |
| ... | ... | @@ -165,15 +169,18 @@ class ConfigLoader: |
| 165 | 169 | |
| 166 | 170 | return rewrite_dict |
| 167 | 171 | |
| 168 | - def load_config(self) -> SearchConfig: | |
| 172 | + def load_config(self, validate: bool = True) -> SearchConfig: | |
| 169 | 173 | """ |
| 170 | 174 | Load unified configuration from YAML file. |
| 171 | 175 | |
| 176 | + Args: | |
| 177 | + validate: Whether to validate configuration after loading (default: True) | |
| 178 | + | |
| 172 | 179 | Returns: |
| 173 | 180 | SearchConfig object |
| 174 | 181 | |
| 175 | 182 | Raises: |
| 176 | - ConfigurationError: If config file not found or invalid | |
| 183 | + ConfigurationError: If config file not found, invalid, or validation fails | |
| 177 | 184 | """ |
| 178 | 185 | if not self.config_file.exists(): |
| 179 | 186 | raise ConfigurationError(f"Configuration file not found: {self.config_file}") |
| ... | ... | @@ -184,7 +191,16 @@ class ConfigLoader: |
| 184 | 191 | except yaml.YAMLError as e: |
| 185 | 192 | raise ConfigurationError(f"Invalid YAML in {self.config_file}: {e}") |
| 186 | 193 | |
| 187 | - return self._parse_config(config_data) | |
| 194 | + config = self._parse_config(config_data) | |
| 195 | + | |
| 196 | + # Auto-validate configuration | |
| 197 | + if validate: | |
| 198 | + errors = self.validate_config(config) | |
| 199 | + if errors: | |
| 200 | + error_msg = "Configuration validation failed:\n" + "\n".join(f" - {err}" for err in errors) | |
| 201 | + raise ConfigurationError(error_msg) | |
| 202 | + | |
| 203 | + return config | |
| 188 | 204 | |
| 189 | 205 | def _parse_config(self, config_data: Dict[str, Any]) -> SearchConfig: |
| 190 | 206 | """Parse configuration dictionary into SearchConfig object.""" |
| ... | ... | @@ -214,43 +230,48 @@ class ConfigLoader: |
| 214 | 230 | if field.return_in_source |
| 215 | 231 | ] |
| 216 | 232 | |
| 233 | + # Parse embedding disable thresholds | |
| 234 | + embedding_thresholds = query_config_data.get("embedding_disable_thresholds", {}) | |
| 235 | + | |
| 217 | 236 | query_config = QueryConfig( |
| 218 | - supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), | |
| 219 | - default_language=query_config_data.get("default_language", "zh"), | |
| 237 | + supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], | |
| 238 | + default_language=query_config_data.get("default_language") or "zh", | |
| 220 | 239 | enable_translation=query_config_data.get("enable_translation", True), |
| 221 | 240 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), |
| 222 | 241 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), |
| 223 | 242 | rewrite_dictionary=rewrite_dictionary, |
| 224 | 243 | translation_api_key=query_config_data.get("translation_api_key"), |
| 225 | - translation_service=query_config_data.get("translation_service", "deepl"), | |
| 244 | + translation_service=query_config_data.get("translation_service") or "deepl", | |
| 226 | 245 | translation_glossary_id=query_config_data.get("translation_glossary_id"), |
| 227 | - translation_context=query_config_data.get("translation_context", "e-commerce product search"), | |
| 246 | + translation_context=query_config_data.get("translation_context") or "e-commerce product search", | |
| 228 | 247 | text_embedding_field=query_config_data.get("text_embedding_field"), |
| 229 | 248 | image_embedding_field=query_config_data.get("image_embedding_field"), |
| 249 | + embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), | |
| 250 | + embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), | |
| 230 | 251 | source_fields=source_fields |
| 231 | 252 | ) |
| 232 | 253 | |
| 233 | 254 | # Parse ranking config |
| 234 | 255 | ranking_data = config_data.get("ranking", {}) |
| 235 | 256 | ranking = RankingConfig( |
| 236 | - expression=ranking_data.get("expression", "bm25() + 0.2*text_embedding_relevance()"), | |
| 237 | - description=ranking_data.get("description", "Default BM25 + text embedding ranking") | |
| 257 | + expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()", | |
| 258 | + description=ranking_data.get("description") or "Default BM25 + text embedding ranking" | |
| 238 | 259 | ) |
| 239 | 260 | |
| 240 | 261 | # Parse Function Score configuration |
| 241 | 262 | fs_data = config_data.get("function_score", {}) |
| 242 | 263 | function_score = FunctionScoreConfig( |
| 243 | - score_mode=fs_data.get("score_mode", "sum"), | |
| 244 | - boost_mode=fs_data.get("boost_mode", "multiply"), | |
| 245 | - functions=fs_data.get("functions", []) | |
| 264 | + score_mode=fs_data.get("score_mode") or "sum", | |
| 265 | + boost_mode=fs_data.get("boost_mode") or "multiply", | |
| 266 | + functions=fs_data.get("functions") or [] | |
| 246 | 267 | ) |
| 247 | 268 | |
| 248 | 269 | # Parse Rerank configuration |
| 249 | 270 | rerank_data = config_data.get("rerank", {}) |
| 250 | 271 | rerank = RerankConfig( |
| 251 | 272 | enabled=rerank_data.get("enabled", False), |
| 252 | - expression=rerank_data.get("expression", ""), | |
| 253 | - description=rerank_data.get("description", "") | |
| 273 | + expression=rerank_data.get("expression") or "", | |
| 274 | + description=rerank_data.get("description") or "" | |
| 254 | 275 | ) |
| 255 | 276 | |
| 256 | 277 | # Parse SPU config |
| ... | ... | @@ -447,21 +468,43 @@ class ConfigLoader: |
| 447 | 468 | output_path = Path(output_path) |
| 448 | 469 | |
| 449 | 470 | # Convert config back to dictionary format |
| 471 | + query_config_dict = { | |
| 472 | + "supported_languages": config.query_config.supported_languages, | |
| 473 | + "default_language": config.query_config.default_language, | |
| 474 | + "enable_translation": config.query_config.enable_translation, | |
| 475 | + "enable_text_embedding": config.query_config.enable_text_embedding, | |
| 476 | + "enable_query_rewrite": config.query_config.enable_query_rewrite, | |
| 477 | + "translation_service": config.query_config.translation_service, | |
| 478 | + } | |
| 479 | + | |
| 480 | + # Add optional fields only if they are set | |
| 481 | + if config.query_config.translation_api_key: | |
| 482 | + query_config_dict["translation_api_key"] = config.query_config.translation_api_key | |
| 483 | + if config.query_config.translation_glossary_id: | |
| 484 | + query_config_dict["translation_glossary_id"] = config.query_config.translation_glossary_id | |
| 485 | + if config.query_config.translation_context: | |
| 486 | + query_config_dict["translation_context"] = config.query_config.translation_context | |
| 487 | + if config.query_config.text_embedding_field: | |
| 488 | + query_config_dict["text_embedding_field"] = config.query_config.text_embedding_field | |
| 489 | + if config.query_config.image_embedding_field: | |
| 490 | + query_config_dict["image_embedding_field"] = config.query_config.image_embedding_field | |
| 491 | + if config.query_config.source_fields: | |
| 492 | + query_config_dict["source_fields"] = config.query_config.source_fields | |
| 493 | + | |
| 494 | + # Add embedding disable thresholds | |
| 495 | + if (config.query_config.embedding_disable_chinese_char_limit != 4 or | |
| 496 | + config.query_config.embedding_disable_english_word_limit != 3): | |
| 497 | + query_config_dict["embedding_disable_thresholds"] = { | |
| 498 | + "chinese_char_limit": config.query_config.embedding_disable_chinese_char_limit, | |
| 499 | + "english_word_limit": config.query_config.embedding_disable_english_word_limit | |
| 500 | + } | |
| 501 | + | |
| 450 | 502 | config_dict = { |
| 451 | 503 | "es_index_name": config.es_index_name, |
| 452 | 504 | "es_settings": config.es_settings, |
| 453 | 505 | "fields": [self._field_to_dict(field) for field in config.fields], |
| 454 | 506 | "indexes": [self._index_to_dict(index) for index in config.indexes], |
| 455 | - "query_config": { | |
| 456 | - "supported_languages": config.query_config.supported_languages, | |
| 457 | - "default_language": config.query_config.default_language, | |
| 458 | - "enable_translation": config.query_config.enable_translation, | |
| 459 | - "enable_text_embedding": config.query_config.enable_text_embedding, | |
| 460 | - "enable_query_rewrite": config.query_config.enable_query_rewrite, | |
| 461 | - # rewrite_dictionary is stored in separate file, not in config | |
| 462 | - "translation_api_key": config.query_config.translation_api_key, | |
| 463 | - "translation_service": config.query_config.translation_service, | |
| 464 | - }, | |
| 507 | + "query_config": query_config_dict, | |
| 465 | 508 | "ranking": { |
| 466 | 509 | "expression": config.ranking.expression, |
| 467 | 510 | "description": config.ranking.description |
| ... | ... | @@ -505,7 +548,7 @@ class ConfigLoader: |
| 505 | 548 | f.write(f"{key}\t{value}\n") |
| 506 | 549 | |
| 507 | 550 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: |
| 508 | - """Convert FieldConfig to dictionary.""" | |
| 551 | + """Convert FieldConfig to dictionary, preserving all fields.""" | |
| 509 | 552 | result = { |
| 510 | 553 | "name": field.name, |
| 511 | 554 | "type": field.field_type.value, |
| ... | ... | @@ -513,36 +556,49 @@ class ConfigLoader: |
| 513 | 556 | "boost": field.boost, |
| 514 | 557 | "store": field.store, |
| 515 | 558 | "index": field.index, |
| 559 | + "return_in_source": field.return_in_source, | |
| 516 | 560 | } |
| 517 | 561 | |
| 562 | + # Add optional fields only if they differ from defaults or are set | |
| 518 | 563 | if field.analyzer: |
| 519 | 564 | result["analyzer"] = field.analyzer.value |
| 520 | 565 | if field.search_analyzer: |
| 521 | 566 | result["search_analyzer"] = field.search_analyzer.value |
| 522 | 567 | if field.multi_language: |
| 523 | 568 | result["multi_language"] = field.multi_language |
| 524 | - result["languages"] = field.languages | |
| 569 | + if field.languages: | |
| 570 | + result["languages"] = field.languages | |
| 525 | 571 | if field.embedding_dims != 1024: |
| 526 | 572 | result["embedding_dims"] = field.embedding_dims |
| 527 | 573 | if field.embedding_similarity != "dot_product": |
| 528 | 574 | result["embedding_similarity"] = field.embedding_similarity |
| 529 | 575 | if field.nested: |
| 530 | 576 | result["nested"] = field.nested |
| 531 | - result["nested_properties"] = field.nested_properties | |
| 577 | + if field.nested_properties: | |
| 578 | + result["nested_properties"] = field.nested_properties | |
| 579 | + if field.keyword_subfield: | |
| 580 | + result["keyword_subfield"] = field.keyword_subfield | |
| 581 | + if field.keyword_ignore_above != 256: | |
| 582 | + result["keyword_ignore_above"] = field.keyword_ignore_above | |
| 583 | + if field.keyword_normalizer: | |
| 584 | + result["keyword_normalizer"] = field.keyword_normalizer | |
| 532 | 585 | |
| 533 | 586 | return result |
| 534 | 587 | |
| 535 | 588 | def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: |
| 536 | - """Convert IndexConfig to dictionary.""" | |
| 589 | + """Convert IndexConfig to dictionary, preserving all fields.""" | |
| 537 | 590 | result = { |
| 538 | 591 | "name": index.name, |
| 539 | 592 | "label": index.label, |
| 540 | 593 | "fields": index.fields, |
| 541 | 594 | "analyzer": index.analyzer.value, |
| 542 | - "boost": index.boost, | |
| 543 | - "example": index.example | |
| 544 | 595 | } |
| 545 | - | |
| 596 | + | |
| 597 | + # Add optional fields only if they differ from defaults or are set | |
| 598 | + if index.boost != 1.0: | |
| 599 | + result["boost"] = index.boost | |
| 600 | + if index.example: | |
| 601 | + result["example"] = index.example | |
| 546 | 602 | if index.language_field_mapping: |
| 547 | 603 | result["language_field_mapping"] = index.language_field_mapping |
| 548 | 604 | ... | ... |
| ... | ... | @@ -0,0 +1,70 @@ |
| 1 | +""" | |
| 2 | +Configuration utility functions. | |
| 3 | + | |
| 4 | +Helper functions for working with SearchConfig objects. | |
| 5 | +""" | |
| 6 | + | |
| 7 | +from typing import Dict, List | |
| 8 | +from .config_loader import SearchConfig | |
| 9 | + | |
| 10 | + | |
| 11 | +def get_match_fields_for_index(config: SearchConfig, index_name: str = "default") -> List[str]: | |
| 12 | + """ | |
| 13 | + Generate match fields list with boost from IndexConfig and FieldConfig. | |
| 14 | + | |
| 15 | + Args: | |
| 16 | + config: SearchConfig instance | |
| 17 | + index_name: Name of the index domain (default: "default") | |
| 18 | + | |
| 19 | + Returns: | |
| 20 | + List of field names with boost, e.g., ["title_zh^3.0", "brief_zh^1.5"] | |
| 21 | + """ | |
| 22 | + # Find the index config | |
| 23 | + index_config = None | |
| 24 | + for idx in config.indexes: | |
| 25 | + if idx.name == index_name: | |
| 26 | + index_config = idx | |
| 27 | + break | |
| 28 | + | |
| 29 | + if not index_config: | |
| 30 | + return [] | |
| 31 | + | |
| 32 | + # Create a field name to FieldConfig mapping | |
| 33 | + field_map = {field.name: field for field in config.fields} | |
| 34 | + | |
| 35 | + # Generate match fields with boost | |
| 36 | + match_fields = [] | |
| 37 | + for field_name in index_config.fields: | |
| 38 | + field_config = field_map.get(field_name) | |
| 39 | + if field_config: | |
| 40 | + # Combine index boost and field boost | |
| 41 | + total_boost = index_config.boost * field_config.boost | |
| 42 | + if total_boost != 1.0: | |
| 43 | + match_fields.append(f"{field_name}^{total_boost}") | |
| 44 | + else: | |
| 45 | + match_fields.append(field_name) | |
| 46 | + else: | |
| 47 | + # Field not found in config, use index boost only | |
| 48 | + if index_config.boost != 1.0: | |
| 49 | + match_fields.append(f"{field_name}^{index_config.boost}") | |
| 50 | + else: | |
| 51 | + match_fields.append(field_name) | |
| 52 | + | |
| 53 | + return match_fields | |
| 54 | + | |
| 55 | + | |
| 56 | +def get_domain_fields(config: SearchConfig) -> Dict[str, List[str]]: | |
| 57 | + """ | |
| 58 | + Generate domain-specific match fields from all index configs. | |
| 59 | + | |
| 60 | + Args: | |
| 61 | + config: SearchConfig instance | |
| 62 | + | |
| 63 | + Returns: | |
| 64 | + Dictionary mapping domain name to list of match fields | |
| 65 | + """ | |
| 66 | + domain_fields = {} | |
| 67 | + for index_config in config.indexes: | |
| 68 | + domain_fields[index_config.name] = get_match_fields_for_index(config, index_config.name) | |
| 69 | + return domain_fields | |
| 70 | + | ... | ... |
context/request_context.py
| ... | ... | @@ -59,9 +59,10 @@ class RequestContext: |
| 59 | 59 | """ |
| 60 | 60 | |
| 61 | 61 | def __init__(self, reqid: str = None, uid: str = None): |
| 62 | - # 生成唯一请求ID | |
| 62 | + # 生成唯一请求ID;如果外部未提供,则自动生成 | |
| 63 | + # 如果无法获取到 uid,则使用 "-1" 作为占位,用于日志关联 | |
| 63 | 64 | self.reqid = reqid or str(uuid.uuid4())[:8] |
| 64 | - self.uid = uid or 'anonymous' | |
| 65 | + self.uid = uid or "-1" | |
| 65 | 66 | |
| 66 | 67 | # 查询分析结果 |
| 67 | 68 | self.query_analysis = QueryAnalysisResult() |
| ... | ... | @@ -111,7 +112,10 @@ class RequestContext: |
| 111 | 112 | """ |
| 112 | 113 | start_time = time.time() |
| 113 | 114 | self.performance_metrics.stage_start_times[stage.value] = start_time |
| 114 | - self.logger.debug(f"开始阶段 | {stage.value}", extra={'reqid': self.reqid, 'uid': self.uid}) | |
| 115 | + self.logger.debug( | |
| 116 | + f"Start stage | {stage.value}", | |
| 117 | + extra={'reqid': self.reqid, 'uid': self.uid} | |
| 118 | + ) | |
| 115 | 119 | return start_time |
| 116 | 120 | |
| 117 | 121 | def end_stage(self, stage: RequestContextStage) -> float: |
| ... | ... | @@ -125,7 +129,10 @@ class RequestContext: |
| 125 | 129 | 阶段耗时(毫秒) |
| 126 | 130 | """ |
| 127 | 131 | if stage.value not in self.performance_metrics.stage_start_times: |
| 128 | - self.logger.warning(f"阶段未开始计时 | {stage.value}", extra={'reqid': self.reqid, 'uid': self.uid}) | |
| 132 | + self.logger.warning( | |
| 133 | + f"Stage not started | {stage.value}", | |
| 134 | + extra={'reqid': self.reqid, 'uid': self.uid} | |
| 135 | + ) | |
| 129 | 136 | return 0.0 |
| 130 | 137 | |
| 131 | 138 | start_time = self.performance_metrics.stage_start_times[stage.value] |
| ... | ... | @@ -133,7 +140,7 @@ class RequestContext: |
| 133 | 140 | self.performance_metrics.stage_timings[stage.value] = duration_ms |
| 134 | 141 | |
| 135 | 142 | self.logger.debug( |
| 136 | - f"结束阶段 | {stage.value} | 耗时: {duration_ms:.2f}ms", | |
| 143 | + f"End stage | {stage.value} | duration: {duration_ms:.2f}ms", | |
| 137 | 144 | extra={'reqid': self.reqid, 'uid': self.uid} |
| 138 | 145 | ) |
| 139 | 146 | return duration_ms |
| ... | ... | @@ -162,7 +169,7 @@ class RequestContext: |
| 162 | 169 | setattr(self.query_analysis, key, value) |
| 163 | 170 | else: |
| 164 | 171 | self.logger.warning( |
| 165 | - f"未知的查询分析字段 | {key}", | |
| 172 | + f"Unknown query analysis field | {key}", | |
| 166 | 173 | extra={'reqid': self.reqid, 'uid': self.uid} |
| 167 | 174 | ) |
| 168 | 175 | |
| ... | ... | @@ -175,7 +182,10 @@ class RequestContext: |
| 175 | 182 | value: 结果值 |
| 176 | 183 | """ |
| 177 | 184 | self.intermediate_results[key] = value |
| 178 | - self.logger.debug(f"存储中间结果 | {key}", extra={'reqid': self.reqid, 'uid': self.uid}) | |
| 185 | + self.logger.debug( | |
| 186 | + f"Store intermediate result | {key}", | |
| 187 | + extra={'reqid': self.reqid, 'uid': self.uid} | |
| 188 | + ) | |
| 179 | 189 | |
| 180 | 190 | def get_intermediate_result(self, key: str, default: Any = None) -> Any: |
| 181 | 191 | """ |
| ... | ... | @@ -213,7 +223,7 @@ class RequestContext: |
| 213 | 223 | 'details': {} |
| 214 | 224 | } |
| 215 | 225 | self.logger.error( |
| 216 | - f"设置错误信息 | {type(error).__name__}: {str(error)}", | |
| 226 | + f"Set error info | {type(error).__name__}: {str(error)}", | |
| 217 | 227 | extra={'reqid': self.reqid, 'uid': self.uid} |
| 218 | 228 | ) |
| 219 | 229 | |
| ... | ... | @@ -286,13 +296,13 @@ class RequestContext: |
| 286 | 296 | |
| 287 | 297 | # 构建详细的日志消息 |
| 288 | 298 | msg_parts = [ |
| 289 | - f"搜索请求性能摘要 | reqid: {self.reqid}", | |
| 290 | - f"总耗时: {summary['performance']['total_duration_ms']:.2f}ms" | |
| 299 | + f"Search request performance summary | reqid: {self.reqid}", | |
| 300 | + f"Total duration: {summary['performance']['total_duration_ms']:.2f}ms" | |
| 291 | 301 | ] |
| 292 | 302 | |
| 293 | 303 | # 添加各阶段耗时 |
| 294 | 304 | if summary['performance']['stage_timings_ms']: |
| 295 | - msg_parts.append("阶段耗时:") | |
| 305 | + msg_parts.append("Stage durations:") | |
| 296 | 306 | for stage, duration in summary['performance']['stage_timings_ms'].items(): |
| 297 | 307 | percentage = summary['performance']['stage_percentages'].get(stage, 0) |
| 298 | 308 | msg_parts.append(f" - {stage}: {duration:.2f}ms ({percentage}%)") |
| ... | ... | @@ -300,25 +310,26 @@ class RequestContext: |
| 300 | 310 | # 添加查询信息 |
| 301 | 311 | if summary['query_analysis']['original_query']: |
| 302 | 312 | msg_parts.append( |
| 303 | - f"查询: '{summary['query_analysis']['original_query']}' " | |
| 313 | + "Query: " | |
| 314 | + f"'{summary['query_analysis']['original_query']}' " | |
| 304 | 315 | f"-> '{summary['query_analysis']['rewritten_query']}' " |
| 305 | 316 | f"({summary['query_analysis']['detected_language']})" |
| 306 | 317 | ) |
| 307 | 318 | |
| 308 | 319 | # 添加结果统计 |
| 309 | 320 | msg_parts.append( |
| 310 | - f"结果: {summary['results']['total_hits']} hits " | |
| 311 | - f"ES查询: {summary['results']['es_query_size']} chars" | |
| 321 | + f"Results: {summary['results']['total_hits']} hits " | |
| 322 | + f"ES query size: {summary['results']['es_query_size']} chars" | |
| 312 | 323 | ) |
| 313 | 324 | |
| 314 | 325 | # 添加错误信息(如果有) |
| 315 | 326 | if summary['request_info']['has_error']: |
| 316 | 327 | error_info = self.metadata['error_info'] |
| 317 | - msg_parts.append(f"错误: {error_info['type']}: {error_info['message']}") | |
| 328 | + msg_parts.append(f"Error: {error_info['type']}: {error_info['message']}") | |
| 318 | 329 | |
| 319 | 330 | # 添加警告信息(如果有) |
| 320 | 331 | if summary['request_info']['warnings_count'] > 0: |
| 321 | - msg_parts.append(f"警告: {summary['request_info']['warnings_count']} 个") | |
| 332 | + msg_parts.append(f"Warnings: {summary['request_info']['warnings_count']}") | |
| 322 | 333 | |
| 323 | 334 | log_message = " | ".join(msg_parts) |
| 324 | 335 | ... | ... |
docs/常用查询 - ES.md
| ... | ... | @@ -17,4 +17,19 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ |
| 17 | 17 | ] |
| 18 | 18 | } |
| 19 | 19 | } |
| 20 | - }' | |
| 21 | 20 | \ No newline at end of file |
| 21 | + }' | |
| 22 | + | |
| 23 | + | |
| 24 | + | |
| 25 | +curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ | |
| 26 | + "size": 5, | |
| 27 | + "query": { | |
| 28 | + "bool": { | |
| 29 | + "filter": [ | |
| 30 | + { "term": { "spu_id": "74123" } } | |
| 31 | + ] | |
| 32 | + } | |
| 33 | + } | |
| 34 | + }' | |
| 35 | + | |
| 36 | + | ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -104,7 +104,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 104 | 104 | "sort_by": "string", |
| 105 | 105 | "sort_order": "desc", |
| 106 | 106 | "min_score": 0.0, |
| 107 | - "sku_filter_dimension": "string", | |
| 107 | + "sku_filter_dimension": ["string"], | |
| 108 | 108 | "debug": false, |
| 109 | 109 | "user_id": "string", |
| 110 | 110 | "session_id": "string" |
| ... | ... | @@ -127,7 +127,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 127 | 127 | | `sort_by` | string | N | null | 排序字段名(如 `min_price`, `max_price`) | |
| 128 | 128 | | `sort_order` | string | N | "desc" | 排序方向:`asc`(升序)或 `desc`(降序) | |
| 129 | 129 | | `min_score` | float | N | null | 最小相关性分数阈值 | |
| 130 | -| `sku_filter_dimension` | string | N | null | 子SKU筛选维度(店铺配置)。指定后,每个SPU下的SKU将按该维度分组,每组选择第一个SKU返回。支持的值:`option1`、`option2`、`option3` 或 specifications 中的 name(如 `color`、`size`)。详见下文说明 | | |
| 130 | +| `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(店铺配置)。指定后,每个SPU下的SKU将按这些维度的组合进行分组,每个组合只返回第一个SKU。支持的值:`option1`、`option2`、`option3` 或选项名称(如 `color`、`size`)。详见下文说明 | | |
| 131 | 131 | | `debug` | boolean | N | false | 是否返回调试信息 | |
| 132 | 132 | | `user_id` | string | N | null | 用户ID(用于个性化,预留) | |
| 133 | 133 | | `session_id` | string | N | null | 会话ID(用于分析,预留) | |
| ... | ... | @@ -349,7 +349,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 349 | 349 | ### SKU筛选维度 (sku_filter_dimension) |
| 350 | 350 | |
| 351 | 351 | **功能说明**: |
| 352 | -`sku_filter_dimension` 用于控制每个SPU下返回的SKU数量。当指定此参数后,系统会按指定维度对SKU进行分组,每个分组只返回第一个SKU(从简实现,选择该维度下的第一款)。 | |
| 352 | +`sku_filter_dimension` 用于控制每个SPU下返回的SKU数量,为字符串列表。当指定此参数后,系统会按指定维度**组合**对SKU进行分组,每个维度组合只返回第一个SKU(从简实现,选择该组合下的第一款)。 | |
| 353 | 353 | |
| 354 | 354 | **使用场景**: |
| 355 | 355 | - 店铺配置了SKU筛选维度(如 `color`),希望每个SPU下每种颜色只显示一个SKU |
| ... | ... | @@ -360,8 +360,8 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 360 | 360 | 1. **直接选项字段**: `option1`、`option2`、`option3` |
| 361 | 361 | - 直接使用对应的 `option1_value`、`option2_value`、`option3_value` 字段进行分组 |
| 362 | 362 | |
| 363 | -2. **规格名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配 | |
| 364 | - - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: "color"` 来按颜色分组 | |
| 363 | +2. **规格/选项名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配 | |
| 364 | + - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: ["color"]` 来按颜色分组 | |
| 365 | 365 | |
| 366 | 366 | **示例**: |
| 367 | 367 | |
| ... | ... | @@ -369,7 +369,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 369 | 369 | ```json |
| 370 | 370 | { |
| 371 | 371 | "query": "芭比娃娃", |
| 372 | - "sku_filter_dimension": "color" | |
| 372 | + "sku_filter_dimension": ["color"] | |
| 373 | 373 | } |
| 374 | 374 | ``` |
| 375 | 375 | |
| ... | ... | @@ -377,7 +377,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 377 | 377 | ```json |
| 378 | 378 | { |
| 379 | 379 | "query": "芭比娃娃", |
| 380 | - "sku_filter_dimension": "option1" | |
| 380 | + "sku_filter_dimension": ["option1"] | |
| 381 | 381 | } |
| 382 | 382 | ``` |
| 383 | 383 | |
| ... | ... | @@ -385,7 +385,15 @@ curl -X POST "http://120.76.41.98:6002/search/" \ |
| 385 | 385 | ```json |
| 386 | 386 | { |
| 387 | 387 | "query": "芭比娃娃", |
| 388 | - "sku_filter_dimension": "option2" | |
| 388 | + "sku_filter_dimension": ["option2"] | |
| 389 | +} | |
| 390 | +``` | |
| 391 | + | |
| 392 | +**按颜色 + 尺寸组合筛选(假设 option1_name = "color", option2_name = "size")**: | |
| 393 | +```json | |
| 394 | +{ | |
| 395 | + "query": "芭比娃娃", | |
| 396 | + "sku_filter_dimension": ["color", "size"] | |
| 389 | 397 | } |
| 390 | 398 | ``` |
| 391 | 399 | ... | ... |
frontend/index.html
| ... | ... | @@ -25,6 +25,10 @@ |
| 25 | 25 | <label for="tenantInput">tenant ID:</label> |
| 26 | 26 | <input type="text" id="tenantInput" placeholder="请输入租户ID" value="1"> |
| 27 | 27 | </div> |
| 28 | + <div class="tenant-input-wrapper"> | |
| 29 | + <label for="skuFilterDimension">sku_filter_dimension:</label> | |
| 30 | + <input type="text" id="skuFilterDimension" placeholder="SKU筛选维度" value="color"> | |
| 31 | + </div> | |
| 28 | 32 | <input type="text" id="searchInput" placeholder="输入搜索关键词... (支持中文、英文、俄文)" |
| 29 | 33 | onkeypress="handleKeyPress(event)"> |
| 30 | 34 | <button onclick="performSearch()" class="search-btn">Search</button> |
| ... | ... | @@ -100,9 +104,10 @@ |
| 100 | 104 | |
| 101 | 105 | <div class="sort-right"> |
| 102 | 106 | <select id="resultSize" onchange="performSearch()"> |
| 103 | - <option value="10">10 per page</option> | |
| 104 | - <option value="20" selected>20 per page</option> | |
| 105 | - <option value="50">50 per page</option> | |
| 107 | + <option value="20">20 per page</option> | |
| 108 | + <option value="50" selected>50 per page</option> | |
| 109 | + <option value="100">100 per page</option> | |
| 110 | + <option value="200">200 per page</option> | |
| 106 | 111 | </select> |
| 107 | 112 | </div> |
| 108 | 113 | </div> |
| ... | ... | @@ -130,6 +135,6 @@ |
| 130 | 135 | <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p> |
| 131 | 136 | </footer> |
| 132 | 137 | |
| 133 | - <script src="/static/js/app.js?v=3.2"></script> | |
| 138 | + <script src="/static/js/app.js?v=3.4"></script> | |
| 134 | 139 | </body> |
| 135 | 140 | </html> | ... | ... |
frontend/static/css/style.css
| ... | ... | @@ -84,7 +84,8 @@ body { |
| 84 | 84 | white-space: nowrap; |
| 85 | 85 | } |
| 86 | 86 | |
| 87 | -#tenantInput { | |
| 87 | +#tenantInput, | |
| 88 | +#skuFilterDimension { | |
| 88 | 89 | width: 120px; |
| 89 | 90 | padding: 10px 15px; |
| 90 | 91 | font-size: 14px; |
| ... | ... | @@ -93,7 +94,8 @@ body { |
| 93 | 94 | outline: none; |
| 94 | 95 | } |
| 95 | 96 | |
| 96 | -#tenantInput:focus { | |
| 97 | +#tenantInput:focus, | |
| 98 | +#skuFilterDimension:focus { | |
| 97 | 99 | border-color: #e74c3c; |
| 98 | 100 | } |
| 99 | 101 | ... | ... |
frontend/static/js/app.js
| ... | ... | @@ -14,6 +14,21 @@ function getTenantId() { |
| 14 | 14 | return '1'; // Default fallback |
| 15 | 15 | } |
| 16 | 16 | |
| 17 | +// Get sku_filter_dimension (as list) from input | |
| 18 | +function getSkuFilterDimension() { | |
| 19 | + const skuFilterInput = document.getElementById('skuFilterDimension'); | |
| 20 | + if (skuFilterInput) { | |
| 21 | + const value = skuFilterInput.value.trim(); | |
| 22 | + if (!value.length) { | |
| 23 | + return null; | |
| 24 | + } | |
| 25 | + // 支持用逗号分隔多个维度,例如:color,size 或 option1,color | |
| 26 | + const parts = value.split(',').map(v => v.trim()).filter(v => v.length > 0); | |
| 27 | + return parts.length > 0 ? parts : null; | |
| 28 | + } | |
| 29 | + return null; | |
| 30 | +} | |
| 31 | + | |
| 17 | 32 | // State Management |
| 18 | 33 | let state = { |
| 19 | 34 | query: '', |
| ... | ... | @@ -51,6 +66,7 @@ function toggleFilters() { |
| 51 | 66 | async function performSearch(page = 1) { |
| 52 | 67 | const query = document.getElementById('searchInput').value.trim(); |
| 53 | 68 | const tenantId = getTenantId(); |
| 69 | + const skuFilterDimension = getSkuFilterDimension(); | |
| 54 | 70 | |
| 55 | 71 | if (!query) { |
| 56 | 72 | alert('Please enter search keywords'); |
| ... | ... | @@ -96,6 +112,7 @@ async function performSearch(page = 1) { |
| 96 | 112 | facets: facets, |
| 97 | 113 | sort_by: state.sortBy || null, |
| 98 | 114 | sort_order: state.sortOrder, |
| 115 | + sku_filter_dimension: skuFilterDimension, | |
| 99 | 116 | debug: state.debug |
| 100 | 117 | }) |
| 101 | 118 | }); | ... | ... |
main.py
| ... | ... | @@ -93,7 +93,7 @@ def cmd_search(args): |
| 93 | 93 | |
| 94 | 94 | from query import QueryParser |
| 95 | 95 | query_parser = QueryParser(config) |
| 96 | - searcher = Searcher(config, es_client, query_parser) | |
| 96 | + searcher = Searcher(es_client, config, query_parser) | |
| 97 | 97 | |
| 98 | 98 | # Execute search |
| 99 | 99 | print(f"Searching for: '{args.query}' (tenant: {args.tenant_id})") | ... | ... |
query/query_parser.py
| ... | ... | @@ -9,13 +9,7 @@ import numpy as np |
| 9 | 9 | import logging |
| 10 | 10 | |
| 11 | 11 | from embeddings import BgeEncoder |
| 12 | -from search.query_config import ( | |
| 13 | - ENABLE_TEXT_EMBEDDING, | |
| 14 | - ENABLE_TRANSLATION, | |
| 15 | - REWRITE_DICTIONARY, | |
| 16 | - TRANSLATION_API_KEY, | |
| 17 | - TRANSLATION_SERVICE | |
| 18 | -) | |
| 12 | +from config import SearchConfig | |
| 19 | 13 | from .language_detector import LanguageDetector |
| 20 | 14 | from .translator import Translator |
| 21 | 15 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| ... | ... | @@ -70,6 +64,7 @@ class QueryParser: |
| 70 | 64 | |
| 71 | 65 | def __init__( |
| 72 | 66 | self, |
| 67 | + config: SearchConfig, | |
| 73 | 68 | text_encoder: Optional[BgeEncoder] = None, |
| 74 | 69 | translator: Optional[Translator] = None |
| 75 | 70 | ): |
| ... | ... | @@ -77,21 +72,23 @@ class QueryParser: |
| 77 | 72 | Initialize query parser. |
| 78 | 73 | |
| 79 | 74 | Args: |
| 75 | + config: SearchConfig instance | |
| 80 | 76 | text_encoder: Text embedding encoder (lazy loaded if not provided) |
| 81 | 77 | translator: Translator instance (lazy loaded if not provided) |
| 82 | 78 | """ |
| 79 | + self.config = config | |
| 83 | 80 | self._text_encoder = text_encoder |
| 84 | 81 | self._translator = translator |
| 85 | 82 | |
| 86 | 83 | # Initialize components |
| 87 | 84 | self.normalizer = QueryNormalizer() |
| 88 | 85 | self.language_detector = LanguageDetector() |
| 89 | - self.rewriter = QueryRewriter(REWRITE_DICTIONARY) | |
| 86 | + self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) | |
| 90 | 87 | |
| 91 | 88 | @property |
| 92 | 89 | def text_encoder(self) -> BgeEncoder: |
| 93 | 90 | """Lazy load text encoder.""" |
| 94 | - if self._text_encoder is None and ENABLE_TEXT_EMBEDDING: | |
| 91 | + if self._text_encoder is None and self.config.query_config.enable_text_embedding: | |
| 95 | 92 | logger.info("Initializing text encoder (lazy load)...") |
| 96 | 93 | self._text_encoder = BgeEncoder() |
| 97 | 94 | return self._text_encoder |
| ... | ... | @@ -99,13 +96,13 @@ class QueryParser: |
| 99 | 96 | @property |
| 100 | 97 | def translator(self) -> Translator: |
| 101 | 98 | """Lazy load translator.""" |
| 102 | - if self._translator is None and ENABLE_TRANSLATION: | |
| 99 | + if self._translator is None and self.config.query_config.enable_translation: | |
| 103 | 100 | logger.info("Initializing translator (lazy load)...") |
| 104 | 101 | self._translator = Translator( |
| 105 | - api_key=TRANSLATION_API_KEY, | |
| 102 | + api_key=self.config.query_config.translation_api_key, | |
| 106 | 103 | use_cache=True, |
| 107 | - glossary_id=None, # Can be added to query_config if needed | |
| 108 | - translation_context='e-commerce product search' | |
| 104 | + glossary_id=self.config.query_config.translation_glossary_id, | |
| 105 | + translation_context=self.config.query_config.translation_context | |
| 109 | 106 | ) |
| 110 | 107 | return self._translator |
| 111 | 108 | |
| ... | ... | @@ -156,7 +153,7 @@ class QueryParser: |
| 156 | 153 | |
| 157 | 154 | # Stage 2: Query rewriting |
| 158 | 155 | rewritten = None |
| 159 | - if REWRITE_DICTIONARY: # Enable rewrite if dictionary exists | |
| 156 | + if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists | |
| 160 | 157 | rewritten = self.rewriter.rewrite(query_text) |
| 161 | 158 | if rewritten != query_text: |
| 162 | 159 | log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") |
| ... | ... | @@ -173,7 +170,7 @@ class QueryParser: |
| 173 | 170 | |
| 174 | 171 | # Stage 4: Translation |
| 175 | 172 | translations = {} |
| 176 | - if ENABLE_TRANSLATION: | |
| 173 | + if self.config.query_config.enable_translation: | |
| 177 | 174 | try: |
| 178 | 175 | # Determine target languages for translation |
| 179 | 176 | # Simplified: always translate to Chinese and English |
| ... | ... | @@ -210,19 +207,47 @@ class QueryParser: |
| 210 | 207 | # Stage 5: Text embedding |
| 211 | 208 | query_vector = None |
| 212 | 209 | if (generate_vector and |
| 213 | - ENABLE_TEXT_EMBEDDING and | |
| 210 | + self.config.query_config.enable_text_embedding and | |
| 214 | 211 | domain == "default"): # Only generate vector for default domain |
| 215 | - try: | |
| 216 | - log_debug("开始生成查询向量") | |
| 217 | - query_vector = self.text_encoder.encode([query_text])[0] | |
| 218 | - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") | |
| 219 | - if context: | |
| 220 | - context.store_intermediate_result('query_vector_shape', query_vector.shape) | |
| 221 | - except Exception as e: | |
| 222 | - error_msg = f"查询向量生成失败 | 错误: {str(e)}" | |
| 223 | - log_info(error_msg) | |
| 224 | - if context: | |
| 225 | - context.add_warning(error_msg) | |
| 212 | + # Get thresholds from config | |
| 213 | + chinese_limit = self.config.query_config.embedding_disable_chinese_char_limit | |
| 214 | + english_limit = self.config.query_config.embedding_disable_english_word_limit | |
| 215 | + | |
| 216 | + # Check if embedding should be disabled for short queries | |
| 217 | + should_disable_embedding = False | |
| 218 | + disable_reason = None | |
| 219 | + | |
| 220 | + if detected_lang == 'zh': | |
| 221 | + # For Chinese: disable embedding if character count <= threshold | |
| 222 | + char_count = len(query_text.strip()) | |
| 223 | + if char_count <= chinese_limit: | |
| 224 | + should_disable_embedding = True | |
| 225 | + disable_reason = f"中文查询字数({char_count}) <= {chinese_limit},禁用向量搜索" | |
| 226 | + log_info(disable_reason) | |
| 227 | + if context: | |
| 228 | + context.store_intermediate_result('embedding_disabled_reason', disable_reason) | |
| 229 | + else: | |
| 230 | + # For all non-Chinese queries (treated like English): disable embedding if whitespace-separated word count <= threshold | |
| 231 | + word_count = len(query_text.strip().split()) | |
| 232 | + if word_count <= english_limit: | |
| 233 | + should_disable_embedding = True | |
| 234 | + disable_reason = f"英文查询单词数({word_count}) <= {english_limit},禁用向量搜索" | |
| 235 | + log_info(disable_reason) | |
| 236 | + if context: | |
| 237 | + context.store_intermediate_result('embedding_disabled_reason', disable_reason) | |
| 238 | + | |
| 239 | + if not should_disable_embedding: | |
| 240 | + try: | |
| 241 | + log_debug("开始生成查询向量") | |
| 242 | + query_vector = self.text_encoder.encode([query_text])[0] | |
| 243 | + log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") | |
| 244 | + if context: | |
| 245 | + context.store_intermediate_result('query_vector_shape', query_vector.shape) | |
| 246 | + except Exception as e: | |
| 247 | + error_msg = f"查询向量生成失败 | 错误: {str(e)}" | |
| 248 | + log_info(error_msg) | |
| 249 | + if context: | |
| 250 | + context.add_warning(error_msg) | |
| 226 | 251 | |
| 227 | 252 | # Build result |
| 228 | 253 | result = ParsedQuery( | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -11,7 +11,7 @@ Simplified architecture: |
| 11 | 11 | from typing import Dict, Any, List, Optional, Union |
| 12 | 12 | import numpy as np |
| 13 | 13 | from .boolean_parser import QueryNode |
| 14 | -from .query_config import FUNCTION_SCORE_CONFIG | |
| 14 | +from config import FunctionScoreConfig | |
| 15 | 15 | |
| 16 | 16 | |
| 17 | 17 | class ESQueryBuilder: |
| ... | ... | @@ -23,7 +23,8 @@ class ESQueryBuilder: |
| 23 | 23 | match_fields: List[str], |
| 24 | 24 | text_embedding_field: Optional[str] = None, |
| 25 | 25 | image_embedding_field: Optional[str] = None, |
| 26 | - source_fields: Optional[List[str]] = None | |
| 26 | + source_fields: Optional[List[str]] = None, | |
| 27 | + function_score_config: Optional[FunctionScoreConfig] = None | |
| 27 | 28 | ): |
| 28 | 29 | """ |
| 29 | 30 | Initialize query builder. |
| ... | ... | @@ -34,12 +35,14 @@ class ESQueryBuilder: |
| 34 | 35 | text_embedding_field: Field name for text embeddings |
| 35 | 36 | image_embedding_field: Field name for image embeddings |
| 36 | 37 | source_fields: Fields to return in search results (_source includes) |
| 38 | + function_score_config: Function score configuration | |
| 37 | 39 | """ |
| 38 | 40 | self.index_name = index_name |
| 39 | 41 | self.match_fields = match_fields |
| 40 | 42 | self.text_embedding_field = text_embedding_field |
| 41 | 43 | self.image_embedding_field = image_embedding_field |
| 42 | 44 | self.source_fields = source_fields |
| 45 | + self.function_score_config = function_score_config | |
| 43 | 46 | |
| 44 | 47 | def build_query( |
| 45 | 48 | self, |
| ... | ... | @@ -182,12 +185,15 @@ class ESQueryBuilder: |
| 182 | 185 | return query |
| 183 | 186 | |
| 184 | 187 | # Build function_score query |
| 188 | + score_mode = self.function_score_config.score_mode if self.function_score_config else "sum" | |
| 189 | + boost_mode = self.function_score_config.boost_mode if self.function_score_config else "multiply" | |
| 190 | + | |
| 185 | 191 | function_score_query = { |
| 186 | 192 | "function_score": { |
| 187 | 193 | "query": query, |
| 188 | 194 | "functions": functions, |
| 189 | - "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"), | |
| 190 | - "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") | |
| 195 | + "score_mode": score_mode, | |
| 196 | + "boost_mode": boost_mode | |
| 191 | 197 | } |
| 192 | 198 | } |
| 193 | 199 | |
| ... | ... | @@ -201,7 +207,10 @@ class ESQueryBuilder: |
| 201 | 207 | List of function score functions |
| 202 | 208 | """ |
| 203 | 209 | functions = [] |
| 204 | - config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) | |
| 210 | + if not self.function_score_config: | |
| 211 | + return functions | |
| 212 | + | |
| 213 | + config_functions = self.function_score_config.functions or [] | |
| 205 | 214 | |
| 206 | 215 | for func_config in config_functions: |
| 207 | 216 | func_type = func_config.get("type") | ... | ... |
search/searcher.py
| ... | ... | @@ -5,7 +5,7 @@ Handles query parsing, boolean expressions, ranking, and result formatting. |
| 5 | 5 | """ |
| 6 | 6 | |
| 7 | 7 | from typing import Dict, Any, List, Optional, Union |
| 8 | -import time | |
| 8 | +import time, json | |
| 9 | 9 | import logging |
| 10 | 10 | |
| 11 | 11 | from utils.es_client import ESClient |
| ... | ... | @@ -14,16 +14,8 @@ from embeddings import CLIPImageEncoder |
| 14 | 14 | from .boolean_parser import BooleanParser, QueryNode |
| 15 | 15 | from .es_query_builder import ESQueryBuilder |
| 16 | 16 | from .rerank_engine import RerankEngine |
| 17 | -from .query_config import ( | |
| 18 | - DEFAULT_INDEX_NAME, | |
| 19 | - DEFAULT_MATCH_FIELDS, | |
| 20 | - TEXT_EMBEDDING_FIELD, | |
| 21 | - IMAGE_EMBEDDING_FIELD, | |
| 22 | - SOURCE_FIELDS, | |
| 23 | - ENABLE_TRANSLATION, | |
| 24 | - ENABLE_TEXT_EMBEDDING, | |
| 25 | - RANKING_EXPRESSION | |
| 26 | -) | |
| 17 | +from config import SearchConfig | |
| 18 | +from config.utils import get_match_fields_for_index | |
| 27 | 19 | from context.request_context import RequestContext, RequestContextStage, create_request_context |
| 28 | 20 | from api.models import FacetResult, FacetValue |
| 29 | 21 | from api.result_formatter import ResultFormatter |
| ... | ... | @@ -87,37 +79,40 @@ class Searcher: |
| 87 | 79 | def __init__( |
| 88 | 80 | self, |
| 89 | 81 | es_client: ESClient, |
| 90 | - query_parser: Optional[QueryParser] = None, | |
| 91 | - index_name: str = DEFAULT_INDEX_NAME | |
| 82 | + config: SearchConfig, | |
| 83 | + query_parser: Optional[QueryParser] = None | |
| 92 | 84 | ): |
| 93 | 85 | """ |
| 94 | 86 | Initialize searcher. |
| 95 | 87 | |
| 96 | 88 | Args: |
| 97 | 89 | es_client: Elasticsearch client |
| 90 | + config: SearchConfig instance | |
| 98 | 91 | query_parser: Query parser (created if not provided) |
| 99 | - index_name: ES index name (default: search_products) | |
| 100 | 92 | """ |
| 101 | 93 | self.es_client = es_client |
| 102 | - self.index_name = index_name | |
| 103 | - self.query_parser = query_parser or QueryParser() | |
| 94 | + self.config = config | |
| 95 | + self.index_name = config.es_index_name | |
| 96 | + self.query_parser = query_parser or QueryParser(config) | |
| 104 | 97 | |
| 105 | 98 | # Initialize components |
| 106 | 99 | self.boolean_parser = BooleanParser() |
| 107 | - self.rerank_engine = RerankEngine(RANKING_EXPRESSION, enabled=False) | |
| 100 | + self.rerank_engine = RerankEngine(config.ranking.expression, enabled=False) | |
| 108 | 101 | |
| 109 | - # Use constants from query_config | |
| 110 | - self.match_fields = DEFAULT_MATCH_FIELDS | |
| 111 | - self.text_embedding_field = TEXT_EMBEDDING_FIELD | |
| 112 | - self.image_embedding_field = IMAGE_EMBEDDING_FIELD | |
| 102 | + # Get match fields from config | |
| 103 | + self.match_fields = get_match_fields_for_index(config, "default") | |
| 104 | + self.text_embedding_field = config.query_config.text_embedding_field or "title_embedding" | |
| 105 | + self.image_embedding_field = config.query_config.image_embedding_field or "image_embedding" | |
| 106 | + self.source_fields = config.query_config.source_fields or [] | |
| 113 | 107 | |
| 114 | 108 | # Query builder - simplified single-layer architecture |
| 115 | 109 | self.query_builder = ESQueryBuilder( |
| 116 | - index_name=index_name, | |
| 110 | + index_name=self.index_name, | |
| 117 | 111 | match_fields=self.match_fields, |
| 118 | 112 | text_embedding_field=self.text_embedding_field, |
| 119 | 113 | image_embedding_field=self.image_embedding_field, |
| 120 | - source_fields=SOURCE_FIELDS | |
| 114 | + source_fields=self.source_fields, | |
| 115 | + function_score_config=self.config.function_score | |
| 121 | 116 | ) |
| 122 | 117 | |
| 123 | 118 | def search( |
| ... | ... | @@ -135,7 +130,7 @@ class Searcher: |
| 135 | 130 | sort_order: Optional[str] = "desc", |
| 136 | 131 | debug: bool = False, |
| 137 | 132 | language: str = "zh", |
| 138 | - sku_filter_dimension: Optional[str] = None, | |
| 133 | + sku_filter_dimension: Optional[List[str]] = None, | |
| 139 | 134 | ) -> SearchResult: |
| 140 | 135 | """ |
| 141 | 136 | Execute search query (外部友好格式). |
| ... | ... | @@ -162,8 +157,8 @@ class Searcher: |
| 162 | 157 | context = create_request_context() |
| 163 | 158 | |
| 164 | 159 | # Always use config defaults (these are backend configuration, not user parameters) |
| 165 | - enable_translation = ENABLE_TRANSLATION | |
| 166 | - enable_embedding = ENABLE_TEXT_EMBEDDING | |
| 160 | + enable_translation = self.config.query_config.enable_translation | |
| 161 | + enable_embedding = self.config.query_config.enable_text_embedding | |
| 167 | 162 | enable_rerank = False # Temporarily disabled |
| 168 | 163 | |
| 169 | 164 | # Start timing |
| ... | ... | @@ -305,14 +300,14 @@ class Searcher: |
| 305 | 300 | context.store_intermediate_result('es_query', es_query) |
| 306 | 301 | context.store_intermediate_result('es_body_for_search', body_for_es) |
| 307 | 302 | |
| 303 | + # Serialize ES query as a compact JSON string (no spaces or newlines) | |
| 304 | + es_query_compact = json.dumps(es_query, ensure_ascii=False, separators=(',', ':')) | |
| 305 | + | |
| 308 | 306 | context.logger.info( |
| 309 | - f"ES查询构建完成 | 大小: {len(str(es_query))}字符 | " | |
| 310 | - f"KNN: {'是' if enable_embedding and parsed_query.query_vector is not None else '否'} | " | |
| 311 | - f"分面: {'是' if facets else '否'}", | |
| 312 | - extra={'reqid': context.reqid, 'uid': context.uid} | |
| 313 | - ) | |
| 314 | - context.logger.debug( | |
| 315 | - f"ES查询详情: {es_query}", | |
| 307 | + f"ES query built | size: {len(es_query_compact)} chars | " | |
| 308 | + f"KNN: {'yes' if enable_embedding and parsed_query.query_vector is not None else 'no'} | " | |
| 309 | + f"facets: {'yes' if facets else 'no'} | " | |
| 310 | + f"query: {es_query_compact}", | |
| 316 | 311 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 317 | 312 | ) |
| 318 | 313 | except Exception as e: |
| ... | ... | @@ -508,9 +503,9 @@ class Searcher: |
| 508 | 503 | } |
| 509 | 504 | |
| 510 | 505 | # Add _source filtering if source_fields are configured |
| 511 | - if SOURCE_FIELDS: | |
| 506 | + if self.source_fields: | |
| 512 | 507 | es_query["_source"] = { |
| 513 | - "includes": SOURCE_FIELDS | |
| 508 | + "includes": self.source_fields | |
| 514 | 509 | } |
| 515 | 510 | |
| 516 | 511 | if filters or range_filters: | ... | ... |
tests/conftest.py
| ... | ... | @@ -137,8 +137,8 @@ def mock_es_client() -> Mock: |
| 137 | 137 | def test_searcher(sample_search_config, mock_es_client) -> Searcher: |
| 138 | 138 | """测试用Searcher实例""" |
| 139 | 139 | return Searcher( |
| 140 | - config=sample_search_config, | |
| 141 | - es_client=mock_es_client | |
| 140 | + es_client=mock_es_client, | |
| 141 | + config=sample_search_config | |
| 142 | 142 | ) |
| 143 | 143 | |
| 144 | 144 | ... | ... |
utils/logger.py
| ... | ... | @@ -38,10 +38,11 @@ class StructuredFormatter(logging.Formatter): |
| 38 | 38 | # Add request context if available |
| 39 | 39 | reqid = getattr(record, 'reqid', None) |
| 40 | 40 | uid = getattr(record, 'uid', None) |
| 41 | - if reqid or uid: | |
| 41 | + if reqid is not None or uid is not None: | |
| 42 | + # Normalize missing values to "-1" for easier correlation | |
| 42 | 43 | log_entry['request_context'] = { |
| 43 | - 'reqid': reqid, | |
| 44 | - 'uid': uid | |
| 44 | + 'reqid': reqid if reqid is not None else "-1", | |
| 45 | + 'uid': uid if uid is not None else "-1" | |
| 45 | 46 | } |
| 46 | 47 | |
| 47 | 48 | # Add extra data if available |
| ... | ... | @@ -98,13 +99,31 @@ class RequestContextFilter(logging.Filter): |
| 98 | 99 | from context.request_context import get_current_request_context |
| 99 | 100 | context = get_current_request_context() |
| 100 | 101 | if context: |
| 101 | - record.reqid = context.reqid | |
| 102 | - record.uid = context.uid | |
| 102 | + # Ensure every request-scoped log record carries reqid/uid. | |
| 103 | + # If they are missing in the context, fall back to "-1". | |
| 104 | + record.reqid = getattr(context, "reqid", None) or "-1" | |
| 105 | + record.uid = getattr(context, "uid", None) or "-1" | |
| 103 | 106 | except (ImportError, AttributeError): |
| 104 | 107 | pass |
| 105 | 108 | return True |
| 106 | 109 | |
| 107 | 110 | |
| 111 | +class ContextAwareConsoleFormatter(logging.Formatter): | |
| 112 | + """ | |
| 113 | + Console formatter that injects reqid/uid into the log line. | |
| 114 | + | |
| 115 | + For non-request logs (no context), reqid/uid will be "-1". | |
| 116 | + """ | |
| 117 | + | |
| 118 | + def format(self, record: logging.LogRecord) -> str: | |
| 119 | + # Provide safe defaults so format string never fails | |
| 120 | + if not hasattr(record, "reqid"): | |
| 121 | + record.reqid = "-1" | |
| 122 | + if not hasattr(record, "uid"): | |
| 123 | + record.uid = "-1" | |
| 124 | + return super().format(record) | |
| 125 | + | |
| 126 | + | |
| 108 | 127 | def setup_logging( |
| 109 | 128 | log_level: str = "INFO", |
| 110 | 129 | log_dir: str = "logs", |
| ... | ... | @@ -137,8 +156,8 @@ def setup_logging( |
| 137 | 156 | |
| 138 | 157 | # Create formatters |
| 139 | 158 | structured_formatter = StructuredFormatter() |
| 140 | - console_formatter = logging.Formatter( | |
| 141 | - '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s' | |
| 159 | + console_formatter = ContextAwareConsoleFormatter( | |
| 160 | + '%(asctime)s | reqid:%(reqid)s | uid:%(uid)s | %(levelname)-8s | %(name)-15s | %(message)s' | |
| 142 | 161 | ) |
| 143 | 162 | |
| 144 | 163 | # Add console handler | ... | ... |