Compare View
Commits (6)
Showing
29 changed files
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -449,7 +449,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: |
| 449 | 449 | 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, |
| 450 | 450 | 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 |
| 451 | 451 | """ |
| 452 | - from indexer.product_enrich import analyze_products | |
| 452 | + from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 453 | 453 | |
| 454 | 454 | llm_langs = list(dict.fromkeys(languages)) or ["en"] |
| 455 | 455 | |
| ... | ... | @@ -510,10 +510,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: |
| 510 | 510 | raw = row.get(name) |
| 511 | 511 | if not raw: |
| 512 | 512 | continue |
| 513 | - for part in re.split(r"[,;|/\n\t]+", str(raw)): | |
| 514 | - value = part.strip() | |
| 515 | - if not value: | |
| 516 | - continue | |
| 513 | + for value in split_multi_value_field(str(raw)): | |
| 517 | 514 | rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value}) |
| 518 | 515 | if name == "tags": |
| 519 | 516 | rec["tags"].append(value) | ... | ... |
config/config.yaml
| 1 | 1 | # Unified Configuration for Multi-Tenant Search Engine |
| 2 | 2 | # 统一配置文件,所有租户共用一套配置 |
| 3 | 3 | # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 |
| 4 | +# | |
| 5 | +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 | |
| 6 | +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 | |
| 7 | + | |
| 8 | +# Process / bind addresses (环境变量 APP_ENV/RUNTIME_ENV 可覆盖 environment,ES_INDEX_NAMESPACE 可覆盖 index_namespace) | |
| 9 | +runtime: | |
| 10 | + environment: "prod" | |
| 11 | + index_namespace: "" | |
| 12 | + api_host: "0.0.0.0" | |
| 13 | + api_port: 6002 | |
| 14 | + indexer_host: "0.0.0.0" | |
| 15 | + indexer_port: 6004 | |
| 16 | + embedding_host: "0.0.0.0" | |
| 17 | + embedding_port: 6005 | |
| 18 | + embedding_text_port: 6005 | |
| 19 | + embedding_image_port: 6008 | |
| 20 | + translator_host: "127.0.0.1" | |
| 21 | + translator_port: 6006 | |
| 22 | + reranker_host: "127.0.0.1" | |
| 23 | + reranker_port: 6007 | |
| 24 | + | |
| 25 | +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) | |
| 26 | +infrastructure: | |
| 27 | + elasticsearch: | |
| 28 | + host: "http://localhost:9200" | |
| 29 | + username: null | |
| 30 | + password: null | |
| 31 | + redis: | |
| 32 | + host: "localhost" | |
| 33 | + port: 6479 | |
| 34 | + snapshot_db: 0 | |
| 35 | + password: null | |
| 36 | + socket_timeout: 1 | |
| 37 | + socket_connect_timeout: 1 | |
| 38 | + retry_on_timeout: false | |
| 39 | + cache_expire_days: 720 | |
| 40 | + embedding_cache_prefix: "embedding" | |
| 41 | + anchor_cache_prefix: "product_anchors" | |
| 42 | + anchor_cache_expire_days: 30 | |
| 43 | + database: | |
| 44 | + host: null | |
| 45 | + port: 3306 | |
| 46 | + database: null | |
| 47 | + username: null | |
| 48 | + password: null | |
| 49 | + secrets: | |
| 50 | + dashscope_api_key: null | |
| 51 | + deepl_auth_key: null | |
| 4 | 52 | |
| 5 | 53 | # Elasticsearch Index |
| 6 | 54 | es_index_name: "search_products" |
| 7 | 55 | |
| 56 | +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) | |
| 57 | +indexes: [] | |
| 58 | + | |
| 8 | 59 | # Config assets |
| 9 | 60 | assets: |
| 10 | 61 | query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" |
| ... | ... | @@ -20,20 +71,19 @@ es_settings: |
| 20 | 71 | refresh_interval: "30s" |
| 21 | 72 | |
| 22 | 73 | # 字段权重配置(用于搜索时的字段boost) |
| 23 | -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 | |
| 74 | +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 | |
| 24 | 75 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 |
| 25 | 76 | field_boosts: |
| 26 | 77 | title: 3.0 |
| 78 | + qanchors: 2.5 | |
| 79 | + tags: 2.0 | |
| 80 | + category_name_text: 2.0 | |
| 81 | + category_path: 2.0 | |
| 27 | 82 | brief: 1.5 |
| 28 | - description: 1.0 | |
| 29 | - qanchors: 1.5 | |
| 30 | - vendor: 1.5 | |
| 31 | - category_path: 1.5 | |
| 32 | - category_name_text: 1.5 | |
| 33 | - tags: 1.0 | |
| 34 | - option1_values: 0.6 | |
| 35 | - option2_values: 0.4 | |
| 36 | - option3_values: 0.4 | |
| 83 | + description: 1.5 | |
| 84 | + option1_values: 1.5 | |
| 85 | + option2_values: 1.5 | |
| 86 | + option3_values: 1.5 | |
| 37 | 87 | |
| 38 | 88 | # Query Configuration(查询配置) |
| 39 | 89 | query_config: |
| ... | ... | @@ -47,10 +97,23 @@ query_config: |
| 47 | 97 | enable_text_embedding: true |
| 48 | 98 | enable_query_rewrite: true |
| 49 | 99 | |
| 100 | + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) | |
| 101 | + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 | |
| 102 | + # zh_to_en_model: "opus-mt-zh-en" | |
| 103 | + # en_to_zh_model: "opus-mt-en-zh" | |
| 104 | + # default_translation_model: "nllb-200-distilled-600m" | |
| 105 | + zh_to_en_model: "deepl" | |
| 106 | + en_to_zh_model: "deepl" | |
| 107 | + default_translation_model: "deepl" | |
| 108 | + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) | |
| 109 | + zh_to_en_model__source_not_in_index: "deepl" | |
| 110 | + en_to_zh_model__source_not_in_index: "deepl" | |
| 111 | + default_translation_model__source_not_in_index: "deepl" | |
| 112 | + | |
| 50 | 113 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 51 | 114 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 52 | - translation_embedding_wait_budget_ms_source_in_index: 80 | |
| 53 | - translation_embedding_wait_budget_ms_source_not_in_index: 200 | |
| 115 | + translation_embedding_wait_budget_ms_source_in_index: 500 # was 80 | |
| 116 | + translation_embedding_wait_budget_ms_source_not_in_index: 500 # was 200 | |
| 54 | 117 | |
| 55 | 118 | # 动态多语言检索字段配置 |
| 56 | 119 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; |
| ... | ... | @@ -58,11 +121,11 @@ query_config: |
| 58 | 121 | search_fields: |
| 59 | 122 | multilingual_fields: |
| 60 | 123 | - "title" |
| 61 | - - "brief" | |
| 62 | - - "description" | |
| 63 | - - "vendor" | |
| 124 | + - "qanchors" | |
| 64 | 125 | - "category_path" |
| 65 | 126 | - "category_name_text" |
| 127 | + - "brief" | |
| 128 | + - "description" | |
| 66 | 129 | shared_fields: |
| 67 | 130 | - "tags" |
| 68 | 131 | - "option1_values" |
| ... | ... | @@ -71,18 +134,14 @@ query_config: |
| 71 | 134 | core_multilingual_fields: |
| 72 | 135 | - "title" |
| 73 | 136 | - "brief" |
| 74 | - - "vendor" | |
| 75 | 137 | - "category_name_text" |
| 76 | 138 | |
| 77 | - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底) | |
| 139 | + # 统一文本召回策略(主查询 + 翻译查询) | |
| 78 | 140 | text_query_strategy: |
| 79 | 141 | base_minimum_should_match: "75%" |
| 80 | 142 | translation_minimum_should_match: "75%" |
| 81 | - translation_boost: 0.4 | |
| 82 | - translation_boost_when_source_missing: 1.0 | |
| 83 | - source_boost_when_missing: 0.6 | |
| 84 | - original_query_fallback_boost_when_translation_missing: 0.2 | |
| 85 | - tie_breaker_base_query: 0.9 | |
| 143 | + translation_boost: 0.75 | |
| 144 | + tie_breaker_base_query: 0.5 | |
| 86 | 145 | |
| 87 | 146 | # Embedding字段名称 |
| 88 | 147 | text_embedding_field: "title_embedding" |
| ... | ... | @@ -120,7 +179,7 @@ query_config: |
| 120 | 179 | - skus |
| 121 | 180 | |
| 122 | 181 | # KNN boost配置(向量召回的boost值) |
| 123 | - knn_boost: 0.25 # Lower boost for embedding recall | |
| 123 | 182 | + knn_boost: 2.0 # Boost for embedding (KNN) recall — raised from 0.25 | |
| 124 | 183 | |
| 125 | 184 | # Function Score配置(ES层打分规则) |
| 126 | 185 | function_score: |
| ... | ... | @@ -148,6 +207,17 @@ services: |
| 148 | 207 | cache: |
| 149 | 208 | ttl_seconds: 62208000 |
| 150 | 209 | sliding_expiration: true |
| 210 | + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups). | |
| 211 | + enable_model_quality_tier_cache: true | |
| 212 | + # Higher tier = better quality. Multiple models may share one tier (同级). | |
| 213 | + # A request using a model of tier T may reuse Redis keys written by models whose tier >= T (never from lower tiers). | |
| 214 | + model_quality_tiers: | |
| 215 | + deepl: 30 | |
| 216 | + qwen-mt: 30 | |
| 217 | + llm: 30 | |
| 218 | + nllb-200-distilled-600m: 20 | |
| 219 | + opus-mt-zh-en: 10 | |
| 220 | + opus-mt-en-zh: 10 | |
| 151 | 221 | capabilities: |
| 152 | 222 | qwen-mt: |
| 153 | 223 | enabled: true |
| ... | ... | @@ -290,7 +360,7 @@ services: |
| 290 | 360 | engine: "vllm" |
| 291 | 361 | max_model_len: 160 |
| 292 | 362 | tensor_parallel_size: 1 |
| 293 | - gpu_memory_utilization: 0.36 | |
| 363 | + gpu_memory_utilization: 0.20 | |
| 294 | 364 | dtype: "float16" |
| 295 | 365 | enable_prefix_caching: true |
| 296 | 366 | enforce_eager: false | ... | ... |
config/loader.py
| ... | ... | @@ -284,19 +284,30 @@ class AppConfigLoader: |
| 284 | 284 | base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), |
| 285 | 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), |
| 286 | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | - translation_boost_when_source_missing=float( | |
| 288 | - text_strategy.get("translation_boost_when_source_missing", 1.0) | |
| 289 | - ), | |
| 290 | - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)), | |
| 291 | - original_query_fallback_boost_when_translation_missing=float( | |
| 292 | - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2) | |
| 293 | - ), | |
| 294 | 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), |
| 295 | 288 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), |
| 296 | 289 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), |
| 297 | 290 | default_translation_model=str( |
| 298 | 291 | query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" |
| 299 | 292 | ), |
| 293 | + zh_to_en_model_source_not_in_index=( | |
| 294 | + str(v) | |
| 295 | + if (v := query_cfg.get("zh_to_en_model__source_not_in_index")) | |
| 296 | + not in (None, "") | |
| 297 | + else None | |
| 298 | + ), | |
| 299 | + en_to_zh_model_source_not_in_index=( | |
| 300 | + str(v) | |
| 301 | + if (v := query_cfg.get("en_to_zh_model__source_not_in_index")) | |
| 302 | + not in (None, "") | |
| 303 | + else None | |
| 304 | + ), | |
| 305 | + default_translation_model_source_not_in_index=( | |
| 306 | + str(v) | |
| 307 | + if (v := query_cfg.get("default_translation_model__source_not_in_index")) | |
| 308 | + not in (None, "") | |
| 309 | + else None | |
| 310 | + ), | |
| 300 | 311 | translation_embedding_wait_budget_ms_source_in_index=int( |
| 301 | 312 | query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) |
| 302 | 313 | ), | ... | ... |
config/schema.py
| ... | ... | @@ -54,13 +54,14 @@ class QueryConfig: |
| 54 | 54 | base_minimum_should_match: str = "70%" |
| 55 | 55 | translation_minimum_should_match: str = "70%" |
| 56 | 56 | translation_boost: float = 0.4 |
| 57 | - translation_boost_when_source_missing: float = 1.0 | |
| 58 | - source_boost_when_missing: float = 0.6 | |
| 59 | - original_query_fallback_boost_when_translation_missing: float = 0.2 | |
| 60 | 57 | tie_breaker_base_query: float = 0.9 |
| 61 | 58 | zh_to_en_model: str = "opus-mt-zh-en" |
| 62 | 59 | en_to_zh_model: str = "opus-mt-en-zh" |
| 63 | 60 | default_translation_model: str = "nllb-200-distilled-600m" |
| 61 | + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。 | |
| 62 | + zh_to_en_model_source_not_in_index: Optional[str] = None | |
| 63 | + en_to_zh_model_source_not_in_index: Optional[str] = None | |
| 64 | + default_translation_model_source_not_in_index: Optional[str] = None | |
| 64 | 65 | # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 |
| 65 | 66 | # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 |
| 66 | 67 | # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 | ... | ... |
docs/DEVELOPER_GUIDE.md
| ... | ... | @@ -147,7 +147,7 @@ docs/ # 文档(含本指南) |
| 147 | 147 | |
| 148 | 148 | ### 4.4 query |
| 149 | 149 | |
| 150 | -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 | |
| 150 | +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装。 | |
| 151 | 151 | - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 |
| 152 | 152 | |
| 153 | 153 | ### 4.5 search | ... | ... |
docs/QUICKSTART.md
| ... | ... | @@ -558,6 +558,21 @@ lsof -i :6004 |
| 558 | 558 | |
| 559 | 559 | 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 |
| 560 | 560 | |
| 561 | +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`) | |
| 562 | + | |
| 563 | +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。 | |
| 564 | + | |
| 565 | +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP: | |
| 566 | + | |
| 567 | +```bash | |
| 568 | +source activate.sh | |
| 569 | +pip install -r requirements_hanlp.txt | |
| 570 | +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))" | |
| 571 | +# 期望:4.x 且 True | |
| 572 | +``` | |
| 573 | + | |
| 574 | +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。 | |
| 575 | + | |
| 561 | 576 | --- |
| 562 | 577 | |
| 563 | 578 | ## 6. 相关文档 | ... | ... |
| ... | ... | @@ -0,0 +1,69 @@ |
| 1 | +ES 付费版本 or 定制开发(建议先看下付费版本价格) | |
| 2 | +ES定制开发: | |
| 3 | +RRF / retrievers | |
| 4 | + | |
| 5 | +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。 | |
| 6 | + | |
| 7 | +这类能力最有价值的点是: | |
| 8 | +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。 | |
| 9 | + | |
| 10 | +{ | |
| 11 | + "retriever": { | |
| 12 | + "rrf": { | |
| 13 | + "retrievers": [ | |
| 14 | + { "standard": { "query": { ... } } }, | |
| 15 | + { "knn": { ... } } | |
| 16 | + ] | |
| 17 | + } | |
| 18 | + } | |
| 19 | +} | |
| 20 | + | |
| 21 | + | |
| 22 | +加reranker: | |
| 23 | +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。 | |
| 24 | + | |
| 25 | +{ | |
| 26 | + "retriever": { | |
| 27 | + "text_similarity_reranker": { | |
| 28 | + "retriever": { | |
| 29 | + "rrf": { ... } | |
| 30 | + }, | |
| 31 | + ... | |
| 32 | + } | |
| 33 | + } | |
| 34 | +} | |
| 35 | + | |
| 36 | +{ | |
| 37 | + "retriever": { | |
| 38 | + "text_similarity_reranker": { | |
| 39 | + "retriever": { | |
| 40 | + "rrf": { | |
| 41 | + "retrievers": [ | |
| 42 | + { | |
| 43 | + "standard": { | |
| 44 | + "query": { | |
| 45 | + "...": "..." | |
| 46 | + } | |
| 47 | + } | |
| 48 | + }, | |
| 49 | + { | |
| 50 | + "knn": { | |
| 51 | + "...": "..." | |
| 52 | + } | |
| 53 | + } | |
| 54 | + ], | |
| 55 | + "rank_window_size": 100, | |
| 56 | + "rank_constant": 20 | |
| 57 | + } | |
| 58 | + }, | |
| 59 | + "field": "your_rerank_text_field", | |
| 60 | + "inference_text": "白色 oversized T-shirt", | |
| 61 | + "inference_id": ".rerank-v1-elasticsearch", | |
| 62 | + "rank_window_size": 50 | |
| 63 | + } | |
| 64 | + }, | |
| 65 | + "size": 20 | |
| 66 | +} | |
| 67 | + | |
| 68 | + | |
| 69 | + | ... | ... |
docs/TODO.txt
| 1 | 1 | |
| 2 | 2 | |
| 3 | -@reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗? | |
| 4 | -测试了,让每个批次都并发地进行,耗时没有变化 | |
| 3 | + | |
| 4 | +本地部署一个7b Q4量化的大模型 | |
| 5 | +es需要licence的两个功能,如果费用低,开通下licence,或者改es源码定制开发下,支持 rank.rrf,reranker | |
| 6 | + | |
| 7 | + | |
| 8 | + | |
| 9 | +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗? | |
| 10 | + | |
| 11 | + knn_boost: 2.0 | |
| 12 | + | |
| 13 | + | |
| 14 | +{ | |
| 15 | + "query": { ...全文检索... }, | |
| 16 | + "knn": { ...向量检索... }, | |
| 17 | + "rank": { | |
| 18 | + "rrf": {} | |
| 19 | + } | |
| 20 | +} | |
| 21 | + | |
| 22 | + | |
| 23 | +"image_embedding": { | |
| 24 | + "type": "nested", | |
| 25 | + "properties": { | |
| 26 | + "vector": { | |
| 27 | + "type": "dense_vector", | |
| 28 | + "dims": 1024, | |
| 29 | + "index": true, | |
| 30 | + "similarity": "dot_product", | |
| 31 | + "element_type": "bfloat16" | |
| 32 | + }, | |
| 33 | + "url": { | |
| 34 | + "type": "text" | |
| 35 | + } | |
| 36 | + } | |
| 37 | +}, | |
| 38 | +去掉 image_embedding_512 | |
| 39 | +image_embedding改为,一个spu有多个sku向量,每个向量内部properties: | |
| 40 | +除了vector url还应该包括,该图片是对应哪些sku | |
| 41 | +"image_embedding": { | |
| 42 | + "type": "nested", | |
| 43 | + "properties": { | |
| 44 | + "vector": { | |
| 45 | + "type": "dense_vector", | |
| 46 | + "dims": 1024, | |
| 47 | + "index": true, | |
| 48 | + "similarity": "dot_product", | |
| 49 | + "element_type": "bfloat16" | |
| 50 | + }, | |
| 51 | + "url": { | |
| 52 | + "type": "text" | |
| 53 | + } | |
| 54 | + } | |
| 55 | +}, | |
| 56 | + | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | +tags字段使用的优化: | |
| 61 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | |
| 62 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | |
| 63 | + | |
| 64 | + | |
| 65 | + | |
| 66 | +外部需求: | |
| 67 | +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | |
| 68 | +2. ES支持reranker pipeline? | |
| 69 | + | |
| 70 | + | |
| 71 | + | |
| 72 | + | |
| 73 | + | |
| 5 | 74 | |
| 6 | 75 | 增加款式意图识别模块 |
| 7 | 76 | |
| 8 | -意图类型: 颜色,尺寸(目前只需要支持这两种) | |
| 77 | +意图类型: 颜色,尺码(目前只需要支持这两种) | |
| 9 | 78 | |
| 10 | 79 | 意图召回层: |
| 11 | 80 | 每种意图,有一个召回词集合 |
| 12 | 81 | 对query(包括原始query、各种翻译query 都做匹配) |
| 13 | 82 | |
| 14 | -意图识别层: | |
| 15 | -如果召回 判断有款式需求, | |
| 83 | +以颜色意图为例: | |
| 84 | +有一个词表,每一行 都逗号分割,互为同义词,行内第一个为标准化词 | |
| 85 | +query匹配了其中任何一个词,都认为,具有颜色意图 | |
| 86 | +匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 | |
| 87 | + | |
| 88 | +意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 | |
| 89 | + | |
| 90 | + | |
| 91 | + | |
| 92 | +意图使用: | |
| 93 | + | |
| 94 | +我们第一阶段,使用 参与ES提权。 | |
| 95 | + | |
| 96 | +一、参与ES提权 | |
| 97 | + | |
| 98 | + | |
| 99 | +二、参与reranker | |
| 16 | 100 | |
| 17 | 101 | |
| 18 | -是否有: | |
| 19 | -颜色需求 | |
| 20 | -尺码需求 | |
| 21 | 102 | 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。 |
| 22 | 103 | |
| 23 | 104 | |
| 24 | 105 | 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。 |
| 25 | 106 | 需要优化: |
| 26 | 107 | 现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为 |
| 27 | -1. 第一轮:遍历完,如果有且仅有一个才这样。 | |
| 28 | -2. 第二轮:如果有多个,跳到3。如果没有,对每个词都走泛化词表进行匹配。 | |
| 108 | +1. 第一轮:遍历完,如果有且仅有一个被query包含,那么认为匹配。 | |
| 109 | +2. 第二轮:如果有多个符合(被query包含),跳到3。如果没有,对每个词都走泛化词表进行匹配。 | |
| 29 | 110 | 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的 |
| 30 | -这个sku筛选也需要提取为一个独立的模块 | |
| 31 | - | |
| 32 | - | |
| 33 | - | |
| 34 | -2026-03-21 10:29:23,698 - elastic_transport.transport - INFO - POST http://localhost:9200/search_products_tenant_163/_search?include_named_queries_score=false [status:200 duration:0.009s] | |
| 35 | -2026-03-21 10:29:23,700 - request_context - INFO - 分页详情回填 | ids=20 | filled=20 | took=7ms | |
| 36 | -2026-03-21 10:29:23,700 - request_context - INFO - 重排分页切片 | from=20, size=20, 返回=20条 | |
| 37 | -2026-03-21 10:29:23,720 - embeddings.text_encoder - ERROR - TextEmbeddingEncoder service request failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 | |
| 38 | -Traceback (most recent call last): | |
| 39 | - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service | |
| 40 | - response.raise_for_status() | |
| 41 | - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status | |
| 42 | - raise HTTPError(http_error_msg, response=self) | |
| 43 | -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 | |
| 44 | -2026-03-21 10:29:23,720 - search.searcher - WARNING - Failed to encode SKU option1 values for final-page sorting: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 | |
| 45 | -Traceback (most recent call last): | |
| 46 | - File "/data/saas-search/search/searcher.py", line 448, in _apply_sku_sorting_for_page_hits | |
| 47 | - encoded_option_vectors = text_encoder.encode(option1_values_to_encode, priority=1) | |
| 48 | - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| 49 | - File "/data/saas-search/embeddings/text_encoder.py", line 112, in encode | |
| 50 | - response_data = self._call_service( | |
| 51 | - ^^^^^^^^^^^^^^^^^^^ | |
| 52 | - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service | |
| 53 | - response.raise_for_status() | |
| 54 | - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status | |
| 55 | - raise HTTPError(http_error_msg, response=self) | |
| 56 | -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 | |
| 57 | -2026-03-21 10:29:23,721 - request_context - WARNING - SKU option embedding failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 | |
| 111 | +这个sku筛选也需要提取为一个独立的模块。 | |
| 112 | + | |
| 113 | + | |
| 114 | +另外:现在是reranker、分页之后做sku筛选,要改为: | |
| 115 | +1. 有款式意图的时候,才做sku筛选 | |
| 116 | +2. sku筛选的时机,改为在reranker之前,对所有内容做sku筛选,然后 | |
| 117 | +3. 从仅 option1 扩展到多个维度,识别的意图,包含意图的维度名(color)和维度名的泛化词list(color、颜色、colour、colors、……),遍历option1_name,option2_name,option3_name,看哪个能匹配上意图的维度名list,哪个匹配上了,则在这个维度筛选。 | |
| 118 | +4. Rerank doc (有款式意图的时候)要带上属性后缀,拼接到title后面。在调用 run_rerank 前,对每条 hit 生成「用于重排的 doc 文本」(标题 + 可选后缀) | |
| 119 | +5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku) | |
| 120 | + | |
| 58 | 121 | |
| 59 | 122 | |
| 60 | 123 | |
| 124 | +当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 | |
| 125 | + | |
| 126 | +请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 | |
| 127 | + | |
| 128 | + | |
| 129 | + | |
| 130 | + | |
| 131 | + | |
| 132 | + | |
| 133 | + | |
| 134 | + | |
| 135 | + | |
| 136 | + | |
| 137 | + | |
| 138 | + | |
| 139 | + | |
| 140 | +是否需要: | |
| 141 | +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段 | |
| 142 | + | |
| 143 | + | |
| 61 | 144 | |
| 62 | 145 | 先阅读文本embedding相关的代码: |
| 63 | 146 | @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py |
| ... | ... | @@ -361,6 +444,31 @@ embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec) |
| 361 | 444 | |
| 362 | 445 | |
| 363 | 446 | |
| 447 | + | |
| 448 | + | |
| 449 | + | |
| 450 | + | |
| 451 | + | |
| 452 | + | |
| 453 | + | |
| 454 | +多reranker: | |
| 455 | + | |
| 456 | +改 reranker 服务,一次请求返回多路分 | |
| 457 | +服务启动时 加载多个 backend(或按请求懒加载),/rerank 响应扩展为例如 | |
| 458 | +scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vllm": [...] }。 | |
| 459 | +搜索侧解析多路分,再融合或只透传 debug。 | |
| 460 | +优点:搜索侧仍只调一个 URL。缺点:单进程多大模型 显存压力很大; | |
| 461 | + | |
| 462 | +融合层要注意的一点 | |
| 463 | +fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score | |
| 464 | +多 backend 之后需要rerank_scores 都参与融合 | |
| 465 | + | |
| 466 | + | |
| 467 | + | |
| 468 | + | |
| 469 | + | |
| 470 | + | |
| 471 | + | |
| 364 | 472 | product_enrich : Partial Mode : done |
| 365 | 473 | https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR |
| 366 | 474 | 需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下: |
| ... | ... | @@ -383,6 +491,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men |
| 383 | 491 | |
| 384 | 492 | |
| 385 | 493 | 融合打分(已完成,2026-03) |
| 494 | + | |
| 495 | +以下已经完成: | |
| 386 | 496 | 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取: |
| 387 | 497 | - `base_query` |
| 388 | 498 | - `base_query_trans_*` |
| ... | ... | @@ -397,7 +507,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men |
| 397 | 507 | - `docs/搜索API对接指南.md` |
| 398 | 508 | - `docs/Usage-Guide.md` |
| 399 | 509 | |
| 400 | - | |
| 510 | +未完成的: | |
| 511 | +(归一化、次序融合?还乘法公式?) | |
| 512 | +RRF:先把多路召回稳妥融合 | |
| 513 | +linear + minmax:让你能精调 knn 和文本的权重 | |
| 514 | +reranker:对前面召回出来的 top-k 再做“最后一刀” | |
| 401 | 515 | |
| 402 | 516 | |
| 403 | 517 | ... | ... |
docs/搜索API对接指南-01-搜索接口.md
| ... | ... | @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 66 | 66 | | `min_score` | float | N | null | 最小相关性分数阈值 | |
| 67 | 67 | | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) | |
| 68 | 68 | | `debug` | boolean | N | false | 是否返回调试信息 | |
| 69 | -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 | | |
| 70 | -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 | | |
| 71 | -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 | | |
| 69 | +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) | | |
| 70 | +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` | | |
| 71 | +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` | | |
| 72 | + | |
| 73 | +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。 | |
| 72 | 74 | | `user_id` | string | N | null | 用户ID(用于个性化,预留) | |
| 73 | 75 | | `session_id` | string | N | null | 会话ID(用于分析,预留) | |
| 74 | 76 | |
| ... | ... | @@ -551,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 551 | 553 | | `rewritten_query` | string | 重写后的查询 | |
| 552 | 554 | | `detected_language` | string | 检测到的语言 | |
| 553 | 555 | | `translations` | object | 翻译结果 | |
| 554 | -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 | | |
| 555 | -| `search_langs` | array[string] | 实际参与检索的语言列表 | | |
| 556 | -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 | | |
| 557 | 556 | | `has_vector` | boolean | 是否生成了向量 | |
| 558 | 557 | |
| 559 | 558 | `debug_info.per_result[]` 常见字段: |
| ... | ... | @@ -563,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 563 | 562 | | `spu_id` | string | 结果 SPU ID | |
| 564 | 563 | | `es_score` | float | ES 原始 `_score` | |
| 565 | 564 | | `rerank_score` | float | 重排分数 | |
| 566 | -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | | |
| 565 | +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) | | |
| 567 | 566 | | `text_source_score` | float | `base_query` 分数 | |
| 568 | 567 | | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | |
| 569 | -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 | | |
| 570 | 568 | | `text_primary_score` | float | 文本大分中的主证据部分 | |
| 571 | 569 | | `text_support_score` | float | 文本大分中的辅助证据部分 | |
| 572 | 570 | | `knn_score` | float | `knn_query` 分数 | | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -2,11 +2,11 @@ |
| 2 | 2 | |
| 3 | 3 | ## 1. 文档目标 |
| 4 | 4 | |
| 5 | -本文描述当前线上代码的文本检索策略,重点覆盖: | |
| 5 | +本文描述当前代码中的文本检索策略,重点覆盖: | |
| 6 | 6 | |
| 7 | 7 | - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) |
| 8 | 8 | - 统一文本召回表达式(无布尔 AST 分支) |
| 9 | -- 翻译缺失时的兜底策略 | |
| 9 | +- 解析层与检索表达式层的职责边界 | |
| 10 | 10 | - 重排融合打分与调试字段 |
| 11 | 11 | - 典型场景下实际生成的 ES 查询结构 |
| 12 | 12 | |
| ... | ... | @@ -17,9 +17,11 @@ |
| 17 | 17 | 查询链路(文本相关): |
| 18 | 18 | |
| 19 | 19 | 1. `QueryParser.parse()` |
| 20 | - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 | |
| 20 | + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 | |
| 21 | +2. `Searcher.search()` | |
| 22 | + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 | |
| 21 | 23 | 2. `ESQueryBuilder._build_advanced_text_query()` |
| 22 | - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 | |
| 24 | + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 | |
| 23 | 25 | 3. `build_query()` |
| 24 | 26 | 统一走文本策略,不再有布尔 AST 枝路。 |
| 25 | 27 | |
| ... | ... | @@ -37,18 +39,18 @@ |
| 37 | 39 | 源语言字段做主召回;其他语言走翻译补召回(低权重)。 |
| 38 | 40 | 2. 若 `detected_language not in index_languages`: |
| 39 | 41 | 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 |
| 40 | -3. 若第 2 步翻译部分失败或全部失败: | |
| 41 | - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 | |
| 42 | +3. 若翻译部分失败或全部失败: | |
| 43 | + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。 | |
| 42 | 44 | |
| 43 | 45 | ### 3.2 翻译与向量:并发提交与共享超时 |
| 44 | 46 | |
| 45 | -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: | |
| 47 | +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`: | |
| 46 | 48 | |
| 47 | -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 | |
| 48 | -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 | |
| 49 | +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 | |
| 50 | +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。 | |
| 49 | 51 | - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 |
| 50 | 52 | |
| 51 | -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): | |
| 53 | +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`): | |
| 52 | 54 | |
| 53 | 55 | - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 |
| 54 | 56 | - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 |
| ... | ... | @@ -62,7 +64,7 @@ |
| 62 | 64 | ```json |
| 63 | 65 | { |
| 64 | 66 | "multi_match": { |
| 65 | - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", | |
| 67 | + "_name": "base_query|base_query_trans_xx", | |
| 66 | 68 | "query": "<text>", |
| 67 | 69 | "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], |
| 68 | 70 | "minimum_should_match": "75%", |
| ... | ... | @@ -75,7 +77,7 @@ |
| 75 | 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 |
| 76 | 78 | |
| 77 | 79 | > **附 — 混写辅助召回** |
| 78 | -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | |
| 80 | +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | |
| 79 | 81 | |
| 80 | 82 | ## 5. 关键配置项(文本策略) |
| 81 | 83 | |
| ... | ... | @@ -88,20 +90,12 @@ |
| 88 | 90 | |
| 89 | 91 | - `base_minimum_should_match` |
| 90 | 92 | - `translation_minimum_should_match` |
| 91 | -- `translation_boost` | |
| 92 | -- `translation_boost_when_source_missing` | |
| 93 | -- `source_boost_when_missing` | |
| 94 | -- `original_query_fallback_boost_when_translation_missing`(新增) | |
| 93 | +- `translation_boost`(所有 `base_query_trans_*` 共用) | |
| 95 | 94 | - `tie_breaker_base_query` |
| 96 | 95 | |
| 97 | -新增项说明: | |
| 98 | - | |
| 99 | -- `original_query_fallback_boost_when_translation_missing`: | |
| 100 | - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。 | |
| 101 | - | |
| 102 | 96 | 说明: |
| 103 | 97 | |
| 104 | -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 | |
| 98 | +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。 | |
| 105 | 99 | |
| 106 | 100 | ## 6. 典型场景与实际 DSL |
| 107 | 101 | |
| ... | ... | @@ -111,11 +105,12 @@ |
| 111 | 105 | |
| 112 | 106 | - `detected_language=de` |
| 113 | 107 | - `index_languages=[de,en]` |
| 114 | -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` | |
| 108 | +- `rewritten_query="herren schuhe"` | |
| 109 | +- `translations={en:"men shoes"}` | |
| 115 | 110 | |
| 116 | 111 | 策略结果: |
| 117 | 112 | |
| 118 | -- `base_query`:德语字段,正常权重 | |
| 113 | +- `base_query`:德语字段,**不写** `multi_match.boost` | |
| 119 | 114 | - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) |
| 120 | 115 | |
| 121 | 116 | ### 场景 B:源语种不在索引语言中,部分翻译缺失 |
| ... | ... | @@ -126,38 +121,44 @@ |
| 126 | 121 | |
| 127 | 122 | 策略结果: |
| 128 | 123 | |
| 129 | -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6) | |
| 130 | -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0) | |
| 131 | -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) | |
| 124 | +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0) | |
| 125 | +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4) | |
| 126 | +- 不会生成额外中文兜底子句 | |
| 132 | 127 | |
| 133 | 128 | ### 场景 C:源语种不在索引语言中,翻译全部失败 |
| 134 | 129 | |
| 135 | 130 | - `detected_language=de` |
| 136 | 131 | - `index_languages=[en,zh]` |
| 137 | -- `query_text_by_lang` 仅有 `de` | |
| 132 | +- `translations={}` | |
| 138 | 133 | |
| 139 | 134 | 策略结果: |
| 140 | 135 | |
| 141 | -- `base_query`(德语字段,低权重) | |
| 142 | -- `fallback_original_query_en`(英文字段原文兜底) | |
| 143 | -- `fallback_original_query_zh`(中文字段原文兜底) | |
| 136 | +- `base_query`(德语字段,**无** `boost` 字段) | |
| 137 | +- 不会生成 `base_query_trans_*` | |
| 144 | 138 | |
| 145 | -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题。 | |
| 139 | +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”。 | |
| 146 | 140 | |
| 147 | -## 7. QueryParser 与 ESBuilder 的职责分工 | |
| 141 | +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工 | |
| 148 | 142 | |
| 149 | -- `QueryParser` 负责“语言计划”与“可用文本”: | |
| 150 | - - `search_langs` | |
| 151 | - - `query_text_by_lang` | |
| 152 | - - `source_in_index_languages` | |
| 153 | - - `index_languages` | |
| 143 | +- `QueryParser` 负责“解析事实”: | |
| 144 | + - `query_normalized` | |
| 145 | + - `rewritten_query` | |
| 146 | + - `detected_language` | |
| 147 | + - `translations` | |
| 148 | + - `query_vector` | |
| 149 | + - `query_tokens` | |
| 154 | 150 | - `contains_chinese` / `contains_english` |
| 151 | +- `Searcher` 负责“租户语境”: | |
| 152 | + - `index_languages` | |
| 153 | + - 将其传给 parser 作为 `target_languages` | |
| 154 | + - 将其传给 builder 作为字段展开约束 | |
| 155 | 155 | - `ESQueryBuilder` 负责“表达式展开”: |
| 156 | 156 | - 动态字段组装 |
| 157 | 157 | - 子句权重分配 |
| 158 | - - 翻译缺失兜底子句拼接 | |
| 158 | + - `base_query` / `base_query_trans_*` 子句拼接 | |
| 159 | + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句 | |
| 159 | 160 | |
| 160 | -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界。 | |
| 161 | +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰。 | |
| 161 | 162 | |
| 162 | 163 | ## 8. 融合打分(Rerank + Text + KNN) |
| 163 | 164 | |
| ... | ... | @@ -165,24 +166,21 @@ |
| 165 | 166 | |
| 166 | 167 | ### 8.1 文本相关性大分 |
| 167 | 168 | |
| 168 | -文本大分由三部分组成: | |
| 169 | +文本大分由两部分组成: | |
| 169 | 170 | |
| 170 | 171 | - `base_query` |
| 171 | 172 | - `base_query_trans_*` |
| 172 | -- `fallback_original_query_*` | |
| 173 | 173 | |
| 174 | 174 | 聚合方式: |
| 175 | 175 | |
| 176 | 176 | 1. `source_score = base_query` |
| 177 | 177 | 2. `translation_score = max(base_query_trans_*)` |
| 178 | -3. `fallback_score = max(fallback_original_query_*)` | |
| 179 | -4. 加权: | |
| 178 | +3. 加权: | |
| 180 | 179 | - `weighted_source = source_score` |
| 181 | 180 | - `weighted_translation = 0.8 * translation_score` |
| 182 | - - `weighted_fallback = 0.55 * fallback_score` | |
| 183 | -5. 合成: | |
| 184 | - - `primary = max(weighted_source, weighted_translation, weighted_fallback)` | |
| 185 | - - `support = weighted_source + weighted_translation + weighted_fallback - primary` | |
| 181 | +4. 合成: | |
| 182 | + - `primary = max(weighted_source, weighted_translation)` | |
| 183 | + - `support = weighted_source + weighted_translation - primary` | |
| 186 | 184 | - `text_score = primary + 0.25 * support` |
| 187 | 185 | |
| 188 | 186 | 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 |
| ... | ... | @@ -212,7 +210,6 @@ fused_score = ( |
| 212 | 210 | - `text_score` |
| 213 | 211 | - `text_source_score` |
| 214 | 212 | - `text_translation_score` |
| 215 | -- `text_fallback_score` | |
| 216 | 213 | - `text_primary_score` |
| 217 | 214 | - `text_support_score` |
| 218 | 215 | - `knn_score` |
| ... | ... | @@ -221,9 +218,9 @@ fused_score = ( |
| 221 | 218 | |
| 222 | 219 | `debug_info.query_analysis` 还会暴露: |
| 223 | 220 | |
| 224 | -- `query_text_by_lang` | |
| 225 | -- `search_langs` | |
| 226 | -- `supplemental_search_langs` | |
| 221 | +- `translations` | |
| 222 | +- `detected_language` | |
| 223 | +- `rewritten_query` | |
| 227 | 224 | |
| 228 | 225 | 这些字段用于检索效果评估与 bad case 归因。 |
| 229 | 226 | |
| ... | ... | @@ -231,7 +228,7 @@ fused_score = ( |
| 231 | 228 | |
| 232 | 229 | 1. 当前文本主链路已移除布尔 AST 分支。 |
| 233 | 230 | 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 |
| 234 | -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性。 | |
| 231 | +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback。 | |
| 235 | 232 | 4. 若后续扩展到更多语种,请确保: |
| 236 | 233 | - mapping 中存在对应 `.<lang>` 字段 |
| 237 | 234 | - `index_languages` 配置在支持列表内 |
| ... | ... | @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py |
| 263 | 260 | 建议在 `tests/` 增加文本策略用例: |
| 264 | 261 | |
| 265 | 262 | 1. 源语种在索引语言,翻译命中缓存 |
| 266 | -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句) | |
| 267 | -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback) | |
| 268 | -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效 | |
| 269 | -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | |
| 263 | +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句) | |
| 264 | +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行) | |
| 265 | +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | |
| 270 | 266 | |
| 271 | 267 | |
| 272 | 268 | |
| ... | ... | @@ -281,3 +277,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid- |
| 281 | 277 | Rerank score: 0.9643 |
| 282 | 278 | title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top |
| 283 | 279 | title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣 |
| 280 | + | |
| 281 | + | |
| 282 | + | |
| 283 | +qwen3-0.6b的严重badcase: | |
| 284 | +q=牛仔裤 | |
| 285 | + | |
| 286 | +Rerank score: 0.0002 | |
| 287 | +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach | |
| 288 | +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色 | |
| 289 | + | |
| 290 | +Rerank score: 0.0168 | |
| 291 | +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter | |
| 292 | +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜 | |
| 293 | + | |
| 294 | +Rerank score: 0.1366 | |
| 295 | +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall) | |
| 296 | +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码) | |
| 297 | + | |
| 298 | +Rerank score: 0.0981 | |
| 299 | +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear | |
| 300 | +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰 | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -13,7 +13,7 @@ import numpy as np |
| 13 | 13 | import logging |
| 14 | 14 | import re |
| 15 | 15 | from typing import Dict, Any, Optional, List |
| 16 | -from indexer.product_enrich import analyze_products | |
| 16 | +from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 17 | 17 | |
| 18 | 18 | logger = logging.getLogger(__name__) |
| 19 | 19 | |
| ... | ... | @@ -121,7 +121,7 @@ class SPUDocumentTransformer: |
| 121 | 121 | # Tags |
| 122 | 122 | if pd.notna(spu_row.get('tags')): |
| 123 | 123 | tags_str = str(spu_row['tags']) |
| 124 | - doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()] | |
| 124 | + doc['tags'] = split_multi_value_field(tags_str) | |
| 125 | 125 | |
| 126 | 126 | # Category相关字段 |
| 127 | 127 | self._fill_category_fields(doc, spu_row) |
| ... | ... | @@ -282,11 +282,7 @@ class SPUDocumentTransformer: |
| 282 | 282 | raw = row.get(name) |
| 283 | 283 | if not raw: |
| 284 | 284 | continue |
| 285 | - parts = re.split(r"[,;|/\n\t]+", str(raw)) | |
| 286 | - for part in parts: | |
| 287 | - value = part.strip() | |
| 288 | - if not value: | |
| 289 | - continue | |
| 285 | + for value in split_multi_value_field(str(raw)): | |
| 290 | 286 | semantic_list.append({"lang": lang, "name": name, "value": value}) |
| 291 | 287 | |
| 292 | 288 | if qanchors_obj: |
| ... | ... | @@ -703,11 +699,7 @@ class SPUDocumentTransformer: |
| 703 | 699 | raw = row.get(name) |
| 704 | 700 | if not raw: |
| 705 | 701 | continue |
| 706 | - parts = re.split(r"[,;|/\n\t]+", str(raw)) | |
| 707 | - for part in parts: | |
| 708 | - value = part.strip() | |
| 709 | - if not value: | |
| 710 | - continue | |
| 702 | + for value in split_multi_value_field(str(raw)): | |
| 711 | 703 | semantic_list.append( |
| 712 | 704 | { |
| 713 | 705 | "lang": lang, | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -144,6 +144,20 @@ if _missing_prompt_langs: |
| 144 | 144 | ) |
| 145 | 145 | |
| 146 | 146 | |
| 147 | +# 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 | |
| 148 | +_MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") | |
| 149 | + | |
| 150 | + | |
| 151 | +def split_multi_value_field(text: Optional[str]) -> List[str]: | |
| 152 | + """将 LLM/业务中的多值字符串拆成短语列表(strip 后去空)。""" | |
| 153 | + if text is None: | |
| 154 | + return [] | |
| 155 | + s = str(text).strip() | |
| 156 | + if not s: | |
| 157 | + return [] | |
| 158 | + return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] | |
| 159 | + | |
| 160 | + | |
| 147 | 161 | def _normalize_space(text: str) -> str: |
| 148 | 162 | return re.sub(r"\s+", " ", (text or "").strip()) |
| 149 | 163 | ... | ... |
query/query_parser.py
| 1 | 1 | """ |
| 2 | 2 | Query parser - main module for query processing. |
| 3 | 3 | |
| 4 | -Handles query rewriting, translation, and embedding generation. | |
| 4 | +Responsibilities are intentionally narrow: | |
| 5 | +- normalize and rewrite the incoming query | |
| 6 | +- detect language and tokenize with HanLP | |
| 7 | +- run translation and embedding requests concurrently | |
| 8 | +- return parser facts, not Elasticsearch language-planning data | |
| 5 | 9 | """ |
| 6 | 10 | |
| 7 | -from typing import Dict, List, Optional, Any, Union, Tuple | |
| 11 | +from dataclasses import dataclass, field | |
| 12 | +from typing import Any, Callable, Dict, List, Optional, Tuple | |
| 8 | 13 | import numpy as np |
| 9 | 14 | import logging |
| 10 | 15 | import re |
| ... | ... | @@ -18,15 +23,12 @@ from .query_rewriter import QueryRewriter, QueryNormalizer |
| 18 | 23 | |
| 19 | 24 | logger = logging.getLogger(__name__) |
| 20 | 25 | |
| 21 | -try: | |
| 22 | - import hanlp # type: ignore | |
| 23 | -except Exception: # pragma: no cover | |
| 24 | - hanlp = None | |
| 26 | +import hanlp # type: ignore | |
| 25 | 27 | |
| 26 | 28 | |
| 27 | 29 | def simple_tokenize_query(text: str) -> List[str]: |
| 28 | 30 | """ |
| 29 | - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback). | |
| 31 | + Lightweight tokenizer for suggestion-side heuristics only. | |
| 30 | 32 | |
| 31 | 33 | - Consecutive CJK characters form one token |
| 32 | 34 | - Latin / digit runs (with internal hyphens) form tokens |
| ... | ... | @@ -37,63 +39,32 @@ def simple_tokenize_query(text: str) -> List[str]: |
| 37 | 39 | return pattern.findall(text) |
| 38 | 40 | |
| 39 | 41 | |
| 42 | +@dataclass(slots=True) | |
| 40 | 43 | class ParsedQuery: |
| 41 | - """Container for parsed query results.""" | |
| 42 | - | |
| 43 | - def __init__( | |
| 44 | - self, | |
| 45 | - original_query: str, | |
| 46 | - query_normalized: str, | |
| 47 | - rewritten_query: Optional[str] = None, | |
| 48 | - detected_language: Optional[str] = None, | |
| 49 | - translations: Dict[str, str] = None, | |
| 50 | - query_vector: Optional[np.ndarray] = None, | |
| 51 | - domain: str = "default", | |
| 52 | - keywords: str = "", | |
| 53 | - token_count: int = 0, | |
| 54 | - query_tokens: Optional[List[str]] = None, | |
| 55 | - query_text_by_lang: Optional[Dict[str, str]] = None, | |
| 56 | - search_langs: Optional[List[str]] = None, | |
| 57 | - index_languages: Optional[List[str]] = None, | |
| 58 | - source_in_index_languages: bool = True, | |
| 59 | - contains_chinese: bool = False, | |
| 60 | - contains_english: bool = False, | |
| 61 | - ): | |
| 62 | - self.original_query = original_query | |
| 63 | - self.query_normalized = query_normalized | |
| 64 | - self.rewritten_query = rewritten_query or query_normalized | |
| 65 | - self.detected_language = detected_language | |
| 66 | - self.translations = translations or {} | |
| 67 | - self.query_vector = query_vector | |
| 68 | - self.domain = domain | |
| 69 | - # Query analysis fields | |
| 70 | - self.keywords = keywords | |
| 71 | - self.token_count = token_count | |
| 72 | - self.query_tokens = query_tokens or [] | |
| 73 | - self.query_text_by_lang = query_text_by_lang or {} | |
| 74 | - self.search_langs = search_langs or [] | |
| 75 | - self.index_languages = index_languages or [] | |
| 76 | - self.source_in_index_languages = bool(source_in_index_languages) | |
| 77 | - self.contains_chinese = bool(contains_chinese) | |
| 78 | - self.contains_english = bool(contains_english) | |
| 44 | + """Container for query parser facts.""" | |
| 45 | + | |
| 46 | + original_query: str | |
| 47 | + query_normalized: str | |
| 48 | + rewritten_query: str | |
| 49 | + detected_language: Optional[str] = None | |
| 50 | + translations: Dict[str, str] = field(default_factory=dict) | |
| 51 | + query_vector: Optional[np.ndarray] = None | |
| 52 | + query_tokens: List[str] = field(default_factory=list) | |
| 53 | + contains_chinese: bool = False | |
| 54 | + contains_english: bool = False | |
| 79 | 55 | |
| 80 | 56 | def to_dict(self) -> Dict[str, Any]: |
| 81 | 57 | """Convert to dictionary representation.""" |
| 82 | - result = { | |
| 58 | + return { | |
| 83 | 59 | "original_query": self.original_query, |
| 84 | 60 | "query_normalized": self.query_normalized, |
| 85 | 61 | "rewritten_query": self.rewritten_query, |
| 86 | 62 | "detected_language": self.detected_language, |
| 87 | 63 | "translations": self.translations, |
| 88 | - "domain": self.domain | |
| 64 | + "query_tokens": self.query_tokens, | |
| 65 | + "contains_chinese": self.contains_chinese, | |
| 66 | + "contains_english": self.contains_english, | |
| 89 | 67 | } |
| 90 | - result["query_text_by_lang"] = self.query_text_by_lang | |
| 91 | - result["search_langs"] = self.search_langs | |
| 92 | - result["index_languages"] = self.index_languages | |
| 93 | - result["source_in_index_languages"] = self.source_in_index_languages | |
| 94 | - result["contains_chinese"] = self.contains_chinese | |
| 95 | - result["contains_english"] = self.contains_english | |
| 96 | - return result | |
| 97 | 68 | |
| 98 | 69 | |
| 99 | 70 | class QueryParser: |
| ... | ... | @@ -102,7 +73,7 @@ class QueryParser: |
| 102 | 73 | 1. Normalization |
| 103 | 74 | 2. Query rewriting (brand/category mappings, synonyms) |
| 104 | 75 | 3. Language detection |
| 105 | - 4. Translation to target languages | |
| 76 | + 4. Translation to caller-provided target languages | |
| 106 | 77 | 5. Text embedding generation (for semantic search) |
| 107 | 78 | """ |
| 108 | 79 | |
| ... | ... | @@ -110,7 +81,8 @@ class QueryParser: |
| 110 | 81 | self, |
| 111 | 82 | config: SearchConfig, |
| 112 | 83 | text_encoder: Optional[TextEmbeddingEncoder] = None, |
| 113 | - translator: Optional[Any] = None | |
| 84 | + translator: Optional[Any] = None, | |
| 85 | + tokenizer: Optional[Callable[[str], Any]] = None, | |
| 114 | 86 | ): |
| 115 | 87 | """ |
| 116 | 88 | Initialize query parser. |
| ... | ... | @@ -128,23 +100,7 @@ class QueryParser: |
| 128 | 100 | self.normalizer = QueryNormalizer() |
| 129 | 101 | self.language_detector = LanguageDetector() |
| 130 | 102 | self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) |
| 131 | - | |
| 132 | - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer. | |
| 133 | - self._tok = None | |
| 134 | - self._pos_tag = None | |
| 135 | - if hanlp is not None: | |
| 136 | - try: | |
| 137 | - logger.info("Initializing HanLP components...") | |
| 138 | - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 139 | - self._tok.config.output_spans = True | |
| 140 | - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) | |
| 141 | - logger.info("HanLP components initialized") | |
| 142 | - except Exception as e: | |
| 143 | - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}") | |
| 144 | - self._tok = None | |
| 145 | - self._pos_tag = None | |
| 146 | - else: | |
| 147 | - logger.info("HanLP not installed; using simple tokenizer") | |
| 103 | + self._tokenizer = tokenizer or self._build_tokenizer() | |
| 148 | 104 | |
| 149 | 105 | # Eager initialization (startup-time failure visibility, no lazy init in request path) |
| 150 | 106 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: |
| ... | ... | @@ -170,57 +126,81 @@ class QueryParser: |
| 170 | 126 | """Return pre-initialized translator.""" |
| 171 | 127 | return self._translator |
| 172 | 128 | |
| 129 | + def _build_tokenizer(self) -> Callable[[str], Any]: | |
| 130 | + """Build the tokenizer used by query parsing. No fallback path by design.""" | |
| 131 | + if hanlp is None: | |
| 132 | + raise RuntimeError("HanLP is required for QueryParser tokenization") | |
| 133 | + logger.info("Initializing HanLP tokenizer...") | |
| 134 | + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 135 | + tokenizer.config.output_spans = True | |
| 136 | + logger.info("HanLP tokenizer initialized") | |
| 137 | + return tokenizer | |
| 138 | + | |
| 173 | 139 | @staticmethod |
| 174 | - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: | |
| 140 | + def _pick_query_translation_model( | |
| 141 | + source_lang: str, | |
| 142 | + target_lang: str, | |
| 143 | + config: SearchConfig, | |
| 144 | + source_language_in_index: bool, | |
| 145 | + ) -> str: | |
| 175 | 146 | """Pick the translation capability for query-time translation (configurable).""" |
| 176 | 147 | src = str(source_lang or "").strip().lower() |
| 177 | 148 | tgt = str(target_lang or "").strip().lower() |
| 149 | + qc = config.query_config | |
| 150 | + | |
| 151 | + if source_language_in_index: | |
| 152 | + if src == "zh" and tgt == "en": | |
| 153 | + return qc.zh_to_en_model | |
| 154 | + if src == "en" and tgt == "zh": | |
| 155 | + return qc.en_to_zh_model | |
| 156 | + return qc.default_translation_model | |
| 178 | 157 | |
| 179 | - # Use dedicated models for zh<->en if configured | |
| 180 | 158 | if src == "zh" and tgt == "en": |
| 181 | - return config.query_config.zh_to_en_model | |
| 159 | + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model | |
| 182 | 160 | if src == "en" and tgt == "zh": |
| 183 | - return config.query_config.en_to_zh_model | |
| 184 | - | |
| 185 | - # For any other language pairs, fall back to the configurable default model. | |
| 186 | - # By default this is `nllb-200-distilled-600m` (multi-lingual local model). | |
| 187 | - return config.query_config.default_translation_model | |
| 188 | - | |
| 189 | - def _simple_tokenize(self, text: str) -> List[str]: | |
| 190 | - return simple_tokenize_query(text) | |
| 191 | - | |
| 192 | - def _extract_keywords(self, query: str) -> str: | |
| 193 | - """Extract keywords (nouns with length > 1) from query.""" | |
| 194 | - if self._tok is not None and self._pos_tag is not None: | |
| 195 | - tok_result = self._tok(query) | |
| 196 | - if not tok_result: | |
| 197 | - return "" | |
| 198 | - words = [x[0] for x in tok_result] | |
| 199 | - pos_tags = self._pos_tag(words) | |
| 200 | - keywords = [] | |
| 201 | - for word, pos in zip(words, pos_tags): | |
| 202 | - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"): | |
| 203 | - keywords.append(word) | |
| 204 | - return " ".join(keywords) | |
| 205 | - | |
| 206 | - # Fallback: treat tokens with length > 1 as "keywords" | |
| 207 | - tokens = self._simple_tokenize(query) | |
| 208 | - keywords = [t for t in tokens if len(t) > 1] | |
| 209 | - return " ".join(keywords) | |
| 210 | - | |
| 211 | - def _get_token_count(self, query: str) -> int: | |
| 212 | - """Get token count (HanLP if available, otherwise simple).""" | |
| 213 | - if self._tok is not None: | |
| 214 | - tok_result = self._tok(query) | |
| 215 | - return len(tok_result) if tok_result else 0 | |
| 216 | - return len(self._simple_tokenize(query)) | |
| 161 | + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model | |
| 162 | + return qc.default_translation_model_source_not_in_index or qc.default_translation_model | |
| 163 | + | |
| 164 | + @staticmethod | |
| 165 | + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: | |
| 166 | + normalized: List[str] = [] | |
| 167 | + seen = set() | |
| 168 | + for language in languages or []: | |
| 169 | + token = str(language or "").strip().lower() | |
| 170 | + if not token or token in seen: | |
| 171 | + continue | |
| 172 | + seen.add(token) | |
| 173 | + normalized.append(token) | |
| 174 | + return normalized | |
| 175 | + | |
| 176 | + @staticmethod | |
| 177 | + def _extract_tokens(tokenizer_result: Any) -> List[str]: | |
| 178 | + """Normalize tokenizer output into a flat token string list.""" | |
| 179 | + if not tokenizer_result: | |
| 180 | + return [] | |
| 181 | + if isinstance(tokenizer_result, str): | |
| 182 | + token = tokenizer_result.strip() | |
| 183 | + return [token] if token else [] | |
| 184 | + | |
| 185 | + tokens: List[str] = [] | |
| 186 | + for item in tokenizer_result: | |
| 187 | + token: Optional[str] = None | |
| 188 | + if isinstance(item, str): | |
| 189 | + token = item | |
| 190 | + elif isinstance(item, (list, tuple)) and item: | |
| 191 | + token = str(item[0]) | |
| 192 | + elif item is not None: | |
| 193 | + token = str(item) | |
| 194 | + | |
| 195 | + if token is None: | |
| 196 | + continue | |
| 197 | + token = token.strip() | |
| 198 | + if token: | |
| 199 | + tokens.append(token) | |
| 200 | + return tokens | |
| 217 | 201 | |
| 218 | 202 | def _get_query_tokens(self, query: str) -> List[str]: |
| 219 | - """Get token list (HanLP if available, otherwise simple).""" | |
| 220 | - if self._tok is not None: | |
| 221 | - tok_result = self._tok(query) | |
| 222 | - return [x[0] for x in tok_result] if tok_result else [] | |
| 223 | - return self._simple_tokenize(query) | |
| 203 | + return self._extract_tokens(self._tokenizer(query)) | |
| 224 | 204 | |
| 225 | 205 | @staticmethod |
| 226 | 206 | def _contains_cjk(text: str) -> bool: |
| ... | ... | @@ -237,64 +217,24 @@ class QueryParser: |
| 237 | 217 | return False |
| 238 | 218 | return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) |
| 239 | 219 | |
| 240 | - @staticmethod | |
| 241 | - def _extract_latin_tokens(text: str) -> List[str]: | |
| 242 | - """Extract latin word tokens from query text.""" | |
| 243 | - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") | |
| 244 | - | |
| 245 | - def _infer_supplemental_search_langs( | |
| 246 | - self, | |
| 247 | - query_text: str, | |
| 248 | - detected_lang: str, | |
| 249 | - index_langs: List[str], | |
| 250 | - ) -> List[str]: | |
| 251 | - """ | |
| 252 | - Infer extra languages to search when the query mixes scripts. | |
| 253 | - | |
| 254 | - Rules: | |
| 255 | - - If any Chinese characters appear, include `zh` when available. | |
| 256 | - - If the query contains meaningful latin tokens, include `en` when available. | |
| 257 | - "Meaningful" means either: | |
| 258 | - 1) at least 2 latin tokens with length >= 4, or | |
| 259 | - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. | |
| 260 | - """ | |
| 261 | - supplemental: List[str] = [] | |
| 262 | - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} | |
| 263 | - normalized_detected = str(detected_lang or "").strip().lower() | |
| 264 | - query_text = str(query_text or "") | |
| 265 | - | |
| 266 | - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": | |
| 267 | - supplemental.append("zh") | |
| 268 | - | |
| 269 | - latin_tokens = self._extract_latin_tokens(query_text) | |
| 270 | - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] | |
| 271 | - latin_chars = sum(len(tok) for tok in latin_tokens) | |
| 272 | - non_space_chars = len(re.sub(r"\s+", "", query_text)) | |
| 273 | - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 | |
| 274 | - has_meaningful_english = ( | |
| 275 | - len(significant_latin_tokens) >= 2 or | |
| 276 | - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) | |
| 277 | - ) | |
| 278 | - | |
| 279 | - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": | |
| 280 | - supplemental.append("en") | |
| 281 | - | |
| 282 | - return supplemental | |
| 283 | - | |
| 284 | 220 | def parse( |
| 285 | 221 | self, |
| 286 | 222 | query: str, |
| 287 | 223 | tenant_id: Optional[str] = None, |
| 288 | 224 | generate_vector: bool = True, |
| 289 | - context: Optional[Any] = None | |
| 225 | + context: Optional[Any] = None, | |
| 226 | + target_languages: Optional[List[str]] = None, | |
| 290 | 227 | ) -> ParsedQuery: |
| 291 | 228 | """ |
| 292 | 229 | Parse query through all processing stages. |
| 293 | 230 | |
| 294 | 231 | Args: |
| 295 | 232 | query: Raw query string |
| 233 | + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily | |
| 234 | + to avoid a wider refactor in this first step. | |
| 296 | 235 | generate_vector: Whether to generate query embedding |
| 297 | 236 | context: Optional request context for tracking and logging |
| 237 | + target_languages: Translation target languages decided by the caller | |
| 298 | 238 | |
| 299 | 239 | Returns: |
| 300 | 240 | ParsedQuery object with all processing results |
| ... | ... | @@ -325,15 +265,9 @@ class QueryParser: |
| 325 | 265 | if context: |
| 326 | 266 | context.store_intermediate_result('query_normalized', normalized) |
| 327 | 267 | |
| 328 | - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike") | |
| 329 | - domain, query_text = self.normalizer.extract_domain_query(normalized) | |
| 330 | - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'") | |
| 331 | - if context: | |
| 332 | - context.store_intermediate_result('extracted_domain', domain) | |
| 333 | - context.store_intermediate_result('domain_query', query_text) | |
| 334 | - | |
| 335 | 268 | # Stage 2: Query rewriting |
| 336 | - rewritten = None | |
| 269 | + query_text = normalized | |
| 270 | + rewritten = normalized | |
| 337 | 271 | if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists |
| 338 | 272 | rewritten = self.rewriter.rewrite(query_text) |
| 339 | 273 | if rewritten != query_text: |
| ... | ... | @@ -351,43 +285,57 @@ class QueryParser: |
| 351 | 285 | log_info(f"Language detection | Detected language: {detected_lang}") |
| 352 | 286 | if context: |
| 353 | 287 | context.store_intermediate_result('detected_language', detected_lang) |
| 288 | + # Stage 4: Query analysis (tokenization + script flags) | |
| 289 | + query_tokens = self._get_query_tokens(query_text) | |
| 290 | + contains_chinese = self._contains_cjk(query_text) | |
| 291 | + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 292 | + | |
| 293 | + log_debug( | |
| 294 | + f"Query analysis | Query tokens: {query_tokens} | " | |
| 295 | + f"contains_chinese={contains_chinese} | contains_english={contains_english}" | |
| 296 | + ) | |
| 297 | + if context: | |
| 298 | + context.store_intermediate_result('query_tokens', query_tokens) | |
| 299 | + context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 300 | + context.store_intermediate_result('contains_english', contains_english) | |
| 354 | 301 | |
| 355 | - # Stage 4: Translation — always submit to thread pool; results are collected together with | |
| 356 | - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index). | |
| 302 | + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the | |
| 303 | + # caller decides translation targets and later search-field planning. | |
| 357 | 304 | translations: Dict[str, str] = {} |
| 358 | - translation_futures: Dict[str, Any] = {} | |
| 359 | - translation_executor: Optional[ThreadPoolExecutor] = None | |
| 360 | - index_langs: List[str] = [] | |
| 305 | + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {} | |
| 306 | + async_executor: Optional[ThreadPoolExecutor] = None | |
| 361 | 307 | detected_norm = str(detected_lang or "").strip().lower() |
| 308 | + normalized_targets = self._normalize_language_codes(target_languages) | |
| 309 | + translation_targets = [lang for lang in normalized_targets if lang != detected_norm] | |
| 310 | + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets | |
| 311 | + | |
| 312 | + # Stage 6: Text embedding - async execution | |
| 313 | + query_vector = None | |
| 314 | + should_generate_embedding = ( | |
| 315 | + generate_vector and | |
| 316 | + self.config.query_config.enable_text_embedding | |
| 317 | + ) | |
| 318 | + | |
| 319 | + task_count = len(translation_targets) + (1 if should_generate_embedding else 0) | |
| 320 | + if task_count > 0: | |
| 321 | + async_executor = ThreadPoolExecutor( | |
| 322 | + max_workers=max(1, min(task_count, 4)), | |
| 323 | + thread_name_prefix="query-enrichment", | |
| 324 | + ) | |
| 362 | 325 | |
| 363 | 326 | try: |
| 364 | - # 根据租户配置的 index_languages 决定翻译目标语言 | |
| 365 | - from config.tenant_config_loader import get_tenant_config_loader | |
| 366 | - tenant_loader = get_tenant_config_loader() | |
| 367 | - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") | |
| 368 | - raw_index_langs = tenant_cfg.get("index_languages") or [] | |
| 369 | - index_langs = [] | |
| 370 | - seen_langs = set() | |
| 371 | - for lang in raw_index_langs: | |
| 372 | - norm_lang = str(lang or "").strip().lower() | |
| 373 | - if not norm_lang or norm_lang in seen_langs: | |
| 374 | - continue | |
| 375 | - seen_langs.add(norm_lang) | |
| 376 | - index_langs.append(norm_lang) | |
| 377 | - | |
| 378 | - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm] | |
| 379 | - | |
| 380 | - if target_langs_for_translation: | |
| 381 | - translation_executor = ThreadPoolExecutor( | |
| 382 | - max_workers=max(1, min(len(target_langs_for_translation), 4)), | |
| 383 | - thread_name_prefix="query-translation", | |
| 384 | - ) | |
| 385 | - for lang in target_langs_for_translation: | |
| 386 | - model_name = self._pick_query_translation_model(detected_lang, lang, self.config) | |
| 327 | + if async_executor is not None: | |
| 328 | + for lang in translation_targets: | |
| 329 | + model_name = self._pick_query_translation_model( | |
| 330 | + detected_lang, | |
| 331 | + lang, | |
| 332 | + self.config, | |
| 333 | + source_language_in_index, | |
| 334 | + ) | |
| 387 | 335 | log_debug( |
| 388 | 336 | f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" |
| 389 | 337 | ) |
| 390 | - translation_futures[lang] = translation_executor.submit( | |
| 338 | + future = async_executor.submit( | |
| 391 | 339 | self.translator.translate, |
| 392 | 340 | query_text, |
| 393 | 341 | lang, |
| ... | ... | @@ -395,107 +343,61 @@ class QueryParser: |
| 395 | 343 | "ecommerce_search_query", |
| 396 | 344 | model_name, |
| 397 | 345 | ) |
| 398 | - | |
| 399 | - if context: | |
| 400 | - context.store_intermediate_result('translations', translations) | |
| 401 | - for lang, translation in translations.items(): | |
| 402 | - if translation: | |
| 403 | - context.store_intermediate_result(f'translation_{lang}', translation) | |
| 404 | - | |
| 346 | + future_to_task[future] = ("translation", lang) | |
| 347 | + | |
| 348 | + if should_generate_embedding: | |
| 349 | + if self.text_encoder is None: | |
| 350 | + raise RuntimeError("Text embedding is enabled but text encoder is not initialized") | |
| 351 | + log_debug("Submitting query vector generation") | |
| 352 | + | |
| 353 | + def _encode_query_vector() -> Optional[np.ndarray]: | |
| 354 | + arr = self.text_encoder.encode([query_text], priority=1) | |
| 355 | + if arr is None or len(arr) == 0: | |
| 356 | + return None | |
| 357 | + vec = arr[0] | |
| 358 | + if vec is None: | |
| 359 | + return None | |
| 360 | + return np.asarray(vec, dtype=np.float32) | |
| 361 | + | |
| 362 | + future = async_executor.submit(_encode_query_vector) | |
| 363 | + future_to_task[future] = ("embedding", None) | |
| 405 | 364 | except Exception as e: |
| 406 | - error_msg = f"Translation failed | Error: {str(e)}" | |
| 365 | + error_msg = f"Async query enrichment submission failed | Error: {str(e)}" | |
| 407 | 366 | log_info(error_msg) |
| 408 | 367 | if context: |
| 409 | 368 | context.add_warning(error_msg) |
| 369 | + if async_executor is not None: | |
| 370 | + async_executor.shutdown(wait=False) | |
| 371 | + async_executor = None | |
| 372 | + future_to_task.clear() | |
| 410 | 373 | |
| 411 | - # Stage 5: Query analysis (keywords, token count, query_tokens) | |
| 412 | - keywords = self._extract_keywords(query_text) | |
| 413 | - query_tokens = self._get_query_tokens(query_text) | |
| 414 | - token_count = len(query_tokens) | |
| 415 | - contains_chinese = self._contains_cjk(query_text) | |
| 416 | - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 417 | - | |
| 418 | - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " | |
| 419 | - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | " | |
| 420 | - f"contains_english={contains_english}") | |
| 421 | - if context: | |
| 422 | - context.store_intermediate_result('keywords', keywords) | |
| 423 | - context.store_intermediate_result('token_count', token_count) | |
| 424 | - context.store_intermediate_result('query_tokens', query_tokens) | |
| 425 | - context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 426 | - context.store_intermediate_result('contains_english', contains_english) | |
| 427 | - | |
| 428 | - # Stage 6: Text embedding (only for non-short queries) - async execution | |
| 429 | - query_vector = None | |
| 430 | - embedding_future = None | |
| 431 | - should_generate_embedding = ( | |
| 432 | - generate_vector and | |
| 433 | - self.config.query_config.enable_text_embedding and | |
| 434 | - domain == "default" | |
| 435 | - ) | |
| 436 | - | |
| 437 | - encoding_executor = None | |
| 438 | - if should_generate_embedding: | |
| 439 | - try: | |
| 440 | - if self.text_encoder is None: | |
| 441 | - raise RuntimeError("Text embedding is enabled but text encoder is not initialized") | |
| 442 | - log_debug("Starting query vector generation (async)") | |
| 443 | - # Submit encoding task to thread pool for async execution | |
| 444 | - encoding_executor = ThreadPoolExecutor(max_workers=1) | |
| 445 | - def _encode_query_vector() -> Optional[np.ndarray]: | |
| 446 | - arr = self.text_encoder.encode([query_text], priority=1) | |
| 447 | - if arr is None or len(arr) == 0: | |
| 448 | - return None | |
| 449 | - vec = arr[0] | |
| 450 | - return vec if isinstance(vec, np.ndarray) else None | |
| 451 | - embedding_future = encoding_executor.submit( | |
| 452 | - _encode_query_vector | |
| 453 | - ) | |
| 454 | - except Exception as e: | |
| 455 | - error_msg = f"Query vector generation task submission failed | Error: {str(e)}" | |
| 456 | - log_info(error_msg) | |
| 457 | - if context: | |
| 458 | - context.add_warning(error_msg) | |
| 459 | - encoding_executor = None | |
| 460 | - embedding_future = None | |
| 461 | - | |
| 462 | - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether | |
| 463 | - # the detected language is in tenant index_languages. | |
| 374 | + # Wait for translation + embedding concurrently; shared budget depends on whether | |
| 375 | + # the detected language belongs to caller-provided target_languages. | |
| 464 | 376 | qc = self.config.query_config |
| 465 | - source_in_index_for_budget = detected_norm in index_langs | |
| 377 | + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets | |
| 466 | 378 | budget_ms = ( |
| 467 | 379 | qc.translation_embedding_wait_budget_ms_source_in_index |
| 468 | - if source_in_index_for_budget | |
| 380 | + if source_in_target_languages | |
| 469 | 381 | else qc.translation_embedding_wait_budget_ms_source_not_in_index |
| 470 | 382 | ) |
| 471 | 383 | budget_sec = max(0.0, float(budget_ms) / 1000.0) |
| 472 | 384 | |
| 473 | - if translation_futures: | |
| 385 | + if translation_targets: | |
| 474 | 386 | log_info( |
| 475 | 387 | f"Translation+embedding shared wait budget | budget_ms={budget_ms} | " |
| 476 | - f"source_in_index_languages={source_in_index_for_budget} | " | |
| 477 | - f"translation_targets={list(translation_futures.keys())}" | |
| 388 | + f"source_in_target_languages={source_in_target_languages} | " | |
| 389 | + f"translation_targets={translation_targets}" | |
| 478 | 390 | ) |
| 479 | 391 | |
| 480 | - if translation_futures or embedding_future: | |
| 392 | + if future_to_task: | |
| 481 | 393 | log_debug( |
| 482 | 394 | f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | " |
| 483 | - f"source_in_index_languages={source_in_index_for_budget}" | |
| 395 | + f"source_in_target_languages={source_in_target_languages}" | |
| 484 | 396 | ) |
| 485 | 397 | |
| 486 | - all_futures: List[Any] = [] | |
| 487 | - future_to_lang: Dict[Any, tuple] = {} | |
| 488 | - for lang, future in translation_futures.items(): | |
| 489 | - all_futures.append(future) | |
| 490 | - future_to_lang[future] = ("translation", lang) | |
| 491 | - | |
| 492 | - if embedding_future: | |
| 493 | - all_futures.append(embedding_future) | |
| 494 | - future_to_lang[embedding_future] = ("embedding", None) | |
| 495 | - | |
| 496 | - done, not_done = wait(all_futures, timeout=budget_sec) | |
| 398 | + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) | |
| 497 | 399 | for future in done: |
| 498 | - task_type, lang = future_to_lang[future] | |
| 400 | + task_type, lang = future_to_task[future] | |
| 499 | 401 | try: |
| 500 | 402 | result = future.result() |
| 501 | 403 | if task_type == "translation": |
| ... | ... | @@ -528,7 +430,7 @@ class QueryParser: |
| 528 | 430 | |
| 529 | 431 | if not_done: |
| 530 | 432 | for future in not_done: |
| 531 | - task_type, lang = future_to_lang[future] | |
| 433 | + task_type, lang = future_to_task[future] | |
| 532 | 434 | if task_type == "translation": |
| 533 | 435 | timeout_msg = ( |
| 534 | 436 | f"Translation timeout (>{budget_ms}ms) | Language: {lang} | " |
| ... | ... | @@ -542,68 +444,21 @@ class QueryParser: |
| 542 | 444 | if context: |
| 543 | 445 | context.add_warning(timeout_msg) |
| 544 | 446 | |
| 545 | - if encoding_executor: | |
| 546 | - encoding_executor.shutdown(wait=False) | |
| 547 | - if translation_executor: | |
| 548 | - translation_executor.shutdown(wait=False) | |
| 447 | + if async_executor: | |
| 448 | + async_executor.shutdown(wait=False) | |
| 549 | 449 | |
| 550 | 450 | if translations and context: |
| 551 | 451 | context.store_intermediate_result("translations", translations) |
| 552 | - | |
| 553 | - # Build language-scoped query plan: source language + available translations | |
| 554 | - query_text_by_lang: Dict[str, str] = {} | |
| 555 | - if query_text: | |
| 556 | - query_text_by_lang[detected_lang] = query_text | |
| 557 | - for lang, translated_text in (translations or {}).items(): | |
| 558 | - if translated_text and str(translated_text).strip(): | |
| 559 | - query_text_by_lang[str(lang).strip().lower()] = str(translated_text) | |
| 560 | - | |
| 561 | - supplemental_search_langs = self._infer_supplemental_search_langs( | |
| 562 | - query_text=query_text, | |
| 563 | - detected_lang=detected_lang, | |
| 564 | - index_langs=index_langs, | |
| 565 | - ) | |
| 566 | - for lang in supplemental_search_langs: | |
| 567 | - if lang not in query_text_by_lang and query_text: | |
| 568 | - # Use the original mixed-script query as a robust fallback probe for that language field set. | |
| 569 | - query_text_by_lang[lang] = query_text | |
| 570 | - | |
| 571 | - source_in_index_languages = detected_norm in index_langs | |
| 572 | - ordered_search_langs: List[str] = [] | |
| 573 | - seen_order = set() | |
| 574 | - if detected_lang in query_text_by_lang: | |
| 575 | - ordered_search_langs.append(detected_lang) | |
| 576 | - seen_order.add(detected_lang) | |
| 577 | - for lang in index_langs: | |
| 578 | - if lang in query_text_by_lang and lang not in seen_order: | |
| 579 | - ordered_search_langs.append(lang) | |
| 580 | - seen_order.add(lang) | |
| 581 | - for lang in query_text_by_lang.keys(): | |
| 582 | - if lang not in seen_order: | |
| 583 | - ordered_search_langs.append(lang) | |
| 584 | - seen_order.add(lang) | |
| 585 | - | |
| 586 | - if context: | |
| 587 | - context.store_intermediate_result("search_langs", ordered_search_langs) | |
| 588 | - context.store_intermediate_result("query_text_by_lang", query_text_by_lang) | |
| 589 | - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) | |
| 590 | 452 | |
| 591 | 453 | # Build result |
| 592 | 454 | result = ParsedQuery( |
| 593 | 455 | original_query=query, |
| 594 | 456 | query_normalized=normalized, |
| 595 | - rewritten_query=rewritten, | |
| 457 | + rewritten_query=query_text, | |
| 596 | 458 | detected_language=detected_lang, |
| 597 | 459 | translations=translations, |
| 598 | 460 | query_vector=query_vector, |
| 599 | - domain=domain, | |
| 600 | - keywords=keywords, | |
| 601 | - token_count=token_count, | |
| 602 | 461 | query_tokens=query_tokens, |
| 603 | - query_text_by_lang=query_text_by_lang, | |
| 604 | - search_langs=ordered_search_langs, | |
| 605 | - index_languages=index_langs, | |
| 606 | - source_in_index_languages=source_in_index_languages, | |
| 607 | 462 | contains_chinese=contains_chinese, |
| 608 | 463 | contains_english=contains_english, |
| 609 | 464 | ) |
| ... | ... | @@ -611,14 +466,13 @@ class QueryParser: |
| 611 | 466 | if context and hasattr(context, 'logger'): |
| 612 | 467 | context.logger.info( |
| 613 | 468 | f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " |
| 614 | - f"Language: {detected_lang} | Domain: {domain} | " | |
| 615 | 469 | f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", |
| 616 | 470 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 617 | 471 | ) |
| 618 | 472 | else: |
| 619 | 473 | logger.info( |
| 620 | 474 | f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " |
| 621 | - f"Language: {detected_lang} | Domain: {domain}" | |
| 475 | + f"Language: {detected_lang}" | |
| 622 | 476 | ) |
| 623 | 477 | |
| 624 | 478 | return result | ... | ... |
| ... | ... | @@ -0,0 +1,13 @@ |
| 1 | +# Optional: HanLP query tokenization for the main backend venv (QueryParser). | |
| 2 | +# | |
| 3 | +# Install: | |
| 4 | +# source activate.sh | |
| 5 | +# pip install -r requirements_hanlp.txt | |
| 6 | +# | |
| 7 | +# Why pin transformers<5: | |
| 8 | +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x | |
| 9 | +# still calls it, so `hanlp.load(...)` fails with AttributeError. | 
| 10 | +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP. | |
| 11 | + | |
| 12 | +hanlp>=2.1.0 | |
| 13 | +transformers>=4.44,<5 | ... | ... |
scripts/eval_search_quality.py
| ... | ... | @@ -83,7 +83,6 @@ class RankedItem: |
| 83 | 83 | text_score: float | None |
| 84 | 84 | text_source_score: float | None |
| 85 | 85 | text_translation_score: float | None |
| 86 | - text_fallback_score: float | None | |
| 87 | 86 | text_primary_score: float | None |
| 88 | 87 | text_support_score: float | None |
| 89 | 88 | knn_score: float | None |
| ... | ... | @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: |
| 146 | 145 | text_score=_to_float(debug_item.get("text_score")), |
| 147 | 146 | text_source_score=_to_float(debug_item.get("text_source_score")), |
| 148 | 147 | text_translation_score=_to_float(debug_item.get("text_translation_score")), |
| 149 | - text_fallback_score=_to_float(debug_item.get("text_fallback_score")), | |
| 150 | 148 | text_primary_score=_to_float(debug_item.get("text_primary_score")), |
| 151 | 149 | text_support_score=_to_float(debug_item.get("text_support_score")), |
| 152 | 150 | knn_score=_to_float(debug_item.get("knn_score")), |
| ... | ... | @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: |
| 185 | 183 | f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" |
| 186 | 184 | ) |
| 187 | 185 | lines.append( |
| 188 | - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" | |
| 186 | + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}" | |
| 189 | 187 | ) |
| 190 | - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}") | |
| 191 | 188 | lines.append("") |
| 192 | - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |") | |
| 193 | - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | |
| 189 | + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |") | |
| 190 | + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | |
| 194 | 191 | for item in entry.get("top20", []): |
| 195 | 192 | title = str(item.get("title", "")).replace("|", "/") |
| 196 | 193 | matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) |
| ... | ... | @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: |
| 199 | 196 | f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " |
| 200 | 197 | f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " |
| 201 | 198 | f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " |
| 202 | - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" | |
| 199 | + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |" | |
| 203 | 200 | ) |
| 204 | 201 | lines.append("") |
| 205 | 202 | return "\n".join(lines) | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -36,11 +36,12 @@ class ESQueryBuilder: |
| 36 | 36 | base_minimum_should_match: str = "70%", |
| 37 | 37 | translation_minimum_should_match: str = "70%", |
| 38 | 38 | translation_boost: float = 0.4, |
| 39 | - translation_boost_when_source_missing: float = 1.0, | |
| 40 | - source_boost_when_missing: float = 0.6, | |
| 41 | - original_query_fallback_boost_when_translation_missing: float = 0.2, | |
| 42 | 39 | tie_breaker_base_query: float = 0.9, |
| 43 | 40 | mixed_script_merged_field_boost_scale: float = 0.6, |
| 41 | + phrase_match_base_fields: Optional[Tuple[str, ...]] = None, | |
| 42 | + phrase_match_slop: int = 2, | |
| 43 | + phrase_match_tie_breaker: float = 0.4, | |
| 44 | + phrase_match_boost: float = 3.0, | |
| 44 | 45 | ): |
| 45 | 46 | """ |
| 46 | 47 | Initialize query builder. |
| ... | ... | @@ -74,13 +75,12 @@ class ESQueryBuilder: |
| 74 | 75 | self.base_minimum_should_match = base_minimum_should_match |
| 75 | 76 | self.translation_minimum_should_match = translation_minimum_should_match |
| 76 | 77 | self.translation_boost = float(translation_boost) |
| 77 | - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) | |
| 78 | - self.source_boost_when_missing = float(source_boost_when_missing) | |
| 79 | - self.original_query_fallback_boost_when_translation_missing = float( | |
| 80 | - original_query_fallback_boost_when_translation_missing | |
| 81 | - ) | |
| 82 | 78 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 83 | 79 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) |
| 80 | + self.phrase_match_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors")) | |
| 81 | + self.phrase_match_slop = int(phrase_match_slop) | |
| 82 | + self.phrase_match_tie_breaker = float(phrase_match_tie_breaker) | |
| 83 | + self.phrase_match_boost = float(phrase_match_boost) | |
| 84 | 84 | |
| 85 | 85 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: |
| 86 | 86 | """ |
| ... | ... | @@ -159,7 +159,8 @@ class ESQueryBuilder: |
| 159 | 159 | knn_k: int = 50, |
| 160 | 160 | knn_num_candidates: int = 200, |
| 161 | 161 | min_score: Optional[float] = None, |
| 162 | - parsed_query: Optional[Any] = None | |
| 162 | + parsed_query: Optional[Any] = None, | |
| 163 | + index_languages: Optional[List[str]] = None, | |
| 163 | 164 | ) -> Dict[str, Any]: |
| 164 | 165 | """ |
| 165 | 166 | Build complete ES query with post_filter support for multi-select faceting. |
| ... | ... | @@ -167,7 +168,7 @@ class ESQueryBuilder: |
| 167 | 168 | 结构:filters and (text_recall or embedding_recall) + post_filter |
| 168 | 169 | - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) |
| 169 | 170 | - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) |
| 170 | - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) | |
| 171 | + - text_recall: 文本相关性召回(按实际 clause 语言动态字段) | |
| 171 | 172 | - embedding_recall: 向量召回(KNN) |
| 172 | 173 | - function_score: 包装召回部分,支持提权字段 |
| 173 | 174 | |
| ... | ... | @@ -202,7 +203,11 @@ class ESQueryBuilder: |
| 202 | 203 | # Text recall (always include if query_text exists) |
| 203 | 204 | if query_text: |
| 204 | 205 | # Unified text query strategy |
| 205 | - text_query = self._build_advanced_text_query(query_text, parsed_query) | |
| 206 | + text_query = self._build_advanced_text_query( | |
| 207 | + query_text, | |
| 208 | + parsed_query, | |
| 209 | + index_languages=index_languages, | |
| 210 | + ) | |
| 206 | 211 | recall_clauses.append(text_query) |
| 207 | 212 | |
| 208 | 213 | # Embedding recall (KNN - separate from query, handled below) |
| ... | ... | @@ -456,6 +461,44 @@ class ESQueryBuilder: |
| 456 | 461 | """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" |
| 457 | 462 | return [self._format_field_with_boost(path, boost) for path, boost in specs] |
| 458 | 463 | |
| 464 | + def _build_phrase_match_fields(self, language: str) -> List[str]: | |
| 465 | + """Fields for phrase multi_match: base names × ``.{lang}`` with ``field_boosts``.""" | |
| 466 | + lang = (language or "").strip().lower() | |
| 467 | + if not lang: | |
| 468 | + return [] | |
| 469 | + out: List[str] = [] | |
| 470 | + for base in self.phrase_match_base_fields: | |
| 471 | + path = f"{base}.{lang}" | |
| 472 | + boost = self._get_field_boost(base, lang) | |
| 473 | + out.append(self._format_field_with_boost(path, boost)) | |
| 474 | + return out | |
| 475 | + | |
| 476 | + def _append_phrase_should_clause( | |
| 477 | + self, | |
| 478 | + should_clauses: List[Dict[str, Any]], | |
| 479 | + lang: str, | |
| 480 | + lang_query: str, | |
| 481 | + clause_name: str | |
| 482 | + ) -> None: | |
| 483 | + text = (lang_query or "").strip() | |
| 484 | + if not text: | |
| 485 | + return | |
| 486 | + phrase_fields = self._build_phrase_match_fields(lang) | |
| 487 | + if not phrase_fields: | |
| 488 | + return | |
| 489 | + boost = self.phrase_match_boost | |
| 490 | + should_clauses.append({ | |
| 491 | + "multi_match": { | |
| 492 | + "_name": f"{clause_name}_phrase", | |
| 493 | + "query": lang_query, | |
| 494 | + "type": "phrase", | |
| 495 | + "fields": phrase_fields, | |
| 496 | + "slop": self.phrase_match_slop, | |
| 497 | + "tie_breaker": self.phrase_match_tie_breaker, | |
| 498 | + "boost": boost, | |
| 499 | + } | |
| 500 | + }) | |
| 501 | + | |
| 459 | 502 | def _merge_supplemental_lang_field_specs( |
| 460 | 503 | self, |
| 461 | 504 | specs: List[MatchFieldSpec], |
| ... | ... | @@ -479,6 +522,7 @@ class ESQueryBuilder: |
| 479 | 522 | contains_chinese: bool, |
| 480 | 523 | contains_english: bool, |
| 481 | 524 | index_languages: List[str], |
| 525 | + is_source: bool = False | |
| 482 | 526 | ) -> List[MatchFieldSpec]: |
| 483 | 527 | """ |
| 484 | 528 | When the query mixes scripts, widen each clause to indexed fields for the other script |
| ... | ... | @@ -492,10 +536,11 @@ class ESQueryBuilder: |
| 492 | 536 | |
| 493 | 537 | out = list(specs) |
| 494 | 538 | lnorm = (lang or "").strip().lower() |
| 495 | - if contains_english and lnorm != "en" and can_use("en"): | |
| 496 | - out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 497 | - if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 498 | - out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 539 | + if is_source: | |
| 540 | + if contains_english and lnorm != "en" and can_use("en"): | |
| 541 | + out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 542 | + if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 543 | + out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 499 | 544 | return out |
| 500 | 545 | |
| 501 | 546 | def _get_embedding_field(self, language: str) -> str: |
| ... | ... | @@ -503,13 +548,31 @@ class ESQueryBuilder: |
| 503 | 548 | # Currently using unified embedding field |
| 504 | 549 | return self.text_embedding_field or "title_embedding" |
| 505 | 550 | |
| 506 | - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]: | |
| 551 | + @staticmethod | |
| 552 | + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: | |
| 553 | + normalized: List[str] = [] | |
| 554 | + seen = set() | |
| 555 | + for language in languages or []: | |
| 556 | + token = str(language or "").strip().lower() | |
| 557 | + if not token or token in seen: | |
| 558 | + continue | |
| 559 | + seen.add(token) | |
| 560 | + normalized.append(token) | |
| 561 | + return normalized | |
| 562 | + | |
| 563 | + def _build_advanced_text_query( | |
| 564 | + self, | |
| 565 | + query_text: str, | |
| 566 | + parsed_query: Optional[Any] = None, | |
| 567 | + *, | |
| 568 | + index_languages: Optional[List[str]] = None, | |
| 569 | + ) -> Dict[str, Any]: | |
| 507 | 570 | """ |
| 508 | - Build advanced text query using should clauses with primary and fallback lexical strategies. | |
| 571 | + Build advanced text query using base and translated lexical clauses. | |
| 509 | 572 | |
| 510 | 573 | Unified implementation: |
| 511 | 574 | - base_query: source-language clause |
| 512 | - - translation queries: target-language clauses from search_langs/query_text_by_lang | |
| 575 | + - translation queries: target-language clauses from translations | |
| 513 | 576 | - KNN query: added separately in build_query |
| 514 | 577 | |
| 515 | 578 | Args: |
| ... | ... | @@ -520,66 +583,41 @@ class ESQueryBuilder: |
| 520 | 583 | ES bool query with should clauses |
| 521 | 584 | """ |
| 522 | 585 | should_clauses = [] |
| 523 | - | |
| 524 | - # Get query analysis from parsed_query | |
| 525 | - query_text_by_lang: Dict[str, str] = {} | |
| 526 | - search_langs: List[str] = [] | |
| 527 | 586 | source_lang = self.default_language |
| 528 | - source_in_index_languages = True | |
| 529 | - index_languages: List[str] = [] | |
| 530 | - | |
| 587 | + translations: Dict[str, str] = {} | |
| 531 | 588 | contains_chinese = False |
| 532 | 589 | contains_english = False |
| 590 | + normalized_index_languages = self._normalize_language_list(index_languages) | |
| 591 | + | |
| 533 | 592 | if parsed_query: |
| 534 | - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} | |
| 535 | - search_langs = getattr(parsed_query, "search_langs", None) or [] | |
| 536 | 593 | detected_lang = getattr(parsed_query, "detected_language", None) |
| 537 | 594 | source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language |
| 538 | - source_in_index_languages = bool( | |
| 539 | - getattr(parsed_query, "source_in_index_languages", True) | |
| 540 | - ) | |
| 541 | - index_languages = getattr(parsed_query, "index_languages", None) or [] | |
| 595 | + translations = getattr(parsed_query, "translations", None) or {} | |
| 542 | 596 | contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) |
| 543 | 597 | contains_english = bool(getattr(parsed_query, "contains_english", False)) |
| 544 | 598 | |
| 545 | - if not query_text_by_lang: | |
| 546 | - query_text_by_lang = {source_lang: query_text} | |
| 547 | - if source_lang not in query_text_by_lang and query_text: | |
| 548 | - query_text_by_lang[source_lang] = query_text | |
| 549 | - if not search_langs: | |
| 550 | - search_langs = list(query_text_by_lang.keys()) | |
| 551 | - | |
| 552 | - # Base + translated clauses based on language plan. | |
| 553 | - for lang in search_langs: | |
| 554 | - lang_query = query_text_by_lang.get(lang) | |
| 555 | - if not lang_query: | |
| 556 | - continue | |
| 599 | + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language | |
| 600 | + base_query_text = ( | |
| 601 | + getattr(parsed_query, "rewritten_query", None) if parsed_query else None | |
| 602 | + ) or query_text | |
| 603 | + | |
| 604 | + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: | |
| 605 | + nonlocal should_clauses | |
| 557 | 606 | all_specs, _ = self._build_match_field_specs(lang) |
| 558 | 607 | expanded_specs = self._expand_match_field_specs_for_mixed_script( |
| 559 | 608 | lang, |
| 560 | 609 | all_specs, |
| 561 | 610 | contains_chinese, |
| 562 | 611 | contains_english, |
| 563 | - index_languages, | |
| 612 | + normalized_index_languages, | |
| 613 | + is_source, | |
| 564 | 614 | ) |
| 565 | 615 | match_fields = self._format_match_field_specs(expanded_specs) |
| 566 | 616 | if not match_fields: |
| 567 | - continue | |
| 568 | - | |
| 569 | - is_source = (lang == source_lang) | |
| 570 | - clause_boost = 1.0 | |
| 571 | - clause_name = "base_query" if is_source else f"base_query_trans_{lang}" | |
| 617 | + return | |
| 572 | 618 | minimum_should_match = ( |
| 573 | 619 | self.base_minimum_should_match if is_source else self.translation_minimum_should_match |
| 574 | 620 | ) |
| 575 | - if is_source and not source_in_index_languages: | |
| 576 | - clause_boost = self.source_boost_when_missing | |
| 577 | - elif not is_source: | |
| 578 | - clause_boost = ( | |
| 579 | - self.translation_boost | |
| 580 | - if source_in_index_languages | |
| 581 | - else self.translation_boost_when_source_missing | |
| 582 | - ) | |
| 583 | 621 | |
| 584 | 622 | clause = { |
| 585 | 623 | "multi_match": { |
| ... | ... | @@ -590,55 +628,34 @@ class ESQueryBuilder: |
| 590 | 628 | "tie_breaker": self.tie_breaker_base_query, |
| 591 | 629 | } |
| 592 | 630 | } |
| 593 | - if abs(clause_boost - 1.0) > 1e-9: | |
| 594 | - clause["multi_match"]["boost"] = clause_boost | |
| 631 | + # base_query: never set multi_match.boost (ES default 1.0). | |
| 632 | + # Translation clauses: single knob from config — translation_boost. | |
| 633 | + if not is_source: | |
| 634 | + tb = float(self.translation_boost) | |
| 635 | + clause["multi_match"]["boost"] = tb | |
| 595 | 636 | should_clauses.append({ |
| 596 | 637 | "multi_match": clause["multi_match"] |
| 597 | 638 | }) |
| 639 | + self._append_phrase_should_clause( | |
| 640 | + should_clauses, lang, lang_query, clause_name | |
| 641 | + ) | |
| 598 | 642 | |
| 599 | - # Fallback: source language is not indexed and translation for some index languages is missing. | |
| 600 | - # Use original query text on missing index-language fields with a low boost. | |
| 601 | - if not source_in_index_languages and query_text and index_languages: | |
| 602 | - normalized_index_langs: List[str] = [] | |
| 603 | - seen_langs = set() | |
| 604 | - for lang in index_languages: | |
| 605 | - norm_lang = str(lang or "").strip().lower() | |
| 606 | - if not norm_lang or norm_lang in seen_langs: | |
| 607 | - continue | |
| 608 | - seen_langs.add(norm_lang) | |
| 609 | - normalized_index_langs.append(norm_lang) | |
| 643 | + if base_query_text: | |
| 644 | + append_clause(source_lang, base_query_text, "base_query", True) | |
| 610 | 645 | |
| 611 | - for lang in normalized_index_langs: | |
| 612 | - if lang == source_lang: | |
| 613 | - continue | |
| 614 | - if lang in query_text_by_lang: | |
| 615 | - continue | |
| 616 | - fb_specs, _ = self._build_match_field_specs(lang) | |
| 617 | - expanded_fb = self._expand_match_field_specs_for_mixed_script( | |
| 618 | - lang, | |
| 619 | - fb_specs, | |
| 620 | - contains_chinese, | |
| 621 | - contains_english, | |
| 622 | - index_languages, | |
| 623 | - ) | |
| 624 | - match_fields = self._format_match_field_specs(expanded_fb) | |
| 625 | - if not match_fields: | |
| 626 | - continue | |
| 627 | - should_clauses.append({ | |
| 628 | - "multi_match": { | |
| 629 | - "_name": f"fallback_original_query_{lang}", | |
| 630 | - "query": query_text, | |
| 631 | - "fields": match_fields, | |
| 632 | - "minimum_should_match": self.translation_minimum_should_match, | |
| 633 | - "tie_breaker": self.tie_breaker_base_query, | |
| 634 | - "boost": self.original_query_fallback_boost_when_translation_missing, | |
| 635 | - } | |
| 636 | - }) | |
| 646 | + for lang, translated_text in translations.items(): | |
| 647 | + normalized_lang = str(lang or "").strip().lower() | |
| 648 | + normalized_text = str(translated_text or "").strip() | |
| 649 | + if not normalized_lang or not normalized_text: | |
| 650 | + continue | |
| 651 | + if normalized_lang == source_lang and normalized_text == base_query_text: | |
| 652 | + continue | |
| 653 | + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) | |
| 637 | 654 | |
| 638 | 655 | # Fallback to a simple query when language fields cannot be resolved. |
| 639 | 656 | if not should_clauses: |
| 640 | 657 | fallback_fields = self.match_fields or ["title.en^1.0"] |
| 641 | - return { | |
| 658 | + fallback_lexical = { | |
| 642 | 659 | "multi_match": { |
| 643 | 660 | "_name": "base_query_fallback", |
| 644 | 661 | "query": query_text, |
| ... | ... | @@ -647,6 +664,21 @@ class ESQueryBuilder: |
| 647 | 664 | "tie_breaker": self.tie_breaker_base_query, |
| 648 | 665 | } |
| 649 | 666 | } |
| 667 | + fb_should: List[Dict[str, Any]] = [fallback_lexical] | |
| 668 | + self._append_phrase_should_clause( | |
| 669 | + fb_should, | |
| 670 | + self.default_language, | |
| 671 | + query_text, | |
| 672 | + "base_query_fallback" | |
| 673 | + ) | |
| 674 | + if len(fb_should) == 1: | |
| 675 | + return fallback_lexical | |
| 676 | + return { | |
| 677 | + "bool": { | |
| 678 | + "should": fb_should, | |
| 679 | + "minimum_should_match": 1, | |
| 680 | + } | |
| 681 | + } | |
| 650 | 682 | |
| 651 | 683 | # Return bool query with should clauses |
| 652 | 684 | if len(should_clauses) == 1: | ... | ... |
search/rerank_client.py
| ... | ... | @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: |
| 116 | 116 | def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: |
| 117 | 117 | source_score = _extract_named_query_score(matched_queries, "base_query") |
| 118 | 118 | translation_score = 0.0 |
| 119 | - fallback_score = 0.0 | |
| 120 | 119 | |
| 121 | 120 | if isinstance(matched_queries, dict): |
| 122 | 121 | for query_name, score in matched_queries.items(): |
| ... | ... | @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa |
| 125 | 124 | numeric_score = _to_score(score) |
| 126 | 125 | if query_name.startswith("base_query_trans_"): |
| 127 | 126 | translation_score = max(translation_score, numeric_score) |
| 128 | - elif query_name.startswith("fallback_original_query_"): | |
| 129 | - fallback_score = max(fallback_score, numeric_score) | |
| 130 | 127 | elif isinstance(matched_queries, list): |
| 131 | 128 | for query_name in matched_queries: |
| 132 | 129 | if not isinstance(query_name, str): |
| 133 | 130 | continue |
| 134 | 131 | if query_name.startswith("base_query_trans_"): |
| 135 | 132 | translation_score = 1.0 |
| 136 | - elif query_name.startswith("fallback_original_query_"): | |
| 137 | - fallback_score = 1.0 | |
| 138 | 133 | |
| 139 | 134 | weighted_source = source_score |
| 140 | 135 | weighted_translation = 0.8 * translation_score |
| 141 | - weighted_fallback = 0.55 * fallback_score | |
| 142 | - weighted_components = [weighted_source, weighted_translation, weighted_fallback] | |
| 136 | + weighted_components = [weighted_source, weighted_translation] | |
| 143 | 137 | primary_text_score = max(weighted_components) |
| 144 | 138 | support_text_score = sum(weighted_components) - primary_text_score |
| 145 | 139 | text_score = primary_text_score + 0.25 * support_text_score |
| ... | ... | @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa |
| 153 | 147 | return { |
| 154 | 148 | "source_score": source_score, |
| 155 | 149 | "translation_score": translation_score, |
| 156 | - "fallback_score": fallback_score, | |
| 157 | 150 | "weighted_source_score": weighted_source, |
| 158 | 151 | "weighted_translation_score": weighted_translation, |
| 159 | - "weighted_fallback_score": weighted_fallback, | |
| 160 | 152 | "primary_text_score": primary_text_score, |
| 161 | 153 | "support_text_score": support_text_score, |
| 162 | 154 | "text_score": text_score, |
| ... | ... | @@ -219,7 +211,6 @@ def fuse_scores_and_resort( |
| 219 | 211 | hit["_knn_score"] = knn_score |
| 220 | 212 | hit["_text_source_score"] = text_components["source_score"] |
| 221 | 213 | hit["_text_translation_score"] = text_components["translation_score"] |
| 222 | - hit["_text_fallback_score"] = text_components["fallback_score"] | |
| 223 | 214 | hit["_text_primary_score"] = text_components["primary_text_score"] |
| 224 | 215 | hit["_text_support_score"] = text_components["support_text_score"] |
| 225 | 216 | hit["_fused_score"] = fused |
| ... | ... | @@ -231,7 +222,6 @@ def fuse_scores_and_resort( |
| 231 | 222 | "text_score": text_score, |
| 232 | 223 | "text_source_score": text_components["source_score"], |
| 233 | 224 | "text_translation_score": text_components["translation_score"], |
| 234 | - "text_fallback_score": text_components["fallback_score"], | |
| 235 | 225 | "text_primary_score": text_components["primary_text_score"], |
| 236 | 226 | "text_support_score": text_components["support_text_score"], |
| 237 | 227 | "knn_score": knn_score, | ... | ... |
search/searcher.py
| ... | ... | @@ -132,11 +132,6 @@ class Searcher: |
| 132 | 132 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, |
| 133 | 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 134 | 134 | translation_boost=self.config.query_config.translation_boost, |
| 135 | - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, | |
| 136 | - source_boost_when_missing=self.config.query_config.source_boost_when_missing, | |
| 137 | - original_query_fallback_boost_when_translation_missing=( | |
| 138 | - self.config.query_config.original_query_fallback_boost_when_translation_missing | |
| 139 | - ), | |
| 140 | 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, |
| 141 | 136 | ) |
| 142 | 137 | |
| ... | ... | @@ -267,13 +262,6 @@ class Searcher: |
| 267 | 262 | if normalized: |
| 268 | 263 | candidates.append(normalized) |
| 269 | 264 | |
| 270 | - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {} | |
| 271 | - if isinstance(query_text_by_lang, dict): | |
| 272 | - for text in query_text_by_lang.values(): | |
| 273 | - normalized = self._normalize_sku_match_text(text) | |
| 274 | - if normalized: | |
| 275 | - candidates.append(normalized) | |
| 276 | - | |
| 277 | 265 | translations = getattr(parsed_query, "translations", {}) or {} |
| 278 | 266 | if isinstance(translations, dict): |
| 279 | 267 | for text in translations.values(): |
| ... | ... | @@ -516,10 +504,19 @@ class Searcher: |
| 516 | 504 | range_filters: Range filters for numeric fields |
| 517 | 505 | facets: Facet configurations for faceted search |
| 518 | 506 | min_score: Minimum score threshold |
| 519 | - context: Request context for tracking (created if not provided) | |
| 507 | + context: Request context for tracking (required) | |
| 520 | 508 | sort_by: Field name for sorting |
| 521 | 509 | sort_order: Sort order: 'asc' or 'desc' |
| 522 | 510 | debug: Enable debug information output |
| 511 | + language: Response / field selection language hint (e.g. zh, en) | |
| 512 | + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick | |
| 513 | + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides | |
| 514 | + whether the rerank provider is invoked (subject to rerank window). | |
| 515 | + rerank_query_template: Override for rerank query text template; None uses | |
| 516 | + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``). | |
| 517 | + rerank_doc_template: Override for per-hit document text passed to rerank; | |
| 518 | + None uses ``config.rerank.rerank_doc_template``. Placeholders are | |
| 519 | + resolved in ``search/rerank_client.py``. | |
| 523 | 520 | |
| 524 | 521 | Returns: |
| 525 | 522 | SearchResult object with formatted results |
| ... | ... | @@ -592,7 +589,8 @@ class Searcher: |
| 592 | 589 | query, |
| 593 | 590 | tenant_id=tenant_id, |
| 594 | 591 | generate_vector=enable_embedding, |
| 595 | - context=context | |
| 592 | + context=context, | |
| 593 | + target_languages=index_langs if enable_translation else [], | |
| 596 | 594 | ) |
| 597 | 595 | # Store query analysis results in context |
| 598 | 596 | context.store_query_analysis( |
| ... | ... | @@ -602,7 +600,7 @@ class Searcher: |
| 602 | 600 | detected_language=parsed_query.detected_language, |
| 603 | 601 | translations=parsed_query.translations, |
| 604 | 602 | query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, |
| 605 | - domain=parsed_query.domain, | |
| 603 | + domain="default", | |
| 606 | 604 | is_simple_query=True |
| 607 | 605 | ) |
| 608 | 606 | |
| ... | ... | @@ -610,7 +608,6 @@ class Searcher: |
| 610 | 608 | f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " |
| 611 | 609 | f"重写后: '{parsed_query.rewritten_query}' | " |
| 612 | 610 | f"语言: {parsed_query.detected_language} | " |
| 613 | - f"域: {parsed_query.domain} | " | |
| 614 | 611 | f"向量: {'是' if parsed_query.query_vector is not None else '否'}", |
| 615 | 612 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 616 | 613 | ) |
| ... | ... | @@ -643,7 +640,8 @@ class Searcher: |
| 643 | 640 | from_=es_fetch_from, |
| 644 | 641 | enable_knn=enable_embedding and parsed_query.query_vector is not None, |
| 645 | 642 | min_score=min_score, |
| 646 | - parsed_query=parsed_query | |
| 643 | + parsed_query=parsed_query, | |
| 644 | + index_languages=index_langs, | |
| 647 | 645 | ) |
| 648 | 646 | |
| 649 | 647 | # Add facets for faceted search |
| ... | ... | @@ -933,7 +931,6 @@ class Searcher: |
| 933 | 931 | debug_entry["text_score"] = rerank_debug.get("text_score") |
| 934 | 932 | debug_entry["text_source_score"] = rerank_debug.get("text_source_score") |
| 935 | 933 | debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") |
| 936 | - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score") | |
| 937 | 934 | debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") |
| 938 | 935 | debug_entry["text_support_score"] = rerank_debug.get("text_support_score") |
| 939 | 936 | debug_entry["knn_score"] = rerank_debug.get("knn_score") |
| ... | ... | @@ -985,9 +982,6 @@ class Searcher: |
| 985 | 982 | "rewritten_query": context.query_analysis.rewritten_query, |
| 986 | 983 | "detected_language": context.query_analysis.detected_language, |
| 987 | 984 | "translations": context.query_analysis.translations, |
| 988 | - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), | |
| 989 | - "search_langs": context.get_intermediate_result("search_langs", []), | |
| 990 | - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), | |
| 991 | 985 | "has_vector": context.query_analysis.query_vector is not None, |
| 992 | 986 | "is_simple_query": context.query_analysis.is_simple_query, |
| 993 | 987 | "domain": context.query_analysis.domain | ... | ... |
suggestion/builder.py
| ... | ... | @@ -147,7 +147,7 @@ class SuggestionIndexBuilder: |
| 147 | 147 | raw = str(value).strip() |
| 148 | 148 | if not raw: |
| 149 | 149 | return [] |
| 150 | - parts = re.split(r"[,;|/\n\t]+", raw) | |
| 150 | + parts = re.split(r"[,、,;|/\n\t]+", raw) | |
| 151 | 151 | out = [p.strip() for p in parts if p and p.strip()] |
| 152 | 152 | if not out: |
| 153 | 153 | return [raw] |
| ... | ... | @@ -162,7 +162,7 @@ class SuggestionIndexBuilder: |
| 162 | 162 | s = str(raw).strip() |
| 163 | 163 | if not s: |
| 164 | 164 | return [] |
| 165 | - parts = re.split(r"[,;|/\n\t]+", s) | |
| 165 | + parts = re.split(r"[,、,;|/\n\t]+", s) | |
| 166 | 166 | out = [p.strip() for p in parts if p and p.strip()] |
| 167 | 167 | return out if out else [s] |
| 168 | 168 | ... | ... |
tests/test_embedding_pipeline.py
| ... | ... | @@ -73,6 +73,10 @@ class _FakeQueryEncoder: |
| 73 | 73 | return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object) |
| 74 | 74 | |
| 75 | 75 | |
| 76 | +def _tokenizer(text): | |
| 77 | + return str(text).split() | |
| 78 | + | |
| 79 | + | |
| 76 | 80 | class _FakeEmbeddingCache: |
| 77 | 81 | def __init__(self): |
| 78 | 82 | self.store: Dict[str, np.ndarray] = {} |
| ... | ... | @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder(): |
| 210 | 214 | config=_build_test_config(), |
| 211 | 215 | text_encoder=encoder, |
| 212 | 216 | translator=_FakeTranslator(), |
| 217 | + tokenizer=_tokenizer, | |
| 213 | 218 | ) |
| 214 | 219 | |
| 215 | 220 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=True) |
| ... | ... | @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled(): |
| 224 | 229 | config=_build_test_config(), |
| 225 | 230 | text_encoder=_FakeQueryEncoder(), |
| 226 | 231 | translator=_FakeTranslator(), |
| 232 | + tokenizer=_tokenizer, | |
| 227 | 233 | ) |
| 228 | 234 | |
| 229 | 235 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) | ... | ... |
tests/test_es_query_builder.py
| 1 | 1 | from types import SimpleNamespace |
| 2 | +from typing import Any, Dict | |
| 2 | 3 | |
| 3 | 4 | import numpy as np |
| 4 | 5 | |
| ... | ... | @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder: |
| 13 | 14 | ) |
| 14 | 15 | |
| 15 | 16 | |
| 17 | +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: | |
| 18 | + """Fields from the non-phrase multi_match (bool.should or single clause).""" | |
| 19 | + if "multi_match" in query_root: | |
| 20 | + mm = query_root["multi_match"] | |
| 21 | + if mm.get("type") == "phrase": | |
| 22 | + raise AssertionError("root multi_match is phrase-only") | |
| 23 | + return mm["fields"] | |
| 24 | + for clause in query_root.get("bool", {}).get("should", []): | |
| 25 | + mm = clause.get("multi_match") or {} | |
| 26 | + if mm.get("type") == "phrase": | |
| 27 | + continue | |
| 28 | + return mm["fields"] | |
| 29 | + raise AssertionError("no lexical multi_match in query_root") | |
| 30 | + | |
| 31 | + | |
| 16 | 32 | def test_knn_prefilter_includes_range_filters(): |
| 17 | 33 | qb = _builder() |
| 18 | 34 | q = qb.build_query( |
| ... | ... | @@ -65,21 +81,49 @@ def test_knn_prefilter_not_added_without_filters(): |
| 65 | 81 | assert q["knn"]["_name"] == "knn_query" |
| 66 | 82 | |
| 67 | 83 | |
| 68 | -def test_text_query_contains_only_base_translation_and_fallback_named_queries(): | |
| 84 | +def test_text_query_contains_only_base_and_translation_named_queries(): | |
| 69 | 85 | qb = _builder() |
| 70 | 86 | parsed_query = SimpleNamespace( |
| 71 | - query_text_by_lang={"en": "dress", "zh": "连衣裙"}, | |
| 72 | - search_langs=["en", "zh"], | |
| 87 | + rewritten_query="dress", | |
| 73 | 88 | detected_language="en", |
| 74 | - source_in_index_languages=False, | |
| 75 | - index_languages=["en", "zh", "fr"], | |
| 89 | + translations={"en": "dress", "zh": "连衣裙"}, | |
| 76 | 90 | ) |
| 77 | 91 | |
| 78 | - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False) | |
| 92 | + q = qb.build_query( | |
| 93 | + query_text="dress", | |
| 94 | + parsed_query=parsed_query, | |
| 95 | + enable_knn=False, | |
| 96 | + index_languages=["en", "zh", "fr"], | |
| 97 | + ) | |
| 79 | 98 | should = q["query"]["bool"]["should"] |
| 80 | 99 | names = [clause["multi_match"]["_name"] for clause in should] |
| 81 | 100 | |
| 82 | - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] | |
| 101 | + assert names == [ | |
| 102 | + "base_query", | |
| 103 | + "base_query_phrase", | |
| 104 | + "base_query_trans_zh", | |
| 105 | + "base_query_trans_zh_phrase", | |
| 106 | + ] | |
| 107 | + | |
| 108 | + | |
| 109 | +def test_text_query_skips_duplicate_translation_same_as_base(): | |
| 110 | + qb = _builder() | |
| 111 | + parsed_query = SimpleNamespace( | |
| 112 | + rewritten_query="dress", | |
| 113 | + detected_language="en", | |
| 114 | + translations={"en": "dress"}, | |
| 115 | + ) | |
| 116 | + | |
| 117 | + q = qb.build_query( | |
| 118 | + query_text="dress", | |
| 119 | + parsed_query=parsed_query, | |
| 120 | + enable_knn=False, | |
| 121 | + index_languages=["en", "zh"], | |
| 122 | + ) | |
| 123 | + | |
| 124 | + root = q["query"] | |
| 125 | + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" | |
| 126 | + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" | |
| 83 | 127 | |
| 84 | 128 | |
| 85 | 129 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| ... | ... | @@ -91,22 +135,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): |
| 91 | 135 | default_language="en", |
| 92 | 136 | ) |
| 93 | 137 | parsed_query = SimpleNamespace( |
| 94 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 95 | - search_langs=["zh"], | |
| 138 | + rewritten_query="法式 dress", | |
| 96 | 139 | detected_language="zh", |
| 97 | - source_in_index_languages=True, | |
| 98 | - index_languages=["zh", "en"], | |
| 140 | + translations={}, | |
| 99 | 141 | contains_chinese=True, |
| 100 | 142 | contains_english=True, |
| 101 | 143 | ) |
| 102 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 103 | - fields = q["query"]["multi_match"]["fields"] | |
| 144 | + q = qb.build_query( | |
| 145 | + query_text="法式 dress", | |
| 146 | + parsed_query=parsed_query, | |
| 147 | + enable_knn=False, | |
| 148 | + index_languages=["zh", "en"], | |
| 149 | + ) | |
| 150 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 104 | 151 | bases = {f.split("^", 1)[0] for f in fields} |
| 105 | 152 | assert "title.zh" in bases and "title.en" in bases |
| 106 | 153 | assert "brief.zh" in bases and "brief.en" in bases |
| 107 | - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8) | |
| 108 | - assert "title.en^0.8" in fields | |
| 109 | - assert "brief.en^0.8" in fields | |
| 154 | + # Merged supplemental language fields use boost * 0.6 by default. | |
| 155 | + assert "title.en^0.6" in fields | |
| 156 | + assert "brief.en^0.6" in fields | |
| 110 | 157 | |
| 111 | 158 | |
| 112 | 159 | def test_mixed_script_merges_zh_fields_into_en_clause(): |
| ... | ... | @@ -118,19 +165,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): |
| 118 | 165 | default_language="en", |
| 119 | 166 | ) |
| 120 | 167 | parsed_query = SimpleNamespace( |
| 121 | - query_text_by_lang={"en": "red 连衣裙"}, | |
| 122 | - search_langs=["en"], | |
| 168 | + rewritten_query="red 连衣裙", | |
| 123 | 169 | detected_language="en", |
| 124 | - source_in_index_languages=True, | |
| 125 | - index_languages=["zh", "en"], | |
| 170 | + translations={}, | |
| 126 | 171 | contains_chinese=True, |
| 127 | 172 | contains_english=True, |
| 128 | 173 | ) |
| 129 | - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) | |
| 130 | - fields = q["query"]["multi_match"]["fields"] | |
| 174 | + q = qb.build_query( | |
| 175 | + query_text="red 连衣裙", | |
| 176 | + parsed_query=parsed_query, | |
| 177 | + enable_knn=False, | |
| 178 | + index_languages=["zh", "en"], | |
| 179 | + ) | |
| 180 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 131 | 181 | bases = {f.split("^", 1)[0] for f in fields} |
| 132 | 182 | assert "title.en" in bases and "title.zh" in bases |
| 133 | - assert "title.zh^0.8" in fields | |
| 183 | + assert "title.zh^0.6" in fields | |
| 134 | 184 | |
| 135 | 185 | |
| 136 | 186 | def test_mixed_script_merged_fields_scale_configured_boosts(): |
| ... | ... | @@ -143,18 +193,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): |
| 143 | 193 | default_language="en", |
| 144 | 194 | ) |
| 145 | 195 | parsed_query = SimpleNamespace( |
| 146 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 147 | - search_langs=["zh"], | |
| 196 | + rewritten_query="法式 dress", | |
| 148 | 197 | detected_language="zh", |
| 149 | - source_in_index_languages=True, | |
| 150 | - index_languages=["zh", "en"], | |
| 198 | + translations={}, | |
| 151 | 199 | contains_chinese=True, |
| 152 | 200 | contains_english=True, |
| 153 | 201 | ) |
| 154 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 155 | - fields = q["query"]["multi_match"]["fields"] | |
| 202 | + q = qb.build_query( | |
| 203 | + query_text="法式 dress", | |
| 204 | + parsed_query=parsed_query, | |
| 205 | + enable_knn=False, | |
| 206 | + index_languages=["zh", "en"], | |
| 207 | + ) | |
| 208 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 156 | 209 | assert "title.zh^5.0" in fields |
| 157 | - assert "title.en^8.0" in fields # 10.0 * 0.8 | |
| 210 | + assert "title.en^6.0" in fields # 10.0 * 0.6 | |
| 158 | 211 | |
| 159 | 212 | |
| 160 | 213 | def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| ... | ... | @@ -166,16 +219,19 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| 166 | 219 | default_language="zh", |
| 167 | 220 | ) |
| 168 | 221 | parsed_query = SimpleNamespace( |
| 169 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 170 | - search_langs=["zh"], | |
| 222 | + rewritten_query="法式 dress", | |
| 171 | 223 | detected_language="zh", |
| 172 | - source_in_index_languages=True, | |
| 173 | - index_languages=["zh"], | |
| 224 | + translations={}, | |
| 174 | 225 | contains_chinese=True, |
| 175 | 226 | contains_english=True, |
| 176 | 227 | ) |
| 177 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 178 | - fields = q["query"]["multi_match"]["fields"] | |
| 228 | + q = qb.build_query( | |
| 229 | + query_text="法式 dress", | |
| 230 | + parsed_query=parsed_query, | |
| 231 | + enable_knn=False, | |
| 232 | + index_languages=["zh"], | |
| 233 | + ) | |
| 234 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 179 | 235 | bases = {f.split("^", 1)[0] for f in fields} |
| 180 | 236 | assert "title.zh" in bases |
| 181 | 237 | assert "title.en" not in bases | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
0 → 100644
| ... | ... | @@ -0,0 +1,453 @@ |
| 1 | +""" | |
| 2 | +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. | |
| 3 | + | |
| 4 | +Covers combinations of query language vs tenant index_languages, translations, | |
| 5 | +and mixed Chinese/English queries. Asserts multi_match _name, query text, and | |
| 6 | +target language fields (title.{lang}). | |
| 7 | +""" | |
| 8 | + | |
| 9 | +from types import SimpleNamespace | |
| 10 | +from typing import Any, Dict, List | |
| 11 | + | |
| 12 | +import numpy as np | |
| 13 | + | |
| 14 | +from search.es_query_builder import ESQueryBuilder | |
| 15 | + | |
| 16 | + | |
| 17 | +def _builder_multilingual_title_only( | |
| 18 | + *, | |
| 19 | + default_language: str = "en", | |
| 20 | + mixed_script_scale: float = 0.6, | |
| 21 | +) -> ESQueryBuilder: | |
| 22 | + """Minimal builder: only title.{lang} for easy field assertions.""" | |
| 23 | + return ESQueryBuilder( | |
| 24 | + match_fields=["title.en^1.0"], | |
| 25 | + multilingual_fields=["title"], | |
| 26 | + shared_fields=[], | |
| 27 | + text_embedding_field="title_embedding", | |
| 28 | + default_language=default_language, | |
| 29 | + mixed_script_merged_field_boost_scale=mixed_script_scale, | |
| 30 | + function_score_config=None, | |
| 31 | + ) | |
| 32 | + | |
| 33 | + | |
| 34 | +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: | |
| 35 | + """Navigate bool.must / function_score wrappers to the text recall root.""" | |
| 36 | + q = es_body.get("query") or {} | |
| 37 | + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: | |
| 38 | + q = q["bool"]["must"][0] | |
| 39 | + if "function_score" in q: | |
| 40 | + q = q["function_score"]["query"] | |
| 41 | + return q | |
| 42 | + | |
| 43 | + | |
| 44 | +def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: | |
| 45 | + inner = _unwrap_inner_query(es_body) | |
| 46 | + if "multi_match" in inner: | |
| 47 | + return [inner["multi_match"]] | |
| 48 | + should = (inner.get("bool") or {}).get("should") or [] | |
| 49 | + return [c["multi_match"] for c in should if "multi_match" in c] | |
| 50 | + | |
| 51 | + | |
| 52 | +def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | |
| 53 | + """Map _name -> multi_match dict.""" | |
| 54 | + out: Dict[str, Dict[str, Any]] = {} | |
| 55 | + for mm in _extract_multi_match_clauses(es_body): | |
| 56 | + name = mm.get("_name") | |
| 57 | + if name: | |
| 58 | + out[str(name)] = mm | |
| 59 | + return out | |
| 60 | + | |
| 61 | + | |
| 62 | +def _with_phrase(lexical_names: set[str]) -> set[str]: | |
| 63 | + """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" | |
| 64 | + return lexical_names | {f"{n}_phrase" for n in lexical_names} | |
| 65 | + | |
| 66 | + | |
| 67 | +def _title_fields(mm: Dict[str, Any]) -> List[str]: | |
| 68 | + fields = mm.get("fields") or [] | |
| 69 | + return [f for f in fields if str(f).startswith("title.")] | |
| 70 | + | |
| 71 | + | |
| 72 | +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: | |
| 73 | + """True if any field is title.{lang} with optional ^boost suffix.""" | |
| 74 | + prefix = f"title.{lang}" | |
| 75 | + for f in mm.get("fields") or []: | |
| 76 | + s = str(f) | |
| 77 | + if s == prefix or s.startswith(prefix + "^"): | |
| 78 | + return True | |
| 79 | + return False | |
| 80 | + | |
| 81 | + | |
| 82 | +def _build( | |
| 83 | + qb: ESQueryBuilder, | |
| 84 | + *, | |
| 85 | + query_text: str, | |
| 86 | + rewritten: str, | |
| 87 | + detected_language: str, | |
| 88 | + translations: Dict[str, str], | |
| 89 | + index_languages: List[str], | |
| 90 | + contains_chinese: bool = False, | |
| 91 | + contains_english: bool = False, | |
| 92 | +) -> Dict[str, Any]: | |
| 93 | + parsed = SimpleNamespace( | |
| 94 | + rewritten_query=rewritten, | |
| 95 | + detected_language=detected_language, | |
| 96 | + translations=dict(translations), | |
| 97 | + contains_chinese=contains_chinese, | |
| 98 | + contains_english=contains_english, | |
| 99 | + ) | |
| 100 | + return qb.build_query( | |
| 101 | + query_text=query_text, | |
| 102 | + parsed_query=parsed, | |
| 103 | + enable_knn=False, | |
| 104 | + index_languages=index_languages, | |
| 105 | + ) | |
| 106 | + | |
| 107 | + | |
| 108 | +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 --- | |
| 109 | + | |
| 110 | + | |
| 111 | +def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | |
| 112 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 113 | + q = _build( | |
| 114 | + qb, | |
| 115 | + query_text="连衣裙", | |
| 116 | + rewritten="连衣裙", | |
| 117 | + detected_language="zh", | |
| 118 | + translations={"en": "dress"}, | |
| 119 | + index_languages=["zh", "en"], | |
| 120 | + ) | |
| 121 | + idx = _clauses_index(q) | |
| 122 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 123 | + assert idx["base_query"]["query"] == "连衣裙" | |
| 124 | + assert "title.zh" in _title_fields(idx["base_query"]) | |
| 125 | + assert idx["base_query_trans_en"]["query"] == "dress" | |
| 126 | + assert "title.en" in _title_fields(idx["base_query_trans_en"]) | |
| 127 | + | |
| 128 | + | |
| 129 | +def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | |
| 130 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 131 | + q = _build( | |
| 132 | + qb, | |
| 133 | + query_text="dress", | |
| 134 | + rewritten="dress", | |
| 135 | + detected_language="en", | |
| 136 | + translations={"zh": "连衣裙"}, | |
| 137 | + index_languages=["en", "zh"], | |
| 138 | + ) | |
| 139 | + idx = _clauses_index(q) | |
| 140 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 141 | + assert idx["base_query"]["query"] == "dress" | |
| 142 | + assert "title.en" in _title_fields(idx["base_query"]) | |
| 143 | + assert idx["base_query_trans_zh"]["query"] == "连衣裙" | |
| 144 | + assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) | |
| 145 | + | |
| 146 | + | |
| 147 | +def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | |
| 148 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 149 | + q = _build( | |
| 150 | + qb, | |
| 151 | + query_text="kleid", | |
| 152 | + rewritten="kleid", | |
| 153 | + detected_language="de", | |
| 154 | + translations={"en": "dress", "fr": "robe"}, | |
| 155 | + index_languages=["de", "en", "fr"], | |
| 156 | + ) | |
| 157 | + idx = _clauses_index(q) | |
| 158 | + assert set(idx) == _with_phrase( | |
| 159 | + {"base_query", "base_query_trans_en", "base_query_trans_fr"} | |
| 160 | + ) | |
| 161 | + assert idx["base_query"]["query"] == "kleid" | |
| 162 | + assert "title.de" in _title_fields(idx["base_query"]) | |
| 163 | + assert idx["base_query_trans_en"]["query"] == "dress" | |
| 164 | + assert idx["base_query_trans_fr"]["query"] == "robe" | |
| 165 | + | |
| 166 | + | |
| 167 | +# --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- | |
| 168 | + | |
| 169 | + | |
| 170 | +def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | |
| 171 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 172 | + q = _build( | |
| 173 | + qb, | |
| 174 | + query_text="schuh", | |
| 175 | + rewritten="schuh", | |
| 176 | + detected_language="de", | |
| 177 | + translations={"en": "shoe", "zh": "鞋"}, | |
| 178 | + index_languages=["en", "zh"], | |
| 179 | + ) | |
| 180 | + idx = _clauses_index(q) | |
| 181 | + assert set(idx) == _with_phrase( | |
| 182 | + {"base_query", "base_query_trans_en", "base_query_trans_zh"} | |
| 183 | + ) | |
| 184 | + assert idx["base_query"]["query"] == "schuh" | |
| 185 | + assert "title.de" in _title_fields(idx["base_query"]) | |
| 186 | + assert "boost" not in idx["base_query"] | |
| 187 | + assert idx["base_query_trans_en"]["query"] == "shoe" | |
| 188 | + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost | |
| 189 | + assert idx["base_query_trans_zh"]["query"] == "鞋" | |
| 190 | + assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost | |
| 191 | + | |
| 192 | + | |
| 193 | +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- | |
| 194 | + | |
| 195 | + | |
| 196 | +def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | |
| 197 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 198 | + q = _build( | |
| 199 | + qb, | |
| 200 | + query_text="红色 dress", | |
| 201 | + rewritten="红色 dress", | |
| 202 | + detected_language="zh", | |
| 203 | + translations={"en": "red dress"}, | |
| 204 | + index_languages=["zh", "en"], | |
| 205 | + contains_chinese=True, | |
| 206 | + contains_english=True, | |
| 207 | + ) | |
| 208 | + idx = _clauses_index(q) | |
| 209 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 210 | + assert idx["base_query"]["query"] == "红色 dress" | |
| 211 | + assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") | |
| 212 | + assert idx["base_query_trans_en"]["query"] == "red dress" | |
| 213 | + assert _has_title_lang(idx["base_query_trans_en"], "en") | |
| 214 | + | |
| 215 | + | |
| 216 | +def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | |
| 217 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 218 | + q = _build( | |
| 219 | + qb, | |
| 220 | + query_text="nike 运动鞋", | |
| 221 | + rewritten="nike 运动鞋", | |
| 222 | + detected_language="en", | |
| 223 | + translations={"zh": "耐克运动鞋"}, | |
| 224 | + index_languages=["zh", "en"], | |
| 225 | + contains_chinese=True, | |
| 226 | + contains_english=True, | |
| 227 | + ) | |
| 228 | + idx = _clauses_index(q) | |
| 229 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 230 | + assert idx["base_query"]["query"] == "nike 运动鞋" | |
| 231 | + assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") | |
| 232 | + assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" | |
| 233 | + | |
| 234 | + | |
| 235 | +def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | |
| 236 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 237 | + q = _build( | |
| 238 | + qb, | |
| 239 | + query_text="法式 dress", | |
| 240 | + rewritten="法式 dress", | |
| 241 | + detected_language="zh", | |
| 242 | + translations={}, | |
| 243 | + index_languages=["zh"], | |
| 244 | + contains_chinese=True, | |
| 245 | + contains_english=True, | |
| 246 | + ) | |
| 247 | + idx = _clauses_index(q) | |
| 248 | + assert set(idx) == _with_phrase({"base_query"}) | |
| 249 | + bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} | |
| 250 | + assert bases == {"title.zh"} | |
| 251 | + | |
| 252 | + | |
| 253 | +# --- 去重:与 base 同语言同文本的翻译项跳过 --- | |
| 254 | + | |
| 255 | + | |
| 256 | +def test_skips_translation_when_same_lang_and_same_text_as_base(): | |
| 257 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 258 | + q = _build( | |
| 259 | + qb, | |
| 260 | + query_text="NIKE", | |
| 261 | + rewritten="NIKE", | |
| 262 | + detected_language="en", | |
| 263 | + translations={"en": "NIKE", "zh": "耐克"}, | |
| 264 | + index_languages=["en", "zh"], | |
| 265 | + ) | |
| 266 | + idx = _clauses_index(q) | |
| 267 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 268 | + | |
| 269 | + | |
| 270 | +def test_keeps_translation_when_same_text_but_different_lang_than_base(): | |
| 271 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 272 | + q = _build( | |
| 273 | + qb, | |
| 274 | + query_text="NIKE", | |
| 275 | + rewritten="NIKE", | |
| 276 | + detected_language="en", | |
| 277 | + translations={"zh": "NIKE"}, | |
| 278 | + index_languages=["en", "zh"], | |
| 279 | + ) | |
| 280 | + idx = _clauses_index(q) | |
| 281 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 282 | + assert idx["base_query_trans_zh"]["query"] == "NIKE" | |
| 283 | + | |
| 284 | + | |
| 285 | +# --- 翻译 key 规范化、空翻译跳过 --- | |
| 286 | + | |
| 287 | + | |
| 288 | +def test_translation_language_key_is_normalized_case_insensitive(): | |
| 289 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 290 | + q = _build( | |
| 291 | + qb, | |
| 292 | + query_text="dress", | |
| 293 | + rewritten="dress", | |
| 294 | + detected_language="en", | |
| 295 | + translations={"ZH": "连衣裙"}, | |
| 296 | + index_languages=["en", "zh"], | |
| 297 | + ) | |
| 298 | + idx = _clauses_index(q) | |
| 299 | + assert "base_query_trans_zh" in idx | |
| 300 | + assert idx["base_query_trans_zh"]["query"] == "连衣裙" | |
| 301 | + | |
| 302 | + | |
| 303 | +def test_empty_translation_value_is_skipped(): | |
| 304 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 305 | + q = _build( | |
| 306 | + qb, | |
| 307 | + query_text="dress", | |
| 308 | + rewritten="dress", | |
| 309 | + detected_language="en", | |
| 310 | + translations={"zh": " ", "fr": "robe"}, | |
| 311 | + index_languages=["en", "zh", "fr"], | |
| 312 | + ) | |
| 313 | + idx = _clauses_index(q) | |
| 314 | + assert "base_query_trans_zh" not in idx | |
| 315 | + assert "base_query_trans_fr" in idx | |
| 316 | + | |
| 317 | + | |
| 318 | +# --- index_languages 为空:视为「未约束」,即 source_in_index 为 True --- | 
| 319 | + | |
| 320 | + | |
| 321 | +def test_empty_index_languages_treats_source_as_in_index_boosts(): | |
| 322 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 323 | + q = _build( | |
| 324 | + qb, | |
| 325 | + query_text="x", | |
| 326 | + rewritten="x", | |
| 327 | + detected_language="de", | |
| 328 | + translations={"en": "y"}, | |
| 329 | + index_languages=[], | |
| 330 | + ) | |
| 331 | + idx = _clauses_index(q) | |
| 332 | + assert "boost" not in idx["base_query"] | |
| 333 | + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost | |
| 334 | + assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost | |
| 335 | + assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost | |
| 336 | + | |
| 337 | + | |
| 338 | +# --- 无翻译:仅 base_query --- | |
| 339 | + | |
| 340 | + | |
| 341 | +def test_no_translations_only_base_query(): | |
| 342 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 343 | + q = _build( | |
| 344 | + qb, | |
| 345 | + query_text="hello", | |
| 346 | + rewritten="hello", | |
| 347 | + detected_language="en", | |
| 348 | + translations={}, | |
| 349 | + index_languages=["en", "zh"], | |
| 350 | + ) | |
| 351 | + idx = _clauses_index(q) | |
| 352 | + assert set(idx) == _with_phrase({"base_query"}) | |
| 353 | + | |
| 354 | + | |
| 355 | +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- | |
| 356 | + | |
| 357 | + | |
| 358 | +def test_text_clauses_present_alongside_knn(): | |
| 359 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 360 | + parsed = SimpleNamespace( | |
| 361 | + rewritten_query="dress", | |
| 362 | + detected_language="en", | |
| 363 | + translations={"zh": "连衣裙"}, | |
| 364 | + contains_chinese=False, | |
| 365 | + contains_english=True, | |
| 366 | + ) | |
| 367 | + q = qb.build_query( | |
| 368 | + query_text="dress", | |
| 369 | + query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), | |
| 370 | + parsed_query=parsed, | |
| 371 | + enable_knn=True, | |
| 372 | + index_languages=["en", "zh"], | |
| 373 | + ) | |
| 374 | + assert "knn" in q | |
| 375 | + idx = _clauses_index(q) | |
| 376 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 377 | + | |
| 378 | + | |
| 379 | +def test_detected_language_unknown_falls_back_to_default_language(): | |
| 380 | + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" | |
| 381 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 382 | + parsed = SimpleNamespace( | |
| 383 | + rewritten_query="shirt", | |
| 384 | + detected_language="unknown", | |
| 385 | + translations={"zh": "衬衫"}, | |
| 386 | + contains_chinese=False, | |
| 387 | + contains_english=True, | |
| 388 | + ) | |
| 389 | + q = qb.build_query( | |
| 390 | + query_text="shirt", | |
| 391 | + parsed_query=parsed, | |
| 392 | + enable_knn=False, | |
| 393 | + index_languages=["en", "zh"], | |
| 394 | + ) | |
| 395 | + idx = _clauses_index(q) | |
| 396 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 397 | + assert idx["base_query"]["query"] == "shirt" | |
| 398 | + assert _has_title_lang(idx["base_query"], "en") | |
| 399 | + | |
| 400 | + | |
| 401 | +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | |
| 402 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 403 | + q = _build( | |
| 404 | + qb, | |
| 405 | + query_text="платье", | |
| 406 | + rewritten="платье", | |
| 407 | + detected_language="ru", | |
| 408 | + translations={"en": "dress"}, | |
| 409 | + index_languages=["ru", "en"], | |
| 410 | + ) | |
| 411 | + idx = _clauses_index(q) | |
| 412 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 413 | + assert idx["base_query"]["query"] == "платье" | |
| 414 | + assert _has_title_lang(idx["base_query"], "ru") | |
| 415 | + assert idx["base_query_trans_en"]["query"] == "dress" | |
| 416 | + | |
| 417 | + | |
| 418 | +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | |
| 419 | + """ | |
| 420 | + 当前实现:凡是 translations 里非空的条目都会生成子句; | |
| 421 | + index_languages 只约束混写扩列,不用于过滤翻译子句。 | |
| 422 | + """ | |
| 423 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 424 | + q = _build( | |
| 425 | + qb, | |
| 426 | + query_text="dress", | |
| 427 | + rewritten="dress", | |
| 428 | + detected_language="en", | |
| 429 | + translations={"zh": "连衣裙", "de": "Kleid"}, | |
| 430 | + index_languages=["en", "zh"], | |
| 431 | + ) | |
| 432 | + idx = _clauses_index(q) | |
| 433 | + assert "base_query_trans_de" in idx | |
| 434 | + assert idx["base_query_trans_de"]["query"] == "Kleid" | |
| 435 | + assert _has_title_lang(idx["base_query_trans_de"], "de") | |
| 436 | + | |
| 437 | + | |
| 438 | +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): | |
| 439 | + """base_query 始终用 rewritten_query,而非仅 query_text。""" | |
| 440 | + qb = _builder_multilingual_title_only(default_language="en") | |
| 441 | + q = _build( | |
| 442 | + qb, | |
| 443 | + query_text=" 红色 ", | |
| 444 | + rewritten="红色连衣裙", | |
| 445 | + detected_language="zh", | |
| 446 | + translations={"en": "red dress"}, | |
| 447 | + index_languages=["zh", "en"], | |
| 448 | + contains_chinese=True, | |
| 449 | + contains_english=False, | |
| 450 | + ) | |
| 451 | + idx = _clauses_index(q) | |
| 452 | + assert idx["base_query"]["query"] == "红色连衣裙" | |
| 453 | + assert idx["base_query_trans_en"]["query"] == "red dress" | ... | ... |
tests/test_query_parser_mixed_language.py
| 1 | -from types import SimpleNamespace | |
| 2 | - | |
| 3 | 1 | from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig |
| 4 | 2 | from query.query_parser import QueryParser |
| 5 | 3 | |
| ... | ... | @@ -9,6 +7,10 @@ class _DummyTranslator: |
| 9 | 7 | return f"{text}-{target_lang}" |
| 10 | 8 | |
| 11 | 9 | |
| 10 | +def _tokenizer(text): | |
| 11 | + return str(text).split() | |
| 12 | + | |
| 13 | + | |
| 12 | 14 | def test_pure_english_word_token_length_and_script(): |
| 13 | 15 | assert QueryParser._is_pure_english_word_token("ab") is False |
| 14 | 16 | assert QueryParser._is_pure_english_word_token("abc") is True |
| ... | ... | @@ -35,59 +37,57 @@ def _build_config() -> SearchConfig: |
| 35 | 37 | |
| 36 | 38 | |
| 37 | 39 | def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): |
| 38 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 40 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 39 | 41 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") |
| 40 | - monkeypatch.setattr( | |
| 41 | - "query.query_parser.get_tenant_config_loader", | |
| 42 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), | |
| 43 | - raising=False, | |
| 44 | - ) | |
| 45 | 42 | |
| 46 | - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) | |
| 43 | + result = parser.parse( | |
| 44 | + "法式 dress 连衣裙", | |
| 45 | + tenant_id="162", | |
| 46 | + generate_vector=False, | |
| 47 | + target_languages=["zh", "en"], | |
| 48 | + ) | |
| 47 | 49 | |
| 48 | 50 | assert result.detected_language == "zh" |
| 49 | 51 | assert result.contains_chinese is True |
| 50 | 52 | assert result.contains_english is True |
| 51 | - assert "en" in result.search_langs | |
| 52 | - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) | |
| 53 | - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" | |
| 54 | - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" | |
| 53 | + assert result.translations == {"en": "法式 dress 连衣裙-en"} | |
| 54 | + assert result.query_tokens == ["法式", "dress", "连衣裙"] | |
| 55 | + assert not hasattr(result, "query_text_by_lang") | |
| 56 | + assert not hasattr(result, "search_langs") | |
| 55 | 57 | |
| 56 | 58 | |
| 57 | 59 | def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): |
| 58 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 60 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 59 | 61 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 60 | - monkeypatch.setattr( | |
| 61 | - "query.query_parser.get_tenant_config_loader", | |
| 62 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | |
| 63 | - raising=False, | |
| 64 | - ) | |
| 65 | 62 | |
| 66 | - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) | |
| 63 | + result = parser.parse( | |
| 64 | + "red 连衣裙", | |
| 65 | + tenant_id="0", | |
| 66 | + generate_vector=False, | |
| 67 | + target_languages=["en", "zh"], | |
| 68 | + ) | |
| 67 | 69 | |
| 68 | 70 | assert result.detected_language == "en" |
| 69 | 71 | assert result.contains_chinese is True |
| 70 | 72 | assert result.contains_english is True |
| 71 | - assert "zh" in result.search_langs | |
| 72 | - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" | |
| 73 | - assert result.query_text_by_lang["en"] == "red 连衣裙" | |
| 73 | + assert result.translations == {"zh": "red 连衣裙-zh"} | |
| 74 | + assert result.query_tokens == ["red", "连衣裙"] | |
| 74 | 75 | |
| 75 | 76 | |
| 76 | 77 | def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch): |
| 77 | 78 | """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。""" |
| 78 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 79 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 79 | 80 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 80 | - monkeypatch.setattr( | |
| 81 | - "query.query_parser.get_tenant_config_loader", | |
| 82 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | |
| 83 | - raising=False, | |
| 84 | - ) | |
| 85 | 81 | |
| 86 | - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) | |
| 82 | + result = parser.parse( | |
| 83 | + "off shoulder top", | |
| 84 | + tenant_id="0", | |
| 85 | + generate_vector=False, | |
| 86 | + target_languages=["en", "zh"], | |
| 87 | + ) | |
| 87 | 88 | |
| 88 | 89 | assert result.detected_language == "en" |
| 89 | 90 | assert result.contains_chinese is False |
| 90 | 91 | assert result.contains_english is True |
| 91 | 92 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 92 | - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" | |
| 93 | - assert result.source_in_index_languages is True | |
| 93 | + assert not hasattr(result, "source_in_index_languages") | ... | ... |
tests/test_rerank_client.py
| ... | ... | @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 11 | 11 | "matched_queries": { |
| 12 | 12 | "base_query": 2.4, |
| 13 | 13 | "base_query_trans_zh": 1.8, |
| 14 | - "fallback_original_query_zh": 1.2, | |
| 15 | 14 | "knn_query": 0.8, |
| 16 | 15 | }, |
| 17 | 16 | }, |
| ... | ... | @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 27 | 26 | |
| 28 | 27 | debug = fuse_scores_and_resort(hits, [0.9, 0.7]) |
| 29 | 28 | |
| 30 | - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) | |
| 29 | + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8) | |
| 31 | 30 | expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) |
| 32 | 31 | expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) |
| 33 | 32 | |
| ... | ... | @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 38 | 37 | assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) |
| 39 | 38 | assert debug[0]["text_source_score"] == 2.4 |
| 40 | 39 | assert debug[0]["text_translation_score"] == 1.8 |
| 41 | - assert debug[0]["text_fallback_score"] == 1.2 | |
| 42 | 40 | assert debug[0]["knn_score"] == 0.8 |
| 43 | 41 | assert [hit["_id"] for hit in hits] == ["2", "1"] |
| 44 | 42 | ... | ... |
tests/test_search_rerank_window.py
| ... | ... | @@ -43,7 +43,14 @@ class _FakeParsedQuery: |
| 43 | 43 | |
| 44 | 44 | |
| 45 | 45 | class _FakeQueryParser: |
| 46 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 46 | + def parse( | |
| 47 | + self, | |
| 48 | + query: str, | |
| 49 | + tenant_id: str, | |
| 50 | + generate_vector: bool, | |
| 51 | + context: Any, | |
| 52 | + target_languages: Any = None, | |
| 53 | + ): | |
| 47 | 54 | return _FakeParsedQuery( |
| 48 | 55 | original_query=query, |
| 49 | 56 | query_normalized=query, |
| ... | ... | @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): |
| 191 | 198 | "field_boosts": {"title.en": 3.0}, |
| 192 | 199 | "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], |
| 193 | 200 | "query_config": {"supported_languages": ["en"], "default_language": "en"}, |
| 201 | + "services": { | |
| 202 | + "translation": { | |
| 203 | + "service_url": "http://localhost:6005", | |
| 204 | + "timeout_sec": 3.0, | |
| 205 | + "default_model": "dummy-model", | |
| 206 | + "default_scene": "general", | |
| 207 | + "cache": { | |
| 208 | + "ttl_seconds": 60, | |
| 209 | + "sliding_expiration": True, | |
| 210 | + }, | |
| 211 | + "capabilities": { | |
| 212 | + "dummy-model": { | |
| 213 | + "enabled": True, | |
| 214 | + "backend": "llm", | |
| 215 | + "use_cache": True, | |
| 216 | + "model": "dummy-model", | |
| 217 | + "base_url": "http://localhost:6005/v1", | |
| 218 | + "timeout_sec": 3.0, | |
| 219 | + } | |
| 220 | + }, | |
| 221 | + }, | |
| 222 | + "embedding": { | |
| 223 | + "provider": "http", | |
| 224 | + "providers": { | |
| 225 | + "http": { | |
| 226 | + "text_base_url": "http://localhost:6005", | |
| 227 | + "image_base_url": "http://localhost:6008", | |
| 228 | + } | |
| 229 | + }, | |
| 230 | + "backend": "tei", | |
| 231 | + "backends": { | |
| 232 | + "tei": { | |
| 233 | + "base_url": "http://localhost:8080", | |
| 234 | + "timeout_sec": 3.0, | |
| 235 | + "model_id": "dummy-embedding-model", | |
| 236 | + } | |
| 237 | + }, | |
| 238 | + }, | |
| 239 | + "rerank": { | |
| 240 | + "provider": "http", | |
| 241 | + "providers": { | |
| 242 | + "http": { | |
| 243 | + "base_url": "http://localhost:6007", | |
| 244 | + "service_url": "http://localhost:6007/rerank", | |
| 245 | + } | |
| 246 | + }, | |
| 247 | + "backend": "bge", | |
| 248 | + "backends": { | |
| 249 | + "bge": { | |
| 250 | + "model_name": "dummy-rerank-model", | |
| 251 | + "device": "cpu", | |
| 252 | + "use_fp16": False, | |
| 253 | + "batch_size": 8, | |
| 254 | + "max_length": 128, | |
| 255 | + "cache_dir": "./model_cache", | |
| 256 | + "enable_warmup": False, | |
| 257 | + } | |
| 258 | + }, | |
| 259 | + }, | |
| 260 | + }, | |
| 194 | 261 | "spu_config": {"enabled": False}, |
| 195 | 262 | "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, |
| 196 | 263 | "rerank": {"rerank_window": 384}, |
| ... | ... | @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch |
| 354 | 421 | class _TranslatedQueryParser: |
| 355 | 422 | text_encoder = None |
| 356 | 423 | |
| 357 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 424 | + def parse( | |
| 425 | + self, | |
| 426 | + query: str, | |
| 427 | + tenant_id: str, | |
| 428 | + generate_vector: bool, | |
| 429 | + context: Any, | |
| 430 | + target_languages: Any = None, | |
| 431 | + ): | |
| 358 | 432 | return _FakeParsedQuery( |
| 359 | 433 | original_query=query, |
| 360 | 434 | query_normalized=query, |
| ... | ... | @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc |
| 407 | 481 | encoder = _FakeTextEncoder( |
| 408 | 482 | { |
| 409 | 483 | "linen summer dress": [0.8, 0.2], |
| 410 | - "color:Red": [1.0, 0.0], | |
| 411 | - "color:Blue": [0.0, 1.0], | |
| 484 | + "color:red": [1.0, 0.0], | |
| 485 | + "color:blue": [0.0, 1.0], | |
| 412 | 486 | } |
| 413 | 487 | ) |
| 414 | 488 | |
| 415 | 489 | class _EmbeddingQueryParser: |
| 416 | 490 | text_encoder = encoder |
| 417 | 491 | |
| 418 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 492 | + def parse( | |
| 493 | + self, | |
| 494 | + query: str, | |
| 495 | + tenant_id: str, | |
| 496 | + generate_vector: bool, | |
| 497 | + context: Any, | |
| 498 | + target_languages: Any = None, | |
| 499 | + ): | |
| 419 | 500 | return _FakeParsedQuery( |
| 420 | 501 | original_query=query, |
| 421 | 502 | query_normalized=query, | ... | ... |
tests/test_translator_failure_semantics.py
| 1 | 1 | import logging |
| 2 | 2 | |
| 3 | +import pytest | |
| 4 | + | |
| 3 | 5 | from translation.cache import TranslationCache |
| 4 | 6 | from translation.logging_utils import ( |
| 5 | 7 | TranslationRequestFilter, |
| ... | ... | @@ -7,6 +9,7 @@ from translation.logging_utils import ( |
| 7 | 9 | reset_translation_request_id, |
| 8 | 10 | ) |
| 9 | 11 | from translation.service import TranslationService |
| 12 | +from translation.settings import build_translation_config, translation_cache_probe_models | |
| 10 | 13 | |
| 11 | 14 | |
| 12 | 15 | class _FakeCache: |
| ... | ... | @@ -16,7 +19,8 @@ class _FakeCache: |
| 16 | 19 | self.get_calls = [] |
| 17 | 20 | self.set_calls = [] |
| 18 | 21 | |
| 19 | - def get(self, *, model, target_lang, source_text): | |
| 22 | + def get(self, *, model, target_lang, source_text, log_lookup=True): | |
| 23 | + del log_lookup | |
| 20 | 24 | self.get_calls.append((model, target_lang, source_text)) |
| 21 | 25 | return self.storage.get((model, target_lang, source_text)) |
| 22 | 26 | |
| ... | ... | @@ -191,3 +195,262 @@ def test_translation_route_log_focuses_on_routing_decision(monkeypatch, caplog): |
| 191 | 195 | assert route_messages == [ |
| 192 | 196 | "Translation route | backend=llm request_type=single use_cache=True cache_available=False" |
| 193 | 197 | ] |
| 198 | + | |
| 199 | + | |
| 200 | +def test_translation_cache_probe_models_order(): | |
| 201 | + cfg = {"cache": {"model_quality_tiers": {"low": 10, "high": 50, "mid": 30}}} | |
| 202 | + assert translation_cache_probe_models(cfg, "low") == ["high", "mid", "low"] | |
| 203 | + assert translation_cache_probe_models(cfg, "mid") == ["high", "mid"] | |
| 204 | + assert translation_cache_probe_models(cfg, "high") == ["high"] | |
| 205 | + assert translation_cache_probe_models(cfg, "unknown") == ["unknown"] | |
| 206 | + | |
| 207 | + | |
| 208 | +def test_translation_cache_probe_models_respects_enable_switch(): | |
| 209 | + cfg = { | |
| 210 | + "cache": { | |
| 211 | + "enable_model_quality_tier_cache": False, | |
| 212 | + "model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100}, | |
| 213 | + } | |
| 214 | + } | |
| 215 | + assert translation_cache_probe_models(cfg, "peer-a") == ["peer-a"] | |
| 216 | + | |
| 217 | + | |
| 218 | +def test_translation_cache_probe_models_same_tier_included(): | |
| 219 | +    """Same numeric tier: higher tiers are probed first, then the requesting model, then its same-tier peers.""" | 
| 220 | + cfg = {"cache": {"model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100}}} | |
| 221 | + assert translation_cache_probe_models(cfg, "peer-a") == ["top", "peer-a", "peer-b"] | |
| 222 | + assert translation_cache_probe_models(cfg, "peer-b") == ["top", "peer-b", "peer-a"] | |
| 223 | + | |
| 224 | + | |
| 225 | +def test_model_quality_tiers_unknown_capability_raises(): | |
| 226 | + with pytest.raises(ValueError, match="unknown capability"): | |
| 227 | + build_translation_config( | |
| 228 | + { | |
| 229 | + "service_url": "http://127.0.0.1:6006", | |
| 230 | + "timeout_sec": 10.0, | |
| 231 | + "default_model": "llm", | |
| 232 | + "default_scene": "general", | |
| 233 | + "cache": { | |
| 234 | + "ttl_seconds": 60, | |
| 235 | + "sliding_expiration": True, | |
| 236 | + "model_quality_tiers": {"ghost": 1}, | |
| 237 | + }, | |
| 238 | + "capabilities": { | |
| 239 | + "llm": { | |
| 240 | + "enabled": True, | |
| 241 | + "backend": "llm", | |
| 242 | + "model": "dummy-llm", | |
| 243 | + "base_url": "https://example.com", | |
| 244 | + "timeout_sec": 10.0, | |
| 245 | + "use_cache": True, | |
| 246 | + } | |
| 247 | + }, | |
| 248 | + } | |
| 249 | + ) | |
| 250 | + | |
| 251 | + | |
| 252 | +def test_tiered_cache_reuses_higher_tier_entry(monkeypatch): | |
| 253 | + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None)) | |
| 254 | + translate_calls = [] | |
| 255 | + | |
| 256 | + def _fake_create_backend(self, *, name, backend_type, cfg): | |
| 257 | + del self, backend_type, cfg | |
| 258 | + | |
| 259 | + class _Backend: | |
| 260 | + model = name | |
| 261 | + | |
| 262 | + @property | |
| 263 | + def supports_batch(self): | |
| 264 | + return True | |
| 265 | + | |
| 266 | + def translate(self, text, target_lang, source_lang=None, scene=None): | |
| 267 | + del target_lang, source_lang, scene | |
| 268 | + translate_calls.append((name, text)) | |
| 269 | + if isinstance(text, list): | |
| 270 | + return [f"{name}:{item}" for item in text] | |
| 271 | + return f"{name}:{text}" | |
| 272 | + | |
| 273 | + return _Backend() | |
| 274 | + | |
| 275 | + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend) | |
| 276 | + config = { | |
| 277 | + "service_url": "http://127.0.0.1:6006", | |
| 278 | + "timeout_sec": 10.0, | |
| 279 | + "default_model": "opus-mt-zh-en", | |
| 280 | + "default_scene": "general", | |
| 281 | + "capabilities": { | |
| 282 | + "deepl": { | |
| 283 | + "enabled": True, | |
| 284 | + "backend": "deepl", | |
| 285 | + "api_url": "https://api.deepl.com/v2/translate", | |
| 286 | + "timeout_sec": 10.0, | |
| 287 | + "use_cache": True, | |
| 288 | + }, | |
| 289 | + "opus-mt-zh-en": { | |
| 290 | + "enabled": True, | |
| 291 | + "backend": "local_marian", | |
| 292 | + "model_id": "dummy", | |
| 293 | + "model_dir": "dummy", | |
| 294 | + "device": "cpu", | |
| 295 | + "torch_dtype": "float32", | |
| 296 | + "batch_size": 8, | |
| 297 | + "max_input_length": 16, | |
| 298 | + "max_new_tokens": 16, | |
| 299 | + "num_beams": 1, | |
| 300 | + "use_cache": True, | |
| 301 | + }, | |
| 302 | + }, | |
| 303 | + "cache": { | |
| 304 | + "ttl_seconds": 60, | |
| 305 | + "sliding_expiration": True, | |
| 306 | + "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40}, | |
| 307 | + }, | |
| 308 | + } | |
| 309 | + | |
| 310 | + service = TranslationService(config) | |
| 311 | + fake_cache = _FakeCache() | |
| 312 | + fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl" | |
| 313 | + service._translation_cache = fake_cache | |
| 314 | + | |
| 315 | + out = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en") | |
| 316 | + assert out == "from-deepl" | |
| 317 | + assert translate_calls == [] | |
| 318 | + assert fake_cache.get_calls == [("deepl", "en", "商品标题")] | |
| 319 | + | |
| 320 | + | |
| 321 | +def test_tiered_cache_reuses_same_tier_peer(monkeypatch): | |
| 322 | + """Model A may use cache written under model B when both share the same tier.""" | |
| 323 | + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None)) | |
| 324 | + translate_calls = [] | |
| 325 | + | |
| 326 | + def _fake_create_backend(self, *, name, backend_type, cfg): | |
| 327 | + del self, backend_type, cfg | |
| 328 | + | |
| 329 | + class _Backend: | |
| 330 | + model = name | |
| 331 | + | |
| 332 | + @property | |
| 333 | + def supports_batch(self): | |
| 334 | + return True | |
| 335 | + | |
| 336 | + def translate(self, text, target_lang, source_lang=None, scene=None): | |
| 337 | + del target_lang, source_lang, scene | |
| 338 | + translate_calls.append((name, text)) | |
| 339 | + if isinstance(text, list): | |
| 340 | + return [f"{name}:{item}" for item in text] | |
| 341 | + return f"{name}:{text}" | |
| 342 | + | |
| 343 | + return _Backend() | |
| 344 | + | |
| 345 | + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend) | |
| 346 | + marian_cap = { | |
| 347 | + "enabled": True, | |
| 348 | + "backend": "local_marian", | |
| 349 | + "model_id": "dummy", | |
| 350 | + "model_dir": "dummy", | |
| 351 | + "device": "cpu", | |
| 352 | + "torch_dtype": "float32", | |
| 353 | + "batch_size": 8, | |
| 354 | + "max_input_length": 16, | |
| 355 | + "max_new_tokens": 16, | |
| 356 | + "num_beams": 1, | |
| 357 | + "use_cache": True, | |
| 358 | + } | |
| 359 | + config = { | |
| 360 | + "service_url": "http://127.0.0.1:6006", | |
| 361 | + "timeout_sec": 10.0, | |
| 362 | + "default_model": "opus-mt-en-zh", | |
| 363 | + "default_scene": "general", | |
| 364 | + "capabilities": { | |
| 365 | + "opus-mt-zh-en": dict(marian_cap), | |
| 366 | + "opus-mt-en-zh": dict(marian_cap), | |
| 367 | + }, | |
| 368 | + "cache": { | |
| 369 | + "ttl_seconds": 60, | |
| 370 | + "sliding_expiration": True, | |
| 371 | + "model_quality_tiers": {"opus-mt-zh-en": 50, "opus-mt-en-zh": 50}, | |
| 372 | + }, | |
| 373 | + } | |
| 374 | + | |
| 375 | + service = TranslationService(config) | |
| 376 | + fake_cache = _FakeCache() | |
| 377 | + fake_cache.storage[("opus-mt-zh-en", "en", "hello")] = "from-zh-en" | |
| 378 | + service._translation_cache = fake_cache | |
| 379 | + | |
| 380 | + out = service.translate("hello", target_lang="en", source_lang="zh", model="opus-mt-en-zh") | |
| 381 | + assert out == "from-zh-en" | |
| 382 | + assert translate_calls == [] | |
| 383 | + assert fake_cache.get_calls == [ | |
| 384 | + ("opus-mt-en-zh", "en", "hello"), | |
| 385 | + ("opus-mt-zh-en", "en", "hello"), | |
| 386 | + ] | |
| 387 | + | |
| 388 | + | |
def test_tiered_cache_switch_off_uses_exact_model_only(monkeypatch):
    """With ``enable_model_quality_tier_cache`` off, only the exact request model's
    cache key is probed: a higher-tier (deepl) entry must not short-circuit the
    backend call for an opus-mt-zh-en request."""
    monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
    backend_calls = []

    def _stub_create_backend(self, *, name, backend_type, cfg):
        del self, backend_type, cfg

        class _StubBackend:
            model = name

            @property
            def supports_batch(self):
                return True

            def translate(self, text, target_lang, source_lang=None, scene=None):
                del target_lang, source_lang, scene
                backend_calls.append((name, text))
                if isinstance(text, list):
                    return [f"{name}:{piece}" for piece in text]
                return f"{name}:{text}"

        return _StubBackend()

    monkeypatch.setattr(TranslationService, "_create_backend", _stub_create_backend)

    # Tier map still ranks deepl above opus-mt-zh-en, but the switch is off.
    service = TranslationService(
        {
            "service_url": "http://127.0.0.1:6006",
            "timeout_sec": 10.0,
            "default_model": "opus-mt-zh-en",
            "default_scene": "general",
            "capabilities": {
                "deepl": {
                    "enabled": True,
                    "backend": "deepl",
                    "api_url": "https://api.deepl.com/v2/translate",
                    "timeout_sec": 10.0,
                    "use_cache": True,
                },
                "opus-mt-zh-en": {
                    "enabled": True,
                    "backend": "local_marian",
                    "model_id": "dummy",
                    "model_dir": "dummy",
                    "device": "cpu",
                    "torch_dtype": "float32",
                    "batch_size": 8,
                    "max_input_length": 16,
                    "max_new_tokens": 16,
                    "num_beams": 1,
                    "use_cache": True,
                },
            },
            "cache": {
                "ttl_seconds": 60,
                "sliding_expiration": True,
                "enable_model_quality_tier_cache": False,
                "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40},
            },
        }
    )

    fake_cache = _FakeCache()
    fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl"
    service._translation_cache = fake_cache

    translated = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en")

    # The deepl cache entry is ignored; the request model's backend must run.
    assert translated == "opus-mt-zh-en:商品标题"
    assert backend_calls == [("opus-mt-zh-en", "商品标题")]
    assert fake_cache.get_calls == [("opus-mt-zh-en", "en", "商品标题")]
translation/cache.py
| ... | ... | @@ -36,7 +36,13 @@ class TranslationCache: |
| 36 | 36 | digest = hashlib.sha256(text.encode("utf-8")).hexdigest() |
| 37 | 37 | return f"trans:{normalized_model}:{normalized_target_lang}:{text_prefix}{digest}" |
| 38 | 38 | |
| 39 | - def get(self, *, model: str, target_lang: str, source_text: str) -> Optional[str]: | |
| 39 | + def get( | |
| 40 | + self, | |
| 41 | + *, | |
| 42 | + model: str, | |
| 43 | + target_lang: str, | |
| 44 | + source_text: str | |
| 45 | + ) -> Optional[str]: | |
| 40 | 46 | if self.redis_client is None: |
| 41 | 47 | return None |
| 42 | 48 | key = self.build_key(model=model, target_lang=target_lang, source_text=source_text) | ... | ... |
translation/service.py
| ... | ... | @@ -3,7 +3,7 @@ |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | import logging |
| 6 | -from typing import Dict, List, Optional | |
| 6 | +from typing import Dict, List, Optional, Tuple | |
| 7 | 7 | |
| 8 | 8 | from config.loader import get_app_config |
| 9 | 9 | from config.schema import AppConfig |
| ... | ... | @@ -15,6 +15,7 @@ from translation.settings import ( |
| 15 | 15 | get_translation_capability, |
| 16 | 16 | normalize_translation_model, |
| 17 | 17 | normalize_translation_scene, |
| 18 | + translation_cache_probe_models, | |
| 18 | 19 | ) |
| 19 | 20 | |
| 20 | 21 | logger = logging.getLogger(__name__) |
| ... | ... | @@ -247,7 +248,11 @@ class TranslationService: |
| 247 | 248 | ) -> Optional[str]: |
| 248 | 249 | if not text.strip(): |
| 249 | 250 | return text |
| 250 | - cached = self._translation_cache.get(model=model, target_lang=target_lang, source_text=text) | |
| 251 | + cached, _served = self._tiered_cache_get( | |
| 252 | + request_model=model, | |
| 253 | + target_lang=target_lang, | |
| 254 | + source_text=text, | |
| 255 | + ) | |
| 251 | 256 | if cached is not None: |
| 252 | 257 | logger.info( |
| 253 | 258 | "Translation cache served | request_type=single text_len=%s", |
| ... | ... | @@ -279,6 +284,30 @@ class TranslationService: |
| 279 | 284 | ) |
| 280 | 285 | return translated |
| 281 | 286 | |
| 287 | + def _tiered_cache_get( | |
| 288 | + self, | |
| 289 | + *, | |
| 290 | + request_model: str, | |
| 291 | + target_lang: str, | |
| 292 | + source_text: str, | |
| 293 | + ) -> Tuple[Optional[str], Optional[str]]: | |
| 294 | + """Redis lookup: cache from higher-tier or **same-tier** models may satisfy A. | |
| 295 | + | |
| 296 | + Lower-tier entries are never read. Returns ``(translated, served_model)``. | |
| 297 | + """ | |
| 298 | + probe_models = translation_cache_probe_models(self.config, request_model) | |
| 299 | + | |
| 300 | + for probe_model in probe_models: | |
| 301 | + hit = self._translation_cache.get( | |
| 302 | + model=probe_model, | |
| 303 | + target_lang=target_lang, | |
| 304 | + source_text=source_text, | |
| 305 | + ) | |
| 306 | + if hit is not None: | |
| 307 | + return hit, probe_model | |
| 308 | + | |
| 309 | + return None, None | |
| 310 | + | |
| 282 | 311 | def _translate_batch_with_cache( |
| 283 | 312 | self, |
| 284 | 313 | *, |
| ... | ... | @@ -300,8 +329,8 @@ class TranslationService: |
| 300 | 329 | if not normalized_text.strip(): |
| 301 | 330 | results[idx] = normalized_text |
| 302 | 331 | continue |
| 303 | - cached = self._translation_cache.get( | |
| 304 | - model=model, | |
| 332 | + cached, _served = self._tiered_cache_get( | |
| 333 | + request_model=model, | |
| 305 | 334 | target_lang=target_lang, |
| 306 | 335 | source_text=normalized_text, |
| 307 | 336 | ) | ... | ... |
translation/settings.py
| ... | ... | @@ -2,7 +2,7 @@ |
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | -from typing import Any, Dict, List, Mapping, Optional | |
| 5 | +from typing import Any, Dict, List, Mapping, Optional, Tuple | |
| 6 | 6 | |
| 7 | 7 | from translation.scenes import normalize_scene_name |
| 8 | 8 | |
| ... | ... | @@ -38,6 +38,7 @@ def build_translation_config(raw_cfg: Mapping[str, Any]) -> TranslationConfig: |
| 38 | 38 | if not get_enabled_translation_models(config): |
| 39 | 39 | raise ValueError("At least one translation capability must be enabled") |
| 40 | 40 | |
| 41 | + _validate_model_quality_tiers(config) | |
| 41 | 42 | return config |
| 42 | 43 | |
| 43 | 44 | |
| ... | ... | @@ -86,18 +87,107 @@ def get_translation_cache(config: Mapping[str, Any]) -> Dict[str, Any]: |
| 86 | 87 | return dict(cache) |
| 87 | 88 | |
| 88 | 89 | |
def translation_cache_probe_models(config: Mapping[str, Any], request_model: str) -> List[str]:
    """Return the ordered list of cache-key models to probe for a request.

    Candidates are every configured model whose quality tier is greater than
    or equal to the request model's tier — lower tiers are never probed.
    Ordering: tier descending, then the request model before same-tier peers,
    then model name for any remaining ties.

    Legacy exact-match behavior (probe only the request model) applies when
    ``enable_model_quality_tier_cache`` is false, ``model_quality_tiers`` is
    missing/empty, or the request model is not listed in the tier map.
    """
    requested = str(request_model or "").strip().lower()
    fallback = [requested]

    cache_cfg = config.get("cache")
    if not isinstance(cache_cfg, Mapping):
        return fallback
    if not bool(cache_cfg.get("enable_model_quality_tier_cache", True)):
        return fallback
    tiers = cache_cfg.get("model_quality_tiers")
    if not isinstance(tiers, Mapping) or not tiers or requested not in tiers:
        return fallback

    threshold = int(tiers[requested])
    ranked = sorted(
        (
            (int(level), str(name).strip().lower())
            for name, level in tiers.items()
            if int(level) >= threshold
        ),
        key=lambda entry: (-entry[0], entry[1] != requested, entry[1]),
    )
    ordered: List[str] = []
    for _level, model in ranked:
        if model not in ordered:
            ordered.append(model)
    return ordered
| 136 | + | |
| 137 | + | |
def _build_cache_config(raw_cache: Any) -> Dict[str, Any]:
    """Validate and normalize the ``services.translation.cache`` section.

    Raises ValueError when the section (or any required field) is malformed.
    """
    if not isinstance(raw_cache, Mapping):
        raise ValueError("services.translation.cache must be a mapping")
    # Tiered cache lookup defaults to enabled when the switch is absent;
    # when present it must be a real boolean.
    tier_cache_enabled = (
        _require_bool(
            raw_cache["enable_model_quality_tier_cache"],
            "services.translation.cache.enable_model_quality_tier_cache",
        )
        if "enable_model_quality_tier_cache" in raw_cache
        else True
    )
    ttl_seconds = _require_positive_int(raw_cache.get("ttl_seconds"), "services.translation.cache.ttl_seconds")
    sliding_expiration = _require_bool(
        raw_cache.get("sliding_expiration"),
        "services.translation.cache.sliding_expiration",
    )
    return {
        "ttl_seconds": ttl_seconds,
        "sliding_expiration": sliding_expiration,
        "enable_model_quality_tier_cache": tier_cache_enabled,
        "model_quality_tiers": _build_model_quality_tiers(raw_cache.get("model_quality_tiers")),
    }
| 99 | 157 | |
| 100 | 158 | |
def _build_model_quality_tiers(raw: Any) -> Dict[str, int]:
    """Normalize the optional ``model_quality_tiers`` mapping.

    Keys are lowercased capability names; values must be non-negative ints.
    ``None`` (section absent) yields an empty mapping; any other non-mapping
    value raises ValueError.
    """
    if raw is None:
        return {}
    if not isinstance(raw, Mapping):
        raise ValueError("services.translation.cache.model_quality_tiers must be a mapping")
    tiers: Dict[str, int] = {}
    for raw_name, raw_tier in raw.items():
        model = _require_string(raw_name, "services.translation.cache.model_quality_tiers key").lower()
        tiers[model] = _require_non_negative_int(
            raw_tier,
            f"services.translation.cache.model_quality_tiers.{model}",
        )
    return tiers
| 170 | + | |
| 171 | + | |
| 172 | +def _validate_model_quality_tiers(config: TranslationConfig) -> None: | |
| 173 | + tiers = config["cache"].get("model_quality_tiers") | |
| 174 | + if not isinstance(tiers, Mapping) or not tiers: | |
| 175 | + return | |
| 176 | + caps = config["capabilities"] | |
| 177 | + for name in tiers: | |
| 178 | + if name not in caps: | |
| 179 | + raise ValueError( | |
| 180 | + f"services.translation.cache.model_quality_tiers references unknown capability '{name}'" | |
| 181 | + ) | |
| 182 | + | |
| 183 | + | |
def _require_non_negative_int(value: Any, field_name: str) -> int:
    """Coerce *value* to an int via ``_require_int`` and reject negatives."""
    number = _require_int(value, field_name)
    if number >= 0:
        return number
    raise ValueError(f"{field_name} must be >= 0")
| 189 | + | |
| 190 | + | |
| 101 | 191 | def _build_capabilities(raw_capabilities: Any) -> Dict[str, Dict[str, Any]]: |
| 102 | 192 | if not isinstance(raw_capabilities, Mapping): |
| 103 | 193 | raise ValueError("services.translation.capabilities must be a mapping") | ... | ... |