Compare View

Commits (8)
  • tangwang
     
  • tangwang
     
  • tangwang
     
  • tangwang
     
  • ## Background
    In a multilingual index, user queries often mix Chinese and English. We need to tag the script type explicitly during parsing and cover the corresponding language fields in the BM25 clauses.
    
    ## Approach
    
    ### 1. Query analysis (query_parser.ParsedQuery)
    - New `contains_chinese`: the query text contains CJK (reuses _contains_cjk).
    - New `contains_english`: the tokenized output contains a pure-English token with len>=3 (fullmatch on letters plus an optional hyphen).
    - Written into to_dict and the request-context intermediates, for debugging and API exposure.
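    A minimal sketch of the token rule above (the real check in query_parser may differ; `contains_english` here is a stand-in for the parser's internal helper):

    ```python
    import re

    # A token counts as "English" if it is ASCII letters with at most one
    # hyphen-joined part and is at least 3 characters long.
    _EN_TOKEN = re.compile(r"[A-Za-z]+(?:-[A-Za-z]+)?")

    def contains_english(tokens):
        return any(len(t) >= 3 and _EN_TOKEN.fullmatch(t) for t in tokens)
    ```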
    
    ### 2. ES text recall (es_query_builder._build_advanced_text_query)
    - For each search_lang clause: if the query contains English and the clause language is not en (and the tenant's index_languages include en), merge in the en-column fields; if it contains Chinese and the clause language is not zh (and zh is indexed), merge in the zh-column fields.
    - The boost of each merged-in field is multiplied by `mixed_script_merged_field_boost_scale` (default 0.8, adjustable via the ESQueryBuilder constructor).
    - The fallback_original_query_* branches apply the same logic.
    
    ### 3. Implementation cleanup
    - Introduced `MatchFieldSpec = (field_path, boost)`: `_build_match_field_specs` is the single source of truth for weights; `_merge_supplemental_lang_field_specs` / `_expand_match_field_specs_for_mixed_script` merge and scale on tuples; `_format_match_field_specs` formats them into ES `path^boost` strings only at the end, avoiding the build-string-then-reparse round trip.
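    The tuple pipeline above can be sketched as follows (function names follow the commit message; the actual signatures in es_query_builder.py may differ):

    ```python
    from typing import List, Tuple

    MatchFieldSpec = Tuple[str, float]  # (field_path, boost)

    def merge_supplemental_lang_field_specs(
        base: List[MatchFieldSpec],
        extra: List[MatchFieldSpec],
        scale: float = 0.8,  # mixed_script_merged_field_boost_scale
    ) -> List[MatchFieldSpec]:
        # Merge another language's fields with a scaled-down boost,
        # skipping any field path already present in the clause.
        seen = {path for path, _ in base}
        merged = list(base)
        for path, boost in extra:
            if path not in seen:
                merged.append((path, boost * scale))
        return merged

    def format_match_field_specs(specs: List[MatchFieldSpec]) -> List[str]:
        # Only at the very end are specs rendered as ES "path^boost" strings.
        return [f"{path}^{boost:g}" for path, boost in specs]
    ```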
    
    ## Tests
    - tests/test_query_parser_mixed_language.py: script tagging and token rules.
    - tests/test_es_query_builder.py: merged fields, the 0.8 scaling, the index_languages restriction.
    
    Made-with: Cursor
    tangwang
     
  • tangwang
     
  • ResultFormatter.format_search_results() runs.
    
    What changed:
    
    For each final paginated SPU hit, the searcher now scans
    skus[].option1_value against the query text set built from the original
    query, normalized query, rewritten query, and translations.
    If no option1_value matches textually, it falls back to embedding
    similarity and picks the SKU with the highest inner product against the
    query embedding.
    The matched SKU is promoted to the front of the SPU’s skus list.
    The SPU-level image_url is replaced with that matched SKU’s image_src.
    I left api/result_formatter.py unchanged because it already preserves
    the SKU order and reads image_url from _source; updating the page hits
    in searcher makes the formatter return the desired result automatically.
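    A hedged sketch of that promotion step (field names follow the message; `promote_matching_sku` itself is a hypothetical helper, and the embedding fallback assumes precomputed SKU vectors):

    ```python
    def promote_matching_sku(spu_hit, query_texts, query_embedding, sku_embeddings=None):
        skus = spu_hit.get("skus") or []
        if not skus:
            return spu_hit
        texts = {t.lower() for t in query_texts if t}
        # 1) Textual match: option1_value against the query text set.
        matched = next(
            (s for s in skus if str(s.get("option1_value", "")).lower() in texts),
            None,
        )
        # 2) Fallback: the SKU with the highest inner product vs the query embedding.
        if matched is None and query_embedding is not None and sku_embeddings:
            scores = [
                sum(a * b for a, b in zip(vec, query_embedding))
                for vec in sku_embeddings
            ]
            matched = skus[max(range(len(scores)), key=scores.__getitem__)]
        if matched is not None:
            # Promote the matched SKU and swap its image in as the SPU main image.
            spu_hit["skus"] = [matched] + [s for s in skus if s is not matched]
            if matched.get("image_src"):
                spu_hit["image_url"] = matched["image_src"]
        return spu_hit
    ```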
    
    Verification:
    
    ReadLints on the edited files: no errors
    Passed targeted tests:
    pytest tests/test_search_rerank_window.py -k "translated_query or no_direct_option_match"
    tangwang
     
  • tangwang
     
@@ -22,6 +22,8 @@ API_HOST=0.0.0.0
22 API_PORT=6002 22 API_PORT=6002
23 INDEXER_HOST=0.0.0.0 23 INDEXER_HOST=0.0.0.0
24 INDEXER_PORT=6004 24 INDEXER_PORT=6004
  25 +# Embedding HTTP servers (text 6005 / image 6008): bind all interfaces
  26 +EMBEDDING_HOST=0.0.0.0
25 27
26 # Optional service ports 28 # Optional service ports
27 FRONTEND_PORT=6003 29 FRONTEND_PORT=6003
@@ -0,0 +1,17 @@
  1 +docs
  2 +# Please enter the commit message for your changes. Lines starting
  3 +# with '#' will be ignored, and an empty message aborts the commit.
  4 +#
  5 +# On branch master
  6 +# Your branch is ahead of 'origin/master' by 5 commits.
  7 +# (use "git push" to publish your local commits)
  8 +#
  9 +# Changes to be committed:
  10 +# modified: config/config.yaml
  11 +# modified: docs/TODO.txt
  12 +# modified: "docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md"
  13 +# modified: "docs/相关性检索优化说明.md"
  14 +#
  15 +# Changes not staged for commit:
  16 +# modified: third-party/clip-as-service (untracked content)
  17 +#
config/config.yaml
@@ -31,9 +31,9 @@ field_boosts:
31 category_path: 1.5 31 category_path: 1.5
32 category_name_text: 1.5 32 category_name_text: 1.5
33 tags: 1.0 33 tags: 1.0
34 - option1_values: 0.5 34 + option1_values: 0.6
35 - option2_values: 0.5 35 + option2_values: 0.4
36 - option3_values: 0.5 36 + option3_values: 0.4
37 37
38 # Query Configuration(查询配置) 38 # Query Configuration(查询配置)
39 query_config: 39 query_config:
@@ -47,6 +47,11 @@ query_config:
47 enable_text_embedding: true 47 enable_text_embedding: true
48 enable_query_rewrite: true 48 enable_query_rewrite: true
49 49
  50 + # Query-parse stage: translation and the query embedding run concurrently and share one wait budget (ms).
  51 + # Detected language already in the tenant's index_languages: shorter; not in the index languages: longer (translation matters more for recall).
  52 + translation_embedding_wait_budget_ms_source_in_index: 80
  53 + translation_embedding_wait_budget_ms_source_not_in_index: 200
  54 +
50 # 动态多语言检索字段配置 55 # 动态多语言检索字段配置
51 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; 56 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
52 # shared_fields 为无语言后缀字段。 57 # shared_fields 为无语言后缀字段。
@@ -85,7 +90,34 @@ query_config:
85 90
86 # 返回字段配置(_source includes) 91 # 返回字段配置(_source includes)
87 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 92 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
88 - source_fields: null 93 + # The fields below match api/result_formatter.py (SpuResult population) and search/searcher.py (SKU ordering / main-image replacement)
  94 + source_fields:
  95 + - spu_id
  96 + - handle
  97 + - title
  98 + - brief
  99 + - description
  100 + - vendor
  101 + - category_name
  102 + - category_name_text
  103 + - category_path
  104 + - category_id
  105 + - category_level
  106 + - category1_name
  107 + - category2_name
  108 + - category3_name
  109 + - tags
  110 + - min_price
  111 + - compare_at_price
  112 + - image_url
  113 + - sku_prices
  114 + - sku_weights
  115 + - sku_weight_units
  116 + - total_inventory
  117 + - option1_name
  118 + - option1_values
  119 + - specifications
  120 + - skus
89 121
90 # KNN boost配置(向量召回的boost值) 122 # KNN boost配置(向量召回的boost值)
91 knn_boost: 0.25 # Lower boost for embedding recall 123 knn_boost: 0.25 # Lower boost for embedding recall
@@ -110,7 +142,7 @@ rerank:
110 services: 142 services:
111 translation: 143 translation:
112 service_url: "http://127.0.0.1:6006" 144 service_url: "http://127.0.0.1:6006"
113 - default_model: "llm" 145 + default_model: "nllb-200-distilled-600m"
114 default_scene: "general" 146 default_scene: "general"
115 timeout_sec: 10.0 147 timeout_sec: 10.0
116 cache: 148 cache:
@@ -297,6 +297,12 @@ class AppConfigLoader:
297 default_translation_model=str( 297 default_translation_model=str(
298 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" 298 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
299 ), 299 ),
  300 + translation_embedding_wait_budget_ms_source_in_index=int(
  301 + query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
  302 + ),
  303 + translation_embedding_wait_budget_ms_source_not_in_index=int(
  304 + query_cfg.get("translation_embedding_wait_budget_ms_source_not_in_index", 200)
  305 + ),
300 ) 306 )
301 307
302 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} 308 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {}
@@ -445,7 +451,7 @@ class AppConfigLoader:
445 api_port=int(os.getenv("API_PORT", 6002)), 451 api_port=int(os.getenv("API_PORT", 6002)),
446 indexer_host=os.getenv("INDEXER_HOST", "0.0.0.0"), 452 indexer_host=os.getenv("INDEXER_HOST", "0.0.0.0"),
447 indexer_port=int(os.getenv("INDEXER_PORT", 6004)), 453 indexer_port=int(os.getenv("INDEXER_PORT", 6004)),
448 - embedding_host=os.getenv("EMBEDDING_HOST", "127.0.0.1"), 454 + embedding_host=os.getenv("EMBEDDING_HOST", "0.0.0.0"),
449 embedding_port=int(os.getenv("EMBEDDING_PORT", 6005)), 455 embedding_port=int(os.getenv("EMBEDDING_PORT", 6005)),
450 embedding_text_port=int(os.getenv("EMBEDDING_TEXT_PORT", 6005)), 456 embedding_text_port=int(os.getenv("EMBEDDING_TEXT_PORT", 6005)),
451 embedding_image_port=int(os.getenv("EMBEDDING_IMAGE_PORT", 6008)), 457 embedding_image_port=int(os.getenv("EMBEDDING_IMAGE_PORT", 6008)),
@@ -61,6 +61,11 @@ class QueryConfig:
61 zh_to_en_model: str = "opus-mt-zh-en" 61 zh_to_en_model: str = "opus-mt-zh-en"
62 en_to_zh_model: str = "opus-mt-en-zh" 62 en_to_zh_model: str = "opus-mt-en-zh"
63 default_translation_model: str = "nllb-200-distilled-600m" 63 default_translation_model: str = "nllb-200-distilled-600m"
  64 + # Query stage: translation and embedding generation are submitted concurrently and share one wait budget (ms).
  65 + # Detected language in the tenant's index_languages: results should come back fast, so the budget is short.
  66 + # Detected language not in index_languages: translation is more critical for recall, so the budget is longer.
  67 + translation_embedding_wait_budget_ms_source_in_index: int = 80
  68 + translation_embedding_wait_budget_ms_source_not_in_index: int = 200
64 69
65 70
66 @dataclass(frozen=True) 71 @dataclass(frozen=True)
@@ -254,13 +259,13 @@ class RuntimeConfig:
254 api_port: int = 6002 259 api_port: int = 6002
255 indexer_host: str = "0.0.0.0" 260 indexer_host: str = "0.0.0.0"
256 indexer_port: int = 6004 261 indexer_port: int = 6004
257 - embedding_host: str = "127.0.0.1" 262 + embedding_host: str = "0.0.0.0"
258 embedding_port: int = 6005 263 embedding_port: int = 6005
259 embedding_text_port: int = 6005 264 embedding_text_port: int = 6005
260 embedding_image_port: int = 6008 265 embedding_image_port: int = 6008
261 - translator_host: str = "127.0.0.1" 266 + translator_host: str = "0.0.0.0"
262 translator_port: int = 6006 267 translator_port: int = 6006
263 - reranker_host: str = "127.0.0.1" 268 + reranker_host: str = "0.0.0.0"
264 reranker_port: int = 6007 269 reranker_port: int = 6007
265 270
266 271
1 1
2 2
3 3
  4 +First read the text-embedding related code:
  5 +@embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py
  6 +There is currently TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT admission control; requests over the limit get an overload status code.
  7 +
  8 +The text embedding service should support a priority query parameter; priority > 0 requests must never be rejected by admission control. (The image embedding service does not need it, since only offline jobs use image embeddings.)
  9 +priority == 0 (default, suited to offline tasks such as indexing): still goes through the existing TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT admission; over the limit, return the overload status code.
  10 +priority > 0 (or == 1; suited to online requests): never rejected by admission, but still occupies inflight, so online requests are not throttled and heavy online traffic can push out offline requests.
  11 +
  12 +Beyond the admission-rule change, such requests should also be processed first (priority=1 ahead of priority=0).
  13 +For the technical approach there are Worker + dual queues, PriorityMutex, and so on; please also consider other suitable designs.
  14 +Mature and stable, with no side effects on complexity, performance, or stability, matters most. Study the code and the requirement first and think the solution through.
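A minimal sketch of the admission rule being requested here (assumption: a counter guarded by a lock; the real service may use semaphores or the dual-queue worker mentioned in the note):

```python
import threading

class PriorityAdmission:
    def __init__(self, max_inflight: int):
        self.max_inflight = max_inflight  # e.g. TEXT_MAX_INFLIGHT
        self.inflight = 0
        self._lock = threading.Lock()

    def try_enter(self, priority: int = 0) -> bool:
        with self._lock:
            # priority == 0 (offline): rejected once the limit is reached;
            # the caller would return the overload status code.
            if priority <= 0 and self.inflight >= self.max_inflight:
                return False
            # priority > 0 (online): never rejected, but still counted,
            # so concurrent online traffic can crowd out offline requests.
            self.inflight += 1
            return True

    def leave(self):
        with self._lock:
            self.inflight -= 1
```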
  15 +
  16 +
  17 +
  18 +Refactoring of the configuration system.
  19 +
  20 +Referring to @docs/config-system-review-and-redesign.md , most of the modifications have been completed. Could you conduct a review to check what else needs improvement in the configuration documentation system? Are there any outstanding issues?
  21 +
  22 +I. There are still many places that read configuration via environment variables
  23 +_SERVICE_KIND = (os.getenv("EMBEDDING_SERVICE_KIND", "all") or "all").strip().lower()
  24 +if _SERVICE_KIND not in {"all", "text", "image"}:
  25 + raise RuntimeError(
  26 + f"Invalid EMBEDDING_SERVICE_KIND={_SERVICE_KIND!r}; expected all, text, or image"
  27 + )
  28 +_TEXT_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_TEXT_MODEL", "true").lower() in ("1", "true", "yes")
  29 +_IMAGE_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_IMAGE_MODEL", "true").lower() in ("1", "true", "yes")
  30 +open_text_model = _TEXT_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "text"}
  31 +open_image_model = _IMAGE_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "image"}
  32 +
  33 +_text_encode_lock = threading.Lock()
  34 +_image_encode_lock = threading.Lock()
  35 +
  36 +_TEXT_MICROBATCH_WINDOW_SEC = max(
  37 + 0.0, float(os.getenv("TEXT_MICROBATCH_WINDOW_MS", "4")) / 1000.0
  38 +)
  39 +_TEXT_REQUEST_TIMEOUT_SEC = max(
  40 + 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30"))
  41 +)
  42 +_TEXT_MAX_INFLIGHT = max(1, int(os.getenv("TEXT_MAX_INFLIGHT", "32")))
  43 +_IMAGE_MAX_INFLIGHT = max(1, int(os.getenv("IMAGE_MAX_INFLIGHT", "1")))
  44 +_OVERLOAD_STATUS_CODE = int(os.getenv("EMBEDDING_OVERLOAD_STATUS_CODE", "503"))
  45 +_LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3")))
  46 +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120")))
  47 +_LOG_IMAGE_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_IMAGE_PREVIEW_CHARS", "180")))
  48 +_VECTOR_PREVIEW_DIMS = max(1, int(os.getenv("EMBEDDING_VECTOR_PREVIEW_DIMS", "6")))
  49 +_CACHE_PREFIX = str(REDIS_CONFIG.get("embedding_cache_prefix", "embedding")).strip() or "embedding"
  50 +
  51 +
  52 +
  53 +
  54 +
  55 +And these hard-coded addresses in @embedding/config.py
  56 +
  57 +self.TEI_BASE_URL = str(text_backend.get("base_url") or "http://127.0.0.1:8080")
  58 +self.TEI_TIMEOUT_SEC = int(text_backend.get("timeout_sec", 60))
  59 +
  60 +self.USE_CLIP_AS_SERVICE = services.image_backend == "clip_as_service"
  61 +self.CLIP_AS_SERVICE_SERVER = str(image_backend.get("server") or "grpc://127.0.0.1:51000")
  62 +
  63 +
  64 +
  65 +
  66 +It looks like these principles were not fully followed?
  67 +4. Design principles for the redesign
  68 +The redesign should follow the rules below.
  69 +
  70 +4.1 A single logical configuration system
  71 +There may be multiple files, but not multiple loaders with overlapping responsibilities.
  72 +There must be one loader pipeline that produces a typed AppConfig object.
  73 +
  74 +4.2 Config files declare, parsing code interprets, environment variables inject at runtime
  75 +Responsibilities should be split as follows:
  76 +Config files
  77 +Declare non-sensitive target behavior and deployable non-sensitive settings
  78 +Parsing logic
  79 +Loads, merges, validates, normalizes, and exposes typed configuration
  80 +Never invents hidden business behavior
  81 +Environment variables
  82 +Carry secrets and a small number of runtime/process-specific values
  83 +Do not casually redefine business behavior
  84 +
  85 +4.3 One precedence rule for the whole system
  86 +Unless explicitly exempted, every configuration category should follow the same merge model.
  87 +
  88 +4.4 No silent implicit fallbacks for business behavior
  89 +At startup, fail fast when required configuration is missing or invalid.
  90 +Do not silently fall back to legacy behavior such as a hard-coded language list.
  91 +
  92 +4.5 Effective configuration must be observable
  93 +Every service should be able to show:
  94 +Config version or hash
  95 +Source files loaded
  96 +Environment name
  97 +Sanitized effective configuration
  98 +
  99 +5. Recommended target design
  100 +
  101 +5.1 Boundary model
  102 +Use three clear tiers.
  103 +Tier 1: static configuration managed in the repository
  104 +Purpose:
  105 +Search behavior
  106 +Tenant behavior
  107 +Provider/backend registries
  108 +Non-sensitive service topology defaults
  109 +Feature flags
  110 +Examples:
  111 +Field boosts
  112 +Query strategy
  113 +Rerank fusion parameters
  114 +Tenant language schemes
  115 +Translation capability registry
  116 +Embedding backend selection defaults
  117 +
  118 +Tier 2: environment-specific overlay configuration
  119 +Purpose:
  120 +Non-sensitive per-environment differences
  121 +Per-environment service endpoints
  122 +Per-environment resource sizing defaults
  123 +Operational differences across dev/test/prod
  124 +Examples:
  125 +Local embedding URL vs production embedding URL
  126 +Dev rerank backend vs production rerank backend
  127 +Lower concurrency in local development
  128 +
  129 +Tier 3: environment variables
  130 +Purpose:
  131 +Secrets
  132 +Bind hosts/ports
  133 +External infrastructure credentials
  134 +Last-mile injection by the container orchestrator
  135 +Examples:
  136 +ES_HOST, ES_USERNAME, ES_PASSWORD
  137 +DB_HOST, DB_USERNAME, DB_PASSWORD
  138 +REDIS_HOST, REDIS_PASSWORD
  139 +DASHSCOPE_API_KEY, DEEPL_AUTH_KEY
  140 +API_HOST, API_PORT, INDEXER_PORT, TRANSLATION_PORT
  141 +Rules:
  142 +Environment variables should not be the normal way to select business behavior (such as translation model, embedding backend, or tenant language policy)
  143 +If env-var overrides are allowed for non-sensitive fields, they must be explicitly listed and documented as operational overrides, not hidden conventions
  144 +
  145 +5.2 Unified precedence
  146 +Recommended precedence order:
  147 +Schema defaults in code
  148 +config/base.yaml
  149 +config/environments/<env>.yaml
  150 +Tenant overlays from config/tenants/
  151 +Environment variables for explicitly allowed runtime keys
  152 +Command-line flags scoped to the current process only
  153 +Important rules:
  154 +Only one module may implement this merge logic
  155 +No business module may call os.getenv() directly for configuration
  156 +
  157 +
  158 +
  159 +II. The review cursor (composer2) did for me
  160 +
  161 +## Configuration documentation system review
  162 +
  163 +A brief review of the **configuration documentation system**, checked against [`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) and the current codebase.
  164 +
  165 +### Where the implementation already matches
  166 +
  167 +The codebase has moved a long way toward the redesign goals:
  168 +
  169 +- **Single typed root**: `AppConfig` lives in [`config/schema.py`](config/schema.py), loaded via [`config/loader.py`](config/loader.py) and [`get_app_config()`](config/loader.py).
  170 +- **Slimmed-down `services_config`**: [`config/services_config.py`](config/services_config.py) delegates to `get_app_config()` (resolves Finding A for the service layer).
  171 +- **Tenant directory**: [`config/tenant_config_loader.py`](config/tenant_config_loader.py) uses `get_app_config().tenants`.
  172 +- **Rewrite asset path**: `config/dictionaries/query_rewrite.dict` exists (resolves the filename mismatch in Finding E).
  173 +- **Observability**: [`GET /admin/config`](api/routes/admin.py) returns a **sanitized effective config tree**; [`GET /admin/config/meta`](api/routes/admin.py) exposes the environment name, `config_hash`, `loaded_files`, `deprecated_keys` (covers the intent of §5.10; the doc proposed `/admin/config/effective`, but the actual implementation lives at `/admin/config`).
  174 +
  175 +So the **"single loader + effective-config visibility"** story is largely realized in code; the docs have not fully caught up.
  176 +
  177 +---
  178 +
  179 +## Documentation issues (highest impact)
  180 +
  181 +### 1. The admin API docs describe `/admin/config` incorrectly
  182 +
  183 +[`docs/搜索API对接指南.md`](docs/搜索API对接指南.md) (near the admin section) and [`docs/搜索API对接指南-06-管理接口(Admin).md`](docs/搜索API对接指南-06-管理接口(Admin).md) still describe `/admin/config` as **per-tenant** JSON (with fields like `tenant_id`, `es_index_name`, `supported_languages`). The actual implementation returns **`AppConfig.sanitized_dict()`** (the full app config with secrets redacted), not tenant summary fields.
  184 +
  185 +**Also missing from these guides:** `GET /admin/config/meta`.
  186 +
  187 +**Health check:** the examples in the split guide include fields that do not exist in [`HealthResponse`](api/models.py) (which has only `status` and `elasticsearch`).
  188 +
  189 +For anyone integrating against the API from the docs alone, this is the most visible "unresolved issue".
  190 +
  191 +### 2. The developer-facing guides still center `services_config` as the "config resolver"
  192 +
  193 +[`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §5.2 still says search config is loaded by **`ConfigLoader`** and services are "resolved" by **`config/services_config`**. §6.2 still lists **`config/services_config.py`** as the primary "resolution entry point". [`docs/QUICKSTART.md`](docs/QUICKSTART.md) §3.1 still says "config resolution: `config/services_config.py`".
  194 +
  195 +The accurate story for the docs: **the canonical entry is `config/loader.py` + `get_app_config()`**; `ConfigLoader` in [`config/config_loader.py`](config/config_loader.py) wraps the unified loader; `services_config` is a **compatibility facade** for existing call sites.
  196 +
  197 +### 3. The redesign doc itself is not a "living" status document
  198 +
  199 +[`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) still reads as a **pure problem statement + goals**, with no short **"implemented vs remaining"** section. That makes it easy to assume nothing was done, or to duplicate work. A small appendix (or a one-page `config/README.md`, see below) would fix this.
  200 +
  201 +### 4. Missing `config/README.md` (recommended in §5.3)
  202 +
  203 +There is still no dedicated **`config/README.md`** describing: the loader entry point, high-level precedence, where dictionaries live, links to `/admin/config` + `/admin/config/meta`, and a link to the redesign doc. It is an explicit deliverable of the redesign and would anchor the "documentation system".
  204 +
  205 +### 5. Minor doc-hygiene issues
  206 +
  207 +- [`docs/QUICKSTART.md`](docs/QUICKSTART.md) §1.9: the line after the env-var items has a stray character: `---·` (likely a typo).
  208 +- [`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §10 **doc index** does not list `config-system-review-and-redesign.md` or the future `config/README.md`.
  209 +
  210 +---
  211 +
  212 +## Gaps between the redesign goals and the current code (docs must not claim "done")
  213 +
  214 +These affect the honesty of the docs:
  215 +
  216 +| Topic | Status |
  217 +|--------|--------|
  218 +| **`config dump` CLI** (§5.10) | Does not exist in `main.py`; operators rely on HTTP or ad-hoc scripts. |
  219 +| **Hidden `["en", "zh"]` fallback** (Phase 3 / Finding D) | Still used in [`indexer/document_transformer.py`](indexer/document_transformer.py), [`suggestion/builder.py`](suggestion/builder.py), etc. |
  220 +| **`os.getenv` outside the loader** (Rules 1–2) | Still used in e.g. [`embeddings/server.py`](embeddings/server.py), [`reranker/server.py`](reranker/server.py), [`api/app.py`](api/app.py); claiming "loader only" in the docs would overstate it. |
  221 +| **Split `base.yaml` / `environments/` / `tenants/*.yaml`** (Phase 5) | Not adopted; still a single [`config/config.yaml`](config/config.yaml). |
  222 +| **Legacy tenant flags** (Phase 6 / Finding H) | [`indexer/README.md`](indexer/README.md) still describes `translate_to_en` / `translate_to_zh` from upstream MySQL (may stay as **upstream schema** documentation; should be distinguished from the Python `tenant_config` model). |
  223 +
  224 +---
  225 +
  226 +## Recommended next steps (docs only, by priority)
  227 +
  228 +1. **Fix the admin API docs** (merged guide + the `-06-` split): the `/admin/config` response shape, add `/admin/config/meta`, align the health-check example with [`HealthResponse`](api/models.py).
  229 +2. **Update DEVELOPER_GUIDE §5–§6 and QUICKSTART §1.9 / §3.1** to describe **`get_app_config()` / `loader.py`** as the primary entry and **`services_config`** as an adapter.
  230 +3. **Add `config/README.md`** (a short ops + developer entry point).
  231 +4. **Add a dated implementation-status table** to `config-system-review-and-redesign.md` (what shipped vs what was deferred), so the review doc does not contradict reality.
  232 +5. **DEVELOPER_GUIDE §9 checklist**: replace "config comes from `services_config`" with wording that allows `get_app_config()` or the slim adapter, consistent with §6.
  233 +
  234 +If useful, I can apply patches for items 1–3 and a short status block in the redesign doc as a follow-up.
  235 +
  236 +
  237 +
  238 +
  239 +
  240 +
  241 +
  242 +
  243 +
  244 +
  245 +
  246 +
  247 +
  248 +
  249 +
  250 +Relevance retrieval optimization:
  251 +Both the original query and its translations need a corresponding trunk (stem) analysis
  252 +The trunk can be a simple part-of-speech based extraction of the nouns
  253 +At search time the original term and its trunk appear as a pair: the original term and trunk_keywords together form one OR query.
  254 +An alternative is to concatenate the original and trunk terms, but then the BM25 tf coefficient needs retuning.
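  The paired OR-query idea above can be sketched as an ES bool clause (field names and boosts here are illustrative, not from the codebase):

```python
def build_trunk_or_query(original, trunk_keywords, fields=None, trunk_boost=0.5):
    # One should-clause for the original query, one (down-weighted) for the
    # extracted noun trunks, so stems add recall without dominating BM25 tf.
    fields = fields or ["title.en^3", "description.en^1"]
    should = [
        {"multi_match": {"query": original, "fields": fields}},
    ]
    if trunk_keywords:
        should.append({
            "multi_match": {
                "query": " ".join(trunk_keywords),
                "fields": fields,
                "boost": trunk_boost,
            }
        })
    return {"bool": {"should": should, "minimum_should_match": 1}}
```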
  255 +
  256 +
  257 +
  258 +
4 nllb-200-distilled-600M性能优化 259 nllb-200-distilled-600M性能优化
5 已完成(2026-03) 260 已完成(2026-03)
6 - CTranslate2 迁移 + float16 转换 261 - CTranslate2 迁移 + float16 转换
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
@@ -38,6 +38,10 @@
38 - `TEXT_MAX_INFLIGHT` 38 - `TEXT_MAX_INFLIGHT`
39 - `IMAGE_MAX_INFLIGHT` 39 - `IMAGE_MAX_INFLIGHT`
40 - 当超过处理能力时,服务会直接返回过载错误,而不是无限排队。 40 - 当超过处理能力时,服务会直接返回过载错误,而不是无限排队。
  41 +- Both the text and image services support a `priority` query parameter (the image service does no queue-jumping; only the admission rule matches the text service):
  42 + - `priority=0` (default): suited to offline indexing; still constrained by `TEXT_MAX_INFLIGHT` / `IMAGE_MAX_INFLIGHT` admission control.
  43 + - `priority>0` (use `1` for online requests): never rejected by admission control, but still counts toward the corresponding text/image inflight.
  44 + - The text service processes high-priority text requests first; the image service does no queue-jumping and handles requests in arrival order.
41 - `GET /health` 会返回各自的 `limits`、`stats`、`cache_enabled` 等状态;`GET /ready` 用于就绪探针。 45 - `GET /health` 会返回各自的 `limits`、`stats`、`cache_enabled` 等状态;`GET /ready` 用于就绪探针。
42 46
43 #### 7.1.1 `POST /embed/text` — 文本向量化 47 #### 7.1.1 `POST /embed/text` — 文本向量化
@@ -59,11 +63,15 @@
59 **完整 curl 示例**: 63 **完整 curl 示例**:
60 64
61 ```bash 65 ```bash
62 -curl -X POST "http://localhost:6005/embed/text?normalize=true" \ 66 +curl -X POST "http://localhost:6005/embed/text?normalize=true&priority=1" \
63 -H "Content-Type: application/json" \ 67 -H "Content-Type: application/json" \
64 -d '["芭比娃娃 儿童玩具", "纯棉T恤 短袖"]' 68 -d '["芭比娃娃 儿童玩具", "纯棉T恤 短袖"]'
65 ``` 69 ```
66 70
  71 +Notes:
  72 +- Online query / real-time requests: pass `priority=1` explicitly
  73 +- Offline indexing / batch backfill: keep the default `priority=0`
  74 +
67 #### 7.1.2 `POST /embed/image` — 图片向量化 75 #### 7.1.2 `POST /embed/image` — 图片向量化
68 76
69 将图片 URL 或路径转为向量,用于以图搜图。 77 将图片 URL 或路径转为向量,用于以图搜图。
@@ -85,11 +93,13 @@ curl -X POST "http://localhost:6005/embed/text?normalize=true" \
85 **完整 curl 示例**: 93 **完整 curl 示例**:
86 94
87 ```bash 95 ```bash
88 -curl -X POST "http://localhost:6008/embed/image?normalize=true" \ 96 +curl -X POST "http://localhost:6008/embed/image?normalize=true&priority=1" \
89 -H "Content-Type: application/json" \ 97 -H "Content-Type: application/json" \
90 -d '["https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg"]' 98 -d '["https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg"]'
91 ``` 99 ```
92 100
  101 +Real-time scenarios such as online image-to-image search can pass `priority=1`; offline index backfill keeps the default `priority=0`.
  102 +
93 #### 7.1.3 `GET /health` — 健康检查 103 #### 7.1.3 `GET /health` — 健康检查
94 104
95 ```bash 105 ```bash
@@ -118,6 +128,8 @@ curl "http://localhost:6008/ready"
118 - cache key 已区分 `normalize=true/false`,避免不同归一化策略命中同一条缓存。 128 - cache key 已区分 `normalize=true/false`,避免不同归一化策略命中同一条缓存。
119 - 当服务端发现请求是 **full-cache-hit** 时,会直接返回,不占用模型并发槽位。 129 - 当服务端发现请求是 **full-cache-hit** 时,会直接返回,不占用模型并发槽位。
120 - 当服务端发现超过 `TEXT_MAX_INFLIGHT` / `IMAGE_MAX_INFLIGHT` 时,会直接拒绝,而不是无限排队。 130 - 当服务端发现超过 `TEXT_MAX_INFLIGHT` / `IMAGE_MAX_INFLIGHT` 时,会直接拒绝,而不是无限排队。
  131 +- For `POST /embed/text`, `priority=0` is rejected outright under the inflight rule above; `priority>0` is never rejected by admission but still counts toward inflight, and is served ahead of `priority=0` requests when the server queues.
  132 +- For `POST /embed/image`, `priority=0` is constrained by `IMAGE_MAX_INFLIGHT`; `priority>0` is never rejected by admission but still counts toward inflight (no queue-jumping).
121 133
122 #### 7.1.6 TEI 统一调优建议(主服务) 134 #### 7.1.6 TEI 统一调优建议(主服务)
123 135
@@ -252,9 +264,9 @@ curl "http://localhost:6007/health"
252 - 如果是en-zh互译、期待更高的速度,可以考虑`opus-mt-zh-en` / `opus-mt-en-zh`。(质量未详细评测,一些文章说比blib-200-600m更好,但是我看了些case感觉要差不少) 264 - 如果是en-zh互译、期待更高的速度,可以考虑`opus-mt-zh-en` / `opus-mt-en-zh`。(质量未详细评测,一些文章说比blib-200-600m更好,但是我看了些case感觉要差不少)
253 265
254 **实时翻译选型建议**: 266 **实时翻译选型建议**:
255 -- 在线 query 翻译如果只是 `en/zh` 互译,优先使用 `opus-mt-zh-en` 或 `opus-mt-en-zh`,它们是当前已测本地模型里延迟最低的一档 267 +- 在线 query 翻译如果只是 `en/zh` 互译,优先使用 `opus-mt-zh-en` 或 `opus-mt-en-zh`
256 - 如果涉及其他语言,或对质量要求高于本地轻量模型,优先考虑 `deepl`。 268 - 如果涉及其他语言,或对质量要求高于本地轻量模型,优先考虑 `deepl`。
257 -- `nllb-200-distilled-600m` 不建议作为在线 query 翻译默认方案;我们在 `Tesla T4` 上测到 `batch_size=1` 时,`zh -> en` p50 约 `292.54 ms`、p95 约 `624.12 ms`,`en -> zh` p50 约 `481.61 ms`、p95 约 `1171.71 ms` 269 +- `nllb-200-distilled-600m` 不建议作为在线 query 翻译默认方案;我们在 `Tesla T4` 上测到 `batch_size=1` 时,根据query长短,耗时大概在70-150ms之间
258 270
259 **Batch Size / 调用方式建议**: 271 **Batch Size / 调用方式建议**:
260 - 本接口支持 `text: string[]`;离线或批量索引翻译时,应尽量合并请求,让底层 backend 发挥批处理能力。 272 - 本接口支持 `text: string[]`;离线或批量索引翻译时,应尽量合并请求,让底层 backend 发挥批处理能力。
docs/搜索API速查表.md renamed to docs/搜索API对接指南-速查表.md
docs/相关性检索优化说明.md
@@ -17,9 +17,9 @@
17 查询链路(文本相关): 17 查询链路(文本相关):
18 18
19 1. `QueryParser.parse()` 19 1. `QueryParser.parse()`
20 - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages` 20 + 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)
21 2. `ESQueryBuilder._build_advanced_text_query()` 21 2. `ESQueryBuilder._build_advanced_text_query()`
22 - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`) 22 + 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)
23 3. `build_query()` 23 3. `build_query()`
24 统一走文本策略,不再有布尔 AST 枝路。 24 统一走文本策略,不再有布尔 AST 枝路。
25 25
@@ -40,14 +40,20 @@
40 3. 若第 2 步翻译部分失败或全部失败: 40 3. 若第 2 步翻译部分失败或全部失败:
41 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 41 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。
42 42
43 -### 3.2 翻译等待策略 43 +### 3.2 翻译与向量:并发提交与共享超时
44 44
45 -`QueryParser.parse()` 45 +`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`
46 46
47 -- 当源语种不在 `index_languages`:使用 `translate_multi_async(...)` 并等待 futures 收敛  
48 -- 当源语种在 `index_languages`:使用 `translate_multi(..., async_mode=True)`,优先缓存命中,未命中可后台补齐 47 +- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
  48 +- **Query embedding** (if `enable_text_embedding` is on and the domain is default): one more `text_encoder.encode` task is submitted.
  49 +These tasks enter the **same** future set; e.g. with tenant index `[zh, en]` and a detected language **not** in the index, that is typically **2 translations + 1 embedding, 3 concurrent tasks** sharing one timeout.
49 50
50 -这保证了“必须翻译才能检索”的场景不会直接空跑。 51 +**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`):
  52 +
  53 +- **In the index**: `translation_embedding_wait_budget_ms_source_in_index` (short default, e.g. 80ms). Primary recall already hits the source-language fields, so slower translation/embedding is tolerable.
  54 +- **Not in the index**: `translation_embedding_wait_budget_ms_source_not_in_index` (longer default, e.g. 200ms). Translation is critical for searchable text, so give it time.
  55 +
  56 +Tasks still unfinished at the deadline are discarded with a warning and parsing continues (possibly without some translations or without the query embedding).
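  The submit-then-single-wait pattern described above can be sketched with concurrent.futures (names are illustrative; the real QueryParser internals may differ):

```python
import concurrent.futures as cf

def run_with_budget(pool, tasks, budget_ms):
    # tasks: {name: zero-arg callable}; every future shares one wait() deadline.
    futures = {name: pool.submit(fn) for name, fn in tasks.items()}
    done, not_done = cf.wait(futures.values(), timeout=budget_ms / 1000.0)
    results = {name: f.result() for name, f in futures.items() if f in done}
    for name, f in futures.items():
        if f in not_done:
            f.cancel()  # discarded; the parser would log a warning here
    return results
```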
51 57
52 ## 4. 统一文本召回表达式 58 ## 4. 统一文本召回表达式
53 59
@@ -68,8 +74,16 @@
68 74
69 最终按 `bool.should` 组合,`minimum_should_match: 1`。 75 最终按 `bool.should` 组合,`minimum_should_match: 1`。
70 76
  77 +> **Appendix: mixed-script auxiliary recall**
  78 +> When Chinese and English (or multiple scripts) are mixed, recall is lifted slightly: `QueryParser` tags `contains_chinese` (query text has CJK) and `contains_english` (tokenization yields a pure-English token of length >= 3); `ESQueryBuilder` merges the **other language's** counterpart fields into the same `fields` list of a language's `multi_match` (limited by `index_languages`), with the merged-in boost equal to the configured value times **`mixed_script_merged_field_boost_scale` (default 0.8, an `ESQueryBuilder` constructor parameter)**. The same applies to `fallback_original_query_*`. Fields are merged internally as `(path, boost)` tuples and only then formatted into ES strings.
  79 +
71 ## 5. 关键配置项(文本策略) 80 ## 5. 关键配置项(文本策略)
72 81
  82 +Items under `query_config` related to parse-stage waiting:
  83 +
  84 +- `translation_embedding_wait_budget_ms_source_in_index`
  85 +- `translation_embedding_wait_budget_ms_source_not_in_index`
  86 +
73 位于 `config/config.yaml -> query_config.text_query_strategy`: 87 位于 `config/config.yaml -> query_config.text_query_strategy`:
74 88
75 - `base_minimum_should_match` 89 - `base_minimum_should_match`
@@ -137,6 +151,7 @@
137 - `query_text_by_lang` 151 - `query_text_by_lang`
138 - `source_in_index_languages` 152 - `source_in_index_languages`
139 - `index_languages` 153 - `index_languages`
  154 + - `contains_chinese` / `contains_english`
140 - `ESQueryBuilder` 负责“表达式展开”: 155 - `ESQueryBuilder` 负责“表达式展开”:
141 - 动态字段组装 156 - 动态字段组装
142 - 子句权重分配 157 - 子句权重分配
embeddings/README.md
@@ -30,13 +30,13 @@
30 - 文本服务(默认 `6005`) 30 - 文本服务(默认 `6005`)
31 - `POST /embed/text` 31 - `POST /embed/text`
32 - 请求体:`["文本1", "文本2", ...]` 32 - 请求体:`["文本1", "文本2", ...]`
33 - - 可选 query 参数:`normalize=true|false` 33 + - 可选 query 参数:`normalize=true|false`、`priority=0|1`
34 - 返回:`[[...], [...], ...]` 34 - 返回:`[[...], [...], ...]`
35 - 健康接口:`GET /health`、`GET /ready` 35 - 健康接口:`GET /health`、`GET /ready`
36 - 图片服务(默认 `6008`) 36 - 图片服务(默认 `6008`)
37 - `POST /embed/image` 37 - `POST /embed/image`
38 - 请求体:`["url或本地路径1", ...]` 38 - 请求体:`["url或本地路径1", ...]`
39 - - 可选 query 参数:`normalize=true|false` 39 + - 可选 query 参数:`normalize=true|false`、`priority=0|1`
40 - 返回:`[[...], [...], ...]` 40 - 返回:`[[...], [...], ...]`
41 - 健康接口:`GET /health`、`GET /ready` 41 - 健康接口:`GET /health`、`GET /ready`
42 42
@@ -61,6 +61,11 @@
61 - 图片服务可以配置得比文本更严格。 61 - 图片服务可以配置得比文本更严格。
62 - 请求若是 full-cache-hit,会在服务端直接返回,不占用模型并发槽位。 62 - 请求若是 full-cache-hit,会在服务端直接返回,不占用模型并发槽位。
63 - 超过处理能力时直接拒绝,比无限排队更稳定。 63 - 超过处理能力时直接拒绝,比无限排队更稳定。
  64 +- The text service supports `priority`:
  65 + - `priority=0` (default, suited to offline indexing) is still limited by `TEXT_MAX_INFLIGHT`; over the limit it returns overload directly.
  66 + - `priority>0` (use `1` for online queries) is never rejected by admission control, but still counts toward inflight.
  67 + - Internally the text service uses dual-queue scheduling and consumes high-priority requests first, so online requests are not stuck behind offline batch jobs.
  68 +- The image service also supports `priority` (same semantics as text, counted against `IMAGE_MAX_INFLIGHT`; no queue-jumping, only the admission rule differs).
64 69
65 ### 图片向量:clip-as-service(推荐) 70 ### 图片向量:clip-as-service(推荐)
66 71
@@ -86,6 +91,14 @@
86 - `CLIP_AS_SERVICE_MODEL_NAME=CN-CLIP/ViT-L-14` 91 - `CLIP_AS_SERVICE_MODEL_NAME=CN-CLIP/ViT-L-14`
87 - `scripts/start_cnclip_service.sh` 默认会读取同一个 `CLIP_AS_SERVICE_MODEL_NAME`,也可用 `CNCLIP_MODEL_NAME` 或 `--model-name` 临时覆盖 92 - `scripts/start_cnclip_service.sh` 默认会读取同一个 `CLIP_AS_SERVICE_MODEL_NAME`,也可用 `CNCLIP_MODEL_NAME` 或 `--model-name` 临时覆盖
88 93
  94 +### Performance and load testing (reusing the repo scripts)
  95 +
  96 +- API-level load test (same methodology as `perf_reports/2026-03-12/matrix_report/` etc.): `scripts/perf_api_benchmark.py`
  97 + - Example: `python scripts/perf_api_benchmark.py --scenario embed_text --duration 30 --concurrency 20`
  98 + - Text/image embedding can carry `priority` (same admission semantics as production): `--embed-text-priority 1`, `--embed-image-priority 1`
  99 + - Custom request templates: `--cases-file scripts/perf_cases.json.example`
  100 +- Historical matrix results and notes: `perf_reports/2026-03-12/matrix_report/summary.md`.
  101 +
89 ### 启动服务 102 ### 启动服务
90 103
91 使用仓库脚本启动: 104 使用仓库脚本启动:
embeddings/image_encoder.py
@@ -35,7 +35,12 @@ class CLIPImageEncoder:
35 namespace="image", 35 namespace="image",
36 ) 36 )
37 37
38 - def _call_service(self, request_data: List[str], normalize_embeddings: bool = True) -> List[Any]: 38 + def _call_service(
  39 + self,
  40 + request_data: List[str],
  41 + normalize_embeddings: bool = True,
  42 + priority: int = 0,
  43 + ) -> List[Any]:
39 """ 44 """
40 Call the embedding service API. 45 Call the embedding service API.
41 46
@@ -48,7 +53,10 @@
48 try: 53 try:
49 response = requests.post( 54 response = requests.post(
50 self.endpoint, 55 self.endpoint,
51 - params={"normalize": "true" if normalize_embeddings else "false"}, 56 + params={
  57 + "normalize": "true" if normalize_embeddings else "false",
  58 + "priority": max(0, int(priority)),
  59 + },
52 json=request_data, 60 json=request_data,
53 timeout=60 61 timeout=60
54 ) 62 )
@@ -66,7 +74,12 @@
66 """ 74 """
67 raise NotImplementedError("encode_image with PIL Image is not supported by embedding service") 75 raise NotImplementedError("encode_image with PIL Image is not supported by embedding service")
68 76
69 - def encode_image_from_url(self, url: str, normalize_embeddings: bool = True) -> np.ndarray: 77 + def encode_image_from_url(
  78 + self,
  79 + url: str,
  80 + normalize_embeddings: bool = True,
  81 + priority: int = 0,
  82 + ) -> np.ndarray:
70 """ 83 """
71 Generate image embedding via network service using URL. 84 Generate image embedding via network service using URL.
72 85
@@ -81,7 +94,11 @@
81 if cached is not None: 94 if cached is not None:
82 return cached 95 return cached
83 96
84 - response_data = self._call_service([url], normalize_embeddings=normalize_embeddings) 97 + response_data = self._call_service(
  98 + [url],
  99 + normalize_embeddings=normalize_embeddings,
  100 + priority=priority,
  101 + )
85 if not response_data or len(response_data) != 1 or response_data[0] is None: 102 if not response_data or len(response_data) != 1 or response_data[0] is None:
86 raise RuntimeError(f"No image embedding returned for URL: {url}") 103 raise RuntimeError(f"No image embedding returned for URL: {url}")
87 vec = np.array(response_data[0], dtype=np.float32) 104 vec = np.array(response_data[0], dtype=np.float32)
@@ -95,6 +112,7 @@ class CLIPImageEncoder:
95 images: List[Union[str, Image.Image]], 112 images: List[Union[str, Image.Image]],
96 batch_size: int = 8, 113 batch_size: int = 8,
97 normalize_embeddings: bool = True, 114 normalize_embeddings: bool = True,
  115 + priority: int = 0,
98 ) -> List[np.ndarray]: 116 ) -> List[np.ndarray]:
99 """ 117 """
100 Encode a batch of images efficiently via network service. 118 Encode a batch of images efficiently via network service.
@@ -129,7 +147,11 @@ class CLIPImageEncoder:
129 147
130 for i in range(0, len(pending_urls), batch_size): 148 for i in range(0, len(pending_urls), batch_size):
131 batch_urls = pending_urls[i : i + batch_size] 149 batch_urls = pending_urls[i : i + batch_size]
132 - response_data = self._call_service(batch_urls, normalize_embeddings=normalize_embeddings) 150 + response_data = self._call_service(
  151 + batch_urls,
  152 + normalize_embeddings=normalize_embeddings,
  153 + priority=priority,
  154 + )
133 if not response_data or len(response_data) != len(batch_urls): 155 if not response_data or len(response_data) != len(batch_urls):
134 raise RuntimeError( 156 raise RuntimeError(
135 f"Image embedding response length mismatch: expected {len(batch_urls)}, " 157 f"Image embedding response length mismatch: expected {len(batch_urls)}, "
@@ -153,6 +175,7 @@ class CLIPImageEncoder:
153 urls: List[str], 175 urls: List[str],
154 batch_size: Optional[int] = None, 176 batch_size: Optional[int] = None,
155 normalize_embeddings: bool = True, 177 normalize_embeddings: bool = True,
  178 + priority: int = 0,
156 ) -> List[np.ndarray]: 179 ) -> List[np.ndarray]:
157 """ 180 """
158 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。 181 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。
@@ -168,4 +191,5 @@ class CLIPImageEncoder:
168 urls, 191 urls,
169 batch_size=batch_size or 8, 192 batch_size=batch_size or 8,
170 normalize_embeddings=normalize_embeddings, 193 normalize_embeddings=normalize_embeddings,
  194 + priority=priority,
171 ) 195 )
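The image-encoder changes above all do one thing: thread a `priority` value through to the embedding service as a query parameter, clamped to be non-negative. Stripped to just that parameter handling, the behavior can be sketched as follows (hypothetical helper name; the real `_call_service` also posts the JSON body and enforces a 60s timeout):

```python
def build_embed_params(normalize: bool = True, priority: int = 0) -> dict:
    """Sketch of the query params _call_service sends after this change."""
    return {
        # normalize is serialized as a lowercase string flag
        "normalize": "true" if normalize else "false",
        # negative priorities are clamped to 0 (the normal-priority lane)
        "priority": max(0, int(priority)),
    }
```

Callers such as `encode_image_from_url` and `encode_images_batch` simply forward their own `priority` argument into these params, defaulting to 0 so existing callers are unaffected.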
embeddings/server.py
@@ -129,7 +129,7 @@ _TEXT_REQUEST_TIMEOUT_SEC = max(
129 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30")) 129 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30"))
130 ) 130 )
131 _TEXT_MAX_INFLIGHT = max(1, int(os.getenv("TEXT_MAX_INFLIGHT", "32"))) 131 _TEXT_MAX_INFLIGHT = max(1, int(os.getenv("TEXT_MAX_INFLIGHT", "32")))
132 -_IMAGE_MAX_INFLIGHT = max(1, int(os.getenv("IMAGE_MAX_INFLIGHT", "1"))) 132 +_IMAGE_MAX_INFLIGHT = max(1, int(os.getenv("IMAGE_MAX_INFLIGHT", "20")))
133 _OVERLOAD_STATUS_CODE = int(os.getenv("EMBEDDING_OVERLOAD_STATUS_CODE", "503")) 133 _OVERLOAD_STATUS_CODE = int(os.getenv("EMBEDDING_OVERLOAD_STATUS_CODE", "503"))
134 _LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3"))) 134 _LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3")))
135 _LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120"))) 135 _LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120")))
@@ -206,23 +206,24 @@ class _InflightLimiter:
206 def __init__(self, name: str, limit: int): 206 def __init__(self, name: str, limit: int):
207 self.name = name 207 self.name = name
208 self.limit = max(1, int(limit)) 208 self.limit = max(1, int(limit))
209 - self._sem = threading.BoundedSemaphore(self.limit)  
210 self._lock = threading.Lock() 209 self._lock = threading.Lock()
211 self._active = 0 210 self._active = 0
212 self._rejected = 0 211 self._rejected = 0
213 self._completed = 0 212 self._completed = 0
214 self._failed = 0 213 self._failed = 0
215 self._max_active = 0 214 self._max_active = 0
  215 + self._priority_bypass_total = 0
216 216
217 - def try_acquire(self) -> tuple[bool, int]:  
218 - if not self._sem.acquire(blocking=False):  
219 - with self._lock: 217 + def try_acquire(self, *, bypass_limit: bool = False) -> tuple[bool, int]:
  218 + with self._lock:
  219 + if not bypass_limit and self._active >= self.limit:
220 self._rejected += 1 220 self._rejected += 1
221 active = self._active 221 active = self._active
222 - return False, active  
223 - with self._lock: 222 + return False, active
224 self._active += 1 223 self._active += 1
225 self._max_active = max(self._max_active, self._active) 224 self._max_active = max(self._max_active, self._active)
  225 + if bypass_limit:
  226 + self._priority_bypass_total += 1
226 active = self._active 227 active = self._active
227 return True, active 228 return True, active
228 229
@@ -234,7 +235,6 @@ class _InflightLimiter:
234 else: 235 else:
235 self._failed += 1 236 self._failed += 1
236 active = self._active 237 active = self._active
237 - self._sem.release()  
238 return active 238 return active
239 239
240 def snapshot(self) -> Dict[str, int]: 240 def snapshot(self) -> Dict[str, int]:
@@ -246,9 +246,157 @@ class _InflightLimiter:
246 "completed_total": self._completed, 246 "completed_total": self._completed,
247 "failed_total": self._failed, 247 "failed_total": self._failed,
248 "max_active": self._max_active, 248 "max_active": self._max_active,
  249 + "priority_bypass_total": self._priority_bypass_total,
249 } 250 }
250 251
251 252
  253 +def _effective_priority(priority: int) -> int:
  254 + return 1 if int(priority) > 0 else 0
  255 +
  256 +
  257 +def _priority_label(priority: int) -> str:
  258 + return "high" if _effective_priority(priority) > 0 else "normal"
  259 +
  260 +
  261 +@dataclass
  262 +class _TextDispatchTask:
  263 + normalized: List[str]
  264 + effective_normalize: bool
  265 + request_id: str
  266 + priority: int
  267 + created_at: float
  268 + done: threading.Event
  269 + result: Optional[_EmbedResult] = None
  270 + error: Optional[Exception] = None
  271 +
  272 +
  273 +_text_dispatch_high_queue: "deque[_TextDispatchTask]" = deque()
  274 +_text_dispatch_normal_queue: "deque[_TextDispatchTask]" = deque()
  275 +_text_dispatch_cv = threading.Condition()
  276 +_text_dispatch_workers: List[threading.Thread] = []
  277 +_text_dispatch_worker_stop = False
  278 +_text_dispatch_worker_count = 0
  279 +
  280 +
  281 +def _text_dispatch_queue_depth() -> Dict[str, int]:
  282 + with _text_dispatch_cv:
  283 + return {
  284 + "high": len(_text_dispatch_high_queue),
  285 + "normal": len(_text_dispatch_normal_queue),
  286 + "total": len(_text_dispatch_high_queue) + len(_text_dispatch_normal_queue),
  287 + }
  288 +
  289 +
  290 +def _pop_text_dispatch_task_locked() -> Optional["_TextDispatchTask"]:
  291 + if _text_dispatch_high_queue:
  292 + return _text_dispatch_high_queue.popleft()
  293 + if _text_dispatch_normal_queue:
  294 + return _text_dispatch_normal_queue.popleft()
  295 + return None
  296 +
  297 +
  298 +def _start_text_dispatch_workers() -> None:
  299 + global _text_dispatch_workers, _text_dispatch_worker_stop, _text_dispatch_worker_count
  300 + if _text_model is None:
  301 + return
  302 + target_worker_count = 1 if _text_backend_name == "local_st" else _TEXT_MAX_INFLIGHT
  303 + alive_workers = [worker for worker in _text_dispatch_workers if worker.is_alive()]
  304 + if len(alive_workers) == target_worker_count:
  305 + _text_dispatch_workers = alive_workers
  306 + _text_dispatch_worker_count = target_worker_count
  307 + return
  308 + _text_dispatch_worker_stop = False
  309 + _text_dispatch_worker_count = target_worker_count
  310 + _text_dispatch_workers = []
  311 + for idx in range(target_worker_count):
  312 + worker = threading.Thread(
  313 + target=_text_dispatch_worker_loop,
  314 + args=(idx,),
  315 + name=f"embed-text-dispatch-{idx}",
  316 + daemon=True,
  317 + )
  318 + worker.start()
  319 + _text_dispatch_workers.append(worker)
  320 + logger.info(
  321 + "Started text dispatch workers | backend=%s workers=%d",
  322 + _text_backend_name,
  323 + target_worker_count,
  324 + )
  325 +
  326 +
  327 +def _stop_text_dispatch_workers() -> None:
  328 + global _text_dispatch_worker_stop
  329 + with _text_dispatch_cv:
  330 + _text_dispatch_worker_stop = True
  331 + _text_dispatch_cv.notify_all()
  332 +
  333 +
  334 +def _text_dispatch_worker_loop(worker_idx: int) -> None:
  335 + while True:
  336 + with _text_dispatch_cv:
  337 + while (
  338 + not _text_dispatch_high_queue
  339 + and not _text_dispatch_normal_queue
  340 + and not _text_dispatch_worker_stop
  341 + ):
  342 + _text_dispatch_cv.wait()
  343 + if _text_dispatch_worker_stop:
  344 + return
  345 + task = _pop_text_dispatch_task_locked()
  346 + if task is None:
  347 + continue
  348 + try:
  349 + queue_wait_ms = (time.perf_counter() - task.created_at) * 1000.0
  350 + logger.info(
  351 + "text dispatch start | worker=%d priority=%s inputs=%d queue_wait_ms=%.2f",
  352 + worker_idx,
  353 + _priority_label(task.priority),
  354 + len(task.normalized),
  355 + queue_wait_ms,
  356 + extra=_request_log_extra(task.request_id),
  357 + )
  358 + task.result = _embed_text_impl(
  359 + task.normalized,
  360 + task.effective_normalize,
  361 + task.request_id,
  362 + task.priority,
  363 + )
  364 + except Exception as exc:
  365 + task.error = exc
  366 + finally:
  367 + task.done.set()
  368 +
  369 +
  370 +def _submit_text_dispatch_and_wait(
  371 + normalized: List[str],
  372 + effective_normalize: bool,
  373 + request_id: str,
  374 + priority: int,
  375 +) -> _EmbedResult:
  376 + if not any(worker.is_alive() for worker in _text_dispatch_workers):
  377 + _start_text_dispatch_workers()
  378 + task = _TextDispatchTask(
  379 + normalized=normalized,
  380 + effective_normalize=effective_normalize,
  381 + request_id=request_id,
  382 + priority=_effective_priority(priority),
  383 + created_at=time.perf_counter(),
  384 + done=threading.Event(),
  385 + )
  386 + with _text_dispatch_cv:
  387 + if task.priority > 0:
  388 + _text_dispatch_high_queue.append(task)
  389 + else:
  390 + _text_dispatch_normal_queue.append(task)
  391 + _text_dispatch_cv.notify()
  392 + task.done.wait()
  393 + if task.error is not None:
  394 + raise task.error
  395 + if task.result is None:
  396 + raise RuntimeError("Text dispatch worker returned empty result")
  397 + return task.result
  398 +
  399 +
252 _text_request_limiter = _InflightLimiter(name="text", limit=_TEXT_MAX_INFLIGHT) 400 _text_request_limiter = _InflightLimiter(name="text", limit=_TEXT_MAX_INFLIGHT)
253 _image_request_limiter = _InflightLimiter(name="image", limit=_IMAGE_MAX_INFLIGHT) 401 _image_request_limiter = _InflightLimiter(name="image", limit=_IMAGE_MAX_INFLIGHT)
254 _text_stats = _EndpointStats(name="text") 402 _text_stats = _EndpointStats(name="text")
@@ -261,6 +409,7 @@ _image_cache = RedisEmbeddingCache(key_prefix=_CACHE_PREFIX, namespace="image")
261 class _SingleTextTask: 409 class _SingleTextTask:
262 text: str 410 text: str
263 normalize: bool 411 normalize: bool
  412 + priority: int
264 created_at: float 413 created_at: float
265 request_id: str 414 request_id: str
266 done: threading.Event 415 done: threading.Event
@@ -268,12 +417,30 @@ class _SingleTextTask:
268 error: Optional[Exception] = None 417 error: Optional[Exception] = None
269 418
270 419
271 -_text_single_queue: "deque[_SingleTextTask]" = deque() 420 +_text_single_high_queue: "deque[_SingleTextTask]" = deque()
  421 +_text_single_normal_queue: "deque[_SingleTextTask]" = deque()
272 _text_single_queue_cv = threading.Condition() 422 _text_single_queue_cv = threading.Condition()
273 _text_batch_worker: Optional[threading.Thread] = None 423 _text_batch_worker: Optional[threading.Thread] = None
274 _text_batch_worker_stop = False 424 _text_batch_worker_stop = False
275 425
276 426
  427 +def _text_microbatch_queue_depth() -> Dict[str, int]:
  428 + with _text_single_queue_cv:
  429 + return {
  430 + "high": len(_text_single_high_queue),
  431 + "normal": len(_text_single_normal_queue),
  432 + "total": len(_text_single_high_queue) + len(_text_single_normal_queue),
  433 + }
  434 +
  435 +
  436 +def _pop_single_text_task_locked() -> Optional["_SingleTextTask"]:
  437 + if _text_single_high_queue:
  438 + return _text_single_high_queue.popleft()
  439 + if _text_single_normal_queue:
  440 + return _text_single_normal_queue.popleft()
  441 + return None
  442 +
  443 +
277 def _compact_preview(text: str, max_chars: int) -> str: 444 def _compact_preview(text: str, max_chars: int) -> str:
278 compact = " ".join((text or "").split()) 445 compact = " ".join((text or "").split())
279 if len(compact) <= max_chars: 446 if len(compact) <= max_chars:
@@ -356,30 +523,41 @@ def _text_batch_worker_loop() -> None:
356 max_batch = max(1, int(CONFIG.TEXT_BATCH_SIZE)) 523 max_batch = max(1, int(CONFIG.TEXT_BATCH_SIZE))
357 while True: 524 while True:
358 with _text_single_queue_cv: 525 with _text_single_queue_cv:
359 - while not _text_single_queue and not _text_batch_worker_stop: 526 + while (
  527 + not _text_single_high_queue
  528 + and not _text_single_normal_queue
  529 + and not _text_batch_worker_stop
  530 + ):
360 _text_single_queue_cv.wait() 531 _text_single_queue_cv.wait()
361 if _text_batch_worker_stop: 532 if _text_batch_worker_stop:
362 return 533 return
363 534
364 - batch: List[_SingleTextTask] = [_text_single_queue.popleft()] 535 + first_task = _pop_single_text_task_locked()
  536 + if first_task is None:
  537 + continue
  538 + batch: List[_SingleTextTask] = [first_task]
365 deadline = time.perf_counter() + _TEXT_MICROBATCH_WINDOW_SEC 539 deadline = time.perf_counter() + _TEXT_MICROBATCH_WINDOW_SEC
366 540
367 while len(batch) < max_batch: 541 while len(batch) < max_batch:
368 remaining = deadline - time.perf_counter() 542 remaining = deadline - time.perf_counter()
369 if remaining <= 0: 543 if remaining <= 0:
370 break 544 break
371 - if not _text_single_queue: 545 + if not _text_single_high_queue and not _text_single_normal_queue:
372 _text_single_queue_cv.wait(timeout=remaining) 546 _text_single_queue_cv.wait(timeout=remaining)
373 continue 547 continue
374 - while _text_single_queue and len(batch) < max_batch:  
375 - batch.append(_text_single_queue.popleft()) 548 + while len(batch) < max_batch:
  549 + next_task = _pop_single_text_task_locked()
  550 + if next_task is None:
  551 + break
  552 + batch.append(next_task)
376 553
377 try: 554 try:
378 queue_wait_ms = [(time.perf_counter() - task.created_at) * 1000.0 for task in batch] 555 queue_wait_ms = [(time.perf_counter() - task.created_at) * 1000.0 for task in batch]
379 reqids = [task.request_id for task in batch] 556 reqids = [task.request_id for task in batch]
380 logger.info( 557 logger.info(
381 - "text microbatch dispatch | size=%d queue_wait_ms_min=%.2f queue_wait_ms_max=%.2f reqids=%s preview=%s", 558 + "text microbatch dispatch | size=%d priority=%s queue_wait_ms_min=%.2f queue_wait_ms_max=%.2f reqids=%s preview=%s",
382 len(batch), 559 len(batch),
  560 + _priority_label(max(task.priority for task in batch)),
383 min(queue_wait_ms) if queue_wait_ms else 0.0, 561 min(queue_wait_ms) if queue_wait_ms else 0.0,
384 max(queue_wait_ms) if queue_wait_ms else 0.0, 562 max(queue_wait_ms) if queue_wait_ms else 0.0,
385 reqids, 563 reqids,
@@ -423,22 +601,32 @@ def _text_batch_worker_loop() -> None:
423 task.done.set() 601 task.done.set()
424 602
425 603
426 -def _encode_single_text_with_microbatch(text: str, normalize: bool, request_id: str) -> List[float]: 604 +def _encode_single_text_with_microbatch(
  605 + text: str,
  606 + normalize: bool,
  607 + request_id: str,
  608 + priority: int,
  609 +) -> List[float]:
427 task = _SingleTextTask( 610 task = _SingleTextTask(
428 text=text, 611 text=text,
429 normalize=normalize, 612 normalize=normalize,
  613 + priority=_effective_priority(priority),
430 created_at=time.perf_counter(), 614 created_at=time.perf_counter(),
431 request_id=request_id, 615 request_id=request_id,
432 done=threading.Event(), 616 done=threading.Event(),
433 ) 617 )
434 with _text_single_queue_cv: 618 with _text_single_queue_cv:
435 - _text_single_queue.append(task) 619 + if task.priority > 0:
  620 + _text_single_high_queue.append(task)
  621 + else:
  622 + _text_single_normal_queue.append(task)
436 _text_single_queue_cv.notify() 623 _text_single_queue_cv.notify()
437 624
438 if not task.done.wait(timeout=_TEXT_REQUEST_TIMEOUT_SEC): 625 if not task.done.wait(timeout=_TEXT_REQUEST_TIMEOUT_SEC):
439 with _text_single_queue_cv: 626 with _text_single_queue_cv:
  627 + queue = _text_single_high_queue if task.priority > 0 else _text_single_normal_queue
440 try: 628 try:
441 - _text_single_queue.remove(task) 629 + queue.remove(task)
442 except ValueError: 630 except ValueError:
443 pass 631 pass
444 raise RuntimeError( 632 raise RuntimeError(
@@ -489,6 +677,7 @@ def load_models():
489 f"Unsupported embedding backend: {backend_name}. " 677 f"Unsupported embedding backend: {backend_name}. "
490 "Supported: tei, local_st" 678 "Supported: tei, local_st"
491 ) 679 )
  680 + _start_text_dispatch_workers()
492 logger.info("Text backend loaded successfully: %s", _text_backend_name) 681 logger.info("Text backend loaded successfully: %s", _text_backend_name)
493 except Exception as e: 682 except Exception as e:
494 logger.error("Failed to load text model: %s", e, exc_info=True) 683 logger.error("Failed to load text model: %s", e, exc_info=True)
@@ -532,6 +721,7 @@ def load_models():
532 @app.on_event("shutdown") 721 @app.on_event("shutdown")
533 def stop_workers() -> None: 722 def stop_workers() -> None:
534 _stop_text_batch_worker() 723 _stop_text_batch_worker()
  724 + _stop_text_dispatch_workers()
535 725
536 726
537 def _normalize_vector(vec: np.ndarray) -> np.ndarray: 727 def _normalize_vector(vec: np.ndarray) -> np.ndarray:
@@ -602,6 +792,8 @@ def _try_full_image_cache_hit(
602 def health() -> Dict[str, Any]: 792 def health() -> Dict[str, Any]:
603 """Health check endpoint. Returns status and current throttling stats.""" 793 """Health check endpoint. Returns status and current throttling stats."""
604 ready = (not open_text_model or _text_model is not None) and (not open_image_model or _image_model is not None) 794 ready = (not open_text_model or _text_model is not None) and (not open_image_model or _image_model is not None)
  795 + text_dispatch_depth = _text_dispatch_queue_depth()
  796 + text_microbatch_depth = _text_microbatch_queue_depth()
605 return { 797 return {
606 "status": "ok" if ready else "degraded", 798 "status": "ok" if ready else "degraded",
607 "service_kind": _SERVICE_KIND, 799 "service_kind": _SERVICE_KIND,
@@ -620,9 +812,18 @@ def health() -> Dict[str, Any]:
620 "text": _text_stats.snapshot(), 812 "text": _text_stats.snapshot(),
621 "image": _image_stats.snapshot(), 813 "image": _image_stats.snapshot(),
622 }, 814 },
  815 + "text_dispatch": {
  816 + "workers": _text_dispatch_worker_count,
  817 + "workers_alive": sum(1 for worker in _text_dispatch_workers if worker.is_alive()),
  818 + "queue_depth": text_dispatch_depth["total"],
  819 + "queue_depth_high": text_dispatch_depth["high"],
  820 + "queue_depth_normal": text_dispatch_depth["normal"],
  821 + },
623 "text_microbatch": { 822 "text_microbatch": {
624 "window_ms": round(_TEXT_MICROBATCH_WINDOW_SEC * 1000.0, 3), 823 "window_ms": round(_TEXT_MICROBATCH_WINDOW_SEC * 1000.0, 3),
625 - "queue_depth": len(_text_single_queue), 824 + "queue_depth": text_microbatch_depth["total"],
  825 + "queue_depth_high": text_microbatch_depth["high"],
  826 + "queue_depth_normal": text_microbatch_depth["normal"],
626 "worker_alive": bool(_text_batch_worker is not None and _text_batch_worker.is_alive()), 827 "worker_alive": bool(_text_batch_worker is not None and _text_batch_worker.is_alive()),
627 "request_timeout_sec": _TEXT_REQUEST_TIMEOUT_SEC, 828 "request_timeout_sec": _TEXT_REQUEST_TIMEOUT_SEC,
628 }, 829 },
@@ -654,6 +855,7 @@ def _embed_text_impl(
654 normalized: List[str], 855 normalized: List[str],
655 effective_normalize: bool, 856 effective_normalize: bool,
656 request_id: str, 857 request_id: str,
  858 + priority: int = 0,
657 ) -> _EmbedResult: 859 ) -> _EmbedResult:
658 if _text_model is None: 860 if _text_model is None:
659 raise RuntimeError("Text model not loaded") 861 raise RuntimeError("Text model not loaded")
@@ -703,6 +905,7 @@ def _embed_text_impl(
703 missing_texts[0], 905 missing_texts[0],
704 normalize=effective_normalize, 906 normalize=effective_normalize,
705 request_id=request_id, 907 request_id=request_id,
  908 + priority=priority,
706 ) 909 )
707 ] 910 ]
708 mode = "microbatch-single" 911 mode = "microbatch-single"
@@ -777,6 +980,7 @@ async def embed_text(
777 http_request: Request, 980 http_request: Request,
778 response: Response, 981 response: Response,
779 normalize: Optional[bool] = None, 982 normalize: Optional[bool] = None,
  983 + priority: int = 0,
780 ) -> List[Optional[List[float]]]: 984 ) -> List[Optional[List[float]]]:
781 if _text_model is None: 985 if _text_model is None:
782 raise HTTPException(status_code=503, detail="Text embedding model not loaded in this service") 986 raise HTTPException(status_code=503, detail="Text embedding model not loaded in this service")
@@ -784,6 +988,9 @@ async def embed_text(
784 request_id = _resolve_request_id(http_request) 988 request_id = _resolve_request_id(http_request)
785 response.headers["X-Request-ID"] = request_id 989 response.headers["X-Request-ID"] = request_id
786 990
  991 + if priority < 0:
  992 + raise HTTPException(status_code=400, detail="priority must be >= 0")
  993 + effective_priority = _effective_priority(priority)
787 effective_normalize = bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS) if normalize is None else bool(normalize) 994 effective_normalize = bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS) if normalize is None else bool(normalize)
788 normalized: List[str] = [] 995 normalized: List[str] = []
789 for i, t in enumerate(texts): 996 for i, t in enumerate(texts):
@@ -806,8 +1013,9 @@ async def embed_text(
806 cache_misses=0, 1013 cache_misses=0,
807 ) 1014 )
808 logger.info( 1015 logger.info(
809 - "embed_text response | backend=%s mode=cache-only inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=0 first_vector=%s latency_ms=%.2f", 1016 + "embed_text response | backend=%s mode=cache-only priority=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=0 first_vector=%s latency_ms=%.2f",
810 _text_backend_name, 1017 _text_backend_name,
  1018 + _priority_label(effective_priority),
811 len(normalized), 1019 len(normalized),
812 effective_normalize, 1020 effective_normalize,
813 len(cache_only.vectors[0]) if cache_only.vectors and cache_only.vectors[0] is not None else 0, 1021 len(cache_only.vectors[0]) if cache_only.vectors and cache_only.vectors[0] is not None else 0,
@@ -818,13 +1026,14 @@ async def embed_text(
818 ) 1026 )
819 return cache_only.vectors 1027 return cache_only.vectors
820 1028
821 - accepted, active = _text_request_limiter.try_acquire() 1029 + accepted, active = _text_request_limiter.try_acquire(bypass_limit=effective_priority > 0)
822 if not accepted: 1030 if not accepted:
823 _text_stats.record_rejected() 1031 _text_stats.record_rejected()
824 logger.warning( 1032 logger.warning(
825 - "embed_text rejected | client=%s backend=%s inputs=%d normalize=%s active=%d limit=%d preview=%s", 1033 + "embed_text rejected | client=%s backend=%s priority=%s inputs=%d normalize=%s active=%d limit=%d preview=%s",
826 _request_client(http_request), 1034 _request_client(http_request),
827 _text_backend_name, 1035 _text_backend_name,
  1036 + _priority_label(effective_priority),
828 len(normalized), 1037 len(normalized),
829 effective_normalize, 1038 effective_normalize,
830 active, 1039 active,
@@ -834,7 +1043,10 @@ async def embed_text(
834 ) 1043 )
835 raise HTTPException( 1044 raise HTTPException(
836 status_code=_OVERLOAD_STATUS_CODE, 1045 status_code=_OVERLOAD_STATUS_CODE,
837 - detail=f"Text embedding service busy: active={active}, limit={_TEXT_MAX_INFLIGHT}", 1046 + detail=(
  1047 + "Text embedding service busy for priority=0 requests: "
  1048 + f"active={active}, limit={_TEXT_MAX_INFLIGHT}"
  1049 + ),
838 ) 1050 )
839 1051
840 request_started = time.perf_counter() 1052 request_started = time.perf_counter()
@@ -844,9 +1056,10 @@ async def embed_text(
844 cache_misses = 0 1056 cache_misses = 0
845 try: 1057 try:
846 logger.info( 1058 logger.info(
847 - "embed_text request | client=%s backend=%s inputs=%d normalize=%s active=%d limit=%d preview=%s", 1059 + "embed_text request | client=%s backend=%s priority=%s inputs=%d normalize=%s active=%d limit=%d preview=%s",
848 _request_client(http_request), 1060 _request_client(http_request),
849 _text_backend_name, 1061 _text_backend_name,
  1062 + _priority_label(effective_priority),
850 len(normalized), 1063 len(normalized),
851 effective_normalize, 1064 effective_normalize,
852 active, 1065 active,
@@ -855,13 +1068,20 @@ async def embed_text(
855 extra=_request_log_extra(request_id), 1068 extra=_request_log_extra(request_id),
856 ) 1069 )
857 verbose_logger.info( 1070 verbose_logger.info(
858 - "embed_text detail | payload=%s normalize=%s backend=%s", 1071 + "embed_text detail | payload=%s normalize=%s backend=%s priority=%s",
859 normalized, 1072 normalized,
860 effective_normalize, 1073 effective_normalize,
861 _text_backend_name, 1074 _text_backend_name,
  1075 + _priority_label(effective_priority),
862 extra=_request_log_extra(request_id), 1076 extra=_request_log_extra(request_id),
863 ) 1077 )
864 - result = await run_in_threadpool(_embed_text_impl, normalized, effective_normalize, request_id) 1078 + result = await run_in_threadpool(
  1079 + _submit_text_dispatch_and_wait,
  1080 + normalized,
  1081 + effective_normalize,
  1082 + request_id,
  1083 + effective_priority,
  1084 + )
865 success = True 1085 success = True
866 backend_elapsed_ms = result.backend_elapsed_ms 1086 backend_elapsed_ms = result.backend_elapsed_ms
867 cache_hits = result.cache_hits 1087 cache_hits = result.cache_hits
@@ -875,9 +1095,10 @@ async def embed_text(
875 cache_misses=cache_misses, 1095 cache_misses=cache_misses,
876 ) 1096 )
877 logger.info( 1097 logger.info(
878 - "embed_text response | backend=%s mode=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=%d first_vector=%s latency_ms=%.2f", 1098 + "embed_text response | backend=%s mode=%s priority=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=%d first_vector=%s latency_ms=%.2f",
879 _text_backend_name, 1099 _text_backend_name,
880 result.mode, 1100 result.mode,
  1101 + _priority_label(effective_priority),
881 len(normalized), 1102 len(normalized),
882 effective_normalize, 1103 effective_normalize,
883 len(result.vectors[0]) if result.vectors and result.vectors[0] is not None else 0, 1104 len(result.vectors[0]) if result.vectors and result.vectors[0] is not None else 0,
@@ -888,8 +1109,9 @@ async def embed_text(
888 extra=_request_log_extra(request_id), 1109 extra=_request_log_extra(request_id),
889 ) 1110 )
890 verbose_logger.info( 1111 verbose_logger.info(
891 - "embed_text result detail | count=%d first_vector=%s latency_ms=%.2f", 1112 + "embed_text result detail | count=%d priority=%s first_vector=%s latency_ms=%.2f",
892 len(result.vectors), 1113 len(result.vectors),
  1114 + _priority_label(effective_priority),
893 result.vectors[0][: _VECTOR_PREVIEW_DIMS] 1115 result.vectors[0][: _VECTOR_PREVIEW_DIMS]
894 if result.vectors and result.vectors[0] is not None 1116 if result.vectors and result.vectors[0] is not None
895 else [], 1117 else [],
@@ -909,8 +1131,9 @@ async def embed_text(
909 cache_misses=cache_misses, 1131 cache_misses=cache_misses,
910 ) 1132 )
911 logger.error( 1133 logger.error(
912 - "embed_text failed | backend=%s inputs=%d normalize=%s latency_ms=%.2f error=%s", 1134 + "embed_text failed | backend=%s priority=%s inputs=%d normalize=%s latency_ms=%.2f error=%s",
913 _text_backend_name, 1135 _text_backend_name,
  1136 + _priority_label(effective_priority),
914 len(normalized), 1137 len(normalized),
915 effective_normalize, 1138 effective_normalize,
916 latency_ms, 1139 latency_ms,
@@ -922,8 +1145,9 @@ async def embed_text(
922 finally: 1145 finally:
923 remaining = _text_request_limiter.release(success=success) 1146 remaining = _text_request_limiter.release(success=success)
924 logger.info( 1147 logger.info(
925 - "embed_text finalize | success=%s active_after=%d", 1148 + "embed_text finalize | success=%s priority=%s active_after=%d",
926 success, 1149 success,
  1150 + _priority_label(effective_priority),
927 remaining, 1151 remaining,
928 extra=_request_log_extra(request_id), 1152 extra=_request_log_extra(request_id),
929 ) 1153 )
@@ -1019,6 +1243,7 @@ async def embed_image(
1019 http_request: Request, 1243 http_request: Request,
1020 response: Response, 1244 response: Response,
1021 normalize: Optional[bool] = None, 1245 normalize: Optional[bool] = None,
  1246 + priority: int = 0,
1022 ) -> List[Optional[List[float]]]: 1247 ) -> List[Optional[List[float]]]:
1023 if _image_model is None: 1248 if _image_model is None:
1024 raise HTTPException(status_code=503, detail="Image embedding model not loaded in this service") 1249 raise HTTPException(status_code=503, detail="Image embedding model not loaded in this service")
@@ -1026,6 +1251,10 @@ async def embed_image(
1026 request_id = _resolve_request_id(http_request) 1251 request_id = _resolve_request_id(http_request)
1027 response.headers["X-Request-ID"] = request_id 1252 response.headers["X-Request-ID"] = request_id
1028 1253
  1254 + if priority < 0:
  1255 + raise HTTPException(status_code=400, detail="priority must be >= 0")
  1256 + effective_priority = _effective_priority(priority)
  1257 +
1029 effective_normalize = bool(CONFIG.IMAGE_NORMALIZE_EMBEDDINGS) if normalize is None else bool(normalize) 1258 effective_normalize = bool(CONFIG.IMAGE_NORMALIZE_EMBEDDINGS) if normalize is None else bool(normalize)
1030 urls: List[str] = [] 1259 urls: List[str] = []
1031 for i, url_or_path in enumerate(images): 1260 for i, url_or_path in enumerate(images):
@@ -1048,7 +1277,8 @@ async def embed_image( @@ -1048,7 +1277,8 @@ async def embed_image(
1048 cache_misses=0, 1277 cache_misses=0,
1049 ) 1278 )
1050 logger.info( 1279 logger.info(
1051 - "embed_image response | mode=cache-only inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=0 first_vector=%s latency_ms=%.2f", 1280 + "embed_image response | mode=cache-only priority=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=0 first_vector=%s latency_ms=%.2f",
  1281 + _priority_label(effective_priority),
1052 len(urls), 1282 len(urls),
1053 effective_normalize, 1283 effective_normalize,
1054 len(cache_only.vectors[0]) if cache_only.vectors and cache_only.vectors[0] is not None else 0, 1284 len(cache_only.vectors[0]) if cache_only.vectors and cache_only.vectors[0] is not None else 0,
@@ -1059,12 +1289,13 @@ async def embed_image( @@ -1059,12 +1289,13 @@ async def embed_image(
1059 ) 1289 )
1060 return cache_only.vectors 1290 return cache_only.vectors
1061 1291
1062 - accepted, active = _image_request_limiter.try_acquire() 1292 + accepted, active = _image_request_limiter.try_acquire(bypass_limit=effective_priority > 0)
1063 if not accepted: 1293 if not accepted:
1064 _image_stats.record_rejected() 1294 _image_stats.record_rejected()
1065 logger.warning( 1295 logger.warning(
1066 - "embed_image rejected | client=%s inputs=%d normalize=%s active=%d limit=%d preview=%s", 1296 + "embed_image rejected | client=%s priority=%s inputs=%d normalize=%s active=%d limit=%d preview=%s",
1067 _request_client(http_request), 1297 _request_client(http_request),
  1298 + _priority_label(effective_priority),
1068 len(urls), 1299 len(urls),
1069 effective_normalize, 1300 effective_normalize,
1070 active, 1301 active,
@@ -1074,7 +1305,10 @@ async def embed_image( @@ -1074,7 +1305,10 @@ async def embed_image(
1074 ) 1305 )
1075 raise HTTPException( 1306 raise HTTPException(
1076 status_code=_OVERLOAD_STATUS_CODE, 1307 status_code=_OVERLOAD_STATUS_CODE,
1077 - detail=f"Image embedding service busy: active={active}, limit={_IMAGE_MAX_INFLIGHT}", 1308 + detail=(
  1309 + "Image embedding service busy for priority=0 requests: "
  1310 + f"active={active}, limit={_IMAGE_MAX_INFLIGHT}"
  1311 + ),
1078 ) 1312 )
1079 1313
1080 request_started = time.perf_counter() 1314 request_started = time.perf_counter()
@@ -1084,8 +1318,9 @@ async def embed_image( @@ -1084,8 +1318,9 @@ async def embed_image(
1084 cache_misses = 0 1318 cache_misses = 0
1085 try: 1319 try:
1086 logger.info( 1320 logger.info(
1087 - "embed_image request | client=%s inputs=%d normalize=%s active=%d limit=%d preview=%s", 1321 + "embed_image request | client=%s priority=%s inputs=%d normalize=%s active=%d limit=%d preview=%s",
1088 _request_client(http_request), 1322 _request_client(http_request),
  1323 + _priority_label(effective_priority),
1089 len(urls), 1324 len(urls),
1090 effective_normalize, 1325 effective_normalize,
1091 active, 1326 active,
@@ -1094,9 +1329,10 @@ async def embed_image( @@ -1094,9 +1329,10 @@ async def embed_image(
1094 extra=_request_log_extra(request_id), 1329 extra=_request_log_extra(request_id),
1095 ) 1330 )
1096 verbose_logger.info( 1331 verbose_logger.info(
1097 - "embed_image detail | payload=%s normalize=%s", 1332 + "embed_image detail | payload=%s normalize=%s priority=%s",
1098 urls, 1333 urls,
1099 effective_normalize, 1334 effective_normalize,
  1335 + _priority_label(effective_priority),
1100 extra=_request_log_extra(request_id), 1336 extra=_request_log_extra(request_id),
1101 ) 1337 )
1102 result = await run_in_threadpool(_embed_image_impl, urls, effective_normalize, request_id) 1338 result = await run_in_threadpool(_embed_image_impl, urls, effective_normalize, request_id)
@@ -1113,8 +1349,9 @@ async def embed_image( @@ -1113,8 +1349,9 @@ async def embed_image(
1113 cache_misses=cache_misses, 1349 cache_misses=cache_misses,
1114 ) 1350 )
1115 logger.info( 1351 logger.info(
1116 - "embed_image response | mode=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=%d first_vector=%s latency_ms=%.2f", 1352 + "embed_image response | mode=%s priority=%s inputs=%d normalize=%s dim=%d cache_hits=%d cache_misses=%d first_vector=%s latency_ms=%.2f",
1117 result.mode, 1353 result.mode,
  1354 + _priority_label(effective_priority),
1118 len(urls), 1355 len(urls),
1119 effective_normalize, 1356 effective_normalize,
1120 len(result.vectors[0]) if result.vectors and result.vectors[0] is not None else 0, 1357 len(result.vectors[0]) if result.vectors and result.vectors[0] is not None else 0,
@@ -1146,7 +1383,8 @@ async def embed_image( @@ -1146,7 +1383,8 @@ async def embed_image(
1146 cache_misses=cache_misses, 1383 cache_misses=cache_misses,
1147 ) 1384 )
1148 logger.error( 1385 logger.error(
1149 - "embed_image failed | inputs=%d normalize=%s latency_ms=%.2f error=%s", 1386 + "embed_image failed | priority=%s inputs=%d normalize=%s latency_ms=%.2f error=%s",
  1387 + _priority_label(effective_priority),
1150 len(urls), 1388 len(urls),
1151 effective_normalize, 1389 effective_normalize,
1152 latency_ms, 1390 latency_ms,
@@ -1158,8 +1396,9 @@ async def embed_image( @@ -1158,8 +1396,9 @@ async def embed_image(
1158 finally: 1396 finally:
1159 remaining = _image_request_limiter.release(success=success) 1397 remaining = _image_request_limiter.release(success=success)
1160 logger.info( 1398 logger.info(
1161 - "embed_image finalize | success=%s active_after=%d", 1399 + "embed_image finalize | success=%s priority=%s active_after=%d",
1162 success, 1400 success,
  1401 + _priority_label(effective_priority),
1163 remaining, 1402 remaining,
1164 extra=_request_log_extra(request_id), 1403 extra=_request_log_extra(request_id),
1165 ) 1404 )
embeddings/text_encoder.py
@@ -35,7 +35,12 @@ class TextEmbeddingEncoder: @@ -35,7 +35,12 @@ class TextEmbeddingEncoder:
35 expire_time=self.expire_time, 35 expire_time=self.expire_time,
36 ) 36 )
37 37
38 - def _call_service(self, request_data: List[str], normalize_embeddings: bool = True) -> List[Any]: 38 + def _call_service(
  39 + self,
  40 + request_data: List[str],
  41 + normalize_embeddings: bool = True,
  42 + priority: int = 0,
  43 + ) -> List[Any]:
39 """ 44 """
40 Call the embedding service API. 45 Call the embedding service API.
41 46
@@ -48,7 +53,10 @@ class TextEmbeddingEncoder: @@ -48,7 +53,10 @@ class TextEmbeddingEncoder:
48 try: 53 try:
49 response = requests.post( 54 response = requests.post(
50 self.endpoint, 55 self.endpoint,
51 - params={"normalize": "true" if normalize_embeddings else "false"}, 56 + params={
  57 + "normalize": "true" if normalize_embeddings else "false",
  58 + "priority": max(0, int(priority)),
  59 + },
52 json=request_data, 60 json=request_data,
53 timeout=60 61 timeout=60
54 ) 62 )
@@ -62,6 +70,7 @@ class TextEmbeddingEncoder: @@ -62,6 +70,7 @@ class TextEmbeddingEncoder:
62 self, 70 self,
63 sentences: Union[str, List[str]], 71 sentences: Union[str, List[str]],
64 normalize_embeddings: bool = True, 72 normalize_embeddings: bool = True,
  73 + priority: int = 0,
65 device: str = 'cpu', 74 device: str = 'cpu',
66 batch_size: int = 32 75 batch_size: int = 32
67 ) -> np.ndarray: 76 ) -> np.ndarray:
@@ -100,7 +109,11 @@ class TextEmbeddingEncoder: @@ -100,7 +109,11 @@ class TextEmbeddingEncoder:
100 109
101 # If there are uncached texts, call service 110 # If there are uncached texts, call service
102 if uncached_texts: 111 if uncached_texts:
103 - response_data = self._call_service(request_data, normalize_embeddings=normalize_embeddings) 112 + response_data = self._call_service(
  113 + request_data,
  114 + normalize_embeddings=normalize_embeddings,
  115 + priority=priority,
  116 + )
104 117
105 # Process response 118 # Process response
106 for i, text in enumerate(uncached_texts): 119 for i, text in enumerate(uncached_texts):
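The client-side contract added to `_call_service` is small: booleans are serialized as lowercase strings and `priority` is clamped to be non-negative before it is sent as a query parameter (the server additionally rejects `priority < 0` with a 400). A sketch of just the parameter construction, mirroring the diff:

```python
def build_embed_params(normalize_embeddings: bool, priority: int) -> dict:
    """Mirror the query params built in TextEmbeddingEncoder._call_service:
    lowercase string booleans, priority clamped to >= 0."""
    return {
        "normalize": "true" if normalize_embeddings else "false",
        "priority": max(0, int(priority)),
    }

# A negative priority is clamped client-side rather than forwarded.
params = build_embed_params(normalize_embeddings=True, priority=-2)
```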
frontend/static/css/style.css
@@ -371,9 +371,61 @@ body { @@ -371,9 +371,61 @@ body {
371 margin-bottom: 2px; 371 margin-bottom: 2px;
372 } 372 }
373 373
  374 +.product-debug-actions {
  375 + display: flex;
  376 + flex-wrap: wrap;
  377 + align-items: center;
  378 + gap: 10px 14px;
  379 + margin-top: 8px;
  380 +}
  381 +
  382 +.product-debug-inline-es-btn {
  383 + font-family: inherit;
  384 + font-size: 12px;
  385 + padding: 4px 10px;
  386 + border: 1px solid #ccc;
  387 + border-radius: 4px;
  388 + background: #fafafa;
  389 + color: #333;
  390 + cursor: pointer;
  391 +}
  392 +
  393 +.product-debug-inline-es-btn:hover {
  394 + background: #f0f0f0;
  395 + border-color: #bbb;
  396 +}
  397 +
  398 +.product-debug--es-expanded {
  399 + max-height: min(70vh, 720px);
  400 +}
  401 +
  402 +.product-es-doc-panel {
  403 + margin-top: 10px;
  404 + padding-top: 8px;
  405 + border-top: 1px dashed #e8e8e8;
  406 +}
  407 +
  408 +.product-es-doc-panel-status {
  409 + font-size: 12px;
  410 + color: #888;
  411 +}
  412 +
  413 +.product-es-doc-pre {
  414 + margin: 6px 0 0;
  415 + padding: 10px;
  416 + background: #f5f5f5;
  417 + border-radius: 4px;
  418 + overflow: auto;
  419 + max-height: 50vh;
  420 + font-size: 11px;
  421 + line-height: 1.35;
  422 + white-space: pre-wrap;
  423 + word-break: break-word;
  424 +}
  425 +
374 .product-debug-link { 426 .product-debug-link {
375 display: inline-block; 427 display: inline-block;
376 - margin-top: 6px; 428 + margin-top: 0;
377 font-size: 12px; 429 font-size: 12px;
378 color: #e67e22; 430 color: #e67e22;
379 text-decoration: none; 431 text-decoration: none;
frontend/static/js/app.js
@@ -68,12 +68,85 @@ function initializeApp() { @@ -68,12 +68,85 @@ function initializeApp() {
68 // Initialize the tenant dropdown and facet panel 68 // Initialize the tenant dropdown and facet panel
69 console.log('Initializing app...'); 69 console.log('Initializing app...');
70 initTenantSelect(); 70 initTenantSelect();
  71 + setupProductGridEsDocToggle();
71 const searchInput = document.getElementById('searchInput'); 72 const searchInput = document.getElementById('searchInput');
72 if (searchInput) { 73 if (searchInput) {
73 searchInput.focus(); 74 searchInput.focus();
74 } 75 }
75 } 76 }
76 77
  78 +/** Delegated handler: toggle inline ES raw response under each result card (survives innerHTML refresh on re-search). */
  79 +function setupProductGridEsDocToggle() {
  80 + const grid = document.getElementById('productGrid');
  81 + if (!grid || grid.dataset.esDocToggleBound === '1') {
  82 + return;
  83 + }
  84 + grid.dataset.esDocToggleBound = '1';
  85 + grid.addEventListener('click', onProductGridEsDocToggleClick);
  86 +}
  87 +
  88 +async function onProductGridEsDocToggleClick(event) {
  89 + const btn = event.target.closest('[data-action="toggle-es-inline-doc"]');
  90 + if (!btn) {
  91 + return;
  92 + }
  93 + event.preventDefault();
  94 + const debugRoot = btn.closest('.product-debug');
  95 + if (!debugRoot) {
  96 + return;
  97 + }
  98 + const panel = debugRoot.querySelector('.product-es-doc-panel');
  99 + const pre = debugRoot.querySelector('.product-es-doc-pre');
  100 + const statusEl = debugRoot.querySelector('.product-es-doc-panel-status');
  101 + if (!panel || !pre || !statusEl) {
  102 + return;
  103 + }
  104 +
  105 + const spuId = btn.getAttribute('data-spu-id') || '';
  106 + const tenantId = getTenantId();
  107 + const url = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`;
  108 +
  109 + if (debugRoot.dataset.esInlineOpen === '1') {
  110 + panel.setAttribute('hidden', '');
  111 + debugRoot.classList.remove('product-debug--es-expanded');
  112 + debugRoot.dataset.esInlineOpen = '0';
  113 + btn.textContent = '在结果中显示 ES 文档';
  114 + return;
  115 + }
  116 +
  117 + panel.removeAttribute('hidden');
  118 + debugRoot.classList.add('product-debug--es-expanded');
  119 + debugRoot.dataset.esInlineOpen = '1';
  120 + btn.textContent = '隐藏 ES 文档';
  121 +
  122 + if (pre.textContent.length > 0) {
  123 + panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
  124 + return;
  125 + }
  126 +
  127 + statusEl.style.display = '';
  128 + statusEl.textContent = '加载中…';
  129 + pre.style.display = 'none';
  130 +
  131 + try {
  132 + const response = await fetch(url);
  133 + if (!response.ok) {
  134 + const errText = await response.text();
  135 + throw new Error(`HTTP ${response.status}: ${errText.slice(0, 200)}`);
  136 + }
  137 + const data = await response.json();
  138 + pre.textContent = customStringify(data);
  139 + statusEl.style.display = 'none';
  140 + pre.style.display = 'block';
  141 + } catch (err) {
  142 + console.error('ES doc fetch failed', err);
  143 + statusEl.textContent = `加载失败: ${err.message || err}`;
  144 + pre.style.display = 'none';
  145 + }
  146 +
  147 + panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
  148 +}
  149 +
77 // Initialize once the DOM has finished loading 150 // Initialize once the DOM has finished loading
78 if (document.readyState === 'loading') { 151 if (document.readyState === 'loading') {
79 document.addEventListener('DOMContentLoaded', initializeApp); 152 document.addEventListener('DOMContentLoaded', initializeApp);
@@ -401,9 +474,20 @@ function displayResults(data) { @@ -401,9 +474,20 @@ function displayResults(data) {
401 <div class="product-debug-line">Rerank score: ${rerankScore}</div> 474 <div class="product-debug-line">Rerank score: ${rerankScore}</div>
402 <div class="product-debug-line">Fused score: ${fusedScore}</div> 475 <div class="product-debug-line">Fused score: ${fusedScore}</div>
403 ${titleLines} 476 ${titleLines}
404 - <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer">  
405 - 查看 ES 原始文档  
406 - </a> 477 + <div class="product-debug-actions">
  478 + <button type="button" class="product-debug-inline-es-btn"
  479 + data-action="toggle-es-inline-doc"
  480 + data-spu-id="${escapeAttr(String(spuId || ''))}">
  481 + 在结果中显示 ES 文档
  482 + </button>
  483 + <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer">
  484 + 查看 ES 原始文档
  485 + </a>
  486 + </div>
  487 + <div class="product-es-doc-panel" hidden>
  488 + <div class="product-es-doc-panel-status"></div>
  489 + <pre class="product-es-doc-pre"></pre>
  490 + </div>
407 </div> 491 </div>
408 `; 492 `;
409 } 493 }
perf_reports/README.md 0 → 100644
@@ -0,0 +1,34 @@ @@ -0,0 +1,34 @@
  1 +# Performance test report index
  2 +
  3 +This directory stores the raw JSON and notes for each load-test / matrix run. **Prefer reusing** the repository scripts instead of reinventing the wheel:
  4 +
  5 +| Script | Purpose |
  6 +|------|------|
  7 +| `scripts/perf_api_benchmark.py` | Load-tests the search backend, embedding, translation, rerank and other HTTP endpoints; supports `--embed-text-priority` / `--embed-image-priority` and `scripts/perf_cases.json.example` |
  8 +
  9 +Historical matrix example (concurrency sweep):
  10 +
  11 +- `2026-03-12/matrix_report/summary.md` — in the same directory as `summary.json`
  12 +
  13 +## 2026-03-20 — Embedding service `priority` parameter smoke test
  14 +
  15 +Environment: local `127.0.0.1:6005` (text) and `127.0.0.1:6008` (image); commands and results are in the JSON files in this directory:
  16 +
  17 +| Report file | Scenario | Notes |
  18 +|----------|------|------|
  19 +| `2026-03-20_embed_text_p0.json` | `embed_text` | `priority=0` (default), 8 s, concurrency 10 |
  20 +| `2026-03-20_embed_text_p1.json` | `embed_text` | `--embed-text-priority 1`, 8 s, concurrency 10 |
  21 +| `2026-03-20_embed_image_p0.json` | `embed_image` | `priority=0`, 8 s, concurrency 5 |
  22 +| `2026-03-20_embed_image_p1.json` | `embed_image` | `--embed-image-priority 1`, 8 s, concurrency 5 |
  23 +
  24 +To reproduce:
  25 +
  26 +```bash
  27 +source activate.sh
  28 +python scripts/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --timeout 30 --output perf_reports/2026-03-20_embed_text_p0.json
  29 +python scripts/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --embed-text-priority 1 --output perf_reports/2026-03-20_embed_text_p1.json
  30 +python scripts/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --timeout 60 --output perf_reports/2026-03-20_embed_image_p0.json
  31 +python scripts/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --embed-image-priority 1 --output perf_reports/2026-03-20_embed_image_p1.json
  32 +```
  33 +
  34 +Note: this is an **8-second smoke** run; its duration and concurrency are not directly comparable with the `2026-03-12` matrix. It only verifies that the service still returns 200 with the `priority` parameter and that payload validation passes.
query/query_parser.py
@@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union
8 import numpy as np 8 import numpy as np
9 import logging 9 import logging
10 import re 10 import re
11 -from concurrent.futures import ThreadPoolExecutor, as_completed, wait 11 +from concurrent.futures import ThreadPoolExecutor, wait
12 12
13 from embeddings.text_encoder import TextEmbeddingEncoder 13 from embeddings.text_encoder import TextEmbeddingEncoder
14 from config import SearchConfig 14 from config import SearchConfig
@@ -42,6 +42,8 @@ class ParsedQuery: @@ -42,6 +42,8 @@ class ParsedQuery:
42 search_langs: Optional[List[str]] = None, 42 search_langs: Optional[List[str]] = None,
43 index_languages: Optional[List[str]] = None, 43 index_languages: Optional[List[str]] = None,
44 source_in_index_languages: bool = True, 44 source_in_index_languages: bool = True,
  45 + contains_chinese: bool = False,
  46 + contains_english: bool = False,
45 ): 47 ):
46 self.original_query = original_query 48 self.original_query = original_query
47 self.query_normalized = query_normalized 49 self.query_normalized = query_normalized
@@ -58,6 +60,8 @@ class ParsedQuery: @@ -58,6 +60,8 @@ class ParsedQuery:
58 self.search_langs = search_langs or [] 60 self.search_langs = search_langs or []
59 self.index_languages = index_languages or [] 61 self.index_languages = index_languages or []
60 self.source_in_index_languages = bool(source_in_index_languages) 62 self.source_in_index_languages = bool(source_in_index_languages)
  63 + self.contains_chinese = bool(contains_chinese)
  64 + self.contains_english = bool(contains_english)
61 65
62 def to_dict(self) -> Dict[str, Any]: 66 def to_dict(self) -> Dict[str, Any]:
63 """Convert to dictionary representation.""" 67 """Convert to dictionary representation."""
@@ -73,6 +77,8 @@ class ParsedQuery: @@ -73,6 +77,8 @@ class ParsedQuery:
73 result["search_langs"] = self.search_langs 77 result["search_langs"] = self.search_langs
74 result["index_languages"] = self.index_languages 78 result["index_languages"] = self.index_languages
75 result["source_in_index_languages"] = self.source_in_index_languages 79 result["source_in_index_languages"] = self.source_in_index_languages
  80 + result["contains_chinese"] = self.contains_chinese
  81 + result["contains_english"] = self.contains_english
76 return result 82 return result
77 83
78 84
@@ -139,7 +145,6 @@ class QueryParser: @@ -139,7 +145,6 @@ class QueryParser:
139 cfg.get("default_model"), 145 cfg.get("default_model"),
140 ) 146 )
141 self._translator = create_translation_client() 147 self._translator = create_translation_client()
142 - self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation")  
143 148
144 @property 149 @property
145 def text_encoder(self) -> TextEmbeddingEncoder: 150 def text_encoder(self) -> TextEmbeddingEncoder:
@@ -218,6 +223,16 @@ class QueryParser: @@ -218,6 +223,16 @@ class QueryParser:
218 return bool(re.search(r"[\u4e00-\u9fff]", text or "")) 223 return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
219 224
220 @staticmethod 225 @staticmethod
  226 + def _is_pure_english_word_token(token: str) -> bool:
  227 + """
  228 + A tokenizer token counts as English iff it is letters only (optional internal hyphens)
  229 + and length >= 3.
  230 + """
  231 + if not token or len(token) < 3:
  232 + return False
  233 + return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
  234 +
  235 + @staticmethod
221 def _extract_latin_tokens(text: str) -> List[str]: 236 def _extract_latin_tokens(text: str) -> List[str]:
222 """Extract latin word tokens from query text.""" 237 """Extract latin word tokens from query text."""
223 return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") 238 return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
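The English-token rule above is easiest to see on concrete inputs: a token counts only if it is letters with optional internal hyphens and at least 3 characters long. A self-contained restatement of the same rule:

```python
import re

def is_pure_english_word_token(token: str) -> bool:
    """Same rule as QueryParser._is_pure_english_word_token:
    letters only, optional internal hyphens, length >= 3."""
    if not token or len(token) < 3:
        return False
    return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))

cases = {
    "dress": True,     # plain English word
    "t-shirt": True,   # internal hyphen allowed
    "ab": False,       # too short
    "iphone15": False, # digits disqualify the token
    "连衣裙": False,    # CJK is not English
    "-abc": False,     # hyphen must be internal, not leading
}
```

The length floor keeps short Latin fragments (units, SKU codes, stray letters) from flipping `contains_english` on an otherwise non-English query.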
@@ -332,11 +347,14 @@ class QueryParser: @@ -332,11 +347,14 @@ class QueryParser:
332 if context: 347 if context:
333 context.store_intermediate_result('detected_language', detected_lang) 348 context.store_intermediate_result('detected_language', detected_lang)
334 349
335 - # Stage 4: Translation (with async support and conditional waiting)  
336 - translations = {}  
337 - translation_futures = {}  
338 - translation_executor = None 350 + # Stage 4: Translation — always submit to thread pool; results are collected together with
  351 + # embedding in one wait() that uses a configurable budget (short vs long by source-in-index).
  352 + translations: Dict[str, str] = {}
  353 + translation_futures: Dict[str, Any] = {}
  354 + translation_executor: Optional[ThreadPoolExecutor] = None
339 index_langs: List[str] = [] 355 index_langs: List[str] = []
  356 + detected_norm = str(detected_lang or "").strip().lower()
  357 +
340 try: 358 try:
341 # Decide translation target languages from the tenant's configured index_languages 359
342 from config.tenant_config_loader import get_tenant_config_loader 360 from config.tenant_config_loader import get_tenant_config_loader
@@ -352,59 +370,32 @@ class QueryParser: @@ -352,59 +370,32 @@ class QueryParser:
352 seen_langs.add(norm_lang) 370 seen_langs.add(norm_lang)
353 index_langs.append(norm_lang) 371 index_langs.append(norm_lang)
354 372
355 - target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang] 373 + target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm]
356 374
357 if target_langs_for_translation: 375 if target_langs_for_translation:
358 - target_langs = target_langs_for_translation  
359 -  
360 - if target_langs:  
361 - # Determine if we need to wait for translation results  
362 - # If detected_lang is not in index_languages, we must wait for translation  
363 - need_wait_translation = detected_lang not in index_langs  
364 -  
365 - if need_wait_translation:  
366 - translation_executor = ThreadPoolExecutor(  
367 - max_workers=max(1, min(len(target_langs), 4)),  
368 - thread_name_prefix="query-translation-wait",  
369 - )  
370 - for lang in target_langs:  
371 - model_name = self._pick_query_translation_model(detected_lang, lang, self.config)  
372 - log_debug(  
373 - f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"  
374 - )  
375 - translation_futures[lang] = translation_executor.submit(  
376 - self.translator.translate,  
377 - query_text,  
378 - lang,  
379 - detected_lang,  
380 - "ecommerce_search_query",  
381 - model_name,  
382 - )  
383 - else:  
384 - for lang in target_langs:  
385 - model_name = self._pick_query_translation_model(detected_lang, lang, self.config)  
386 - log_debug(  
387 - f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"  
388 - )  
389 - self._translation_executor.submit(  
390 - self.translator.translate,  
391 - query_text,  
392 - lang,  
393 - detected_lang,  
394 - "ecommerce_search_query",  
395 - model_name,  
396 - ) 376 + translation_executor = ThreadPoolExecutor(
  377 + max_workers=max(1, min(len(target_langs_for_translation), 4)),
  378 + thread_name_prefix="query-translation",
  379 + )
  380 + for lang in target_langs_for_translation:
  381 + model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
  382 + log_debug(
  383 + f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
  384 + )
  385 + translation_futures[lang] = translation_executor.submit(
  386 + self.translator.translate,
  387 + query_text,
  388 + lang,
  389 + detected_lang,
  390 + "ecommerce_search_query",
  391 + model_name,
  392 + )
397 393
398 - if translations:  
399 - log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}")  
400 - if translation_futures:  
401 - log_debug(f"Translation in progress, waiting for results... | Query text: '{query_text}' | Languages: {list(translation_futures.keys())}")  
402 -  
403 - if context:  
404 - context.store_intermediate_result('translations', translations)  
405 - for lang, translation in translations.items():  
406 - if translation:  
407 - context.store_intermediate_result(f'translation_{lang}', translation) 394 + if context:
  395 + context.store_intermediate_result('translations', translations)
  396 + for lang, translation in translations.items():
  397 + if translation:
  398 + context.store_intermediate_result(f'translation_{lang}', translation)
408 399
409 except Exception as e: 400 except Exception as e:
410 error_msg = f"Translation failed | Error: {str(e)}" 401 error_msg = f"Translation failed | Error: {str(e)}"
@@ -416,13 +407,18 @@ class QueryParser: @@ -416,13 +407,18 @@ class QueryParser:
416 keywords = self._extract_keywords(query_text) 407 keywords = self._extract_keywords(query_text)
417 query_tokens = self._get_query_tokens(query_text) 408 query_tokens = self._get_query_tokens(query_text)
418 token_count = len(query_tokens) 409 token_count = len(query_tokens)
  410 + contains_chinese = self._contains_cjk(query_text)
  411 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
419 412
420 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " 413 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
421 - f"Query tokens: {query_tokens}") 414 + f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "
  415 + f"contains_english={contains_english}")
422 if context: 416 if context:
423 context.store_intermediate_result('keywords', keywords) 417 context.store_intermediate_result('keywords', keywords)
424 context.store_intermediate_result('token_count', token_count) 418 context.store_intermediate_result('token_count', token_count)
425 context.store_intermediate_result('query_tokens', query_tokens) 419 context.store_intermediate_result('query_tokens', query_tokens)
  420 + context.store_intermediate_result('contains_chinese', contains_chinese)
  421 + context.store_intermediate_result('contains_english', contains_english)
426 422
427 # Stage 6: Text embedding (only for non-short queries) - async execution 423 # Stage 6: Text embedding (only for non-short queries) - async execution
428 query_vector = None 424 query_vector = None
@@ -442,7 +438,7 @@ class QueryParser: @@ -442,7 +438,7 @@ class QueryParser:
442 # Submit encoding task to thread pool for async execution 438 # Submit encoding task to thread pool for async execution
443 encoding_executor = ThreadPoolExecutor(max_workers=1) 439 encoding_executor = ThreadPoolExecutor(max_workers=1)
444 def _encode_query_vector() -> Optional[np.ndarray]: 440 def _encode_query_vector() -> Optional[np.ndarray]:
445 - arr = self.text_encoder.encode([query_text]) 441 + arr = self.text_encoder.encode([query_text], priority=1)
446 if arr is None or len(arr) == 0: 442 if arr is None or len(arr) == 0:
447 return None 443 return None
448 vec = arr[0] 444 vec = arr[0]
@@ -458,45 +454,66 @@ class QueryParser: @@ -458,45 +454,66 @@ class QueryParser:
458 encoding_executor = None 454 encoding_executor = None
459 embedding_future = None 455 embedding_future = None
460 456
461 - # Wait for all async tasks to complete (translation and embedding) 457 + # Wait for translation + embedding concurrently; shared budget (ms) depends on whether
  458 + # the detected language is in tenant index_languages.
  459 + qc = self.config.query_config
  460 + source_in_index_for_budget = detected_norm in index_langs
  461 + budget_ms = (
  462 + qc.translation_embedding_wait_budget_ms_source_in_index
  463 + if source_in_index_for_budget
  464 + else qc.translation_embedding_wait_budget_ms_source_not_in_index
  465 + )
  466 + budget_sec = max(0.0, float(budget_ms) / 1000.0)
  467 +
  468 + if translation_futures:
  469 + log_info(
  470 + f"Translation+embedding shared wait budget | budget_ms={budget_ms} | "
  471 + f"source_in_index_languages={source_in_index_for_budget} | "
  472 + f"translation_targets={list(translation_futures.keys())}"
  473 + )
  474 +
462 if translation_futures or embedding_future: 475 if translation_futures or embedding_future:
463 - log_debug("Waiting for async tasks to complete...")  
464 -  
465 - # Collect all futures with their identifiers  
466 - all_futures = []  
467 - future_to_lang = {} 476 + log_debug(
  477 + f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | "
  478 + f"source_in_index_languages={source_in_index_for_budget}"
  479 + )
  480 +
  481 + all_futures: List[Any] = []
  482 + future_to_lang: Dict[Any, tuple] = {}
468 for lang, future in translation_futures.items(): 483 for lang, future in translation_futures.items():
469 all_futures.append(future) 484 all_futures.append(future)
470 - future_to_lang[future] = ('translation', lang)  
471 - 485 + future_to_lang[future] = ("translation", lang)
  486 +
472 if embedding_future: 487 if embedding_future:
473 all_futures.append(embedding_future) 488 all_futures.append(embedding_future)
474 - future_to_lang[embedding_future] = ('embedding', None)  
475 -  
476 - # Enforce a hard timeout for translation-related work (300ms budget)  
477 - done, not_done = wait(all_futures, timeout=0.3) 489 + future_to_lang[embedding_future] = ("embedding", None)
  490 +
  491 + done, not_done = wait(all_futures, timeout=budget_sec)
478 for future in done: 492 for future in done:
479 task_type, lang = future_to_lang[future] 493 task_type, lang = future_to_lang[future]
480 try: 494 try:
481 result = future.result() 495 result = future.result()
482 - if task_type == 'translation': 496 + if task_type == "translation":
483 if result: 497 if result:
484 translations[lang] = result 498 translations[lang] = result
485 log_info( 499 log_info(
486 - f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'" 500 + f"Translation completed | Query text: '{query_text}' | "
  501 + f"Target language: {lang} | Translation result: '{result}'"
487 ) 502 )
488 if context: 503 if context:
489 - context.store_intermediate_result(f'translation_{lang}', result)  
490 - elif task_type == 'embedding': 504 + context.store_intermediate_result(f"translation_{lang}", result)
  505 + elif task_type == "embedding":
491 query_vector = result 506 query_vector = result
492 if query_vector is not None: 507 if query_vector is not None:
493 log_debug(f"Query vector generation completed | Shape: {query_vector.shape}") 508 log_debug(f"Query vector generation completed | Shape: {query_vector.shape}")
494 if context: 509 if context:
495 - context.store_intermediate_result('query_vector_shape', query_vector.shape) 510 + context.store_intermediate_result("query_vector_shape", query_vector.shape)
496 else: 511 else:
497 - log_info("Query vector generation completed but result is None, will process without vector") 512 + log_info(
  513 + "Query vector generation completed but result is None, will process without vector"
  514 + )
498 except Exception as e: 515 except Exception as e:
499 - if task_type == 'translation': 516 + if task_type == "translation":
500 error_msg = f"Translation failed | Language: {lang} | Error: {str(e)}" 517 error_msg = f"Translation failed | Language: {lang} | Error: {str(e)}"
501 else: 518 else:
502 error_msg = f"Query vector generation failed | Error: {str(e)}" 519 error_msg = f"Query vector generation failed | Error: {str(e)}"
@@ -504,30 +521,29 @@ class QueryParser: @@ -504,30 +521,29 @@ class QueryParser:
504 if context: 521 if context:
505 context.add_warning(error_msg) 522 context.add_warning(error_msg)
506 523
507 - # Log timeouts for any futures that did not finish within 300ms  
508 if not_done: 524 if not_done:
509 for future in not_done: 525 for future in not_done:
510 task_type, lang = future_to_lang[future] 526 task_type, lang = future_to_lang[future]
511 - if task_type == 'translation': 527 + if task_type == "translation":
512 timeout_msg = ( 528 timeout_msg = (
513 - f"Translation timeout (>300ms) | Language: {lang} | " 529 + f"Translation timeout (>{budget_ms}ms) | Language: {lang} | "
514 f"Query text: '{query_text}'" 530 f"Query text: '{query_text}'"
515 ) 531 )
516 else: 532 else:
517 - timeout_msg = "Query vector generation timeout (>300ms), proceeding without embedding result" 533 + timeout_msg = (
  534 + f"Query vector generation timeout (>{budget_ms}ms), proceeding without embedding result"
  535 + )
518 log_info(timeout_msg) 536 log_info(timeout_msg)
519 if context: 537 if context:
520 context.add_warning(timeout_msg) 538 context.add_warning(timeout_msg)
521 539
522 - # Clean up encoding executor  
523 if encoding_executor: 540 if encoding_executor:
524 encoding_executor.shutdown(wait=False) 541 encoding_executor.shutdown(wait=False)
525 if translation_executor: 542 if translation_executor:
526 translation_executor.shutdown(wait=False) 543 translation_executor.shutdown(wait=False)
527 -  
528 - # Update translations in context after all are complete 544 +
529 if translations and context: 545 if translations and context:
530 - context.store_intermediate_result('translations', translations) 546 + context.store_intermediate_result("translations", translations)
531 547
532 # Build language-scoped query plan: source language + available translations 548 # Build language-scoped query plan: source language + available translations
533 query_text_by_lang: Dict[str, str] = {} 549 query_text_by_lang: Dict[str, str] = {}
@@ -547,7 +563,7 @@ class QueryParser: @@ -547,7 +563,7 @@ class QueryParser:
547 # Use the original mixed-script query as a robust fallback probe for that language field set. 563 # Use the original mixed-script query as a robust fallback probe for that language field set.
548 query_text_by_lang[lang] = query_text 564 query_text_by_lang[lang] = query_text
549 565
550 - source_in_index_languages = detected_lang in index_langs 566 + source_in_index_languages = detected_norm in index_langs
551 ordered_search_langs: List[str] = [] 567 ordered_search_langs: List[str] = []
552 seen_order = set() 568 seen_order = set()
553 if detected_lang in query_text_by_lang: 569 if detected_lang in query_text_by_lang:
@@ -583,6 +599,8 @@ class QueryParser: @@ -583,6 +599,8 @@ class QueryParser:
583 search_langs=ordered_search_langs, 599 search_langs=ordered_search_langs,
584 index_languages=index_langs, 600 index_languages=index_langs,
585 source_in_index_languages=source_in_index_languages, 601 source_in_index_languages=source_in_index_languages,
  602 + contains_chinese=contains_chinese,
  603 + contains_english=contains_english,
586 ) 604 )
587 605
588 if context and hasattr(context, 'logger'): 606 if context and hasattr(context, 'logger'):
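Editor note: the `contains_chinese` / `contains_english` flags added to ParsedQuery above follow the rules described in the commit message (CJK presence via `_contains_cjk`; a "pure English, len>=3" token via fullmatch on letters with optional hyphens). A minimal standalone sketch of those rules — the regexes here are assumptions reconstructed from that description, not the actual implementation:

```python
import re

CJK_RE = re.compile(r"[\u3400-\u4dbf\u4e00-\u9fff]")
# "pure English" token: ASCII letters with optional internal hyphens
EN_TOKEN_RE = re.compile(r"[A-Za-z]+(?:-[A-Za-z]+)*")


def contains_chinese(text: str) -> bool:
    """True when the raw query text carries any CJK ideograph."""
    return bool(CJK_RE.search(text))


def contains_english(tokens: list) -> bool:
    """True when any token is a pure-letter word of length >= 3."""
    return any(EN_TOKEN_RE.fullmatch(t) is not None and len(t) >= 3 for t in tokens)
```

So a query like `红色 dress` would set both flags, while `红色 of` sets only `contains_chinese` because `of` is shorter than three letters.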
scripts/perf_api_benchmark.py
@@ -15,6 +15,9 @@ Examples: @@ -15,6 +15,9 @@ Examples:
15 python scripts/perf_api_benchmark.py --scenario backend_suggest --duration 30 --concurrency 50 --tenant-id 162 15 python scripts/perf_api_benchmark.py --scenario backend_suggest --duration 30 --concurrency 50 --tenant-id 162
16 python scripts/perf_api_benchmark.py --scenario all --duration 60 --concurrency 80 --tenant-id 162 16 python scripts/perf_api_benchmark.py --scenario all --duration 60 --concurrency 80 --tenant-id 162
17 python scripts/perf_api_benchmark.py --scenario all --cases-file scripts/perf_cases.json.example --output perf_result.json 17 python scripts/perf_api_benchmark.py --scenario all --cases-file scripts/perf_cases.json.example --output perf_result.json
  18 + # Embedding admission / priority (query param `priority`; same semantics as embedding service):
  19 + python scripts/perf_api_benchmark.py --scenario embed_text --embed-text-priority 1 --duration 30 --concurrency 20
  20 + python scripts/perf_api_benchmark.py --scenario embed_image --embed-image-priority 1 --duration 30 --concurrency 10
18 """ 21 """
19 22
20 from __future__ import annotations 23 from __future__ import annotations
@@ -72,9 +75,9 @@ def validate_response_payload( @@ -72,9 +75,9 @@ def validate_response_payload(
72 ) -> Tuple[bool, str]: 75 ) -> Tuple[bool, str]:
73 """ 76 """
74 Lightweight payload validation for correctness-aware perf tests. 77 Lightweight payload validation for correctness-aware perf tests.
75 - Currently strict for embed_text to catch NaN/null vector regressions. 78 + Strict for embed_text / embed_image to catch NaN/null vector regressions.
76 """ 79 """
77 - if scenario_name != "embed_text": 80 + if scenario_name not in ("embed_text", "embed_image"):
78 return True, "" 81 return True, ""
79 82
80 expected_len = len(tpl.json_body) if isinstance(tpl.json_body, list) else None 83 expected_len = len(tpl.json_body) if isinstance(tpl.json_body, list) else None
@@ -219,6 +222,43 @@ def load_cases_from_file(path: Path, tenant_id: str) -> Dict[str, List[RequestTe @@ -219,6 +222,43 @@ def load_cases_from_file(path: Path, tenant_id: str) -> Dict[str, List[RequestTe
219 return out 222 return out
220 223
221 224
  225 +def apply_embed_priority_params(
  226 + scenarios: Dict[str, Scenario],
  227 + embed_text_priority: int,
  228 + embed_image_priority: int,
  229 +) -> None:
  230 + """
  231 + Merge default `priority` query param into embed templates when absent.
  232 + `scripts/perf_cases.json` may set per-request `params.priority` to override.
  233 + """
  234 + mapping = {
  235 + "embed_text": max(0, int(embed_text_priority)),
  236 + "embed_image": max(0, int(embed_image_priority)),
  237 + }
  238 + for name, pri in mapping.items():
  239 + if name not in scenarios:
  240 + continue
  241 + scen = scenarios[name]
  242 + new_templates: List[RequestTemplate] = []
  243 + for t in scen.templates:
  244 + params = dict(t.params or {})
  245 + params.setdefault("priority", str(pri))
  246 + new_templates.append(
  247 + RequestTemplate(
  248 + method=t.method,
  249 + path=t.path,
  250 + params=params,
  251 + json_body=t.json_body,
  252 + headers=t.headers,
  253 + )
  254 + )
  255 + scenarios[name] = Scenario(
  256 + name=scen.name,
  257 + templates=new_templates,
  258 + timeout_sec=scen.timeout_sec,
  259 + )
  260 +
  261 +
222 def build_scenarios(args: argparse.Namespace) -> Dict[str, Scenario]: 262 def build_scenarios(args: argparse.Namespace) -> Dict[str, Scenario]:
223 defaults = make_default_templates(args.tenant_id) 263 defaults = make_default_templates(args.tenant_id)
224 if args.cases_file: 264 if args.cases_file:
@@ -252,6 +292,11 @@ def build_scenarios(args: argparse.Namespace) -> Dict[str, Scenario]: @@ -252,6 +292,11 @@ def build_scenarios(args: argparse.Namespace) -> Dict[str, Scenario]:
252 ) 292 )
253 ) 293 )
254 scenarios[name] = Scenario(name=name, templates=rewritten, timeout_sec=args.timeout) 294 scenarios[name] = Scenario(name=name, templates=rewritten, timeout_sec=args.timeout)
  295 + apply_embed_priority_params(
  296 + scenarios,
  297 + embed_text_priority=args.embed_text_priority,
  298 + embed_image_priority=args.embed_image_priority,
  299 + )
255 return scenarios 300 return scenarios
256 301
257 302
@@ -483,6 +528,18 @@ def parse_args() -> argparse.Namespace: @@ -483,6 +528,18 @@ def parse_args() -> argparse.Namespace:
483 default=0, 528 default=0,
484 help="Optional top_n for rerank requests in dynamic docs mode (0 means omit top_n).", 529 help="Optional top_n for rerank requests in dynamic docs mode (0 means omit top_n).",
485 ) 530 )
  531 + parser.add_argument(
  532 + "--embed-text-priority",
  533 + type=int,
  534 + default=0,
  535 + help="Default query param priority= for embed_text (0=offline admission; >0 bypasses rejection). Merged into params unless set in --cases-file.",
  536 + )
  537 + parser.add_argument(
  538 + "--embed-image-priority",
  539 + type=int,
  540 + default=0,
  541 + help="Default query param priority= for embed_image (same semantics as embed-text-priority).",
  542 + )
486 return parser.parse_args() 543 return parser.parse_args()
487 544
488 545
@@ -609,6 +666,8 @@ async def main_async() -> int: @@ -609,6 +666,8 @@ async def main_async() -> int:
609 print(f" embedding_image_base={args.embedding_image_base}") 666 print(f" embedding_image_base={args.embedding_image_base}")
610 print(f" translator_base={args.translator_base}") 667 print(f" translator_base={args.translator_base}")
611 print(f" reranker_base={args.reranker_base}") 668 print(f" reranker_base={args.reranker_base}")
  669 + print(f" embed_text_priority={args.embed_text_priority}")
  670 + print(f" embed_image_priority={args.embed_image_priority}")
612 if args.rerank_dynamic_docs: 671 if args.rerank_dynamic_docs:
613 print(" rerank_dynamic_docs=True") 672 print(" rerank_dynamic_docs=True")
614 print(f" rerank_doc_count={args.rerank_doc_count}") 673 print(f" rerank_doc_count={args.rerank_doc_count}")
@@ -667,6 +726,8 @@ async def main_async() -> int: @@ -667,6 +726,8 @@ async def main_async() -> int:
667 "rerank_query": args.rerank_query, 726 "rerank_query": args.rerank_query,
668 "rerank_seed": args.rerank_seed, 727 "rerank_seed": args.rerank_seed,
669 "rerank_top_n": args.rerank_top_n, 728 "rerank_top_n": args.rerank_top_n,
  729 + "embed_text_priority": args.embed_text_priority,
  730 + "embed_image_priority": args.embed_image_priority,
670 }, 731 },
671 "results": results, 732 "results": results,
672 "overall": aggregate_results(results), 733 "overall": aggregate_results(results),
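Editor note: the merge semantics in `apply_embed_priority_params` are "CLI flag supplies a default; a `params.priority` set in the cases file wins". A small sketch of just that rule (`merge_priority` is an illustrative helper, not a function in the script):

```python
def merge_priority(params, default_priority: int) -> dict:
    """Mirror the setdefault merge: an explicit cases-file value overrides the CLI default."""
    merged = dict(params or {})
    # Negative CLI values are clamped to 0, matching max(0, int(...)) above.
    merged.setdefault("priority", str(max(0, int(default_priority))))
    return merged


print(merge_priority({}, 1))                 # {'priority': '1'}
print(merge_priority({"priority": "0"}, 1))  # {'priority': '0'}
```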
scripts/perf_cases.json.example
@@ -32,9 +32,18 @@ @@ -32,9 +32,18 @@
32 { 32 {
33 "method": "POST", 33 "method": "POST",
34 "path": "/embed/text", 34 "path": "/embed/text",
  35 + "params": {"priority": "0"},
35 "json": ["wireless mouse", "gaming keyboard", "USB-C cable", "barbie doll"] 36 "json": ["wireless mouse", "gaming keyboard", "USB-C cable", "barbie doll"]
36 } 37 }
37 ], 38 ],
  39 + "embed_image": [
  40 + {
  41 + "method": "POST",
  42 + "path": "/embed/image",
  43 + "params": {"normalize": "true", "priority": "0"},
  44 + "json": ["/data/saas-search/docs/image-dress1.png"]
  45 + }
  46 + ],
38 "translate": [ 47 "translate": [
39 { 48 {
40 "method": "POST", 49 "method": "POST",
search/es_query_builder.py
@@ -9,9 +9,13 @@ Simplified architecture: @@ -9,9 +9,13 @@ Simplified architecture:
9 """ 9 """
10 10
11 from typing import Dict, Any, List, Optional, Union, Tuple 11 from typing import Dict, Any, List, Optional, Union, Tuple
  12 +
12 import numpy as np 13 import numpy as np
13 from config import FunctionScoreConfig 14 from config import FunctionScoreConfig
14 15
  16 +# (Elasticsearch field path, boost before formatting as "path^boost")
  17 +MatchFieldSpec = Tuple[str, float]
  18 +
15 19
16 class ESQueryBuilder: 20 class ESQueryBuilder:
17 """Builds Elasticsearch DSL queries.""" 21 """Builds Elasticsearch DSL queries."""
@@ -36,6 +40,7 @@ class ESQueryBuilder: @@ -36,6 +40,7 @@ class ESQueryBuilder:
36 source_boost_when_missing: float = 0.6, 40 source_boost_when_missing: float = 0.6,
37 original_query_fallback_boost_when_translation_missing: float = 0.2, 41 original_query_fallback_boost_when_translation_missing: float = 0.2,
38 tie_breaker_base_query: float = 0.9, 42 tie_breaker_base_query: float = 0.9,
  43 + mixed_script_merged_field_boost_scale: float = 0.6,
39 ): 44 ):
40 """ 45 """
41 Initialize query builder. 46 Initialize query builder.
@@ -51,6 +56,7 @@ class ESQueryBuilder: @@ -51,6 +56,7 @@ class ESQueryBuilder:
51 function_score_config: Function score configuration 56 function_score_config: Function score configuration
52 default_language: Default language to use when detection fails or returns "unknown" 57 default_language: Default language to use when detection fails or returns "unknown"
53 knn_boost: Boost value for KNN (embedding recall) 58 knn_boost: Boost value for KNN (embedding recall)
  59 + mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields
54 """ 60 """
55 self.match_fields = match_fields 61 self.match_fields = match_fields
56 self.field_boosts = field_boosts or {} 62 self.field_boosts = field_boosts or {}
@@ -74,6 +80,7 @@ class ESQueryBuilder: @@ -74,6 +80,7 @@ class ESQueryBuilder:
74 original_query_fallback_boost_when_translation_missing 80 original_query_fallback_boost_when_translation_missing
75 ) 81 )
76 self.tie_breaker_base_query = float(tie_breaker_base_query) 82 self.tie_breaker_base_query = float(tie_breaker_base_query)
  83 + self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
77 84
78 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: 85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
79 """ 86 """
@@ -414,7 +421,7 @@ class ESQueryBuilder: @@ -414,7 +421,7 @@ class ESQueryBuilder:
414 def _format_field_with_boost(self, field_name: str, boost: float) -> str: 421 def _format_field_with_boost(self, field_name: str, boost: float) -> str:
415 if abs(float(boost) - 1.0) < 1e-9: 422 if abs(float(boost) - 1.0) < 1e-9:
416 return field_name 423 return field_name
417 - return f"{field_name}^{boost}" 424 + return f"{field_name}^{round(boost, 2)}"
418 425
419 def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: 426 def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float:
420 # Language-specific override first (e.g. title.de), then base field (e.g. title) 427 # Language-specific override first (e.g. title.de), then base field (e.g. title)
@@ -426,36 +433,74 @@ class ESQueryBuilder: @@ -426,36 +433,74 @@ class ESQueryBuilder:
426 return float(self.field_boosts[base_field]) 433 return float(self.field_boosts[base_field])
427 return 1.0 434 return 1.0
428 435
429 - def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: 436 + def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]:
430 """ 437 """
431 - Build dynamic match fields for one language.  
432 -  
433 - Args:  
434 - language: Language code (e.g. zh/en/de/fr/...)  
435 -  
436 - Returns:  
437 - (all_fields, core_fields) - core_fields are for phrase/keyword queries 438 + Per-language match targets as (field_path, boost). Single source of truth before string formatting.
  439 + Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere.
438 """ 440 """
439 lang = (language or "").strip().lower() 441 lang = (language or "").strip().lower()
440 - all_fields: List[str] = []  
441 - core_fields: List[str] = [] 442 + all_specs: List[MatchFieldSpec] = []
  443 + core_specs: List[MatchFieldSpec] = []
442 444
443 for base in self.multilingual_fields: 445 for base in self.multilingual_fields:
444 field = f"{base}.{lang}" 446 field = f"{base}.{lang}"
445 - boost = self._get_field_boost(base, lang)  
446 - all_fields.append(self._format_field_with_boost(field, boost)) 447 + all_specs.append((field, self._get_field_boost(base, lang)))
447 448
448 for shared in self.shared_fields: 449 for shared in self.shared_fields:
449 - boost = self._get_field_boost(shared, None)  
450 - all_fields.append(self._format_field_with_boost(shared, boost)) 450 + all_specs.append((shared, self._get_field_boost(shared, None)))
451 451
452 for base in self.core_multilingual_fields: 452 for base in self.core_multilingual_fields:
453 field = f"{base}.{lang}" 453 field = f"{base}.{lang}"
454 - boost = self._get_field_boost(base, lang)  
455 - core_fields.append(self._format_field_with_boost(field, boost)) 454 + core_specs.append((field, self._get_field_boost(base, lang)))
  455 +
  456 + return all_specs, core_specs
  457 +
  458 + def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]:
  459 + """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
  460 + return [self._format_field_with_boost(path, boost) for path, boost in specs]
  461 +
  462 + def _merge_supplemental_lang_field_specs(
  463 + self,
  464 + specs: List[MatchFieldSpec],
  465 + supplemental_lang: str,
  466 + ) -> List[MatchFieldSpec]:
  467 + """Append supplemental-language columns; boosts multiplied by mixed_script scale."""
  468 + scale = float(self.mixed_script_merged_field_boost_scale)
  469 + extra_all, _ = self._build_match_field_specs(supplemental_lang)
  470 + seen = {path for path, _ in specs}
  471 + out = list(specs)
  472 + for path, boost in extra_all:
  473 + if path not in seen:
  474 + out.append((path, boost * scale))
  475 + seen.add(path)
  476 + return out
  477 +
  478 + def _expand_match_field_specs_for_mixed_script(
  479 + self,
  480 + lang: str,
  481 + specs: List[MatchFieldSpec],
  482 + contains_chinese: bool,
  483 + contains_english: bool,
  484 + index_languages: List[str],
  485 + ) -> List[MatchFieldSpec]:
  486 + """
  487 + When the query mixes scripts, widen each clause to indexed fields for the other script
  488 + (e.g. zh clause also searches title.en when the query contains an English word token).
  489 + """
  490 + norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()}
  491 + allow = norm or {"zh", "en"}
  492 +
  493 + def can_use(lcode: str) -> bool:
  494 + return lcode in allow if norm else True
  495 +
  496 + out = list(specs)
  497 + lnorm = (lang or "").strip().lower()
  498 + if contains_english and lnorm != "en" and can_use("en"):
  499 + out = self._merge_supplemental_lang_field_specs(out, "en")
  500 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  501 + out = self._merge_supplemental_lang_field_specs(out, "zh")
  502 + return out
456 503
457 - return all_fields, core_fields  
458 -  
459 def _get_embedding_field(self, language: str) -> str: 504 def _get_embedding_field(self, language: str) -> str:
460 """Get embedding field name for a language.""" 505 """Get embedding field name for a language."""
461 # Currently using unified embedding field 506 # Currently using unified embedding field
@@ -486,6 +531,8 @@ class ESQueryBuilder: @@ -486,6 +531,8 @@ class ESQueryBuilder:
486 source_in_index_languages = True 531 source_in_index_languages = True
487 index_languages: List[str] = [] 532 index_languages: List[str] = []
488 533
  534 + contains_chinese = False
  535 + contains_english = False
489 if parsed_query: 536 if parsed_query:
490 query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} 537 query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
491 search_langs = getattr(parsed_query, "search_langs", None) or [] 538 search_langs = getattr(parsed_query, "search_langs", None) or []
@@ -495,6 +542,8 @@ class ESQueryBuilder: @@ -495,6 +542,8 @@ class ESQueryBuilder:
495 getattr(parsed_query, "source_in_index_languages", True) 542 getattr(parsed_query, "source_in_index_languages", True)
496 ) 543 )
497 index_languages = getattr(parsed_query, "index_languages", None) or [] 544 index_languages = getattr(parsed_query, "index_languages", None) or []
  545 + contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
  546 + contains_english = bool(getattr(parsed_query, "contains_english", False))
498 547
499 if not query_text_by_lang: 548 if not query_text_by_lang:
500 query_text_by_lang = {source_lang: query_text} 549 query_text_by_lang = {source_lang: query_text}
@@ -508,7 +557,15 @@ class ESQueryBuilder: @@ -508,7 +557,15 @@ class ESQueryBuilder:
508 lang_query = query_text_by_lang.get(lang) 557 lang_query = query_text_by_lang.get(lang)
509 if not lang_query: 558 if not lang_query:
510 continue 559 continue
511 - match_fields, _ = self._get_match_fields(lang) 560 + all_specs, _ = self._build_match_field_specs(lang)
  561 + expanded_specs = self._expand_match_field_specs_for_mixed_script(
  562 + lang,
  563 + all_specs,
  564 + contains_chinese,
  565 + contains_english,
  566 + index_languages,
  567 + )
  568 + match_fields = self._format_match_field_specs(expanded_specs)
512 if not match_fields: 569 if not match_fields:
513 continue 570 continue
514 571
@@ -559,7 +616,15 @@ class ESQueryBuilder: @@ -559,7 +616,15 @@ class ESQueryBuilder:
559 continue 616 continue
560 if lang in query_text_by_lang: 617 if lang in query_text_by_lang:
561 continue 618 continue
562 - match_fields, _ = self._get_match_fields(lang) 619 + fb_specs, _ = self._build_match_field_specs(lang)
  620 + expanded_fb = self._expand_match_field_specs_for_mixed_script(
  621 + lang,
  622 + fb_specs,
  623 + contains_chinese,
  624 + contains_english,
  625 + index_languages,
  626 + )
  627 + match_fields = self._format_match_field_specs(expanded_fb)
563 if not match_fields: 628 if not match_fields:
564 continue 629 continue
565 should_clauses.append({ 630 should_clauses.append({
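Editor note: the `MatchFieldSpec` refactor above keeps boosts numeric until the final `path^boost` formatting, so the cross-script merge can scale them without string parsing. A self-contained sketch of the merge-and-scale step (field names and boost values are illustrative; 0.6 is the default `mixed_script_merged_field_boost_scale` from this diff):

```python
from typing import List, Tuple

MatchFieldSpec = Tuple[str, float]  # (field_path, boost)


def merge_supplemental(specs: List[MatchFieldSpec],
                       extra: List[MatchFieldSpec],
                       scale: float = 0.6) -> List[MatchFieldSpec]:
    """Append supplemental-language fields at a scaled boost, skipping duplicates."""
    seen = {path for path, _ in specs}
    out = list(specs)
    for path, boost in extra:
        if path not in seen:
            out.append((path, round(boost * scale, 2)))
            seen.add(path)
    return out


zh_specs = [("title.zh", 2.0), ("brand", 1.5)]
en_specs = [("title.en", 2.0), ("brand", 1.5)]
print(merge_supplemental(zh_specs, en_specs))
# [('title.zh', 2.0), ('brand', 1.5), ('title.en', 1.2)]
```

Note how the shared `brand` field is not duplicated, and only the supplemental `title.en` column is down-weighted.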
search/searcher.py
@@ -4,12 +4,13 @@ Main Searcher module - executes search queries against Elasticsearch. @@ -4,12 +4,13 @@ Main Searcher module - executes search queries against Elasticsearch.
4 Handles query parsing, ranking, and result formatting. 4 Handles query parsing, ranking, and result formatting.
5 """ 5 """
6 6
7 -from typing import Dict, Any, List, Optional, Union 7 +from typing import Dict, Any, List, Optional, Union, Tuple
8 import os 8 import os
9 import time, json 9 import time, json
10 import logging 10 import logging
11 import hashlib 11 import hashlib
12 from string import Formatter 12 from string import Formatter
  13 +import numpy as np
13 14
14 from utils.es_client import ESClient 15 from utils.es_client import ESClient
15 from query import QueryParser, ParsedQuery 16 from query import QueryParser, ParsedQuery
@@ -224,6 +225,265 @@ class Searcher: @@ -224,6 +225,265 @@ class Searcher:
224 hits_by_id[str(hid)] = hit 225 hits_by_id[str(hid)] = hit
225 return hits_by_id, int(resp.get("took", 0) or 0) 226 return hits_by_id, int(resp.get("took", 0) or 0)
226 227
  228 + @staticmethod
  229 + def _normalize_sku_match_text(value: Optional[str]) -> str:
  230 + """Normalize free text for lightweight SKU option matching."""
  231 + if value is None:
  232 + return ""
  233 + return " ".join(str(value).strip().casefold().split())
  234 +
  235 + @staticmethod
  236 + def _sku_option1_embedding_key(
  237 + sku: Dict[str, Any],
  238 + spu_option1_name: Optional[Any] = None,
  239 + ) -> Optional[str]:
  240 + """
  241 + Text sent to the embedding service for option1 must be "name:value"
  242 + (option name from SKU row or SPU-level option1_name).
  243 + """
  244 + value_raw = sku.get("option1_value")
  245 + if value_raw is None:
  246 + return None
  247 + value = str(value_raw).strip()
  248 + if not value:
  249 + return None
  250 + name = sku.get("option1_name")
  251 + if name is None or not str(name).strip():
  252 + name = spu_option1_name
  253 + name_str = str(name).strip() if name is not None and str(name).strip() else ""
  254 + if name_str:
  255 + value = f"{name_str}:{value}"
  256 + return value.casefold()
  257 +
  258 + def _build_sku_query_texts(self, parsed_query: ParsedQuery) -> List[str]:
  259 + """Collect original and translated query texts for SKU option matching."""
  260 + candidates: List[str] = []
  261 + for text in (
  262 + getattr(parsed_query, "original_query", None),
  263 + getattr(parsed_query, "query_normalized", None),
  264 + getattr(parsed_query, "rewritten_query", None),
  265 + ):
  266 + normalized = self._normalize_sku_match_text(text)
  267 + if normalized:
  268 + candidates.append(normalized)
  269 +
  270 + query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {}
  271 + if isinstance(query_text_by_lang, dict):
  272 + for text in query_text_by_lang.values():
  273 + normalized = self._normalize_sku_match_text(text)
  274 + if normalized:
  275 + candidates.append(normalized)
  276 +
  277 + translations = getattr(parsed_query, "translations", {}) or {}
  278 + if isinstance(translations, dict):
  279 + for text in translations.values():
  280 + normalized = self._normalize_sku_match_text(text)
  281 + if normalized:
  282 + candidates.append(normalized)
  283 +
  284 + deduped: List[str] = []
  285 + seen = set()
  286 + for text in candidates:
  287 + if text in seen:
  288 + continue
  289 + seen.add(text)
  290 + deduped.append(text)
  291 + return deduped
  292 +
  293 + def _find_query_matching_sku_index(
  294 + self,
  295 + skus: List[Dict[str, Any]],
  296 + query_texts: List[str],
  297 + spu_option1_name: Optional[Any] = None,
  298 + ) -> Optional[int]:
  299 + """Return the first SKU whose option1_value (or name:value) appears in query texts."""
  300 + if not skus or not query_texts:
  301 + return None
  302 +
  303 + for index, sku in enumerate(skus):
  304 + option1_value = self._normalize_sku_match_text(sku.get("option1_value"))
  305 + if not option1_value:
  306 + continue
  307 + if any(option1_value in query_text for query_text in query_texts):
  308 + return index
  309 + embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)
  310 + if embed_key and embed_key != option1_value:
  311 + composite_norm = self._normalize_sku_match_text(embed_key.replace(":", " "))
  312 + if any(composite_norm in query_text for query_text in query_texts):
  313 + return index
  314 + if any(embed_key.casefold() in query_text for query_text in query_texts):
  315 + return index
  316 + return None
  317 +
  318 + def _encode_query_vector_for_sku_matching(
  319 + self,
  320 + parsed_query: ParsedQuery,
  321 + context: Optional[RequestContext] = None,
  322 + ) -> Optional[np.ndarray]:
  323 + """Best-effort fallback query embedding for final-page SKU matching."""
  324 + query_text = (
  325 + getattr(parsed_query, "rewritten_query", None)
  326 + or getattr(parsed_query, "query_normalized", None)
  327 + or getattr(parsed_query, "original_query", None)
  328 + )
  329 + if not query_text:
  330 + return None
  331 +
  332 + text_encoder = getattr(self.query_parser, "text_encoder", None)
  333 + if text_encoder is None:
  334 + return None
  335 +
  336 + try:
  337 + vectors = text_encoder.encode([query_text], priority=1)
  338 + except Exception as exc:
  339 + logger.warning("Failed to encode query vector for SKU matching: %s", exc, exc_info=True)
  340 + if context is not None:
  341 + context.add_warning(f"SKU query embedding failed: {exc}")
  342 + return None
  343 +
  344 + if vectors is None or len(vectors) == 0:
  345 + return None
  346 +
  347 + vector = vectors[0]
  348 + if vector is None:
  349 + return None
  350 + return np.asarray(vector, dtype=np.float32)
  351 +
  352 + def _select_sku_by_embedding(
  353 + self,
  354 + skus: List[Dict[str, Any]],
  355 + option1_vectors: Dict[str, np.ndarray],
  356 + query_vector: np.ndarray,
  357 + spu_option1_name: Optional[Any] = None,
  358 + ) -> Tuple[Optional[int], Optional[float]]:
  359 + """Select the SKU whose option1 embedding key (name:value) is most similar to the query."""
  360 + best_index: Optional[int] = None
  361 + best_score: Optional[float] = None
  362 +
  363 + for index, sku in enumerate(skus):
  364 + embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)
  365 + if not embed_key:
  366 + continue
  367 + option_vector = option1_vectors.get(embed_key)
  368 + if option_vector is None:
  369 + continue
  370 + score = float(np.inner(query_vector, option_vector))
  371 + if best_score is None or score > best_score:
  372 + best_index = index
  373 + best_score = score
  374 +
  375 + return best_index, best_score
  376 +
  377 + @staticmethod
  378 + def _promote_matching_sku(source: Dict[str, Any], match_index: int) -> Optional[Dict[str, Any]]:
  379 + """Move the matched SKU to the front and swap the SPU image."""
  380 + skus = source.get("skus")
  381 + if not isinstance(skus, list) or match_index < 0 or match_index >= len(skus):
  382 + return None
  383 +
  384 + matched_sku = skus.pop(match_index)
  385 + skus.insert(0, matched_sku)
  386 +
  387 + image_src = matched_sku.get("image_src") or matched_sku.get("imageSrc")
  388 + if image_src:
  389 + source["image_url"] = image_src
  390 + return matched_sku
  391 +
  392 + def _apply_sku_sorting_for_page_hits(
  393 + self,
  394 + es_hits: List[Dict[str, Any]],
  395 + parsed_query: ParsedQuery,
  396 + context: Optional[RequestContext] = None,
  397 + ) -> None:
  398 + """Sort each page hit's SKUs so the best-matching SKU is first."""
  399 + if not es_hits:
  400 + return
  401 +
  402 + query_texts = self._build_sku_query_texts(parsed_query)
  403 + unmatched_hits: List[Dict[str, Any]] = []
  404 + option1_values_to_encode: List[str] = []
  405 + seen_option1_values = set()
  406 + text_matched = 0
  407 + embedding_matched = 0
  408 +
  409 + for hit in es_hits:
  410 + source = hit.get("_source")
  411 + if not isinstance(source, dict):
  412 + continue
  413 + skus = source.get("skus")
  414 + if not isinstance(skus, list) or not skus:
  415 + continue
  416 +
  417 + spu_option1_name = source.get("option1_name")
  418 + match_index = self._find_query_matching_sku_index(
  419 + skus, query_texts, spu_option1_name=spu_option1_name
  420 + )
  421 + if match_index is not None:
  422 + self._promote_matching_sku(source, match_index)
  423 + text_matched += 1
  424 + continue
  425 +
  426 + unmatched_hits.append(hit)
  427 + for sku in skus:
  428 + embed_key = self._sku_option1_embedding_key(sku, spu_option1_name)
  429 + if not embed_key or embed_key in seen_option1_values:
  430 + continue
  431 + seen_option1_values.add(embed_key)
  432 + option1_values_to_encode.append(embed_key)
  433 +
  434 + if not unmatched_hits or not option1_values_to_encode:
  435 + return
  436 +
  437 + query_vector = getattr(parsed_query, "query_vector", None)
  438 + if query_vector is None:
  439 + query_vector = self._encode_query_vector_for_sku_matching(parsed_query, context=context)
  440 + if query_vector is None:
  441 + return
  442 +
  443 + text_encoder = getattr(self.query_parser, "text_encoder", None)
  444 + if text_encoder is None:
  445 + return
  446 +
  447 + try:
  448 + encoded_option_vectors = text_encoder.encode(option1_values_to_encode, priority=1)
  449 + except Exception as exc:
  450 + logger.warning("Failed to encode SKU option1 values for final-page sorting: %s", exc, exc_info=True)
  451 + if context is not None:
  452 + context.add_warning(f"SKU option embedding failed: {exc}")
  453 + return
  454 +
  455 + option1_vectors: Dict[str, np.ndarray] = {}
  456 + for option1_value, vector in zip(option1_values_to_encode, encoded_option_vectors):
  457 + if vector is None:
  458 + continue
  459 + option1_vectors[option1_value] = np.asarray(vector, dtype=np.float32)
  460 +
  461 + query_vector_array = np.asarray(query_vector, dtype=np.float32)
  462 + for hit in unmatched_hits:
  463 + source = hit.get("_source")
  464 + if not isinstance(source, dict):
  465 + continue
  466 + skus = source.get("skus")
  467 + if not isinstance(skus, list) or not skus:
  468 + continue
  469 + match_index, _ = self._select_sku_by_embedding(
  470 + skus,
  471 + option1_vectors,
  472 + query_vector_array,
  473 + spu_option1_name=source.get("option1_name"),
  474 + )
  475 + if match_index is None:
  476 + continue
  477 + self._promote_matching_sku(source, match_index)
  478 + embedding_matched += 1
  479 +
  480 + if text_matched or embedding_matched:
  481 + logger.info(
  482 + "Final-page SKU sorting completed | text_matched=%s | embedding_matched=%s",
  483 + text_matched,
  484 + embedding_matched,
  485 + )
  486 +
227 def search(
228 self,
229 query: str,
@@ -622,6 +882,8 @@ class Searcher:
622 continue
623 rerank_debug_by_doc[str(doc_id)] = item
624
885 + self._apply_sku_sorting_for_page_hits(es_hits, parsed_query, context=context)
886 +
625 # Format results using ResultFormatter
626 formatted_results = ResultFormatter.format_search_results(
627 es_hits,
@@ -791,7 +1053,7 @@ class Searcher:
791 # Generate image embedding
792 if self.image_encoder is None:
793 raise RuntimeError("Image encoder is not initialized at startup")
794 - image_vector = self.image_encoder.encode_image_from_url(image_url)
1056 + image_vector = self.image_encoder.encode_image_from_url(image_url, priority=1)
795
796 if image_vector is None:
797 raise ValueError(f"Failed to encode image: {image_url}")
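The embedding fallback driven by `_apply_sku_sorting_for_page_hits` / `_select_sku_by_embedding` above can be sketched roughly as follows. This is a minimal sketch under assumed semantics: the real method also folds in `spu_option1_name` when building the option text and skips SKUs whose option values failed to encode.

```python
from typing import Dict, List, Optional, Tuple

import numpy as np


def select_sku_by_embedding(
    skus: List[dict],
    option1_vectors: Dict[str, np.ndarray],
    query_vector: np.ndarray,
) -> Tuple[Optional[int], float]:
    """Pick the SKU whose option1_value embedding has the highest
    inner product with the query embedding (vectors assumed normalized)."""
    best_index: Optional[int] = None
    best_score = float("-inf")
    for index, sku in enumerate(skus):
        vector = option1_vectors.get(sku.get("option1_value"))
        if vector is None:
            # Option value was never encoded; leave this SKU out of the race.
            continue
        score = float(np.dot(vector, query_vector))
        if score > best_score:
            best_index, best_score = index, score
    return best_index, best_score
```

The winning index is then handed to `_promote_matching_sku`, which moves that SKU to the front of `skus` and swaps the SPU-level `image_url` for the SKU's `image_src`.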
tests/ci/test_service_api_contracts.py
@@ -540,7 +540,15 @@ def test_indexer_index_validation_max_delete_spu_ids(indexer_client: TestClient)
540
541
542 class _FakeTextModel:
543 - def encode_batch(self, texts, batch_size=32, device="cpu", normalize_embeddings=True):
543 + """Matches TEI / server path: `_text_model.encode(...)` (not encode_batch)."""
  544 +
  545 + def encode(
  546 + self,
  547 + texts,
  548 + batch_size=32,
  549 + device="cpu",
  550 + normalize_embeddings=True,
  551 + ):
544 return [np.array([0.1, 0.2, 0.3], dtype=np.float32) for _ in texts]
545
546
@@ -549,6 +557,18 @@ class _FakeImageModel:
549 return [np.array([0.3, 0.2, 0.1], dtype=np.float32) for _ in urls]
550
551
  560 +class _EmbeddingCacheMiss:
  561 + """Avoid Redis/module cache hits so contract tests exercise the encode path."""
  562 +
  563 + redis_client = None
  564 +
  565 + def get(self, key):
  566 + return None
  567 +
  568 + def set(self, key, value):
  569 + return True
  570 +
  571 +
552 @pytest.fixture
553 def embedding_module():
554 import embeddings.server as emb_server
@@ -556,17 +576,31 @@ def embedding_module():
556 emb_server.app.router.on_startup.clear()
557 emb_server._text_model = _FakeTextModel()
558 emb_server._image_model = _FakeImageModel()
  579 + emb_server._text_backend_name = "tei"
  580 + emb_server._text_cache = _EmbeddingCacheMiss()
  581 + emb_server._image_cache = _EmbeddingCacheMiss()
559 yield emb_server
560
561
562 def test_embedding_text_contract(embedding_module):
563 - data = embedding_module.embed_text(["hello", "world"])
586 + """Contract via HTTP like production; route handlers require Request/Response."""
  587 + from fastapi.testclient import TestClient
  588 +
  589 + with TestClient(embedding_module.app) as client:
  590 + resp = client.post("/embed/text", json=["hello", "world"])
  591 + assert resp.status_code == 200
  592 + data = resp.json()
564 assert len(data) == 2
565 assert len(data[0]) == 3
566
567
568 def test_embedding_image_contract(embedding_module):
569 - data = embedding_module.embed_image(["https://example.com/a.jpg"])
598 + from fastapi.testclient import TestClient
  599 +
  600 + with TestClient(embedding_module.app) as client:
  601 + resp = client.post("/embed/image", json=["https://example.com/a.jpg"])
  602 + assert resp.status_code == 200
  603 + data = resp.json()
570 assert len(data[0]) == 3
571
572
tests/test_embedding_pipeline.py
@@ -63,7 +63,11 @@ class _FakeTranslator:
63
64
65 class _FakeQueryEncoder:
  66 + def __init__(self):
  67 + self.calls = []
  68 +
66 def encode(self, sentences, **kwargs):
70 + self.calls.append({"sentences": sentences, "kwargs": dict(kwargs)})
67 if isinstance(sentences, str):
68 sentences = [sentences]
69 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object)
@@ -98,9 +102,7 @@ def _build_test_config() -> SearchConfig:
98 rerank=RerankConfig(),
99 spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3),
100 es_index_name="test_products",
101 - tenant_config={},
102 es_settings={},
103 - services={},
104 )
105
106
@@ -111,6 +113,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch):
111 def _fake_post(url, json, timeout, **kwargs):
112 assert url.endswith("/embed/text")
113 assert json == ["hello", "world"]
116 + assert kwargs["params"]["priority"] == 0
114 return _FakeResponse([[0.1, 0.2], [0.3, 0.4]])
115
116 monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post)
@@ -172,6 +175,7 @@ def test_image_embedding_encoder_cache_hit(monkeypatch):
172
173 def _fake_post(url, params, json, timeout, **kwargs):
174 calls["count"] += 1
178 + assert params["priority"] == 0
175 return _FakeResponse([[0.1, 0.2]])
176
177 monkeypatch.setattr("embeddings.image_encoder.requests.post", _fake_post)
@@ -184,16 +188,35 @@ def test_image_embedding_encoder_cache_hit(monkeypatch):
184 assert np.allclose(out[1], np.array([0.1, 0.2], dtype=np.float32))
185
186
  191 +def test_image_embedding_encoder_passes_priority(monkeypatch):
  192 + fake_cache = _FakeEmbeddingCache()
  193 + monkeypatch.setattr("embeddings.image_encoder.RedisEmbeddingCache", lambda **kwargs: fake_cache)
  194 +
  195 + def _fake_post(url, params, json, timeout, **kwargs):
  196 + assert params["priority"] == 1
  197 + return _FakeResponse([[0.1, 0.2]])
  198 +
  199 + monkeypatch.setattr("embeddings.image_encoder.requests.post", _fake_post)
  200 +
  201 + encoder = CLIPImageEncoder(service_url="http://127.0.0.1:6008")
  202 + out = encoder.encode_batch(["https://example.com/a.jpg"], priority=1)
  203 + assert len(out) == 1
  204 + assert np.allclose(out[0], np.array([0.1, 0.2], dtype=np.float32))
  205 +
  206 +
187 def test_query_parser_generates_query_vector_with_encoder():
208 + encoder = _FakeQueryEncoder()
188 parser = QueryParser(
189 config=_build_test_config(),
190 - text_encoder=_FakeQueryEncoder(),
211 + text_encoder=encoder,
191 translator=_FakeTranslator(),
192 )
193
194 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True)
195 assert parsed.query_vector is not None
196 assert parsed.query_vector.shape == (3,)
218 + assert encoder.calls
219 + assert encoder.calls[0]["kwargs"]["priority"] == 1
197
198
199 def test_query_parser_skips_query_vector_when_disabled():
tests/test_embedding_service_limits.py
@@ -69,6 +69,8 @@ def test_health_exposes_limit_stats(monkeypatch):
69
70
71 def test_embed_image_rejects_when_image_lane_is_full(monkeypatch):
  72 + # Ensure no cache hit (module-level Redis cache may contain this URL from other tests).
  73 + monkeypatch.setattr(embedding_server, "_image_cache", _FakeCache({}))
72 limiter = embedding_server._InflightLimiter("image", 1)
73 acquired, _ = limiter.try_acquire()
74 assert acquired is True
tests/test_embedding_service_priority.py 0 → 100644
@@ -0,0 +1,81 @@
  1 +import threading
  2 +
  3 +import embeddings.server as emb_server
  4 +
  5 +
  6 +def test_text_inflight_limiter_priority_bypass():
  7 + limiter = emb_server._InflightLimiter(name="text", limit=1)
  8 +
  9 + accepted, active = limiter.try_acquire()
  10 + assert accepted is True
  11 + assert active == 1
  12 +
  13 + accepted, active = limiter.try_acquire()
  14 + assert accepted is False
  15 + assert active == 1
  16 +
  17 + accepted, active = limiter.try_acquire(bypass_limit=True)
  18 + assert accepted is True
  19 + assert active == 2
  20 +
  21 + snapshot = limiter.snapshot()
  22 + assert snapshot["priority_bypass_total"] == 1
  23 +
  24 + limiter.release(success=True)
  25 + limiter.release(success=True)
  26 +
  27 +
  28 +def test_text_dispatch_prefers_high_priority_queue():
  29 + high_task = emb_server._TextDispatchTask(
  30 + normalized=["online"],
  31 + effective_normalize=True,
  32 + request_id="high",
  33 + priority=1,
  34 + created_at=0.0,
  35 + done=threading.Event(),
  36 + )
  37 + normal_task = emb_server._TextDispatchTask(
  38 + normalized=["offline"],
  39 + effective_normalize=True,
  40 + request_id="normal",
  41 + priority=0,
  42 + created_at=0.0,
  43 + done=threading.Event(),
  44 + )
  45 +
  46 + with emb_server._text_dispatch_cv:
  47 + emb_server._text_dispatch_high_queue.clear()
  48 + emb_server._text_dispatch_normal_queue.clear()
  49 + emb_server._text_dispatch_normal_queue.append(normal_task)
  50 + emb_server._text_dispatch_high_queue.append(high_task)
  51 +
  52 + first = emb_server._pop_text_dispatch_task_locked()
  53 + second = emb_server._pop_text_dispatch_task_locked()
  54 +
  55 + emb_server._text_dispatch_high_queue.clear()
  56 + emb_server._text_dispatch_normal_queue.clear()
  57 +
  58 + assert first is high_task
  59 + assert second is normal_task
  60 +
  61 +
  62 +def test_image_inflight_limiter_priority_bypass():
  63 + limiter = emb_server._InflightLimiter(name="image", limit=1)
  64 +
  65 + accepted, active = limiter.try_acquire()
  66 + assert accepted is True
  67 + assert active == 1
  68 +
  69 + accepted, active = limiter.try_acquire()
  70 + assert accepted is False
  71 + assert active == 1
  72 +
  73 + accepted, active = limiter.try_acquire(bypass_limit=True)
  74 + assert accepted is True
  75 + assert active == 2
  76 +
  77 + snapshot = limiter.snapshot()
  78 + assert snapshot["priority_bypass_total"] == 1
  79 +
  80 + limiter.release(success=True)
  81 + limiter.release(success=True)
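The dispatch behavior these tests pin down, `_pop_text_dispatch_task_locked` draining the high-priority queue before the normal one, can be sketched with plain deques. The module-level queue names below are hypothetical stand-ins; the real function is assumed to be called with the dispatch condition variable's lock held, which is why the sketch itself takes no lock.

```python
from collections import deque
from typing import Any, Optional

# Hypothetical stand-ins for emb_server's module-level dispatch queues.
high_queue: deque = deque()
normal_queue: deque = deque()


def pop_text_dispatch_task_locked() -> Optional[Any]:
    """Pop the next task, preferring the high-priority lane.

    Caller must already hold the dispatch lock; both deques are only
    touched under that lock, so no extra synchronization is needed here.
    """
    if high_queue:
        return high_queue.popleft()
    if normal_queue:
        return normal_queue.popleft()
    return None
```

Within each lane this keeps FIFO order, so a flood of priority-1 online requests cannot be reordered among themselves, while priority-0 offline work only runs when the high lane is empty.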
tests/test_es_query_builder.py
@@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries():
80 names = [clause["multi_match"]["_name"] for clause in should]
81
82 assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"]
  83 +
  84 +
  85 +def test_mixed_script_merges_en_fields_into_zh_clause():
  86 + qb = ESQueryBuilder(
  87 + match_fields=["title.en^3.0"],
  88 + multilingual_fields=["title", "brief"],
  89 + shared_fields=[],
  90 + text_embedding_field="title_embedding",
  91 + default_language="en",
  92 + )
  93 + parsed_query = SimpleNamespace(
  94 + query_text_by_lang={"zh": "法式 dress"},
  95 + search_langs=["zh"],
  96 + detected_language="zh",
  97 + source_in_index_languages=True,
  98 + index_languages=["zh", "en"],
  99 + contains_chinese=True,
  100 + contains_english=True,
  101 + )
  102 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  103 + fields = q["query"]["multi_match"]["fields"]
  104 + bases = {f.split("^", 1)[0] for f in fields}
  105 + assert "title.zh" in bases and "title.en" in bases
  106 + assert "brief.zh" in bases and "brief.en" in bases
  107 + # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)
  108 + assert "title.en^0.8" in fields
  109 + assert "brief.en^0.8" in fields
  110 +
  111 +
  112 +def test_mixed_script_merges_zh_fields_into_en_clause():
  113 + qb = ESQueryBuilder(
  114 + match_fields=["title.en^3.0"],
  115 + multilingual_fields=["title"],
  116 + shared_fields=[],
  117 + text_embedding_field="title_embedding",
  118 + default_language="en",
  119 + )
  120 + parsed_query = SimpleNamespace(
  121 + query_text_by_lang={"en": "red 连衣裙"},
  122 + search_langs=["en"],
  123 + detected_language="en",
  124 + source_in_index_languages=True,
  125 + index_languages=["zh", "en"],
  126 + contains_chinese=True,
  127 + contains_english=True,
  128 + )
  129 + q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)
  130 + fields = q["query"]["multi_match"]["fields"]
  131 + bases = {f.split("^", 1)[0] for f in fields}
  132 + assert "title.en" in bases and "title.zh" in bases
  133 + assert "title.zh^0.8" in fields
  134 +
  135 +
  136 +def test_mixed_script_merged_fields_scale_configured_boosts():
  137 + qb = ESQueryBuilder(
  138 + match_fields=["title.en^3.0"],
  139 + multilingual_fields=["title"],
  140 + shared_fields=[],
  141 + field_boosts={"title.zh": 5.0, "title.en": 10.0},
  142 + text_embedding_field="title_embedding",
  143 + default_language="en",
  144 + )
  145 + parsed_query = SimpleNamespace(
  146 + query_text_by_lang={"zh": "法式 dress"},
  147 + search_langs=["zh"],
  148 + detected_language="zh",
  149 + source_in_index_languages=True,
  150 + index_languages=["zh", "en"],
  151 + contains_chinese=True,
  152 + contains_english=True,
  153 + )
  154 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  155 + fields = q["query"]["multi_match"]["fields"]
  156 + assert "title.zh^5.0" in fields
  157 + assert "title.en^8.0" in fields # 10.0 * 0.8
  158 +
  159 +
  160 +def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
  161 + qb = ESQueryBuilder(
  162 + match_fields=["title.zh^3.0"],
  163 + multilingual_fields=["title"],
  164 + shared_fields=[],
  165 + text_embedding_field="title_embedding",
  166 + default_language="zh",
  167 + )
  168 + parsed_query = SimpleNamespace(
  169 + query_text_by_lang={"zh": "法式 dress"},
  170 + search_langs=["zh"],
  171 + detected_language="zh",
  172 + source_in_index_languages=True,
  173 + index_languages=["zh"],
  174 + contains_chinese=True,
  175 + contains_english=True,
  176 + )
  177 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  178 + fields = q["query"]["multi_match"]["fields"]
  179 + bases = {f.split("^", 1)[0] for f in fields}
  180 + assert "title.zh" in bases
  181 + assert "title.en" not in bases
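The `(field_path, boost)` tuple flow these tests cover can be sketched as below. Names are adapted from `_expand_match_field_specs_for_mixed_script` / `_format_match_field_specs` in the plan; this is a simplified sketch of the merge-then-format idea, not the production implementation.

```python
from typing import Dict, List, Tuple

# MatchFieldSpec keeps path and boost separate until the final ES format step,
# avoiding the old "build path^boost strings, then re-parse them" round-trip.
MatchFieldSpec = Tuple[str, float]


def expand_specs_for_mixed_script(
    specs: List[MatchFieldSpec],
    supplemental_lang: str,
    multilingual_fields: List[str],
    field_boosts: Dict[str, float],
    scale: float = 0.8,  # mixed_script_merged_field_boost_scale default
) -> List[MatchFieldSpec]:
    """Merge the supplemental language's columns with scaled-down boosts."""
    merged = list(specs)
    existing = {path for path, _ in specs}
    for base in multilingual_fields:
        path = f"{base}.{supplemental_lang}"
        if path in existing:
            continue  # clause already searches this column at full weight
        merged.append((path, field_boosts.get(path, 1.0) * scale))
    return merged


def format_specs(specs: List[MatchFieldSpec]) -> List[str]:
    """Render specs into ES multi_match `path^boost` strings."""
    return [f"{path}^{boost}" for path, boost in specs]
```

With `field_boosts={"title.en": 10.0}`, merging `en` into a `zh` clause yields `title.en` at `10.0 * 0.8 = 8.0`, matching the `title.en^8.0` expectation in the tests above.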
tests/test_query_parser_mixed_language.py
@@ -9,6 +9,14 @@ class _DummyTranslator:
9 return f"{text}-{target_lang}"
10
11
  12 +def test_pure_english_word_token_length_and_script():
  13 + assert QueryParser._is_pure_english_word_token("ab") is False
  14 + assert QueryParser._is_pure_english_word_token("abc") is True
  15 + assert QueryParser._is_pure_english_word_token("wi-fi") is True
  16 + assert QueryParser._is_pure_english_word_token("连衣裙") is False
  17 + assert QueryParser._is_pure_english_word_token("ab12") is False
  18 +
  19 +
12 def _build_config() -> SearchConfig:
13 return SearchConfig(
14 es_index_name="test_products",
@@ -38,8 +46,11 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo
38 result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
39
40 assert result.detected_language == "zh"
49 + assert result.contains_chinese is True
50 + assert result.contains_english is True
41 assert "en" in result.search_langs
42 - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙"
52 + # When translation finishes within budget, the target-language field gets the translated text (better than probing supplemental fields with the original text only).
53 + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
43 assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
44
45
@@ -55,6 +66,28 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
55 result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
56
57 assert result.detected_language == "en"
69 + assert result.contains_chinese is True
70 + assert result.contains_english is True
58 assert "zh" in result.search_langs
59 - assert result.query_text_by_lang["zh"] == "red 连衣裙"
72 + assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
60 assert result.query_text_by_lang["en"] == "red 连衣裙"
  74 +
  75 +
  76 +def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
77 + """When en is in index_languages, parsing should still wait for and adopt the en->zh translation (sharing the time budget with vector generation)."""
  78 + parser = QueryParser(_build_config(), translator=_DummyTranslator())
  79 + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
  80 + monkeypatch.setattr(
  81 + "query.query_parser.get_tenant_config_loader",
  82 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
  83 + raising=False,
  84 + )
  85 +
  86 + result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False)
  87 +
  88 + assert result.detected_language == "en"
  89 + assert result.contains_chinese is False
  90 + assert result.contains_english is True
  91 + assert result.translations.get("zh") == "off shoulder top-zh"
  92 + assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
  93 + assert result.source_in_index_languages is True
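A plausible shape for `_is_pure_english_word_token`, consistent with the expectations in `test_pure_english_word_token_length_and_script` above (fullmatch on ASCII letters with optional hyphen joins, minimum length 3). The exact regex in query_parser.py may differ; this is an illustrative reconstruction.

```python
import re

# ASCII-letter runs, optionally joined by single hyphens ("wi-fi").
_EN_WORD_RE = re.compile(r"[A-Za-z]+(?:-[A-Za-z]+)*")


def is_pure_english_word_token(token: str) -> bool:
    """True for meaningful pure-English tokens: letters/hyphens only, len >= 3.

    Digits ("ab12") and CJK text fail the fullmatch; short tokens ("ab")
    fail the length check, keeping noise like stray initials out of the
    contains_english signal.
    """
    return len(token) >= 3 and _EN_WORD_RE.fullmatch(token) is not None
```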
tests/test_search_rerank_window.py
@@ -5,6 +5,7 @@ from pathlib import Path
5 from types import SimpleNamespace
6 from typing import Any, Dict, List
7
8 + import numpy as np
8 import yaml
9
10 from config import (
@@ -157,9 +158,7 @@ def _build_search_config(*, rerank_enabled: bool = True, rerank_window: int = 38
157 rerank=RerankConfig(enabled=rerank_enabled, rerank_window=rerank_window),
158 spu_config=SPUConfig(enabled=False),
159 es_index_name="test_products",
160 - tenant_config={},
161 es_settings={},
162 - services={},
163 )
164
165
@@ -173,6 +172,19 @@ def _build_searcher(config: SearchConfig, es_client: _FakeESClient) -> Searcher:
173 return searcher
174
175
  175 +class _FakeTextEncoder:
  176 + def __init__(self, vectors: Dict[str, List[float]]):
  177 + self.vectors = {
  178 + key: np.array(value, dtype=np.float32)
  179 + for key, value in vectors.items()
  180 + }
  181 +
  182 + def encode(self, sentences, priority: int = 0, **kwargs):
  183 + if isinstance(sentences, str):
  184 + sentences = [sentences]
  185 + return np.array([self.vectors[text] for text in sentences], dtype=object)
  186 +
  187 +
176 def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
177 config_data = {
178 "es_index_name": "test_products",
@@ -327,3 +339,118 @@ def test_searcher_skips_rerank_when_page_exceeds_window(monkeypatch):
327 assert es_client.calls[0]["size"] == 10
328 assert es_client.calls[0]["include_named_queries_score"] is False
329 assert len(es_client.calls) == 1
  342 +
  343 +
  344 +def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch):
  345 + es_client = _FakeESClient(total_hits=1)
  346 + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
  347 + context = create_request_context(reqid="sku-text", uid="u-sku-text")
  348 +
  349 + monkeypatch.setattr(
  350 + "search.searcher.get_tenant_config_loader",
  351 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
  352 + )
  353 +
  354 + class _TranslatedQueryParser:
  355 + text_encoder = None
  356 +
  357 + def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  358 + return _FakeParsedQuery(
  359 + original_query=query,
  360 + query_normalized=query,
  361 + rewritten_query=query,
  362 + translations={"en": "black dress"},
  363 + )
  364 +
  365 + searcher.query_parser = _TranslatedQueryParser()
  366 +
  367 + def _full_source_with_skus(doc_id: str) -> Dict[str, Any]:
  368 + return {
  369 + "spu_id": doc_id,
  370 + "title": {"en": f"product-{doc_id}"},
  371 + "brief": {"en": f"brief-{doc_id}"},
  372 + "vendor": {"en": f"vendor-{doc_id}"},
  373 + "option1_name": "Color",
  374 + "image_url": "https://img/default.jpg",
  375 + "skus": [
  376 + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"},
  377 + {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"},
  378 + ],
  379 + }
  380 +
  381 + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus))
  382 +
  383 + result = searcher.search(
  384 + query="黑色 连衣裙",
  385 + tenant_id="162",
  386 + from_=0,
  387 + size=1,
  388 + context=context,
  389 + enable_rerank=False,
  390 + )
  391 +
  392 + assert len(result.results) == 1
  393 + assert result.results[0].skus[0].sku_id == "sku-black"
  394 + assert result.results[0].image_url == "https://img/black.jpg"
  395 +
  396 +
  397 +def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch):
  398 + es_client = _FakeESClient(total_hits=1)
  399 + searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
  400 + context = create_request_context(reqid="sku-embed", uid="u-sku-embed")
  401 +
  402 + monkeypatch.setattr(
  403 + "search.searcher.get_tenant_config_loader",
  404 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}),
  405 + )
  406 +
  407 + encoder = _FakeTextEncoder(
  408 + {
  409 + "linen summer dress": [0.8, 0.2],
  410 + "color:Red": [1.0, 0.0],
  411 + "color:Blue": [0.0, 1.0],
  412 + }
  413 + )
  414 +
  415 + class _EmbeddingQueryParser:
  416 + text_encoder = encoder
  417 +
  418 + def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  419 + return _FakeParsedQuery(
  420 + original_query=query,
  421 + query_normalized=query,
  422 + rewritten_query=query,
  423 + translations={},
  424 + query_vector=np.array([0.0, 1.0], dtype=np.float32),
  425 + )
  426 +
  427 + searcher.query_parser = _EmbeddingQueryParser()
  428 +
  429 + def _full_source_with_skus(doc_id: str) -> Dict[str, Any]:
  430 + return {
  431 + "spu_id": doc_id,
  432 + "title": {"en": f"product-{doc_id}"},
  433 + "brief": {"en": f"brief-{doc_id}"},
  434 + "vendor": {"en": f"vendor-{doc_id}"},
  435 + "option1_name": "Color",
  436 + "image_url": "https://img/default.jpg",
  437 + "skus": [
  438 + {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"},
  439 + {"sku_id": "sku-blue", "option1_value": "Blue", "image_src": "https://img/blue.jpg"},
  440 + ],
  441 + }
  442 +
  443 + monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus))
  444 +
  445 + result = searcher.search(
  446 + query="linen summer dress",
  447 + tenant_id="162",
  448 + from_=0,
  449 + size=1,
  450 + context=context,
  451 + enable_rerank=False,
  452 + )
  453 +
  454 + assert len(result.results) == 1
  455 + assert result.results[0].skus[0].sku_id == "sku-blue"
  456 + assert result.results[0].image_url == "https://img/blue.jpg"