query翻译，根据源语言是否在索引语言中区分配置

tangwang
1 parent 0536222c
Showing 4 changed files with 114 additions and 16 deletions Show diff stats
config/config.yaml
config/loader.py
config/schema.py
query/query_parser.py
 # Unified Configuration for Multi-Tenant Search Engine
 # 统一配置文件，所有租户共用一套配置
 # 注意：索引结构由 mappings/search_products.json 定义，此文件只配置搜索行为
+#
+# 约定：下列键为必填；进程环境变量可覆盖 infrastructure / runtime 中同名语义项
+#（如 ES_HOST、API_PORT 等），未设置环境变量时使用本文件中的值。
+
+# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖下面前两项 environment、index_namespace 的语义)
+runtime:
+  environment: "prod"
+  index_namespace: ""
+  api_host: "0.0.0.0"
+  api_port: 6002
+  indexer_host: "0.0.0.0"
+  indexer_port: 6004
+  embedding_host: "0.0.0.0"
+  embedding_port: 6005
+  embedding_text_port: 6005
+  embedding_image_port: 6008
+  translator_host: "127.0.0.1"
+  translator_port: 6006
+  reranker_host: "127.0.0.1"
+  reranker_port: 6007
+
+# 基础设施连接（敏感项优先读环境变量：ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY）
+infrastructure:
+  elasticsearch:
+    host: "http://localhost:9200"
+    username: null
+    password: null
+  redis:
+    host: "localhost"
+    port: 6479
+    snapshot_db: 0
+    password: null
+    socket_timeout: 1
+    socket_connect_timeout: 1
+    retry_on_timeout: false
+    cache_expire_days: 720
+    embedding_cache_prefix: "embedding"
+    anchor_cache_prefix: "product_anchors"
+    anchor_cache_expire_days: 30
+  database:
+    host: null
+    port: 3306
+    database: null
+    username: null
+    password: null
+  secrets:
+    dashscope_api_key: null
+    deepl_auth_key: null
  
 # Elasticsearch Index
 es_index_name: "search_products"
  
+# 检索域 / 索引列表（可为空列表；每项字段均需显式给出）
+indexes: []
+
 # Config assets
 assets:
   query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
@@ -47,10 +98,23 @@ query_config:
   enable_text_embedding: true
   enable_query_rewrite: true
  
+  # 查询翻译模型（须与 services.translation.capabilities 中某项一致）
+  # 源语种在租户 index_languages 内：主召回可打在源语种字段，用下面三项。
+  # zh_to_en_model: "opus-mt-zh-en"
+  # en_to_zh_model: "opus-mt-en-zh"
+  # default_translation_model: "nllb-200-distilled-600m"
+  zh_to_en_model: "deepl"
+  en_to_zh_model: "deepl"
+  default_translation_model: "deepl"
+  # 源语种不在 index_languages：翻译对可检索文本更关键，可单独指定（缺省则与上一组相同）
+  zh_to_en_model__source_not_in_index: "deepl"
+  en_to_zh_model__source_not_in_index: "deepl"
+  default_translation_model__source_not_in_index: "deepl"
+
   # 查询解析阶段：翻译与 query 向量并发执行，共用同一等待预算（毫秒）。
   # 检测语言已在租户 index_languages 内：较短；不在索引语言内：较长（翻译对召回更关键）。
-  translation_embedding_wait_budget_ms_source_in_index: 80
-  translation_embedding_wait_budget_ms_source_not_in_index: 200
+  translation_embedding_wait_budget_ms_source_in_index: 500 # previous value: 80
+  translation_embedding_wait_budget_ms_source_not_in_index: 500 # previous value: 200
  
   # 动态多语言检索字段配置
   # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式；
@@ -78,7 +142,7 @@ query_config:
   text_query_strategy:
     base_minimum_should_match: "75%"
     translation_minimum_should_match: "75%"
-    translation_boost: 0.4
+    translation_boost: 0.6
     tie_breaker_base_query: 0.9
  
   # Embedding字段名称
@@ -290,6 +290,24 @@ class AppConfigLoader:
             default_translation_model=str(
                 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
             ),
+            zh_to_en_model_source_not_in_index=(
+                str(v)
+                if (v := query_cfg.get("zh_to_en_model__source_not_in_index"))
+                not in (None, "")
+                else None
+            ),
+            en_to_zh_model_source_not_in_index=(
+                str(v)
+                if (v := query_cfg.get("en_to_zh_model__source_not_in_index"))
+                not in (None, "")
+                else None
+            ),
+            default_translation_model_source_not_in_index=(
+                str(v)
+                if (v := query_cfg.get("default_translation_model__source_not_in_index"))
+                not in (None, "")
+                else None
+            ),
             translation_embedding_wait_budget_ms_source_in_index=int(
                 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
             ),
@@ -58,6 +58,10 @@ class QueryConfig:
     zh_to_en_model: str = "opus-mt-zh-en"
     en_to_zh_model: str = "opus-mt-en-zh"
     default_translation_model: str = "nllb-200-distilled-600m"
+    # 检测语种不在租户 index_languages（无可直接命中的多语字段）时使用；None 表示与上一组同模型。
+    zh_to_en_model_source_not_in_index: Optional[str] = None
+    en_to_zh_model_source_not_in_index: Optional[str] = None
+    default_translation_model_source_not_in_index: Optional[str] = None
     # 查询阶段：翻译与向量生成并发提交后，共用同一等待预算（毫秒）。
     # 检测语言已在租户 index_languages 内：偏快返回，预算较短。
     # 检测语言不在 index_languages 内：翻译对召回更关键，预算较长。
@@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
  
 logger = logging.getLogger(__name__)
  
-try:
-    import hanlp  # type: ignore
-except Exception:  # pragma: no cover
-    hanlp = None
+import hanlp  # type: ignore
  
  
 def simple_tokenize_query(text: str) -> List[str]:
@@ -140,20 +137,29 @@ class QueryParser:
         return tokenizer
  
     @staticmethod
-    def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str:
+    def _pick_query_translation_model(
+        source_lang: str,
+        target_lang: str,
+        config: SearchConfig,
+        source_language_in_index: bool,
+    ) -> str:
         """Pick the translation capability for query-time translation (configurable)."""
         src = str(source_lang or "").strip().lower()
         tgt = str(target_lang or "").strip().lower()
+        qc = config.query_config
+
+        if source_language_in_index:
+            if src == "zh" and tgt == "en":
+                return qc.zh_to_en_model
+            if src == "en" and tgt == "zh":
+                return qc.en_to_zh_model
+            return qc.default_translation_model
  
-        # Use dedicated models for zh<->en if configured
         if src == "zh" and tgt == "en":
-            return config.query_config.zh_to_en_model
+            return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model
         if src == "en" and tgt == "zh":
-            return config.query_config.en_to_zh_model
-
-        # For any other language pairs, fall back to the configurable default model.
-        # By default this is `nllb-200-distilled-600m` (multi-lingual local model).
-        return config.query_config.default_translation_model
+            return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model
+        return qc.default_translation_model_source_not_in_index or qc.default_translation_model
  
     @staticmethod
     def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
@@ -301,6 +307,7 @@ class QueryParser:
         detected_norm = str(detected_lang or "").strip().lower()
         normalized_targets = self._normalize_language_codes(target_languages)
         translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
+        source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
  
         # Stage 6: Text embedding - async execution
         query_vector = None
@@ -319,7 +326,12 @@ class QueryParser:
         try:
             if async_executor is not None:
                 for lang in translation_targets:
-                    model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
+                    model_name = self._pick_query_translation_model(
+                        detected_lang,
+                        lang,
+                        self.config,
+                        source_language_in_index,
+                    )
                     log_debug(
                         f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
                     )