Commit 86d0e83d8467addf30934dff4dcd6d60a5b372a0

Authored by tangwang
1 parent 0536222c

query翻译,根据源语言是否在索引语言中区分配置

config/config.yaml
1 # Unified Configuration for Multi-Tenant Search Engine 1 # Unified Configuration for Multi-Tenant Search Engine
2 # 统一配置文件,所有租户共用一套配置 2 # 统一配置文件,所有租户共用一套配置
3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键均为必填;进程环境变量可覆盖 infrastructure / runtime 中语义对应的配置项
  6 +# (如 ES_HOST、API_PORT 等);未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖下方 environment、index_namespace 两项)
  9 +runtime:
  10 + environment: "prod"
  11 + index_namespace: ""
  12 + api_host: "0.0.0.0"
  13 + api_port: 6002
  14 + indexer_host: "0.0.0.0"
  15 + indexer_port: 6004
  16 + embedding_host: "0.0.0.0"
  17 + embedding_port: 6005
  18 + embedding_text_port: 6005
  19 + embedding_image_port: 6008
  20 + translator_host: "127.0.0.1"
  21 + translator_port: 6006
  22 + reranker_host: "127.0.0.1"
  23 + reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
  26 +infrastructure:
  27 + elasticsearch:
  28 + host: "http://localhost:9200"
  29 + username: null
  30 + password: null
  31 + redis:
  32 + host: "localhost"
  33 + port: 6479
  34 + snapshot_db: 0
  35 + password: null
  36 + socket_timeout: 1
  37 + socket_connect_timeout: 1
  38 + retry_on_timeout: false
  39 + cache_expire_days: 720
  40 + embedding_cache_prefix: "embedding"
  41 + anchor_cache_prefix: "product_anchors"
  42 + anchor_cache_expire_days: 30
  43 + database:
  44 + host: null
  45 + port: 3306
  46 + database: null
  47 + username: null
  48 + password: null
  49 + secrets:
  50 + dashscope_api_key: null
  51 + deepl_auth_key: null
4 52
5 # Elasticsearch Index 53 # Elasticsearch Index
6 es_index_name: "search_products" 54 es_index_name: "search_products"
7 55
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  57 +indexes: []
  58 +
8 # Config assets 59 # Config assets
9 assets: 60 assets:
10 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" 61 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
@@ -47,10 +98,23 @@ query_config: @@ -47,10 +98,23 @@ query_config:
47 enable_text_embedding: true 98 enable_text_embedding: true
48 enable_query_rewrite: true 99 enable_query_rewrite: true
49 100
  101 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  102 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  103 + # zh_to_en_model: "opus-mt-zh-en"
  104 + # en_to_zh_model: "opus-mt-en-zh"
  105 + # default_translation_model: "nllb-200-distilled-600m"
  106 + zh_to_en_model: "deepl"
  107 + en_to_zh_model: "deepl"
  108 + default_translation_model: "deepl"
  109 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
  110 + zh_to_en_model__source_not_in_index: "deepl"
  111 + en_to_zh_model__source_not_in_index: "deepl"
  112 + default_translation_model__source_not_in_index: "deepl"
  113 +
50 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 114 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
51 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 115 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
52 - translation_embedding_wait_budget_ms_source_in_index: 80  
53 - translation_embedding_wait_budget_ms_source_not_in_index: 200 116 + translation_embedding_wait_budget_ms_source_in_index: 500  # 原值 80
  117 + translation_embedding_wait_budget_ms_source_not_in_index: 500  # 原值 200
54 118
55 # 动态多语言检索字段配置 119 # 动态多语言检索字段配置
56 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; 120 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
@@ -78,7 +142,7 @@ query_config: @@ -78,7 +142,7 @@ query_config:
78 text_query_strategy: 142 text_query_strategy:
79 base_minimum_should_match: "75%" 143 base_minimum_should_match: "75%"
80 translation_minimum_should_match: "75%" 144 translation_minimum_should_match: "75%"
81 - translation_boost: 0.4 145 + translation_boost: 0.6
82 tie_breaker_base_query: 0.9 146 tie_breaker_base_query: 0.9
83 147
84 # Embedding字段名称 148 # Embedding字段名称
@@ -290,6 +290,24 @@ class AppConfigLoader: @@ -290,6 +290,24 @@ class AppConfigLoader:
290 default_translation_model=str( 290 default_translation_model=str(
291 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" 291 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
292 ), 292 ),
  293 + zh_to_en_model_source_not_in_index=(
  294 + str(v)
  295 + if (v := query_cfg.get("zh_to_en_model__source_not_in_index"))
  296 + not in (None, "")
  297 + else None
  298 + ),
  299 + en_to_zh_model_source_not_in_index=(
  300 + str(v)
  301 + if (v := query_cfg.get("en_to_zh_model__source_not_in_index"))
  302 + not in (None, "")
  303 + else None
  304 + ),
  305 + default_translation_model_source_not_in_index=(
  306 + str(v)
  307 + if (v := query_cfg.get("default_translation_model__source_not_in_index"))
  308 + not in (None, "")
  309 + else None
  310 + ),
293 translation_embedding_wait_budget_ms_source_in_index=int( 311 translation_embedding_wait_budget_ms_source_in_index=int(
294 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) 312 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
295 ), 313 ),
@@ -58,6 +58,10 @@ class QueryConfig: @@ -58,6 +58,10 @@ class QueryConfig:
58 zh_to_en_model: str = "opus-mt-zh-en" 58 zh_to_en_model: str = "opus-mt-zh-en"
59 en_to_zh_model: str = "opus-mt-en-zh" 59 en_to_zh_model: str = "opus-mt-en-zh"
60 default_translation_model: str = "nllb-200-distilled-600m" 60 default_translation_model: str = "nllb-200-distilled-600m"
  61 + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。
  62 + zh_to_en_model_source_not_in_index: Optional[str] = None
  63 + en_to_zh_model_source_not_in_index: Optional[str] = None
  64 + default_translation_model_source_not_in_index: Optional[str] = None
61 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 65 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。
62 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 66 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。
63 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 67 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
query/query_parser.py
@@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer @@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
23 23
24 logger = logging.getLogger(__name__) 24 logger = logging.getLogger(__name__)
25 25
26 -try:  
27 - import hanlp # type: ignore  
28 -except Exception: # pragma: no cover  
29 - hanlp = None 26 +import hanlp # type: ignore
30 27
31 28
32 def simple_tokenize_query(text: str) -> List[str]: 29 def simple_tokenize_query(text: str) -> List[str]:
@@ -140,20 +137,29 @@ class QueryParser: @@ -140,20 +137,29 @@ class QueryParser:
140 return tokenizer 137 return tokenizer
141 138
142 @staticmethod 139 @staticmethod
143 - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: 140 + def _pick_query_translation_model(
  141 + source_lang: str,
  142 + target_lang: str,
  143 + config: SearchConfig,
  144 + source_language_in_index: bool,
  145 + ) -> str:
144 """Pick the translation capability for query-time translation (configurable).""" 146 """Pick the translation capability for query-time translation (configurable)."""
145 src = str(source_lang or "").strip().lower() 147 src = str(source_lang or "").strip().lower()
146 tgt = str(target_lang or "").strip().lower() 148 tgt = str(target_lang or "").strip().lower()
  149 + qc = config.query_config
  150 +
  151 + if source_language_in_index:
  152 + if src == "zh" and tgt == "en":
  153 + return qc.zh_to_en_model
  154 + if src == "en" and tgt == "zh":
  155 + return qc.en_to_zh_model
  156 + return qc.default_translation_model
147 157
148 - # Use dedicated models for zh<->en if configured  
149 if src == "zh" and tgt == "en": 158 if src == "zh" and tgt == "en":
150 - return config.query_config.zh_to_en_model 159 + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model
151 if src == "en" and tgt == "zh": 160 if src == "en" and tgt == "zh":
152 - return config.query_config.en_to_zh_model  
153 -  
154 - # For any other language pairs, fall back to the configurable default model.  
155 - # By default this is `nllb-200-distilled-600m` (multi-lingual local model).  
156 - return config.query_config.default_translation_model 161 + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model
  162 + return qc.default_translation_model_source_not_in_index or qc.default_translation_model
157 163
158 @staticmethod 164 @staticmethod
159 def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: 165 def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
@@ -301,6 +307,7 @@ class QueryParser: @@ -301,6 +307,7 @@ class QueryParser:
301 detected_norm = str(detected_lang or "").strip().lower() 307 detected_norm = str(detected_lang or "").strip().lower()
302 normalized_targets = self._normalize_language_codes(target_languages) 308 normalized_targets = self._normalize_language_codes(target_languages)
303 translation_targets = [lang for lang in normalized_targets if lang != detected_norm] 309 translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  310 + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
304 311
305 # Stage 6: Text embedding - async execution 312 # Stage 6: Text embedding - async execution
306 query_vector = None 313 query_vector = None
@@ -319,7 +326,12 @@ class QueryParser: @@ -319,7 +326,12 @@ class QueryParser:
319 try: 326 try:
320 if async_executor is not None: 327 if async_executor is not None:
321 for lang in translation_targets: 328 for lang in translation_targets:
322 - model_name = self._pick_query_translation_model(detected_lang, lang, self.config) 329 + model_name = self._pick_query_translation_model(
  330 + detected_lang,
  331 + lang,
  332 + self.config,
  333 + source_language_in_index,
  334 + )
323 log_debug( 335 log_debug(
324 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" 336 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
325 ) 337 )