Commit 86d0e83d8467addf30934dff4dcd6d60a5b372a0

Authored by tangwang
1 parent 0536222c

query翻译,根据源语言是否在索引语言中区分配置

config/config.yaml
1 1 # Unified Configuration for Multi-Tenant Search Engine
2 2 # 统一配置文件,所有租户共用一套配置
3 3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
  9 +runtime:
  10 + environment: "prod"
  11 + index_namespace: ""
  12 + api_host: "0.0.0.0"
  13 + api_port: 6002
  14 + indexer_host: "0.0.0.0"
  15 + indexer_port: 6004
  16 + embedding_host: "0.0.0.0"
  17 + embedding_port: 6005
  18 + embedding_text_port: 6005
  19 + embedding_image_port: 6008
  20 + translator_host: "127.0.0.1"
  21 + translator_port: 6006
  22 + reranker_host: "127.0.0.1"
  23 + reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
  26 +infrastructure:
  27 + elasticsearch:
  28 + host: "http://localhost:9200"
  29 + username: null
  30 + password: null
  31 + redis:
  32 + host: "localhost"
  33 + port: 6479
  34 + snapshot_db: 0
  35 + password: null
  36 + socket_timeout: 1
  37 + socket_connect_timeout: 1
  38 + retry_on_timeout: false
  39 + cache_expire_days: 720
  40 + embedding_cache_prefix: "embedding"
  41 + anchor_cache_prefix: "product_anchors"
  42 + anchor_cache_expire_days: 30
  43 + database:
  44 + host: null
  45 + port: 3306
  46 + database: null
  47 + username: null
  48 + password: null
  49 + secrets:
  50 + dashscope_api_key: null
  51 + deepl_auth_key: null
4 52  
5 53 # Elasticsearch Index
6 54 es_index_name: "search_products"
7 55  
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  57 +indexes: []
  58 +
8 59 # Config assets
9 60 assets:
10 61 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
... ... @@ -47,10 +98,23 @@ query_config:
47 98 enable_text_embedding: true
48 99 enable_query_rewrite: true
49 100  
  101 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  102 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,使用下方未注释的三项配置(紧随其后被注释掉的三行是可选的替代取值)。
  103 + # zh_to_en_model: "opus-mt-zh-en"
  104 + # en_to_zh_model: "opus-mt-en-zh"
  105 + # default_translation_model: "nllb-200-distilled-600m"
  106 + zh_to_en_model: "deepl"
  107 + en_to_zh_model: "deepl"
  108 + default_translation_model: "deepl"
  109 + # 源语种不在 index_languages:查询只能依靠译文命中索引字段,翻译质量更关键,可单独指定(缺省或留空则回退到上一组对应项)
  110 + zh_to_en_model__source_not_in_index: "deepl"
  111 + en_to_zh_model__source_not_in_index: "deepl"
  112 + default_translation_model__source_not_in_index: "deepl"
  113 +
50 114 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
51 115 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
52   - translation_embedding_wait_budget_ms_source_in_index: 80
53   - translation_embedding_wait_budget_ms_source_not_in_index: 200
  116 + translation_embedding_wait_budget_ms_source_in_index: 500  # 原值: 80
  117 + translation_embedding_wait_budget_ms_source_not_in_index: 500  # 原值: 200
54 118  
55 119 # 动态多语言检索字段配置
56 120 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
... ... @@ -78,7 +142,7 @@ query_config:
78 142 text_query_strategy:
79 143 base_minimum_should_match: "75%"
80 144 translation_minimum_should_match: "75%"
81   - translation_boost: 0.4
  145 + translation_boost: 0.6
82 146 tie_breaker_base_query: 0.9
83 147  
84 148 # Embedding字段名称
... ...
config/loader.py
... ... @@ -290,6 +290,24 @@ class AppConfigLoader:
290 290 default_translation_model=str(
291 291 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
292 292 ),
  293 + zh_to_en_model_source_not_in_index=(
  294 + str(v)
  295 + if (v := query_cfg.get("zh_to_en_model__source_not_in_index"))
  296 + not in (None, "")
  297 + else None
  298 + ),
  299 + en_to_zh_model_source_not_in_index=(
  300 + str(v)
  301 + if (v := query_cfg.get("en_to_zh_model__source_not_in_index"))
  302 + not in (None, "")
  303 + else None
  304 + ),
  305 + default_translation_model_source_not_in_index=(
  306 + str(v)
  307 + if (v := query_cfg.get("default_translation_model__source_not_in_index"))
  308 + not in (None, "")
  309 + else None
  310 + ),
293 311 translation_embedding_wait_budget_ms_source_in_index=int(
294 312 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
295 313 ),
... ...
config/schema.py
... ... @@ -58,6 +58,10 @@ class QueryConfig:
58 58 zh_to_en_model: str = "opus-mt-zh-en"
59 59 en_to_zh_model: str = "opus-mt-en-zh"
60 60 default_translation_model: str = "nllb-200-distilled-600m"
  61 + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。
  62 + zh_to_en_model_source_not_in_index: Optional[str] = None
  63 + en_to_zh_model_source_not_in_index: Optional[str] = None
  64 + default_translation_model_source_not_in_index: Optional[str] = None
61 65 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。
62 66 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。
63 67 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
... ...
query/query_parser.py
... ... @@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
23 23  
24 24 logger = logging.getLogger(__name__)
25 25  
26   -try:
27   - import hanlp # type: ignore
28   -except Exception: # pragma: no cover
29   - hanlp = None
  26 +import hanlp # type: ignore
30 27  
31 28  
32 29 def simple_tokenize_query(text: str) -> List[str]:
... ... @@ -140,20 +137,29 @@ class QueryParser:
140 137 return tokenizer
141 138  
142 139 @staticmethod
143   - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str:
  140 + def _pick_query_translation_model(
  141 + source_lang: str,
  142 + target_lang: str,
  143 + config: SearchConfig,
  144 + source_language_in_index: bool,
  145 + ) -> str:
144 146 """Pick the translation capability for query-time translation (configurable)."""
145 147 src = str(source_lang or "").strip().lower()
146 148 tgt = str(target_lang or "").strip().lower()
  149 + qc = config.query_config
  150 +
  151 + if source_language_in_index:
  152 + if src == "zh" and tgt == "en":
  153 + return qc.zh_to_en_model
  154 + if src == "en" and tgt == "zh":
  155 + return qc.en_to_zh_model
  156 + return qc.default_translation_model
147 157  
148   - # Use dedicated models for zh<->en if configured
149 158 if src == "zh" and tgt == "en":
150   - return config.query_config.zh_to_en_model
  159 + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model
151 160 if src == "en" and tgt == "zh":
152   - return config.query_config.en_to_zh_model
153   -
154   - # For any other language pairs, fall back to the configurable default model.
155   - # By default this is `nllb-200-distilled-600m` (multi-lingual local model).
156   - return config.query_config.default_translation_model
  161 + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model
  162 + return qc.default_translation_model_source_not_in_index or qc.default_translation_model
157 163  
158 164 @staticmethod
159 165 def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
... ... @@ -301,6 +307,7 @@ class QueryParser:
301 307 detected_norm = str(detected_lang or "").strip().lower()
302 308 normalized_targets = self._normalize_language_codes(target_languages)
303 309 translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  310 + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
304 311  
305 312 # Stage 6: Text embedding - async execution
306 313 query_vector = None
... ... @@ -319,7 +326,12 @@ class QueryParser:
319 326 try:
320 327 if async_executor is not None:
321 328 for lang in translation_targets:
322   - model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
  329 + model_name = self._pick_query_translation_model(
  330 + detected_lang,
  331 + lang,
  332 + self.config,
  333 + source_language_in_index,
  334 + )
323 335 log_debug(
324 336 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
325 337 )
... ...