Commit 86d0e83d8467addf30934dff4dcd6d60a5b372a0
1 parent
0536222c
query 翻译:根据源语言是否在索引语言(index_languages)中,分别配置翻译模型
Showing
4 changed files
with
114 additions
and
16 deletions
Show diff stats
config/config.yaml
| 1 | # Unified Configuration for Multi-Tenant Search Engine | 1 | # Unified Configuration for Multi-Tenant Search Engine |
| 2 | # 统一配置文件,所有租户共用一套配置 | 2 | # 统一配置文件,所有租户共用一套配置 |
| 3 | # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 | 3 | # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 |
| 4 | +# | ||
| 5 | +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 | ||
| 6 | +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 | ||
| 7 | + | ||
| 8 | +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖下方前两项 environment、index_namespace 的语义) | ||
| 9 | +runtime: | ||
| 10 | + environment: "prod" | ||
| 11 | + index_namespace: "" | ||
| 12 | + api_host: "0.0.0.0" | ||
| 13 | + api_port: 6002 | ||
| 14 | + indexer_host: "0.0.0.0" | ||
| 15 | + indexer_port: 6004 | ||
| 16 | + embedding_host: "0.0.0.0" | ||
| 17 | + embedding_port: 6005 | ||
| 18 | + embedding_text_port: 6005 | ||
| 19 | + embedding_image_port: 6008 | ||
| 20 | + translator_host: "127.0.0.1" | ||
| 21 | + translator_port: 6006 | ||
| 22 | + reranker_host: "127.0.0.1" | ||
| 23 | + reranker_port: 6007 | ||
| 24 | + | ||
| 25 | +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) | ||
| 26 | +infrastructure: | ||
| 27 | + elasticsearch: | ||
| 28 | + host: "http://localhost:9200" | ||
| 29 | + username: null | ||
| 30 | + password: null | ||
| 31 | + redis: | ||
| 32 | + host: "localhost" | ||
| 33 | + port: 6479 | ||
| 34 | + snapshot_db: 0 | ||
| 35 | + password: null | ||
| 36 | + socket_timeout: 1 | ||
| 37 | + socket_connect_timeout: 1 | ||
| 38 | + retry_on_timeout: false | ||
| 39 | + cache_expire_days: 720 | ||
| 40 | + embedding_cache_prefix: "embedding" | ||
| 41 | + anchor_cache_prefix: "product_anchors" | ||
| 42 | + anchor_cache_expire_days: 30 | ||
| 43 | + database: | ||
| 44 | + host: null | ||
| 45 | + port: 3306 | ||
| 46 | + database: null | ||
| 47 | + username: null | ||
| 48 | + password: null | ||
| 49 | + secrets: | ||
| 50 | + dashscope_api_key: null | ||
| 51 | + deepl_auth_key: null | ||
| 4 | 52 | ||
| 5 | # Elasticsearch Index | 53 | # Elasticsearch Index |
| 6 | es_index_name: "search_products" | 54 | es_index_name: "search_products" |
| 7 | 55 | ||
| 56 | +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) | ||
| 57 | +indexes: [] | ||
| 58 | + | ||
| 8 | # Config assets | 59 | # Config assets |
| 9 | assets: | 60 | assets: |
| 10 | query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" | 61 | query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" |
| @@ -47,10 +98,23 @@ query_config: | @@ -47,10 +98,23 @@ query_config: | ||
| 47 | enable_text_embedding: true | 98 | enable_text_embedding: true |
| 48 | enable_query_rewrite: true | 99 | enable_query_rewrite: true |
| 49 | 100 | ||
| 101 | + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) | ||
| 102 | + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 | ||
| 103 | + # zh_to_en_model: "opus-mt-zh-en" | ||
| 104 | + # en_to_zh_model: "opus-mt-en-zh" | ||
| 105 | + # default_translation_model: "nllb-200-distilled-600m" | ||
| 106 | + zh_to_en_model: "deepl" | ||
| 107 | + en_to_zh_model: "deepl" | ||
| 108 | + default_translation_model: "deepl" | ||
| 109 | + # 源语种不在 index_languages:没有可直接命中的源语种字段,翻译质量对召回更关键,可单独指定模型(某项缺省或为空时回退到上一组的对应项) | ||
| 110 | + zh_to_en_model__source_not_in_index: "deepl" | ||
| 111 | + en_to_zh_model__source_not_in_index: "deepl" | ||
| 112 | + default_translation_model__source_not_in_index: "deepl" | ||
| 113 | + | ||
| 50 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 | 114 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 51 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 | 115 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 52 | - translation_embedding_wait_budget_ms_source_in_index: 80 | ||
| 53 | - translation_embedding_wait_budget_ms_source_not_in_index: 200 | 116 | + translation_embedding_wait_budget_ms_source_in_index: 500 # 原值 80 |
| 117 | + translation_embedding_wait_budget_ms_source_not_in_index: 500 # 原值 200 | ||
| 54 | 118 | ||
| 55 | # 动态多语言检索字段配置 | 119 | # 动态多语言检索字段配置 |
| 56 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | 120 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; |
| @@ -78,7 +142,7 @@ query_config: | @@ -78,7 +142,7 @@ query_config: | ||
| 78 | text_query_strategy: | 142 | text_query_strategy: |
| 79 | base_minimum_should_match: "75%" | 143 | base_minimum_should_match: "75%" |
| 80 | translation_minimum_should_match: "75%" | 144 | translation_minimum_should_match: "75%" |
| 81 | - translation_boost: 0.4 | 145 | + translation_boost: 0.6 |
| 82 | tie_breaker_base_query: 0.9 | 146 | tie_breaker_base_query: 0.9 |
| 83 | 147 | ||
| 84 | # Embedding字段名称 | 148 | # Embedding字段名称 |
config/loader.py
| @@ -290,6 +290,24 @@ class AppConfigLoader: | @@ -290,6 +290,24 @@ class AppConfigLoader: | ||
| 290 | default_translation_model=str( | 290 | default_translation_model=str( |
| 291 | query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" | 291 | query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" |
| 292 | ), | 292 | ), |
| 293 | + zh_to_en_model_source_not_in_index=( | ||
| 294 | + str(v) | ||
| 295 | + if (v := query_cfg.get("zh_to_en_model__source_not_in_index")) | ||
| 296 | + not in (None, "") | ||
| 297 | + else None | ||
| 298 | + ), | ||
| 299 | + en_to_zh_model_source_not_in_index=( | ||
| 300 | + str(v) | ||
| 301 | + if (v := query_cfg.get("en_to_zh_model__source_not_in_index")) | ||
| 302 | + not in (None, "") | ||
| 303 | + else None | ||
| 304 | + ), | ||
| 305 | + default_translation_model_source_not_in_index=( | ||
| 306 | + str(v) | ||
| 307 | + if (v := query_cfg.get("default_translation_model__source_not_in_index")) | ||
| 308 | + not in (None, "") | ||
| 309 | + else None | ||
| 310 | + ), | ||
| 293 | translation_embedding_wait_budget_ms_source_in_index=int( | 311 | translation_embedding_wait_budget_ms_source_in_index=int( |
| 294 | query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) | 312 | query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) |
| 295 | ), | 313 | ), |
config/schema.py
| @@ -58,6 +58,10 @@ class QueryConfig: | @@ -58,6 +58,10 @@ class QueryConfig: | ||
| 58 | zh_to_en_model: str = "opus-mt-zh-en" | 58 | zh_to_en_model: str = "opus-mt-zh-en" |
| 59 | en_to_zh_model: str = "opus-mt-en-zh" | 59 | en_to_zh_model: str = "opus-mt-en-zh" |
| 60 | default_translation_model: str = "nllb-200-distilled-600m" | 60 | default_translation_model: str = "nllb-200-distilled-600m" |
| 61 | + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。 | ||
| 62 | + zh_to_en_model_source_not_in_index: Optional[str] = None | ||
| 63 | + en_to_zh_model_source_not_in_index: Optional[str] = None | ||
| 64 | + default_translation_model_source_not_in_index: Optional[str] = None | ||
| 61 | # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 | 65 | # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 |
| 62 | # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 | 66 | # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 |
| 63 | # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 | 67 | # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 |
query/query_parser.py
| @@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer | @@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer | ||
| 23 | 23 | ||
| 24 | logger = logging.getLogger(__name__) | 24 | logger = logging.getLogger(__name__) |
| 25 | 25 | ||
| 26 | -try: | ||
| 27 | - import hanlp # type: ignore | ||
| 28 | -except Exception: # pragma: no cover | ||
| 29 | - hanlp = None | 26 | +import hanlp # type: ignore |
| 30 | 27 | ||
| 31 | 28 | ||
| 32 | def simple_tokenize_query(text: str) -> List[str]: | 29 | def simple_tokenize_query(text: str) -> List[str]: |
| @@ -140,20 +137,29 @@ class QueryParser: | @@ -140,20 +137,29 @@ class QueryParser: | ||
| 140 | return tokenizer | 137 | return tokenizer |
| 141 | 138 | ||
| 142 | @staticmethod | 139 | @staticmethod |
| 143 | - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: | 140 | + def _pick_query_translation_model( |
| 141 | + source_lang: str, | ||
| 142 | + target_lang: str, | ||
| 143 | + config: SearchConfig, | ||
| 144 | + source_language_in_index: bool, | ||
| 145 | + ) -> str: | ||
| 144 | """Pick the translation capability for query-time translation (configurable).""" | 146 | """Pick the translation capability for query-time translation (configurable).""" |
| 145 | src = str(source_lang or "").strip().lower() | 147 | src = str(source_lang or "").strip().lower() |
| 146 | tgt = str(target_lang or "").strip().lower() | 148 | tgt = str(target_lang or "").strip().lower() |
| 149 | + qc = config.query_config | ||
| 150 | + | ||
| 151 | + if source_language_in_index: | ||
| 152 | + if src == "zh" and tgt == "en": | ||
| 153 | + return qc.zh_to_en_model | ||
| 154 | + if src == "en" and tgt == "zh": | ||
| 155 | + return qc.en_to_zh_model | ||
| 156 | + return qc.default_translation_model | ||
| 147 | 157 | ||
| 148 | - # Use dedicated models for zh<->en if configured | ||
| 149 | if src == "zh" and tgt == "en": | 158 | if src == "zh" and tgt == "en": |
| 150 | - return config.query_config.zh_to_en_model | 159 | + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model |
| 151 | if src == "en" and tgt == "zh": | 160 | if src == "en" and tgt == "zh": |
| 152 | - return config.query_config.en_to_zh_model | ||
| 153 | - | ||
| 154 | - # For any other language pairs, fall back to the configurable default model. | ||
| 155 | - # By default this is `nllb-200-distilled-600m` (multi-lingual local model). | ||
| 156 | - return config.query_config.default_translation_model | 161 | + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model |
| 162 | + return qc.default_translation_model_source_not_in_index or qc.default_translation_model | ||
| 157 | 163 | ||
| 158 | @staticmethod | 164 | @staticmethod |
| 159 | def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: | 165 | def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: |
| @@ -301,6 +307,7 @@ class QueryParser: | @@ -301,6 +307,7 @@ class QueryParser: | ||
| 301 | detected_norm = str(detected_lang or "").strip().lower() | 307 | detected_norm = str(detected_lang or "").strip().lower() |
| 302 | normalized_targets = self._normalize_language_codes(target_languages) | 308 | normalized_targets = self._normalize_language_codes(target_languages) |
| 303 | translation_targets = [lang for lang in normalized_targets if lang != detected_norm] | 309 | translation_targets = [lang for lang in normalized_targets if lang != detected_norm] |
| 310 | + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets | ||
| 304 | 311 | ||
| 305 | # Stage 6: Text embedding - async execution | 312 | # Stage 6: Text embedding - async execution |
| 306 | query_vector = None | 313 | query_vector = None |
| @@ -319,7 +326,12 @@ class QueryParser: | @@ -319,7 +326,12 @@ class QueryParser: | ||
| 319 | try: | 326 | try: |
| 320 | if async_executor is not None: | 327 | if async_executor is not None: |
| 321 | for lang in translation_targets: | 328 | for lang in translation_targets: |
| 322 | - model_name = self._pick_query_translation_model(detected_lang, lang, self.config) | 329 | + model_name = self._pick_query_translation_model( |
| 330 | + detected_lang, | ||
| 331 | + lang, | ||
| 332 | + self.config, | ||
| 333 | + source_language_in_index, | ||
| 334 | + ) | ||
| 323 | log_debug( | 335 | log_debug( |
| 324 | f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" | 336 | f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" |
| 325 | ) | 337 | ) |