diff --git a/config/config.yaml b/config/config.yaml index 6f8233f..16976dc 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,10 +1,61 @@ # Unified Configuration for Multi-Tenant Search Engine # 统一配置文件,所有租户共用一套配置 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 +# +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 + +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义) +runtime: + environment: "prod" + index_namespace: "" + api_host: "0.0.0.0" + api_port: 6002 + indexer_host: "0.0.0.0" + indexer_port: 6004 + embedding_host: "0.0.0.0" + embedding_port: 6005 + embedding_text_port: 6005 + embedding_image_port: 6008 + translator_host: "127.0.0.1" + translator_port: 6006 + reranker_host: "127.0.0.1" + reranker_port: 6007 + +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) +infrastructure: + elasticsearch: + host: "http://localhost:9200" + username: null + password: null + redis: + host: "localhost" + port: 6479 + snapshot_db: 0 + password: null + socket_timeout: 1 + socket_connect_timeout: 1 + retry_on_timeout: false + cache_expire_days: 720 + embedding_cache_prefix: "embedding" + anchor_cache_prefix: "product_anchors" + anchor_cache_expire_days: 30 + database: + host: null + port: 3306 + database: null + username: null + password: null + secrets: + dashscope_api_key: null + deepl_auth_key: null # Elasticsearch Index es_index_name: "search_products" +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) +indexes: [] + # Config assets assets: query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" @@ -47,10 +98,23 @@ query_config: enable_text_embedding: true enable_query_rewrite: true + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 + # zh_to_en_model: "opus-mt-zh-en" + # en_to_zh_model: "opus-mt-en-zh" + # default_translation_model: "nllb-200-distilled-600m" + zh_to_en_model: "deepl" + en_to_zh_model: "deepl" + default_translation_model: "deepl" + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) + zh_to_en_model__source_not_in_index: "deepl" + en_to_zh_model__source_not_in_index: "deepl" + default_translation_model__source_not_in_index: "deepl" + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 - translation_embedding_wait_budget_ms_source_in_index: 80 - translation_embedding_wait_budget_ms_source_not_in_index: 200 + translation_embedding_wait_budget_ms_source_in_index: 500 # 80 + translation_embedding_wait_budget_ms_source_not_in_index: 500 #200 # 动态多语言检索字段配置 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; @@ -78,7 +142,7 @@ query_config: text_query_strategy: base_minimum_should_match: "75%" translation_minimum_should_match: "75%" - translation_boost: 0.4 + translation_boost: 0.6 tie_breaker_base_query: 0.9 # Embedding字段名称 diff --git a/config/loader.py b/config/loader.py index 8be9087..6e8ddb7 100644 --- a/config/loader.py +++ b/config/loader.py @@ -290,6 +290,24 @@ class AppConfigLoader: default_translation_model=str( query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" ), + zh_to_en_model_source_not_in_index=( + str(v) + if (v := query_cfg.get("zh_to_en_model__source_not_in_index")) + not in (None, "") + else None + ), + en_to_zh_model_source_not_in_index=( + str(v) + if (v := query_cfg.get("en_to_zh_model__source_not_in_index")) + not in (None, "") + else None + ), + default_translation_model_source_not_in_index=( + str(v) + if (v := query_cfg.get("default_translation_model__source_not_in_index")) + not in (None, "") + else None + ), translation_embedding_wait_budget_ms_source_in_index=int( query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) ), diff --git a/config/schema.py b/config/schema.py index 713d741..13e2ce0 100644 --- a/config/schema.py +++ b/config/schema.py @@ -58,6 +58,10 @@ class QueryConfig: zh_to_en_model: str = "opus-mt-zh-en" en_to_zh_model: str = "opus-mt-en-zh" default_translation_model: str = "nllb-200-distilled-600m" + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。 + zh_to_en_model_source_not_in_index: Optional[str] = None + en_to_zh_model_source_not_in_index: Optional[str] = None + default_translation_model_source_not_in_index: Optional[str] = None # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 diff --git a/query/query_parser.py b/query/query_parser.py index 5294427..3ee2f85 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -23,10 +23,7 @@ from .query_rewriter import QueryRewriter, QueryNormalizer logger = logging.getLogger(__name__) -try: - import hanlp # type: ignore -except Exception: # pragma: no cover - hanlp = None +import hanlp # type: ignore def simple_tokenize_query(text: str) -> List[str]: @@ -140,20 +137,29 @@ class QueryParser: return tokenizer @staticmethod - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: + def _pick_query_translation_model( + source_lang: str, + target_lang: str, + config: SearchConfig, + source_language_in_index: bool, + ) -> str: """Pick the translation capability for query-time translation (configurable).""" src = str(source_lang or "").strip().lower() tgt = str(target_lang or "").strip().lower() + qc = config.query_config + + if source_language_in_index: + if src == "zh" and tgt == "en": + return qc.zh_to_en_model + if src == "en" and tgt == "zh": + return qc.en_to_zh_model + return qc.default_translation_model - # Use dedicated models for zh<->en if configured if src == "zh" and tgt == "en": - return config.query_config.zh_to_en_model + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model if src == "en" and tgt == "zh": - return config.query_config.en_to_zh_model - - # For any other language pairs, fall back to the configurable default model. - # By default this is `nllb-200-distilled-600m` (multi-lingual local model). - return config.query_config.default_translation_model + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model + return qc.default_translation_model_source_not_in_index or qc.default_translation_model @staticmethod def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: @@ -301,6 +307,7 @@ class QueryParser: detected_norm = str(detected_lang or "").strip().lower() normalized_targets = self._normalize_language_codes(target_languages) translation_targets = [lang for lang in normalized_targets if lang != detected_norm] + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets # Stage 6: Text embedding - async execution query_vector = None @@ -319,7 +326,12 @@ class QueryParser: try: if async_executor is not None: for lang in translation_targets: - model_name = self._pick_query_translation_model(detected_lang, lang, self.config) + model_name = self._pick_query_translation_model( + detected_lang, + lang, + self.config, + source_language_in_index, + ) log_debug( f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" ) -- libgit2 0.21.2