1.

加了一个过滤/降权词典：当 query 中有独立分词匹配到指定的触发词时，过滤标题中带某些分词的商品（比如触发词为 fitted/修身 时，过滤标题含宽松、loose、relaxed、oversized、baggy、slouchy 等词的商品）。 2. reranker 的 query 使用翻译后的文本（中文/英文 query 仍用原文；其他语言优先用英文译文，其次中文译文，都没有则回退到原始 query）。

1.
加了一个过滤/降权词典：当 query 中有独立分词匹配到指定的触发词时，过滤标题中带某些分词的商品（比如触发词为 fitted/修身 时，过滤标题含宽松、loose、relaxed、oversized、baggy、slouchy 等词的商品）。 2. reranker 的 query 使用翻译后的文本（中文/英文 query 仍用原文；其他语言优先用英文译文，其次中文译文，都没有则回退到原始 query）。
tangwang
1 parent 6adbf18a
Showing 12 changed files with 515 additions and 3 deletions Show diff stats
config/config.yaml
config/dictionaries/product_title_exclusion.tsv
config/loader.py
config/schema.py
query/product_title_exclusion.py
query/query_parser.py
search/es_query_builder.py
search/searcher.py
tests/test_es_query_builder.py
tests/test_product_title_exclusion.py
tests/test_rerank_query_text.py
tests/test_search_rerank_window.py
@@ -124,6 +124,10 @@ query_config:
       color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
       size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
  
+  product_title_exclusion:
+    enabled: true
+    dictionary_path: "config/dictionaries/product_title_exclusion.tsv"
+
   # 动态多语言检索字段配置
   # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式；
   # shared_fields 为无语言后缀字段。
@@ -376,7 +380,7 @@ services:
       max_docs: 1000
       normalize: true
     # 服务内后端（reranker 进程启动时读取）
-    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
+    backend: "bge"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -0,0 +1,2 @@
+# zh triggers	en triggers	zh title exclusions	en title exclusions
+修身	fitted	宽松	loose,relaxed,oversized,baggy,slouchy
@@ -113,6 +113,34 @@ def _read_synonym_csv_dictionary(path: Path) -&gt; List[List[str]]:
     return rows
  
  
+def _read_product_title_exclusion_dictionary(path: Path) -> List[Dict[str, List[str]]]:
+    """Load product-title exclusion rules from a tab-separated dictionary.
+
+    Each data row holds exactly four tab-separated cells, in this order: zh
+    trigger terms, en trigger terms, zh title exclusions, en title exclusions.
+    A cell is a comma-separated list of terms and may be empty. Blank lines and
+    lines whose first non-blank character is ``#`` are ignored, and rows that
+    do not have four cells are skipped.
+
+    Args:
+        path: Location of the TSV dictionary file.
+
+    Returns:
+        One dict per accepted row with the keys ``zh_trigger_terms``,
+        ``en_trigger_terms``, ``zh_title_exclusions`` and
+        ``en_title_exclusions``; an empty list when ``path`` does not exist.
+    """
+
+    def _split_cell(cell: str) -> List[str]:
+        # Comma-separated terms; surrounding blanks and empty items are dropped.
+        return [item.strip() for item in cell.split(",") if item.strip()]
+
+    rules: List[Dict[str, List[str]]] = []
+    if not path.exists():
+        return rules
+
+    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
+    # would otherwise hide the "#" of a header comment on the first line.
+    with open(path, "r", encoding="utf-8-sig") as handle:
+        for raw_line in handle:
+            line = raw_line.strip()
+            if not line or line.startswith("#"):
+                continue
+            parts = [segment.strip() for segment in line.split("\t")]
+            if len(parts) != 4:
+                # Stripping the whole line also removes leading/trailing tabs,
+                # which loses an empty first or last cell. Retry with only the
+                # line terminator removed before rejecting the row.
+                parts = [
+                    segment.strip()
+                    for segment in raw_line.rstrip("\r\n").split("\t")
+                ]
+                if len(parts) != 4:
+                    continue
+
+            rules.append(
+                {
+                    "zh_trigger_terms": _split_cell(parts[0]),
+                    "en_trigger_terms": _split_cell(parts[1]),
+                    "zh_title_exclusions": _split_cell(parts[2]),
+                    "en_title_exclusions": _split_cell(parts[3]),
+                }
+            )
+    return rules
+
+
 _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = {
     "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"],
     "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"],
@@ -282,6 +310,11 @@ class AppConfigLoader:
             if isinstance(query_cfg.get("style_intent"), dict)
             else {}
         )
+        product_title_exclusion_cfg = (
+            query_cfg.get("product_title_exclusion")
+            if isinstance(query_cfg.get("product_title_exclusion"), dict)
+            else {}
+        )
  
         def _resolve_project_path(value: Any, default_path: Path) -> Path:
             if value in (None, ""):
@@ -316,6 +349,10 @@ class AppConfigLoader:
             "color": _read_synonym_csv_dictionary(style_color_path),
             "size": _read_synonym_csv_dictionary(style_size_path),
         }
+        product_title_exclusion_path = _resolve_project_path(
+            product_title_exclusion_cfg.get("dictionary_path"),
+            self.config_dir / "dictionaries" / "product_title_exclusion.tsv",
+        )
         query_config = QueryConfig(
             supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]),
             default_language=str(query_cfg.get("default_language") or "en"),
@@ -390,6 +427,10 @@ class AppConfigLoader:
             style_intent_enabled=bool(style_intent_cfg.get("enabled", True)),
             style_intent_terms=style_intent_terms,
             style_intent_dimension_aliases=style_dimension_aliases,
+            product_title_exclusion_enabled=bool(product_title_exclusion_cfg.get("enabled", True)),
+            product_title_exclusion_rules=_read_product_title_exclusion_dictionary(
+                product_title_exclusion_path
+            ),
         )
  
         function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {}
@@ -67,6 +67,8 @@ class QueryConfig:
     style_intent_enabled: bool = True
     style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict)
     style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict)
+    product_title_exclusion_enabled: bool = True
+    product_title_exclusion_rules: List[Dict[str, List[str]]] = field(default_factory=list)
  
  
 @dataclass(frozen=True)
@@ -0,0 +1,225 @@
+"""
+Product title exclusion detection for query understanding.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
+
+from .tokenization import TokenizedText, normalize_query_text, tokenize_text
+
+
+def _dedupe_terms(terms: Iterable[str]) -> List[str]:
+    """Normalize ``terms`` and return the distinct non-empty results in first-seen order."""
+    # Dict keys keep insertion order, so the dict doubles as an ordered set.
+    ordered: Dict[str, None] = {}
+    for candidate in terms:
+        normalized = normalize_query_text(candidate)
+        if normalized:
+            ordered.setdefault(normalized, None)
+    return list(ordered)
+
+
+@dataclass(frozen=True)
+class ProductTitleExclusionRule:
+    """One dictionary rule: query trigger terms plus the title terms to exclude.
+
+    ``max_term_ngram`` is the largest whitespace-separated word count among the
+    trigger terms when the rule is built through ``from_config_row``.
+    """
+
+    zh_trigger_terms: Tuple[str, ...]
+    en_trigger_terms: Tuple[str, ...]
+    zh_title_exclusions: Tuple[str, ...]
+    en_title_exclusions: Tuple[str, ...]
+    max_term_ngram: int = 3
+
+    @classmethod
+    def from_config_row(cls, row: Dict[str, Sequence[str]]) -> Optional["ProductTitleExclusionRule"]:
+        """Build a rule from one config row.
+
+        Returns ``None`` when the row has no exclusion terms at all, or no
+        trigger terms at all, after normalization and de-duplication.
+        """
+
+        def _terms(key: str) -> Tuple[str, ...]:
+            return tuple(_dedupe_terms(row.get(key) or []))
+
+        zh_triggers = _terms("zh_trigger_terms")
+        en_triggers = _terms("en_trigger_terms")
+        zh_exclusions = _terms("zh_title_exclusions")
+        en_exclusions = _terms("en_title_exclusions")
+
+        if not (zh_exclusions or en_exclusions):
+            return None
+        if not (zh_triggers or en_triggers):
+            return None
+
+        # Widest trigger measured in whitespace-separated words, at least 1.
+        word_counts = [len(term.split()) for term in zh_triggers + en_triggers]
+        widest = max([1, *word_counts])
+        return cls(
+            zh_trigger_terms=zh_triggers,
+            en_trigger_terms=en_triggers,
+            zh_title_exclusions=zh_exclusions,
+            en_title_exclusions=en_exclusions,
+            max_term_ngram=widest,
+        )
+
+    def match_candidates(self, candidates: Iterable[str]) -> Optional[str]:
+        """Return the first trigger term found among the normalized candidates.
+
+        Chinese triggers are checked before English ones; ``None`` means no
+        trigger matched.
+        """
+        available = {normalize_query_text(candidate) for candidate in candidates}
+        ordered_triggers = (*self.zh_trigger_terms, *self.en_trigger_terms)
+        return next((term for term in ordered_triggers if term in available), None)
+
+
+@dataclass(frozen=True)
+class DetectedProductTitleExclusion:
+    """A rule hit: the trigger that matched plus the title terms to exclude."""
+
+    matched_term: str
+    matched_query_text: str
+    zh_title_exclusions: Tuple[str, ...]
+    en_title_exclusions: Tuple[str, ...]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view with the exclusion tuples copied into lists."""
+        return dict(
+            matched_term=self.matched_term,
+            matched_query_text=self.matched_query_text,
+            zh_title_exclusions=[*self.zh_title_exclusions],
+            en_title_exclusions=[*self.en_title_exclusions],
+        )
+
+
+@dataclass(frozen=True)
+class ProductTitleExclusionProfile:
+    """Detection result: the tokenized query variants examined and the rule hits."""
+
+    query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
+    exclusions: Tuple[DetectedProductTitleExclusion, ...] = field(default_factory=tuple)
+
+    @property
+    def is_active(self) -> bool:
+        """True when at least one exclusion was detected."""
+        return True if self.exclusions else False
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view of the hits and of every query variant."""
+        active = self.is_active
+        serialized_hits = [hit.to_dict() for hit in self.exclusions]
+
+        serialized_variants: List[Dict[str, Any]] = []
+        for variant in self.query_variants:
+            serialized_variants.append(
+                {
+                    "text": variant.text,
+                    "normalized_text": variant.normalized_text,
+                    "fine_tokens": list(variant.fine_tokens),
+                    "coarse_tokens": list(variant.coarse_tokens),
+                    "candidates": list(variant.candidates),
+                }
+            )
+
+        return {
+            "active": active,
+            "exclusions": serialized_hits,
+            "query_variants": serialized_variants,
+        }
+
+    def all_zh_title_exclusions(self) -> List[str]:
+        """Return the de-duplicated Chinese exclusion terms across all hits."""
+        collected: List[str] = []
+        for hit in self.exclusions:
+            collected.extend(hit.zh_title_exclusions)
+        return _dedupe_terms(collected)
+
+    def all_en_title_exclusions(self) -> List[str]:
+        """Return the de-duplicated English exclusion terms across all hits."""
+        collected: List[str] = []
+        for hit in self.exclusions:
+            collected.extend(hit.en_title_exclusions)
+        return _dedupe_terms(collected)
+
+
+class ProductTitleExclusionRegistry:
+    """Immutable collection of exclusion rules plus the feature's on/off switch."""
+
+    def __init__(
+        self,
+        rules: Sequence[ProductTitleExclusionRule],
+        *,
+        enabled: bool = True,
+    ) -> None:
+        """Store the rules and record the widest trigger n-gram among them.
+
+        ``max_term_ngram`` falls back to 3 when there are no rules.
+        """
+        self.rules = tuple(rules)
+        self.enabled = bool(enabled)
+        ngram_sizes = [rule.max_term_ngram for rule in self.rules]
+        self.max_term_ngram = max(ngram_sizes) if ngram_sizes else 3
+
+    @classmethod
+    def from_query_config(cls, query_config: Any) -> "ProductTitleExclusionRegistry":
+        """Build a registry from a query-config object.
+
+        Reads ``product_title_exclusion_rules`` (non-dict rows and rows that
+        yield no rule are ignored) and ``product_title_exclusion_enabled``
+        (treated as enabled when the attribute is absent).
+        """
+        configured_rows = getattr(query_config, "product_title_exclusion_rules", []) or []
+        built = (
+            ProductTitleExclusionRule.from_config_row(row)
+            for row in configured_rows
+            if isinstance(row, dict)
+        )
+        usable_rules = [rule for rule in built if rule is not None]
+        enabled_flag = bool(getattr(query_config, "product_title_exclusion_enabled", True))
+        return cls(usable_rules, enabled=enabled_flag)
+
+
+class ProductTitleExclusionDetector:
+    """Finds which exclusion rules are triggered by a parsed query."""
+
+    def __init__(
+        self,
+        registry: ProductTitleExclusionRegistry,
+        *,
+        tokenizer: Optional[Callable[[str], Any]] = None,
+    ) -> None:
+        """Keep the rule registry and the optional tokenizer handed to ``tokenize_text``."""
+        self.registry = registry
+        self.tokenizer = tokenizer
+
+    def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
+        """Tokenize every distinct textual form of the query.
+
+        Sources are ``original_query``, ``query_normalized``,
+        ``rewritten_query`` and, when ``translations`` is a dict, its values.
+        Blank texts and texts whose normalized form was already seen are
+        skipped.
+        """
+        sources: List[Any] = [
+            getattr(parsed_query, attribute, None)
+            for attribute in ("original_query", "query_normalized", "rewritten_query")
+        ]
+        translated = getattr(parsed_query, "translations", {}) or {}
+        if isinstance(translated, dict):
+            sources.extend(translated.values())
+
+        known_forms: Set[str] = set()
+        tokenized: List[TokenizedText] = []
+        for source in sources:
+            text = str(source or "").strip()
+            normalized = normalize_query_text(text) if text else ""
+            if not normalized or normalized in known_forms:
+                continue
+            known_forms.add(normalized)
+            tokenized.append(
+                tokenize_text(
+                    text,
+                    tokenizer=self.tokenizer,
+                    max_ngram=self.registry.max_term_ngram,
+                )
+            )
+
+        return tuple(tokenized)
+
+    def detect(self, parsed_query: Any) -> ProductTitleExclusionProfile:
+        """Return the exclusion profile for ``parsed_query``.
+
+        An empty profile is returned when the registry is disabled or has no
+        rules. Otherwise every rule is tested against every query variant, and
+        rules producing an already-reported exclusion set are not repeated.
+        """
+        registry = self.registry
+        if not (registry.enabled and registry.rules):
+            return ProductTitleExclusionProfile()
+
+        variants = self._build_query_variants(parsed_query)
+        hits: List[DetectedProductTitleExclusion] = []
+        reported = set()
+
+        for variant in variants:
+            for rule in registry.rules:
+                trigger = rule.match_candidates(variant.candidates)
+                if not trigger:
+                    continue
+
+                # Two rules with the same exclusion lists add nothing new.
+                signature = (
+                    tuple(rule.zh_title_exclusions),
+                    tuple(rule.en_title_exclusions),
+                )
+                if signature in reported:
+                    continue
+                reported.add(signature)
+
+                hits.append(
+                    DetectedProductTitleExclusion(
+                        matched_term=trigger,
+                        matched_query_text=variant.text,
+                        zh_title_exclusions=rule.zh_title_exclusions,
+                        en_title_exclusions=rule.en_title_exclusions,
+                    )
+                )
+
+        return ProductTitleExclusionProfile(
+            query_variants=variants,
+            exclusions=tuple(hits),
+        )
@@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder
 from config import SearchConfig
 from translation import create_translation_client
 from .language_detector import LanguageDetector
+from .product_title_exclusion import (
+    ProductTitleExclusionDetector,
+    ProductTitleExclusionProfile,
+    ProductTitleExclusionRegistry,
+)
 from .query_rewriter import QueryRewriter, QueryNormalizer
 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
 from .tokenization import extract_token_strings, simple_tokenize_query
@@ -27,6 +32,30 @@ logger = logging.getLogger(__name__)
 import hanlp  # type: ignore
  
  
+def rerank_query_text(
+    original_query: str,
+    *,
+    detected_language: Optional[str] = None,
+    translations: Optional[Dict[str, str]] = None,
+) -> str:
+    """
+    Pick the text that replaces ``{query}`` in the reranker call.
+
+    When the detected language is Chinese or English the original query is
+    returned unchanged. For every other (or missing) language the first
+    non-blank translation is used, trying English before Chinese; without one
+    the original query is returned.
+    """
+    language_code = (detected_language or "").strip().lower()
+    if language_code == "zh" or language_code == "en":
+        return original_query
+
+    available = translations if translations else {}
+    stripped_translations = ((available.get(code) or "").strip() for code in ("en", "zh"))
+    return next((text for text in stripped_translations if text), original_query)
+
+
 @dataclass(slots=True)
 class ParsedQuery:
     """Container for query parser facts."""
@@ -39,6 +68,15 @@ class ParsedQuery:
     query_vector: Optional[np.ndarray] = None
     query_tokens: List[str] = field(default_factory=list)
     style_intent_profile: Optional[StyleIntentProfile] = None
+    product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None
+
+    def text_for_rerank(self) -> str:
+        """Return the reranker query text for this parsed query.
+
+        Delegates to :func:`rerank_query_text` with the original query, the
+        detected language and the translations held by this object.
+        """
+        language = self.detected_language
+        available_translations = self.translations
+        return rerank_query_text(
+            self.original_query,
+            detected_language=language,
+            translations=available_translations,
+        )
  
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary representation."""
@@ -52,6 +90,11 @@ class ParsedQuery:
             "style_intent_profile": (
                 self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
             ),
+            "product_title_exclusion_profile": (
+                self.product_title_exclusion_profile.to_dict()
+                if self.product_title_exclusion_profile is not None
+                else None
+            ),
         }
  
  
@@ -94,6 +137,13 @@ class QueryParser:
             self.style_intent_registry,
             tokenizer=self._tokenizer,
         )
+        self.product_title_exclusion_registry = ProductTitleExclusionRegistry.from_query_config(
+            config.query_config
+        )
+        self.product_title_exclusion_detector = ProductTitleExclusionDetector(
+            self.product_title_exclusion_registry,
+            tokenizer=self._tokenizer,
+        )
  
         # Eager initialization (startup-time failure visibility, no lazy init in request path)
         if self.config.query_config.enable_text_embedding and self._text_encoder is None:
@@ -416,11 +466,16 @@ class QueryParser:
             query_tokens=query_tokens,
         )
         style_intent_profile = self.style_intent_detector.detect(base_result)
+        product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result)
         if context:
             context.store_intermediate_result(
                 "style_intent_profile",
                 style_intent_profile.to_dict(),
             )
+            context.store_intermediate_result(
+                "product_title_exclusion_profile",
+                product_title_exclusion_profile.to_dict(),
+            )
  
         result = ParsedQuery(
             original_query=query,
@@ -431,6 +486,7 @@ class QueryParser:
             query_vector=query_vector,
             query_tokens=query_tokens,
             style_intent_profile=style_intent_profile,
+            product_title_exclusion_profile=product_title_exclusion_profile,
         )
  
         if context and hasattr(context, 'logger'):
@@ -229,7 +229,10 @@ class ESQueryBuilder:
  
         # Build filter clauses for query (conjunctive filters + range filters)
         filter_clauses = self._build_filters(conjunctive_filters, range_filters)
-        
+        product_title_exclusion_filter = self._build_product_title_exclusion_filter(parsed_query)
+        if product_title_exclusion_filter:
+            filter_clauses.append(product_title_exclusion_filter)
+
         # 3. Build main query structure: filters and recall
         if recall_clauses:
             # Combine text recalls with OR logic (if multiple)
@@ -780,6 +783,37 @@ class ESQueryBuilder:
  
         return filter_clauses
  
+    @staticmethod
+    def _build_product_title_exclusion_filter(parsed_query: Optional[Any]) -> Optional[Dict[str, Any]]:
+        """Build the filter that removes products whose title hits an exclusion term.
+
+        Returns ``None`` when there is no parsed query, no active exclusion
+        profile, or no exclusion terms. Otherwise returns a ``bool`` clause whose
+        ``must_not`` rejects any document matching at least one
+        ``match_phrase`` on ``title.zh`` (Chinese terms) or ``title.en``
+        (English terms).
+        """
+        profile = (
+            getattr(parsed_query, "product_title_exclusion_profile", None)
+            if parsed_query is not None
+            else None
+        )
+        if not profile or not getattr(profile, "is_active", False):
+            return None
+
+        # Chinese phrases first, then English, each in profile order.
+        excluded_phrases: List[Dict[str, Any]] = [
+            {"match_phrase": {"title.zh": {"query": term}}}
+            for term in profile.all_zh_title_exclusions()
+        ]
+        excluded_phrases += [
+            {"match_phrase": {"title.en": {"query": term}}}
+            for term in profile.all_en_title_exclusions()
+        ]
+        if not excluded_phrases:
+            return None
+
+        any_excluded_phrase = {
+            "bool": {
+                "should": excluded_phrases,
+                "minimum_should_match": 1,
+            }
+        }
+        return {"bool": {"must_not": [any_excluded_phrase]}}
+
+
     def add_sorting(
         self,
         es_query: Dict[str, Any],
@@ -581,7 +581,7 @@ class Searcher:
             try:
                 from .rerank_client import run_rerank
  
-                rerank_query = parsed_query.original_query if parsed_query else query
+                rerank_query = parsed_query.text_for_rerank() if parsed_query else query
                 es_response, rerank_meta, fused_debug = run_rerank(
                     query=rerank_query,
                     es_response=es_response,
@@ -118,3 +118,44 @@ def test_text_query_skips_duplicate_translation_same_as_base():
     root = q["query"]
     assert root["bool"]["_name"] == "base_query"
     assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"]
+
+
+def test_product_title_exclusion_filter_is_applied_to_query_and_knn():
+    """The exclusion filter must appear in the text query filters and be the KNN filter."""
+    builder = _builder()
+    exclusion_profile = SimpleNamespace(
+        is_active=True,
+        all_zh_title_exclusions=lambda: ["宽松"],
+        all_en_title_exclusions=lambda: ["loose", "relaxed"],
+    )
+    parsed_query = SimpleNamespace(
+        rewritten_query="fitted dress",
+        detected_language="en",
+        translations={"zh": "修身 连衣裙"},
+        product_title_exclusion_profile=exclusion_profile,
+    )
+
+    built = builder.build_query(
+        query_text="fitted dress",
+        query_vector=np.array([0.1, 0.2, 0.3]),
+        parsed_query=parsed_query,
+        enable_knn=True,
+    )
+
+    any_excluded_phrase = {
+        "bool": {
+            "should": [
+                {"match_phrase": {"title.zh": {"query": "宽松"}}},
+                {"match_phrase": {"title.en": {"query": "loose"}}},
+                {"match_phrase": {"title.en": {"query": "relaxed"}}},
+            ],
+            "minimum_should_match": 1,
+        }
+    }
+    expected_filter = {"bool": {"must_not": [any_excluded_phrase]}}
+
+    assert expected_filter in built["query"]["bool"]["filter"]
+    assert built["knn"]["filter"] == expected_filter
@@ -0,0 +1,43 @@
+from types import SimpleNamespace
+
+from config import QueryConfig
+from query.product_title_exclusion import (
+    ProductTitleExclusionDetector,
+    ProductTitleExclusionRegistry,
+)
+
+
+def test_product_title_exclusion_detector_matches_translated_english_token():
+    """A rule fires for a Chinese query whose English translation contains the trigger."""
+    english_exclusions = ["loose", "relaxed", "oversized", "baggy", "slouchy"]
+    rule_row = {
+        "zh_trigger_terms": ["修身"],
+        "en_trigger_terms": ["fitted"],
+        "zh_title_exclusions": ["宽松"],
+        "en_title_exclusions": list(english_exclusions),
+    }
+    registry = ProductTitleExclusionRegistry.from_query_config(
+        QueryConfig(product_title_exclusion_rules=[rule_row])
+    )
+    detector = ProductTitleExclusionDetector(
+        registry,
+        tokenizer=lambda text: text.split(),
+    )
+
+    parsed_query = SimpleNamespace(
+        original_query="修身连衣裙",
+        query_normalized="修身 连衣裙",
+        rewritten_query="修身 连衣裙",
+        translations={"en": "fitted dress"},
+    )
+
+    profile = detector.detect(parsed_query)
+
+    assert profile.is_active is True
+    assert profile.all_zh_title_exclusions() == ["宽松"]
+    assert profile.all_en_title_exclusions() == english_exclusions
@@ -0,0 +1,55 @@
+"""Unit tests for rerank {query} text selection (translation fallback)."""
+
+from query.query_parser import ParsedQuery, rerank_query_text
+
+
+def test_rerank_query_text_zh_uses_original():
+    """A Chinese query is sent to the reranker unchanged."""
+    chosen = rerank_query_text("你好", detected_language="zh", translations={"en": "hello"})
+    assert chosen == "你好"
+
+
+def test_rerank_query_text_en_uses_original():
+    """An English query is sent to the reranker unchanged."""
+    chosen = rerank_query_text("hello", detected_language="en", translations={"zh": "你好"})
+    assert chosen == "hello"
+
+
+def test_rerank_query_text_russian_prefers_en_translation():
+    """With both translations present, English wins for a Russian query."""
+    chosen = rerank_query_text(
+        "красное платье",
+        detected_language="ru",
+        translations={"en": "red dress", "zh": "红裙"},
+    )
+    assert chosen == "red dress"
+
+
+def test_rerank_query_text_russian_falls_back_to_zh_when_no_en():
+    """Without an English translation, the Chinese one is used."""
+    chosen = rerank_query_text(
+        "красное платье",
+        detected_language="ru",
+        translations={"zh": "红裙"},
+    )
+    assert chosen == "红裙"
+
+
+def test_rerank_query_text_non_zh_en_falls_back_to_original_without_translations():
+    """With no translations at all, the original query is kept."""
+    chosen = rerank_query_text("foo", detected_language="ja", translations={})
+    assert chosen == "foo"
+
+
+def test_rerank_query_text_unknown_language_uses_en_when_present():
+    """An unrecognized language code is handled like any non-zh/en language."""
+    chosen = rerank_query_text(
+        "x",
+        detected_language="unknown",
+        translations={"en": "translated"},
+    )
+    assert chosen == "translated"
+
+
+def test_parsed_query_text_for_rerank_delegates():
+    """``ParsedQuery.text_for_rerank`` applies the same selection to its own fields."""
+    parsed = ParsedQuery(
+        original_query="orig",
+        query_normalized="orig",
+        rewritten_query="rewritten",
+        detected_language="fr",
+        translations={"en": "en version"},
+    )
+    chosen = parsed.text_for_rerank()
+    assert chosen == "en version"
@@ -32,6 +32,15 @@ class _FakeParsedQuery:
     query_vector: Any = None
     style_intent_profile: Any = None
  
+    def text_for_rerank(self) -> str:
+        """Return the reranker text by delegating to ``rerank_query_text``."""
+        # Imported inside the method, as in the original.
+        from query.query_parser import rerank_query_text
+
+        language = self.detected_language
+        available_translations = self.translations
+        return rerank_query_text(
+            self.original_query,
+            detected_language=language,
+            translations=available_translations,
+        )
+
     def to_dict(self) -> Dict[str, Any]:
         return {
             "original_query": self.original_query,