def _split_exclusion_cell(cell: str) -> List[str]:
    """Split one comma-separated TSV cell into its stripped, non-empty terms."""
    return [item.strip() for item in cell.split(",") if item.strip()]


def _read_product_title_exclusion_dictionary(path: Path) -> List[Dict[str, List[str]]]:
    """Load product-title exclusion rules from a four-column TSV dictionary.

    Each data row has exactly four tab-separated cells, in this order:
    zh trigger terms, en trigger terms, zh title exclusions, en title
    exclusions. Every cell is a comma-separated list of terms; a cell may be
    empty. Blank lines and lines whose first non-blank character is ``#`` are
    ignored, as are rows that do not have four cells.

    Args:
        path: Location of the TSV file.

    Returns:
        One dict per accepted row with the keys ``zh_trigger_terms``,
        ``en_trigger_terms``, ``zh_title_exclusions`` and
        ``en_title_exclusions``. An empty list when ``path`` does not exist.
    """
    rules: List[Dict[str, List[str]]] = []
    if not path.exists():
        return rules

    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith("#"):
                continue

            # Split the line *before* trimming it: stripping first would eat a
            # leading/trailing tab and silently drop rows whose first or last
            # cell is intentionally empty.
            parts = [segment.strip() for segment in raw_line.rstrip("\r\n").split("\t")]
            if len(parts) != 4:
                # Fall back to the trimmed line so rows that only carry stray
                # edge whitespace keep loading exactly as they did before.
                parts = [segment.strip() for segment in stripped.split("\t")]
            if len(parts) != 4:
                continue

            rules.append(
                {
                    "zh_trigger_terms": _split_exclusion_cell(parts[0]),
                    "en_trigger_terms": _split_exclusion_cell(parts[1]),
                    "zh_title_exclusions": _split_exclusion_cell(parts[2]),
                    "en_title_exclusions": _split_exclusion_cell(parts[3]),
                }
            )
    return rules
field(default_factory=dict) style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict) + product_title_exclusion_enabled: bool = True + product_title_exclusion_rules: List[Dict[str, List[str]]] = field(default_factory=list) @dataclass(frozen=True) diff --git a/query/product_title_exclusion.py b/query/product_title_exclusion.py new file mode 100644 index 0000000..66dfacd --- /dev/null +++ b/query/product_title_exclusion.py @@ -0,0 +1,225 @@ +""" +Product title exclusion detection for query understanding. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple + +from .tokenization import TokenizedText, normalize_query_text, tokenize_text + + +def _dedupe_terms(terms: Iterable[str]) -> List[str]: + result: List[str] = [] + seen: Set[str] = set() + for raw_term in terms: + term = normalize_query_text(raw_term) + if not term or term in seen: + continue + seen.add(term) + result.append(term) + return result + + +@dataclass(frozen=True) +class ProductTitleExclusionRule: + zh_trigger_terms: Tuple[str, ...] + en_trigger_terms: Tuple[str, ...] + zh_title_exclusions: Tuple[str, ...] + en_title_exclusions: Tuple[str, ...] 
+ max_term_ngram: int = 3 + + @classmethod + def from_config_row(cls, row: Dict[str, Sequence[str]]) -> Optional["ProductTitleExclusionRule"]: + zh_trigger_terms = tuple(_dedupe_terms(row.get("zh_trigger_terms") or [])) + en_trigger_terms = tuple(_dedupe_terms(row.get("en_trigger_terms") or [])) + zh_title_exclusions = tuple(_dedupe_terms(row.get("zh_title_exclusions") or [])) + en_title_exclusions = tuple(_dedupe_terms(row.get("en_title_exclusions") or [])) + if not zh_title_exclusions and not en_title_exclusions: + return None + if not zh_trigger_terms and not en_trigger_terms: + return None + + max_ngram = max( + [1] + + [len(term.split()) for term in zh_trigger_terms] + + [len(term.split()) for term in en_trigger_terms] + ) + return cls( + zh_trigger_terms=zh_trigger_terms, + en_trigger_terms=en_trigger_terms, + zh_title_exclusions=zh_title_exclusions, + en_title_exclusions=en_title_exclusions, + max_term_ngram=max_ngram, + ) + + def match_candidates(self, candidates: Iterable[str]) -> Optional[str]: + normalized_candidates = {normalize_query_text(candidate) for candidate in candidates} + for term in self.zh_trigger_terms: + if term in normalized_candidates: + return term + for term in self.en_trigger_terms: + if term in normalized_candidates: + return term + return None + + +@dataclass(frozen=True) +class DetectedProductTitleExclusion: + matched_term: str + matched_query_text: str + zh_title_exclusions: Tuple[str, ...] + en_title_exclusions: Tuple[str, ...] + + def to_dict(self) -> Dict[str, Any]: + return { + "matched_term": self.matched_term, + "matched_query_text": self.matched_query_text, + "zh_title_exclusions": list(self.zh_title_exclusions), + "en_title_exclusions": list(self.en_title_exclusions), + } + + +@dataclass(frozen=True) +class ProductTitleExclusionProfile: + query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple) + exclusions: Tuple[DetectedProductTitleExclusion, ...] 
= field(default_factory=tuple) + + @property + def is_active(self) -> bool: + return bool(self.exclusions) + + def to_dict(self) -> Dict[str, Any]: + return { + "active": self.is_active, + "exclusions": [item.to_dict() for item in self.exclusions], + "query_variants": [ + { + "text": variant.text, + "normalized_text": variant.normalized_text, + "fine_tokens": list(variant.fine_tokens), + "coarse_tokens": list(variant.coarse_tokens), + "candidates": list(variant.candidates), + } + for variant in self.query_variants + ], + } + + def all_zh_title_exclusions(self) -> List[str]: + return _dedupe_terms( + term + for item in self.exclusions + for term in item.zh_title_exclusions + ) + + def all_en_title_exclusions(self) -> List[str]: + return _dedupe_terms( + term + for item in self.exclusions + for term in item.en_title_exclusions + ) + + +class ProductTitleExclusionRegistry: + def __init__( + self, + rules: Sequence[ProductTitleExclusionRule], + *, + enabled: bool = True, + ) -> None: + self.rules = tuple(rules) + self.enabled = bool(enabled) + self.max_term_ngram = max((rule.max_term_ngram for rule in self.rules), default=3) + + @classmethod + def from_query_config(cls, query_config: Any) -> "ProductTitleExclusionRegistry": + raw_rules = getattr(query_config, "product_title_exclusion_rules", []) or [] + rules: List[ProductTitleExclusionRule] = [] + for row in raw_rules: + if not isinstance(row, dict): + continue + rule = ProductTitleExclusionRule.from_config_row(row) + if rule is not None: + rules.append(rule) + return cls( + rules, + enabled=bool(getattr(query_config, "product_title_exclusion_enabled", True)), + ) + + +class ProductTitleExclusionDetector: + def __init__( + self, + registry: ProductTitleExclusionRegistry, + *, + tokenizer: Optional[Callable[[str], Any]] = None, + ) -> None: + self.registry = registry + self.tokenizer = tokenizer + + def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: + seen = set() + variants: 
List[TokenizedText] = [] + texts = [ + getattr(parsed_query, "original_query", None), + getattr(parsed_query, "query_normalized", None), + getattr(parsed_query, "rewritten_query", None), + ] + + translations = getattr(parsed_query, "translations", {}) or {} + if isinstance(translations, dict): + texts.extend(translations.values()) + + for raw_text in texts: + text = str(raw_text or "").strip() + if not text: + continue + normalized = normalize_query_text(text) + if not normalized or normalized in seen: + continue + seen.add(normalized) + variants.append( + tokenize_text( + text, + tokenizer=self.tokenizer, + max_ngram=self.registry.max_term_ngram, + ) + ) + + return tuple(variants) + + def detect(self, parsed_query: Any) -> ProductTitleExclusionProfile: + if not self.registry.enabled or not self.registry.rules: + return ProductTitleExclusionProfile() + + query_variants = self._build_query_variants(parsed_query) + detected: List[DetectedProductTitleExclusion] = [] + seen_keys = set() + + for variant in query_variants: + for rule in self.registry.rules: + matched_term = rule.match_candidates(variant.candidates) + if not matched_term: + continue + + key = ( + tuple(rule.zh_title_exclusions), + tuple(rule.en_title_exclusions), + ) + if key in seen_keys: + continue + seen_keys.add(key) + detected.append( + DetectedProductTitleExclusion( + matched_term=matched_term, + matched_query_text=variant.text, + zh_title_exclusions=rule.zh_title_exclusions, + en_title_exclusions=rule.en_title_exclusions, + ) + ) + + return ProductTitleExclusionProfile( + query_variants=query_variants, + exclusions=tuple(detected), + ) diff --git a/query/query_parser.py b/query/query_parser.py index 64edaad..4b7fffc 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder from config import SearchConfig from translation import create_translation_client from .language_detector import LanguageDetector +from 
def rerank_query_text(
    original_query: str,
    *,
    detected_language: Optional[str] = None,
    translations: Optional[Dict[str, str]] = None,
) -> str:
    """
    Text substituted for ``{query}`` when calling the reranker.

    Chinese and English queries use the original string. For any other detected
    language, prefer the English translation, then Chinese; if neither exists,
    fall back to the original query.

    Args:
        original_query: The query exactly as the user entered it.
        detected_language: Language tag of the query. Case, surrounding
            whitespace and any region/script suffix are ignored, so
            ``"zh-CN"``, ``"zh_tw"`` and ``"EN-us"`` count as Chinese/English.
        translations: Mapping of language code to translated query text;
            blank or missing entries are skipped.

    Returns:
        The text to hand to the reranker.
    """
    lang = (detected_language or "").strip().lower()
    # Keep only the primary subtag: a tag such as "zh-cn" or "en_us" still
    # denotes a Chinese/English query, which must not be swapped for its
    # machine translation.
    primary_lang = lang.replace("_", "-").split("-", 1)[0]
    if primary_lang in ("zh", "en"):
        return original_query
    trans = translations or {}
    for key in ("en", "zh"):
        t = (trans.get(key) or "").strip()
        if t:
            return t
    return original_query
self.product_title_exclusion_profile is not None + else None + ), } @@ -94,6 +137,13 @@ class QueryParser: self.style_intent_registry, tokenizer=self._tokenizer, ) + self.product_title_exclusion_registry = ProductTitleExclusionRegistry.from_query_config( + config.query_config + ) + self.product_title_exclusion_detector = ProductTitleExclusionDetector( + self.product_title_exclusion_registry, + tokenizer=self._tokenizer, + ) # Eager initialization (startup-time failure visibility, no lazy init in request path) if self.config.query_config.enable_text_embedding and self._text_encoder is None: @@ -416,11 +466,16 @@ class QueryParser: query_tokens=query_tokens, ) style_intent_profile = self.style_intent_detector.detect(base_result) + product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) if context: context.store_intermediate_result( "style_intent_profile", style_intent_profile.to_dict(), ) + context.store_intermediate_result( + "product_title_exclusion_profile", + product_title_exclusion_profile.to_dict(), + ) result = ParsedQuery( original_query=query, @@ -431,6 +486,7 @@ class QueryParser: query_vector=query_vector, query_tokens=query_tokens, style_intent_profile=style_intent_profile, + product_title_exclusion_profile=product_title_exclusion_profile, ) if context and hasattr(context, 'logger'): diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 4b99a09..e5ffc2c 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -229,7 +229,10 @@ class ESQueryBuilder: # Build filter clauses for query (conjunctive filters + range filters) filter_clauses = self._build_filters(conjunctive_filters, range_filters) - + product_title_exclusion_filter = self._build_product_title_exclusion_filter(parsed_query) + if product_title_exclusion_filter: + filter_clauses.append(product_title_exclusion_filter) + # 3. 
@staticmethod
def _build_product_title_exclusion_filter(parsed_query: Optional[Any]) -> Optional[Dict[str, Any]]:
    """Build the filter clause that rejects documents with an excluded title phrase.

    Reads ``parsed_query.product_title_exclusion_profile``. One ``match_phrase``
    clause is emitted per excluded term, Chinese terms against ``title.zh``
    first, then English terms against ``title.en``. The clauses are OR-ed
    inside a ``must_not``.

    Returns:
        The ``bool`` filter dict, or ``None`` when there is no parsed query,
        no active profile, or no exclusion term.
    """
    profile = (
        None
        if parsed_query is None
        else getattr(parsed_query, "product_title_exclusion_profile", None)
    )
    if not profile:
        return None
    if not getattr(profile, "is_active", False):
        return None

    excluded_phrases: List[Dict[str, Any]] = [
        {"match_phrase": {"title.zh": {"query": zh_term}}}
        for zh_term in profile.all_zh_title_exclusions()
    ]
    excluded_phrases.extend(
        {"match_phrase": {"title.en": {"query": en_term}}}
        for en_term in profile.all_en_title_exclusions()
    )
    if not excluded_phrases:
        return None

    # A document is rejected as soon as any single excluded phrase matches.
    any_excluded_phrase = {
        "bool": {
            "should": excluded_phrases,
            "minimum_should_match": 1,
        }
    }
    return {"bool": {"must_not": [any_excluded_phrase]}}
parsed_query = SimpleNamespace( + rewritten_query="fitted dress", + detected_language="en", + translations={"zh": "修身 连衣裙"}, + product_title_exclusion_profile=SimpleNamespace( + is_active=True, + all_zh_title_exclusions=lambda: ["宽松"], + all_en_title_exclusions=lambda: ["loose", "relaxed"], + ), + ) + + q = qb.build_query( + query_text="fitted dress", + query_vector=np.array([0.1, 0.2, 0.3]), + parsed_query=parsed_query, + enable_knn=True, + ) + + expected_filter = { + "bool": { + "must_not": [ + { + "bool": { + "should": [ + {"match_phrase": {"title.zh": {"query": "宽松"}}}, + {"match_phrase": {"title.en": {"query": "loose"}}}, + {"match_phrase": {"title.en": {"query": "relaxed"}}}, + ], + "minimum_should_match": 1, + } + } + ] + } + } + + assert expected_filter in q["query"]["bool"]["filter"] + assert q["knn"]["filter"] == expected_filter diff --git a/tests/test_product_title_exclusion.py b/tests/test_product_title_exclusion.py new file mode 100644 index 0000000..79705d7 --- /dev/null +++ b/tests/test_product_title_exclusion.py @@ -0,0 +1,43 @@ +from types import SimpleNamespace + +from config import QueryConfig +from query.product_title_exclusion import ( + ProductTitleExclusionDetector, + ProductTitleExclusionRegistry, +) + + +def test_product_title_exclusion_detector_matches_translated_english_token(): + query_config = QueryConfig( + product_title_exclusion_rules=[ + { + "zh_trigger_terms": ["修身"], + "en_trigger_terms": ["fitted"], + "zh_title_exclusions": ["宽松"], + "en_title_exclusions": ["loose", "relaxed", "oversized", "baggy", "slouchy"], + } + ] + ) + detector = ProductTitleExclusionDetector( + ProductTitleExclusionRegistry.from_query_config(query_config), + tokenizer=lambda text: text.split(), + ) + + parsed_query = SimpleNamespace( + original_query="修身连衣裙", + query_normalized="修身 连衣裙", + rewritten_query="修身 连衣裙", + translations={"en": "fitted dress"}, + ) + + profile = detector.detect(parsed_query) + + assert profile.is_active is True + assert 
profile.all_zh_title_exclusions() == ["宽松"] + assert profile.all_en_title_exclusions() == [ + "loose", + "relaxed", + "oversized", + "baggy", + "slouchy", + ] diff --git a/tests/test_rerank_query_text.py b/tests/test_rerank_query_text.py new file mode 100644 index 0000000..fdce46e --- /dev/null +++ b/tests/test_rerank_query_text.py @@ -0,0 +1,55 @@ +"""Unit tests for rerank {query} text selection (translation fallback).""" + +from query.query_parser import ParsedQuery, rerank_query_text + + +def test_rerank_query_text_zh_uses_original(): + assert rerank_query_text("你好", detected_language="zh", translations={"en": "hello"}) == "你好" + + +def test_rerank_query_text_en_uses_original(): + assert rerank_query_text("hello", detected_language="en", translations={"zh": "你好"}) == "hello" + + +def test_rerank_query_text_russian_prefers_en_translation(): + assert ( + rerank_query_text( + "красное платье", + detected_language="ru", + translations={"en": "red dress", "zh": "红裙"}, + ) + == "red dress" + ) + + +def test_rerank_query_text_russian_falls_back_to_zh_when_no_en(): + assert ( + rerank_query_text( + "красное платье", + detected_language="ru", + translations={"zh": "红裙"}, + ) + == "红裙" + ) + + +def test_rerank_query_text_non_zh_en_falls_back_to_original_without_translations(): + assert rerank_query_text("foo", detected_language="ja", translations={}) == "foo" + + +def test_rerank_query_text_unknown_language_uses_en_when_present(): + assert ( + rerank_query_text("x", detected_language="unknown", translations={"en": "translated"}) + == "translated" + ) + + +def test_parsed_query_text_for_rerank_delegates(): + pq = ParsedQuery( + original_query="orig", + query_normalized="orig", + rewritten_query="rewritten", + detected_language="fr", + translations={"en": "en version"}, + ) + assert pq.text_for_rerank() == "en version" diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index 2ca38f0..1cff026 100644 --- a/tests/test_search_rerank_window.py 
+++ b/tests/test_search_rerank_window.py @@ -32,6 +32,15 @@ class _FakeParsedQuery: query_vector: Any = None style_intent_profile: Any = None + def text_for_rerank(self) -> str: + from query.query_parser import rerank_query_text + + return rerank_query_text( + self.original_query, + detected_language=self.detected_language, + translations=self.translations, + ) + def to_dict(self) -> Dict[str, Any]: return { "original_query": self.original_query, -- libgit2 0.21.2