Commit 74fdf9bd6f9de47fcd91155a02a258edf9ef9005
1 parent: 6adbf18a
1. 加了一个过滤/降权词典:query 中有独立分词匹配到指定的触发词时,将过滤标题中带某些分词的商品(比如 fitted/修身,过滤宽松、loose、relaxed、baggy、slouchy 等商品)。
2. reranker 的 query 使用翻译后的文本(检测到的语言不是中文/英文时,优先使用英文译文,其次中文译文;两者都没有则回退到原始 query)。
Showing 12 changed files with 515 additions and 3 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -124,6 +124,10 @@ query_config: |
| 124 | 124 | color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"] |
| 125 | 125 | size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"] |
| 126 | 126 | |
| 127 | + product_title_exclusion: | |
| 128 | + enabled: true | |
| 129 | + dictionary_path: "config/dictionaries/product_title_exclusion.tsv" | |
| 130 | + | |
| 127 | 131 | # 动态多语言检索字段配置 |
| 128 | 132 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; |
| 129 | 133 | # shared_fields 为无语言后缀字段。 |
| ... | ... | @@ -376,7 +380,7 @@ services: |
| 376 | 380 | max_docs: 1000 |
| 377 | 381 | normalize: true |
| 378 | 382 | # 服务内后端(reranker 进程启动时读取) |
| 379 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 383 | + backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 380 | 384 | backends: |
| 381 | 385 | bge: |
| 382 | 386 | model_name: "BAAI/bge-reranker-v2-m3" | ... | ... |
config/loader.py
| ... | ... | @@ -113,6 +113,34 @@ def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]: |
| 113 | 113 | return rows |
| 114 | 114 | |
| 115 | 115 | |
| 116 | +def _read_product_title_exclusion_dictionary(path: Path) -> List[Dict[str, List[str]]]: | |
| 117 | + rules: List[Dict[str, List[str]]] = [] | |
| 118 | + if not path.exists(): | |
| 119 | + return rules | |
| 120 | + | |
| 121 | + with open(path, "r", encoding="utf-8") as handle: | |
| 122 | + for raw_line in handle: | |
| 123 | + line = raw_line.strip() | |
| 124 | + if not line or line.startswith("#"): | |
| 125 | + continue | |
| 126 | + parts = [segment.strip() for segment in line.split("\t")] | |
| 127 | + if len(parts) != 4: | |
| 128 | + continue | |
| 129 | + | |
| 130 | + def _split_cell(cell: str) -> List[str]: | |
| 131 | + return [item.strip() for item in cell.split(",") if item.strip()] | |
| 132 | + | |
| 133 | + rules.append( | |
| 134 | + { | |
| 135 | + "zh_trigger_terms": _split_cell(parts[0]), | |
| 136 | + "en_trigger_terms": _split_cell(parts[1]), | |
| 137 | + "zh_title_exclusions": _split_cell(parts[2]), | |
| 138 | + "en_title_exclusions": _split_cell(parts[3]), | |
| 139 | + } | |
| 140 | + ) | |
| 141 | + return rules | |
| 142 | + | |
| 143 | + | |
| 116 | 144 | _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = { |
| 117 | 145 | "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"], |
| 118 | 146 | "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"], |
| ... | ... | @@ -282,6 +310,11 @@ class AppConfigLoader: |
| 282 | 310 | if isinstance(query_cfg.get("style_intent"), dict) |
| 283 | 311 | else {} |
| 284 | 312 | ) |
| 313 | + product_title_exclusion_cfg = ( | |
| 314 | + query_cfg.get("product_title_exclusion") | |
| 315 | + if isinstance(query_cfg.get("product_title_exclusion"), dict) | |
| 316 | + else {} | |
| 317 | + ) | |
| 285 | 318 | |
| 286 | 319 | def _resolve_project_path(value: Any, default_path: Path) -> Path: |
| 287 | 320 | if value in (None, ""): |
| ... | ... | @@ -316,6 +349,10 @@ class AppConfigLoader: |
| 316 | 349 | "color": _read_synonym_csv_dictionary(style_color_path), |
| 317 | 350 | "size": _read_synonym_csv_dictionary(style_size_path), |
| 318 | 351 | } |
| 352 | + product_title_exclusion_path = _resolve_project_path( | |
| 353 | + product_title_exclusion_cfg.get("dictionary_path"), | |
| 354 | + self.config_dir / "dictionaries" / "product_title_exclusion.tsv", | |
| 355 | + ) | |
| 319 | 356 | query_config = QueryConfig( |
| 320 | 357 | supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]), |
| 321 | 358 | default_language=str(query_cfg.get("default_language") or "en"), |
| ... | ... | @@ -390,6 +427,10 @@ class AppConfigLoader: |
| 390 | 427 | style_intent_enabled=bool(style_intent_cfg.get("enabled", True)), |
| 391 | 428 | style_intent_terms=style_intent_terms, |
| 392 | 429 | style_intent_dimension_aliases=style_dimension_aliases, |
| 430 | + product_title_exclusion_enabled=bool(product_title_exclusion_cfg.get("enabled", True)), | |
| 431 | + product_title_exclusion_rules=_read_product_title_exclusion_dictionary( | |
| 432 | + product_title_exclusion_path | |
| 433 | + ), | |
| 393 | 434 | ) |
| 394 | 435 | |
| 395 | 436 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} | ... | ... |
config/schema.py
| ... | ... | @@ -67,6 +67,8 @@ class QueryConfig: |
| 67 | 67 | style_intent_enabled: bool = True |
| 68 | 68 | style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict) |
| 69 | 69 | style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict) |
| 70 | + product_title_exclusion_enabled: bool = True | |
| 71 | + product_title_exclusion_rules: List[Dict[str, List[str]]] = field(default_factory=list) | |
| 70 | 72 | |
| 71 | 73 | |
| 72 | 74 | @dataclass(frozen=True) | ... | ... |
query/product_title_exclusion.py
| ... | ... | @@ -0,0 +1,225 @@ |
| 1 | +""" | |
| 2 | +Product title exclusion detection for query understanding. | |
| 3 | +""" | |
| 4 | + | |
| 5 | +from __future__ import annotations | |
| 6 | + | |
| 7 | +from dataclasses import dataclass, field | |
| 8 | +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple | |
| 9 | + | |
| 10 | +from .tokenization import TokenizedText, normalize_query_text, tokenize_text | |
| 11 | + | |
| 12 | + | |
| 13 | +def _dedupe_terms(terms: Iterable[str]) -> List[str]: | |
| 14 | + result: List[str] = [] | |
| 15 | + seen: Set[str] = set() | |
| 16 | + for raw_term in terms: | |
| 17 | + term = normalize_query_text(raw_term) | |
| 18 | + if not term or term in seen: | |
| 19 | + continue | |
| 20 | + seen.add(term) | |
| 21 | + result.append(term) | |
| 22 | + return result | |
| 23 | + | |
| 24 | + | |
| 25 | +@dataclass(frozen=True) | |
| 26 | +class ProductTitleExclusionRule: | |
| 27 | + zh_trigger_terms: Tuple[str, ...] | |
| 28 | + en_trigger_terms: Tuple[str, ...] | |
| 29 | + zh_title_exclusions: Tuple[str, ...] | |
| 30 | + en_title_exclusions: Tuple[str, ...] | |
| 31 | + max_term_ngram: int = 3 | |
| 32 | + | |
| 33 | + @classmethod | |
| 34 | + def from_config_row(cls, row: Dict[str, Sequence[str]]) -> Optional["ProductTitleExclusionRule"]: | |
| 35 | + zh_trigger_terms = tuple(_dedupe_terms(row.get("zh_trigger_terms") or [])) | |
| 36 | + en_trigger_terms = tuple(_dedupe_terms(row.get("en_trigger_terms") or [])) | |
| 37 | + zh_title_exclusions = tuple(_dedupe_terms(row.get("zh_title_exclusions") or [])) | |
| 38 | + en_title_exclusions = tuple(_dedupe_terms(row.get("en_title_exclusions") or [])) | |
| 39 | + if not zh_title_exclusions and not en_title_exclusions: | |
| 40 | + return None | |
| 41 | + if not zh_trigger_terms and not en_trigger_terms: | |
| 42 | + return None | |
| 43 | + | |
| 44 | + max_ngram = max( | |
| 45 | + [1] | |
| 46 | + + [len(term.split()) for term in zh_trigger_terms] | |
| 47 | + + [len(term.split()) for term in en_trigger_terms] | |
| 48 | + ) | |
| 49 | + return cls( | |
| 50 | + zh_trigger_terms=zh_trigger_terms, | |
| 51 | + en_trigger_terms=en_trigger_terms, | |
| 52 | + zh_title_exclusions=zh_title_exclusions, | |
| 53 | + en_title_exclusions=en_title_exclusions, | |
| 54 | + max_term_ngram=max_ngram, | |
| 55 | + ) | |
| 56 | + | |
| 57 | + def match_candidates(self, candidates: Iterable[str]) -> Optional[str]: | |
| 58 | + normalized_candidates = {normalize_query_text(candidate) for candidate in candidates} | |
| 59 | + for term in self.zh_trigger_terms: | |
| 60 | + if term in normalized_candidates: | |
| 61 | + return term | |
| 62 | + for term in self.en_trigger_terms: | |
| 63 | + if term in normalized_candidates: | |
| 64 | + return term | |
| 65 | + return None | |
| 66 | + | |
| 67 | + | |
| 68 | +@dataclass(frozen=True) | |
| 69 | +class DetectedProductTitleExclusion: | |
| 70 | + matched_term: str | |
| 71 | + matched_query_text: str | |
| 72 | + zh_title_exclusions: Tuple[str, ...] | |
| 73 | + en_title_exclusions: Tuple[str, ...] | |
| 74 | + | |
| 75 | + def to_dict(self) -> Dict[str, Any]: | |
| 76 | + return { | |
| 77 | + "matched_term": self.matched_term, | |
| 78 | + "matched_query_text": self.matched_query_text, | |
| 79 | + "zh_title_exclusions": list(self.zh_title_exclusions), | |
| 80 | + "en_title_exclusions": list(self.en_title_exclusions), | |
| 81 | + } | |
| 82 | + | |
| 83 | + | |
| 84 | +@dataclass(frozen=True) | |
| 85 | +class ProductTitleExclusionProfile: | |
| 86 | + query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple) | |
| 87 | + exclusions: Tuple[DetectedProductTitleExclusion, ...] = field(default_factory=tuple) | |
| 88 | + | |
| 89 | + @property | |
| 90 | + def is_active(self) -> bool: | |
| 91 | + return bool(self.exclusions) | |
| 92 | + | |
| 93 | + def to_dict(self) -> Dict[str, Any]: | |
| 94 | + return { | |
| 95 | + "active": self.is_active, | |
| 96 | + "exclusions": [item.to_dict() for item in self.exclusions], | |
| 97 | + "query_variants": [ | |
| 98 | + { | |
| 99 | + "text": variant.text, | |
| 100 | + "normalized_text": variant.normalized_text, | |
| 101 | + "fine_tokens": list(variant.fine_tokens), | |
| 102 | + "coarse_tokens": list(variant.coarse_tokens), | |
| 103 | + "candidates": list(variant.candidates), | |
| 104 | + } | |
| 105 | + for variant in self.query_variants | |
| 106 | + ], | |
| 107 | + } | |
| 108 | + | |
| 109 | + def all_zh_title_exclusions(self) -> List[str]: | |
| 110 | + return _dedupe_terms( | |
| 111 | + term | |
| 112 | + for item in self.exclusions | |
| 113 | + for term in item.zh_title_exclusions | |
| 114 | + ) | |
| 115 | + | |
| 116 | + def all_en_title_exclusions(self) -> List[str]: | |
| 117 | + return _dedupe_terms( | |
| 118 | + term | |
| 119 | + for item in self.exclusions | |
| 120 | + for term in item.en_title_exclusions | |
| 121 | + ) | |
| 122 | + | |
| 123 | + | |
| 124 | +class ProductTitleExclusionRegistry: | |
| 125 | + def __init__( | |
| 126 | + self, | |
| 127 | + rules: Sequence[ProductTitleExclusionRule], | |
| 128 | + *, | |
| 129 | + enabled: bool = True, | |
| 130 | + ) -> None: | |
| 131 | + self.rules = tuple(rules) | |
| 132 | + self.enabled = bool(enabled) | |
| 133 | + self.max_term_ngram = max((rule.max_term_ngram for rule in self.rules), default=3) | |
| 134 | + | |
| 135 | + @classmethod | |
| 136 | + def from_query_config(cls, query_config: Any) -> "ProductTitleExclusionRegistry": | |
| 137 | + raw_rules = getattr(query_config, "product_title_exclusion_rules", []) or [] | |
| 138 | + rules: List[ProductTitleExclusionRule] = [] | |
| 139 | + for row in raw_rules: | |
| 140 | + if not isinstance(row, dict): | |
| 141 | + continue | |
| 142 | + rule = ProductTitleExclusionRule.from_config_row(row) | |
| 143 | + if rule is not None: | |
| 144 | + rules.append(rule) | |
| 145 | + return cls( | |
| 146 | + rules, | |
| 147 | + enabled=bool(getattr(query_config, "product_title_exclusion_enabled", True)), | |
| 148 | + ) | |
| 149 | + | |
| 150 | + | |
| 151 | +class ProductTitleExclusionDetector: | |
| 152 | + def __init__( | |
| 153 | + self, | |
| 154 | + registry: ProductTitleExclusionRegistry, | |
| 155 | + *, | |
| 156 | + tokenizer: Optional[Callable[[str], Any]] = None, | |
| 157 | + ) -> None: | |
| 158 | + self.registry = registry | |
| 159 | + self.tokenizer = tokenizer | |
| 160 | + | |
| 161 | + def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: | |
| 162 | + seen = set() | |
| 163 | + variants: List[TokenizedText] = [] | |
| 164 | + texts = [ | |
| 165 | + getattr(parsed_query, "original_query", None), | |
| 166 | + getattr(parsed_query, "query_normalized", None), | |
| 167 | + getattr(parsed_query, "rewritten_query", None), | |
| 168 | + ] | |
| 169 | + | |
| 170 | + translations = getattr(parsed_query, "translations", {}) or {} | |
| 171 | + if isinstance(translations, dict): | |
| 172 | + texts.extend(translations.values()) | |
| 173 | + | |
| 174 | + for raw_text in texts: | |
| 175 | + text = str(raw_text or "").strip() | |
| 176 | + if not text: | |
| 177 | + continue | |
| 178 | + normalized = normalize_query_text(text) | |
| 179 | + if not normalized or normalized in seen: | |
| 180 | + continue | |
| 181 | + seen.add(normalized) | |
| 182 | + variants.append( | |
| 183 | + tokenize_text( | |
| 184 | + text, | |
| 185 | + tokenizer=self.tokenizer, | |
| 186 | + max_ngram=self.registry.max_term_ngram, | |
| 187 | + ) | |
| 188 | + ) | |
| 189 | + | |
| 190 | + return tuple(variants) | |
| 191 | + | |
| 192 | + def detect(self, parsed_query: Any) -> ProductTitleExclusionProfile: | |
| 193 | + if not self.registry.enabled or not self.registry.rules: | |
| 194 | + return ProductTitleExclusionProfile() | |
| 195 | + | |
| 196 | + query_variants = self._build_query_variants(parsed_query) | |
| 197 | + detected: List[DetectedProductTitleExclusion] = [] | |
| 198 | + seen_keys = set() | |
| 199 | + | |
| 200 | + for variant in query_variants: | |
| 201 | + for rule in self.registry.rules: | |
| 202 | + matched_term = rule.match_candidates(variant.candidates) | |
| 203 | + if not matched_term: | |
| 204 | + continue | |
| 205 | + | |
| 206 | + key = ( | |
| 207 | + tuple(rule.zh_title_exclusions), | |
| 208 | + tuple(rule.en_title_exclusions), | |
| 209 | + ) | |
| 210 | + if key in seen_keys: | |
| 211 | + continue | |
| 212 | + seen_keys.add(key) | |
| 213 | + detected.append( | |
| 214 | + DetectedProductTitleExclusion( | |
| 215 | + matched_term=matched_term, | |
| 216 | + matched_query_text=variant.text, | |
| 217 | + zh_title_exclusions=rule.zh_title_exclusions, | |
| 218 | + en_title_exclusions=rule.en_title_exclusions, | |
| 219 | + ) | |
| 220 | + ) | |
| 221 | + | |
| 222 | + return ProductTitleExclusionProfile( | |
| 223 | + query_variants=query_variants, | |
| 224 | + exclusions=tuple(detected), | |
| 225 | + ) | ... | ... |
query/query_parser.py
| ... | ... | @@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder |
| 18 | 18 | from config import SearchConfig |
| 19 | 19 | from translation import create_translation_client |
| 20 | 20 | from .language_detector import LanguageDetector |
| 21 | +from .product_title_exclusion import ( | |
| 22 | + ProductTitleExclusionDetector, | |
| 23 | + ProductTitleExclusionProfile, | |
| 24 | + ProductTitleExclusionRegistry, | |
| 25 | +) | |
| 21 | 26 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 22 | 27 | from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry |
| 23 | 28 | from .tokenization import extract_token_strings, simple_tokenize_query |
| ... | ... | @@ -27,6 +32,30 @@ logger = logging.getLogger(__name__) |
| 27 | 32 | import hanlp # type: ignore |
| 28 | 33 | |
| 29 | 34 | |
| 35 | +def rerank_query_text( | |
| 36 | + original_query: str, | |
| 37 | + *, | |
| 38 | + detected_language: Optional[str] = None, | |
| 39 | + translations: Optional[Dict[str, str]] = None, | |
| 40 | +) -> str: | |
| 41 | + """ | |
| 42 | + Text substituted for ``{query}`` when calling the reranker. | |
| 43 | + | |
| 44 | + Chinese and English queries use the original string. For any other detected | |
| 45 | + language, prefer the English translation, then Chinese; if neither exists, | |
| 46 | + fall back to the original query. | |
| 47 | + """ | |
| 48 | + lang = (detected_language or "").strip().lower() | |
| 49 | + if lang in ("zh", "en"): | |
| 50 | + return original_query | |
| 51 | + trans = translations or {} | |
| 52 | + for key in ("en", "zh"): | |
| 53 | + t = (trans.get(key) or "").strip() | |
| 54 | + if t: | |
| 55 | + return t | |
| 56 | + return original_query | |
| 57 | + | |
| 58 | + | |
| 30 | 59 | @dataclass(slots=True) |
| 31 | 60 | class ParsedQuery: |
| 32 | 61 | """Container for query parser facts.""" |
| ... | ... | @@ -39,6 +68,15 @@ class ParsedQuery: |
| 39 | 68 | query_vector: Optional[np.ndarray] = None |
| 40 | 69 | query_tokens: List[str] = field(default_factory=list) |
| 41 | 70 | style_intent_profile: Optional[StyleIntentProfile] = None |
| 71 | + product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None | |
| 72 | + | |
| 73 | + def text_for_rerank(self) -> str: | |
| 74 | + """See :func:`rerank_query_text`.""" | |
| 75 | + return rerank_query_text( | |
| 76 | + self.original_query, | |
| 77 | + detected_language=self.detected_language, | |
| 78 | + translations=self.translations, | |
| 79 | + ) | |
| 42 | 80 | |
| 43 | 81 | def to_dict(self) -> Dict[str, Any]: |
| 44 | 82 | """Convert to dictionary representation.""" |
| ... | ... | @@ -52,6 +90,11 @@ class ParsedQuery: |
| 52 | 90 | "style_intent_profile": ( |
| 53 | 91 | self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None |
| 54 | 92 | ), |
| 93 | + "product_title_exclusion_profile": ( | |
| 94 | + self.product_title_exclusion_profile.to_dict() | |
| 95 | + if self.product_title_exclusion_profile is not None | |
| 96 | + else None | |
| 97 | + ), | |
| 55 | 98 | } |
| 56 | 99 | |
| 57 | 100 | |
| ... | ... | @@ -94,6 +137,13 @@ class QueryParser: |
| 94 | 137 | self.style_intent_registry, |
| 95 | 138 | tokenizer=self._tokenizer, |
| 96 | 139 | ) |
| 140 | + self.product_title_exclusion_registry = ProductTitleExclusionRegistry.from_query_config( | |
| 141 | + config.query_config | |
| 142 | + ) | |
| 143 | + self.product_title_exclusion_detector = ProductTitleExclusionDetector( | |
| 144 | + self.product_title_exclusion_registry, | |
| 145 | + tokenizer=self._tokenizer, | |
| 146 | + ) | |
| 97 | 147 | |
| 98 | 148 | # Eager initialization (startup-time failure visibility, no lazy init in request path) |
| 99 | 149 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: |
| ... | ... | @@ -416,11 +466,16 @@ class QueryParser: |
| 416 | 466 | query_tokens=query_tokens, |
| 417 | 467 | ) |
| 418 | 468 | style_intent_profile = self.style_intent_detector.detect(base_result) |
| 469 | + product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) | |
| 419 | 470 | if context: |
| 420 | 471 | context.store_intermediate_result( |
| 421 | 472 | "style_intent_profile", |
| 422 | 473 | style_intent_profile.to_dict(), |
| 423 | 474 | ) |
| 475 | + context.store_intermediate_result( | |
| 476 | + "product_title_exclusion_profile", | |
| 477 | + product_title_exclusion_profile.to_dict(), | |
| 478 | + ) | |
| 424 | 479 | |
| 425 | 480 | result = ParsedQuery( |
| 426 | 481 | original_query=query, |
| ... | ... | @@ -431,6 +486,7 @@ class QueryParser: |
| 431 | 486 | query_vector=query_vector, |
| 432 | 487 | query_tokens=query_tokens, |
| 433 | 488 | style_intent_profile=style_intent_profile, |
| 489 | + product_title_exclusion_profile=product_title_exclusion_profile, | |
| 434 | 490 | ) |
| 435 | 491 | |
| 436 | 492 | if context and hasattr(context, 'logger'): | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -229,7 +229,10 @@ class ESQueryBuilder: |
| 229 | 229 | |
| 230 | 230 | # Build filter clauses for query (conjunctive filters + range filters) |
| 231 | 231 | filter_clauses = self._build_filters(conjunctive_filters, range_filters) |
| 232 | - | |
| 232 | + product_title_exclusion_filter = self._build_product_title_exclusion_filter(parsed_query) | |
| 233 | + if product_title_exclusion_filter: | |
| 234 | + filter_clauses.append(product_title_exclusion_filter) | |
| 235 | + | |
| 233 | 236 | # 3. Build main query structure: filters and recall |
| 234 | 237 | if recall_clauses: |
| 235 | 238 | # Combine text recalls with OR logic (if multiple) |
| ... | ... | @@ -780,6 +783,37 @@ class ESQueryBuilder: |
| 780 | 783 | |
| 781 | 784 | return filter_clauses |
| 782 | 785 | |
| 786 | + @staticmethod | |
| 787 | + def _build_product_title_exclusion_filter(parsed_query: Optional[Any]) -> Optional[Dict[str, Any]]: | |
| 788 | + if parsed_query is None: | |
| 789 | + return None | |
| 790 | + | |
| 791 | + profile = getattr(parsed_query, "product_title_exclusion_profile", None) | |
| 792 | + if not profile or not getattr(profile, "is_active", False): | |
| 793 | + return None | |
| 794 | + | |
| 795 | + should_clauses: List[Dict[str, Any]] = [] | |
| 796 | + for term in profile.all_zh_title_exclusions(): | |
| 797 | + should_clauses.append({"match_phrase": {"title.zh": {"query": term}}}) | |
| 798 | + for term in profile.all_en_title_exclusions(): | |
| 799 | + should_clauses.append({"match_phrase": {"title.en": {"query": term}}}) | |
| 800 | + | |
| 801 | + if not should_clauses: | |
| 802 | + return None | |
| 803 | + | |
| 804 | + return { | |
| 805 | + "bool": { | |
| 806 | + "must_not": [ | |
| 807 | + { | |
| 808 | + "bool": { | |
| 809 | + "should": should_clauses, | |
| 810 | + "minimum_should_match": 1, | |
| 811 | + } | |
| 812 | + } | |
| 813 | + ] | |
| 814 | + } | |
| 815 | + } | |
| 816 | + | |
| 783 | 817 | def add_sorting( |
| 784 | 818 | self, |
| 785 | 819 | es_query: Dict[str, Any], | ... | ... |
search/searcher.py
| ... | ... | @@ -581,7 +581,7 @@ class Searcher: |
| 581 | 581 | try: |
| 582 | 582 | from .rerank_client import run_rerank |
| 583 | 583 | |
| 584 | - rerank_query = parsed_query.original_query if parsed_query else query | |
| 584 | + rerank_query = parsed_query.text_for_rerank() if parsed_query else query | |
| 585 | 585 | es_response, rerank_meta, fused_debug = run_rerank( |
| 586 | 586 | query=rerank_query, |
| 587 | 587 | es_response=es_response, | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -118,3 +118,44 @@ def test_text_query_skips_duplicate_translation_same_as_base(): |
| 118 | 118 | root = q["query"] |
| 119 | 119 | assert root["bool"]["_name"] == "base_query" |
| 120 | 120 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] |
| 121 | + | |
| 122 | + | |
| 123 | +def test_product_title_exclusion_filter_is_applied_to_query_and_knn(): | |
| 124 | + qb = _builder() | |
| 125 | + parsed_query = SimpleNamespace( | |
| 126 | + rewritten_query="fitted dress", | |
| 127 | + detected_language="en", | |
| 128 | + translations={"zh": "修身 连衣裙"}, | |
| 129 | + product_title_exclusion_profile=SimpleNamespace( | |
| 130 | + is_active=True, | |
| 131 | + all_zh_title_exclusions=lambda: ["宽松"], | |
| 132 | + all_en_title_exclusions=lambda: ["loose", "relaxed"], | |
| 133 | + ), | |
| 134 | + ) | |
| 135 | + | |
| 136 | + q = qb.build_query( | |
| 137 | + query_text="fitted dress", | |
| 138 | + query_vector=np.array([0.1, 0.2, 0.3]), | |
| 139 | + parsed_query=parsed_query, | |
| 140 | + enable_knn=True, | |
| 141 | + ) | |
| 142 | + | |
| 143 | + expected_filter = { | |
| 144 | + "bool": { | |
| 145 | + "must_not": [ | |
| 146 | + { | |
| 147 | + "bool": { | |
| 148 | + "should": [ | |
| 149 | + {"match_phrase": {"title.zh": {"query": "宽松"}}}, | |
| 150 | + {"match_phrase": {"title.en": {"query": "loose"}}}, | |
| 151 | + {"match_phrase": {"title.en": {"query": "relaxed"}}}, | |
| 152 | + ], | |
| 153 | + "minimum_should_match": 1, | |
| 154 | + } | |
| 155 | + } | |
| 156 | + ] | |
| 157 | + } | |
| 158 | + } | |
| 159 | + | |
| 160 | + assert expected_filter in q["query"]["bool"]["filter"] | |
| 161 | + assert q["knn"]["filter"] == expected_filter | ... | ... |
| ... | ... | @@ -0,0 +1,43 @@ |
| 1 | +from types import SimpleNamespace | |
| 2 | + | |
| 3 | +from config import QueryConfig | |
| 4 | +from query.product_title_exclusion import ( | |
| 5 | + ProductTitleExclusionDetector, | |
| 6 | + ProductTitleExclusionRegistry, | |
| 7 | +) | |
| 8 | + | |
| 9 | + | |
| 10 | +def test_product_title_exclusion_detector_matches_translated_english_token(): | |
| 11 | + query_config = QueryConfig( | |
| 12 | + product_title_exclusion_rules=[ | |
| 13 | + { | |
| 14 | + "zh_trigger_terms": ["修身"], | |
| 15 | + "en_trigger_terms": ["fitted"], | |
| 16 | + "zh_title_exclusions": ["宽松"], | |
| 17 | + "en_title_exclusions": ["loose", "relaxed", "oversized", "baggy", "slouchy"], | |
| 18 | + } | |
| 19 | + ] | |
| 20 | + ) | |
| 21 | + detector = ProductTitleExclusionDetector( | |
| 22 | + ProductTitleExclusionRegistry.from_query_config(query_config), | |
| 23 | + tokenizer=lambda text: text.split(), | |
| 24 | + ) | |
| 25 | + | |
| 26 | + parsed_query = SimpleNamespace( | |
| 27 | + original_query="修身连衣裙", | |
| 28 | + query_normalized="修身 连衣裙", | |
| 29 | + rewritten_query="修身 连衣裙", | |
| 30 | + translations={"en": "fitted dress"}, | |
| 31 | + ) | |
| 32 | + | |
| 33 | + profile = detector.detect(parsed_query) | |
| 34 | + | |
| 35 | + assert profile.is_active is True | |
| 36 | + assert profile.all_zh_title_exclusions() == ["宽松"] | |
| 37 | + assert profile.all_en_title_exclusions() == [ | |
| 38 | + "loose", | |
| 39 | + "relaxed", | |
| 40 | + "oversized", | |
| 41 | + "baggy", | |
| 42 | + "slouchy", | |
| 43 | + ] | ... | ... |
| ... | ... | @@ -0,0 +1,55 @@ |
| 1 | +"""Unit tests for rerank {query} text selection (translation fallback).""" | |
| 2 | + | |
| 3 | +from query.query_parser import ParsedQuery, rerank_query_text | |
| 4 | + | |
| 5 | + | |
| 6 | +def test_rerank_query_text_zh_uses_original(): | |
| 7 | + assert rerank_query_text("你好", detected_language="zh", translations={"en": "hello"}) == "你好" | |
| 8 | + | |
| 9 | + | |
| 10 | +def test_rerank_query_text_en_uses_original(): | |
| 11 | + assert rerank_query_text("hello", detected_language="en", translations={"zh": "你好"}) == "hello" | |
| 12 | + | |
| 13 | + | |
| 14 | +def test_rerank_query_text_russian_prefers_en_translation(): | |
| 15 | + assert ( | |
| 16 | + rerank_query_text( | |
| 17 | + "красное платье", | |
| 18 | + detected_language="ru", | |
| 19 | + translations={"en": "red dress", "zh": "红裙"}, | |
| 20 | + ) | |
| 21 | + == "red dress" | |
| 22 | + ) | |
| 23 | + | |
| 24 | + | |
| 25 | +def test_rerank_query_text_russian_falls_back_to_zh_when_no_en(): | |
| 26 | + assert ( | |
| 27 | + rerank_query_text( | |
| 28 | + "красное платье", | |
| 29 | + detected_language="ru", | |
| 30 | + translations={"zh": "红裙"}, | |
| 31 | + ) | |
| 32 | + == "红裙" | |
| 33 | + ) | |
| 34 | + | |
| 35 | + | |
| 36 | +def test_rerank_query_text_non_zh_en_falls_back_to_original_without_translations(): | |
| 37 | + assert rerank_query_text("foo", detected_language="ja", translations={}) == "foo" | |
| 38 | + | |
| 39 | + | |
| 40 | +def test_rerank_query_text_unknown_language_uses_en_when_present(): | |
| 41 | + assert ( | |
| 42 | + rerank_query_text("x", detected_language="unknown", translations={"en": "translated"}) | |
| 43 | + == "translated" | |
| 44 | + ) | |
| 45 | + | |
| 46 | + | |
| 47 | +def test_parsed_query_text_for_rerank_delegates(): | |
| 48 | + pq = ParsedQuery( | |
| 49 | + original_query="orig", | |
| 50 | + query_normalized="orig", | |
| 51 | + rewritten_query="rewritten", | |
| 52 | + detected_language="fr", | |
| 53 | + translations={"en": "en version"}, | |
| 54 | + ) | |
| 55 | + assert pq.text_for_rerank() == "en version" | ... | ... |
tests/test_search_rerank_window.py
| ... | ... | @@ -32,6 +32,15 @@ class _FakeParsedQuery: |
| 32 | 32 | query_vector: Any = None |
| 33 | 33 | style_intent_profile: Any = None |
| 34 | 34 | |
| 35 | + def text_for_rerank(self) -> str: | |
| 36 | + from query.query_parser import rerank_query_text | |
| 37 | + | |
| 38 | + return rerank_query_text( | |
| 39 | + self.original_query, | |
| 40 | + detected_language=self.detected_language, | |
| 41 | + translations=self.translations, | |
| 42 | + ) | |
| 43 | + | |
| 35 | 44 | def to_dict(self) -> Dict[str, Any]: |
| 36 | 45 | return { |
| 37 | 46 | "original_query": self.original_query, | ... | ... |