Commit 74fdf9bd6f9de47fcd91155a02a258edf9ef9005
1 parent
6adbf18a
1.
加了一个过滤/降权词典,query中有独立分词匹配到指定的触发词,将过滤带某些分词的商品(比如fitted/修身,过滤宽松、loose、relaxed、baggy,slouchy等商品) 2. reranker的query使用翻译后的
Showing
12 changed files
with
515 additions
and
3 deletions
Show diff stats
config/config.yaml
| @@ -124,6 +124,10 @@ query_config: | @@ -124,6 +124,10 @@ query_config: | ||
| 124 | color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"] | 124 | color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"] |
| 125 | size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"] | 125 | size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"] |
| 126 | 126 | ||
| 127 | + product_title_exclusion: | ||
| 128 | + enabled: true | ||
| 129 | + dictionary_path: "config/dictionaries/product_title_exclusion.tsv" | ||
| 130 | + | ||
| 127 | # 动态多语言检索字段配置 | 131 | # 动态多语言检索字段配置 |
| 128 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | 132 | # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; |
| 129 | # shared_fields 为无语言后缀字段。 | 133 | # shared_fields 为无语言后缀字段。 |
| @@ -376,7 +380,7 @@ services: | @@ -376,7 +380,7 @@ services: | ||
| 376 | max_docs: 1000 | 380 | max_docs: 1000 |
| 377 | normalize: true | 381 | normalize: true |
| 378 | # 服务内后端(reranker 进程启动时读取) | 382 | # 服务内后端(reranker 进程启动时读取) |
| 379 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | 383 | + backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank |
| 380 | backends: | 384 | backends: |
| 381 | bge: | 385 | bge: |
| 382 | model_name: "BAAI/bge-reranker-v2-m3" | 386 | model_name: "BAAI/bge-reranker-v2-m3" |
config/loader.py
| @@ -113,6 +113,34 @@ def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]: | @@ -113,6 +113,34 @@ def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]: | ||
| 113 | return rows | 113 | return rows |
| 114 | 114 | ||
| 115 | 115 | ||
| 116 | +def _read_product_title_exclusion_dictionary(path: Path) -> List[Dict[str, List[str]]]: | ||
| 117 | + rules: List[Dict[str, List[str]]] = [] | ||
| 118 | + if not path.exists(): | ||
| 119 | + return rules | ||
| 120 | + | ||
| 121 | + with open(path, "r", encoding="utf-8") as handle: | ||
| 122 | + for raw_line in handle: | ||
| 123 | + line = raw_line.strip() | ||
| 124 | + if not line or line.startswith("#"): | ||
| 125 | + continue | ||
| 126 | + parts = [segment.strip() for segment in line.split("\t")] | ||
| 127 | + if len(parts) != 4: | ||
| 128 | + continue | ||
| 129 | + | ||
| 130 | + def _split_cell(cell: str) -> List[str]: | ||
| 131 | + return [item.strip() for item in cell.split(",") if item.strip()] | ||
| 132 | + | ||
| 133 | + rules.append( | ||
| 134 | + { | ||
| 135 | + "zh_trigger_terms": _split_cell(parts[0]), | ||
| 136 | + "en_trigger_terms": _split_cell(parts[1]), | ||
| 137 | + "zh_title_exclusions": _split_cell(parts[2]), | ||
| 138 | + "en_title_exclusions": _split_cell(parts[3]), | ||
| 139 | + } | ||
| 140 | + ) | ||
| 141 | + return rules | ||
| 142 | + | ||
| 143 | + | ||
| 116 | _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = { | 144 | _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = { |
| 117 | "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"], | 145 | "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"], |
| 118 | "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"], | 146 | "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"], |
| @@ -282,6 +310,11 @@ class AppConfigLoader: | @@ -282,6 +310,11 @@ class AppConfigLoader: | ||
| 282 | if isinstance(query_cfg.get("style_intent"), dict) | 310 | if isinstance(query_cfg.get("style_intent"), dict) |
| 283 | else {} | 311 | else {} |
| 284 | ) | 312 | ) |
| 313 | + product_title_exclusion_cfg = ( | ||
| 314 | + query_cfg.get("product_title_exclusion") | ||
| 315 | + if isinstance(query_cfg.get("product_title_exclusion"), dict) | ||
| 316 | + else {} | ||
| 317 | + ) | ||
| 285 | 318 | ||
| 286 | def _resolve_project_path(value: Any, default_path: Path) -> Path: | 319 | def _resolve_project_path(value: Any, default_path: Path) -> Path: |
| 287 | if value in (None, ""): | 320 | if value in (None, ""): |
| @@ -316,6 +349,10 @@ class AppConfigLoader: | @@ -316,6 +349,10 @@ class AppConfigLoader: | ||
| 316 | "color": _read_synonym_csv_dictionary(style_color_path), | 349 | "color": _read_synonym_csv_dictionary(style_color_path), |
| 317 | "size": _read_synonym_csv_dictionary(style_size_path), | 350 | "size": _read_synonym_csv_dictionary(style_size_path), |
| 318 | } | 351 | } |
| 352 | + product_title_exclusion_path = _resolve_project_path( | ||
| 353 | + product_title_exclusion_cfg.get("dictionary_path"), | ||
| 354 | + self.config_dir / "dictionaries" / "product_title_exclusion.tsv", | ||
| 355 | + ) | ||
| 319 | query_config = QueryConfig( | 356 | query_config = QueryConfig( |
| 320 | supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]), | 357 | supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]), |
| 321 | default_language=str(query_cfg.get("default_language") or "en"), | 358 | default_language=str(query_cfg.get("default_language") or "en"), |
| @@ -390,6 +427,10 @@ class AppConfigLoader: | @@ -390,6 +427,10 @@ class AppConfigLoader: | ||
| 390 | style_intent_enabled=bool(style_intent_cfg.get("enabled", True)), | 427 | style_intent_enabled=bool(style_intent_cfg.get("enabled", True)), |
| 391 | style_intent_terms=style_intent_terms, | 428 | style_intent_terms=style_intent_terms, |
| 392 | style_intent_dimension_aliases=style_dimension_aliases, | 429 | style_intent_dimension_aliases=style_dimension_aliases, |
| 430 | + product_title_exclusion_enabled=bool(product_title_exclusion_cfg.get("enabled", True)), | ||
| 431 | + product_title_exclusion_rules=_read_product_title_exclusion_dictionary( | ||
| 432 | + product_title_exclusion_path | ||
| 433 | + ), | ||
| 393 | ) | 434 | ) |
| 394 | 435 | ||
| 395 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} | 436 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} |
config/schema.py
| @@ -67,6 +67,8 @@ class QueryConfig: | @@ -67,6 +67,8 @@ class QueryConfig: | ||
| 67 | style_intent_enabled: bool = True | 67 | style_intent_enabled: bool = True |
| 68 | style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict) | 68 | style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict) |
| 69 | style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict) | 69 | style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict) |
| 70 | + product_title_exclusion_enabled: bool = True | ||
| 71 | + product_title_exclusion_rules: List[Dict[str, List[str]]] = field(default_factory=list) | ||
| 70 | 72 | ||
| 71 | 73 | ||
| 72 | @dataclass(frozen=True) | 74 | @dataclass(frozen=True) |
| @@ -0,0 +1,225 @@ | @@ -0,0 +1,225 @@ | ||
| 1 | +""" | ||
| 2 | +Product title exclusion detection for query understanding. | ||
| 3 | +""" | ||
| 4 | + | ||
| 5 | +from __future__ import annotations | ||
| 6 | + | ||
| 7 | +from dataclasses import dataclass, field | ||
| 8 | +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple | ||
| 9 | + | ||
| 10 | +from .tokenization import TokenizedText, normalize_query_text, tokenize_text | ||
| 11 | + | ||
| 12 | + | ||
def _dedupe_terms(terms: Iterable[str]) -> List[str]:
    """Normalize *terms* and return them in first-seen order, without duplicates."""
    ordered: List[str] = []
    encountered: Set[str] = set()
    for candidate in terms:
        normalized = normalize_query_text(candidate)
        # Skip empty results of normalization and anything already emitted.
        if normalized and normalized not in encountered:
            encountered.add(normalized)
            ordered.append(normalized)
    return ordered
| 23 | + | ||
| 24 | + | ||
@dataclass(frozen=True)
class ProductTitleExclusionRule:
    """One exclusion rule: trigger terms plus the title terms to filter out."""

    zh_trigger_terms: Tuple[str, ...]
    en_trigger_terms: Tuple[str, ...]
    zh_title_exclusions: Tuple[str, ...]
    en_title_exclusions: Tuple[str, ...]
    max_term_ngram: int = 3

    @classmethod
    def from_config_row(cls, row: Dict[str, Sequence[str]]) -> Optional["ProductTitleExclusionRule"]:
        """Build a rule from a loader row; return ``None`` when the row is unusable."""

        def _clean(key: str) -> Tuple[str, ...]:
            return tuple(_dedupe_terms(row.get(key) or []))

        zh_triggers = _clean("zh_trigger_terms")
        en_triggers = _clean("en_trigger_terms")
        zh_exclusions = _clean("zh_title_exclusions")
        en_exclusions = _clean("en_title_exclusions")

        # A usable rule needs at least one trigger and at least one exclusion.
        if not (zh_exclusions or en_exclusions) or not (zh_triggers or en_triggers):
            return None

        # The longest whitespace-delimited trigger decides the n-gram window
        # the tokenizer must produce for this rule to be matchable.
        ngram_sizes = [len(term.split()) for term in zh_triggers + en_triggers]
        return cls(
            zh_trigger_terms=zh_triggers,
            en_trigger_terms=en_triggers,
            zh_title_exclusions=zh_exclusions,
            en_title_exclusions=en_exclusions,
            max_term_ngram=max([1] + ngram_sizes),
        )

    def match_candidates(self, candidates: Iterable[str]) -> Optional[str]:
        """Return the first trigger term present among *candidates*, else ``None``."""
        normalized = {normalize_query_text(candidate) for candidate in candidates}
        # zh triggers are checked before en triggers, matching declaration order.
        for term in self.zh_trigger_terms + self.en_trigger_terms:
            if term in normalized:
                return term
        return None
| 66 | + | ||
| 67 | + | ||
@dataclass(frozen=True)
class DetectedProductTitleExclusion:
    """A trigger hit plus the title-exclusion terms carried by the matching rule."""

    matched_term: str
    matched_query_text: str
    zh_title_exclusions: Tuple[str, ...]
    en_title_exclusions: Tuple[str, ...]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for logging / intermediate-result storage."""
        payload: Dict[str, Any] = {
            "matched_term": self.matched_term,
            "matched_query_text": self.matched_query_text,
            "zh_title_exclusions": list(self.zh_title_exclusions),
            "en_title_exclusions": list(self.en_title_exclusions),
        }
        return payload
| 82 | + | ||
| 83 | + | ||
@dataclass(frozen=True)
class ProductTitleExclusionProfile:
    """Detection result: tokenized query variants plus the matched exclusions."""

    query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
    exclusions: Tuple[DetectedProductTitleExclusion, ...] = field(default_factory=tuple)

    @property
    def is_active(self) -> bool:
        """True when at least one exclusion rule fired."""
        return len(self.exclusions) > 0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for logging / intermediate-result storage."""
        serialized_variants = []
        for variant in self.query_variants:
            serialized_variants.append(
                {
                    "text": variant.text,
                    "normalized_text": variant.normalized_text,
                    "fine_tokens": list(variant.fine_tokens),
                    "coarse_tokens": list(variant.coarse_tokens),
                    "candidates": list(variant.candidates),
                }
            )
        return {
            "active": self.is_active,
            "exclusions": [item.to_dict() for item in self.exclusions],
            "query_variants": serialized_variants,
        }

    def all_zh_title_exclusions(self) -> List[str]:
        """Merged, deduplicated Chinese title-exclusion terms across all hits."""
        merged: List[str] = []
        for item in self.exclusions:
            merged.extend(item.zh_title_exclusions)
        return _dedupe_terms(merged)

    def all_en_title_exclusions(self) -> List[str]:
        """Merged, deduplicated English title-exclusion terms across all hits."""
        merged: List[str] = []
        for item in self.exclusions:
            merged.extend(item.en_title_exclusions)
        return _dedupe_terms(merged)
| 122 | + | ||
| 123 | + | ||
class ProductTitleExclusionRegistry:
    """Holds the parsed exclusion rules plus the feature flag."""

    def __init__(
        self,
        rules: Sequence[ProductTitleExclusionRule],
        *,
        enabled: bool = True,
    ) -> None:
        self.rules = tuple(rules)
        self.enabled = bool(enabled)
        # Widest trigger n-gram across all rules; default of 3 keeps the
        # tokenizer configuration sane even when no rules are loaded.
        self.max_term_ngram = max((rule.max_term_ngram for rule in self.rules), default=3)

    @classmethod
    def from_query_config(cls, query_config: Any) -> "ProductTitleExclusionRegistry":
        """Build a registry from QueryConfig-style attributes (duck-typed)."""
        parsed: List[ProductTitleExclusionRule] = []
        for row in getattr(query_config, "product_title_exclusion_rules", []) or []:
            # Ignore anything that is not a dict-shaped loader row.
            if isinstance(row, dict):
                rule = ProductTitleExclusionRule.from_config_row(row)
                if rule is not None:
                    parsed.append(rule)
        enabled_flag = getattr(query_config, "product_title_exclusion_enabled", True)
        return cls(parsed, enabled=bool(enabled_flag))
| 149 | + | ||
| 150 | + | ||
class ProductTitleExclusionDetector:
    """Matches query text against the registry and builds an exclusion profile."""

    def __init__(
        self,
        registry: ProductTitleExclusionRegistry,
        *,
        tokenizer: Optional[Callable[[str], Any]] = None,
    ) -> None:
        self.registry = registry
        self.tokenizer = tokenizer

    def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
        """Tokenize the original / normalized / rewritten / translated query texts.

        Variants are deduplicated on their normalized form so the same string is
        never tokenized (or matched) twice.
        """
        candidate_texts: List[Any] = [
            getattr(parsed_query, "original_query", None),
            getattr(parsed_query, "query_normalized", None),
            getattr(parsed_query, "rewritten_query", None),
        ]
        translations = getattr(parsed_query, "translations", {}) or {}
        if isinstance(translations, dict):
            candidate_texts.extend(translations.values())

        variants: List[TokenizedText] = []
        seen_normalized = set()
        for raw_text in candidate_texts:
            text = str(raw_text or "").strip()
            if not text:
                continue
            normalized = normalize_query_text(text)
            if not normalized or normalized in seen_normalized:
                continue
            seen_normalized.add(normalized)
            variants.append(
                tokenize_text(
                    text,
                    tokenizer=self.tokenizer,
                    max_ngram=self.registry.max_term_ngram,
                )
            )
        return tuple(variants)

    def detect(self, parsed_query: Any) -> ProductTitleExclusionProfile:
        """Run every rule against every query variant and collect unique hits."""
        if not (self.registry.enabled and self.registry.rules):
            return ProductTitleExclusionProfile()

        query_variants = self._build_query_variants(parsed_query)
        hits: List[DetectedProductTitleExclusion] = []
        # Deduplicate on the exclusion-term payload: two rules (or two query
        # variants) that would exclude the same terms contribute one entry.
        seen_payloads = set()
        for variant in query_variants:
            for rule in self.registry.rules:
                matched_term = rule.match_candidates(variant.candidates)
                if not matched_term:
                    continue
                payload_key = (rule.zh_title_exclusions, rule.en_title_exclusions)
                if payload_key in seen_payloads:
                    continue
                seen_payloads.add(payload_key)
                hits.append(
                    DetectedProductTitleExclusion(
                        matched_term=matched_term,
                        matched_query_text=variant.text,
                        zh_title_exclusions=rule.zh_title_exclusions,
                        en_title_exclusions=rule.en_title_exclusions,
                    )
                )

        return ProductTitleExclusionProfile(
            query_variants=query_variants,
            exclusions=tuple(hits),
        )
query/query_parser.py
| @@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder | @@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder | ||
| 18 | from config import SearchConfig | 18 | from config import SearchConfig |
| 19 | from translation import create_translation_client | 19 | from translation import create_translation_client |
| 20 | from .language_detector import LanguageDetector | 20 | from .language_detector import LanguageDetector |
| 21 | +from .product_title_exclusion import ( | ||
| 22 | + ProductTitleExclusionDetector, | ||
| 23 | + ProductTitleExclusionProfile, | ||
| 24 | + ProductTitleExclusionRegistry, | ||
| 25 | +) | ||
| 21 | from .query_rewriter import QueryRewriter, QueryNormalizer | 26 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 22 | from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry | 27 | from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry |
| 23 | from .tokenization import extract_token_strings, simple_tokenize_query | 28 | from .tokenization import extract_token_strings, simple_tokenize_query |
| @@ -27,6 +32,30 @@ logger = logging.getLogger(__name__) | @@ -27,6 +32,30 @@ logger = logging.getLogger(__name__) | ||
| 27 | import hanlp # type: ignore | 32 | import hanlp # type: ignore |
| 28 | 33 | ||
| 29 | 34 | ||
def rerank_query_text(
    original_query: str,
    *,
    detected_language: Optional[str] = None,
    translations: Optional[Dict[str, str]] = None,
) -> str:
    """
    Text substituted for ``{query}`` when calling the reranker.

    Chinese and English queries use the original string. For any other detected
    language, prefer the English translation, then Chinese; if neither exists,
    fall back to the original query.
    """
    language = (detected_language or "").strip().lower()
    if language not in ("zh", "en"):
        available = translations or {}
        for preferred in ("en", "zh"):
            candidate = (available.get(preferred) or "").strip()
            if candidate:
                return candidate
    return original_query
| 57 | + | ||
| 58 | + | ||
| 30 | @dataclass(slots=True) | 59 | @dataclass(slots=True) |
| 31 | class ParsedQuery: | 60 | class ParsedQuery: |
| 32 | """Container for query parser facts.""" | 61 | """Container for query parser facts.""" |
| @@ -39,6 +68,15 @@ class ParsedQuery: | @@ -39,6 +68,15 @@ class ParsedQuery: | ||
| 39 | query_vector: Optional[np.ndarray] = None | 68 | query_vector: Optional[np.ndarray] = None |
| 40 | query_tokens: List[str] = field(default_factory=list) | 69 | query_tokens: List[str] = field(default_factory=list) |
| 41 | style_intent_profile: Optional[StyleIntentProfile] = None | 70 | style_intent_profile: Optional[StyleIntentProfile] = None |
| 71 | + product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None | ||
| 72 | + | ||
| 73 | + def text_for_rerank(self) -> str: | ||
| 74 | + """See :func:`rerank_query_text`.""" | ||
| 75 | + return rerank_query_text( | ||
| 76 | + self.original_query, | ||
| 77 | + detected_language=self.detected_language, | ||
| 78 | + translations=self.translations, | ||
| 79 | + ) | ||
| 42 | 80 | ||
| 43 | def to_dict(self) -> Dict[str, Any]: | 81 | def to_dict(self) -> Dict[str, Any]: |
| 44 | """Convert to dictionary representation.""" | 82 | """Convert to dictionary representation.""" |
| @@ -52,6 +90,11 @@ class ParsedQuery: | @@ -52,6 +90,11 @@ class ParsedQuery: | ||
| 52 | "style_intent_profile": ( | 90 | "style_intent_profile": ( |
| 53 | self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None | 91 | self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None |
| 54 | ), | 92 | ), |
| 93 | + "product_title_exclusion_profile": ( | ||
| 94 | + self.product_title_exclusion_profile.to_dict() | ||
| 95 | + if self.product_title_exclusion_profile is not None | ||
| 96 | + else None | ||
| 97 | + ), | ||
| 55 | } | 98 | } |
| 56 | 99 | ||
| 57 | 100 | ||
| @@ -94,6 +137,13 @@ class QueryParser: | @@ -94,6 +137,13 @@ class QueryParser: | ||
| 94 | self.style_intent_registry, | 137 | self.style_intent_registry, |
| 95 | tokenizer=self._tokenizer, | 138 | tokenizer=self._tokenizer, |
| 96 | ) | 139 | ) |
| 140 | + self.product_title_exclusion_registry = ProductTitleExclusionRegistry.from_query_config( | ||
| 141 | + config.query_config | ||
| 142 | + ) | ||
| 143 | + self.product_title_exclusion_detector = ProductTitleExclusionDetector( | ||
| 144 | + self.product_title_exclusion_registry, | ||
| 145 | + tokenizer=self._tokenizer, | ||
| 146 | + ) | ||
| 97 | 147 | ||
| 98 | # Eager initialization (startup-time failure visibility, no lazy init in request path) | 148 | # Eager initialization (startup-time failure visibility, no lazy init in request path) |
| 99 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: | 149 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: |
| @@ -416,11 +466,16 @@ class QueryParser: | @@ -416,11 +466,16 @@ class QueryParser: | ||
| 416 | query_tokens=query_tokens, | 466 | query_tokens=query_tokens, |
| 417 | ) | 467 | ) |
| 418 | style_intent_profile = self.style_intent_detector.detect(base_result) | 468 | style_intent_profile = self.style_intent_detector.detect(base_result) |
| 469 | + product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) | ||
| 419 | if context: | 470 | if context: |
| 420 | context.store_intermediate_result( | 471 | context.store_intermediate_result( |
| 421 | "style_intent_profile", | 472 | "style_intent_profile", |
| 422 | style_intent_profile.to_dict(), | 473 | style_intent_profile.to_dict(), |
| 423 | ) | 474 | ) |
| 475 | + context.store_intermediate_result( | ||
| 476 | + "product_title_exclusion_profile", | ||
| 477 | + product_title_exclusion_profile.to_dict(), | ||
| 478 | + ) | ||
| 424 | 479 | ||
| 425 | result = ParsedQuery( | 480 | result = ParsedQuery( |
| 426 | original_query=query, | 481 | original_query=query, |
| @@ -431,6 +486,7 @@ class QueryParser: | @@ -431,6 +486,7 @@ class QueryParser: | ||
| 431 | query_vector=query_vector, | 486 | query_vector=query_vector, |
| 432 | query_tokens=query_tokens, | 487 | query_tokens=query_tokens, |
| 433 | style_intent_profile=style_intent_profile, | 488 | style_intent_profile=style_intent_profile, |
| 489 | + product_title_exclusion_profile=product_title_exclusion_profile, | ||
| 434 | ) | 490 | ) |
| 435 | 491 | ||
| 436 | if context and hasattr(context, 'logger'): | 492 | if context and hasattr(context, 'logger'): |
search/es_query_builder.py
| @@ -229,7 +229,10 @@ class ESQueryBuilder: | @@ -229,7 +229,10 @@ class ESQueryBuilder: | ||
| 229 | 229 | ||
| 230 | # Build filter clauses for query (conjunctive filters + range filters) | 230 | # Build filter clauses for query (conjunctive filters + range filters) |
| 231 | filter_clauses = self._build_filters(conjunctive_filters, range_filters) | 231 | filter_clauses = self._build_filters(conjunctive_filters, range_filters) |
| 232 | - | 232 | + product_title_exclusion_filter = self._build_product_title_exclusion_filter(parsed_query) |
| 233 | + if product_title_exclusion_filter: | ||
| 234 | + filter_clauses.append(product_title_exclusion_filter) | ||
| 235 | + | ||
| 233 | # 3. Build main query structure: filters and recall | 236 | # 3. Build main query structure: filters and recall |
| 234 | if recall_clauses: | 237 | if recall_clauses: |
| 235 | # Combine text recalls with OR logic (if multiple) | 238 | # Combine text recalls with OR logic (if multiple) |
| @@ -780,6 +783,37 @@ class ESQueryBuilder: | @@ -780,6 +783,37 @@ class ESQueryBuilder: | ||
| 780 | 783 | ||
| 781 | return filter_clauses | 784 | return filter_clauses |
| 782 | 785 | ||
| 786 | + @staticmethod | ||
| 787 | + def _build_product_title_exclusion_filter(parsed_query: Optional[Any]) -> Optional[Dict[str, Any]]: | ||
| 788 | + if parsed_query is None: | ||
| 789 | + return None | ||
| 790 | + | ||
| 791 | + profile = getattr(parsed_query, "product_title_exclusion_profile", None) | ||
| 792 | + if not profile or not getattr(profile, "is_active", False): | ||
| 793 | + return None | ||
| 794 | + | ||
| 795 | + should_clauses: List[Dict[str, Any]] = [] | ||
| 796 | + for term in profile.all_zh_title_exclusions(): | ||
| 797 | + should_clauses.append({"match_phrase": {"title.zh": {"query": term}}}) | ||
| 798 | + for term in profile.all_en_title_exclusions(): | ||
| 799 | + should_clauses.append({"match_phrase": {"title.en": {"query": term}}}) | ||
| 800 | + | ||
| 801 | + if not should_clauses: | ||
| 802 | + return None | ||
| 803 | + | ||
| 804 | + return { | ||
| 805 | + "bool": { | ||
| 806 | + "must_not": [ | ||
| 807 | + { | ||
| 808 | + "bool": { | ||
| 809 | + "should": should_clauses, | ||
| 810 | + "minimum_should_match": 1, | ||
| 811 | + } | ||
| 812 | + } | ||
| 813 | + ] | ||
| 814 | + } | ||
| 815 | + } | ||
| 816 | + | ||
| 783 | def add_sorting( | 817 | def add_sorting( |
| 784 | self, | 818 | self, |
| 785 | es_query: Dict[str, Any], | 819 | es_query: Dict[str, Any], |
search/searcher.py
| @@ -581,7 +581,7 @@ class Searcher: | @@ -581,7 +581,7 @@ class Searcher: | ||
| 581 | try: | 581 | try: |
| 582 | from .rerank_client import run_rerank | 582 | from .rerank_client import run_rerank |
| 583 | 583 | ||
| 584 | - rerank_query = parsed_query.original_query if parsed_query else query | 584 | + rerank_query = parsed_query.text_for_rerank() if parsed_query else query |
| 585 | es_response, rerank_meta, fused_debug = run_rerank( | 585 | es_response, rerank_meta, fused_debug = run_rerank( |
| 586 | query=rerank_query, | 586 | query=rerank_query, |
| 587 | es_response=es_response, | 587 | es_response=es_response, |
tests/test_es_query_builder.py
| @@ -118,3 +118,44 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | @@ -118,3 +118,44 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | ||
| 118 | root = q["query"] | 118 | root = q["query"] |
| 119 | assert root["bool"]["_name"] == "base_query" | 119 | assert root["bool"]["_name"] == "base_query" |
| 120 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | 120 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] |
| 121 | + | ||
| 122 | + | ||
def test_product_title_exclusion_filter_is_applied_to_query_and_knn():
    """The exclusion filter must constrain both the BM25 query and the kNN clause."""
    builder = _builder()
    fake_profile = SimpleNamespace(
        is_active=True,
        all_zh_title_exclusions=lambda: ["宽松"],
        all_en_title_exclusions=lambda: ["loose", "relaxed"],
    )
    fake_parsed_query = SimpleNamespace(
        rewritten_query="fitted dress",
        detected_language="en",
        translations={"zh": "修身 连衣裙"},
        product_title_exclusion_profile=fake_profile,
    )

    es_query = builder.build_query(
        query_text="fitted dress",
        query_vector=np.array([0.1, 0.2, 0.3]),
        parsed_query=fake_parsed_query,
        enable_knn=True,
    )

    inner_should = [
        {"match_phrase": {"title.zh": {"query": "宽松"}}},
        {"match_phrase": {"title.en": {"query": "loose"}}},
        {"match_phrase": {"title.en": {"query": "relaxed"}}},
    ]
    expected_filter = {
        "bool": {
            "must_not": [
                {"bool": {"should": inner_should, "minimum_should_match": 1}}
            ]
        }
    }

    assert expected_filter in es_query["query"]["bool"]["filter"]
    assert es_query["knn"]["filter"] == expected_filter
| @@ -0,0 +1,43 @@ | @@ -0,0 +1,43 @@ | ||
| 1 | +from types import SimpleNamespace | ||
| 2 | + | ||
| 3 | +from config import QueryConfig | ||
| 4 | +from query.product_title_exclusion import ( | ||
| 5 | + ProductTitleExclusionDetector, | ||
| 6 | + ProductTitleExclusionRegistry, | ||
| 7 | +) | ||
| 8 | + | ||
| 9 | + | ||
def test_product_title_exclusion_detector_matches_translated_english_token():
    """A zh query whose en translation carries the trigger still fires the rule."""
    cfg = QueryConfig(
        product_title_exclusion_rules=[
            {
                "zh_trigger_terms": ["修身"],
                "en_trigger_terms": ["fitted"],
                "zh_title_exclusions": ["宽松"],
                "en_title_exclusions": ["loose", "relaxed", "oversized", "baggy", "slouchy"],
            }
        ]
    )
    detector = ProductTitleExclusionDetector(
        ProductTitleExclusionRegistry.from_query_config(cfg),
        tokenizer=str.split,
    )

    fake_query = SimpleNamespace(
        original_query="修身连衣裙",
        query_normalized="修身 连衣裙",
        rewritten_query="修身 连衣裙",
        translations={"en": "fitted dress"},
    )

    profile = detector.detect(fake_query)

    assert profile.is_active is True
    assert profile.all_zh_title_exclusions() == ["宽松"]
    assert profile.all_en_title_exclusions() == [
        "loose",
        "relaxed",
        "oversized",
        "baggy",
        "slouchy",
    ]
| @@ -0,0 +1,55 @@ | @@ -0,0 +1,55 @@ | ||
| 1 | +"""Unit tests for rerank {query} text selection (translation fallback).""" | ||
| 2 | + | ||
| 3 | +from query.query_parser import ParsedQuery, rerank_query_text | ||
| 4 | + | ||
| 5 | + | ||
def test_rerank_query_text_zh_uses_original():
    result = rerank_query_text("你好", detected_language="zh", translations={"en": "hello"})
    assert result == "你好"


def test_rerank_query_text_en_uses_original():
    result = rerank_query_text("hello", detected_language="en", translations={"zh": "你好"})
    assert result == "hello"


def test_rerank_query_text_russian_prefers_en_translation():
    result = rerank_query_text(
        "красное платье",
        detected_language="ru",
        translations={"en": "red dress", "zh": "红裙"},
    )
    assert result == "red dress"


def test_rerank_query_text_russian_falls_back_to_zh_when_no_en():
    result = rerank_query_text(
        "красное платье",
        detected_language="ru",
        translations={"zh": "红裙"},
    )
    assert result == "红裙"


def test_rerank_query_text_non_zh_en_falls_back_to_original_without_translations():
    assert rerank_query_text("foo", detected_language="ja", translations={}) == "foo"


def test_rerank_query_text_unknown_language_uses_en_when_present():
    result = rerank_query_text(
        "x", detected_language="unknown", translations={"en": "translated"}
    )
    assert result == "translated"


def test_parsed_query_text_for_rerank_delegates():
    parsed = ParsedQuery(
        original_query="orig",
        query_normalized="orig",
        rewritten_query="rewritten",
        detected_language="fr",
        translations={"en": "en version"},
    )
    assert parsed.text_for_rerank() == "en version"
tests/test_search_rerank_window.py
| @@ -32,6 +32,15 @@ class _FakeParsedQuery: | @@ -32,6 +32,15 @@ class _FakeParsedQuery: | ||
| 32 | query_vector: Any = None | 32 | query_vector: Any = None |
| 33 | style_intent_profile: Any = None | 33 | style_intent_profile: Any = None |
| 34 | 34 | ||
def text_for_rerank(self) -> str:
    # Mirror ParsedQuery.text_for_rerank so tests exercise the same selection
    # logic; imported lazily to keep the fake dependency-free at import time.
    from query.query_parser import rerank_query_text as _rerank_text

    return _rerank_text(
        self.original_query,
        detected_language=self.detected_language,
        translations=self.translations,
    )
| 35 | def to_dict(self) -> Dict[str, Any]: | 44 | def to_dict(self) -> Dict[str, Any]: |
| 36 | return { | 45 | return { |
| 37 | "original_query": self.original_query, | 46 | "original_query": self.original_query, |