Commit 74fdf9bd6f9de47fcd91155a02a258edf9ef9005

Authored by tangwang
1 parent 6adbf18a

1. 新增商品标题过滤/降权词典:当 query 中有独立分词匹配到指定的触发词时,过滤标题中带有指定排除词的商品(例如触发词 fitted/修身,会过滤标题含 宽松、loose、relaxed、oversized、baggy、slouchy 的商品)。
2. reranker 的 query:检测语言不是中文或英文时使用翻译后的文本(优先英文,其次中文),否则仍使用原始 query。
config/config.yaml
... ... @@ -124,6 +124,10 @@ query_config:
124 124 color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
125 125 size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
126 126  
  127 + product_title_exclusion:
  128 + enabled: true
  129 + dictionary_path: "config/dictionaries/product_title_exclusion.tsv"
  130 +
127 131 # 动态多语言检索字段配置
128 132 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
129 133 # shared_fields 为无语言后缀字段。
... ... @@ -376,7 +380,7 @@ services:
376 380 max_docs: 1000
377 381 normalize: true
378 382 # 服务内后端(reranker 进程启动时读取)
379   - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
  383 + backend: "bge" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
380 384 backends:
381 385 bge:
382 386 model_name: "BAAI/bge-reranker-v2-m3"
... ...
config/dictionaries/product_title_exclusion.tsv 0 → 100644
... ... @@ -0,0 +1,2 @@
  1 +# zh triggers en triggers zh title exclusions en title exclusions
  2 +修身 fitted 宽松 loose,relaxed,oversized,baggy,slouchy
... ...
config/loader.py
... ... @@ -113,6 +113,34 @@ def _read_synonym_csv_dictionary(path: Path) -> List[List[str]]:
113 113 return rows
114 114  
115 115  
def _read_product_title_exclusion_dictionary(path: Path) -> List[Dict[str, List[str]]]:
    """Load product-title exclusion rules from a four-column TSV file.

    Columns, in order: zh trigger terms, en trigger terms, zh title
    exclusions, en title exclusions. Each cell holds a comma-separated list
    of terms; any cell may be empty. Blank lines and lines whose first
    non-blank character is ``#`` are ignored, as are rows that do not have
    exactly four tab-separated columns.

    Args:
        path: Location of the TSV dictionary.

    Returns:
        One dict per accepted row with the keys ``zh_trigger_terms``,
        ``en_trigger_terms``, ``zh_title_exclusions`` and
        ``en_title_exclusions``. An empty list when the file does not exist.
    """

    def _split_cell(cell: str) -> List[str]:
        # Comma-separated cell -> trimmed, non-empty terms.
        return [item.strip() for item in cell.split(",") if item.strip()]

    rules: List[Dict[str, List[str]]] = []
    if not path.exists():
        return rules

    # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise hide the "#" of a header comment on the first line.
    with open(path, "r", encoding="utf-8-sig") as handle:
        for raw_line in handle:
            # Remove only the line terminator here: stripping all whitespace
            # would also eat leading/trailing tabs and shift the columns of a
            # row whose first or last cell is intentionally empty.
            line = raw_line.rstrip("\r\n")
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue

            parts = [segment.strip() for segment in line.split("\t")]
            if len(parts) != 4:
                # Legacy tolerance: rows with stray leading/trailing tabs
                # were accepted when the whole line was stripped first.
                parts = [segment.strip() for segment in stripped.split("\t")]
                if len(parts) != 4:
                    continue

            rules.append(
                {
                    "zh_trigger_terms": _split_cell(parts[0]),
                    "en_trigger_terms": _split_cell(parts[1]),
                    "zh_title_exclusions": _split_cell(parts[2]),
                    "en_title_exclusions": _split_cell(parts[3]),
                }
            )
    return rules
  142 +
  143 +
116 144 _DEFAULT_STYLE_INTENT_DIMENSION_ALIASES: Dict[str, List[str]] = {
117 145 "color": ["color", "colors", "colour", "colours", "颜色", "色", "色系"],
118 146 "size": ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"],
... ... @@ -282,6 +310,11 @@ class AppConfigLoader:
282 310 if isinstance(query_cfg.get("style_intent"), dict)
283 311 else {}
284 312 )
  313 + product_title_exclusion_cfg = (
  314 + query_cfg.get("product_title_exclusion")
  315 + if isinstance(query_cfg.get("product_title_exclusion"), dict)
  316 + else {}
  317 + )
285 318  
286 319 def _resolve_project_path(value: Any, default_path: Path) -> Path:
287 320 if value in (None, ""):
... ... @@ -316,6 +349,10 @@ class AppConfigLoader:
316 349 "color": _read_synonym_csv_dictionary(style_color_path),
317 350 "size": _read_synonym_csv_dictionary(style_size_path),
318 351 }
  352 + product_title_exclusion_path = _resolve_project_path(
  353 + product_title_exclusion_cfg.get("dictionary_path"),
  354 + self.config_dir / "dictionaries" / "product_title_exclusion.tsv",
  355 + )
319 356 query_config = QueryConfig(
320 357 supported_languages=list(query_cfg.get("supported_languages") or ["zh", "en"]),
321 358 default_language=str(query_cfg.get("default_language") or "en"),
... ... @@ -390,6 +427,10 @@ class AppConfigLoader:
390 427 style_intent_enabled=bool(style_intent_cfg.get("enabled", True)),
391 428 style_intent_terms=style_intent_terms,
392 429 style_intent_dimension_aliases=style_dimension_aliases,
  430 + product_title_exclusion_enabled=bool(product_title_exclusion_cfg.get("enabled", True)),
  431 + product_title_exclusion_rules=_read_product_title_exclusion_dictionary(
  432 + product_title_exclusion_path
  433 + ),
393 434 )
394 435  
395 436 function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {}
... ...
config/schema.py
... ... @@ -67,6 +67,8 @@ class QueryConfig:
67 67 style_intent_enabled: bool = True
68 68 style_intent_terms: Dict[str, List[List[str]]] = field(default_factory=dict)
69 69 style_intent_dimension_aliases: Dict[str, List[str]] = field(default_factory=dict)
  70 + product_title_exclusion_enabled: bool = True
  71 + product_title_exclusion_rules: List[Dict[str, List[str]]] = field(default_factory=list)
70 72  
71 73  
72 74 @dataclass(frozen=True)
... ...
query/product_title_exclusion.py 0 → 100644
... ... @@ -0,0 +1,225 @@
  1 +"""
  2 +Product title exclusion detection for query understanding.
  3 +"""
  4 +
  5 +from __future__ import annotations
  6 +
  7 +from dataclasses import dataclass, field
  8 +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
  9 +
  10 +from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  11 +
  12 +
def _dedupe_terms(terms: Iterable[str]) -> List[str]:
    """Normalize each term and return the distinct non-empty results.

    The first occurrence of every normalized term wins, so the output keeps
    the order in which terms were first seen.
    """
    # A dict is used as an insertion-ordered set.
    unique: Dict[str, None] = {}
    for candidate in terms:
        normalized = normalize_query_text(candidate)
        if normalized:
            unique.setdefault(normalized, None)
    return list(unique)
  23 +
  24 +
@dataclass(frozen=True)
class ProductTitleExclusionRule:
    """One dictionary rule: trigger terms and the title terms they exclude."""

    zh_trigger_terms: Tuple[str, ...]
    en_trigger_terms: Tuple[str, ...]
    zh_title_exclusions: Tuple[str, ...]
    en_title_exclusions: Tuple[str, ...]
    # Largest whitespace-separated word count among the trigger terms.
    max_term_ngram: int = 3

    @classmethod
    def from_config_row(cls, row: Dict[str, Sequence[str]]) -> Optional["ProductTitleExclusionRule"]:
        """Build a rule from a config row, or return ``None`` if it is unusable.

        A row is unusable when, after normalization and de-duplication, it
        has no trigger terms at all or no title exclusions at all.
        """

        def _column(name: str) -> Tuple[str, ...]:
            return tuple(_dedupe_terms(row.get(name) or []))

        zh_triggers = _column("zh_trigger_terms")
        en_triggers = _column("en_trigger_terms")
        zh_exclusions = _column("zh_title_exclusions")
        en_exclusions = _column("en_title_exclusions")

        has_exclusions = bool(zh_exclusions or en_exclusions)
        has_triggers = bool(zh_triggers or en_triggers)
        if not (has_exclusions and has_triggers):
            return None

        word_counts = [len(term.split()) for term in (*zh_triggers, *en_triggers)]
        return cls(
            zh_trigger_terms=zh_triggers,
            en_trigger_terms=en_triggers,
            zh_title_exclusions=zh_exclusions,
            en_title_exclusions=en_exclusions,
            max_term_ngram=max([1, *word_counts]),
        )

    def match_candidates(self, candidates: Iterable[str]) -> Optional[str]:
        """Return the first trigger term present among ``candidates``.

        Candidates are normalized before comparison. Chinese triggers are
        checked before English ones; ``None`` means no trigger matched.
        """
        pool = {normalize_query_text(candidate) for candidate in candidates}
        ordered_triggers = (*self.zh_trigger_terms, *self.en_trigger_terms)
        return next((term for term in ordered_triggers if term in pool), None)
  66 +
  67 +
@dataclass(frozen=True)
class DetectedProductTitleExclusion:
    """A rule hit: the trigger that matched and the exclusions it brings."""

    matched_term: str
    matched_query_text: str
    zh_title_exclusions: Tuple[str, ...]
    en_title_exclusions: Tuple[str, ...]

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view with the exclusion tuples copied to lists."""
        payload: Dict[str, Any] = {
            "matched_term": self.matched_term,
            "matched_query_text": self.matched_query_text,
        }
        payload["zh_title_exclusions"] = [*self.zh_title_exclusions]
        payload["en_title_exclusions"] = [*self.en_title_exclusions]
        return payload
  82 +
  83 +
@dataclass(frozen=True)
class ProductTitleExclusionProfile:
    """Detection result: the tokenized query variants and the rules they hit."""

    query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
    exclusions: Tuple[DetectedProductTitleExclusion, ...] = field(default_factory=tuple)

    @property
    def is_active(self) -> bool:
        """True when at least one exclusion was detected."""
        return True if self.exclusions else False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the profile."""
        variant_payloads = []
        for variant in self.query_variants:
            variant_payloads.append(
                {
                    "text": variant.text,
                    "normalized_text": variant.normalized_text,
                    "fine_tokens": list(variant.fine_tokens),
                    "coarse_tokens": list(variant.coarse_tokens),
                    "candidates": list(variant.candidates),
                }
            )
        exclusion_payloads = [detected.to_dict() for detected in self.exclusions]
        return {
            "active": self.is_active,
            "exclusions": exclusion_payloads,
            "query_variants": variant_payloads,
        }

    def all_zh_title_exclusions(self) -> List[str]:
        """Return the de-duplicated Chinese exclusion terms of all hits."""
        collected: List[str] = []
        for detected in self.exclusions:
            collected.extend(detected.zh_title_exclusions)
        return _dedupe_terms(collected)

    def all_en_title_exclusions(self) -> List[str]:
        """Return the de-duplicated English exclusion terms of all hits."""
        collected: List[str] = []
        for detected in self.exclusions:
            collected.extend(detected.en_title_exclusions)
        return _dedupe_terms(collected)
  122 +
  123 +
class ProductTitleExclusionRegistry:
    """Holds the configured exclusion rules and the feature switch."""

    def __init__(
        self,
        rules: Sequence[ProductTitleExclusionRule],
        *,
        enabled: bool = True,
    ) -> None:
        """Store the rules and derive the largest trigger n-gram size.

        ``max_term_ngram`` is the maximum over all rules, or 3 when there
        are no rules.
        """
        self.rules = tuple(rules)
        self.enabled = bool(enabled)
        ngram_sizes = [rule.max_term_ngram for rule in self.rules]
        self.max_term_ngram = max(ngram_sizes) if ngram_sizes else 3

    @classmethod
    def from_query_config(cls, query_config: Any) -> "ProductTitleExclusionRegistry":
        """Build a registry from a query-config object.

        Reads ``product_title_exclusion_rules`` (missing or falsy means no
        rules) and ``product_title_exclusion_enabled`` (missing means
        enabled). Rows that are not dicts, or that yield no usable rule,
        are ignored.
        """
        configured_rows = getattr(query_config, "product_title_exclusion_rules", []) or []
        built = (
            ProductTitleExclusionRule.from_config_row(row)
            for row in configured_rows
            if isinstance(row, dict)
        )
        usable_rules = [rule for rule in built if rule is not None]
        enabled = bool(getattr(query_config, "product_title_exclusion_enabled", True))
        return cls(usable_rules, enabled=enabled)
  149 +
  150 +
class ProductTitleExclusionDetector:
    """Finds which exclusion rules are triggered by a parsed query."""

    def __init__(
        self,
        registry: ProductTitleExclusionRegistry,
        *,
        tokenizer: Optional[Callable[[str], Any]] = None,
    ) -> None:
        """Keep the rule registry and the tokenizer passed to ``tokenize_text``."""
        self.registry = registry
        self.tokenizer = tokenizer

    def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
        """Tokenize every distinct textual form of the query.

        The forms are the original, normalized and rewritten query followed
        by all translation values. Forms whose normalized text is empty or
        was already seen are skipped, so each normalized text is tokenized
        once, in first-seen order.
        """
        sources = [
            getattr(parsed_query, attribute, None)
            for attribute in ("original_query", "query_normalized", "rewritten_query")
        ]
        translations = getattr(parsed_query, "translations", {}) or {}
        if isinstance(translations, dict):
            sources.extend(translations.values())

        # Keyed by normalized text; dict order preserves first-seen order.
        by_normalized: Dict[str, TokenizedText] = {}
        for source in sources:
            text = str(source or "").strip()
            if not text:
                continue
            normalized = normalize_query_text(text)
            if not normalized or normalized in by_normalized:
                continue
            by_normalized[normalized] = tokenize_text(
                text,
                tokenizer=self.tokenizer,
                max_ngram=self.registry.max_term_ngram,
            )

        return tuple(by_normalized.values())

    def detect(self, parsed_query: Any) -> ProductTitleExclusionProfile:
        """Return the exclusion profile for ``parsed_query``.

        An empty profile is returned when the registry is disabled or has
        no rules. Otherwise every query variant is checked against every
        rule; rules with the same pair of exclusion tuples are reported
        only once, for the first variant that triggers them.
        """
        registry = self.registry
        if not (registry.enabled and registry.rules):
            return ProductTitleExclusionProfile()

        variants = self._build_query_variants(parsed_query)
        # Keyed by (zh exclusions, en exclusions) to suppress duplicate hits.
        found: Dict[Tuple[Tuple[str, ...], Tuple[str, ...]], DetectedProductTitleExclusion] = {}

        for variant in variants:
            for rule in registry.rules:
                hit = rule.match_candidates(variant.candidates)
                if not hit:
                    continue

                signature = (
                    tuple(rule.zh_title_exclusions),
                    tuple(rule.en_title_exclusions),
                )
                if signature in found:
                    continue
                found[signature] = DetectedProductTitleExclusion(
                    matched_term=hit,
                    matched_query_text=variant.text,
                    zh_title_exclusions=rule.zh_title_exclusions,
                    en_title_exclusions=rule.en_title_exclusions,
                )

        return ProductTitleExclusionProfile(
            query_variants=variants,
            exclusions=tuple(found.values()),
        )
... ...
query/query_parser.py
... ... @@ -18,6 +18,11 @@ from embeddings.text_encoder import TextEmbeddingEncoder
18 18 from config import SearchConfig
19 19 from translation import create_translation_client
20 20 from .language_detector import LanguageDetector
  21 +from .product_title_exclusion import (
  22 + ProductTitleExclusionDetector,
  23 + ProductTitleExclusionProfile,
  24 + ProductTitleExclusionRegistry,
  25 +)
21 26 from .query_rewriter import QueryRewriter, QueryNormalizer
22 27 from .style_intent import StyleIntentDetector, StyleIntentProfile, StyleIntentRegistry
23 28 from .tokenization import extract_token_strings, simple_tokenize_query
... ... @@ -27,6 +32,30 @@ logger = logging.getLogger(__name__)
27 32 import hanlp # type: ignore
28 33  
29 34  
def rerank_query_text(
    original_query: str,
    *,
    detected_language: Optional[str] = None,
    translations: Optional[Dict[str, str]] = None,
) -> str:
    """
    Pick the text substituted for ``{query}`` when calling the reranker.

    Queries detected as Chinese (``zh``) or English (``en``) are passed
    through unchanged. For every other detected language (including a
    missing one) the first non-blank translation is used, trying English
    before Chinese; without such a translation the original query is
    returned.
    """
    language = (detected_language or "").strip().lower()
    if language not in ("zh", "en"):
        available = translations or {}
        # Lazy: the Chinese translation is only looked up if English is blank.
        stripped_candidates = (
            (available.get(code) or "").strip() for code in ("en", "zh")
        )
        preferred = next((text for text in stripped_candidates if text), None)
        if preferred is not None:
            return preferred
    return original_query
  57 +
  58 +
30 59 @dataclass(slots=True)
31 60 class ParsedQuery:
32 61 """Container for query parser facts."""
... ... @@ -39,6 +68,15 @@ class ParsedQuery:
39 68 query_vector: Optional[np.ndarray] = None
40 69 query_tokens: List[str] = field(default_factory=list)
41 70 style_intent_profile: Optional[StyleIntentProfile] = None
  71 + product_title_exclusion_profile: Optional[ProductTitleExclusionProfile] = None
  72 +
  73 + def text_for_rerank(self) -> str:
  74 + """See :func:`rerank_query_text`."""
  75 + return rerank_query_text(
  76 + self.original_query,
  77 + detected_language=self.detected_language,
  78 + translations=self.translations,
  79 + )
42 80  
43 81 def to_dict(self) -> Dict[str, Any]:
44 82 """Convert to dictionary representation."""
... ... @@ -52,6 +90,11 @@ class ParsedQuery:
52 90 "style_intent_profile": (
53 91 self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None
54 92 ),
  93 + "product_title_exclusion_profile": (
  94 + self.product_title_exclusion_profile.to_dict()
  95 + if self.product_title_exclusion_profile is not None
  96 + else None
  97 + ),
55 98 }
56 99  
57 100  
... ... @@ -94,6 +137,13 @@ class QueryParser:
94 137 self.style_intent_registry,
95 138 tokenizer=self._tokenizer,
96 139 )
  140 + self.product_title_exclusion_registry = ProductTitleExclusionRegistry.from_query_config(
  141 + config.query_config
  142 + )
  143 + self.product_title_exclusion_detector = ProductTitleExclusionDetector(
  144 + self.product_title_exclusion_registry,
  145 + tokenizer=self._tokenizer,
  146 + )
97 147  
98 148 # Eager initialization (startup-time failure visibility, no lazy init in request path)
99 149 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
... ... @@ -416,11 +466,16 @@ class QueryParser:
416 466 query_tokens=query_tokens,
417 467 )
418 468 style_intent_profile = self.style_intent_detector.detect(base_result)
  469 + product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result)
419 470 if context:
420 471 context.store_intermediate_result(
421 472 "style_intent_profile",
422 473 style_intent_profile.to_dict(),
423 474 )
  475 + context.store_intermediate_result(
  476 + "product_title_exclusion_profile",
  477 + product_title_exclusion_profile.to_dict(),
  478 + )
424 479  
425 480 result = ParsedQuery(
426 481 original_query=query,
... ... @@ -431,6 +486,7 @@ class QueryParser:
431 486 query_vector=query_vector,
432 487 query_tokens=query_tokens,
433 488 style_intent_profile=style_intent_profile,
  489 + product_title_exclusion_profile=product_title_exclusion_profile,
434 490 )
435 491  
436 492 if context and hasattr(context, 'logger'):
... ...
search/es_query_builder.py
... ... @@ -229,7 +229,10 @@ class ESQueryBuilder:
229 229  
230 230 # Build filter clauses for query (conjunctive filters + range filters)
231 231 filter_clauses = self._build_filters(conjunctive_filters, range_filters)
232   -
  232 + product_title_exclusion_filter = self._build_product_title_exclusion_filter(parsed_query)
  233 + if product_title_exclusion_filter:
  234 + filter_clauses.append(product_title_exclusion_filter)
  235 +
233 236 # 3. Build main query structure: filters and recall
234 237 if recall_clauses:
235 238 # Combine text recalls with OR logic (if multiple)
... ... @@ -780,6 +783,37 @@ class ESQueryBuilder:
780 783  
781 784 return filter_clauses
782 785  
  786 + @staticmethod
  787 + def _build_product_title_exclusion_filter(parsed_query: Optional[Any]) -> Optional[Dict[str, Any]]:
  788 + if parsed_query is None:
  789 + return None
  790 +
  791 + profile = getattr(parsed_query, "product_title_exclusion_profile", None)
  792 + if not profile or not getattr(profile, "is_active", False):
  793 + return None
  794 +
  795 + should_clauses: List[Dict[str, Any]] = []
  796 + for term in profile.all_zh_title_exclusions():
  797 + should_clauses.append({"match_phrase": {"title.zh": {"query": term}}})
  798 + for term in profile.all_en_title_exclusions():
  799 + should_clauses.append({"match_phrase": {"title.en": {"query": term}}})
  800 +
  801 + if not should_clauses:
  802 + return None
  803 +
  804 + return {
  805 + "bool": {
  806 + "must_not": [
  807 + {
  808 + "bool": {
  809 + "should": should_clauses,
  810 + "minimum_should_match": 1,
  811 + }
  812 + }
  813 + ]
  814 + }
  815 + }
  816 +
783 817 def add_sorting(
784 818 self,
785 819 es_query: Dict[str, Any],
... ...
search/searcher.py
... ... @@ -581,7 +581,7 @@ class Searcher:
581 581 try:
582 582 from .rerank_client import run_rerank
583 583  
584   - rerank_query = parsed_query.original_query if parsed_query else query
  584 + rerank_query = parsed_query.text_for_rerank() if parsed_query else query
585 585 es_response, rerank_meta, fused_debug = run_rerank(
586 586 query=rerank_query,
587 587 es_response=es_response,
... ...
tests/test_es_query_builder.py
... ... @@ -118,3 +118,44 @@ def test_text_query_skips_duplicate_translation_same_as_base():
118 118 root = q["query"]
119 119 assert root["bool"]["_name"] == "base_query"
120 120 assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"]
  121 +
  122 +
def test_product_title_exclusion_filter_is_applied_to_query_and_knn():
    """The exclusion filter must appear in the text query filters and be the KNN filter."""
    zh_terms = ["宽松"]
    en_terms = ["loose", "relaxed"]
    exclusion_profile = SimpleNamespace(
        is_active=True,
        all_zh_title_exclusions=lambda: list(zh_terms),
        all_en_title_exclusions=lambda: list(en_terms),
    )
    parsed_query = SimpleNamespace(
        rewritten_query="fitted dress",
        detected_language="en",
        translations={"zh": "修身 连衣裙"},
        product_title_exclusion_profile=exclusion_profile,
    )
    qb = _builder()

    q = qb.build_query(
        query_text="fitted dress",
        query_vector=np.array([0.1, 0.2, 0.3]),
        parsed_query=parsed_query,
        enable_knn=True,
    )

    # Chinese phrases come first, then English, mirroring the builder.
    phrase_clauses = [
        {"match_phrase": {"title.zh": {"query": term}}} for term in zh_terms
    ] + [
        {"match_phrase": {"title.en": {"query": term}}} for term in en_terms
    ]
    expected_filter = {
        "bool": {
            "must_not": [
                {"bool": {"should": phrase_clauses, "minimum_should_match": 1}}
            ]
        }
    }

    assert expected_filter in q["query"]["bool"]["filter"]
    assert q["knn"]["filter"] == expected_filter
... ...
tests/test_product_title_exclusion.py 0 → 100644
... ... @@ -0,0 +1,43 @@
  1 +from types import SimpleNamespace
  2 +
  3 +from config import QueryConfig
  4 +from query.product_title_exclusion import (
  5 + ProductTitleExclusionDetector,
  6 + ProductTitleExclusionRegistry,
  7 +)
  8 +
  9 +
def test_product_title_exclusion_detector_matches_translated_english_token():
    """A rule is detected for a Chinese query that also carries an English translation."""
    en_exclusions = ["loose", "relaxed", "oversized", "baggy", "slouchy"]
    rule_row = {
        "zh_trigger_terms": ["修身"],
        "en_trigger_terms": ["fitted"],
        "zh_title_exclusions": ["宽松"],
        "en_title_exclusions": list(en_exclusions),
    }
    registry = ProductTitleExclusionRegistry.from_query_config(
        QueryConfig(product_title_exclusion_rules=[rule_row])
    )
    # Whitespace tokenizer keeps the test independent of any real tokenizer.
    detector = ProductTitleExclusionDetector(
        registry,
        tokenizer=lambda text: text.split(),
    )
    parsed_query = SimpleNamespace(
        original_query="修身连衣裙",
        query_normalized="修身 连衣裙",
        rewritten_query="修身 连衣裙",
        translations={"en": "fitted dress"},
    )

    profile = detector.detect(parsed_query)

    assert profile.is_active is True
    assert profile.all_zh_title_exclusions() == ["宽松"]
    assert profile.all_en_title_exclusions() == en_exclusions
... ...
tests/test_rerank_query_text.py 0 → 100644
... ... @@ -0,0 +1,55 @@
  1 +"""Unit tests for rerank {query} text selection (translation fallback)."""
  2 +
  3 +from query.query_parser import ParsedQuery, rerank_query_text
  4 +
  5 +
def test_rerank_query_text_zh_uses_original():
    """A Chinese query is returned unchanged even if an English translation exists."""
    selected = rerank_query_text(
        "你好",
        detected_language="zh",
        translations={"en": "hello"},
    )
    assert selected == "你好"
  8 +
  9 +
def test_rerank_query_text_en_uses_original():
    """An English query is returned unchanged even if a Chinese translation exists."""
    selected = rerank_query_text(
        "hello",
        detected_language="en",
        translations={"zh": "你好"},
    )
    assert selected == "hello"
  12 +
  13 +
def test_rerank_query_text_russian_prefers_en_translation():
    """For a Russian query the English translation wins over the Chinese one."""
    selected = rerank_query_text(
        "красное платье",
        detected_language="ru",
        translations={"en": "red dress", "zh": "红裙"},
    )
    assert selected == "red dress"
  23 +
  24 +
def test_rerank_query_text_russian_falls_back_to_zh_when_no_en():
    """Without an English translation the Chinese translation is used."""
    selected = rerank_query_text(
        "красное платье",
        detected_language="ru",
        translations={"zh": "红裙"},
    )
    assert selected == "红裙"
  34 +
  35 +
def test_rerank_query_text_non_zh_en_falls_back_to_original_without_translations():
    """With no translations at all the original query is returned."""
    selected = rerank_query_text("foo", detected_language="ja", translations={})
    assert selected == "foo"
  38 +
  39 +
def test_rerank_query_text_unknown_language_uses_en_when_present():
    """An unrecognized language code still picks up the English translation."""
    selected = rerank_query_text(
        "x",
        detected_language="unknown",
        translations={"en": "translated"},
    )
    assert selected == "translated"
  45 +
  46 +
def test_parsed_query_text_for_rerank_delegates():
    """``ParsedQuery.text_for_rerank`` applies the same selection to its own fields."""
    fields = {
        "original_query": "orig",
        "query_normalized": "orig",
        "rewritten_query": "rewritten",
        "detected_language": "fr",
        "translations": {"en": "en version"},
    }
    pq = ParsedQuery(**fields)
    assert pq.text_for_rerank() == "en version"
... ...
tests/test_search_rerank_window.py
... ... @@ -32,6 +32,15 @@ class _FakeParsedQuery:
32 32 query_vector: Any = None
33 33 style_intent_profile: Any = None
34 34  
  35 + def text_for_rerank(self) -> str:
  36 + from query.query_parser import rerank_query_text
  37 +
  38 + return rerank_query_text(
  39 + self.original_query,
  40 + detected_language=self.detected_language,
  41 + translations=self.translations,
  42 + )
  43 +
35 44 def to_dict(self) -> Dict[str, Any]:
36 45 return {
37 46 "original_query": self.original_query,
... ...