from __future__ import annotations from dataclasses import dataclass, field from pathlib import Path from types import SimpleNamespace from typing import Any, Dict, List import numpy as np import yaml from config import ( ConfigLoader, FineRankConfig, FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig, ) from context import create_request_context from query.style_intent import DetectedStyleIntent, StyleIntentProfile from search.searcher import Searcher @dataclass class _FakeParsedQuery: original_query: str query_normalized: str rewritten_query: str detected_language: str = "en" translations: Dict[str, str] = None keywords_queries: Dict[str, str] = field(default_factory=dict) query_vector: Any = None image_query_vector: Any = None query_tokens: List[str] = field(default_factory=list) style_intent_profile: Any = None def text_for_rerank(self) -> str: from query.query_parser import rerank_query_text return rerank_query_text( self.original_query, detected_language=self.detected_language, translations=self.translations, ) def to_dict(self) -> Dict[str, Any]: return { "original_query": self.original_query, "query_normalized": self.query_normalized, "rewritten_query": self.rewritten_query, "detected_language": self.detected_language, "translations": self.translations or {}, "style_intent_profile": ( self.style_intent_profile.to_dict() if self.style_intent_profile is not None else None ), } def _build_style_intent_profile(intent_type: str, canonical_value: str, *dimension_aliases: str) -> StyleIntentProfile: aliases = dimension_aliases or (intent_type,) return StyleIntentProfile( intents=( DetectedStyleIntent( intent_type=intent_type, canonical_value=canonical_value, matched_term=canonical_value, matched_query_text=canonical_value, attribute_terms=(canonical_value,), dimension_aliases=tuple(aliases), ), ) ) class _FakeQueryParser: def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, ) class _FakeQueryBuilder: knn_text_k = 120 knn_text_k_long = 160 knn_text_num_candidates = 400 knn_text_num_candidates_long = 500 knn_text_boost = 20.0 knn_image_k = 120 knn_image_num_candidates = 400 knn_image_boost = 20.0 def build_query(self, **kwargs): return { "query": {"match_all": {}}, "size": kwargs["size"], "from": kwargs["from_"], } def build_facets(self, facets: Any): return {} def add_sorting(self, es_query: Dict[str, Any], sort_by: str, sort_order: str): return es_query class _FakeESClient: def __init__(self, total_hits: int = 5000): self.calls: List[Dict[str, Any]] = [] self.total_hits = total_hits @staticmethod def _apply_source_filter(src: Dict[str, Any], source_spec: Any) -> Dict[str, Any]: if source_spec is None: return dict(src) if source_spec is False: return {} if isinstance(source_spec, dict): includes = source_spec.get("includes") or [] elif isinstance(source_spec, list): includes = source_spec else: includes = [] if not includes: return dict(src) return {k: v for k, v in src.items() if k in set(includes)} @staticmethod def _full_source(doc_id: str) -> Dict[str, Any]: return { "spu_id": doc_id, "title": {"en": f"product-{doc_id}"}, "brief": {"en": f"brief-{doc_id}"}, "vendor": {"en": f"vendor-{doc_id}"}, "skus": [], } def search( self, index_name: str, body: Dict[str, Any], size: int, from_: int, include_named_queries_score: bool = False, ): self.calls.append( { "index_name": index_name, "body": body, "size": size, "from_": from_, "include_named_queries_score": include_named_queries_score, } ) ids_query = (((body or {}).get("query") or {}).get("ids") or {}).get("values") source_spec = (body or {}).get("_source") if isinstance(ids_query, list): # Return reversed order intentionally; caller should restore original ranking order. ids = [str(i) for i in ids_query][::-1] hits = [] for doc_id in ids: src = self._apply_source_filter(self._full_source(doc_id), source_spec) hit = {"_id": doc_id, "_score": 1.0} if source_spec is not False: hit["_source"] = src hits.append(hit) else: end = min(from_ + size, self.total_hits) hits = [] for i in range(from_, end): doc_id = str(i) src = self._apply_source_filter(self._full_source(doc_id), source_spec) hit = {"_id": doc_id, "_score": float(self.total_hits - i)} if source_spec is not False: hit["_source"] = src hits.append(hit) return { "took": 8, "hits": { "total": {"value": self.total_hits}, "max_score": hits[0]["_score"] if hits else 0.0, "hits": hits, }, } def _build_search_config( *, rerank_enabled: bool = True, rerank_window: int = 384, exact_knn_rescore_enabled: bool = False, exact_knn_rescore_window: int = 0, ): return SearchConfig( field_boosts={"title.en": 3.0}, indexes=[IndexConfig(name="default", label="default", fields=["title.en"])], query_config=QueryConfig(enable_text_embedding=False, enable_query_rewrite=False), function_score=FunctionScoreConfig(), rerank=RerankConfig( enabled=rerank_enabled, rerank_window=rerank_window, exact_knn_rescore_enabled=exact_knn_rescore_enabled, exact_knn_rescore_window=exact_knn_rescore_window, ), spu_config=SPUConfig(enabled=False), es_index_name="test_products", es_settings={}, ) def _build_searcher(config: SearchConfig, es_client: _FakeESClient) -> Searcher: searcher = Searcher( es_client=es_client, config=config, query_parser=_FakeQueryParser(), ) searcher.query_builder = _FakeQueryBuilder() return searcher class _FakeTextEncoder: def __init__(self, vectors: Dict[str, List[float]]): self.vectors = { key: np.array(value, dtype=np.float32) for key, value in vectors.items() } def encode(self, sentences, priority: int = 0, **kwargs): if isinstance(sentences, str): sentences = [sentences] return np.array([self.vectors[text] for text in sentences], dtype=object) def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): config_data = { "es_index_name": "test_products", "field_boosts": {"title.en": 3.0}, "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], "query_config": {"supported_languages": ["en"], "default_language": "en"}, "services": { "translation": { "service_url": "http://localhost:6005", "timeout_sec": 3.0, "default_model": "dummy-model", "default_scene": "general", "cache": { "ttl_seconds": 60, "sliding_expiration": True, }, "capabilities": { "dummy-model": { "enabled": True, "backend": "llm", "use_cache": True, "model": "dummy-model", "base_url": "http://localhost:6005/v1", "timeout_sec": 3.0, } }, }, "embedding": { "provider": "http", "providers": { "http": { "text_base_url": "http://localhost:6005", "image_base_url": "http://localhost:6008", } }, "backend": "tei", "backends": { "tei": { "base_url": "http://localhost:8080", "timeout_sec": 3.0, "model_id": "dummy-embedding-model", } }, }, "rerank": { "provider": "http", "providers": { "http": { "base_url": "http://localhost:6007", "service_url": "http://localhost:6007/rerank", } }, "backend": "bge", "backends": { "bge": { "model_name": "dummy-rerank-model", "device": "cpu", "use_fp16": False, "batch_size": 8, "max_length": 128, "cache_dir": "./model_cache", "enable_warmup": False, } }, }, }, "spu_config": {"enabled": False}, "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, "rerank": { "rerank_window": 384, "exact_knn_rescore_enabled": True, "exact_knn_rescore_window": 160, }, } config_path = tmp_path / "config.yaml" config_path.write_text(yaml.safe_dump(config_data), encoding="utf-8") loader = ConfigLoader(config_path) loaded = loader.load_config(validate=False) assert loaded.rerank.enabled is True assert loaded.rerank.exact_knn_rescore_enabled is True assert loaded.rerank.exact_knn_rescore_window == 160 def test_config_loader_parses_named_rerank_instances(tmp_path: Path): from config.loader import AppConfigLoader config_data = { "es_index_name": "test_products", "field_boosts": {"title.en": 3.0}, "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], "query_config": {"supported_languages": ["en"], "default_language": "en"}, "services": { "translation": { "service_url": "http://localhost:6005", "timeout_sec": 3.0, "default_model": "dummy-model", "default_scene": "general", "cache": {"ttl_seconds": 60, "sliding_expiration": True}, "capabilities": { "dummy-model": { "enabled": True, "backend": "llm", "model": "dummy-model", "base_url": "http://localhost:6005/v1", "timeout_sec": 3.0, "use_cache": True, } }, }, "embedding": { "provider": "http", "providers": {"http": {"text_base_url": "http://localhost:6005", "image_base_url": "http://localhost:6008"}}, "backend": "tei", "backends": {"tei": {"base_url": "http://localhost:8080", "model_id": "dummy-embedding-model"}}, }, "rerank": { "provider": "http", "providers": { "http": { "instances": { "default": {"service_url": "http://localhost:6007/rerank"}, "fine": {"service_url": "http://localhost:6009/rerank"}, } } }, "default_instance": "default", "instances": { "default": {"port": 6007, "backend": "qwen3_vllm_score"}, "fine": {"port": 6009, "backend": "bge"}, }, "backends": { "bge": {"model_name": "BAAI/bge-reranker-v2-m3"}, "qwen3_vllm_score": {"model_name": "Qwen/Qwen3-Reranker-0.6B"}, }, }, }, "spu_config": {"enabled": False}, "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, } config_path = tmp_path / "config.yaml" config_path.write_text(yaml.safe_dump(config_data), encoding="utf-8") loader = AppConfigLoader(config_file=config_path) loaded = loader.load(validate=False) assert loaded.services.rerank.default_instance == "default" assert loaded.services.rerank.get_instance("fine").port == 6009 assert loaded.services.rerank.get_instance("fine").backend == "bge" def test_searcher_reranks_top_window_by_default(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) context = create_request_context(reqid="t1", uid="u1") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) called: Dict[str, Any] = {"count": 0, "docs": 0} def _fake_run_lightweight_rerank(**kwargs): hits = kwargs["es_hits"] for idx, hit in enumerate(hits): hit["_fine_score"] = float(len(hits) - idx) return [hit["_fine_score"] for hit in hits], {"stage": "fine"}, [] def _fake_run_rerank(**kwargs): called["count"] += 1 called["docs"] = len(kwargs["es_response"]["hits"]["hits"]) return kwargs["es_response"], None, [] monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank) monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) result = searcher.search( query="toy", tenant_id="162", from_=20, size=10, context=context, enable_rerank=None, ) assert called["count"] == 1 assert called["docs"] == searcher.config.rerank.rerank_window assert es_client.calls[0]["from_"] == 0 assert es_client.calls[0]["size"] == searcher.config.coarse_rank.input_window assert es_client.calls[0]["include_named_queries_score"] is True assert es_client.calls[0]["body"]["_source"] is False assert len(es_client.calls) == 3 assert es_client.calls[1]["size"] == max( searcher.config.coarse_rank.output_window, searcher.config.rerank.rerank_window, ) assert es_client.calls[1]["from_"] == 0 assert es_client.calls[2]["size"] == 10 assert es_client.calls[2]["from_"] == 0 assert es_client.calls[2]["body"]["query"]["ids"]["values"] == [str(i) for i in range(20, 30)] assert len(result.results) == 10 assert result.results[0].spu_id == "20" assert result.results[0].brief == "brief-20" def test_searcher_debug_info_exposes_ranking_funnel(monkeypatch): es_client = _FakeESClient(total_hits=120) searcher = _build_searcher(_build_search_config(rerank_enabled=True, rerank_window=20), es_client) context = create_request_context(reqid="t-debug", uid="u-debug") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) def _fake_run_lightweight_rerank(**kwargs): hits = kwargs["es_hits"] scores = [] debug_rows = [] for idx, hit in enumerate(hits): score = float(len(hits) - idx) hit["_fine_score"] = score scores.append(score) debug_rows.append( { "doc_id": hit["_id"], "fine_score": score, "rerank_input": {"doc_preview": f"product-{hit['_id']}"}, } ) hits.sort(key=lambda item: item["_fine_score"], reverse=True) return scores, {"model": "fine-bge"}, debug_rows def _fake_run_rerank(**kwargs): hits = kwargs["es_response"]["hits"]["hits"] fused_debug = [] for idx, hit in enumerate(hits): hit["_rerank_score"] = 10.0 - idx hit["_fused_score"] = 100.0 - idx hit["_text_score"] = hit.get("_score", 0.0) hit["_knn_score"] = 0.0 fused_debug.append( { "doc_id": hit["_id"], "rerank_score": hit["_rerank_score"], "fine_score": hit.get("_fine_score"), "text_score": hit["_text_score"], "knn_score": 0.0, "rerank_factor": 1.0, "fine_factor": 1.0, "text_factor": 1.0, "knn_factor": 1.0, "fused_score": hit["_fused_score"], "matched_queries": {}, "rerank_input": {"doc_preview": f"product-{hit['_id']}"}, } ) return kwargs["es_response"], {"model": "final-reranker"}, fused_debug monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank) monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) result = searcher.search( query="toy", tenant_id="162", from_=0, size=5, context=context, enable_rerank=True, debug=True, ) assert result.debug_info["ranking_funnel"]["fine_rank"]["docs_out"] == 80 assert result.debug_info["ranking_funnel"]["rerank"]["docs_out"] == 20 assert result.debug_info["ranking_funnel"]["coarse_rank"]["applied"] is True assert result.debug_info["ranking_funnel"]["coarse_rank"]["backend"] == "local_coarse_fusion" first = result.debug_info["per_result"][0]["ranking_funnel"] assert first["es_recall"]["rank"] is not None assert first["coarse_rank"]["score"] is not None assert first["coarse_rank"]["fusion_summary"] is not None assert first["fine_rank"]["score"] is not None assert first["rerank"]["rerank_score"] is not None def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) context = create_request_context(reqid="t1b", uid="u1b") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) monkeypatch.setattr( "search.rerank_client.run_lightweight_rerank", lambda **kwargs: ([1.0] * len(kwargs["es_hits"]), {"stage": "fine"}, []), ) monkeypatch.setattr("search.rerank_client.run_rerank", lambda **kwargs: (kwargs["es_response"], None, [])) searcher.search( query="toy", tenant_id="162", from_=0, size=5, context=context, enable_rerank=None, rerank_doc_template="{title} {vendor} {brief}", ) assert es_client.calls[0]["body"]["_source"] is False assert es_client.calls[1]["body"]["_source"] == {"includes": ["brief", "title", "vendor"]} def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_active(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) context = create_request_context(reqid="t1c", uid="u1c") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) monkeypatch.setattr( "search.rerank_client.run_lightweight_rerank", lambda **kwargs: ([1.0] * len(kwargs["es_hits"]), {"stage": "fine"}, []), ) monkeypatch.setattr( "search.rerank_client.run_rerank", lambda **kwargs: (kwargs["es_response"], None, []), ) class _IntentQueryParser: text_encoder = None def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, style_intent_profile=_build_style_intent_profile( "color", "black", "color", "colors", "颜色" ), ) searcher.query_parser = _IntentQueryParser() searcher.search( query="black dress", tenant_id="162", from_=0, size=5, context=context, enable_rerank=None, ) assert es_client.calls[0]["body"]["_source"] is False assert es_client.calls[1]["body"]["_source"] == { "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"] } def test_searcher_keeps_previous_stage_order_when_request_explicitly_disables_rerank(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client) context = create_request_context(reqid="t2", uid="u2") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) called: Dict[str, int] = {"count": 0, "fine": 0} def _fake_run_lightweight_rerank(**kwargs): called["fine"] += 1 hits = kwargs["es_hits"] for idx, hit in enumerate(hits): hit["_fine_score"] = float(idx + 1) hits.reverse() return [hit["_fine_score"] for hit in hits], {"stage": "fine"}, [] def _fake_run_rerank(**kwargs): called["count"] += 1 return kwargs["es_response"], None, [] monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank) monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) result = searcher.search( query="toy", tenant_id="162", from_=20, size=10, context=context, enable_rerank=False, debug=True, ) assert called["count"] == 0 assert called["fine"] == 1 assert es_client.calls[0]["from_"] == 0 assert es_client.calls[0]["size"] == searcher.config.coarse_rank.input_window assert es_client.calls[0]["include_named_queries_score"] is True assert len(es_client.calls) == 3 assert es_client.calls[2]["body"]["query"]["ids"]["values"] == [str(i) for i in range(363, 353, -1)] assert len(result.results) == 10 assert [item.spu_id for item in result.results[:3]] == ["363", "362", "361"] assert result.debug_info["rerank"]["enabled"] is False assert result.debug_info["rerank"]["applied"] is False assert result.debug_info["rerank"]["skipped_reason"] == "disabled" assert result.debug_info["per_result"][0]["ranking_funnel"]["rerank"]["rank"] == 21 def test_searcher_keeps_previous_stage_order_when_config_disables_rerank(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="t2b", uid="u2b") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) called: Dict[str, int] = {"count": 0, "fine": 0} def _fake_run_lightweight_rerank(**kwargs): called["fine"] += 1 hits = kwargs["es_hits"] hits.reverse() for idx, hit in enumerate(hits): hit["_fine_score"] = float(len(hits) - idx) return [hit["_fine_score"] for hit in hits], {"stage": "fine"}, [] def _fake_run_rerank(**kwargs): called["count"] += 1 return kwargs["es_response"], None, [] monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank) monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) result = searcher.search( query="toy", tenant_id="162", from_=0, size=5, context=context, enable_rerank=None, debug=True, ) assert called["count"] == 0 assert called["fine"] == 1 assert es_client.calls[0]["from_"] == 0 assert es_client.calls[0]["size"] == searcher.config.coarse_rank.input_window assert es_client.calls[0]["include_named_queries_score"] is True assert len(result.results) == 5 assert [item.spu_id for item in result.results] == ["383", "382", "381", "380", "379"] assert result.debug_info["rerank"]["enabled"] is False assert result.debug_info["rerank"]["applied"] is False assert result.debug_info["rerank"]["skipped_reason"] == "disabled" def test_searcher_skips_rerank_when_page_exceeds_window(monkeypatch): es_client = _FakeESClient() searcher = _build_searcher(_build_search_config(rerank_enabled=True, rerank_window=384), es_client) context = create_request_context(reqid="t3", uid="u3") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) called: Dict[str, int] = {"count": 0} def _fake_run_rerank(**kwargs): called["count"] += 1 return kwargs["es_response"], None, [] monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) searcher.search( query="toy", tenant_id="162", from_=995, size=10, context=context, enable_rerank=None, ) assert called["count"] == 0 assert es_client.calls[0]["from_"] == 995 assert es_client.calls[0]["size"] == 10 assert es_client.calls[0]["include_named_queries_score"] is False assert len(es_client.calls) == 1 def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch): es_client = _FakeESClient(total_hits=1) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="sku-text", uid="u-sku-text") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), ) class _TranslatedQueryParser: text_encoder = None def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={"en": "black dress"}, style_intent_profile=_build_style_intent_profile( "color", "black", "color", "colors", "颜色" ), ) searcher.query_parser = _TranslatedQueryParser() def _full_source_with_skus(doc_id: str) -> Dict[str, Any]: return { "spu_id": doc_id, "title": {"en": f"product-{doc_id}"}, "brief": {"en": f"brief-{doc_id}"}, "vendor": {"en": f"vendor-{doc_id}"}, "option1_name": "Color", "image_url": "https://img/default.jpg", "skus": [ {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, ], } monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus)) result = searcher.search( query="黑色 连衣裙", tenant_id="162", from_=0, size=1, context=context, enable_rerank=False, ) assert len(result.results) == 1 assert result.results[0].skus[0].sku_id == "sku-black" assert result.results[0].image_url == "https://img/black.jpg" def test_searcher_uses_first_text_match_without_comparing_all_matches(monkeypatch): es_client = _FakeESClient(total_hits=1) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="sku-first-text", uid="u-sku-first-text") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) class _TextMatchQueryParser: text_encoder = None def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, style_intent_profile=_build_style_intent_profile( "color", "black", "color", "colors", "颜色" ), ) searcher.query_parser = _TextMatchQueryParser() def _full_source_with_multiple_text_matches(doc_id: str) -> Dict[str, Any]: return { "spu_id": doc_id, "title": {"en": f"product-{doc_id}"}, "brief": {"en": f"brief-{doc_id}"}, "vendor": {"en": f"vendor-{doc_id}"}, "option1_name": "Color", "image_url": "https://img/default.jpg", "skus": [ {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, { "sku_id": "sku-gloss-black", "option1_value": "Gloss Black", "image_src": "https://img/gloss-black.jpg", }, {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, ], } monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_multiple_text_matches)) result = searcher.search( query="black dress", tenant_id="162", from_=0, size=1, context=context, enable_rerank=False, ) assert len(result.results) == 1 assert result.results[0].skus[0].sku_id == "sku-gloss-black" assert result.results[0].image_url == "https://img/gloss-black.jpg" def test_searcher_skips_sku_selection_when_option_name_does_not_match_dimension_alias(monkeypatch): es_client = _FakeESClient(total_hits=1) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="sku-unresolved-dimension", uid="u-sku-unresolved-dimension") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), ) class _UnresolvedDimensionQueryParser: text_encoder = None def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={"en": "black dress"}, style_intent_profile=_build_style_intent_profile( "color", "black", "color", "colors", "颜色" ), ) searcher.query_parser = _UnresolvedDimensionQueryParser() def _full_source_with_unmatched_option_name(doc_id: str) -> Dict[str, Any]: return { "spu_id": doc_id, "title": {"en": f"product-{doc_id}"}, "brief": {"en": f"brief-{doc_id}"}, "vendor": {"en": f"vendor-{doc_id}"}, "option1_name": "Tone", "image_url": "https://img/default.jpg", "skus": [ {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}, ], } monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_unmatched_option_name)) result = searcher.search( query="黑色 连衣裙", tenant_id="162", from_=0, size=1, context=context, enable_rerank=False, ) assert len(result.results) == 1 assert result.results[0].skus[0].sku_id == "sku-red" assert result.results[0].image_url == "https://img/default.jpg" def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch): es_client = _FakeESClient(total_hits=1) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="sku-embed", uid="u-sku-embed") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) encoder = _FakeTextEncoder( { "linen summer dress": [0.8, 0.2], "red": [1.0, 0.0], "blue": [0.0, 1.0], } ) class _EmbeddingQueryParser: text_encoder = encoder def parse( self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None, ): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, query_vector=np.array([0.0, 1.0], dtype=np.float32), style_intent_profile=_build_style_intent_profile( "color", "blue", "color", "colors", "颜色" ), ) searcher.query_parser = _EmbeddingQueryParser() def _full_source_with_skus(doc_id: str) -> Dict[str, Any]: return { "spu_id": doc_id, "title": {"en": f"product-{doc_id}"}, "brief": {"en": f"brief-{doc_id}"}, "vendor": {"en": f"vendor-{doc_id}"}, "option1_name": "Color", "image_url": "https://img/default.jpg", "skus": [ {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}, {"sku_id": "sku-blue", "option1_value": "Blue", "image_src": "https://img/blue.jpg"}, ], } monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus)) result = searcher.search( query="linen summer dress", tenant_id="162", from_=0, size=1, context=context, enable_rerank=False, ) assert len(result.results) == 1 assert result.results[0].skus[0].sku_id == "sku-blue" assert result.results[0].image_url == "https://img/blue.jpg" def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeypatch): es_client = _FakeESClient(total_hits=3) cfg = _build_search_config(rerank_enabled=False) searcher = _build_searcher(cfg, es_client) context = create_request_context(reqid="dbg", uid="u-dbg") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), ) result = searcher.search( query="toy", tenant_id="162", from_=0, size=2, context=context, enable_rerank=False, debug=True, ) assert result.debug_info["query_analysis"]["index_languages"] == ["en", "zh"] assert result.debug_info["query_analysis"]["query_tokens"] == [] expected_es_fetch = max(cfg.rerank.rerank_window, cfg.coarse_rank.input_window) assert result.debug_info["es_query_context"]["es_fetch_size"] == expected_es_fetch assert result.debug_info["es_response"]["es_score_normalization_factor"] == 3.0 assert result.debug_info["per_result"][0]["initial_rank"] == 1 assert result.debug_info["per_result"][0]["final_rank"] == 1 assert result.debug_info["per_result"][0]["es_score_normalized"] == 1.0 assert result.debug_info["per_result"][1]["es_score_normalized"] == 2.0 / 3.0 def test_searcher_attaches_exact_knn_rescore_for_rank_window(monkeypatch): class _VectorQueryParser: def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), image_query_vector=np.array([0.4, 0.5, 0.6], dtype=np.float32), query_tokens=["dress", "formal", "spring", "summer", "floral"], ) es_client = _FakeESClient(total_hits=5) base = _build_search_config( rerank_enabled=True, rerank_window=5, exact_knn_rescore_enabled=True, exact_knn_rescore_window=3, ) config = SearchConfig( field_boosts=base.field_boosts, indexes=base.indexes, query_config=QueryConfig( enable_text_embedding=True, enable_query_rewrite=False, text_embedding_field="title_embedding", image_embedding_field="image_embedding.vector", ), function_score=base.function_score, coarse_rank=base.coarse_rank, fine_rank=FineRankConfig(enabled=False, input_window=5, output_window=5), rerank=base.rerank, spu_config=base.spu_config, es_index_name=base.es_index_name, es_settings=base.es_settings, ) searcher = Searcher( es_client=es_client, config=config, query_parser=_VectorQueryParser(), image_encoder=SimpleNamespace(), ) context = create_request_context(reqid="exact-rescore", uid="u-exact") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) searcher.search( query="dress", tenant_id="162", from_=0, size=2, context=context, enable_rerank=False, debug=True, ) body = es_client.calls[0]["body"] assert body["rescore"]["window_size"] == 3 assert body["rescore"]["query"]["score_mode"] == "total" assert body["rescore"]["query"]["rescore_query_weight"] == 0.0 should = body["rescore"]["query"]["rescore_query"]["bool"]["should"] names = [] for clause in should: if "script_score" in clause: names.append(clause["script_score"]["_name"]) elif "nested" in clause: names.append(clause["nested"]["_name"]) assert names == ["exact_text_knn_query", "exact_image_knn_query"] recall_query = body["query"] if "bool" in recall_query and recall_query["bool"].get("must"): recall_query = recall_query["bool"]["must"][0] if "function_score" in recall_query: recall_query = recall_query["function_score"]["query"] recall_should = recall_query["bool"]["should"] text_knn_clause = next( clause["knn"] for clause in recall_should if clause.get("knn", {}).get("_name") == "knn_query" ) image_knn_clause = next( clause["nested"]["query"]["knn"] for clause in recall_should if clause.get("nested", {}).get("_name") == "image_knn_query" ) exact_text_clause = next( clause["script_score"] for clause in should if clause.get("script_score", {}).get("_name") == "exact_text_knn_query" ) exact_image_clause = next( clause["nested"]["query"]["script_score"] for clause in should if clause.get("nested", {}).get("_name") == "exact_image_knn_query" ) assert text_knn_clause["boost"] == 28.0 assert exact_text_clause["script"]["params"]["boost"] == text_knn_clause["boost"] assert image_knn_clause["boost"] == 20.0 assert exact_image_clause["script"]["params"]["boost"] == image_knn_clause["boost"] def test_searcher_skips_exact_knn_rescore_outside_rank_window(monkeypatch): class _VectorQueryParser: def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any, target_languages: Any = None): return _FakeParsedQuery( original_query=query, query_normalized=query, rewritten_query=query, translations={}, query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), ) es_client = _FakeESClient(total_hits=20) base = _build_search_config( rerank_enabled=True, rerank_window=5, exact_knn_rescore_enabled=True, exact_knn_rescore_window=4, ) config = SearchConfig( field_boosts=base.field_boosts, indexes=base.indexes, query_config=QueryConfig( enable_text_embedding=True, enable_query_rewrite=False, text_embedding_field="title_embedding", ), function_score=base.function_score, coarse_rank=base.coarse_rank, fine_rank=FineRankConfig(enabled=False, input_window=5, output_window=5), rerank=base.rerank, spu_config=base.spu_config, es_index_name=base.es_index_name, es_settings=base.es_settings, ) searcher = _build_searcher(config, es_client) searcher.query_parser = _VectorQueryParser() context = create_request_context(reqid="exact-rescore-off", uid="u-exact-off") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) searcher.search( query="dress", tenant_id="162", from_=5, size=2, context=context, enable_rerank=False, ) body = es_client.calls[0]["body"] assert "rescore" not in body def test_searcher_rerank_rank_change_falls_back_to_coarse_rank_when_fine_disabled(monkeypatch): es_client = _FakeESClient(total_hits=5) config = _build_search_config(rerank_enabled=True, rerank_window=5) config = SearchConfig( field_boosts=config.field_boosts, indexes=config.indexes, query_config=config.query_config, function_score=config.function_score, coarse_rank=config.coarse_rank, fine_rank=FineRankConfig(enabled=False, input_window=5, output_window=5), rerank=config.rerank, spu_config=config.spu_config, es_index_name=config.es_index_name, es_settings=config.es_settings, ) searcher = _build_searcher(config, es_client) context = create_request_context(reqid="rank-fallback", uid="u-rank-fallback") monkeypatch.setattr( "search.searcher.get_tenant_config_loader", lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), ) fine_called: Dict[str, int] = {"count": 0} def _fake_run_lightweight_rerank(**kwargs): fine_called["count"] += 1 return [], {"stage": "fine"}, [] def _fake_run_rerank(**kwargs): hits = kwargs["es_response"]["hits"]["hits"] hits.reverse() fused_debug = [] for idx, hit in enumerate(hits): hit["_fused_score"] = 100.0 - idx hit["_rerank_score"] = 1.0 - 0.1 * idx fused_debug.append( { "doc_id": hit["_id"], "score": hit["_fused_score"], "es_score": hit.get("_raw_es_score", hit.get("_score")), "rerank_score": hit["_rerank_score"], "text_score": hit.get("_text_score", hit.get("_score")), "knn_score": hit.get("_knn_score", 0.0), "es_factor": 1.0, "rerank_factor": 1.0, "text_factor": 1.0, "knn_factor": 1.0, "fused_score": hit["_fused_score"], } ) return kwargs["es_response"], {"model": "final-reranker"}, fused_debug monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank) monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) result = searcher.search( query="toy", tenant_id="162", from_=0, size=5, context=context, enable_rerank=True, debug=True, ) per_result = {row["spu_id"]: row for row in result.debug_info["per_result"]} moved = per_result["4"]["ranking_funnel"] assert fine_called["count"] == 0 assert result.debug_info["fine_rank"]["enabled"] is False assert result.debug_info["fine_rank"]["applied"] is False assert result.debug_info["fine_rank"]["skipped_reason"] == "disabled" assert moved["fine_rank"]["rank"] == 5 assert moved["fine_rank"]["rank_change"] == 0 assert moved["rerank"]["rank"] == 1 assert moved["rerank"]["rank_change"] == 4 assert moved["final_page"]["rank_change"] == 0