# tests/test_search_rerank_window.py

from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Dict, List
import numpy as np
import yaml
from config import (
    ConfigLoader,
    FineRankConfig,
    FunctionScoreConfig,
    IndexConfig,
    QueryConfig,
    RerankConfig,
    SPUConfig,
    SearchConfig,
)
from context import create_request_context
from query.style_intent import DetectedStyleIntent, StyleIntentProfile
from search.searcher import Searcher
@dataclass
class _FakeParsedQuery:
    """Test double for a parsed query, as returned by the fake parsers in this file."""

    original_query: str
    query_normalized: str
    rewritten_query: str
    detected_language: str = "en"
    translations: Dict[str, str] = None
    query_vector: Any = None
    style_intent_profile: Any = None

    def text_for_rerank(self) -> str:
        """Return whatever ``query.query_parser.rerank_query_text`` yields for this query."""
        # Imported at call time, so the module is only needed when rerank text is requested.
        from query.query_parser import rerank_query_text

        return rerank_query_text(
            self.original_query,
            detected_language=self.detected_language,
            translations=self.translations,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the query; ``query_vector`` is not included."""
        payload: Dict[str, Any] = {
            "original_query": self.original_query,
            "query_normalized": self.query_normalized,
            "rewritten_query": self.rewritten_query,
            "detected_language": self.detected_language,
        }
        # A missing (or empty) translations mapping is always reported as ``{}``.
        payload["translations"] = self.translations or {}
        profile = self.style_intent_profile
        if profile is None:
            payload["style_intent_profile"] = None
        else:
            payload["style_intent_profile"] = profile.to_dict()
        return payload
def _build_style_intent_profile(intent_type: str, canonical_value: str, *dimension_aliases: str) -> StyleIntentProfile:
    """Build a ``StyleIntentProfile`` that holds exactly one detected intent.

    ``canonical_value`` doubles as the matched term, the matched query text and
    the only attribute term. When no aliases are supplied, ``intent_type``
    itself becomes the single dimension alias.
    """
    resolved_aliases = tuple(dimension_aliases) if dimension_aliases else (intent_type,)
    detected = DetectedStyleIntent(
        intent_type=intent_type,
        canonical_value=canonical_value,
        matched_term=canonical_value,
        matched_query_text=canonical_value,
        attribute_terms=(canonical_value,),
        dimension_aliases=resolved_aliases,
    )
    return StyleIntentProfile(intents=(detected,))
class _FakeQueryParser:
    """Query-parser double that echoes the raw query back without analysing it."""

    def parse(
        self,
        query: str,
        tenant_id: str,
        generate_vector: bool,
        context: Any,
        target_languages: Any = None,
    ):
        """Return a ``_FakeParsedQuery`` whose three query fields all equal ``query``.

        The remaining arguments are accepted and ignored.
        """
        echoed = dict.fromkeys(
            ("original_query", "query_normalized", "rewritten_query"),
            query,
        )
        return _FakeParsedQuery(translations={}, **echoed)
class _FakeQueryBuilder:
    """Query-builder double that always produces a ``match_all`` request."""

    def build_query(self, **kwargs):
        """Return a ``match_all`` body carrying the caller's ``size`` and ``from_``."""
        page_size = kwargs["size"]
        offset = kwargs["from_"]
        es_body = {"query": {"match_all": {}}}
        es_body["size"] = page_size
        es_body["from"] = offset
        return es_body

    def build_facets(self, facets: Any):
        """Return an empty aggregation mapping regardless of ``facets``."""
        return dict()

    def add_sorting(self, es_query: Dict[str, Any], sort_by: str, sort_order: str):
        """Hand ``es_query`` back untouched; sorting is not simulated."""
        return es_query
class _FakeESClient:
    """In-memory Elasticsearch double that records calls and fabricates hits.

    Every ``search`` invocation is appended to ``calls`` so tests can inspect
    the exact arguments. Documents are synthesised on demand from their id via
    ``_full_source`` (which tests may monkeypatch).
    """

    def __init__(self, total_hits: int = 5000):
        self.calls: List[Dict[str, Any]] = []
        self.total_hits = total_hits

    @staticmethod
    def _apply_source_filter(src: Dict[str, Any], source_spec: Any) -> Dict[str, Any]:
        """Return a copy of ``src`` restricted according to an ES ``_source`` spec.

        ``None`` keeps everything, ``False`` yields ``{}``, a list or a dict
        with a non-empty ``includes`` keeps only the named top-level keys, and
        anything else (including an empty include list) keeps everything.
        """
        if source_spec is None:
            return dict(src)
        if source_spec is False:
            return {}
        if isinstance(source_spec, dict):
            includes = source_spec.get("includes") or []
        elif isinstance(source_spec, list):
            includes = source_spec
        else:
            includes = []
        if not includes:
            return dict(src)
        # Build the lookup set once instead of once per source key.
        wanted = set(includes)
        return {key: value for key, value in src.items() if key in wanted}

    @staticmethod
    def _full_source(doc_id: str) -> Dict[str, Any]:
        """Return the complete synthetic document for ``doc_id``."""
        return {
            "spu_id": doc_id,
            "title": {"en": f"product-{doc_id}"},
            "brief": {"en": f"brief-{doc_id}"},
            "vendor": {"en": f"vendor-{doc_id}"},
            "skus": [],
        }

    def _build_hit(self, doc_id: str, score: float, source_spec: Any) -> Dict[str, Any]:
        """Assemble one hit; ``_source`` is omitted when ``source_spec`` is ``False``."""
        hit: Dict[str, Any] = {"_id": doc_id, "_score": score}
        if source_spec is not False:
            hit["_source"] = self._apply_source_filter(self._full_source(doc_id), source_spec)
        return hit

    def search(
        self,
        index_name: str,
        body: Dict[str, Any],
        size: int,
        from_: int,
        include_named_queries_score: bool = False,
    ):
        """Record the call and return a fabricated ES response.

        An ``ids`` query returns the requested ids in reversed order with a
        score of 1.0 each. Any other body returns the documents numbered
        ``from_`` up to ``from_ + size`` (capped at ``total_hits``), scored
        ``total_hits - i`` so scores decrease with position.
        """
        self.calls.append(
            {
                "index_name": index_name,
                "body": body,
                "size": size,
                "from_": from_,
                "include_named_queries_score": include_named_queries_score,
            }
        )
        request = body or {}
        ids_query = ((request.get("query") or {}).get("ids") or {}).get("values")
        source_spec = request.get("_source")
        if isinstance(ids_query, list):
            # Return reversed order intentionally; caller should restore original ranking order.
            scored_ids = [(str(raw_id), 1.0) for raw_id in reversed(ids_query)]
        else:
            end = min(from_ + size, self.total_hits)
            scored_ids = [(str(i), float(self.total_hits - i)) for i in range(from_, end)]
        hits = [self._build_hit(doc_id, score, source_spec) for doc_id, score in scored_ids]
        return {
            "took": 8,
            "hits": {
                "total": {"value": self.total_hits},
                "max_score": hits[0]["_score"] if hits else 0.0,
                "hits": hits,
            },
        }
def _build_search_config(*, rerank_enabled: bool = True, rerank_window: int = 384):
    """Assemble a small ``SearchConfig`` for these tests.

    Only the rerank switch and window are configurable; text embedding, query
    rewrite and SPU handling are turned off.
    """
    default_index = IndexConfig(name="default", label="default", fields=["title.en"])
    query_settings = QueryConfig(enable_text_embedding=False, enable_query_rewrite=False)
    scoring = FunctionScoreConfig()
    rerank_settings = RerankConfig(enabled=rerank_enabled, rerank_window=rerank_window)
    spu_settings = SPUConfig(enabled=False)
    return SearchConfig(
        field_boosts={"title.en": 3.0},
        indexes=[default_index],
        query_config=query_settings,
        function_score=scoring,
        rerank=rerank_settings,
        spu_config=spu_settings,
        es_index_name="test_products",
        es_settings={},
    )
def _build_searcher(config: SearchConfig, es_client: _FakeESClient) -> Searcher:
    """Create a ``Searcher`` wired to the fake ES client, parser and query builder."""
    fake_parser = _FakeQueryParser()
    built = Searcher(es_client=es_client, config=config, query_parser=fake_parser)
    # The builder is swapped after construction; the constructor is not given one here.
    built.query_builder = _FakeQueryBuilder()
    return built
class _FakeTextEncoder:
    """Text-encoder double backed by a fixed text-to-vector lookup table."""

    def __init__(self, vectors: Dict[str, List[float]]):
        """Store each provided vector as a ``float32`` numpy array keyed by its text."""
        table = {}
        for text, values in vectors.items():
            table[text] = np.array(values, dtype=np.float32)
        self.vectors = table

    def encode(self, sentences, priority: int = 0, **kwargs):
        """Look up the vector of each sentence and return them as an object array.

        A bare string is treated as a one-element batch. Unknown text raises
        ``KeyError`` from the table lookup.
        """
        batch = [sentences] if isinstance(sentences, str) else sentences
        rows = [self.vectors[text] for text in batch]
        return np.array(rows, dtype=object)
def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
    """Check that ``rerank.enabled`` loads as ``True`` when the YAML omits it.

    The top-level ``rerank`` section written below carries only
    ``rerank_window``; the file is loaded with ``validate=False``.
    """
    translation_service = {
        "service_url": "http://localhost:6005",
        "timeout_sec": 3.0,
        "default_model": "dummy-model",
        "default_scene": "general",
        "cache": {
            "ttl_seconds": 60,
            "sliding_expiration": True,
        },
        "capabilities": {
            "dummy-model": {
                "enabled": True,
                "backend": "llm",
                "use_cache": True,
                "model": "dummy-model",
                "base_url": "http://localhost:6005/v1",
                "timeout_sec": 3.0,
            }
        },
    }
    embedding_service = {
        "provider": "http",
        "providers": {
            "http": {
                "text_base_url": "http://localhost:6005",
                "image_base_url": "http://localhost:6008",
            }
        },
        "backend": "tei",
        "backends": {
            "tei": {
                "base_url": "http://localhost:8080",
                "timeout_sec": 3.0,
                "model_id": "dummy-embedding-model",
            }
        },
    }
    rerank_service = {
        "provider": "http",
        "providers": {
            "http": {
                "base_url": "http://localhost:6007",
                "service_url": "http://localhost:6007/rerank",
            }
        },
        "backend": "bge",
        "backends": {
            "bge": {
                "model_name": "dummy-rerank-model",
                "device": "cpu",
                "use_fp16": False,
                "batch_size": 8,
                "max_length": 128,
                "cache_dir": "./model_cache",
                "enable_warmup": False,
            }
        },
    }
    raw_config = {
        "es_index_name": "test_products",
        "field_boosts": {"title.en": 3.0},
        "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
        "query_config": {"supported_languages": ["en"], "default_language": "en"},
        "services": {
            "translation": translation_service,
            "embedding": embedding_service,
            "rerank": rerank_service,
        },
        "spu_config": {"enabled": False},
        "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
        # Deliberately no "enabled" key here: the default is what is under test.
        "rerank": {"rerank_window": 384},
    }

    yaml_file = tmp_path / "config.yaml"
    yaml_file.write_text(yaml.safe_dump(raw_config), encoding="utf-8")

    loaded = ConfigLoader(yaml_file).load_config(validate=False)

    assert loaded.rerank.enabled is True
def test_config_loader_parses_named_rerank_instances(tmp_path: Path):
    """Check that named rerank instances under ``services.rerank`` are parsed.

    Two instances (``default`` and ``fine``) are declared; the assertions read
    back the default instance name and the ``fine`` instance's port and backend.
    """
    from config.loader import AppConfigLoader

    translation_service = {
        "service_url": "http://localhost:6005",
        "timeout_sec": 3.0,
        "default_model": "dummy-model",
        "default_scene": "general",
        "cache": {"ttl_seconds": 60, "sliding_expiration": True},
        "capabilities": {
            "dummy-model": {
                "enabled": True,
                "backend": "llm",
                "model": "dummy-model",
                "base_url": "http://localhost:6005/v1",
                "timeout_sec": 3.0,
                "use_cache": True,
            }
        },
    }
    embedding_service = {
        "provider": "http",
        "providers": {"http": {"text_base_url": "http://localhost:6005", "image_base_url": "http://localhost:6008"}},
        "backend": "tei",
        "backends": {"tei": {"base_url": "http://localhost:8080", "model_id": "dummy-embedding-model"}},
    }
    rerank_service = {
        "provider": "http",
        "providers": {
            "http": {
                "instances": {
                    "default": {"service_url": "http://localhost:6007/rerank"},
                    "fine": {"service_url": "http://localhost:6009/rerank"},
                }
            }
        },
        "default_instance": "default",
        "instances": {
            "default": {"port": 6007, "backend": "qwen3_vllm_score"},
            "fine": {"port": 6009, "backend": "bge"},
        },
        "backends": {
            "bge": {"model_name": "BAAI/bge-reranker-v2-m3"},
            "qwen3_vllm_score": {"model_name": "Qwen/Qwen3-Reranker-0.6B"},
        },
    }
    raw_config = {
        "es_index_name": "test_products",
        "field_boosts": {"title.en": 3.0},
        "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
        "query_config": {"supported_languages": ["en"], "default_language": "en"},
        "services": {
            "translation": translation_service,
            "embedding": embedding_service,
            "rerank": rerank_service,
        },
        "spu_config": {"enabled": False},
        "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
    }

    yaml_file = tmp_path / "config.yaml"
    yaml_file.write_text(yaml.safe_dump(raw_config), encoding="utf-8")

    loaded = AppConfigLoader(config_file=yaml_file).load(validate=False)
    rerank_services = loaded.services.rerank

    assert rerank_services.default_instance == "default"
    assert rerank_services.get_instance("fine").port == 6009
    assert rerank_services.get_instance("fine").backend == "bge"
def test_searcher_reranks_top_window_by_default(monkeypatch):
    """With ``enable_rerank=None`` the searcher reranks once and pages afterwards.

    Asserts three ES calls: a source-less recall from offset 0, a second fetch
    sized to the larger of the coarse output window and the rerank window, and
    a final ids lookup for exactly the requested page (documents 20..29).
    """
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
    context = create_request_context(reqid="t1", uid="u1")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    # One entry per final-rerank invocation: the number of hits it received.
    rerank_doc_counts: List[int] = []

    def _fake_run_lightweight_rerank(**kwargs):
        hits = kwargs["es_hits"]
        total = len(hits)
        scores = []
        for position, hit in enumerate(hits):
            hit["_fine_score"] = float(total - position)
            scores.append(hit["_fine_score"])
        return scores, {"stage": "fine"}, []

    def _fake_run_rerank(**kwargs):
        response = kwargs["es_response"]
        rerank_doc_counts.append(len(response["hits"]["hits"]))
        return response, None, []

    monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank)
    monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank)

    result = searcher.search(
        query="toy",
        tenant_id="162",
        from_=20,
        size=10,
        context=context,
        enable_rerank=None,
    )

    rerank_window = searcher.config.rerank.rerank_window
    assert len(rerank_doc_counts) == 1
    assert rerank_doc_counts[-1] == rerank_window

    recall_call = es_client.calls[0]
    assert recall_call["from_"] == 0
    assert recall_call["size"] == searcher.config.coarse_rank.input_window
    assert recall_call["include_named_queries_score"] is True
    assert recall_call["body"]["_source"] is False

    assert len(es_client.calls) == 3
    prefetch_call = es_client.calls[1]
    assert prefetch_call["size"] == max(
        searcher.config.coarse_rank.output_window,
        rerank_window,
    )
    assert prefetch_call["from_"] == 0

    page_call = es_client.calls[2]
    assert page_call["size"] == 10
    assert page_call["from_"] == 0
    assert page_call["body"]["query"]["ids"]["values"] == [str(i) for i in range(20, 30)]

    assert len(result.results) == 10
    top = result.results[0]
    assert top.spu_id == "20"
    assert top.brief == "brief-20"
def test_searcher_debug_info_exposes_ranking_funnel(monkeypatch):
    """With ``debug=True`` the result must expose a ``ranking_funnel`` section.

    Checks the funnel-level ``docs_out`` counts for the fine-rank and rerank
    stages, and that the first per-result entry carries a non-``None`` value
    for each of the ES recall, coarse, fine and rerank stages.
    """
    es_client = _FakeESClient(total_hits=120)
    searcher = _build_searcher(_build_search_config(rerank_enabled=True, rerank_window=20), es_client)
    context = create_request_context(reqid="t-debug", uid="u-debug")
    # Replace the tenant config lookup with a stub that reports English only.
    monkeypatch.setattr(
        "search.searcher.get_tenant_config_loader",
        lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}),
    )
    def _fake_run_lightweight_rerank(**kwargs):
        # Fine-rank stub: scores decrease with incoming position and are also
        # written onto each hit as ``_fine_score``.
        hits = kwargs["es_hits"]
        scores = []
        debug_rows = []
        for idx, hit in enumerate(hits):
            score = float(len(hits) - idx)
            hit["_fine_score"] = score
            scores.append(score)
            debug_rows.append(
                {
                    "doc_id": hit["_id"],
                    "fine_score": score,
                    "rerank_input": {"doc_preview": f"product-{hit['_id']}"},
                }
            )
        # In-place sort of the caller's list, highest fine score first.
        hits.sort(key=lambda item: item["_fine_score"], reverse=True)
        return scores, {"model": "fine-bge"}, debug_rows
    def _fake_run_rerank(**kwargs):
        # Final-rerank stub: annotates each hit in place with rerank, fused,
        # text and knn scores and returns the same response object.
        hits = kwargs["es_response"]["hits"]["hits"]
        fused_debug = []
        for idx, hit in enumerate(hits):
            hit["_rerank_score"] = 10.0 - idx
            hit["_fused_score"] = 100.0 - idx
            hit["_text_score"] = hit.get("_score", 0.0)
            hit["_knn_score"] = 0.0
            fused_debug.append(
                {
                    "doc_id": hit["_id"],
                    "rerank_score": hit["_rerank_score"],
                    "fine_score": hit.get("_fine_score"),
                    "text_score": hit["_text_score"],
                    "knn_score": 0.0,
                    "rerank_factor": 1.0,
                    "fine_factor": 1.0,
                    "text_factor": 1.0,
                    "knn_factor": 1.0,
                    "fused_score": hit["_fused_score"],
                    "matched_queries": {},
                    "rerank_input": {"doc_preview": f"product-{hit['_id']}"},
                }
            )
        return kwargs["es_response"], {"model": "final-reranker"}, fused_debug
    monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _fake_run_lightweight_rerank)
    monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank)
    result = searcher.search(
        query="toy",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=True,
        debug=True,
    )
    # NOTE(review): 80 presumably comes from the default fine-rank output
    # window, which is not set in this file - confirm against the config defaults.
    assert result.debug_info["ranking_funnel"]["fine_rank"]["docs_out"] == 80
    # 20 matches the rerank_window passed to _build_search_config above.
    assert result.debug_info["ranking_funnel"]["rerank"]["docs_out"] == 20
    first = result.debug_info["per_result"][0]["ranking_funnel"]
    assert first["es_recall"]["rank"] is not None
    assert first["coarse_rank"]["score"] is not None
    assert first["fine_rank"]["score"] is not None
    assert first["rerank"]["rerank_score"] is not None
def test_searcher_rerank_prefetch_source_follows_doc_template(monkeypatch):
    """The second ES call must request the fields named in ``rerank_doc_template``.

    The first call is expected to be source-less; the second is expected to
    include ``brief``, ``title`` and ``vendor`` (in that order).
    """
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
    context = create_request_context(reqid="t1b", uid="u1b")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    def _constant_fine_rank(**kwargs):
        # Every hit gets the same fine score; no hit is mutated.
        return [1.0] * len(kwargs["es_hits"]), {"stage": "fine"}, []

    def _passthrough_rerank(**kwargs):
        return kwargs["es_response"], None, []

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)
    monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _constant_fine_rank)
    monkeypatch.setattr("search.rerank_client.run_rerank", _passthrough_rerank)

    searcher.search(
        query="toy",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=None,
        rerank_doc_template="{title} {vendor} {brief}",
    )

    recall_body = es_client.calls[0]["body"]
    assert recall_body["_source"] is False
    prefetch_body = es_client.calls[1]["body"]
    assert prefetch_body["_source"] == {"includes": ["brief", "title", "vendor"]}
def test_searcher_rerank_prefetch_source_includes_sku_fields_when_style_intent_active(monkeypatch):
    """A parsed query carrying a style intent must widen the prefetch ``_source``.

    The second ES call is expected to include the three option-name fields and
    ``skus`` in addition to ``title``.
    """
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
    context = create_request_context(reqid="t1c", uid="u1c")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    def _constant_fine_rank(**kwargs):
        return [1.0] * len(kwargs["es_hits"]), {"stage": "fine"}, []

    def _passthrough_rerank(**kwargs):
        return kwargs["es_response"], None, []

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)
    monkeypatch.setattr("search.rerank_client.run_lightweight_rerank", _constant_fine_rank)
    monkeypatch.setattr("search.rerank_client.run_rerank", _passthrough_rerank)

    class _IntentQueryParser:
        """Parser double whose result always carries a colour style intent."""

        text_encoder = None

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            profile = _build_style_intent_profile("color", "black", "color", "colors", "颜色")
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={},
                style_intent_profile=profile,
            )

    searcher.query_parser = _IntentQueryParser()

    searcher.search(
        query="black dress",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=None,
    )

    expected_prefetch_source = {
        "includes": ["option1_name", "option2_name", "option3_name", "skus", "title"]
    }
    assert es_client.calls[0]["body"]["_source"] is False
    assert es_client.calls[1]["body"]["_source"] == expected_prefetch_source
def test_searcher_skips_rerank_when_request_explicitly_false(monkeypatch):
    """``enable_rerank=False`` must bypass rerank even though the config enables it.

    Expects a single ES call that uses the caller's own ``from_``/``size`` and
    does not request named-query scores.
    """
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True), es_client)
    context = create_request_context(reqid="t2", uid="u2")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    # Grows by one marker each time the final rerank stub is invoked.
    rerank_invocations: List[bool] = []

    def _fake_run_rerank(**kwargs):
        rerank_invocations.append(True)
        return kwargs["es_response"], None, []

    monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank)

    searcher.search(
        query="toy",
        tenant_id="162",
        from_=20,
        size=10,
        context=context,
        enable_rerank=False,
    )

    assert len(rerank_invocations) == 0
    only_call = es_client.calls[0]
    assert only_call["from_"] == 20
    assert only_call["size"] == 10
    assert only_call["include_named_queries_score"] is False
    assert len(es_client.calls) == 1
def test_searcher_skips_rerank_when_page_exceeds_window(monkeypatch):
    """A page starting at offset 995 with a 384-document window must not rerank.

    Expects a single ES call that uses the caller's own ``from_``/``size`` and
    does not request named-query scores.
    """
    es_client = _FakeESClient()
    searcher = _build_searcher(_build_search_config(rerank_enabled=True, rerank_window=384), es_client)
    context = create_request_context(reqid="t3", uid="u3")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    # Grows by one marker each time the final rerank stub is invoked.
    rerank_invocations: List[bool] = []

    def _fake_run_rerank(**kwargs):
        rerank_invocations.append(True)
        return kwargs["es_response"], None, []

    monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank)

    searcher.search(
        query="toy",
        tenant_id="162",
        from_=995,
        size=10,
        context=context,
        enable_rerank=None,
    )

    assert len(rerank_invocations) == 0
    only_call = es_client.calls[0]
    assert only_call["from_"] == 995
    assert only_call["size"] == 10
    assert only_call["include_named_queries_score"] is False
    assert len(es_client.calls) == 1
def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch):
    """A Chinese query translated to "black dress" must surface the Black SKU.

    The document lists Red before Black; the result is expected to list
    ``sku-black`` first and to use that SKU's image as the result image.
    """
    es_client = _FakeESClient(total_hits=1)
    searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
    context = create_request_context(reqid="sku-text", uid="u-sku-text")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    class _TranslatedQueryParser:
        """Parser double that attaches an English translation and a colour intent."""

        text_encoder = None

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            profile = _build_style_intent_profile("color", "black", "color", "colors", "颜色")
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={"en": "black dress"},
                style_intent_profile=profile,
            )

    searcher.query_parser = _TranslatedQueryParser()

    def _full_source_with_skus(doc_id: str) -> Dict[str, Any]:
        red_sku = {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}
        black_sku = {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}
        return {
            "spu_id": doc_id,
            "title": {"en": f"product-{doc_id}"},
            "brief": {"en": f"brief-{doc_id}"},
            "vendor": {"en": f"vendor-{doc_id}"},
            "option1_name": "Color",
            "image_url": "https://img/default.jpg",
            "skus": [red_sku, black_sku],
        }

    monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus))

    result = searcher.search(
        query="黑色 连衣裙",
        tenant_id="162",
        from_=0,
        size=1,
        context=context,
        enable_rerank=False,
    )

    assert len(result.results) == 1
    top = result.results[0]
    assert top.skus[0].sku_id == "sku-black"
    assert top.image_url == "https://img/black.jpg"
def test_searcher_uses_first_text_match_without_comparing_all_matches(monkeypatch):
    """When several SKUs match the intent text, the first match in order wins.

    "Gloss Black" precedes "Black" in the document, and the result is expected
    to promote ``sku-gloss-black`` together with its image.
    """
    es_client = _FakeESClient(total_hits=1)
    searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
    context = create_request_context(reqid="sku-first-text", uid="u-sku-first-text")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    class _TextMatchQueryParser:
        """Parser double that attaches a colour intent and no translations."""

        text_encoder = None

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            profile = _build_style_intent_profile("color", "black", "color", "colors", "颜色")
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={},
                style_intent_profile=profile,
            )

    searcher.query_parser = _TextMatchQueryParser()

    def _full_source_with_multiple_text_matches(doc_id: str) -> Dict[str, Any]:
        red_sku = {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}
        gloss_black_sku = {
            "sku_id": "sku-gloss-black",
            "option1_value": "Gloss Black",
            "image_src": "https://img/gloss-black.jpg",
        }
        black_sku = {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}
        return {
            "spu_id": doc_id,
            "title": {"en": f"product-{doc_id}"},
            "brief": {"en": f"brief-{doc_id}"},
            "vendor": {"en": f"vendor-{doc_id}"},
            "option1_name": "Color",
            "image_url": "https://img/default.jpg",
            "skus": [red_sku, gloss_black_sku, black_sku],
        }

    monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_multiple_text_matches))

    result = searcher.search(
        query="black dress",
        tenant_id="162",
        from_=0,
        size=1,
        context=context,
        enable_rerank=False,
    )

    assert len(result.results) == 1
    top = result.results[0]
    assert top.skus[0].sku_id == "sku-gloss-black"
    assert top.image_url == "https://img/gloss-black.jpg"
def test_searcher_skips_sku_selection_when_option_name_does_not_match_dimension_alias(monkeypatch):
    """No SKU is promoted when the option name is not one of the intent's aliases.

    The document names its option "Tone" while the intent aliases are
    ``color``/``colors``/``颜色``; the original SKU order and the default image
    are expected to be kept.
    """
    es_client = _FakeESClient(total_hits=1)
    searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
    context = create_request_context(reqid="sku-unresolved-dimension", uid="u-sku-unresolved-dimension")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    class _UnresolvedDimensionQueryParser:
        """Parser double that attaches an English translation and a colour intent."""

        text_encoder = None

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            profile = _build_style_intent_profile("color", "black", "color", "colors", "颜色")
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={"en": "black dress"},
                style_intent_profile=profile,
            )

    searcher.query_parser = _UnresolvedDimensionQueryParser()

    def _full_source_with_unmatched_option_name(doc_id: str) -> Dict[str, Any]:
        red_sku = {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}
        black_sku = {"sku_id": "sku-black", "option1_value": "Black", "image_src": "https://img/black.jpg"}
        return {
            "spu_id": doc_id,
            "title": {"en": f"product-{doc_id}"},
            "brief": {"en": f"brief-{doc_id}"},
            "vendor": {"en": f"vendor-{doc_id}"},
            "option1_name": "Tone",
            "image_url": "https://img/default.jpg",
            "skus": [red_sku, black_sku],
        }

    monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_unmatched_option_name))

    result = searcher.search(
        query="黑色 连衣裙",
        tenant_id="162",
        from_=0,
        size=1,
        context=context,
        enable_rerank=False,
    )

    assert len(result.results) == 1
    top = result.results[0]
    assert top.skus[0].sku_id == "sku-red"
    assert top.image_url == "https://img/default.jpg"
def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_match(monkeypatch):
    """With no option value in the query text, the Blue SKU is still expected first.

    The parser exposes a fake text encoder and a query vector of ``[0.0, 1.0]``,
    which equals the fake embedding of "blue"; the result is expected to
    promote ``sku-blue`` and its image.
    """
    es_client = _FakeESClient(total_hits=1)
    searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client)
    context = create_request_context(reqid="sku-embed", uid="u-sku-embed")

    def _tenant_loader():
        return SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]})

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_loader)

    fake_embeddings = {
        "linen summer dress": [0.8, 0.2],
        "red": [1.0, 0.0],
        "blue": [0.0, 1.0],
    }
    encoder = _FakeTextEncoder(fake_embeddings)

    class _EmbeddingQueryParser:
        """Parser double that exposes the fake encoder and returns a fixed query vector."""

        text_encoder = encoder

        def parse(
            self,
            query: str,
            tenant_id: str,
            generate_vector: bool,
            context: Any,
            target_languages: Any = None,
        ):
            vector = np.array([0.0, 1.0], dtype=np.float32)
            profile = _build_style_intent_profile("color", "blue", "color", "colors", "颜色")
            return _FakeParsedQuery(
                original_query=query,
                query_normalized=query,
                rewritten_query=query,
                translations={},
                query_vector=vector,
                style_intent_profile=profile,
            )

    searcher.query_parser = _EmbeddingQueryParser()

    def _full_source_with_skus(doc_id: str) -> Dict[str, Any]:
        red_sku = {"sku_id": "sku-red", "option1_value": "Red", "image_src": "https://img/red.jpg"}
        blue_sku = {"sku_id": "sku-blue", "option1_value": "Blue", "image_src": "https://img/blue.jpg"}
        return {
            "spu_id": doc_id,
            "title": {"en": f"product-{doc_id}"},
            "brief": {"en": f"brief-{doc_id}"},
            "vendor": {"en": f"vendor-{doc_id}"},
            "option1_name": "Color",
            "image_url": "https://img/default.jpg",
            "skus": [red_sku, blue_sku],
        }

    monkeypatch.setattr(_FakeESClient, "_full_source", staticmethod(_full_source_with_skus))

    result = searcher.search(
        query="linen summer dress",
        tenant_id="162",
        from_=0,
        size=1,
        context=context,
        enable_rerank=False,
    )

    assert len(result.results) == 1
    top = result.results[0]
    assert top.skus[0].sku_id == "sku-blue"
    assert top.image_url == "https://img/blue.jpg"
def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeypatch):
    """Check the debug payload of a non-reranked search.

    Asserts the reported index languages, query tokens, ES fetch size, the
    score normalization factor, and the per-result ranks and normalized ES
    scores for the first two results.
    """
    fake_es = _FakeESClient(total_hits=3)
    search_config = _build_search_config(rerank_enabled=False)
    searcher = _build_searcher(search_config, fake_es)
    context = create_request_context(reqid="dbg", uid="u-dbg")

    def _tenant_config(tenant_id):
        # Same languages for every tenant; a fresh dict on each call.
        return {"index_languages": ["en", "zh"]}

    def _tenant_config_loader():
        return SimpleNamespace(get_tenant_config=_tenant_config)

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_config_loader)

    result = searcher.search(
        query="toy",
        tenant_id="162",
        from_=0,
        size=2,
        context=context,
        enable_rerank=False,
        debug=True,
    )

    debug = result.debug_info
    query_analysis = debug["query_analysis"]
    assert query_analysis["index_languages"] == ["en", "zh"]
    assert query_analysis["query_tokens"] == []
    assert debug["es_query_context"]["es_fetch_size"] == 2
    assert debug["es_response"]["es_score_normalization_factor"] == 3.0

    per_result = debug["per_result"]
    first_row = per_result[0]
    assert first_row["initial_rank"] == 1
    assert first_row["final_rank"] == 1
    assert first_row["es_score_normalized"] == 1.0
    second_row = per_result[1]
    assert second_row["es_score_normalized"] == 2.0 / 3.0
def test_searcher_rerank_rank_change_falls_back_to_coarse_rank_when_fine_disabled(monkeypatch):
    """Check the ranking funnel debug info when fine ranking is disabled.

    The reranker is replaced by a stub that reverses the hit order. For the
    document with ``spu_id`` "4" the test asserts that the fine-rank stage
    reports no rank, the rerank stage reports rank 1 with a rank change of 4,
    and the final page reports no further rank change.
    """
    fake_es = _FakeESClient(total_hits=5)
    base_config = _build_search_config(rerank_enabled=True, rerank_window=5)
    # Carry every section over from the base config; only ``fine_rank`` is
    # replaced, with a disabled fine-rank stage.
    inherited_sections = {
        section: getattr(base_config, section)
        for section in (
            "field_boosts",
            "indexes",
            "query_config",
            "function_score",
            "coarse_rank",
            "rerank",
            "spu_config",
            "es_index_name",
            "es_settings",
        )
    }
    config = SearchConfig(
        fine_rank=FineRankConfig(enabled=False, input_window=5, output_window=5),
        **inherited_sections,
    )
    searcher = _build_searcher(config, fake_es)
    context = create_request_context(reqid="rank-fallback", uid="u-rank-fallback")

    def _tenant_config(tenant_id):
        return {"index_languages": ["en"]}

    def _tenant_config_loader():
        return SimpleNamespace(get_tenant_config=_tenant_config)

    monkeypatch.setattr("search.searcher.get_tenant_config_loader", _tenant_config_loader)

    def _fused_debug_row(hit):
        # One debug entry per hit, read back from the scores written onto it.
        fused_score = hit["_fused_score"]
        return {
            "doc_id": hit["_id"],
            "score": fused_score,
            "es_score": hit.get("_raw_es_score", hit.get("_score")),
            "rerank_score": hit["_rerank_score"],
            "text_score": hit.get("_text_score", hit.get("_score")),
            "knn_score": hit.get("_knn_score", 0.0),
            "es_factor": 1.0,
            "rerank_factor": 1.0,
            "text_factor": 1.0,
            "knn_factor": 1.0,
            "fused_score": fused_score,
        }

    def _fake_run_rerank(**kwargs):
        """Reverse the hits in place and score them in descending order."""
        es_response = kwargs["es_response"]
        reordered_hits = es_response["hits"]["hits"]
        reordered_hits.reverse()
        debug_rows = []
        for position, hit in enumerate(reordered_hits):
            hit["_fused_score"] = 100.0 - position
            hit["_rerank_score"] = 1.0 - 0.1 * position
            debug_rows.append(_fused_debug_row(hit))
        return es_response, {"model": "final-reranker"}, debug_rows

    monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank)

    result = searcher.search(
        query="toy",
        tenant_id="162",
        from_=0,
        size=5,
        context=context,
        enable_rerank=True,
        debug=True,
    )

    rows_by_spu = {}
    for row in result.debug_info["per_result"]:
        rows_by_spu[row["spu_id"]] = row
    funnel = rows_by_spu["4"]["ranking_funnel"]

    assert funnel["fine_rank"]["rank"] is None
    rerank_stage = funnel["rerank"]
    assert rerank_stage["rank"] == 1
    assert rerank_stage["rank_change"] == 4
    assert funnel["final_page"]["rank_change"] == 0