Commit 77ab67adeae5b5b41985c5f24abf93311682d320
1 parent
2260eed2
更新测试用例
Showing
12 changed files
with
24 additions
and
70 deletions
Show diff stats
.cursor/plans/es-query-25a9f060.plan.检索表达式优化.ES_function表达式.md
| ... | ... | @@ -187,11 +187,8 @@ class RerankEngine: |
| 187 | 187 | 用途:复杂的自定义排序逻辑、实时个性化等 |
| 188 | 188 | """ |
| 189 | 189 | |
| 190 | - def __init__(self, ranking_expression: str, enabled: bool = False): | |
| 190 | + def __init__(self, enabled: bool = False): | |
| 191 | 191 | self.enabled = enabled |
| 192 | - self.ranking_expression = ranking_expression | |
| 193 | - if enabled: | |
| 194 | - self.parsed_terms = self._parse_expression(ranking_expression) | |
| 195 | 192 | ``` |
| 196 | 193 | |
| 197 | 194 | #### `/home/tw/saas-search/search/__init__.py` |
| ... | ... | @@ -206,14 +203,6 @@ from .rerank_engine import RerankEngine # 原 RankingEngine |
| 206 | 203 | |
| 207 | 204 | **修改初始化**(约88行): |
| 208 | 205 | |
| 209 | -```python | |
| 210 | -# 改为RerankEngine,默认禁用 | |
| 211 | -self.rerank_engine = RerankEngine( | |
| 212 | - config.ranking.expression, | |
| 213 | - enabled=False # 暂时禁用 | |
| 214 | -) | |
| 215 | -``` | |
| 216 | - | |
| 217 | 206 | **修改search方法中的rerank逻辑**(约356-383行): |
| 218 | 207 | |
| 219 | 208 | ```python |
| ... | ... | @@ -295,7 +284,6 @@ class FunctionScoreConfig: |
| 295 | 284 | @dataclass |
| 296 | 285 | class TenantConfig: |
| 297 | 286 | # ... 其他字段 ... |
| 298 | - ranking: RankingConfig # 保留用于兼容 | |
| 299 | 287 | rerank: RerankConfig # 新增 |
| 300 | 288 | function_score: FunctionScoreConfig # 新增 |
| 301 | 289 | ``` |
| ... | ... | @@ -327,11 +315,18 @@ curl -X POST http://localhost:6002/search/ \ |
| 327 | 315 | |
| 328 | 316 | ### 4. 配置迁移 |
| 329 | 317 | |
| 330 | -对于现有的`ranking.expression`配置,建议: | |
| 318 | +```python | |
| 319 | +# 改为RerankEngine,默认禁用 | |
| 320 | +self.rerank_engine = RerankEngine( | |
| 321 | + enabled=False # 暂时禁用 | |
| 322 | +) | |
| 323 | +``` | |
| 324 | + | |
| 325 | +@@ | |
| 326 | +对于现有的排序/打分配置,建议: | |
| 331 | 327 | |
| 332 | -- 保留`ranking`配置用于文档说明 | |
| 333 | -- 新增`rerank.enabled=false`明确禁用状态 | |
| 334 | -- 新增`function_score`配置用于ES层打分 | |
| 328 | +- 使用 `function_score` 配置 ES 层打分 | |
| 329 | +- 使用 `rerank` 配置控制本地/AI 重排行为 | |
| 335 | 330 | |
| 336 | 331 | ### 5. 后续优化空间 |
| 337 | 332 | ... | ... |
CLAUDE.md
| ... | ... | @@ -588,8 +588,7 @@ python main.py search "query" --tenant-id 1 # Quick search test |
| 588 | 588 | 1. **Modifying Search Behavior**: Edit `config/config.yaml` |
| 589 | 589 | 2. **Changing Index Structure**: Update `mappings/search_products.json` |
| 590 | 590 | 3. **Adding New Filters**: Extend `api/models.py` with new Pydantic models |
| 591 | -4. **Updating Ranking**: Modify `ranking.expression` in config | |
| 592 | -5. **Testing Queries**: Use frontend UI at http://localhost:6003 | |
| 591 | +4. **Testing Queries**: Use frontend UI at http://localhost:6003 | |
| 593 | 592 | |
| 594 | 593 | ## Key Implementation Details |
| 595 | 594 | ... | ... |
api/routes/admin.py
| ... | ... | @@ -56,7 +56,6 @@ async def get_configuration(): |
| 56 | 56 | "shared_fields": config.query_config.shared_fields, |
| 57 | 57 | "core_multilingual_fields": config.query_config.core_multilingual_fields, |
| 58 | 58 | "supported_languages": config.query_config.supported_languages, |
| 59 | - "ranking_expression": config.ranking.expression, | |
| 60 | 59 | "spu_enabled": config.spu_config.enabled |
| 61 | 60 | } |
| 62 | 61 | ... | ... |
config/__init__.py
| ... | ... | @@ -9,7 +9,6 @@ from .config_loader import ( |
| 9 | 9 | QueryConfig, |
| 10 | 10 | IndexConfig, |
| 11 | 11 | SPUConfig, |
| 12 | - RankingConfig, | |
| 13 | 12 | FunctionScoreConfig, |
| 14 | 13 | RerankConfig, |
| 15 | 14 | ConfigLoader, |
| ... | ... | @@ -38,7 +37,6 @@ __all__ = [ |
| 38 | 37 | 'QueryConfig', |
| 39 | 38 | 'IndexConfig', |
| 40 | 39 | 'SPUConfig', |
| 41 | - 'RankingConfig', | |
| 42 | 40 | 'FunctionScoreConfig', |
| 43 | 41 | 'RerankConfig', |
| 44 | 42 | ... | ... |
config/config.yaml
| ... | ... | @@ -100,11 +100,6 @@ query_config: |
| 100 | 100 | # KNN boost配置(向量召回的boost值) |
| 101 | 101 | knn_boost: 0.25 # Lower boost for embedding recall |
| 102 | 102 | |
| 103 | -# Ranking Configuration(排序配置) | |
| 104 | -ranking: | |
| 105 | - expression: "bm25() + 0.25*text_embedding_relevance()" | |
| 106 | - description: "BM25 text relevance combined with semantic embedding similarity" | |
| 107 | - | |
| 108 | 103 | # Function Score配置(ES层打分规则) |
| 109 | 104 | function_score: |
| 110 | 105 | score_mode: "sum" | ... | ... |
config/config_loader.py
| ... | ... | @@ -97,13 +97,6 @@ class FunctionScoreConfig: |
| 97 | 97 | |
| 98 | 98 | |
| 99 | 99 | @dataclass |
| 100 | -class RankingConfig: | |
| 101 | - """Configuration for ranking expressions.""" | |
| 102 | - expression: str = "bm25()" | |
| 103 | - description: str = "Default BM25 ranking" | |
| 104 | - | |
| 105 | - | |
| 106 | -@dataclass | |
| 107 | 100 | class RerankConfig: |
| 108 | 101 | """重排配置(provider/URL 在 services.rerank)""" |
| 109 | 102 | enabled: bool = True |
| ... | ... | @@ -128,9 +121,6 @@ class SearchConfig: |
| 128 | 121 | # Query processing |
| 129 | 122 | query_config: QueryConfig |
| 130 | 123 | |
| 131 | - # Ranking configuration | |
| 132 | - ranking: RankingConfig | |
| 133 | - | |
| 134 | 124 | # Function Score configuration (ES层打分) |
| 135 | 125 | function_score: FunctionScoreConfig |
| 136 | 126 | |
| ... | ... | @@ -293,13 +283,6 @@ class ConfigLoader: |
| 293 | 283 | tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)), |
| 294 | 284 | ) |
| 295 | 285 | |
| 296 | - # Parse ranking config | |
| 297 | - ranking_data = config_data.get("ranking", {}) | |
| 298 | - ranking = RankingConfig( | |
| 299 | - expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()", | |
| 300 | - description=ranking_data.get("description") or "Default BM25 + text embedding ranking" | |
| 301 | - ) | |
| 302 | - | |
| 303 | 286 | # Parse Function Score configuration |
| 304 | 287 | fs_data = config_data.get("function_score", {}) |
| 305 | 288 | function_score = FunctionScoreConfig( |
| ... | ... | @@ -336,7 +319,6 @@ class ConfigLoader: |
| 336 | 319 | field_boosts=field_boosts, |
| 337 | 320 | indexes=indexes, |
| 338 | 321 | query_config=query_config, |
| 339 | - ranking=ranking, | |
| 340 | 322 | function_score=function_score, |
| 341 | 323 | rerank=rerank, |
| 342 | 324 | spu_config=spu_config, |
| ... | ... | @@ -510,10 +492,6 @@ class ConfigLoader: |
| 510 | 492 | "field_boosts": config.field_boosts, |
| 511 | 493 | "indexes": [self._index_to_dict(index) for index in config.indexes], |
| 512 | 494 | "query_config": query_config_dict, |
| 513 | - "ranking": { | |
| 514 | - "expression": config.ranking.expression, | |
| 515 | - "description": config.ranking.description | |
| 516 | - }, | |
| 517 | 495 | "function_score": { |
| 518 | 496 | "score_mode": config.function_score.score_mode, |
| 519 | 497 | "boost_mode": config.function_score.boost_mode, | ... | ... |
docs/QUICKSTART.md
| ... | ... | @@ -357,7 +357,6 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 357 | 357 | - `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core) |
| 358 | 358 | - `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost、翻译失败原文兜底boost等) |
| 359 | 359 | - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等 |
| 360 | -- `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`) | |
| 361 | 360 | - `function_score`:ES 层加权函数 |
| 362 | 361 | - `rerank`:重排窗口、超时、ES/AI 融合权重 |
| 363 | 362 | ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -1622,7 +1622,6 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ |
| 1622 | 1622 | "num_fields": 20, |
| 1623 | 1623 | "num_indexes": 4, |
| 1624 | 1624 | "supported_languages": ["zh", "en", "ru"], |
| 1625 | - "ranking_expression": "bm25() + 0.2*text_embedding_relevance()", | |
| 1626 | 1625 | "spu_enabled": false |
| 1627 | 1626 | } |
| 1628 | 1627 | ``` | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -152,7 +152,7 @@ class _FakeBulkService: |
| 152 | 152 | |
| 153 | 153 | |
| 154 | 154 | class _FakeTransformer: |
| 155 | - def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options): | |
| 155 | + def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options, **kwargs): | |
| 156 | 156 | return { |
| 157 | 157 | "tenant_id": tenant_id, |
| 158 | 158 | "spu_id": str(spu_row.get("id", "0")), |
| ... | ... | @@ -545,7 +545,7 @@ class _FakeTextModel: |
| 545 | 545 | |
| 546 | 546 | |
| 547 | 547 | class _FakeImageModel: |
| 548 | - def encode_image_urls(self, urls, batch_size=8): | |
| 548 | + def encode_image_urls(self, urls, batch_size=8, normalize_embeddings=True): | |
| 549 | 549 | return [np.array([0.3, 0.2, 0.1], dtype=np.float32) for _ in urls] |
| 550 | 550 | |
| 551 | 551 | ... | ... |
tests/conftest.py
| ... | ... | @@ -15,7 +15,7 @@ from unittest.mock import Mock, MagicMock |
| 15 | 15 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
| 16 | 16 | sys.path.insert(0, project_root) |
| 17 | 17 | |
| 18 | -from config import SearchConfig, QueryConfig, IndexConfig, SPUConfig, RankingConfig, FunctionScoreConfig, RerankConfig | |
| 18 | +from config import SearchConfig, QueryConfig, IndexConfig, SPUConfig, FunctionScoreConfig, RerankConfig | |
| 19 | 19 | from utils.es_client import ESClient |
| 20 | 20 | from search import Searcher |
| 21 | 21 | from query import QueryParser |
| ... | ... | @@ -48,11 +48,6 @@ def sample_search_config(sample_index_config) -> SearchConfig: |
| 48 | 48 | inner_hits_size=3 |
| 49 | 49 | ) |
| 50 | 50 | |
| 51 | - ranking_config = RankingConfig( | |
| 52 | - expression="static_bm25() + text_embedding_relevance() * 0.2", | |
| 53 | - description="Test ranking" | |
| 54 | - ) | |
| 55 | - | |
| 56 | 51 | function_score_config = FunctionScoreConfig() |
| 57 | 52 | rerank_config = RerankConfig() |
| 58 | 53 | |
| ... | ... | @@ -67,7 +62,6 @@ def sample_search_config(sample_index_config) -> SearchConfig: |
| 67 | 62 | }, |
| 68 | 63 | indexes=[sample_index_config], |
| 69 | 64 | query_config=query_config, |
| 70 | - ranking=ranking_config, | |
| 71 | 65 | function_score=function_score_config, |
| 72 | 66 | rerank=rerank_config, |
| 73 | 67 | spu_config=spu_config | ... | ... |
tests/test_embedding_pipeline.py
| ... | ... | @@ -8,7 +8,6 @@ from config import ( |
| 8 | 8 | FunctionScoreConfig, |
| 9 | 9 | IndexConfig, |
| 10 | 10 | QueryConfig, |
| 11 | - RankingConfig, | |
| 12 | 11 | RerankConfig, |
| 13 | 12 | SPUConfig, |
| 14 | 13 | SearchConfig, |
| ... | ... | @@ -82,7 +81,7 @@ def _build_test_config() -> SearchConfig: |
| 82 | 81 | text_embedding_field="title_embedding", |
| 83 | 82 | image_embedding_field=None, |
| 84 | 83 | ), |
| 85 | - ranking=RankingConfig(expression="bm25()", description="test"), | |
| 84 | + | |
| 86 | 85 | function_score=FunctionScoreConfig(), |
| 87 | 86 | rerank=RerankConfig(), |
| 88 | 87 | spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3), |
| ... | ... | @@ -97,7 +96,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): |
| 97 | 96 | fake_redis = _FakeRedis() |
| 98 | 97 | monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis) |
| 99 | 98 | |
| 100 | - def _fake_post(url, json, timeout): | |
| 99 | + def _fake_post(url, json, timeout, **kwargs): | |
| 101 | 100 | assert url.endswith("/embed/text") |
| 102 | 101 | assert json == ["hello", "world"] |
| 103 | 102 | return _FakeResponse([[0.1, 0.2], [0.3, 0.4]]) |
| ... | ... | @@ -118,7 +117,7 @@ def test_text_embedding_encoder_raises_on_missing_vector(monkeypatch): |
| 118 | 117 | fake_redis = _FakeRedis() |
| 119 | 118 | monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis) |
| 120 | 119 | |
| 121 | - def _fake_post(url, json, timeout): | |
| 120 | + def _fake_post(url, json, timeout, **kwargs): | |
| 122 | 121 | return _FakeResponse([[0.1, 0.2], None]) |
| 123 | 122 | |
| 124 | 123 | monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) |
| ... | ... | @@ -136,7 +135,7 @@ def test_text_embedding_encoder_cache_hit(monkeypatch): |
| 136 | 135 | |
| 137 | 136 | calls = {"count": 0} |
| 138 | 137 | |
| 139 | - def _fake_post(url, json, timeout): | |
| 138 | + def _fake_post(url, json, timeout, **kwargs): | |
| 140 | 139 | calls["count"] += 1 |
| 141 | 140 | return _FakeResponse([[0.3, 0.4]]) |
| 142 | 141 | ... | ... |
tests/test_search_rerank_window.py
| ... | ... | @@ -12,7 +12,6 @@ from config import ( |
| 12 | 12 | FunctionScoreConfig, |
| 13 | 13 | IndexConfig, |
| 14 | 14 | QueryConfig, |
| 15 | - RankingConfig, | |
| 16 | 15 | RerankConfig, |
| 17 | 16 | SPUConfig, |
| 18 | 17 | SearchConfig, |
| ... | ... | @@ -141,7 +140,6 @@ def _build_search_config(*, rerank_enabled: bool = True, rerank_window: int = 38 |
| 141 | 140 | field_boosts={"title.en": 3.0}, |
| 142 | 141 | indexes=[IndexConfig(name="default", label="default", fields=["title.en"])], |
| 143 | 142 | query_config=QueryConfig(enable_text_embedding=False, enable_query_rewrite=False), |
| 144 | - ranking=RankingConfig(), | |
| 145 | 143 | function_score=FunctionScoreConfig(), |
| 146 | 144 | rerank=RerankConfig(enabled=rerank_enabled, rerank_window=rerank_window), |
| 147 | 145 | spu_config=SPUConfig(enabled=False), |
| ... | ... | @@ -169,7 +167,6 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): |
| 169 | 167 | "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], |
| 170 | 168 | "query_config": {"supported_languages": ["en"], "default_language": "en"}, |
| 171 | 169 | "spu_config": {"enabled": False}, |
| 172 | - "ranking": {"expression": "bm25()", "description": "test"}, | |
| 173 | 170 | "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, |
| 174 | 171 | "rerank": {"rerank_window": 384}, |
| 175 | 172 | } |
| ... | ... | @@ -211,9 +208,11 @@ def test_searcher_reranks_top_window_by_default(monkeypatch): |
| 211 | 208 | ) |
| 212 | 209 | |
| 213 | 210 | assert called["count"] == 1 |
| 214 | - assert called["docs"] == 1000 | |
| 211 | + # 应当对配置的 rerank_window 条文档做重排预取 | |
| 212 | + window = searcher.config.rerank.rerank_window | |
| 213 | + assert called["docs"] == window | |
| 215 | 214 | assert es_client.calls[0]["from_"] == 0 |
| 216 | - assert es_client.calls[0]["size"] == 1000 | |
| 215 | + assert es_client.calls[0]["size"] == window | |
| 217 | 216 | assert es_client.calls[0]["body"]["_source"] == {"includes": ["title"]} |
| 218 | 217 | assert len(es_client.calls) == 2 |
| 219 | 218 | assert es_client.calls[1]["size"] == 10 | ... | ... |