Commit 77ab67adeae5b5b41985c5f24abf93311682d320

Authored by tangwang
1 parent 2260eed2

移除 RankingConfig / ranking.expression 配置,并同步更新文档与测试用例

.cursor/plans/es-query-25a9f060.plan.检索表达式优化.ES_function表达式.md
... ... @@ -187,11 +187,8 @@ class RerankEngine:
187 187 用途:复杂的自定义排序逻辑、实时个性化等
188 188 """
189 189  
190   - def __init__(self, ranking_expression: str, enabled: bool = False):
  190 + def __init__(self, enabled: bool = False):
191 191 self.enabled = enabled
192   - self.ranking_expression = ranking_expression
193   - if enabled:
194   - self.parsed_terms = self._parse_expression(ranking_expression)
195 192 ```
196 193  
197 194 #### `/home/tw/saas-search/search/__init__.py`
... ... @@ -206,14 +203,6 @@ from .rerank_engine import RerankEngine # 原 RankingEngine
206 203  
207 204 **修改初始化**(约88行):
208 205  
209   -```python
210   -# 改为RerankEngine,默认禁用
211   -self.rerank_engine = RerankEngine(
212   - config.ranking.expression,
213   - enabled=False # 暂时禁用
214   -)
215   -```
216   -
217 206 **修改search方法中的rerank逻辑**(约356-383行):
218 207  
219 208 ```python
... ... @@ -295,7 +284,6 @@ class FunctionScoreConfig:
295 284 @dataclass
296 285 class TenantConfig:
297 286 # ... 其他字段 ...
298   - ranking: RankingConfig # 保留用于兼容
299 287 rerank: RerankConfig # 新增
300 288 function_score: FunctionScoreConfig # 新增
301 289 ```
... ... @@ -327,11 +315,18 @@ curl -X POST http://localhost:6002/search/ \
327 315  
328 316 ### 4. 配置迁移
329 317  
330   -对于现有的`ranking.expression`配置,建议:
  318 +```python
  319 +# 改为RerankEngine,默认禁用
  320 +self.rerank_engine = RerankEngine(
  321 + enabled=False # 暂时禁用
  322 +)
  323 +```
  324 +
  325 +@@
  326 +对于现有的排序/打分配置,建议:
331 327  
332   -- 保留`ranking`配置用于文档说明
333   -- 新增`rerank.enabled=false`明确禁用状态
334   -- 新增`function_score`配置用于ES层打分
  328 +- 使用 `function_score` 配置 ES 层打分
  329 +- 使用 `rerank` 配置控制本地/AI 重排行为
335 330  
336 331 ### 5. 后续优化空间
337 332  
... ...
CLAUDE.md
... ... @@ -588,8 +588,7 @@ python main.py search "query" --tenant-id 1 # Quick search test
588 588 1. **Modifying Search Behavior**: Edit `config/config.yaml`
589 589 2. **Changing Index Structure**: Update `mappings/search_products.json`
590 590 3. **Adding New Filters**: Extend `api/models.py` with new Pydantic models
591   -4. **Updating Ranking**: Modify `ranking.expression` in config
592   -5. **Testing Queries**: Use frontend UI at http://localhost:6003
  591 +4. **Testing Queries**: Use frontend UI at http://localhost:6003
593 592  
594 593 ## Key Implementation Details
595 594  
... ...
api/routes/admin.py
... ... @@ -56,7 +56,6 @@ async def get_configuration():
56 56 "shared_fields": config.query_config.shared_fields,
57 57 "core_multilingual_fields": config.query_config.core_multilingual_fields,
58 58 "supported_languages": config.query_config.supported_languages,
59   - "ranking_expression": config.ranking.expression,
60 59 "spu_enabled": config.spu_config.enabled
61 60 }
62 61  
... ...
config/__init__.py
... ... @@ -9,7 +9,6 @@ from .config_loader import (
9 9 QueryConfig,
10 10 IndexConfig,
11 11 SPUConfig,
12   - RankingConfig,
13 12 FunctionScoreConfig,
14 13 RerankConfig,
15 14 ConfigLoader,
... ... @@ -38,7 +37,6 @@ __all__ = [
38 37 'QueryConfig',
39 38 'IndexConfig',
40 39 'SPUConfig',
41   - 'RankingConfig',
42 40 'FunctionScoreConfig',
43 41 'RerankConfig',
44 42  
... ...
config/config.yaml
... ... @@ -100,11 +100,6 @@ query_config:
100 100 # KNN boost配置(向量召回的boost值)
101 101 knn_boost: 0.25 # Lower boost for embedding recall
102 102  
103   -# Ranking Configuration(排序配置)
104   -ranking:
105   - expression: "bm25() + 0.25*text_embedding_relevance()"
106   - description: "BM25 text relevance combined with semantic embedding similarity"
107   -
108 103 # Function Score配置(ES层打分规则)
109 104 function_score:
110 105 score_mode: "sum"
... ...
config/config_loader.py
... ... @@ -97,13 +97,6 @@ class FunctionScoreConfig:
97 97  
98 98  
99 99 @dataclass
100   -class RankingConfig:
101   - """Configuration for ranking expressions."""
102   - expression: str = "bm25()"
103   - description: str = "Default BM25 ranking"
104   -
105   -
106   -@dataclass
107 100 class RerankConfig:
108 101 """重排配置(provider/URL 在 services.rerank)"""
109 102 enabled: bool = True
... ... @@ -128,9 +121,6 @@ class SearchConfig:
128 121 # Query processing
129 122 query_config: QueryConfig
130 123  
131   - # Ranking configuration
132   - ranking: RankingConfig
133   -
134 124 # Function Score configuration (ES层打分)
135 125 function_score: FunctionScoreConfig
136 126  
... ... @@ -293,13 +283,6 @@ class ConfigLoader:
293 283 tie_breaker_keywords=float(text_strategy_cfg.get("tie_breaker_keywords", 0.9)),
294 284 )
295 285  
296   - # Parse ranking config
297   - ranking_data = config_data.get("ranking", {})
298   - ranking = RankingConfig(
299   - expression=ranking_data.get("expression") or "bm25() + 0.2*text_embedding_relevance()",
300   - description=ranking_data.get("description") or "Default BM25 + text embedding ranking"
301   - )
302   -
303 286 # Parse Function Score configuration
304 287 fs_data = config_data.get("function_score", {})
305 288 function_score = FunctionScoreConfig(
... ... @@ -336,7 +319,6 @@ class ConfigLoader:
336 319 field_boosts=field_boosts,
337 320 indexes=indexes,
338 321 query_config=query_config,
339   - ranking=ranking,
340 322 function_score=function_score,
341 323 rerank=rerank,
342 324 spu_config=spu_config,
... ... @@ -510,10 +492,6 @@ class ConfigLoader:
510 492 "field_boosts": config.field_boosts,
511 493 "indexes": [self._index_to_dict(index) for index in config.indexes],
512 494 "query_config": query_config_dict,
513   - "ranking": {
514   - "expression": config.ranking.expression,
515   - "description": config.ranking.description
516   - },
517 495 "function_score": {
518 496 "score_mode": config.function_score.score_mode,
519 497 "boost_mode": config.function_score.boost_mode,
... ...
docs/QUICKSTART.md
... ... @@ -357,7 +357,6 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源:
357 357 - `query_config.search_fields`:动态多语言检索字段(multilingual/shared/core)
358 358 - `query_config.text_query_strategy`:文本召回策略参数(minimum_should_match、翻译boost、翻译失败原文兜底boost等)
359 359 - `query_config`:语言、embedding 开关、source_fields、knn_boost、翻译提示词等
360   -- `ranking.expression`:融合表达式(例如 `bm25() + 0.25*text_embedding_relevance()`)
361 360 - `function_score`:ES 层加权函数
362 361 - `rerank`:重排窗口、超时、ES/AI 融合权重
363 362  
... ...
docs/搜索API对接指南.md
... ... @@ -1622,7 +1622,6 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
1622 1622 "num_fields": 20,
1623 1623 "num_indexes": 4,
1624 1624 "supported_languages": ["zh", "en", "ru"],
1625   - "ranking_expression": "bm25() + 0.2*text_embedding_relevance()",
1626 1625 "spu_enabled": false
1627 1626 }
1628 1627 ```
... ...
tests/ci/test_service_api_contracts.py
... ... @@ -152,7 +152,7 @@ class _FakeBulkService:
152 152  
153 153  
154 154 class _FakeTransformer:
155   - def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options):
  155 + def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options, **kwargs):
156 156 return {
157 157 "tenant_id": tenant_id,
158 158 "spu_id": str(spu_row.get("id", "0")),
... ... @@ -545,7 +545,7 @@ class _FakeTextModel:
545 545  
546 546  
547 547 class _FakeImageModel:
548   - def encode_image_urls(self, urls, batch_size=8):
  548 + def encode_image_urls(self, urls, batch_size=8, normalize_embeddings=True):
549 549 return [np.array([0.3, 0.2, 0.1], dtype=np.float32) for _ in urls]
550 550  
551 551  
... ...
tests/conftest.py
... ... @@ -15,7 +15,7 @@ from unittest.mock import Mock, MagicMock
15 15 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
16 16 sys.path.insert(0, project_root)
17 17  
18   -from config import SearchConfig, QueryConfig, IndexConfig, SPUConfig, RankingConfig, FunctionScoreConfig, RerankConfig
  18 +from config import SearchConfig, QueryConfig, IndexConfig, SPUConfig, FunctionScoreConfig, RerankConfig
19 19 from utils.es_client import ESClient
20 20 from search import Searcher
21 21 from query import QueryParser
... ... @@ -48,11 +48,6 @@ def sample_search_config(sample_index_config) -> SearchConfig:
48 48 inner_hits_size=3
49 49 )
50 50  
51   - ranking_config = RankingConfig(
52   - expression="static_bm25() + text_embedding_relevance() * 0.2",
53   - description="Test ranking"
54   - )
55   -
56 51 function_score_config = FunctionScoreConfig()
57 52 rerank_config = RerankConfig()
58 53  
... ... @@ -67,7 +62,6 @@ def sample_search_config(sample_index_config) -> SearchConfig:
67 62 },
68 63 indexes=[sample_index_config],
69 64 query_config=query_config,
70   - ranking=ranking_config,
71 65 function_score=function_score_config,
72 66 rerank=rerank_config,
73 67 spu_config=spu_config
... ...
tests/test_embedding_pipeline.py
... ... @@ -8,7 +8,6 @@ from config import (
8 8 FunctionScoreConfig,
9 9 IndexConfig,
10 10 QueryConfig,
11   - RankingConfig,
12 11 RerankConfig,
13 12 SPUConfig,
14 13 SearchConfig,
... ... @@ -82,7 +81,6 @@ def _build_test_config() -> SearchConfig:
82 81 text_embedding_field="title_embedding",
83 82 image_embedding_field=None,
84 83 ),
85   - ranking=RankingConfig(expression="bm25()", description="test"),
86 84 function_score=FunctionScoreConfig(),
87 85 rerank=RerankConfig(),
88 86 spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3),
... ... @@ -97,7 +95,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch):
97 95 fake_redis = _FakeRedis()
98 96 monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis)
99 97  
100   - def _fake_post(url, json, timeout):
  98 + def _fake_post(url, json, timeout, **kwargs):
101 99 assert url.endswith("/embed/text")
102 100 assert json == ["hello", "world"]
103 101 return _FakeResponse([[0.1, 0.2], [0.3, 0.4]])
... ... @@ -118,7 +116,7 @@ def test_text_embedding_encoder_raises_on_missing_vector(monkeypatch):
118 116 fake_redis = _FakeRedis()
119 117 monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis)
120 118  
121   - def _fake_post(url, json, timeout):
  119 + def _fake_post(url, json, timeout, **kwargs):
122 120 return _FakeResponse([[0.1, 0.2], None])
123 121  
124 122 monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post)
... ... @@ -136,7 +134,7 @@ def test_text_embedding_encoder_cache_hit(monkeypatch):
136 134  
137 135 calls = {"count": 0}
138 136  
139   - def _fake_post(url, json, timeout):
  137 + def _fake_post(url, json, timeout, **kwargs):
140 138 calls["count"] += 1
141 139 return _FakeResponse([[0.3, 0.4]])
142 140  
... ...
tests/test_search_rerank_window.py
... ... @@ -12,7 +12,6 @@ from config import (
12 12 FunctionScoreConfig,
13 13 IndexConfig,
14 14 QueryConfig,
15   - RankingConfig,
16 15 RerankConfig,
17 16 SPUConfig,
18 17 SearchConfig,
... ... @@ -141,7 +140,6 @@ def _build_search_config(*, rerank_enabled: bool = True, rerank_window: int = 38
141 140 field_boosts={"title.en": 3.0},
142 141 indexes=[IndexConfig(name="default", label="default", fields=["title.en"])],
143 142 query_config=QueryConfig(enable_text_embedding=False, enable_query_rewrite=False),
144   - ranking=RankingConfig(),
145 143 function_score=FunctionScoreConfig(),
146 144 rerank=RerankConfig(enabled=rerank_enabled, rerank_window=rerank_window),
147 145 spu_config=SPUConfig(enabled=False),
... ... @@ -169,7 +167,6 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
169 167 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
170 168 "query_config": {"supported_languages": ["en"], "default_language": "en"},
171 169 "spu_config": {"enabled": False},
172   - "ranking": {"expression": "bm25()", "description": "test"},
173 170 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
174 171 "rerank": {"rerank_window": 384},
175 172 }
... ... @@ -211,9 +208,11 @@ def test_searcher_reranks_top_window_by_default(monkeypatch):
211 208 )
212 209  
213 210 assert called["count"] == 1
214   - assert called["docs"] == 1000
  211 + # 应当对配置的 rerank_window 条文档做重排预取
  212 + window = searcher.config.rerank.rerank_window
  213 + assert called["docs"] == window
215 214 assert es_client.calls[0]["from_"] == 0
216   - assert es_client.calls[0]["size"] == 1000
  215 + assert es_client.calls[0]["size"] == window
217 216 assert es_client.calls[0]["body"]["_source"] == {"includes": ["title"]}
218 217 assert len(es_client.calls) == 2
219 218 assert es_client.calls[1]["size"] == 10
... ...