diff --git a/docs/Usage-Guide.md b/docs/Usage-Guide.md index 9aab5d5..a4a8a1b 100644 --- a/docs/Usage-Guide.md +++ b/docs/Usage-Guide.md @@ -343,6 +343,49 @@ curl -X POST http://localhost:6002/search/image \ --- +## 8. Suggestion 索引与接口使用 + +### 8.1 构建 Suggestion 索引(全量) + +Suggestion 索引会从: + +- ES 商品索引:`title.{lang}`, `qanchors.{lang}` +- MySQL 日志表:`shoplazza_search_log.query`(含 `language`、`request_params`) + +聚合生成 `search_suggestions_tenant_{tenant_id}`。 + +在项目根目录执行: + +```bash +# 为指定租户全量重建 suggestion 索引(会删除旧索引) +python main.py build-suggestions \ + --tenant-id 162 \ + --es-host http://localhost:9200 \ + --days 30 \ + --recreate +``` + +可选参数: + +- `--days`:回溯日志天数(默认 30) +- `--batch-size`:扫描商品索引的批大小(默认 500) +- `--min-query-len`:参与 suggestion 的最小查询长度(默认 1) + +> 建议在商品索引构建完成、日志正常写入一段时间后执行一次全量构建,然后按天/小时增加增量构建任务。 + +### 8.2 调用 Suggestion 接口 + +全量构建完成后,可直接通过 `/search/suggestions` 获取自动补全结果: + +```bash +curl "http://localhost:6002/search/suggestions?q=iph&size=5&language=en&with_results=true" \ + -H "X-Tenant-ID: 162" +``` + +接口返回结构详见 `docs/搜索API对接指南.md` 的“3.7 搜索建议接口”章节。 + +--- + ## 常见问题 ### Q1: MySQL连接失败 diff --git a/docs/搜索API对接指南.md b/docs/搜索API对接指南.md index 6d9553d..2f4ed28 100644 --- a/docs/搜索API对接指南.md +++ b/docs/搜索API对接指南.md @@ -558,40 +558,59 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) ### 3.7 搜索建议接口 -> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。 - - **端点**: `GET /search/suggestions` -- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展。 +- **描述**: 返回搜索建议(自动补全/热词),支持多语言与“结果直达”(每条 suggestion 附带商品列表)。 #### 查询参数 | 参数 | 类型 | 必填 | 默认值 | 描述 | |------|------|------|--------|------| | `q` | string | Y | - | 查询字符串(至少 1 个字符) | -| `size` | integer | N | 5 | 返回建议数量(1-20) | -| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` | +| `size` | integer | N | 10 | 返回建议数量(1-20) | +| `language` | string | N | `en` | 请求语言,如 `zh` / `en` / `ar` / `ru`,用于路由到对应语种 suggestion 索引 | +| `with_results` | bool | N | `true` 
| 是否为每条 suggestion 返回商品列表(结果直达) | +| `result_size` | integer | N | 3 | 每条 suggestion 返回的商品数量(1-10) | +| `debug` | bool | N | `false` | 是否开启调试(目前主要用于排查 suggestion 排序与语言解析) | + +> **租户标识**:同 [3.1](#31-接口信息),通过请求头 `X-Tenant-ID` 或 query 参数 `tenant_id` 传递。 #### 响应示例 ```json { - "query": "芭", + "query": "iph", + "language": "en", + "resolved_language": "en", "suggestions": [ { - "text": "芭比娃娃", - "type": "query", - "highlight": "比娃娃", - "popularity": 850 + "text": "iphone 15", + "lang": "en", + "score": 12.37, + "rank_score": 5.1, + "sources": ["query_log", "qanchor"], + "lang_source": "log_field", + "lang_confidence": 1.0, + "lang_conflict": false, + "products": [ + { + "spu_id": "12345", + "title": "iPhone 15 Pro Max", + "price": 999.0, + "image_url": "https://example.com/image.jpg", + "score": 3.21 + } + ] } ], - "took_ms": 5 + "took_ms": 12 } ``` #### 请求示例 ```bash -curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product" +curl "http://localhost:6002/search/suggestions?q=芭&size=5&language=zh&with_results=true" \ + -H "X-Tenant-ID: 162" ``` ### 3.8 即时搜索接口 diff --git a/docs/搜索API速查表.md b/docs/搜索API速查表.md index e485230..fd6f4fe 100644 --- a/docs/搜索API速查表.md +++ b/docs/搜索API速查表.md @@ -294,7 +294,7 @@ POST /search/image "size": 20 } -GET /search/suggestions?q=芭&size=5 +GET /search/suggestions?q=芭&size=5&language=zh&with_results=true GET /search/instant?q=玩具&size=5 diff --git a/main.py b/main.py index 4c0ae91..9e49725 100755 --- a/main.py +++ b/main.py @@ -12,13 +12,12 @@ import sys import os import argparse import json -import uvicorn # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from config import ConfigLoader -from config.env_config import DB_CONFIG +from config.env_config import ES_CONFIG from utils import ESClient from search import Searcher from suggestion import SuggestionIndexBuilder @@ -27,6 +26,7 @@ from utils.db_connector import create_db_connection def cmd_serve(args): """Start API service.""" 
+ import uvicorn os.environ['ES_HOST'] = args.es_host print("Starting API service (multi-tenant)...") @@ -43,6 +43,7 @@ def cmd_serve(args): def cmd_serve_indexer(args): """Start dedicated Indexer API service (no search endpoints).""" + import uvicorn os.environ['ES_HOST'] = args.es_host print("Starting Indexer API service...") @@ -102,17 +103,33 @@ def cmd_search(args): def cmd_build_suggestions(args): """Build suggestion index for a tenant.""" - es_client = ESClient(hosts=[args.es_host]) + # Initialize ES client with optional authentication + es_username = os.getenv("ES_USERNAME") or ES_CONFIG.get("username") + es_password = os.getenv("ES_PASSWORD") or ES_CONFIG.get("password") + if es_username and es_password: + es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password) + else: + es_client = ESClient(hosts=[args.es_host]) if not es_client.ping(): print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") return 1 + # Build DB config directly from environment to avoid dotenv dependency + db_host = os.getenv("DB_HOST") + db_port = int(os.getenv("DB_PORT", "3306")) + db_name = os.getenv("DB_DATABASE") + db_user = os.getenv("DB_USERNAME") + db_pass = os.getenv("DB_PASSWORD") + if not all([db_host, db_name, db_user, db_pass]): + print("ERROR: DB_HOST/DB_PORT/DB_DATABASE/DB_USERNAME/DB_PASSWORD must be set in environment") + return 1 + db_engine = create_db_connection( - host=DB_CONFIG["host"], - port=DB_CONFIG["port"], - database=DB_CONFIG["database"], - username=DB_CONFIG["username"], - password=DB_CONFIG["password"], + host=db_host, + port=db_port, + database=db_name, + username=db_user, + password=db_pass, ) builder = SuggestionIndexBuilder(es_client=es_client, db_engine=db_engine) result = builder.rebuild_tenant_index( diff --git a/scripts/build_suggestions.sh b/scripts/build_suggestions.sh new file mode 100755 index 0000000..a0095e4 --- /dev/null +++ b/scripts/build_suggestions.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +# 
Convenience script to rebuild suggestion index for a tenant. +# +# Usage: +# ./scripts/build_suggestions.sh [--days 30] [--batch-size 500] [--min-query-len 1] [--es-host http://localhost:9200] +# + +set -euo pipefail + +if [ $# -lt 1 ]; then + echo "Usage: $0 [extra args...]" + echo "Example: $0 162 --days 30 --recreate" + exit 1 +fi + +TENANT_ID="$1" +shift || true + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +cd "$ROOT_DIR" + +python main.py build-suggestions \ + --tenant-id "$TENANT_ID" \ + "$@" + diff --git a/tests/test_suggestions.py b/tests/test_suggestions.py new file mode 100644 index 0000000..ece27a6 --- /dev/null +++ b/tests/test_suggestions.py @@ -0,0 +1,183 @@ +import json +from typing import Any, Dict, List + +import pytest + +from suggestion.builder import SuggestionIndexBuilder +from suggestion.service import SuggestionService + + +class FakeESClient: + """Minimal fake ES client for SuggestionService tests.""" + + def __init__(self) -> None: + self.calls: List[Dict[str, Any]] = [] + + def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]: + self.calls.append({"index": index_name, "body": body, "size": size, "from": from_}) + # Suggestion index + if "search_suggestions_tenant_" in index_name: + return { + "hits": { + "total": {"value": 1}, + "max_score": 3.2, + "hits": [ + { + "_id": "1", + "_score": 3.2, + "_source": { + "text": "iphone 15", + "lang": "en", + "rank_score": 5.0, + "sources": ["query_log", "qanchor"], + "lang_source": "log_field", + "lang_confidence": 1.0, + "lang_conflict": False, + "top_spu_ids": ["12345"], + }, + } + ], + } + } + # Product index + if "search_products_tenant_" in index_name: + return { + "hits": { + "total": {"value": 1}, + "max_score": 2.5, + "hits": [ + { + "_id": "12345", + "_score": 2.5, + "_source": { + "spu_id": "12345", + "title": {"en": "iPhone 15 Pro Max"}, + "min_price": 999.0, + "image_url": "https://example.com/image.jpg", + "sales": 
100, + "total_inventory": 50, + }, + } + ], + } + } + return {"hits": {"total": {"value": 0}, "max_score": 0.0, "hits": []}} + + # For builder.bulk_index usage compatibility in full runs (not used in these unit tests) + def bulk_index(self, index_name: str, docs: List[Dict[str, Any]]) -> Dict[str, Any]: + self.calls.append({"index": index_name, "bulk": True, "docs": docs}) + return {"success": len(docs), "failed": 0, "errors": []} + + def index_exists(self, index_name: str) -> bool: + return False + + def delete_index(self, index_name: str) -> bool: + return True + + def create_index(self, index_name: str, body: Dict[str, Any]) -> bool: + self.calls.append({"index": index_name, "create": True, "body": body}) + return True + + def refresh(self, index_name: str) -> bool: + return True + + +@pytest.mark.unit +def test_resolve_query_language_prefers_log_field(monkeypatch): + """builder.resolve_query_language 应优先使用日志 language 字段。""" + fake_es = FakeESClient() + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) + # index_languages 里包含 en/zh,primary 设为 zh + lang, conf, source, conflict = builder._resolve_query_language( + query="iphone 15", + log_language="en", + request_params=None, + index_languages=["zh", "en"], + primary_language="zh", + ) + assert lang == "en" + assert conf == 1.0 + assert source == "log_field" + assert conflict is False + + +@pytest.mark.unit +def test_resolve_query_language_uses_request_params_when_log_missing(): + """当日志 language 为空时,应从 request_params.language 解析。""" + fake_es = FakeESClient() + builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None) + request_params = json.dumps({"language": "zh"}) + lang, conf, source, conflict = builder._resolve_query_language( + query="芭比娃娃", + log_language=None, + request_params=request_params, + index_languages=["zh", "en"], + primary_language="en", + ) + assert lang == "zh" + assert conf == 1.0 + assert source == "request_params" + assert conflict is False + + 
@pytest.mark.unit
def test_resolve_query_language_fallback_to_primary():
    """With no language hint available, resolution falls back to primary_language."""
    fake_es = FakeESClient()
    builder = SuggestionIndexBuilder(es_client=fake_es, db_engine=None)
    # Both log_language and request_params are None, so the expected outcome
    # is the primary_language value, reported with source "default".
    lang, conf, source, conflict = builder._resolve_query_language(
        query="some text",
        log_language=None,
        request_params=None,
        index_languages=["zh", "en"],
        primary_language="zh",
    )
    assert lang == "zh"
    assert source == "default"
    assert conflict is False


@pytest.mark.unit
def test_suggestion_service_basic_flow(monkeypatch):
    """
    SuggestionService.search should query ES and return suggestions with products.

    FakeESClient stands in for Elasticsearch so no real cluster is required.
    """
    # Override tenant_config_loader so the test does not depend on changes to
    # the external config.yaml.
    from config import tenant_config_loader as tcl

    loader = tcl.get_tenant_config_loader()
    # Replace the loader's cached config through monkeypatch rather than by
    # plain assignment: the loader object is shared beyond this test, and
    # monkeypatch restores the previous value on teardown so the fake tenant
    # config cannot leak into other tests. raising=False keeps the original
    # tolerance for a loader that has not populated `_config` yet.
    monkeypatch.setattr(
        loader,
        "_config",
        {
            "default": {"primary_language": "en", "index_languages": ["en", "zh"]},
            "tenants": {
                "1": {"primary_language": "en", "index_languages": ["en", "zh"]},
            },
        },
        raising=False,
    )

    fake_es = FakeESClient()
    service = SuggestionService(es_client=fake_es)
    result = service.search(
        tenant_id="1",
        query="iph",
        language="en",
        size=5,
        with_results=True,
        result_size=2,
    )

    # Top-level response envelope.
    assert result["resolved_language"] == "en"
    assert result["query"] == "iph"
    assert result["took_ms"] >= 0

    # Exactly one suggestion comes back from the fake suggestion index.
    suggestions = result["suggestions"]
    assert len(suggestions) == 1
    s0 = suggestions[0]
    assert s0["text"] == "iphone 15"
    assert s0["lang"] == "en"

    # with_results=True must attach a product list to the suggestion.
    assert isinstance(s0.get("products"), list)
    assert len(s0["products"]) >= 1
    p0 = s0["products"][0]
    assert p0["spu_id"] == "12345"
    assert "title" in p0
    assert "price" in p0