From 7299bae6396856b85f415a780200216d2d97fede Mon Sep 17 00:00:00 2001 From: tangwang Date: Sun, 8 Mar 2026 17:46:21 +0800 Subject: [PATCH] tests --- .github/workflows/test.yml | 550 ++++++++++++++++++++++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ README.md | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- docs/QUICKSTART.md | 30 ++++++++++++++++++++++++++++++ scripts/run_ci_tests.sh | 9 +++++++++ tests/ci/test_service_api_contracts.py | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ tests/test_cloud_embedding.py | 3 +++ tests/test_cnclip_service.py | 18 ++++++++++++------ 7 files changed, 435 insertions(+), 723 deletions(-) create mode 100755 scripts/run_ci_tests.sh create mode 100644 tests/ci/test_service_api_contracts.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d638cb9..9c8ce26 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,537 +1,31 @@ -name: saas-search Test Pipeline +name: CI - Service Contracts on: push: - branches: [ main, master, develop ] + branches: [main, master, develop] pull_request: - branches: [ main, master, develop ] - workflow_dispatch: # 允许手动触发 - -env: - PYTHON_VERSION: '3.9' - NODE_VERSION: '16' + branches: [main, master, develop] + workflow_dispatch: jobs: - # 代码质量检查 - code-quality: - runs-on: ubuntu-latest - name: Code Quality Check - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 black isort mypy pylint - pip install -r requirements.txt - - - name: Run Black (code formatting) - run: | - black --check --diff . - - - name: Run isort (import sorting) - run: | - isort --check-only --diff . - - - name: Run Flake8 (linting) - run: | - flake8 --max-line-length=100 --ignore=E203,W503 . - - - name: Run MyPy (type checking) - run: | - mypy --ignore-missing-imports --no-strict-optional . - - - name: Run Pylint - run: | - pylint --disable=C0114,C0115,C0116 --errors-only . - - # 单元测试 - unit-tests: - runs-on: ubuntu-latest - name: Unit Tests - - strategy: - matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Cache pip dependencies - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-cov pytest-json-report - pip install -r requirements.txt - - - name: Create test logs directory - run: mkdir -p test_logs - - - name: Run unit tests - run: | - python -m pytest tests/unit/ \ - -v \ - --tb=short \ - --cov=. \ - --cov-report=xml \ - --cov-report=html \ - --cov-report=term-missing \ - --json-report \ - --json-report-file=test_logs/unit_test_results.json - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - with: - file: ./coverage.xml - flags: unittests - name: codecov-umbrella - - - name: Upload unit test results - uses: actions/upload-artifact@v3 - if: always() - with: - name: unit-test-results-${{ matrix.python-version }} - path: | - test_logs/unit_test_results.json - htmlcov/ - - # 集成测试 - integration-tests: - runs-on: ubuntu-latest - name: Integration Tests - needs: [code-quality, unit-tests] - - services: - elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0 - env: - discovery.type: single-node - ES_JAVA_OPTS: -Xms1g -Xmx1g - xpack.security.enabled: false - ports: - - 9200:9200 - options: >- - --health-cmd "curl http://localhost:9200/_cluster/health" - --health-interval 10s - --health-timeout 5s - --health-retries 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y curl - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-json-report httpx - pip install -r requirements.txt - - - name: Create test logs directory - run: mkdir -p test_logs - - - name: Wait for Elasticsearch - run: | - echo "Waiting for Elasticsearch to be ready..." - for i in {1..30}; do - if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then - echo "Elasticsearch is ready" - break - fi - echo "Attempt $i/30: Elasticsearch not ready yet" - sleep 2 - done - - - name: Setup test index - run: | - curl -X PUT http://localhost:9200/test_products \ - -H 'Content-Type: application/json' \ - -d '{ - "settings": { - "number_of_shards": 1, - "number_of_replicas": 0 - }, - "mappings": { - "properties": { - "name": {"type": "text"}, - "brand_name": {"type": "text"}, - "tags": {"type": "text"}, - "price": {"type": "double"}, - "category_id": {"type": "integer"}, - "spu_id": {"type": "keyword"}, - "text_embedding": {"type": "dense_vector", "dims": 1024} - } - } - }' - - - name: Insert test data - run: | - curl -X POST http://localhost:9200/test_products/_bulk \ - -H 'Content-Type: application/json' \ - --data-binary @- << 'EOF' -{"index": {"_id": "1"}} -{"name": "红色连衣裙", "brand_name": "测试品牌", "tags": ["红色", "连衣裙", "女装"], "price": 299.0, "category_id": 1, "spu_id": "dress_001"} -{"index": {"_id": "2"}} -{"name": "蓝色连衣裙", "brand_name": "测试品牌", "tags": ["蓝色", "连衣裙", "女装"], "price": 399.0, "category_id": 1, "spu_id": "dress_002"} -{"index": {"_id": "3"}} -{"name": "智能手机", "brand_name": "科技品牌", "tags": ["智能", "手机", "数码"], "price": 2999.0, "category_id": 2, "spu_id": "phone_001"} -EOF - - - name: Run integration tests - env: - ES_HOST: http://localhost:9200 - TENANT_ID: test_tenant - TESTING_MODE: true - run: | - python -m pytest tests/integration/ \ - -v \ - --tb=short \ - -m "not slow" \ - --json-report \ - --json-report-file=test_logs/integration_test_results.json - - - name: Upload integration test results - uses: actions/upload-artifact@v3 - if: always() - with: - name: integration-test-results - path: test_logs/integration_test_results.json - - # API测试 - api-tests: - runs-on: ubuntu-latest - name: API Tests - needs: [code-quality, unit-tests] - - services: - elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0 - env: - discovery.type: single-node - ES_JAVA_OPTS: -Xms1g -Xmx1g - xpack.security.enabled: false - ports: - - 9200:9200 - options: >- - --health-cmd "curl http://localhost:9200/_cluster/health" - --health-interval 10s - --health-timeout 5s - --health-retries 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y curl - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-json-report httpx - pip install -r requirements.txt - - - name: Create test logs directory - run: mkdir -p test_logs - - - name: Wait for Elasticsearch - run: | - echo "Waiting for Elasticsearch to be ready..." - for i in {1..30}; do - if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then - echo "Elasticsearch is ready" - break - fi - echo "Attempt $i/30: Elasticsearch not ready yet" - sleep 2 - done - - - name: Setup test index and data - run: | - # 创建索引 - curl -X PUT http://localhost:9200/test_products \ - -H 'Content-Type: application/json' \ - -d '{ - "settings": {"number_of_shards": 1, "number_of_replicas": 0}, - "mappings": { - "properties": { - "name": {"type": "text"}, "brand_name": {"type": "text"}, - "tags": {"type": "text"}, "price": {"type": "double"}, - "category_id": {"type": "integer"}, "spu_id": {"type": "keyword"}, - "text_embedding": {"type": "dense_vector", "dims": 1024} - } - } - }' - - # 插入测试数据 - curl -X POST http://localhost:9200/test_products/_bulk \ - -H 'Content-Type: application/json' \ - --data-binary @- << 'EOF' -{"index": {"_id": "1"}} -{"name": "红色连衣裙", "brand_name": "测试品牌", "tags": ["红色", "连衣裙", "女装"], "price": 299.0, "category_id": 1, "spu_id": "dress_001"} -{"index": {"_id": "2"}} -{"name": "蓝色连衣裙", "brand_name": "测试品牌", "tags": ["蓝色", "连衣裙", "女装"], "price": 399.0, "category_id": 1, "spu_id": "dress_002"} -EOF - - - name: Start API service - env: - ES_HOST: http://localhost:9200 - TENANT_ID: test_tenant - API_HOST: 127.0.0.1 - API_PORT: 6003 - TESTING_MODE: true - run: | - python -m api.app \ - --host $API_HOST \ - --port $API_PORT \ - --tenant $TENANT_ID \ - --es-host $ES_HOST & - echo $! > api.pid - - # 等待API服务启动 - for i in {1..30}; do - if curl -s http://$API_HOST:$API_PORT/health > /dev/null; then - echo "API service is ready" - break - fi - echo "Attempt $i/30: API service not ready yet" - sleep 2 - done - - - name: Run API tests - env: - ES_HOST: http://localhost:9200 - API_HOST: 127.0.0.1 - API_PORT: 6003 - TENANT_ID: test_tenant - TESTING_MODE: true - run: | - python -m pytest tests/integration/test_api_integration.py \ - -v \ - --tb=short \ - --json-report \ - --json-report-file=test_logs/api_test_results.json - - - name: Stop API service - if: always() - run: | - if [ -f api.pid ]; then - kill $(cat api.pid) || true - rm api.pid - fi - - - name: Upload API test results - uses: actions/upload-artifact@v3 - if: always() - with: - name: api-test-results - path: test_logs/api_test_results.json - - # 性能测试 - performance-tests: - runs-on: ubuntu-latest - name: Performance Tests - needs: [code-quality, unit-tests] - if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' - - services: - elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0 - env: - discovery.type: single-node - ES_JAVA_OPTS: -Xms2g -Xmx2g - xpack.security.enabled: false - ports: - - 9200:9200 - options: >- - --health-cmd "curl http://localhost:9200/_cluster/health" - --health-interval 10s - --health-timeout 5s - --health-retries 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest locust - pip install -r requirements.txt - - - name: Wait for Elasticsearch - run: | - echo "Waiting for Elasticsearch to be ready..." - for i in {1..30}; do - if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then - echo "Elasticsearch is ready" - break - fi - sleep 2 - done - - - name: Setup test data - run: | - # 创建并填充测试索引 - python scripts/create_test_data.py --count 1000 - - - name: Run performance tests - env: - ES_HOST: http://localhost:9200 - TESTING_MODE: true - run: | - python scripts/run_performance_tests.py - - - name: Upload performance results - uses: actions/upload-artifact@v3 - if: always() - with: - name: performance-test-results - path: performance_results/ - - # 安全扫描 - security-scan: + service-contract-tests: runs-on: ubuntu-latest - name: Security Scan - needs: [code-quality] + name: Service Contract Tests steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install security scanning tools - run: | - python -m pip install --upgrade pip - pip install safety bandit - - - name: Run Safety (dependency check) - run: | - safety check --json --output safety_report.json || true - - - name: Run Bandit (security linter) - run: | - bandit -r . -f json -o bandit_report.json || true - - - name: Upload security reports - uses: actions/upload-artifact@v3 - if: always() - with: - name: security-reports - path: | - safety_report.json - bandit_report.json - - # 测试结果汇总 - test-summary: - runs-on: ubuntu-latest - name: Test Summary - needs: [unit-tests, integration-tests, api-tests, security-scan] - if: always() - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download all test artifacts - uses: actions/download-artifact@v3 - - - name: Generate test summary - run: | - python scripts/generate_test_summary.py - - - name: Upload final report - uses: actions/upload-artifact@v3 - with: - name: final-test-report - path: final_test_report.* - - - name: Comment PR with results - if: github.event_name == 'pull_request' - uses: actions/github-script@v6 - with: - script: | - const fs = require('fs'); - - // 读取测试报告 - let reportContent = ''; - try { - reportContent = fs.readFileSync('final_test_report.txt', 'utf8'); - } catch (e) { - console.log('Could not read report file'); - return; - } - - // 提取摘要信息 - const lines = reportContent.split('\n'); - let summary = ''; - let inSummary = false; - - for (const line of lines) { - if (line.includes('测试摘要')) { - inSummary = true; - continue; - } - if (inSummary && line.includes('测试套件详情')) { - break; - } - if (inSummary && line.trim()) { - summary += line + '\n'; - } - } - - // 构建评论内容 - const comment = `## 🧪 测试报告\n\n${summary}\n\n详细的测试报告请查看 [Artifacts](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) 部分。`; - - // 发送评论 - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: comment - }); \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run CI contract tests + run: | + python -m pytest tests/ci -q \ No newline at end of file diff --git a/README.md b/README.md index b9b4aeb..46acc12 100644 --- a/README.md +++ b/README.md @@ -1,228 +1,117 @@ -# TODO - -**多语言索引**:已改为可配置的 `index_languages`(默认为 `["en", "zh"]`),商家可勾选主市场语言。支持语言见 `config.tenant_config_loader.SUPPORTED_INDEX_LANGUAGES`(含 en, zh, zh_tw, ru, ja, ko, es, fr, pt, de, it, th, vi, id, ms, ar, hi, he, my, ta, ur, bn, pl, nl, ro, tr, km, lo, yue, cs, el, sv, hu, da, fi, uk, bg 等)。 - -前端: -搜索模态框 -点击搜索的时候,弹出 搜索模态框,参考 react、AJAX等技术来实现,搜索模态框的页面宽度和原始页面相同(占满),左侧是suggestions,右侧是即使刷新的搜索结果(每输入一个字母都刷新一次结果)。 -但是要注意:搜索过程中,后端不要触发翻译(因为输入过程中的query翻译结果会有问题),因此需要增加一个参数:搜索类型,默认为当前的回车后发起的搜索,如果是输入过程中的结果刷新则类型为typing。 - - - -多语言: -语义: -多语言: -1. dis_max的方式 -"query" : { - "dis_max" : { - "queries" : [ - {"match" : { "title.en" : xxx }}, - {"match" : { "title.zh" : xxx }}, - {"match" : { "title_xx" : xxx }} - ], - "tie_breakler" : 0.8 - } -} - - - -"corss_field": -"multi_match" : { - "query" : - "fields" : [...], - "type": "cross_fields", - "operator" : "and" -} - - -支持英文的拼写接错: -title: multi_field -"query" : { - "query_string" : { - "query": "xxx", - "default_field": "title.ngram", - "minimum_should_match": "85%" - } -} - - - -业务提权: -rescore: window_size query_weight rescore_query_weight - -function_score: -boost -以及 可以把每个子查询用function_score包一下 - - - - -query anchor -我想给elasticsearch 增加字段 query anchor ,即哪些query点击到了这个doc,一个doc下面有多个query anchor,每个query anchor又有这两个属性:weight、dweight,分别代表 query在doc下的点击分布权重、doc在query下的点击分布权重。请问该如何设计这两个ES字段。 - -需要有zh en两套query anchor,因为他们的解析器不一样。 - -他的功能是辅助召回和排序。我搜索一个query,那么每个query跟 doc中的query anchor的相关性,也就是,除了将query到title 和keywords / brief等文本字段中搜索,也到 query anchor中搜索,从而辅助召回和相关性的计算。 - - +# 电商搜索引擎 SaaS +多租户、可配置、可扩展的电商搜索平台(Shoplazza 等独立站场景)。 +README 用于给后续开发者建立统一认知:**系统框架、模块边界、设计原则、研发流程与 CI 测试入口**,帮助持续迭代时避免分叉设计与冗余代码。 -# 电商搜索引擎 SaaS +--- -一个针对跨境独立站(店匠 Shoplazza 等)的多租户可配置搜索平台。README 作为项目导航入口,帮助你在不同阶段定位到更详细的文档。 +## 1) 项目目标与边界 +- **目标**:在统一架构下支持关键词检索、语义检索、分面过滤、多语言、重排、图片检索。 +- **边界**:本仓库负责搜索核心能力与服务编排;业务方通过标准 HTTP API 对接。 +- **核心约束**: + - 调用方稳定(API/Provider 契约优先) + - 配置单一来源(`config/config.yaml` + `.env` 覆盖) + - 扩展优先走插件化(provider/backend),避免散落式分叉实现 -## 项目环境 +--- -以项目根目录的 **`activate.sh`** 为准(**优先激活 venv:`./.venv`,并加载 `.env`;兼容 Conda 回退**): +## 2) 快速开始 ```bash -# 推荐:首次创建 venv(默认安装基础依赖) +# 首次创建环境(默认基础依赖) ./scripts/create_venv.sh - -# 如需本地向量/图片编码(会安装 torch/transformers 等重依赖) -# INSTALL_ML=1 ./scripts/create_venv.sh source activate.sh -``` -新机器首次需创建环境,见 `docs/环境配置说明.md`(推荐 venv;Conda 为兼容旧流程)。 +# 启动核心服务(backend/indexer/frontend) +./run.sh -## 测试pipeline +# 可选:附加能力服务 +START_EMBEDDING=1 START_TRANSLATOR=1 START_RERANKER=1 ./run.sh -1. -店铺1 tenant_id=162: -fake数据 生成商品导入数据 提交到店匠的店铺: -cd /data/saas-search && source activate.sh && python scripts/csv_to_excel_multi_variant.py --output with_colors.xlsx +# 查看状态 +./scripts/service_ctl.sh status +``` -店铺2 tenant_id= +核心端口: +- `6002` backend(`/search/*`, `/admin/*`) +- `6004` indexer(`/indexer/*`) +- `6003` frontend +- `6005` embedding(可选) +- `6006` translator(可选) +- `6007` reranker(可选) -2. 后端:自动同步到mysql +更完整示例见 `docs/QUICKSTART.md`。 -3. mysql到ES: +--- -python scripts/recreate_and_import.py \ - --tenant-id 162 \ - --db-host \ - --db-database saas \ - --db-username saas \ - --db-password \ - --es-host http://localhost:9200 +## 3) 总体架构(开发者视角) -构造查询: -参考 @常用查询 - ES.md +- `api/`:统一 API 入口(search/admin/indexer app) +- `search/`:召回、排序、结果组织 +- `query/`:查询解析、多语言处理、改写 +- `indexer/`:MySQL 行数据 -> ES 文档的转换与索引流程 +- `providers/`:能力调用抽象(translation/embedding/rerank) +- `reranker/`:重排服务及后端实现 +- `embeddings/`:向量服务(文本/图像) +- `config/`:配置加载与服务配置解析 +关键设计:**Provider(调用方式)与 Backend(推理实现)分离**,新增能力优先在协议与工厂注册,不改调用方主流程。 -## 核心能力速览 +--- -- **多语言 + 自动翻译**:中文、英文、俄文等语言检测与路由(BGE-M3、DeepL) -- **语义 + 关键词混排**:BM25、dense vector(BGE-M3/CN-CLIP)融合 -- **布尔与分面**:AND / OR / ANDNOT / RANK、Terms & Range facets -- **多租户隔离**:共享 `search_products` 索引,通过 `tenant_id` 严格隔离 -- **可配置化**:字段/索引域/排序表达式/查询改写全部配置驱动 -- **脚本化流水线**:Mock/CSV 数据 → MySQL → Elasticsearch → API/前端 +## 4) 设计原则(避免后续分叉) -## 新人入口 +- **单一配置源**:服务地址、provider 选择、后端参数统一在 `config/config.yaml`,环境变量仅做覆盖。 +- **接口契约优先**:外部 API 契约与 provider 契约稳定,内部重构不影响调用方。 +- **扩展走工厂**:新增 provider/backend 必须在工厂函数中显式注册,禁止旁路分支。 +- **可观测性优先**:健康检查、关键日志、请求上下文必须可追踪。 +- **测试优先保障契约**:CI 首先保证接口契约和核心路径可用,再逐步扩展性能与业务测试。 -**→ 开发者必读**:[docs/DEVELOPER_GUIDE.md](docs/DEVELOPER_GUIDE.md) — 项目全貌、设计原则、扩展规范与迭代检查清单,保证后续开发在统一框架内进行。 +--- -**→ 快速上手**:[docs/QUICKSTART.md](docs/QUICKSTART.md) — 环境、服务、模块、请求示例一页搞定。 +## 5) 文档入口(建议阅读顺序) | 步骤 | 文档 | |------|------| -| 0. 框架与规范(推荐首读) | `docs/DEVELOPER_GUIDE.md` | -| 1. 环境与启动 | `docs/QUICKSTART.md` | -| 2. 搜索/索引 API | `docs/QUICKSTART.md` §3、`docs/搜索API速查表.md` | -| 3. 运维与故障 | `docs/Usage-Guide.md` | -| 4. 架构与扩展 | `docs/PROVIDER_ARCHITECTURE.md`、`docs/MODULE_EXTENSION_SPEC.md`、`docs/系统设计文档.md` | - -### Runtimes & 命令示例 +| 0. 全局规范(首读) | `docs/DEVELOPER_GUIDE.md` | +| 1. 开发与配置 | `docs/QUICKSTART.md` | +| 2. 运行与排障 | `docs/Usage-Guide.md` | +| 3. API 详细说明 | `docs/搜索API对接指南.md` | +| 4. 快速参数速查 | `docs/搜索API速查表.md` | +| 5. 首次环境搭建 | `docs/环境配置说明.md` | -```bash -# 1. 安装依赖与准备服务(环境创建见 docs/环境配置说明.md) -source activate.sh # 或先 export CONDA_ROOT=你的conda路径 -pip install -r requirements.txt # 若用 environment.yml 创建环境可省略 -docker run -d --name es -p 9200:9200 elasticsearch:8.11.0 +--- -# 2. 构造测试数据并导入 MySQL -./scripts/mock_data.sh # 详见 TEST_DATA_GUIDE.md +## 6) 持续集成测试(推荐最小集) -# 3. 创建租户索引结构并导入数据(推荐) -./scripts/create_tenant_index.sh 162 -curl -X POST "http://localhost:6004/indexer/reindex" \ - -H "Content-Type: application/json" \ - -d '{"tenant_id":"162","batch_size":500}' +本仓库提供一套轻量、稳定、易维护的 CI 测试入口,覆盖以下服务契约: -# 4. 启动核心服务(backend/indexer/frontend) -./run.sh +- 搜索接口(search API) +- 索引接口(indexer API) +- 向量服务(embedding service) +- 翻译服务(translator service) +- 重排服务(reranker service) -# (可选)附加启动 embedding / translator / reranker -START_EMBEDDING=1 START_TRANSLATOR=1 START_RERANKER=1 ./run.sh -# -# 查看服务状态 / 停止 -./scripts/service_ctl.sh status -./scripts/stop.sh +本地运行: -# 5. 调用文本搜索 API -curl -X POST http://localhost:6002/search/ \ - -H "Content-Type: application/json" \ - -H "X-Tenant-ID: 1" \ - -d '{"query": "玩具", "size": 10}' +```bash +source activate.sh +python -m pytest tests/ci -q ``` -## 文档索引 +该测试集采用 mock/stub,**不依赖真实 ES/MySQL/大模型服务**,适合作为 PR 级快速回归门禁。 -| 文档 | 用途 | -|------|------| -| `docs/DEVELOPER_GUIDE.md` | **开发者开放指南**:全貌、原则、规范、检查清单 | -| `docs/QUICKSTART.md` | 新人上手:环境、服务、模块、请求 | -| `docs/Usage-Guide.md` | 运维:日志、多环境、故障排查 | -| `docs/搜索API速查表.md` | 搜索 API 参数速查 | -| `docs/搜索API对接指南.md` | 搜索 API 完整说明 | -| `docs/PROVIDER_ARCHITECTURE.md` | 翻译/向量/重排 provider 扩展 | -| `docs/MODULE_EXTENSION_SPEC.md` | 向量/重排后端可插拔规范 | -| `docs/环境配置说明.md` | 首次部署、新机器环境 | -| `docs/系统设计文档.md` | 架构与模块细节 | - -## 关键工作流指引 - -- **数据构建 → MySQL → Elasticsearch** - - `scripts/mock_data.sh`:Tenant1 Mock + Tenant2 CSV 一条龙 - - `scripts/create_tenant_index.sh ` + `POST /indexer/reindex`:推荐导入链路 - - 详解:`测试数据指南.md` - -- **索引富化 & Java 对接** - - Java 索引程序负责:全量/增量调度 + 从 MySQL 查询 `shoplazza_product_spu/sku/option/...` - - Python `indexer` 模块负责:**MySQL 行 → ES doc** 的全部逻辑(多语言、翻译、向量、规格聚合等) - - 正式对接接口(推荐): - - `POST http://:6004/indexer/build-docs` - - 入参:`tenant_id + items[{spu, skus, options}]` - - 出参:与 `mappings/search_products.json` 完全一致的 `docs` 列表,上游自行写入 ES - - 调试/自测接口(内部使用): - - `POST http://127.0.0.1:6004/indexer/build-docs-from-db`,只需要 `tenant_id + spu_ids`,由服务内部查库并返回 ES doc - - 详解:`indexer/README.md`、`docs/索引字段说明v2.md` - -- **搜索服务 & API** - - `api/`(FastAPI)承载 REST API,`search/` + `query/` 负责查询解析与下发 - - API、分页、过滤、Facet、KNN 等:`搜索API对接指南.md` - - 对接案例、示例与错误码:`搜索API对接指南.md`、`Search-API-Examples.md` - -- **统一配置** - - 所有租户共享统一的索引结构和查询配置(硬编码) - - 索引 mapping: `mappings/search_products.json` - - 查询配置: `search/query_config.py` - - 详解:`基础配置指南.md`、`索引字段说明v2.md` - -## 仓库结构(概览) +--- -``` -api/ FastAPI 服务与路由 -config/ 字段/索引/查询配置体系 -indexer/ MySQL → ES 管道(mapping / transformer / bulk) -query/ 查询解析、改写、翻译、embedding -search/ 多语言构建、布尔解析、排序引擎 -scripts/ 数据/服务脚本(mock_data, ingest, run 等) -frontend/ 简易调试页面 -docs/ 运营及中文资料 -``` +## 7) 代码质量与持续继承要求 + +- 新增功能必须补最小测试(至少覆盖 1 条成功路径 + 1 条参数异常路径) +- 修改公共协议时必须同步更新: + - `docs/QUICKSTART.md` + - 对应服务 README / API 文档 + - `tests/ci` 契约用例 +- 禁止新增“临时分支逻辑”绕过 provider/backend 工厂 +- 优先减少重复实现,复用现有转换链路与配置解析入口 diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index a81dc8d..ca0fe10 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -27,6 +27,7 @@ 4. [模块扩展规范(Embedding / Rerank)](#4-模块扩展规范embedding--rerank) 5. [验证、日志与常见排障入口](#5-验证日志与常见排障入口) 6. [相关文档](#6-相关文档) +7. [持续集成测试(最小可维护方案)](#7-持续集成测试最小可维护方案) --- @@ -374,3 +375,32 @@ lsof -i :6004 | `indexer/README.md` | 索引模块职责与接口 | | `embeddings/README.md` | 向量化服务说明 | | `reranker/README.md` | 重排服务说明 | + +--- + +## 7. 持续集成测试(最小可维护方案) + +目标:让后续开发者在不依赖真实 ES/MySQL/模型服务的前提下,快速验证核心服务契约不被破坏。 + +### 7.1 测试范围 + +`tests/ci/test_service_api_contracts.py` 覆盖: + +- 搜索接口:`/search/`、`/search/image`、`/search/suggestions` +- 索引接口:`/indexer/reindex`、`/indexer/index`、`/indexer/build-docs` +- 向量服务:`/embed/text`、`/embed/image` +- 翻译服务:`/translate`、`/health` +- 重排服务:`/rerank`、`/health` + +### 7.2 运行方式 + +```bash +source activate.sh +python -m pytest tests/ci -q +``` + +### 7.3 设计取舍 + +- 使用 mock/stub 注入依赖,确保测试快且稳定 +- 重点测“接口契约与参数行为”,而不是底层模型质量 +- 作为 PR 级门禁;真实环境联调放在运维/预发布流程 diff --git a/scripts/run_ci_tests.sh b/scripts/run_ci_tests.sh new file mode 100755 index 0000000..e064f7a --- /dev/null +++ b/scripts/run_ci_tests.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -euo pipefail + +cd "$(dirname "$0")/.." +source ./activate.sh + +echo "Running CI contract tests..." +python -m pytest tests/ci -q diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py new file mode 100644 index 0000000..7dbe5df --- /dev/null +++ b/tests/ci/test_service_api_contracts.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any, Dict, List + +import numpy as np +import pytest +from fastapi.testclient import TestClient + + +class _FakeSearcher: + def search(self, **kwargs): + return SimpleNamespace( + results=[ + { + "spu_id": "spu-1", + "title": "测试商品", + "price": 99.0, + "currency": "USD", + "in_stock": True, + "skus": [], + "relevance_score": 1.2, + } + ], + total=1, + max_score=1.2, + took_ms=8, + facets=[], + query_info={"normalized_query": kwargs.get("query", "")}, + suggestions=[], + related_searches=[], + debug_info=None, + ) + + def search_by_image(self, **kwargs): + return self.search(**kwargs) + + +class _FakeSuggestionService: + def search(self, **kwargs): + return { + "query": kwargs["query"], + "language": kwargs.get("language", "en"), + "resolved_language": kwargs.get("language", "en"), + "suggestions": [{"text": "iphone 15", "score": 1.0}], + "took_ms": 3, + } + + +@pytest.fixture +def search_client(monkeypatch): + import api.app as search_app + + monkeypatch.setattr(search_app, "init_service", lambda es_host="": None) + monkeypatch.setattr(search_app, "get_searcher", lambda: _FakeSearcher()) + monkeypatch.setattr(search_app, "get_suggestion_service", lambda: _FakeSuggestionService()) + + with TestClient(search_app.app) as client: + yield client + + +def test_search_api_contract(search_client: TestClient): + response = search_client.post( + "/search/", + headers={"X-Tenant-ID": "162"}, + json={"query": "toy", "size": 5}, + ) + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert data["results"][0]["spu_id"] == "spu-1" + + +def test_image_search_api_contract(search_client: TestClient): + response = search_client.post( + "/search/image", + headers={"X-Tenant-ID": "162"}, + json={"image_url": "https://example.com/a.jpg", "size": 3}, + ) + assert response.status_code == 200 + assert response.json()["results"][0]["spu_id"] == "spu-1" + + +def test_suggestion_api_contract(search_client: TestClient): + response = search_client.get( + "/search/suggestions?q=iph&size=5&language=en", + headers={"X-Tenant-ID": "162"}, + ) + assert response.status_code == 200 + data = response.json() + assert data["query"] == "iph" + assert len(data["suggestions"]) == 1 + + +class _FakeBulkService: + def bulk_index(self, tenant_id: str, recreate_index: bool, batch_size: int): + return { + "tenant_id": tenant_id, + "recreate_index": recreate_index, + "batch_size": batch_size, + "success": True, + } + + +class _FakeTransformer: + def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options): + return { + "tenant_id": tenant_id, + "spu_id": str(spu_row.get("id", "0")), + "title": {"zh": str(spu_row.get("title", ""))}, + } + + +class _FakeIncrementalService: + def index_spus_to_es(self, es_client, tenant_id: str, spu_ids: List[str], delete_spu_ids=None): + return { + "tenant_id": tenant_id, + "spu_ids": [{"spu_id": s, "status": "indexed"} for s in spu_ids], + "delete_spu_ids": [], + "total": len(spu_ids), + "success_count": len(spu_ids), + "failed_count": 0, + } + + def _get_transformer_bundle(self, tenant_id: str): + return _FakeTransformer(), None, False + + +@pytest.fixture +def indexer_client(monkeypatch): + import api.indexer_app as indexer_app + import api.routes.indexer as indexer_routes + + monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None) + monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService()) + monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService()) + monkeypatch.setattr(indexer_routes, "get_es_client", lambda: object()) + + with TestClient(indexer_app.app) as client: + yield client + + +def test_indexer_reindex_contract(indexer_client: TestClient): + response = indexer_client.post( + "/indexer/reindex", + json={"tenant_id": "162", "batch_size": 100}, + ) + assert response.status_code == 200 + assert response.json()["success"] is True + + +def test_indexer_incremental_contract(indexer_client: TestClient): + response = indexer_client.post( + "/indexer/index", + json={"tenant_id": "162", "spu_ids": ["1001", "1002"]}, + ) + assert response.status_code == 200 + data = response.json() + assert data["success_count"] == 2 + + +def test_indexer_build_docs_contract(indexer_client: TestClient): + response = indexer_client.post( + "/indexer/build-docs", + json={ + "tenant_id": "162", + "items": [{"spu": {"id": 1, "title": "T-shirt"}, "skus": [], "options": []}], + }, + ) + assert response.status_code == 200 + data = response.json() + assert data["success_count"] == 1 + assert data["docs"][0]["spu_id"] == "1" + + +class _FakeTextModel: + def encode_batch(self, texts, batch_size=32, device="cpu"): + return [np.array([0.1, 0.2, 0.3], dtype=np.float32) for _ in texts] + + +class _FakeImageModel: + def encode_image_urls(self, urls, batch_size=8): + return [np.array([0.3, 0.2, 0.1], dtype=np.float32) for _ in urls] + + +@pytest.fixture +def embedding_client(): + import embeddings.server as emb_server + + emb_server.app.router.on_startup.clear() + emb_server._text_model = _FakeTextModel() + emb_server._image_model = _FakeImageModel() + + with TestClient(emb_server.app) as client: + yield client + + +def test_embedding_text_contract(embedding_client: TestClient): + response = embedding_client.post("/embed/text", json=["hello", "world"]) + assert response.status_code == 200 + data = response.json() + assert len(data) == 2 + assert len(data[0]) == 3 + + +def test_embedding_image_contract(embedding_client: TestClient): + response = embedding_client.post("/embed/image", json=["https://example.com/a.jpg"]) + assert response.status_code == 200 + assert len(response.json()[0]) == 3 + + +class _FakeTranslator: + model = "qwen" + use_cache = True + + def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None): + return f"{text}-{target_lang}" + + +@pytest.fixture +def translator_client(monkeypatch): + import api.translator_app as translator_app + + translator_app.app.router.on_startup.clear() + monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FakeTranslator()) + + with TestClient(translator_app.app) as client: + yield client + + +def test_translator_api_contract(translator_client: TestClient): + response = translator_client.post( + "/translate", + json={"text": "商品名称", "target_lang": "en", "source_lang": "zh"}, + ) + assert response.status_code == 200 + assert response.json()["translated_text"] == "商品名称-en" + + +def test_translator_health_contract(translator_client: TestClient): + response = translator_client.get("/health") + assert response.status_code == 200 + assert response.json()["status"] == "healthy" + + +class _FakeReranker: + _model_name = "fake-reranker" + + def score_with_meta(self, query: str, docs: List[str], normalize: bool = True): + scores = [float(i + 1) for i in range(len(docs))] + meta: Dict[str, Any] = {"input_docs": len(docs), "unique_docs": len(set(docs))} + return scores, meta + + +@pytest.fixture +def reranker_client(): + import reranker.server as reranker_server + + reranker_server.app.router.on_startup.clear() + reranker_server._reranker = _FakeReranker() + reranker_server._backend_name = "fake" + + with TestClient(reranker_server.app) as client: + yield client + + +def test_reranker_api_contract(reranker_client: TestClient): + response = reranker_client.post( + "/rerank", + json={"query": "wireless mouse", "docs": ["doc-a", "doc-b"]}, + ) + assert response.status_code == 200 + data = response.json() + assert data["scores"] == [1.0, 2.0] + assert data["meta"]["input_docs"] == 2 + + +def test_reranker_health_contract(reranker_client: TestClient): + response = reranker_client.get("/health") + assert response.status_code == 200 + assert response.json()["status"] == "ok" diff --git a/tests/test_cloud_embedding.py b/tests/test_cloud_embedding.py index 67b358c..dc7c381 100644 --- a/tests/test_cloud_embedding.py +++ b/tests/test_cloud_embedding.py @@ -11,6 +11,8 @@ import time from datetime import datetime from pathlib import Path +import pytest + # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -44,6 +46,7 @@ def read_queries(file_path: str, limit: int = 100) -> list: return queries +@pytest.mark.skip(reason="Requires data file and DASHSCOPE_API_KEY; run manually when needed") def test_cloud_embedding(queries_file: str, num_queries: int = 100): """ Test cloud embedding with queries from file. diff --git a/tests/test_cnclip_service.py b/tests/test_cnclip_service.py index 2fcfc7b..9751346 100755 --- a/tests/test_cnclip_service.py +++ b/tests/test_cnclip_service.py @@ -13,11 +13,17 @@ CN-CLIP 服务测试脚本 """ import sys -import numpy as np -from clip_client import Client +import pytest -def test_encoding(client, test_name, inputs): +try: + import numpy as np + from clip_client import Client +except ImportError: + pytest.skip("clip_client not installed (optional clip-as-service client)", allow_module_level=True) + + +def _test_encoding(client, test_name, inputs): """测试编码功能""" print(f"\n{test_name}...") try: @@ -74,21 +80,21 @@ def main(): results = [] # 测试1: 文本编码 - results.append(test_encoding( + results.append(_test_encoding( client, "测试1: 编码文本", ['这是一个测试文本', '另一个测试文本'] )) # 测试2: 图像编码 - results.append(test_encoding( + results.append(_test_encoding( client, "测试2: 编码图像(远程 URL)", ['https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg'] )) # 测试3: 混合编码 - results.append(test_encoding( + results.append(_test_encoding( client, "测试3: 混合编码(文本和图像)", ['这是一段文本', 'https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg'] -- libgit2 0.21.2