tests

tangwang
1 parent bc54124c
Showing 7 changed files with 435 additions and 723 deletions Show diff stats
.github/workflows/test.yml
README.md
docs/QUICKSTART.md
scripts/run_ci_tests.sh
tests/ci/test_service_api_contracts.py
tests/test_cloud_embedding.py
tests/test_cnclip_service.py
-name: saas-search Test Pipeline
+name: CI - Service Contracts
 on:
   push:
-    branches: [ main, master, develop ]
+    branches: [main, master, develop]
   pull_request:
-    branches: [ main, master, develop ]
-  workflow_dispatch:  # 允许手动触发
-
-env:
-  PYTHON_VERSION: '3.9'
-  NODE_VERSION: '16'
+    branches: [main, master, develop]
+  workflow_dispatch:
 jobs:
-  # 代码质量检查
-  code-quality:
-    runs-on: ubuntu-latest
-    name: Code Quality Check
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 black isort mypy pylint
-        pip install -r requirements.txt
-
-    - name: Run Black (code formatting)
-      run: |
-        black --check --diff .
-
-    - name: Run isort (import sorting)
-      run: |
-        isort --check-only --diff .
-
-    - name: Run Flake8 (linting)
-      run: |
-        flake8 --max-line-length=100 --ignore=E203,W503 .
-
-    - name: Run MyPy (type checking)
-      run: |
-        mypy --ignore-missing-imports --no-strict-optional .
-
-    - name: Run Pylint
-      run: |
-        pylint --disable=C0114,C0115,C0116 --errors-only .
-
-  # 单元测试
-  unit-tests:
-    runs-on: ubuntu-latest
-    name: Unit Tests
-
-    strategy:
-      matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-
-    - name: Cache pip dependencies
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-pip-
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pytest pytest-cov pytest-json-report
-        pip install -r requirements.txt
-
-    - name: Create test logs directory
-      run: mkdir -p test_logs
-
-    - name: Run unit tests
-      run: |
-        python -m pytest tests/unit/ \
-          -v \
-          --tb=short \
-          --cov=. \
-          --cov-report=xml \
-          --cov-report=html \
-          --cov-report=term-missing \
-          --json-report \
-          --json-report-file=test_logs/unit_test_results.json
-
-    - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v3
-      with:
-        file: ./coverage.xml
-        flags: unittests
-        name: codecov-umbrella
-
-    - name: Upload unit test results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: unit-test-results-${{ matrix.python-version }}
-        path: |
-          test_logs/unit_test_results.json
-          htmlcov/
-
-  # 集成测试
-  integration-tests:
-    runs-on: ubuntu-latest
-    name: Integration Tests
-    needs: [code-quality, unit-tests]
-
-    services:
-      elasticsearch:
-        image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0
-        env:
-          discovery.type: single-node
-          ES_JAVA_OPTS: -Xms1g -Xmx1g
-          xpack.security.enabled: false
-        ports:
-          - 9200:9200
-        options: >-
-          --health-cmd "curl http://localhost:9200/_cluster/health"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 10
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
-
-    - name: Install system dependencies
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y curl
-
-    - name: Install Python dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pytest pytest-json-report httpx
-        pip install -r requirements.txt
-
-    - name: Create test logs directory
-      run: mkdir -p test_logs
-
-    - name: Wait for Elasticsearch
-      run: |
-        echo "Waiting for Elasticsearch to be ready..."
-        for i in {1..30}; do
-          if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then
-            echo "Elasticsearch is ready"
-            break
-          fi
-          echo "Attempt $i/30: Elasticsearch not ready yet"
-          sleep 2
-        done
-
-    - name: Setup test index
-      run: |
-        curl -X PUT http://localhost:9200/test_products \
-          -H 'Content-Type: application/json' \
-          -d '{
-            "settings": {
-              "number_of_shards": 1,
-              "number_of_replicas": 0
-            },
-            "mappings": {
-              "properties": {
-                "name": {"type": "text"},
-                "brand_name": {"type": "text"},
-                "tags": {"type": "text"},
-                "price": {"type": "double"},
-                "category_id": {"type": "integer"},
-                "spu_id": {"type": "keyword"},
-                "text_embedding": {"type": "dense_vector", "dims": 1024}
-              }
-            }
-          }'
-
-    - name: Insert test data
-      run: |
-        curl -X POST http://localhost:9200/test_products/_bulk \
-          -H 'Content-Type: application/json' \
-          --data-binary @- << 'EOF'
-{"index": {"_id": "1"}}
-{"name": "红色连衣裙", "brand_name": "测试品牌", "tags": ["红色", "连衣裙", "女装"], "price": 299.0, "category_id": 1, "spu_id": "dress_001"}
-{"index": {"_id": "2"}}
-{"name": "蓝色连衣裙", "brand_name": "测试品牌", "tags": ["蓝色", "连衣裙", "女装"], "price": 399.0, "category_id": 1, "spu_id": "dress_002"}
-{"index": {"_id": "3"}}
-{"name": "智能手机", "brand_name": "科技品牌", "tags": ["智能", "手机", "数码"], "price": 2999.0, "category_id": 2, "spu_id": "phone_001"}
-EOF
-
-    - name: Run integration tests
-      env:
-        ES_HOST: http://localhost:9200
-        TENANT_ID: test_tenant
-        TESTING_MODE: true
-      run: |
-        python -m pytest tests/integration/ \
-          -v \
-          --tb=short \
-          -m "not slow" \
-          --json-report \
-          --json-report-file=test_logs/integration_test_results.json
-
-    - name: Upload integration test results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: integration-test-results
-        path: test_logs/integration_test_results.json
-
-  # API测试
-  api-tests:
-    runs-on: ubuntu-latest
-    name: API Tests
-    needs: [code-quality, unit-tests]
-
-    services:
-      elasticsearch:
-        image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0
-        env:
-          discovery.type: single-node
-          ES_JAVA_OPTS: -Xms1g -Xmx1g
-          xpack.security.enabled: false
-        ports:
-          - 9200:9200
-        options: >-
-          --health-cmd "curl http://localhost:9200/_cluster/health"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 10
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
-
-    - name: Install system dependencies
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y curl
-
-    - name: Install Python dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pytest pytest-json-report httpx
-        pip install -r requirements.txt
-
-    - name: Create test logs directory
-      run: mkdir -p test_logs
-
-    - name: Wait for Elasticsearch
-      run: |
-        echo "Waiting for Elasticsearch to be ready..."
-        for i in {1..30}; do
-          if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then
-            echo "Elasticsearch is ready"
-            break
-          fi
-          echo "Attempt $i/30: Elasticsearch not ready yet"
-          sleep 2
-        done
-
-    - name: Setup test index and data
-      run: |
-        # 创建索引
-        curl -X PUT http://localhost:9200/test_products \
-          -H 'Content-Type: application/json' \
-          -d '{
-            "settings": {"number_of_shards": 1, "number_of_replicas": 0},
-            "mappings": {
-              "properties": {
-                "name": {"type": "text"}, "brand_name": {"type": "text"},
-                "tags": {"type": "text"}, "price": {"type": "double"},
-                "category_id": {"type": "integer"}, "spu_id": {"type": "keyword"},
-                "text_embedding": {"type": "dense_vector", "dims": 1024}
-              }
-            }
-          }'
-
-        # 插入测试数据
-        curl -X POST http://localhost:9200/test_products/_bulk \
-          -H 'Content-Type: application/json' \
-          --data-binary @- << 'EOF'
-{"index": {"_id": "1"}}
-{"name": "红色连衣裙", "brand_name": "测试品牌", "tags": ["红色", "连衣裙", "女装"], "price": 299.0, "category_id": 1, "spu_id": "dress_001"}
-{"index": {"_id": "2"}}
-{"name": "蓝色连衣裙", "brand_name": "测试品牌", "tags": ["蓝色", "连衣裙", "女装"], "price": 399.0, "category_id": 1, "spu_id": "dress_002"}
-EOF
-
-    - name: Start API service
-      env:
-        ES_HOST: http://localhost:9200
-        TENANT_ID: test_tenant
-        API_HOST: 127.0.0.1
-        API_PORT: 6003
-        TESTING_MODE: true
-      run: |
-        python -m api.app \
-          --host $API_HOST \
-          --port $API_PORT \
-          --tenant $TENANT_ID \
-          --es-host $ES_HOST &
-        echo $! > api.pid
-
-        # 等待API服务启动
-        for i in {1..30}; do
-          if curl -s http://$API_HOST:$API_PORT/health > /dev/null; then
-            echo "API service is ready"
-            break
-          fi
-          echo "Attempt $i/30: API service not ready yet"
-          sleep 2
-        done
-
-    - name: Run API tests
-      env:
-        ES_HOST: http://localhost:9200
-        API_HOST: 127.0.0.1
-        API_PORT: 6003
-        TENANT_ID: test_tenant
-        TESTING_MODE: true
-      run: |
-        python -m pytest tests/integration/test_api_integration.py \
-          -v \
-          --tb=short \
-          --json-report \
-          --json-report-file=test_logs/api_test_results.json
-
-    - name: Stop API service
-      if: always()
-      run: |
-        if [ -f api.pid ]; then
-          kill $(cat api.pid) || true
-          rm api.pid
-        fi
-
-    - name: Upload API test results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: api-test-results
-        path: test_logs/api_test_results.json
-
-  # 性能测试
-  performance-tests:
-    runs-on: ubuntu-latest
-    name: Performance Tests
-    needs: [code-quality, unit-tests]
-    if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
-
-    services:
-      elasticsearch:
-        image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0
-        env:
-          discovery.type: single-node
-          ES_JAVA_OPTS: -Xms2g -Xmx2g
-          xpack.security.enabled: false
-        ports:
-          - 9200:9200
-        options: >-
-          --health-cmd "curl http://localhost:9200/_cluster/health"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 10
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pytest locust
-        pip install -r requirements.txt
-
-    - name: Wait for Elasticsearch
-      run: |
-        echo "Waiting for Elasticsearch to be ready..."
-        for i in {1..30}; do
-          if curl -s http://localhost:9200/_cluster/health | grep -q '"status":"green\|yellow"'; then
-            echo "Elasticsearch is ready"
-            break
-          fi
-          sleep 2
-        done
-
-    - name: Setup test data
-      run: |
-        # 创建并填充测试索引
-        python scripts/create_test_data.py --count 1000
-
-    - name: Run performance tests
-      env:
-        ES_HOST: http://localhost:9200
-        TESTING_MODE: true
-      run: |
-        python scripts/run_performance_tests.py
-
-    - name: Upload performance results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: performance-test-results
-        path: performance_results/
-
-  # 安全扫描
-  security-scan:
+  service-contract-tests:
     runs-on: ubuntu-latest
-    name: Security Scan
-    needs: [code-quality]
+    name: Service Contract Tests
     steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
-
-    - name: Install security scanning tools
-      run: |
-        python -m pip install --upgrade pip
-        pip install safety bandit
-
-    - name: Run Safety (dependency check)
-      run: |
-        safety check --json --output safety_report.json || true
-
-    - name: Run Bandit (security linter)
-      run: |
-        bandit -r . -f json -o bandit_report.json || true
-
-    - name: Upload security reports
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: security-reports
-        path: |
-          safety_report.json
-          bandit_report.json
-
-  # 测试结果汇总
-  test-summary:
-    runs-on: ubuntu-latest
-    name: Test Summary
-    needs: [unit-tests, integration-tests, api-tests, security-scan]
-    if: always()
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Download all test artifacts
-      uses: actions/download-artifact@v3
-
-    - name: Generate test summary
-      run: |
-        python scripts/generate_test_summary.py
-
-    - name: Upload final report
-      uses: actions/upload-artifact@v3
-      with:
-        name: final-test-report
-        path: final_test_report.*
-
-    - name: Comment PR with results
-      if: github.event_name == 'pull_request'
-      uses: actions/github-script@v6
-      with:
-        script: |
-          const fs = require('fs');
-
-          // 读取测试报告
-          let reportContent = '';
-          try {
-            reportContent = fs.readFileSync('final_test_report.txt', 'utf8');
-          } catch (e) {
-            console.log('Could not read report file');
-            return;
-          }
-
-          // 提取摘要信息
-          const lines = reportContent.split('\n');
-          let summary = '';
-          let inSummary = false;
-
-          for (const line of lines) {
-            if (line.includes('测试摘要')) {
-              inSummary = true;
-              continue;
-            }
-            if (inSummary && line.includes('测试套件详情')) {
-              break;
-            }
-            if (inSummary && line.trim()) {
-              summary += line + '\n';
-            }
-          }
-
-          // 构建评论内容
-          const comment = `## 🧪 测试报告\n\n${summary}\n\n详细的测试报告请查看 [Artifacts](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) 部分。`;
-
-          // 发送评论
-          github.rest.issues.createComment({
-            issue_number: context.issue.number,
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            body: comment
-          });
 \ No newline at end of file
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run CI contract tests
+        run: |
+          python -m pytest tests/ci -q
 \ No newline at end of file
-# TODO
-
-**多语言索引**：已改为可配置的 `index_languages`（默认为 `["en", "zh"]`），商家可勾选主市场语言。支持语言见 `config.tenant_config_loader.SUPPORTED_INDEX_LANGUAGES`（含 en, zh, zh_tw, ru, ja, ko, es, fr, pt, de, it, th, vi, id, ms, ar, hi, he, my, ta, ur, bn, pl, nl, ro, tr, km, lo, yue, cs, el, sv, hu, da, fi, uk, bg 等）。
-
-前端：
-搜索模态框
-点击搜索的时候，弹出 搜索模态框，参考 react、AJAX等技术来实现，搜索模态框的页面宽度和原始页面相同（占满），左侧是suggestions，右侧是即使刷新的搜索结果（每输入一个字母都刷新一次结果）。
-但是要注意：搜索过程中，后端不要触发翻译（因为输入过程中的query翻译结果会有问题），因此需要增加一个参数：搜索类型，默认为当前的回车后发起的搜索，如果是输入过程中的结果刷新则类型为typing。
-
-
-
-多语言：
-语义：
-多语言：
-1. dis_max的方式
-"query" : {
-  "dis_max" : {
-    "queries" : [
-      {"match" : { "title.en" : xxx }},
-      {"match" : { "title.zh" : xxx }},
-      {"match" : { "title_xx" : xxx }}    
-    ],
-    "tie_breakler" : 0.8
-  }
-}
-
-
-
-"corss_field":
-"multi_match" : {
-  "query" : 
-  "fields" : [...],
-  "type": "cross_fields",
-  "operator" : "and"
-}
-
-
-支持英文的拼写接错：
-title： multi_field
-"query" : {
-  "query_string" : {
-    "query": "xxx",
-    "default_field": "title.ngram",
-    "minimum_should_match": "85%"
-  }
-}
-
-
-
-业务提权：
-rescore： window_size query_weight rescore_query_weight
-
-function_score:
-boost
-以及 可以把每个子查询用function_score包一下
-
-
-
-
-query anchor  
-我想给elasticsearch 增加字段 query anchor ，即哪些query点击到了这个doc，一个doc下面有多个query anchor，每个query anchor又有这两个属性：weight、dweight，分别代表 query在doc下的点击分布权重、doc在query下的点击分布权重。请问该如何设计这两个ES字段。
-
-需要有zh en两套query anchor，因为他们的解析器不一样。
-
-他的功能是辅助召回和排序。我搜索一个query，那么每个query跟 doc中的query anchor的相关性，也就是，除了将query到title 和keywords / brief等文本字段中搜索，也到 query anchor中搜索，从而辅助召回和相关性的计算。
-
-
+# 电商搜索引擎 SaaS
+多租户、可配置、可扩展的电商搜索平台（Shoplazza 等独立站场景）。
+README 用于给后续开发者建立统一认知：**系统框架、模块边界、设计原则、研发流程与 CI 测试入口**，帮助持续迭代时避免分叉设计与冗余代码。
-# 电商搜索引擎 SaaS
+---
-一个针对跨境独立站（店匠 Shoplazza 等）的多租户可配置搜索平台。README 作为项目导航入口，帮助你在不同阶段定位到更详细的文档。
+## 1) 项目目标与边界
+- **目标**：在统一架构下支持关键词检索、语义检索、分面过滤、多语言、重排、图片检索。
+- **边界**：本仓库负责搜索核心能力与服务编排；业务方通过标准 HTTP API 对接。
+- **核心约束**：
+  - 调用方稳定（API/Provider 契约优先）
+  - 配置单一来源（`config/config.yaml` + `.env` 覆盖）
+  - 扩展优先走插件化（provider/backend），避免散落式分叉实现
-## 项目环境
+---
-以项目根目录的 **`activate.sh`** 为准（**优先激活 venv：`./.venv`，并加载 `.env`；兼容 Conda 回退**）：
+## 2) 快速开始
 ```bash
-# 推荐：首次创建 venv（默认安装基础依赖）
+# 首次创建环境（默认基础依赖）
 ./scripts/create_venv.sh
-
-# 如需本地向量/图片编码（会安装 torch/transformers 等重依赖）
-# INSTALL_ML=1 ./scripts/create_venv.sh
 source activate.sh
-```
-新机器首次需创建环境，见 `docs/环境配置说明.md`（推荐 venv；Conda 为兼容旧流程）。
+# 启动核心服务（backend/indexer/frontend）
+./run.sh
-## 测试pipeline
+# 可选：附加能力服务
+START_EMBEDDING=1 START_TRANSLATOR=1 START_RERANKER=1 ./run.sh
-1. 
-店铺1  tenant_id=162：
-fake数据 生成商品导入数据 提交到店匠的店铺：
-cd /data/saas-search && source activate.sh && python scripts/csv_to_excel_multi_variant.py --output with_colors.xlsx
+# 查看状态
+./scripts/service_ctl.sh status
+```
-店铺2  tenant_id= 
+核心端口：
+- `6002` backend（`/search/*`, `/admin/*`）
+- `6004` indexer（`/indexer/*`）
+- `6003` frontend
+- `6005` embedding（可选）
+- `6006` translator（可选）
+- `6007` reranker（可选）
-2. 后端：自动同步到mysql
+更完整示例见 `docs/QUICKSTART.md`。
-3. mysql到ES：
+---
-python scripts/recreate_and_import.py \
-    --tenant-id 162 \
-    --db-host <mysql_host> \
-    --db-database saas \
-    --db-username saas \
-    --db-password <password> \
-    --es-host http://localhost:9200
+## 3) 总体架构（开发者视角）
-构造查询：
-参考 @常用查询 - ES.md
+- `api/`：统一 API 入口（search/admin/indexer app）
+- `search/`：召回、排序、结果组织
+- `query/`：查询解析、多语言处理、改写
+- `indexer/`：MySQL 行数据 -> ES 文档的转换与索引流程
+- `providers/`：能力调用抽象（translation/embedding/rerank）
+- `reranker/`：重排服务及后端实现
+- `embeddings/`：向量服务（文本/图像）
+- `config/`：配置加载与服务配置解析
+关键设计：**Provider（调用方式）与 Backend（推理实现）分离**，新增能力优先在协议与工厂注册，不改调用方主流程。
-## 核心能力速览
+---
-- **多语言 + 自动翻译**：中文、英文、俄文等语言检测与路由（BGE-M3、DeepL）
-- **语义 + 关键词混排**：BM25、dense vector（BGE-M3/CN-CLIP）融合
-- **布尔与分面**：AND / OR / ANDNOT / RANK、Terms & Range facets
-- **多租户隔离**：共享 `search_products` 索引，通过 `tenant_id` 严格隔离
-- **可配置化**：字段/索引域/排序表达式/查询改写全部配置驱动
-- **脚本化流水线**：Mock/CSV 数据 → MySQL → Elasticsearch → API/前端
+## 4) 设计原则（避免后续分叉）
-## 新人入口
+- **单一配置源**：服务地址、provider 选择、后端参数统一在 `config/config.yaml`，环境变量仅做覆盖。
+- **接口契约优先**：外部 API 契约与 provider 契约稳定，内部重构不影响调用方。
+- **扩展走工厂**：新增 provider/backend 必须在工厂函数中显式注册，禁止旁路分支。
+- **可观测性优先**：健康检查、关键日志、请求上下文必须可追踪。
+- **测试优先保障契约**：CI 首先保证接口契约和核心路径可用，再逐步扩展性能与业务测试。
-**→ 开发者必读**：[docs/DEVELOPER_GUIDE.md](docs/DEVELOPER_GUIDE.md) — 项目全貌、设计原则、扩展规范与迭代检查清单，保证后续开发在统一框架内进行。
+---
-**→ 快速上手**：[docs/QUICKSTART.md](docs/QUICKSTART.md) — 环境、服务、模块、请求示例一页搞定。
+## 5) 文档入口（建议阅读顺序）
 | 步骤 | 文档 |
 |------|------|
-| 0. 框架与规范（推荐首读） | `docs/DEVELOPER_GUIDE.md` |
-| 1. 环境与启动 | `docs/QUICKSTART.md` |
-| 2. 搜索/索引 API | `docs/QUICKSTART.md` §3、`docs/搜索API速查表.md` |
-| 3. 运维与故障 | `docs/Usage-Guide.md` |
-| 4. 架构与扩展 | `docs/PROVIDER_ARCHITECTURE.md`、`docs/MODULE_EXTENSION_SPEC.md`、`docs/系统设计文档.md` |
-
-### Runtimes & 命令示例
+| 0. 全局规范（首读） | `docs/DEVELOPER_GUIDE.md` |
+| 1. 开发与配置 | `docs/QUICKSTART.md` |
+| 2. 运行与排障 | `docs/Usage-Guide.md` |
+| 3. API 详细说明 | `docs/搜索API对接指南.md` |
+| 4. 快速参数速查 | `docs/搜索API速查表.md` |
+| 5. 首次环境搭建 | `docs/环境配置说明.md` |
-```bash
-# 1. 安装依赖与准备服务（环境创建见 docs/环境配置说明.md）
-source activate.sh   # 或先 export CONDA_ROOT=你的conda路径
-pip install -r requirements.txt   # 若用 environment.yml 创建环境可省略
-docker run -d --name es -p 9200:9200 elasticsearch:8.11.0
+---
-# 2. 构造测试数据并导入 MySQL
-./scripts/mock_data.sh                               # 详见 TEST_DATA_GUIDE.md
+## 6) 持续集成测试（推荐最小集）
-# 3. 创建租户索引结构并导入数据（推荐）
-./scripts/create_tenant_index.sh 162
-curl -X POST "http://localhost:6004/indexer/reindex" \
-  -H "Content-Type: application/json" \
-  -d '{"tenant_id":"162","batch_size":500}'
+本仓库提供一套轻量、稳定、易维护的 CI 测试入口，覆盖以下服务契约：
-# 4. 启动核心服务（backend/indexer/frontend）
-./run.sh
+- 搜索接口（search API）
+- 索引接口（indexer API）
+- 向量服务（embedding service）
+- 翻译服务（translator service）
+- 重排服务（reranker service）
-# （可选）附加启动 embedding / translator / reranker
-START_EMBEDDING=1 START_TRANSLATOR=1 START_RERANKER=1 ./run.sh
-#
-# 查看服务状态 / 停止
-./scripts/service_ctl.sh status
-./scripts/stop.sh
+本地运行：
-# 5. 调用文本搜索 API
-curl -X POST http://localhost:6002/search/ \
-  -H "Content-Type: application/json" \
-  -H "X-Tenant-ID: 1" \
-  -d '{"query": "玩具", "size": 10}'
+```bash
+source activate.sh
+python -m pytest tests/ci -q
 ```
-## 文档索引
+该测试集采用 mock/stub，**不依赖真实 ES/MySQL/大模型服务**，适合作为 PR 级快速回归门禁。
-| 文档 | 用途 |
-|------|------|
-| `docs/DEVELOPER_GUIDE.md` | **开发者开放指南**：全貌、原则、规范、检查清单 |
-| `docs/QUICKSTART.md` | 新人上手：环境、服务、模块、请求 |
-| `docs/Usage-Guide.md` | 运维：日志、多环境、故障排查 |
-| `docs/搜索API速查表.md` | 搜索 API 参数速查 |
-| `docs/搜索API对接指南.md` | 搜索 API 完整说明 |
-| `docs/PROVIDER_ARCHITECTURE.md` | 翻译/向量/重排 provider 扩展 |
-| `docs/MODULE_EXTENSION_SPEC.md` | 向量/重排后端可插拔规范 |
-| `docs/环境配置说明.md` | 首次部署、新机器环境 |
-| `docs/系统设计文档.md` | 架构与模块细节 |
-
-## 关键工作流指引
-
-- **数据构建 → MySQL → Elasticsearch**  
-  - `scripts/mock_data.sh`：Tenant1 Mock + Tenant2 CSV 一条龙  
-  - `scripts/create_tenant_index.sh <tenant_id>` + `POST /indexer/reindex`：推荐导入链路
-  - 详解：`测试数据指南.md`
-
-- **索引富化 & Java 对接**  
-  - Java 索引程序负责：全量/增量调度 + 从 MySQL 查询 `shoplazza_product_spu/sku/option/...`  
-  - Python `indexer` 模块负责：**MySQL 行 → ES doc** 的全部逻辑（多语言、翻译、向量、规格聚合等）  
-  - 正式对接接口（推荐）：  
-    - `POST http://<indexer_host>:6004/indexer/build-docs`  
-      - 入参：`tenant_id + items[{spu, skus, options}]`  
-      - 出参：与 `mappings/search_products.json` 完全一致的 `docs` 列表，上游自行写入 ES  
-  - 调试/自测接口（内部使用）：  
-    - `POST http://127.0.0.1:6004/indexer/build-docs-from-db`，只需要 `tenant_id + spu_ids`，由服务内部查库并返回 ES doc  
-  - 详解：`indexer/README.md`、`docs/索引字段说明v2.md`
-
-- **搜索服务 & API**  
-  - `api/`（FastAPI）承载 REST API，`search/` + `query/` 负责查询解析与下发  
-  - API、分页、过滤、Facet、KNN 等：`搜索API对接指南.md`
-  - 对接案例、示例与错误码：`搜索API对接指南.md`、`Search-API-Examples.md`
-
-- **统一配置**  
-  - 所有租户共享统一的索引结构和查询配置（硬编码）  
-  - 索引 mapping: `mappings/search_products.json`  
-  - 查询配置: `search/query_config.py`  
-  - 详解：`基础配置指南.md`、`索引字段说明v2.md`
-
-## 仓库结构（概览）
+---
-```
-api/            FastAPI 服务与路由
-config/         字段/索引/查询配置体系
-indexer/        MySQL → ES 管道（mapping / transformer / bulk）
-query/          查询解析、改写、翻译、embedding
-search/         多语言构建、布尔解析、排序引擎
-scripts/        数据/服务脚本（mock_data, ingest, run 等）
-frontend/       简易调试页面
-docs/           运营及中文资料
-```
+## 7) 代码质量与持续集成要求
+
+- 新增功能必须补最小测试（至少覆盖 1 条成功路径 + 1 条参数异常路径）
+- 修改公共协议时必须同步更新：
+  - `docs/QUICKSTART.md`
+  - 对应服务 README / API 文档
+  - `tests/ci` 契约用例
+- 禁止新增“临时分支逻辑”绕过 provider/backend 工厂
+- 优先减少重复实现，复用现有转换链路与配置解析入口
@@ -27,6 +27,7 @@
 4. [模块扩展规范（Embedding / Rerank）](#4-模块扩展规范embedding--rerank)
 5. [验证、日志与常见排障入口](#5-验证日志与常见排障入口)
 6. [相关文档](#6-相关文档)
+7. [持续集成测试（最小可维护方案）](#7-持续集成测试最小可维护方案)
 ---
@@ -374,3 +375,32 @@ lsof -i :6004
 | `indexer/README.md` | 索引模块职责与接口 |
 | `embeddings/README.md` | 向量化服务说明 |
 | `reranker/README.md` | 重排服务说明 |
+
+---
+
+## 7. 持续集成测试（最小可维护方案）
+
+目标：让后续开发者在不依赖真实 ES/MySQL/模型服务的前提下，快速验证核心服务契约不被破坏。
+
+### 7.1 测试范围
+
+`tests/ci/test_service_api_contracts.py` 覆盖：
+
+- 搜索接口：`/search/`、`/search/image`、`/search/suggestions`
+- 索引接口：`/indexer/reindex`、`/indexer/index`、`/indexer/build-docs`
+- 向量服务：`/embed/text`、`/embed/image`
+- 翻译服务：`/translate`、`/health`
+- 重排服务：`/rerank`、`/health`
+
+### 7.2 运行方式
+
+```bash
+source activate.sh
+python -m pytest tests/ci -q
+```
+
+### 7.3 设计取舍
+
+- 使用 mock/stub 注入依赖，确保测试快且稳定
+- 重点测“接口契约与参数行为”，而不是底层模型质量
+- 作为 PR 级门禁；真实环境联调放在运维/预发布流程
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+source ./activate.sh
+
+echo "Running CI contract tests..."
+python -m pytest tests/ci -q
@@ -0,0 +1,281 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any, Dict, List
+
+import numpy as np
+import pytest
+from fastapi.testclient import TestClient
+
+
+class _FakeSearcher:
+    def search(self, **kwargs):
+        return SimpleNamespace(
+            results=[
+                {
+                    "spu_id": "spu-1",
+                    "title": "测试商品",
+                    "price": 99.0,
+                    "currency": "USD",
+                    "in_stock": True,
+                    "skus": [],
+                    "relevance_score": 1.2,
+                }
+            ],
+            total=1,
+            max_score=1.2,
+            took_ms=8,
+            facets=[],
+            query_info={"normalized_query": kwargs.get("query", "")},
+            suggestions=[],
+            related_searches=[],
+            debug_info=None,
+        )
+
+    def search_by_image(self, **kwargs):
+        return self.search(**kwargs)
+
+
+class _FakeSuggestionService:
+    def search(self, **kwargs):
+        return {
+            "query": kwargs["query"],
+            "language": kwargs.get("language", "en"),
+            "resolved_language": kwargs.get("language", "en"),
+            "suggestions": [{"text": "iphone 15", "score": 1.0}],
+            "took_ms": 3,
+        }
+
+
+@pytest.fixture
+def search_client(monkeypatch):
+    import api.app as search_app
+
+    monkeypatch.setattr(search_app, "init_service", lambda es_host="": None)
+    monkeypatch.setattr(search_app, "get_searcher", lambda: _FakeSearcher())
+    monkeypatch.setattr(search_app, "get_suggestion_service", lambda: _FakeSuggestionService())
+
+    with TestClient(search_app.app) as client:
+        yield client
+
+
+def test_search_api_contract(search_client: TestClient):
+    response = search_client.post(
+        "/search/",
+        headers={"X-Tenant-ID": "162"},
+        json={"query": "toy", "size": 5},
+    )
+    assert response.status_code == 200
+    data = response.json()
+    assert data["total"] == 1
+    assert data["results"][0]["spu_id"] == "spu-1"
+
+
+def test_image_search_api_contract(search_client: TestClient):
+    response = search_client.post(
+        "/search/image",
+        headers={"X-Tenant-ID": "162"},
+        json={"image_url": "https://example.com/a.jpg", "size": 3},
+    )
+    assert response.status_code == 200
+    assert response.json()["results"][0]["spu_id"] == "spu-1"
+
+
+def test_suggestion_api_contract(search_client: TestClient):
+    response = search_client.get(
+        "/search/suggestions?q=iph&size=5&language=en",
+        headers={"X-Tenant-ID": "162"},
+    )
+    assert response.status_code == 200
+    data = response.json()
+    assert data["query"] == "iph"
+    assert len(data["suggestions"]) == 1
+
+
+class _FakeBulkService:
+    def bulk_index(self, tenant_id: str, recreate_index: bool, batch_size: int):
+        return {
+            "tenant_id": tenant_id,
+            "recreate_index": recreate_index,
+            "batch_size": batch_size,
+            "success": True,
+        }
+
+
+class _FakeTransformer:
+    def transform_spu_to_doc(self, tenant_id: str, spu_row, skus, options):
+        return {
+            "tenant_id": tenant_id,
+            "spu_id": str(spu_row.get("id", "0")),
+            "title": {"zh": str(spu_row.get("title", ""))},
+        }
+
+
+class _FakeIncrementalService:
+    def index_spus_to_es(self, es_client, tenant_id: str, spu_ids: List[str], delete_spu_ids=None):
+        return {
+            "tenant_id": tenant_id,
+            "spu_ids": [{"spu_id": s, "status": "indexed"} for s in spu_ids],
+            "delete_spu_ids": [],
+            "total": len(spu_ids),
+            "success_count": len(spu_ids),
+            "failed_count": 0,
+        }
+
+    def _get_transformer_bundle(self, tenant_id: str):
+        return _FakeTransformer(), None, False
+
+
+@pytest.fixture
+def indexer_client(monkeypatch):
+    import api.indexer_app as indexer_app
+    import api.routes.indexer as indexer_routes
+
+    monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None)
+    monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService())
+    monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService())
+    monkeypatch.setattr(indexer_routes, "get_es_client", lambda: object())
+
+    with TestClient(indexer_app.app) as client:
+        yield client
+
+
+def test_indexer_reindex_contract(indexer_client: TestClient):
+    response = indexer_client.post(
+        "/indexer/reindex",
+        json={"tenant_id": "162", "batch_size": 100},
+    )
+    assert response.status_code == 200
+    assert response.json()["success"] is True
+
+
+def test_indexer_incremental_contract(indexer_client: TestClient):
+    response = indexer_client.post(
+        "/indexer/index",
+        json={"tenant_id": "162", "spu_ids": ["1001", "1002"]},
+    )
+    assert response.status_code == 200
+    data = response.json()
+    assert data["success_count"] == 2
+
+
+def test_indexer_build_docs_contract(indexer_client: TestClient):
+    response = indexer_client.post(
+        "/indexer/build-docs",
+        json={
+            "tenant_id": "162",
+            "items": [{"spu": {"id": 1, "title": "T-shirt"}, "skus": [], "options": []}],
+        },
+    )
+    assert response.status_code == 200
+    data = response.json()
+    assert data["success_count"] == 1
+    assert data["docs"][0]["spu_id"] == "1"
+
+
+class _FakeTextModel:
+    def encode_batch(self, texts, batch_size=32, device="cpu"):
+        return [np.array([0.1, 0.2, 0.3], dtype=np.float32) for _ in texts]
+
+
+class _FakeImageModel:
+    def encode_image_urls(self, urls, batch_size=8):
+        return [np.array([0.3, 0.2, 0.1], dtype=np.float32) for _ in urls]
+
+
+@pytest.fixture
+def embedding_client():
+    import embeddings.server as emb_server
+
+    emb_server.app.router.on_startup.clear()
+    emb_server._text_model = _FakeTextModel()
+    emb_server._image_model = _FakeImageModel()
+
+    with TestClient(emb_server.app) as client:
+        yield client
+
+
+def test_embedding_text_contract(embedding_client: TestClient):
+    response = embedding_client.post("/embed/text", json=["hello", "world"])
+    assert response.status_code == 200
+    data = response.json()
+    assert len(data) == 2
+    assert len(data[0]) == 3
+
+
+def test_embedding_image_contract(embedding_client: TestClient):
+    response = embedding_client.post("/embed/image", json=["https://example.com/a.jpg"])
+    assert response.status_code == 200
+    assert len(response.json()[0]) == 3
+
+
+class _FakeTranslator:
+    model = "qwen"
+    use_cache = True
+
+    def translate(self, text: str, target_lang: str, source_lang: str | None = None, prompt: str | None = None):
+        return f"{text}-{target_lang}"
+
+
+@pytest.fixture
+def translator_client(monkeypatch):
+    import api.translator_app as translator_app
+
+    translator_app.app.router.on_startup.clear()
+    monkeypatch.setattr(translator_app, "get_translator", lambda model="qwen": _FakeTranslator())
+
+    with TestClient(translator_app.app) as client:
+        yield client
+
+
+def test_translator_api_contract(translator_client: TestClient):
+    response = translator_client.post(
+        "/translate",
+        json={"text": "商品名称", "target_lang": "en", "source_lang": "zh"},
+    )
+    assert response.status_code == 200
+    assert response.json()["translated_text"] == "商品名称-en"
+
+
+def test_translator_health_contract(translator_client: TestClient):
+    response = translator_client.get("/health")
+    assert response.status_code == 200
+    assert response.json()["status"] == "healthy"
+
+
+class _FakeReranker:
+    _model_name = "fake-reranker"
+
+    def score_with_meta(self, query: str, docs: List[str], normalize: bool = True):
+        scores = [float(i + 1) for i in range(len(docs))]
+        meta: Dict[str, Any] = {"input_docs": len(docs), "unique_docs": len(set(docs))}
+        return scores, meta
+
+
+@pytest.fixture
+def reranker_client():
+    import reranker.server as reranker_server
+
+    reranker_server.app.router.on_startup.clear()
+    reranker_server._reranker = _FakeReranker()
+    reranker_server._backend_name = "fake"
+
+    with TestClient(reranker_server.app) as client:
+        yield client
+
+
+def test_reranker_api_contract(reranker_client: TestClient):
+    response = reranker_client.post(
+        "/rerank",
+        json={"query": "wireless mouse", "docs": ["doc-a", "doc-b"]},
+    )
+    assert response.status_code == 200
+    data = response.json()
+    assert data["scores"] == [1.0, 2.0]
+    assert data["meta"]["input_docs"] == 2
+
+
+def test_reranker_health_contract(reranker_client: TestClient):
+    response = reranker_client.get("/health")
+    assert response.status_code == 200
+    assert response.json()["status"] == "ok"
@@ -11,6 +11,8 @@ import time
 from datetime import datetime
 from pathlib import Path
+import pytest
+
 # Add parent directory to path
 sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -44,6 +46,7 @@ def read_queries(file_path: str, limit: int = 100) -> list:
     return queries
+@pytest.mark.skip(reason="Requires data file and DASHSCOPE_API_KEY; run manually when needed")
 def test_cloud_embedding(queries_file: str, num_queries: int = 100):
     """
     Test cloud embedding with queries from file.
@@ -13,11 +13,17 @@ CN-CLIP 服务测试脚本
 """
 import sys
-import numpy as np
-from clip_client import Client
+import pytest
-def test_encoding(client, test_name, inputs):
+try:
+    import numpy as np
+    from clip_client import Client
+except ImportError:
+    pytest.skip("clip_client not installed (optional clip-as-service client)", allow_module_level=True)
+
+
+def _test_encoding(client, test_name, inputs):
     """测试编码功能"""
     print(f"\n{test_name}...")
     try:
@@ -74,21 +80,21 @@ def main():
     results = []
     # 测试1: 文本编码
-    results.append(test_encoding(
+    results.append(_test_encoding(
         client,
         "测试1: 编码文本",
         ['这是一个测试文本', '另一个测试文本']
     ))
     # 测试2: 图像编码
-    results.append(test_encoding(
+    results.append(_test_encoding(
         client,
         "测试2: 编码图像（远程 URL）",
         ['https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']
     ))
     # 测试3: 混合编码
-    results.append(test_encoding(
+    results.append(_test_encoding(
         client,
         "测试3: 混合编码（文本和图像）",
         ['这是一段文本', 'https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']