Commit 3abbc95aef12ebad7dead00036e4c0523b2d5c55
1 parent
048631be
重构(scripts): 整理scripts目录,按现架构分类并迁移性能/手动测试脚本
问题背景: - scripts/ 目录下混有服务启动、数据转换、性能压测、临时脚本及历史备份目录 - 存在大量中间迭代遗留信息,不利于维护和新人理解 - 现行服务编排已稳定为 service_ctl up all 的集合:tei / cnclip / embedding / embedding-image / translator / reranker / backend / indexer / frontend / eval-web,不再保留 reranker-fine 默认位 调整内容: 1. 根 scripts/ 收敛为运行、运维、环境、数据处理脚本,并新增 scripts/README.md 说明文档 2. 性能/压测/调参脚本整体迁至 benchmarks/ 目录,同步更新 benchmarks/README.md 3. 人工试跑脚本迁至 tests/manual/ 目录,同步更新 tests/manual/README.md 4. 删除明确过时内容: - scripts/indexer__old_2025_11/ - scripts/start.sh - scripts/install_server_deps.sh 5. 同步修正以下文档中的路径及过时描述: - 根目录 README.md - 性能报告相关文档 - reranker/translation 模块文档 技术细节: - 性能测试不放常规 tests/ 的原因:这类脚本依赖真实服务、GPU、模型和环境噪声,不适合作为稳定回归门禁;benchmarks/ 更贴合其定位 - tests/manual/ 仅存放需要人工启动依赖、手工观察结果的接口试跑脚本 - 所有迁移后的 Python 脚本已通过 py_compile 语法校验 - 所有迁移后的 Shell 脚本已通过 bash -n 语法校验 校验结果: - py_compile: 通过 - bash -n: 通过
Showing
53 changed files
with
191 additions
and
1437 deletions
Show diff stats
CLAUDE.md
| @@ -77,9 +77,11 @@ source activate.sh | @@ -77,9 +77,11 @@ source activate.sh | ||
| 77 | # Generate test data (Tenant1 Mock + Tenant2 CSV) | 77 | # Generate test data (Tenant1 Mock + Tenant2 CSV) |
| 78 | ./scripts/mock_data.sh | 78 | ./scripts/mock_data.sh |
| 79 | 79 | ||
| 80 | -# Ingest data to Elasticsearch | ||
| 81 | -./scripts/ingest.sh <tenant_id> [recreate] # e.g., ./scripts/ingest.sh 1 true | ||
| 82 | -python main.py ingest data.csv --limit 1000 --batch-size 50 | 80 | +# Create tenant index structure |
| 81 | +./scripts/create_tenant_index.sh <tenant_id> | ||
| 82 | + | ||
| 83 | +# Build / refresh suggestion index | ||
| 84 | +./scripts/build_suggestions.sh <tenant_id> --mode incremental | ||
| 83 | ``` | 85 | ``` |
| 84 | 86 | ||
| 85 | ### Running Services | 87 | ### Running Services |
| @@ -100,10 +102,10 @@ python main.py serve --host 0.0.0.0 --port 6002 --reload | @@ -100,10 +102,10 @@ python main.py serve --host 0.0.0.0 --port 6002 --reload | ||
| 100 | # Run all tests | 102 | # Run all tests |
| 101 | pytest tests/ | 103 | pytest tests/ |
| 102 | 104 | ||
| 103 | -# Run specific test types | ||
| 104 | -pytest tests/unit/ # Unit tests | ||
| 105 | -pytest tests/integration/ # Integration tests | ||
| 106 | -pytest -m "api" # API tests only | 105 | +# Run focused regression sets |
| 106 | +python -m pytest tests/ci -q | ||
| 107 | +pytest tests/test_rerank_client.py | ||
| 108 | +pytest tests/test_query_parser_mixed_language.py | ||
| 107 | 109 | ||
| 108 | # Test search from command line | 110 | # Test search from command line |
| 109 | python main.py search "query" --tenant-id 1 --size 10 | 111 | python main.py search "query" --tenant-id 1 --size 10 |
| @@ -114,12 +116,8 @@ python main.py search "query" --tenant-id 1 --size 10 | @@ -114,12 +116,8 @@ python main.py search "query" --tenant-id 1 --size 10 | ||
| 114 | # Stop all services | 116 | # Stop all services |
| 115 | ./scripts/stop.sh | 117 | ./scripts/stop.sh |
| 116 | 118 | ||
| 117 | -# Test environment (for CI/development) | ||
| 118 | -./scripts/start_test_environment.sh | ||
| 119 | -./scripts/stop_test_environment.sh | ||
| 120 | - | ||
| 121 | -# Install server dependencies | ||
| 122 | -./scripts/install_server_deps.sh | 119 | +# Run CI contract tests |
| 120 | +./scripts/run_ci_tests.sh | ||
| 123 | ``` | 121 | ``` |
| 124 | 122 | ||
| 125 | ## Architecture Overview | 123 | ## Architecture Overview |
| @@ -585,7 +583,7 @@ GET /admin/stats # Index statistics | @@ -585,7 +583,7 @@ GET /admin/stats # Index statistics | ||
| 585 | ./scripts/start_frontend.sh # Frontend UI (port 6003) | 583 | ./scripts/start_frontend.sh # Frontend UI (port 6003) |
| 586 | 584 | ||
| 587 | # Data Operations | 585 | # Data Operations |
| 588 | -./scripts/ingest.sh <tenant_id> [recreate] # Index data | 586 | +./scripts/create_tenant_index.sh <tenant_id> # Create tenant index |
| 589 | ./scripts/mock_data.sh # Generate test data | 587 | ./scripts/mock_data.sh # Generate test data |
| 590 | 588 | ||
| 591 | # Testing | 589 | # Testing |
| @@ -0,0 +1,17 @@ | @@ -0,0 +1,17 @@ | ||
| 1 | +# Benchmarks | ||
| 2 | + | ||
| 3 | +基准压测脚本统一放在 `benchmarks/`,不再和 `scripts/` 里的服务启动/运维脚本混放。 | ||
| 4 | + | ||
| 5 | +目录约定: | ||
| 6 | + | ||
| 7 | +- `benchmarks/perf_api_benchmark.py`:通用 HTTP 接口压测入口 | ||
| 8 | +- `benchmarks/reranker/`:reranker 定向 benchmark、smoke、手工对比脚本 | ||
| 9 | +- `benchmarks/translation/`:translation 本地模型 benchmark | ||
| 10 | + | ||
| 11 | +这些脚本默认不是 CI 测试的一部分,因为它们通常具备以下特征: | ||
| 12 | + | ||
| 13 | +- 依赖真实服务、GPU、模型或特定数据集 | ||
| 14 | +- 结果受机器配置和运行时负载影响,不适合作为稳定回归门禁 | ||
| 15 | +- 更多用于容量评估、调参和问题复现,而不是功能正确性判定 | ||
| 16 | + | ||
| 17 | +如果某个性能场景需要进入自动化回归,应新增到 `tests/` 下并明确收敛输入、环境和判定阈值,而不是直接复用这里的基准脚本。 |
scripts/perf_api_benchmark.py renamed to benchmarks/perf_api_benchmark.py
| @@ -11,13 +11,13 @@ Default scenarios (aligned with docs/搜索API对接指南 分册,如 -01 / -0 | @@ -11,13 +11,13 @@ Default scenarios (aligned with docs/搜索API对接指南 分册,如 -01 / -0 | ||
| 11 | - rerank POST /rerank | 11 | - rerank POST /rerank |
| 12 | 12 | ||
| 13 | Examples: | 13 | Examples: |
| 14 | - python scripts/perf_api_benchmark.py --scenario backend_search --duration 30 --concurrency 20 --tenant-id 162 | ||
| 15 | - python scripts/perf_api_benchmark.py --scenario backend_suggest --duration 30 --concurrency 50 --tenant-id 162 | ||
| 16 | - python scripts/perf_api_benchmark.py --scenario all --duration 60 --concurrency 80 --tenant-id 162 | ||
| 17 | - python scripts/perf_api_benchmark.py --scenario all --cases-file scripts/perf_cases.json.example --output perf_result.json | 14 | + python benchmarks/perf_api_benchmark.py --scenario backend_search --duration 30 --concurrency 20 --tenant-id 162 |
| 15 | + python benchmarks/perf_api_benchmark.py --scenario backend_suggest --duration 30 --concurrency 50 --tenant-id 162 | ||
| 16 | + python benchmarks/perf_api_benchmark.py --scenario all --duration 60 --concurrency 80 --tenant-id 162 | ||
| 17 | + python benchmarks/perf_api_benchmark.py --scenario all --cases-file benchmarks/perf_cases.json.example --output perf_result.json | ||
| 18 | # Embedding admission / priority (query param `priority`; same semantics as embedding service): | 18 | # Embedding admission / priority (query param `priority`; same semantics as embedding service): |
| 19 | - python scripts/perf_api_benchmark.py --scenario embed_text --embed-text-priority 1 --duration 30 --concurrency 20 | ||
| 20 | - python scripts/perf_api_benchmark.py --scenario embed_image --embed-image-priority 1 --duration 30 --concurrency 10 | 19 | + python benchmarks/perf_api_benchmark.py --scenario embed_text --embed-text-priority 1 --duration 30 --concurrency 20 |
| 20 | + python benchmarks/perf_api_benchmark.py --scenario embed_image --embed-image-priority 1 --duration 30 --concurrency 10 | ||
| 21 | """ | 21 | """ |
| 22 | 22 | ||
| 23 | from __future__ import annotations | 23 | from __future__ import annotations |
| @@ -229,7 +229,7 @@ def apply_embed_priority_params( | @@ -229,7 +229,7 @@ def apply_embed_priority_params( | ||
| 229 | ) -> None: | 229 | ) -> None: |
| 230 | """ | 230 | """ |
| 231 | Merge default `priority` query param into embed templates when absent. | 231 | Merge default `priority` query param into embed templates when absent. |
| 232 | - `scripts/perf_cases.json` may set per-request `params.priority` to override. | 232 | + `benchmarks/perf_cases.json` may set per-request `params.priority` to override. |
| 233 | """ | 233 | """ |
| 234 | mapping = { | 234 | mapping = { |
| 235 | "embed_text": max(0, int(embed_text_priority)), | 235 | "embed_text": max(0, int(embed_text_priority)), |
scripts/perf_cases.json.example renamed to benchmarks/perf_cases.json.example
scripts/benchmark_reranker_1000docs.sh renamed to benchmarks/reranker/benchmark_reranker_1000docs.sh
| @@ -8,7 +8,7 @@ | @@ -8,7 +8,7 @@ | ||
| 8 | # Outputs JSON reports under perf_reports/<date>/reranker_1000docs/ | 8 | # Outputs JSON reports under perf_reports/<date>/reranker_1000docs/ |
| 9 | # | 9 | # |
| 10 | # Usage: | 10 | # Usage: |
| 11 | -# ./scripts/benchmark_reranker_1000docs.sh | 11 | +# ./benchmarks/reranker/benchmark_reranker_1000docs.sh |
| 12 | # Optional env: | 12 | # Optional env: |
| 13 | # BATCH_SIZES="24 32 48 64" | 13 | # BATCH_SIZES="24 32 48 64" |
| 14 | # C1_REQUESTS=4 | 14 | # C1_REQUESTS=4 |
| @@ -85,7 +85,7 @@ run_bench() { | @@ -85,7 +85,7 @@ run_bench() { | ||
| 85 | local c="$2" | 85 | local c="$2" |
| 86 | local req="$3" | 86 | local req="$3" |
| 87 | local out="${OUT_DIR}/rerank_bs${bs}_c${c}_r${req}.json" | 87 | local out="${OUT_DIR}/rerank_bs${bs}_c${c}_r${req}.json" |
| 88 | - .venv/bin/python scripts/perf_api_benchmark.py \ | 88 | + .venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 89 | --scenario rerank \ | 89 | --scenario rerank \ |
| 90 | --tenant-id "${TENANT_ID}" \ | 90 | --tenant-id "${TENANT_ID}" \ |
| 91 | --reranker-base "${RERANK_BASE}" \ | 91 | --reranker-base "${RERANK_BASE}" \ |
scripts/benchmark_reranker_gguf_local.py renamed to benchmarks/reranker/benchmark_reranker_gguf_local.py
| @@ -8,8 +8,8 @@ Runs the backend directly in a fresh process per config to measure: | @@ -8,8 +8,8 @@ Runs the backend directly in a fresh process per config to measure: | ||
| 8 | - single-request rerank latency | 8 | - single-request rerank latency |
| 9 | 9 | ||
| 10 | Example: | 10 | Example: |
| 11 | - ./.venv-reranker-gguf/bin/python scripts/benchmark_reranker_gguf_local.py | ||
| 12 | - ./.venv-reranker-gguf-06b/bin/python scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | 11 | + ./.venv-reranker-gguf/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py |
| 12 | + ./.venv-reranker-gguf-06b/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | ||
| 13 | """ | 13 | """ |
| 14 | 14 | ||
| 15 | from __future__ import annotations | 15 | from __future__ import annotations |
scripts/benchmark_reranker_random_titles.py renamed to benchmarks/reranker/benchmark_reranker_random_titles.py
| @@ -10,10 +10,10 @@ Each invocation runs 3 warmup requests with n=400 first; those are not timed for | @@ -10,10 +10,10 @@ Each invocation runs 3 warmup requests with n=400 first; those are not timed for | ||
| 10 | 10 | ||
| 11 | Example: | 11 | Example: |
| 12 | source activate.sh | 12 | source activate.sh |
| 13 | - python scripts/benchmark_reranker_random_titles.py 386 | ||
| 14 | - python scripts/benchmark_reranker_random_titles.py 40,80,100 | ||
| 15 | - python scripts/benchmark_reranker_random_titles.py 40,80,100 --repeat 3 --seed 42 | ||
| 16 | - RERANK_BASE=http://127.0.0.1:6007 python scripts/benchmark_reranker_random_titles.py 200 | 13 | + python benchmarks/reranker/benchmark_reranker_random_titles.py 386 |
| 14 | + python benchmarks/reranker/benchmark_reranker_random_titles.py 40,80,100 | ||
| 15 | + python benchmarks/reranker/benchmark_reranker_random_titles.py 40,80,100 --repeat 3 --seed 42 | ||
| 16 | + RERANK_BASE=http://127.0.0.1:6007 python benchmarks/reranker/benchmark_reranker_random_titles.py 200 | ||
| 17 | """ | 17 | """ |
| 18 | 18 | ||
| 19 | from __future__ import annotations | 19 | from __future__ import annotations |
tests/reranker_performance/curl1.sh renamed to benchmarks/reranker/manual/curl1.sh
tests/reranker_performance/curl1_simple.sh renamed to benchmarks/reranker/manual/curl1_simple.sh
tests/reranker_performance/curl2.sh renamed to benchmarks/reranker/manual/curl2.sh
tests/reranker_performance/rerank_performance_compare.sh renamed to benchmarks/reranker/manual/rerank_performance_compare.sh
scripts/patch_rerank_vllm_benchmark_config.py renamed to benchmarks/reranker/patch_rerank_vllm_benchmark_config.py
| @@ -73,7 +73,7 @@ def main() -> int: | @@ -73,7 +73,7 @@ def main() -> int: | ||
| 73 | p.add_argument( | 73 | p.add_argument( |
| 74 | "--config", | 74 | "--config", |
| 75 | type=Path, | 75 | type=Path, |
| 76 | - default=Path(__file__).resolve().parent.parent / "config" / "config.yaml", | 76 | + default=Path(__file__).resolve().parents[2] / "config" / "config.yaml", |
| 77 | ) | 77 | ) |
| 78 | p.add_argument("--backend", choices=("qwen3_vllm", "qwen3_vllm_score"), required=True) | 78 | p.add_argument("--backend", choices=("qwen3_vllm", "qwen3_vllm_score"), required=True) |
| 79 | p.add_argument( | 79 | p.add_argument( |
scripts/run_reranker_vllm_instruction_benchmark.sh renamed to benchmarks/reranker/run_reranker_vllm_instruction_benchmark.sh
| @@ -55,13 +55,13 @@ run_one() { | @@ -55,13 +55,13 @@ run_one() { | ||
| 55 | local jf="${OUT_DIR}/${backend}_${fmt}.json" | 55 | local jf="${OUT_DIR}/${backend}_${fmt}.json" |
| 56 | 56 | ||
| 57 | echo "========== ${tag} ==========" | 57 | echo "========== ${tag} ==========" |
| 58 | - "$PYTHON" "${ROOT}/scripts/patch_rerank_vllm_benchmark_config.py" \ | 58 | + "$PYTHON" "${ROOT}/benchmarks/reranker/patch_rerank_vllm_benchmark_config.py" \ |
| 59 | --backend "$backend" --instruction-format "$fmt" | 59 | --backend "$backend" --instruction-format "$fmt" |
| 60 | 60 | ||
| 61 | "${ROOT}/restart.sh" reranker | 61 | "${ROOT}/restart.sh" reranker |
| 62 | wait_health "$backend" "$fmt" | 62 | wait_health "$backend" "$fmt" |
| 63 | 63 | ||
| 64 | - if ! "$PYTHON" "${ROOT}/scripts/benchmark_reranker_random_titles.py" \ | 64 | + if ! "$PYTHON" "${ROOT}/benchmarks/reranker/benchmark_reranker_random_titles.py" \ |
| 65 | 100,200,400,600,800,1000 \ | 65 | 100,200,400,600,800,1000 \ |
| 66 | --repeat 5 \ | 66 | --repeat 5 \ |
| 67 | --seed 42 \ | 67 | --seed 42 \ |
| @@ -82,7 +82,7 @@ run_one qwen3_vllm_score compact | @@ -82,7 +82,7 @@ run_one qwen3_vllm_score compact | ||
| 82 | run_one qwen3_vllm_score standard | 82 | run_one qwen3_vllm_score standard |
| 83 | 83 | ||
| 84 | # Restore repo-default-style rerank settings (score + compact). | 84 | # Restore repo-default-style rerank settings (score + compact). |
| 85 | -"$PYTHON" "${ROOT}/scripts/patch_rerank_vllm_benchmark_config.py" \ | 85 | +"$PYTHON" "${ROOT}/benchmarks/reranker/patch_rerank_vllm_benchmark_config.py" \ |
| 86 | --backend qwen3_vllm_score --instruction-format compact | 86 | --backend qwen3_vllm_score --instruction-format compact |
| 87 | "${ROOT}/restart.sh" reranker | 87 | "${ROOT}/restart.sh" reranker |
| 88 | wait_health qwen3_vllm_score compact | 88 | wait_health qwen3_vllm_score compact |
scripts/smoke_qwen3_vllm_score_backend.py renamed to benchmarks/reranker/smoke_qwen3_vllm_score_backend.py
| @@ -3,7 +3,7 @@ | @@ -3,7 +3,7 @@ | ||
| 3 | Smoke test: load Qwen3VLLMScoreRerankerBackend (must run as a file, not stdin — vLLM spawn). | 3 | Smoke test: load Qwen3VLLMScoreRerankerBackend (must run as a file, not stdin — vLLM spawn). |
| 4 | 4 | ||
| 5 | Usage (from repo root, score venv): | 5 | Usage (from repo root, score venv): |
| 6 | - PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py | 6 | + PYTHONPATH=. ./.venv-reranker-score/bin/python benchmarks/reranker/smoke_qwen3_vllm_score_backend.py |
| 7 | 7 | ||
| 8 | Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when | 8 | Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when |
| 9 | vLLM auto-selects FLASHINFER on T4/Turing). ``start_reranker.sh`` exports that; this script prepends | 9 | vLLM auto-selects FLASHINFER on T4/Turing). ``start_reranker.sh`` exports that; this script prepends |
| @@ -20,8 +20,8 @@ import sys | @@ -20,8 +20,8 @@ import sys | ||
| 20 | import sysconfig | 20 | import sysconfig |
| 21 | from pathlib import Path | 21 | from pathlib import Path |
| 22 | 22 | ||
| 23 | -# Repo root on sys.path when run as scripts/smoke_*.py | ||
| 24 | -_ROOT = Path(__file__).resolve().parents[1] | 23 | +# Repo root on sys.path when run from benchmarks/reranker/. |
| 24 | +_ROOT = Path(__file__).resolve().parents[2] | ||
| 25 | if str(_ROOT) not in sys.path: | 25 | if str(_ROOT) not in sys.path: |
| 26 | sys.path.insert(0, str(_ROOT)) | 26 | sys.path.insert(0, str(_ROOT)) |
| 27 | 27 |
scripts/benchmark_nllb_t4_tuning.py renamed to benchmarks/translation/benchmark_nllb_t4_tuning.py
| @@ -11,12 +11,12 @@ from datetime import datetime | @@ -11,12 +11,12 @@ from datetime import datetime | ||
| 11 | from pathlib import Path | 11 | from pathlib import Path |
| 12 | from typing import Any, Dict, List, Tuple | 12 | from typing import Any, Dict, List, Tuple |
| 13 | 13 | ||
| 14 | -PROJECT_ROOT = Path(__file__).resolve().parent.parent | 14 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| 15 | if str(PROJECT_ROOT) not in sys.path: | 15 | if str(PROJECT_ROOT) not in sys.path: |
| 16 | sys.path.insert(0, str(PROJECT_ROOT)) | 16 | sys.path.insert(0, str(PROJECT_ROOT)) |
| 17 | 17 | ||
| 18 | from config.services_config import get_translation_config | 18 | from config.services_config import get_translation_config |
| 19 | -from scripts.benchmark_translation_local_models import ( | 19 | +from benchmarks.translation.benchmark_translation_local_models import ( |
| 20 | benchmark_concurrency_case, | 20 | benchmark_concurrency_case, |
| 21 | benchmark_serial_case, | 21 | benchmark_serial_case, |
| 22 | build_environment_info, | 22 | build_environment_info, |
scripts/benchmark_translation_local_models.py renamed to benchmarks/translation/benchmark_translation_local_models.py
| @@ -22,7 +22,7 @@ from typing import Any, Dict, Iterable, List, Sequence | @@ -22,7 +22,7 @@ from typing import Any, Dict, Iterable, List, Sequence | ||
| 22 | import torch | 22 | import torch |
| 23 | import transformers | 23 | import transformers |
| 24 | 24 | ||
| 25 | -PROJECT_ROOT = Path(__file__).resolve().parent.parent | 25 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| 26 | if str(PROJECT_ROOT) not in sys.path: | 26 | if str(PROJECT_ROOT) not in sys.path: |
| 27 | sys.path.insert(0, str(PROJECT_ROOT)) | 27 | sys.path.insert(0, str(PROJECT_ROOT)) |
| 28 | 28 |
scripts/benchmark_translation_local_models_focus.py renamed to benchmarks/translation/benchmark_translation_local_models_focus.py
| @@ -11,12 +11,12 @@ from datetime import datetime | @@ -11,12 +11,12 @@ from datetime import datetime | ||
| 11 | from pathlib import Path | 11 | from pathlib import Path |
| 12 | from typing import Any, Dict, List | 12 | from typing import Any, Dict, List |
| 13 | 13 | ||
| 14 | -PROJECT_ROOT = Path(__file__).resolve().parent.parent | 14 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| 15 | if str(PROJECT_ROOT) not in sys.path: | 15 | if str(PROJECT_ROOT) not in sys.path: |
| 16 | sys.path.insert(0, str(PROJECT_ROOT)) | 16 | sys.path.insert(0, str(PROJECT_ROOT)) |
| 17 | 17 | ||
| 18 | from config.services_config import get_translation_config | 18 | from config.services_config import get_translation_config |
| 19 | -from scripts.benchmark_translation_local_models import ( | 19 | +from benchmarks.translation.benchmark_translation_local_models import ( |
| 20 | SCENARIOS, | 20 | SCENARIOS, |
| 21 | benchmark_concurrency_case, | 21 | benchmark_concurrency_case, |
| 22 | benchmark_serial_case, | 22 | benchmark_serial_case, |
scripts/benchmark_translation_longtext_single.py renamed to benchmarks/translation/benchmark_translation_longtext_single.py
config/config.yaml
| @@ -114,7 +114,7 @@ field_boosts: | @@ -114,7 +114,7 @@ field_boosts: | ||
| 114 | qanchors: 1.0 | 114 | qanchors: 1.0 |
| 115 | enriched_tags: 1.0 | 115 | enriched_tags: 1.0 |
| 116 | enriched_attributes.value: 1.5 | 116 | enriched_attributes.value: 1.5 |
| 117 | - enriched_taxonomy_attributes.value: 0.3 | 117 | + # enriched_taxonomy_attributes.value: 0.3 |
| 118 | category_name_text: 2.0 | 118 | category_name_text: 2.0 |
| 119 | category_path: 2.0 | 119 | category_path: 2.0 |
| 120 | keywords: 2.0 | 120 | keywords: 2.0 |
| @@ -195,7 +195,7 @@ query_config: | @@ -195,7 +195,7 @@ query_config: | ||
| 195 | - qanchors | 195 | - qanchors |
| 196 | - enriched_tags | 196 | - enriched_tags |
| 197 | - enriched_attributes.value | 197 | - enriched_attributes.value |
| 198 | - - enriched_taxonomy_attributes.value | 198 | + # - enriched_taxonomy_attributes.value |
| 199 | - option1_values | 199 | - option1_values |
| 200 | - option2_values | 200 | - option2_values |
| 201 | - option3_values | 201 | - option3_values |
| @@ -254,7 +254,7 @@ query_config: | @@ -254,7 +254,7 @@ query_config: | ||
| 254 | # - qanchors | 254 | # - qanchors |
| 255 | # - enriched_tags | 255 | # - enriched_tags |
| 256 | # - enriched_attributes | 256 | # - enriched_attributes |
| 257 | - # - enriched_taxonomy_attributes.value | 257 | + # - enriched_taxonomy_attributes.value |
| 258 | - min_price | 258 | - min_price |
| 259 | - compare_at_price | 259 | - compare_at_price |
| 260 | - image_url | 260 | - image_url |
docs/DEVELOPER_GUIDE.md
| @@ -389,7 +389,7 @@ services: | @@ -389,7 +389,7 @@ services: | ||
| 389 | - **位置**:`tests/`,可按 `unit/`、`integration/` 或按模块划分子目录;公共 fixture 在 `conftest.py`。 | 389 | - **位置**:`tests/`,可按 `unit/`、`integration/` 或按模块划分子目录;公共 fixture 在 `conftest.py`。 |
| 390 | - **标记**:使用 `@pytest.mark.unit`、`@pytest.mark.integration`、`@pytest.mark.api` 等区分用例类型,便于按需运行。 | 390 | - **标记**:使用 `@pytest.mark.unit`、`@pytest.mark.integration`、`@pytest.mark.api` 等区分用例类型,便于按需运行。 |
| 391 | - **依赖**:单元测试通过 mock(如 `mock_es_client`、`sample_search_config`)不依赖真实 ES/DB;集成测试需在说明中注明依赖服务。 | 391 | - **依赖**:单元测试通过 mock(如 `mock_es_client`、`sample_search_config`)不依赖真实 ES/DB;集成测试需在说明中注明依赖服务。 |
| 392 | -- **运行**:`python -m pytest tests/`;仅单元:`python -m pytest tests/unit/` 或 `-m unit`。 | 392 | +- **运行**:`python -m pytest tests/`;推荐最小回归:`python -m pytest tests/ci -q`;按模块聚焦可直接指定具体测试文件。 |
| 393 | - **原则**:新增逻辑应有对应测试;修改协议或配置契约时更新相关测试与 fixture。 | 393 | - **原则**:新增逻辑应有对应测试;修改协议或配置契约时更新相关测试与 fixture。 |
| 394 | 394 | ||
| 395 | ### 8.3 配置与环境 | 395 | ### 8.3 配置与环境 |
docs/QUICKSTART.md
| @@ -69,7 +69,7 @@ source activate.sh | @@ -69,7 +69,7 @@ source activate.sh | ||
| 69 | ./run.sh all | 69 | ./run.sh all |
| 70 | # 仅为薄封装:等价于 ./scripts/service_ctl.sh up all | 70 | # 仅为薄封装:等价于 ./scripts/service_ctl.sh up all |
| 71 | # 说明: | 71 | # 说明: |
| 72 | -# - all = tei cnclip embedding embedding-image translator reranker reranker-fine backend indexer frontend eval-web | 72 | +# - all = tei cnclip embedding embedding-image translator reranker backend indexer frontend eval-web |
| 73 | # - up 会同时启动 monitor daemon(运行期连续失败自动重启) | 73 | # - up 会同时启动 monitor daemon(运行期连续失败自动重启) |
| 74 | # - reranker 为 GPU 强制模式(资源不足会直接启动失败) | 74 | # - reranker 为 GPU 强制模式(资源不足会直接启动失败) |
| 75 | # - TEI 默认使用 GPU;当 TEI_DEVICE=cuda 且 GPU 不可用时会直接失败(不会自动降级到 CPU) | 75 | # - TEI 默认使用 GPU;当 TEI_DEVICE=cuda 且 GPU 不可用时会直接失败(不会自动降级到 CPU) |
docs/Usage-Guide.md
| @@ -126,7 +126,7 @@ cd /data/saas-search | @@ -126,7 +126,7 @@ cd /data/saas-search | ||
| 126 | 126 | ||
| 127 | 这个脚本会自动: | 127 | 这个脚本会自动: |
| 128 | 1. 创建日志目录 | 128 | 1. 创建日志目录 |
| 129 | -2. 按目标启动服务(`all`:`tei cnclip embedding embedding-image translator reranker reranker-fine backend indexer frontend eval-web`) | 129 | +2. 按目标启动服务(`all`:`tei cnclip embedding embedding-image translator reranker backend indexer frontend eval-web`) |
| 130 | 3. 写入 PID 到 `logs/*.pid` | 130 | 3. 写入 PID 到 `logs/*.pid` |
| 131 | 4. 执行健康检查 | 131 | 4. 执行健康检查 |
| 132 | 5. 启动 monitor daemon(运行期连续失败自动重启) | 132 | 5. 启动 monitor daemon(运行期连续失败自动重启) |
| @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t | @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t | ||
| 202 | ./scripts/service_ctl.sh restart backend | 202 | ./scripts/service_ctl.sh restart backend |
| 203 | sleep 3 | 203 | sleep 3 |
| 204 | ./scripts/service_ctl.sh status backend | 204 | ./scripts/service_ctl.sh status backend |
| 205 | -./scripts/evaluation/start_eval.sh.sh batch | 205 | +./scripts/evaluation/start_eval.sh batch |
| 206 | ``` | 206 | ``` |
| 207 | 207 | ||
| 208 | 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | 208 | 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 |
docs/工作总结-微服务性能优化与架构.md
| @@ -129,7 +129,7 @@ instruction: "Given a shopping query, rank product titles by relevance" | @@ -129,7 +129,7 @@ instruction: "Given a shopping query, rank product titles by relevance" | ||
| 129 | - 可选:embedding(text) **6005**、embedding-image **6008**、translator **6006**、reranker **6007**、tei **8080**、cnclip **51000**。 | 129 | - 可选:embedding(text) **6005**、embedding-image **6008**、translator **6006**、reranker **6007**、tei **8080**、cnclip **51000**。 |
| 130 | - 端口可由环境变量覆盖:`API_PORT`、`INDEXER_PORT`、`FRONTEND_PORT`、`EVAL_WEB_PORT`、`EMBEDDING_TEXT_PORT`、`EMBEDDING_IMAGE_PORT`、`TRANSLATION_PORT`、`RERANKER_PORT`、`TEI_PORT`、`CNCLIP_PORT`。 | 130 | - 端口可由环境变量覆盖:`API_PORT`、`INDEXER_PORT`、`FRONTEND_PORT`、`EVAL_WEB_PORT`、`EMBEDDING_TEXT_PORT`、`EMBEDDING_IMAGE_PORT`、`TRANSLATION_PORT`、`RERANKER_PORT`、`TEI_PORT`、`CNCLIP_PORT`。 |
| 131 | - **命令**: | 131 | - **命令**: |
| 132 | - - `./scripts/service_ctl.sh start [service...]` 或 `up all` / `start all`(all 含 tei、cnclip、embedding、embedding-image、translator、reranker、reranker-fine、backend、indexer、frontend、eval-web,按依赖顺序);`stop`、`restart`、`down` 同参数;`status` 默认列出所有服务。 | 132 | + - `./scripts/service_ctl.sh start [service...]` 或 `up all` / `start all`(all 含 tei、cnclip、embedding、embedding-image、translator、reranker、backend、indexer、frontend、eval-web,按依赖顺序);`stop`、`restart`、`down` 同参数;`status` 默认列出所有服务。 |
| 133 | - 启动时:backend/indexer/frontend/embedding/translator/reranker 会写 pid 到 `logs/<service>.pid`,并执行 `wait_for_health`(GET `http://127.0.0.1:<port>/health`);reranker 健康重试 90 次,其余 30 次;TEI 校验 Docker 容器存在且 `/health` 成功;cnclip 无 HTTP 健康则仅校验进程/端口。 | 133 | - 启动时:backend/indexer/frontend/embedding/translator/reranker 会写 pid 到 `logs/<service>.pid`,并执行 `wait_for_health`(GET `http://127.0.0.1:<port>/health`);reranker 健康重试 90 次,其余 30 次;TEI 校验 Docker 容器存在且 `/health` 成功;cnclip 无 HTTP 健康则仅校验进程/端口。 |
| 134 | - **监控常驻**: | 134 | - **监控常驻**: |
| 135 | - `./scripts/service_ctl.sh monitor-start <targets>` 启动后台监控进程,将 targets 写入 `logs/service-monitor.targets`,pid 写入 `logs/service-monitor.pid`,日志追加到 `logs/service-monitor.log`。 | 135 | - `./scripts/service_ctl.sh monitor-start <targets>` 启动后台监控进程,将 targets 写入 `logs/service-monitor.targets`,pid 写入 `logs/service-monitor.pid`,日志追加到 `logs/service-monitor.log`。 |
| @@ -153,12 +153,12 @@ instruction: "Given a shopping query, rank product titles by relevance" | @@ -153,12 +153,12 @@ instruction: "Given a shopping query, rank product titles by relevance" | ||
| 153 | 153 | ||
| 154 | ## 三、性能测试报告摘要 | 154 | ## 三、性能测试报告摘要 |
| 155 | 155 | ||
| 156 | -以下数据来自 `docs/性能测试报告.md`,测试时间 **2026-03-12**,环境:**8 vCPU**(Intel Xeon Platinum 8255C @ 2.50GHz)、**约 15Gi 可用内存**;租户 **162** 文档数约 **53**(search/search/suggestions/rerank 与文档规模相关)。压测工具:`scripts/perf_api_benchmark.py`,场景×并发矩阵,每档 **20s**。 | 156 | +以下数据来自 `docs/性能测试报告.md`,测试时间 **2026-03-12**,环境:**8 vCPU**(Intel Xeon Platinum 8255C @ 2.50GHz)、**约 15Gi 可用内存**;租户 **162** 文档数约 **53**(search/search/suggestions/rerank 与文档规模相关)。压测工具:`benchmarks/perf_api_benchmark.py`,场景×并发矩阵,每档 **20s**。 |
| 157 | 157 | ||
| 158 | **复现命令(四场景×四并发)**: | 158 | **复现命令(四场景×四并发)**: |
| 159 | ```bash | 159 | ```bash |
| 160 | cd /data/saas-search | 160 | cd /data/saas-search |
| 161 | -.venv/bin/python scripts/perf_api_benchmark.py \ | 161 | +.venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 162 | --scenario backend_search,backend_suggest,embed_text,rerank \ | 162 | --scenario backend_search,backend_suggest,embed_text,rerank \ |
| 163 | --concurrency-list 1,5,10,20 \ | 163 | --concurrency-list 1,5,10,20 \ |
| 164 | --duration 20 \ | 164 | --duration 20 \ |
| @@ -188,7 +188,7 @@ cd /data/saas-search | @@ -188,7 +188,7 @@ cd /data/saas-search | ||
| 188 | 188 | ||
| 189 | 口径:query 固定 `wireless mouse`,每次请求 **386 docs**,句长 15–40 词随机(从 1000 词池采样);配置 `rerank_window=384`。复现命令: | 189 | 口径:query 固定 `wireless mouse`,每次请求 **386 docs**,句长 15–40 词随机(从 1000 词池采样);配置 `rerank_window=384`。复现命令: |
| 190 | ```bash | 190 | ```bash |
| 191 | -.venv/bin/python scripts/perf_api_benchmark.py \ | 191 | +.venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 192 | --scenario rerank --duration 20 --concurrency-list 1,5,10,20 --timeout 60 \ | 192 | --scenario rerank --duration 20 --concurrency-list 1,5,10,20 --timeout 60 \ |
| 193 | --rerank-dynamic-docs --rerank-doc-count 386 --rerank-vocab-size 1000 \ | 193 | --rerank-dynamic-docs --rerank-doc-count 386 --rerank-vocab-size 1000 \ |
| 194 | --rerank-sentence-min-words 15 --rerank-sentence-max-words 40 \ | 194 | --rerank-sentence-min-words 15 --rerank-sentence-max-words 40 \ |
| @@ -217,7 +217,7 @@ cd /data/saas-search | @@ -217,7 +217,7 @@ cd /data/saas-search | ||
| 217 | | 10 | 181 | 100% | 8.78 | 1129.23| 1295.88| 1330.96| | 217 | | 10 | 181 | 100% | 8.78 | 1129.23| 1295.88| 1330.96| |
| 218 | | 20 | 161 | 100% | 7.63 | 2594.00| 4706.44| 4783.05| | 218 | | 20 | 161 | 100% | 7.63 | 2594.00| 4706.44| 4783.05| |
| 219 | 219 | ||
| 220 | -**结论**:吞吐约 **8 rps** 平台化,延迟随并发上升明显,符合“检索 + 向量 + 重排”重链路特征。多租户补测(文档数 500–10000,见报告 §12)表明:文档数越大,RPS 下降、延迟升高;tenant 0(10000 doc)在并发 20 出现部分 ReadTimeout(成功率 59.02%),需注意 timeout 与容量规划;补测命令示例:`for t in 0 1 2 3 4; do .venv/bin/python scripts/perf_api_benchmark.py --scenario backend_search --concurrency-list 1,5,10,20 --duration 20 --tenant-id $t --output perf_reports/2026-03-12/search_tenant_matrix/tenant_${t}.json; done`。 | 220 | +**结论**:吞吐约 **8 rps** 平台化,延迟随并发上升明显,符合“检索 + 向量 + 重排”重链路特征。多租户补测(文档数 500–10000,见报告 §12)表明:文档数越大,RPS 下降、延迟升高;tenant 0(10000 doc)在并发 20 出现部分 ReadTimeout(成功率 59.02%),需注意 timeout 与容量规划;补测命令示例:`for t in 0 1 2 3 4; do .venv/bin/python benchmarks/perf_api_benchmark.py --scenario backend_search --concurrency-list 1,5,10,20 --duration 20 --tenant-id $t --output perf_reports/2026-03-12/search_tenant_matrix/tenant_${t}.json; done`。 |
| 221 | 221 | ||
| 222 | --- | 222 | --- |
| 223 | 223 | ||
| @@ -247,5 +247,5 @@ cd /data/saas-search | @@ -247,5 +247,5 @@ cd /data/saas-search | ||
| 247 | 247 | ||
| 248 | **关键文件与复现**: | 248 | **关键文件与复现**: |
| 249 | - 配置:`config/config.yaml`(services、rerank、query_config)、`.env`(端口与 API Key)。 | 249 | - 配置:`config/config.yaml`(services、rerank、query_config)、`.env`(端口与 API Key)。 |
| 250 | -- 脚本:`scripts/service_ctl.sh`(启停与监控)、`scripts/perf_api_benchmark.py`(压测)、`scripts/build_suggestions.sh`(suggest 构建)。 | 250 | +- 脚本:`scripts/service_ctl.sh`(启停与监控)、`benchmarks/perf_api_benchmark.py`(压测)、`scripts/build_suggestions.sh`(suggest 构建)。 |
| 251 | - 完整步骤与多租户/rerank 对比见:`docs/性能测试报告.md`。 | 251 | - 完整步骤与多租户/rerank 对比见:`docs/性能测试报告.md`。 |
docs/性能测试报告.md
| @@ -18,13 +18,13 @@ | @@ -18,13 +18,13 @@ | ||
| 18 | 18 | ||
| 19 | 执行方式: | 19 | 执行方式: |
| 20 | - 每组压测持续 `20s` | 20 | - 每组压测持续 `20s` |
| 21 | -- 使用统一脚本 `scripts/perf_api_benchmark.py` | 21 | +- 使用统一脚本 `benchmarks/perf_api_benchmark.py` |
| 22 | - 通过 `--scenario` 多值 + `--concurrency-list` 一次性跑完 `场景 x 并发` | 22 | - 通过 `--scenario` 多值 + `--concurrency-list` 一次性跑完 `场景 x 并发` |
| 23 | 23 | ||
| 24 | ## 3. 压测工具优化说明(复用现有脚本) | 24 | ## 3. 压测工具优化说明(复用现有脚本) |
| 25 | 25 | ||
| 26 | 为了解决原脚本“一次只能跑一个场景+一个并发”的可用性问题,本次直接扩展现有脚本: | 26 | 为了解决原脚本“一次只能跑一个场景+一个并发”的可用性问题,本次直接扩展现有脚本: |
| 27 | -- `scripts/perf_api_benchmark.py` | 27 | +- `benchmarks/perf_api_benchmark.py` |
| 28 | 28 | ||
| 29 | 能力: | 29 | 能力: |
| 30 | - 一条命令执行 `场景列表 x 并发列表` 全矩阵 | 30 | - 一条命令执行 `场景列表 x 并发列表` 全矩阵 |
| @@ -33,7 +33,7 @@ | @@ -33,7 +33,7 @@ | ||
| 33 | 示例: | 33 | 示例: |
| 34 | 34 | ||
| 35 | ```bash | 35 | ```bash |
| 36 | -.venv/bin/python scripts/perf_api_benchmark.py \ | 36 | +.venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 37 | --scenario backend_search,backend_suggest,embed_text,rerank \ | 37 | --scenario backend_search,backend_suggest,embed_text,rerank \ |
| 38 | --concurrency-list 1,5,10,20 \ | 38 | --concurrency-list 1,5,10,20 \ |
| 39 | --duration 20 \ | 39 | --duration 20 \ |
| @@ -106,7 +106,7 @@ curl -sS http://127.0.0.1:6007/health | @@ -106,7 +106,7 @@ curl -sS http://127.0.0.1:6007/health | ||
| 106 | 106 | ||
| 107 | ```bash | 107 | ```bash |
| 108 | cd /data/saas-search | 108 | cd /data/saas-search |
| 109 | -.venv/bin/python scripts/perf_api_benchmark.py \ | 109 | +.venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 110 | --scenario backend_search,backend_suggest,embed_text,rerank \ | 110 | --scenario backend_search,backend_suggest,embed_text,rerank \ |
| 111 | --concurrency-list 1,5,10,20 \ | 111 | --concurrency-list 1,5,10,20 \ |
| 112 | --duration 20 \ | 112 | --duration 20 \ |
| @@ -164,7 +164,7 @@ cd /data/saas-search | @@ -164,7 +164,7 @@ cd /data/saas-search | ||
| 164 | 复现命令: | 164 | 复现命令: |
| 165 | 165 | ||
| 166 | ```bash | 166 | ```bash |
| 167 | -.venv/bin/python scripts/perf_api_benchmark.py \ | 167 | +.venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 168 | --scenario rerank \ | 168 | --scenario rerank \ |
| 169 | --duration 20 \ | 169 | --duration 20 \ |
| 170 | --concurrency-list 1,5,10,20 \ | 170 | --concurrency-list 1,5,10,20 \ |
| @@ -237,7 +237,7 @@ cd /data/saas-search | @@ -237,7 +237,7 @@ cd /data/saas-search | ||
| 237 | - 使用项目虚拟环境执行: | 237 | - 使用项目虚拟环境执行: |
| 238 | 238 | ||
| 239 | ```bash | 239 | ```bash |
| 240 | -.venv/bin/python scripts/perf_api_benchmark.py -h | 240 | +.venv/bin/python benchmarks/perf_api_benchmark.py -h |
| 241 | ``` | 241 | ``` |
| 242 | 242 | ||
| 243 | ### 10.3 某场景成功率下降 | 243 | ### 10.3 某场景成功率下降 |
| @@ -249,7 +249,7 @@ cd /data/saas-search | @@ -249,7 +249,7 @@ cd /data/saas-search | ||
| 249 | 249 | ||
| 250 | ## 11. 关联文件 | 250 | ## 11. 关联文件 |
| 251 | 251 | ||
| 252 | -- 压测脚本:`scripts/perf_api_benchmark.py` | 252 | +- 压测脚本:`benchmarks/perf_api_benchmark.py` |
| 253 | - 本次结果:`perf_reports/2026-03-12/perf_matrix_report.json` | 253 | - 本次结果:`perf_reports/2026-03-12/perf_matrix_report.json` |
| 254 | - Search 多租户补测:`perf_reports/2026-03-12/search_tenant_matrix/` | 254 | - Search 多租户补测:`perf_reports/2026-03-12/search_tenant_matrix/` |
| 255 | - Reranker 386 docs 口径补测:`perf_reports/2026-03-12/rerank_realistic/rerank_386docs.json` | 255 | - Reranker 386 docs 口径补测:`perf_reports/2026-03-12/rerank_realistic/rerank_386docs.json` |
| @@ -280,7 +280,7 @@ cd /data/saas-search | @@ -280,7 +280,7 @@ cd /data/saas-search | ||
| 280 | cd /data/saas-search | 280 | cd /data/saas-search |
| 281 | mkdir -p perf_reports/2026-03-12/search_tenant_matrix | 281 | mkdir -p perf_reports/2026-03-12/search_tenant_matrix |
| 282 | for t in 0 1 2 3 4; do | 282 | for t in 0 1 2 3 4; do |
| 283 | - .venv/bin/python scripts/perf_api_benchmark.py \ | 283 | + .venv/bin/python benchmarks/perf_api_benchmark.py \ |
| 284 | --scenario backend_search \ | 284 | --scenario backend_search \ |
| 285 | --concurrency-list 1,5,10,20 \ | 285 | --concurrency-list 1,5,10,20 \ |
| 286 | --duration 20 \ | 286 | --duration 20 \ |
docs/搜索API对接指南-05-索引接口(Indexer).md
| @@ -498,7 +498,7 @@ curl -X GET "http://localhost:6004/indexer/health" | @@ -498,7 +498,7 @@ curl -X GET "http://localhost:6004/indexer/health" | ||
| 498 | 498 | ||
| 499 | #### 请求示例(完整 curl) | 499 | #### 请求示例(完整 curl) |
| 500 | 500 | ||
| 501 | -> 完整请求体参考 `scripts/test_build_docs_api.py` 中的 `build_sample_request()`。 | 501 | +> 完整请求体参考 `tests/manual/test_build_docs_api.py` 中的 `build_sample_request()`。 |
| 502 | 502 | ||
| 503 | ```bash | 503 | ```bash |
| 504 | # 单条 SPU 示例(含 spu、skus、options) | 504 | # 单条 SPU 示例(含 spu、skus、options) |
docs/搜索API对接指南-10-接口级压测脚本.md
| @@ -4,7 +4,7 @@ | @@ -4,7 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | ## 10. 接口级压测脚本 | 5 | ## 10. 接口级压测脚本 |
| 6 | 6 | ||
| 7 | -仓库提供统一压测脚本:`scripts/perf_api_benchmark.py`,用于对以下接口做并发压测: | 7 | +仓库提供统一压测脚本:`benchmarks/perf_api_benchmark.py`,用于对以下接口做并发压测: |
| 8 | 8 | ||
| 9 | - 后端搜索:`POST /search/` | 9 | - 后端搜索:`POST /search/` |
| 10 | - 搜索建议:`GET /search/suggestions` | 10 | - 搜索建议:`GET /search/suggestions` |
| @@ -18,21 +18,21 @@ | @@ -18,21 +18,21 @@ | ||
| 18 | 18 | ||
| 19 | ```bash | 19 | ```bash |
| 20 | # suggest 压测(tenant 162) | 20 | # suggest 压测(tenant 162) |
| 21 | -python scripts/perf_api_benchmark.py \ | 21 | +python benchmarks/perf_api_benchmark.py \ |
| 22 | --scenario backend_suggest \ | 22 | --scenario backend_suggest \ |
| 23 | --tenant-id 162 \ | 23 | --tenant-id 162 \ |
| 24 | --duration 30 \ | 24 | --duration 30 \ |
| 25 | --concurrency 50 | 25 | --concurrency 50 |
| 26 | 26 | ||
| 27 | # search 压测 | 27 | # search 压测 |
| 28 | -python scripts/perf_api_benchmark.py \ | 28 | +python benchmarks/perf_api_benchmark.py \ |
| 29 | --scenario backend_search \ | 29 | --scenario backend_search \ |
| 30 | --tenant-id 162 \ | 30 | --tenant-id 162 \ |
| 31 | --duration 30 \ | 31 | --duration 30 \ |
| 32 | --concurrency 20 | 32 | --concurrency 20 |
| 33 | 33 | ||
| 34 | # 全链路压测(search + suggest + embedding + translate + rerank) | 34 | # 全链路压测(search + suggest + embedding + translate + rerank) |
| 35 | -python scripts/perf_api_benchmark.py \ | 35 | +python benchmarks/perf_api_benchmark.py \ |
| 36 | --scenario all \ | 36 | --scenario all \ |
| 37 | --tenant-id 162 \ | 37 | --tenant-id 162 \ |
| 38 | --duration 60 \ | 38 | --duration 60 \ |
| @@ -45,17 +45,16 @@ python scripts/perf_api_benchmark.py \ | @@ -45,17 +45,16 @@ python scripts/perf_api_benchmark.py \ | ||
| 45 | 可通过 `--cases-file` 覆盖默认请求模板。示例文件: | 45 | 可通过 `--cases-file` 覆盖默认请求模板。示例文件: |
| 46 | 46 | ||
| 47 | ```bash | 47 | ```bash |
| 48 | -scripts/perf_cases.json.example | 48 | +benchmarks/perf_cases.json.example |
| 49 | ``` | 49 | ``` |
| 50 | 50 | ||
| 51 | 执行示例: | 51 | 执行示例: |
| 52 | 52 | ||
| 53 | ```bash | 53 | ```bash |
| 54 | -python scripts/perf_api_benchmark.py \ | 54 | +python benchmarks/perf_api_benchmark.py \ |
| 55 | --scenario all \ | 55 | --scenario all \ |
| 56 | --tenant-id 162 \ | 56 | --tenant-id 162 \ |
| 57 | - --cases-file scripts/perf_cases.json.example \ | 57 | + --cases-file benchmarks/perf_cases.json.example \ |
| 58 | --duration 60 \ | 58 | --duration 60 \ |
| 59 | --concurrency 40 | 59 | --concurrency 40 |
| 60 | ``` | 60 | ``` |
| 61 | - |
docs/相关性检索优化说明.md
| @@ -330,7 +330,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t | @@ -330,7 +330,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t | ||
| 330 | ./scripts/service_ctl.sh restart backend | 330 | ./scripts/service_ctl.sh restart backend |
| 331 | sleep 3 | 331 | sleep 3 |
| 332 | ./scripts/service_ctl.sh status backend | 332 | ./scripts/service_ctl.sh status backend |
| 333 | -./scripts/evaluation/start_eval.sh.sh batch | 333 | +./scripts/evaluation/start_eval.sh batch |
| 334 | ``` | 334 | ``` |
| 335 | 335 | ||
| 336 | 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | 336 | 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 |
| @@ -895,4 +895,3 @@ rerank_score:0.4784 | @@ -895,4 +895,3 @@ rerank_score:0.4784 | ||
| 895 | rerank_score:0.5849 | 895 | rerank_score:0.5849 |
| 896 | "zh": "新款女士修身仿旧牛仔短裤 – 休闲性感磨边水洗牛仔短裤,时尚舒", | 896 | "zh": "新款女士修身仿旧牛仔短裤 – 休闲性感磨边水洗牛仔短裤,时尚舒", |
| 897 | "en": "New Women's Slim-fit Vintage Washed Denim Shorts – Casual Sexy Frayed Hem, Fashionable & Comfortable" | 897 | "en": "New Women's Slim-fit Vintage Washed Denim Shorts – Casual Sexy Frayed Hem, Fashionable & Comfortable" |
| 898 | - |
embeddings/README.md
| @@ -98,10 +98,10 @@ | @@ -98,10 +98,10 @@ | ||
| 98 | 98 | ||
| 99 | ### 性能与压测(沿用仓库脚本) | 99 | ### 性能与压测(沿用仓库脚本) |
| 100 | 100 | ||
| 101 | -- 接口级压测(与 `perf_reports/2026-03-12/matrix_report/` 等方法一致):`scripts/perf_api_benchmark.py` | ||
| 102 | - - 示例:`python scripts/perf_api_benchmark.py --scenario embed_text --duration 30 --concurrency 20` | 101 | +- 接口级压测(与 `perf_reports/2026-03-12/matrix_report/` 等方法一致):`benchmarks/perf_api_benchmark.py` |
| 102 | + - 示例:`python benchmarks/perf_api_benchmark.py --scenario embed_text --duration 30 --concurrency 20` | ||
| 103 | - 文本/图片向量可带 `priority`(与线上 admission 语义一致):`--embed-text-priority 1`、`--embed-image-priority 1` | 103 | - 文本/图片向量可带 `priority`(与线上 admission 语义一致):`--embed-text-priority 1`、`--embed-image-priority 1` |
| 104 | - - 自定义请求模板:`--cases-file scripts/perf_cases.json.example` | 104 | + - 自定义请求模板:`--cases-file benchmarks/perf_cases.json.example` |
| 105 | - 历史矩阵结果与说明见 `perf_reports/2026-03-12/matrix_report/summary.md`。 | 105 | - 历史矩阵结果与说明见 `perf_reports/2026-03-12/matrix_report/summary.md`。 |
| 106 | 106 | ||
| 107 | ### 启动服务 | 107 | ### 启动服务 |
perf_reports/20260311/reranker_1000docs/report.md
perf_reports/20260317/translation_local_models/README.md
| 1 | # Local Translation Model Benchmark Report | 1 | # Local Translation Model Benchmark Report |
| 2 | 2 | ||
| 3 | -Test script: [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) | 3 | +Test script: [`benchmarks/translation/benchmark_translation_local_models.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models.py) |
| 4 | 4 | ||
| 5 | Test time: `2026-03-17` | 5 | Test time: `2026-03-17` |
| 6 | 6 | ||
| @@ -67,7 +67,7 @@ To model online search query translation, we reran NLLB with `batch_size=1`. In | @@ -67,7 +67,7 @@ To model online search query translation, we reran NLLB with `batch_size=1`. In | ||
| 67 | Command used: | 67 | Command used: |
| 68 | 68 | ||
| 69 | ```bash | 69 | ```bash |
| 70 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | 70 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py \ |
| 71 | --single \ | 71 | --single \ |
| 72 | --model nllb-200-distilled-600m \ | 72 | --model nllb-200-distilled-600m \ |
| 73 | --source-lang zh \ | 73 | --source-lang zh \ |
perf_reports/20260318/nllb_t4_product_names_ct2/README.md
| 1 | # NLLB T4 Product-Name Tuning Summary | 1 | # NLLB T4 Product-Name Tuning Summary |
| 2 | 2 | ||
| 3 | 测试脚本: | 3 | 测试脚本: |
| 4 | -- [`scripts/benchmark_nllb_t4_tuning.py`](/data/saas-search/scripts/benchmark_nllb_t4_tuning.py) | 4 | +- [`benchmarks/translation/benchmark_nllb_t4_tuning.py`](/data/saas-search/benchmarks/translation/benchmark_nllb_t4_tuning.py) |
| 5 | 5 | ||
| 6 | 本轮报告: | 6 | 本轮报告: |
| 7 | - Markdown:[`nllb_t4_tuning_003608.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md) | 7 | - Markdown:[`nllb_t4_tuning_003608.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md) |
perf_reports/20260318/translation_local_models/README.md
| 1 | # Local Translation Model Benchmark Report | 1 | # Local Translation Model Benchmark Report |
| 2 | 2 | ||
| 3 | 测试脚本: | 3 | 测试脚本: |
| 4 | -- [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) | 4 | +- [`benchmarks/translation/benchmark_translation_local_models.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models.py) |
| 5 | 5 | ||
| 6 | 完整结果: | 6 | 完整结果: |
| 7 | - Markdown:[`translation_local_models_extended_221846.md`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.md) | 7 | - Markdown:[`translation_local_models_extended_221846.md`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.md) |
| @@ -39,7 +39,7 @@ | @@ -39,7 +39,7 @@ | ||
| 39 | 39 | ||
| 40 | ```bash | 40 | ```bash |
| 41 | cd /data/saas-search | 41 | cd /data/saas-search |
| 42 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | 42 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py \ |
| 43 | --suite extended \ | 43 | --suite extended \ |
| 44 | --disable-cache \ | 44 | --disable-cache \ |
| 45 | --serial-items-per-case 256 \ | 45 | --serial-items-per-case 256 \ |
perf_reports/20260318/translation_local_models_ct2/README.md
| 1 | # Local Translation Model Benchmark Report (CTranslate2) | 1 | # Local Translation Model Benchmark Report (CTranslate2) |
| 2 | 2 | ||
| 3 | 测试脚本: | 3 | 测试脚本: |
| 4 | -- [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) | 4 | +- [`benchmarks/translation/benchmark_translation_local_models.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models.py) |
| 5 | 5 | ||
| 6 | 本轮 CT2 结果: | 6 | 本轮 CT2 结果: |
| 7 | - Markdown:[`translation_local_models_ct2_extended_233253.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md) | 7 | - Markdown:[`translation_local_models_ct2_extended_233253.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md) |
| @@ -46,7 +46,7 @@ from datetime import datetime | @@ -46,7 +46,7 @@ from datetime import datetime | ||
| 46 | from pathlib import Path | 46 | from pathlib import Path |
| 47 | from types import SimpleNamespace | 47 | from types import SimpleNamespace |
| 48 | 48 | ||
| 49 | -from scripts.benchmark_translation_local_models import ( | 49 | +from benchmarks.translation.benchmark_translation_local_models import ( |
| 50 | SCENARIOS, | 50 | SCENARIOS, |
| 51 | benchmark_extended_scenario, | 51 | benchmark_extended_scenario, |
| 52 | build_environment_info, | 52 | build_environment_info, |
perf_reports/20260318/translation_local_models_ct2_focus/README.md
| 1 | # Local Translation Model Focused T4 Tuning | 1 | # Local Translation Model Focused T4 Tuning |
| 2 | 2 | ||
| 3 | 测试脚本: | 3 | 测试脚本: |
| 4 | -- [`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) | 4 | +- [`benchmarks/translation/benchmark_translation_local_models_focus.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models_focus.py) |
| 5 | 5 | ||
| 6 | 本轮聚焦结果: | 6 | 本轮聚焦结果: |
| 7 | - Markdown:[`translation_local_models_focus_235018.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md) | 7 | - Markdown:[`translation_local_models_focus_235018.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md) |
perf_reports/README.md
| @@ -4,7 +4,7 @@ | @@ -4,7 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | | 脚本 | 用途 | | 5 | | 脚本 | 用途 | |
| 6 | |------|------| | 6 | |------|------| |
| 7 | -| `scripts/perf_api_benchmark.py` | 搜索后端、向量、翻译、重排等 HTTP 接口压测;支持 `--embed-text-priority` / `--embed-image-priority` 与 `scripts/perf_cases.json.example` | | 7 | +| `benchmarks/perf_api_benchmark.py` | 搜索后端、向量、翻译、重排等 HTTP 接口压测;支持 `--embed-text-priority` / `--embed-image-priority` 与 `benchmarks/perf_cases.json.example` | |
| 8 | 8 | ||
| 9 | 历史矩阵示例(并发扫描): | 9 | 历史矩阵示例(并发扫描): |
| 10 | 10 | ||
| @@ -25,10 +25,10 @@ | @@ -25,10 +25,10 @@ | ||
| 25 | 25 | ||
| 26 | ```bash | 26 | ```bash |
| 27 | source activate.sh | 27 | source activate.sh |
| 28 | -python scripts/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --timeout 30 --output perf_reports/2026-03-20_embed_text_p0.json | ||
| 29 | -python scripts/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --embed-text-priority 1 --output perf_reports/2026-03-20_embed_text_p1.json | ||
| 30 | -python scripts/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --timeout 60 --output perf_reports/2026-03-20_embed_image_p0.json | ||
| 31 | -python scripts/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --embed-image-priority 1 --output perf_reports/2026-03-20_embed_image_p1.json | 28 | +python benchmarks/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --timeout 30 --output perf_reports/2026-03-20_embed_text_p0.json |
| 29 | +python benchmarks/perf_api_benchmark.py --scenario embed_text --duration 8 --concurrency 10 --embed-text-priority 1 --output perf_reports/2026-03-20_embed_text_p1.json | ||
| 30 | +python benchmarks/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --timeout 60 --output perf_reports/2026-03-20_embed_image_p0.json | ||
| 31 | +python benchmarks/perf_api_benchmark.py --scenario embed_image --duration 8 --concurrency 5 --embed-image-priority 1 --output perf_reports/2026-03-20_embed_image_p1.json | ||
| 32 | ``` | 32 | ``` |
| 33 | 33 | ||
| 34 | 说明:本次为 **8 秒 smoke**,与 `2026-03-12` 矩阵的时长/并发不可直接横向对比;仅验证 `priority` 参数下服务仍返回 200 且 payload 校验通过。 | 34 | 说明:本次为 **8 秒 smoke**,与 `2026-03-12` 矩阵的时长/并发不可直接横向对比;仅验证 `priority` 参数下服务仍返回 200 且 payload 校验通过。 |
perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md
| @@ -25,7 +25,7 @@ Shared across both backends for this run: | @@ -25,7 +25,7 @@ Shared across both backends for this run: | ||
| 25 | 25 | ||
| 26 | ## Methodology | 26 | ## Methodology |
| 27 | 27 | ||
| 28 | -- Script: `python scripts/benchmark_reranker_random_titles.py 100,200,400,600,800,1000 --repeat 5` with **`--seed 99`** (see note below), **`--quiet-runs`**, **`--timeout 360`**. | 28 | +- Script: `python benchmarks/reranker/benchmark_reranker_random_titles.py 100,200,400,600,800,1000 --repeat 5` with **`--seed 99`** (see note below), **`--quiet-runs`**, **`--timeout 360`**. |
| 29 | - Titles: default file `/home/ubuntu/rerank_test/titles.1.8w` (one title per line). | 29 | - Titles: default file `/home/ubuntu/rerank_test/titles.1.8w` (one title per line). |
| 30 | - Query: default `健身女生T恤短袖`. | 30 | - Query: default `健身女生T恤短袖`. |
| 31 | - Each scenario: **3 warm-up** requests at `n=400` (not timed), then **5 timed** runs per `n`. | 31 | - Each scenario: **3 warm-up** requests at `n=400` (not timed), then **5 timed** runs per `n`. |
| @@ -56,9 +56,9 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | @@ -56,9 +56,9 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | ||
| 56 | ## Tooling added / changed | 56 | ## Tooling added / changed |
| 57 | 57 | ||
| 58 | - `reranker/server.py`: `/health` includes `instruction_format` when the active backend sets `_instruction_format`. | 58 | - `reranker/server.py`: `/health` includes `instruction_format` when the active backend sets `_instruction_format`. |
| 59 | -- `scripts/benchmark_reranker_random_titles.py`: `--tag`, `--json-summary-out`, `--quiet-runs`. | ||
| 60 | -- `scripts/patch_rerank_vllm_benchmark_config.py`: surgical YAML patch (preserves newlines). | ||
| 61 | -- `scripts/run_reranker_vllm_instruction_benchmark.sh`: full matrix driver (continues if a benchmark exits non-zero; uses `--timeout 360`). | 59 | +- `benchmarks/reranker/benchmark_reranker_random_titles.py`: `--tag`, `--json-summary-out`, `--quiet-runs`. |
| 60 | +- `benchmarks/reranker/patch_rerank_vllm_benchmark_config.py`: surgical YAML patch (preserves newlines). | ||
| 61 | +- `benchmarks/reranker/run_reranker_vllm_instruction_benchmark.sh`: full matrix driver (continues if a benchmark exits non-zero; uses `--timeout 360`). | ||
| 62 | 62 | ||
| 63 | --- | 63 | --- |
| 64 | 64 | ||
| @@ -73,7 +73,7 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | @@ -73,7 +73,7 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | ||
| 73 | | Attention | Backend forced / steered attention on T4 (e.g. `TRITON_ATTN` path) | **No** `attention_config` in `LLM(...)`; vLLM **auto** — on this T4 run, logs show **`FLASHINFER`** | | 73 | | Attention | Backend forced / steered attention on T4 (e.g. `TRITON_ATTN` path) | **No** `attention_config` in `LLM(...)`; vLLM **auto** — on this T4 run, logs show **`FLASHINFER`** | |
| 74 | | Config surface | `vllm_attention_backend` / `RERANK_VLLM_ATTENTION_BACKEND` 等 | **Removed**(少 YAML/环境变量分支,逻辑收敛) | | 74 | | Config surface | `vllm_attention_backend` / `RERANK_VLLM_ATTENTION_BACKEND` 等 | **Removed**(少 YAML/环境变量分支,逻辑收敛) | |
| 75 | | Code default `instruction_format` | `qwen3_vllm_score` 默认 `standard` | 与 `qwen3_vllm` 对齐为 **`compact`**(仍可在 YAML 写 `standard`) | | 75 | | Code default `instruction_format` | `qwen3_vllm_score` 默认 `standard` | 与 `qwen3_vllm` 对齐为 **`compact`**(仍可在 YAML 写 `standard`) | |
| 76 | -| Smoke / 启动 | — | `scripts/smoke_qwen3_vllm_score_backend.py`;`scripts/start_reranker.sh` 将 **venv `bin` 置于 `PATH`**(FLASHINFER JIT 依赖 venv 内的 `ninja`) | | 76 | +| Smoke / 启动 | — | `benchmarks/reranker/smoke_qwen3_vllm_score_backend.py`;`scripts/start_reranker.sh` 将 **venv `bin` 置于 `PATH`**(FLASHINFER JIT 依赖 venv 内的 `ninja`) | |
| 77 | 77 | ||
| 78 | Micro-benchmark (same machine, isolated): **~927.5 ms → ~673.1 ms** at **n=400** docs on `LLM.score()` steady state (~**28%**), after removing the forced attention path and letting vLLM pick **FLASHINFER**. | 78 | Micro-benchmark (same machine, isolated): **~927.5 ms → ~673.1 ms** at **n=400** docs on `LLM.score()` steady state (~**28%**), after removing the forced attention path and letting vLLM pick **FLASHINFER**. |
| 79 | 79 |
reranker/DEPLOYMENT_AND_TUNING.md
| @@ -109,7 +109,7 @@ curl -sS http://127.0.0.1:6007/health | @@ -109,7 +109,7 @@ curl -sS http://127.0.0.1:6007/health | ||
| 109 | ### 5.1 使用一键压测脚本 | 109 | ### 5.1 使用一键压测脚本 |
| 110 | 110 | ||
| 111 | ```bash | 111 | ```bash |
| 112 | -./scripts/benchmark_reranker_1000docs.sh | 112 | +./benchmarks/reranker/benchmark_reranker_1000docs.sh |
| 113 | ``` | 113 | ``` |
| 114 | 114 | ||
| 115 | 输出目录: | 115 | 输出目录: |
reranker/GGUF_0_6B_INSTALL_AND_TUNING.md
| @@ -144,7 +144,7 @@ qwen3_gguf_06b: | @@ -144,7 +144,7 @@ qwen3_gguf_06b: | ||
| 144 | 144 | ||
| 145 | ```bash | 145 | ```bash |
| 146 | PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | 146 | PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ |
| 147 | - scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | 147 | + benchmarks/reranker/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 |
| 148 | ``` | 148 | ``` |
| 149 | 149 | ||
| 150 | 按服务方式启动: | 150 | 按服务方式启动: |
reranker/GGUF_INSTALL_AND_TUNING.md
| @@ -117,7 +117,7 @@ HF_HUB_DISABLE_XET=1 | @@ -117,7 +117,7 @@ HF_HUB_DISABLE_XET=1 | ||
| 117 | 117 | ||
| 118 | ```bash | 118 | ```bash |
| 119 | PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | 119 | PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ |
| 120 | - scripts/benchmark_reranker_gguf_local.py --docs 64 --repeat 1 | 120 | + benchmarks/reranker/benchmark_reranker_gguf_local.py --docs 64 --repeat 1 |
| 121 | ``` | 121 | ``` |
| 122 | 122 | ||
| 123 | 它会直接实例化 GGUF backend,输出: | 123 | 它会直接实例化 GGUF backend,输出: |
| @@ -134,7 +134,7 @@ PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | @@ -134,7 +134,7 @@ PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | ||
| 134 | 134 | ||
| 135 | - Query: `白色oversized T-shirt` | 135 | - Query: `白色oversized T-shirt` |
| 136 | - Docs: `64` 条商品标题 | 136 | - Docs: `64` 条商品标题 |
| 137 | -- 本地脚本:`scripts/benchmark_reranker_gguf_local.py` | 137 | +- 本地脚本:`benchmarks/reranker/benchmark_reranker_gguf_local.py` |
| 138 | - 每组 1 次,重点比较相对趋势 | 138 | - 每组 1 次,重点比较相对趋势 |
| 139 | 139 | ||
| 140 | 结果: | 140 | 结果: |
| @@ -195,7 +195,7 @@ n_gpu_layers=999 | @@ -195,7 +195,7 @@ n_gpu_layers=999 | ||
| 195 | 195 | ||
| 196 | ```bash | 196 | ```bash |
| 197 | RERANK_BASE=http://127.0.0.1:6007 \ | 197 | RERANK_BASE=http://127.0.0.1:6007 \ |
| 198 | - ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt' | 198 | + ./.venv/bin/python benchmarks/reranker/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt' |
| 199 | ``` | 199 | ``` |
| 200 | 200 | ||
| 201 | 得到: | 201 | 得到: |
| @@ -206,7 +206,7 @@ RERANK_BASE=http://127.0.0.1:6007 \ | @@ -206,7 +206,7 @@ RERANK_BASE=http://127.0.0.1:6007 \ | ||
| 206 | 206 | ||
| 207 | ```bash | 207 | ```bash |
| 208 | RERANK_BASE=http://127.0.0.1:6007 \ | 208 | RERANK_BASE=http://127.0.0.1:6007 \ |
| 209 | - ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt' | 209 | + ./.venv/bin/python benchmarks/reranker/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt' |
| 210 | ``` | 210 | ``` |
| 211 | 211 | ||
| 212 | 得到: | 212 | 得到: |
| @@ -276,5 +276,5 @@ offload_kqv: true | @@ -276,5 +276,5 @@ offload_kqv: true | ||
| 276 | - `config/config.yaml` | 276 | - `config/config.yaml` |
| 277 | - `scripts/setup_reranker_venv.sh` | 277 | - `scripts/setup_reranker_venv.sh` |
| 278 | - `scripts/start_reranker.sh` | 278 | - `scripts/start_reranker.sh` |
| 279 | -- `scripts/benchmark_reranker_gguf_local.py` | 279 | +- `benchmarks/reranker/benchmark_reranker_gguf_local.py` |
| 280 | - `reranker/GGUF_INSTALL_AND_TUNING.md` | 280 | - `reranker/GGUF_INSTALL_AND_TUNING.md` |
reranker/README.md
| @@ -46,9 +46,9 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Jin | @@ -46,9 +46,9 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Jin | ||
| 46 | - `backends/dashscope_rerank.py`:DashScope 云端重排后端 | 46 | - `backends/dashscope_rerank.py`:DashScope 云端重排后端 |
| 47 | - `scripts/setup_reranker_venv.sh`:按后端创建独立 venv | 47 | - `scripts/setup_reranker_venv.sh`:按后端创建独立 venv |
| 48 | - `scripts/start_reranker.sh`:启动 reranker 服务 | 48 | - `scripts/start_reranker.sh`:启动 reranker 服务 |
| 49 | -- `scripts/smoke_qwen3_vllm_score_backend.py`:`qwen3_vllm_score` 本地 smoke | ||
| 50 | -- `scripts/benchmark_reranker_random_titles.py`:随机标题压测脚本 | ||
| 51 | -- `scripts/run_reranker_vllm_instruction_benchmark.sh`:历史矩阵脚本 | 49 | +- `benchmarks/reranker/smoke_qwen3_vllm_score_backend.py`:`qwen3_vllm_score` 本地 smoke |
| 50 | +- `benchmarks/reranker/benchmark_reranker_random_titles.py`:随机标题压测脚本 | ||
| 51 | +- `benchmarks/reranker/run_reranker_vllm_instruction_benchmark.sh`:历史矩阵脚本 | ||
| 52 | 52 | ||
| 53 | ## 环境基线 | 53 | ## 环境基线 |
| 54 | 54 | ||
| @@ -118,7 +118,7 @@ nvidia-smi | @@ -118,7 +118,7 @@ nvidia-smi | ||
| 118 | ### 4. Smoke | 118 | ### 4. Smoke |
| 119 | 119 | ||
| 120 | ```bash | 120 | ```bash |
| 121 | -PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py --gpu-memory-utilization 0.2 | 121 | +PYTHONPATH=. ./.venv-reranker-score/bin/python benchmarks/reranker/smoke_qwen3_vllm_score_backend.py --gpu-memory-utilization 0.2 |
| 122 | ``` | 122 | ``` |
| 123 | 123 | ||
| 124 | ## `jina_reranker_v3` | 124 | ## `jina_reranker_v3` |
| @@ -0,0 +1,53 @@ | @@ -0,0 +1,53 @@ | ||
| 1 | +# Scripts | ||
| 2 | + | ||
| 3 | +`scripts/` 现在只保留当前架构下仍然有效的运行、运维、环境和数据处理脚本。 | ||
| 4 | + | ||
| 5 | +## 当前分类 | ||
| 6 | + | ||
| 7 | +- 服务编排 | ||
| 8 | + - `service_ctl.sh` | ||
| 9 | + - `start_backend.sh` | ||
| 10 | + - `start_indexer.sh` | ||
| 11 | + - `start_frontend.sh` | ||
| 12 | + - `start_eval_web.sh` | ||
| 13 | + - `start_embedding_service.sh` | ||
| 14 | + - `start_embedding_text_service.sh` | ||
| 15 | + - `start_embedding_image_service.sh` | ||
| 16 | + - `start_reranker.sh` | ||
| 17 | + - `start_translator.sh` | ||
| 18 | + - `start_tei_service.sh` | ||
| 19 | + - `start_cnclip_service.sh` | ||
| 20 | + - `stop.sh` | ||
| 21 | + - `stop_tei_service.sh` | ||
| 22 | + - `stop_cnclip_service.sh` | ||
| 23 | + | ||
| 24 | +- 环境初始化 | ||
| 25 | + - `create_venv.sh` | ||
| 26 | + - `init_env.sh` | ||
| 27 | + - `setup_embedding_venv.sh` | ||
| 28 | + - `setup_reranker_venv.sh` | ||
| 29 | + - `setup_translator_venv.sh` | ||
| 30 | + - `setup_cnclip_venv.sh` | ||
| 31 | + | ||
| 32 | +- 数据与索引 | ||
| 33 | + - `create_tenant_index.sh` | ||
| 34 | + - `build_suggestions.sh` | ||
| 35 | + - `mock_data.sh` | ||
| 36 | + | ||
| 37 | +- 评估与专项工具 | ||
| 38 | + - `evaluation/` | ||
| 39 | + - `redis/` | ||
| 40 | + - `debug/` | ||
| 41 | + | ||
| 42 | +## 已迁移 | ||
| 43 | + | ||
| 44 | +- 基准压测与 smoke 脚本:迁到 `benchmarks/` | ||
| 45 | +- 手工接口试跑脚本:迁到 `tests/manual/` | ||
| 46 | + | ||
| 47 | +## 已清理 | ||
| 48 | + | ||
| 49 | +- 历史备份目录:`indexer__old_2025_11/` | ||
| 50 | +- 过时 Shell 脚本:`start.sh` 
| 51 | +- Conda 时代残留:`install_server_deps.sh` | ||
| 52 | + | ||
| 53 | +后续如果新增脚本,优先放到明确子目录,不再把 benchmark、manual、历史备份直接丢回根 `scripts/`。 |
scripts/trace_indexer_calls.sh renamed to scripts/debug/trace_indexer_calls.sh
scripts/indexer__old_2025_11/import_tenant2_csv.py deleted
| @@ -1,495 +0,0 @@ | @@ -1,495 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -Import tenant2 CSV data into MySQL Shoplazza tables. | ||
| 4 | - | ||
| 5 | -Reads CSV file and generates SQL INSERT statements for SPU and SKU tables. | ||
| 6 | -Each CSV row corresponds to 1 SPU and 1 SKU. | ||
| 7 | -This script is for generating test data for tenant_id=2 from CSV files. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -import sys | ||
| 11 | -import os | ||
| 12 | -import csv | ||
| 13 | -import random | ||
| 14 | -import argparse | ||
| 15 | -import re | ||
| 16 | -from pathlib import Path | ||
| 17 | -from datetime import datetime, timedelta | ||
| 18 | - | ||
| 19 | -# Add parent directory to path | ||
| 20 | -sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 21 | - | ||
| 22 | - | ||
| 23 | -def escape_sql_string(value: str) -> str: | ||
| 24 | - """ | ||
| 25 | - Escape SQL string value (replace single quotes with doubled quotes and handle special characters). | ||
| 26 | - | ||
| 27 | - Args: | ||
| 28 | - value: String value to escape | ||
| 29 | - | ||
| 30 | - Returns: | ||
| 31 | - Escaped string | ||
| 32 | - """ | ||
| 33 | - if value is None: | ||
| 34 | - return '' | ||
| 35 | - | ||
| 36 | - # Convert to string and handle None | ||
| 37 | - s = str(value) | ||
| 38 | - | ||
| 39 | - # Replace single quotes with doubled quotes (SQL standard) | ||
| 40 | - s = s.replace("'", "''") | ||
| 41 | - | ||
| 42 | - # Replace backslashes (MySQL escape) | ||
| 43 | - s = s.replace("\\", "\\\\") | ||
| 44 | - | ||
| 45 | - # Remove or replace control characters that can break SQL | ||
| 46 | - # Replace newlines and carriage returns with spaces | ||
| 47 | - s = s.replace("\n", " ").replace("\r", " ") | ||
| 48 | - | ||
| 49 | - # Remove other control characters (except tab) | ||
| 50 | - s = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', s) | ||
| 51 | - | ||
| 52 | - # Remove null bytes | ||
| 53 | - s = s.replace('\x00', '') | ||
| 54 | - | ||
| 55 | - return s | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -def generate_handle(title: str) -> str: | ||
| 59 | - """ | ||
| 60 | - Generate URL-friendly handle from title. | ||
| 61 | - | ||
| 62 | - Args: | ||
| 63 | - title: Product title | ||
| 64 | - | ||
| 65 | - Returns: | ||
| 66 | - URL-friendly handle | ||
| 67 | - """ | ||
| 68 | - # Remove special characters, convert to lowercase, replace spaces with hyphens | ||
| 69 | - handle = re.sub(r'[^\w\s-]', '', title.lower()) | ||
| 70 | - handle = re.sub(r'[-\s]+', '-', handle) | ||
| 71 | - handle = handle.strip('-') | ||
| 72 | - # Limit length | ||
| 73 | - if len(handle) > 255: | ||
| 74 | - handle = handle[:255] | ||
| 75 | - return handle or 'product' | ||
| 76 | - | ||
| 77 | - | ||
| 78 | -def parse_csv_row(row: dict) -> dict: | ||
| 79 | - """ | ||
| 80 | - Parse CSV row and extract fields. | ||
| 81 | - | ||
| 82 | - Args: | ||
| 83 | - row: CSV row dictionary | ||
| 84 | - | ||
| 85 | - Returns: | ||
| 86 | - Parsed data dictionary | ||
| 87 | - """ | ||
| 88 | - # Remove quotes from values if present | ||
| 89 | - def clean_value(value): | ||
| 90 | - if value is None: | ||
| 91 | - return '' | ||
| 92 | - value = str(value).strip() | ||
| 93 | - # Remove surrounding quotes | ||
| 94 | - if value.startswith('"') and value.endswith('"'): | ||
| 95 | - value = value[1:-1] | ||
| 96 | - return value | ||
| 97 | - | ||
| 98 | - return { | ||
| 99 | - 'skuId': clean_value(row.get('skuId', '')), | ||
| 100 | - 'name': clean_value(row.get('name', '')), | ||
| 101 | - 'name_pinyin': clean_value(row.get('name_pinyin', '')), | ||
| 102 | - 'create_time': clean_value(row.get('create_time', '')), | ||
| 103 | - 'ruSkuName': clean_value(row.get('ruSkuName', '')), | ||
| 104 | - 'enSpuName': clean_value(row.get('enSpuName', '')), | ||
| 105 | - 'categoryName': clean_value(row.get('categoryName', '')), | ||
| 106 | - 'supplierName': clean_value(row.get('supplierName', '')), | ||
| 107 | - 'brandName': clean_value(row.get('brandName', '')), | ||
| 108 | - 'file_id': clean_value(row.get('file_id', '')), | ||
| 109 | - 'days_since_last_update': clean_value(row.get('days_since_last_update', '')), | ||
| 110 | - 'id': clean_value(row.get('id', '')), | ||
| 111 | - 'imageUrl': clean_value(row.get('imageUrl', '')) | ||
| 112 | - } | ||
| 113 | - | ||
| 114 | - | ||
| 115 | -def generate_spu_data(csv_data: dict, spu_id: int, tenant_id: str = "2") -> dict: | ||
| 116 | - """ | ||
| 117 | - Generate SPU data from CSV row. | ||
| 118 | - | ||
| 119 | - Args: | ||
| 120 | - csv_data: Parsed CSV row data | ||
| 121 | - spu_id: SPU ID | ||
| 122 | - tenant_id: Tenant ID (default: "2") | ||
| 123 | - | ||
| 124 | - Returns: | ||
| 125 | - SPU data dictionary | ||
| 126 | - """ | ||
| 127 | - # Parse create_time | ||
| 128 | - try: | ||
| 129 | - created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') | ||
| 130 | - except: | ||
| 131 | - created_at = datetime.now() - timedelta(days=random.randint(1, 365)) | ||
| 132 | - | ||
| 133 | - updated_at = created_at + timedelta(days=random.randint(0, 30)) | ||
| 134 | - | ||
| 135 | - # Generate handle from title | ||
| 136 | - title = csv_data['name'] or csv_data['enSpuName'] or 'Product' | ||
| 137 | - handle = generate_handle(title) | ||
| 138 | - | ||
| 139 | - # Generate tags from category and brand | ||
| 140 | - tags_parts = [] | ||
| 141 | - if csv_data['categoryName']: | ||
| 142 | - tags_parts.append(csv_data['categoryName']) | ||
| 143 | - if csv_data['brandName']: | ||
| 144 | - tags_parts.append(csv_data['brandName']) | ||
| 145 | - tags = ','.join(tags_parts) if tags_parts else '' | ||
| 146 | - | ||
| 147 | - # Generate SEO fields | ||
| 148 | - seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title | ||
| 149 | - seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title | ||
| 150 | - seo_keywords = f"{title},{csv_data['categoryName']},{csv_data['brandName']}" if csv_data['categoryName'] else title | ||
| 151 | - | ||
| 152 | - spu = { | ||
| 153 | - 'id': spu_id, | ||
| 154 | - 'shop_id': 1, | ||
| 155 | - 'shoplazza_id': csv_data['id'] or f"spu-{spu_id}", | ||
| 156 | - 'handle': handle, | ||
| 157 | - 'title': title, | ||
| 158 | - 'brief': csv_data['name'] or '', | ||
| 159 | - 'description': f"<p>{csv_data['name']}</p>" if csv_data['name'] else '', | ||
| 160 | - 'spu': '', | ||
| 161 | - 'vendor': csv_data['supplierName'] or '', | ||
| 162 | - 'vendor_url': '', | ||
| 163 | - 'seo_title': seo_title, | ||
| 164 | - 'seo_description': seo_description, | ||
| 165 | - 'seo_keywords': seo_keywords, | ||
| 166 | - 'image_src': csv_data['imageUrl'] or '', | ||
| 167 | - 'image_width': 800, | ||
| 168 | - 'image_height': 600, | ||
| 169 | - 'image_path': f"products/{spu_id}.jpg", | ||
| 170 | - 'image_alt': title, | ||
| 171 | - 'inventory_policy': '', | ||
| 172 | - 'inventory_quantity': 0, | ||
| 173 | - 'inventory_tracking': '0', | ||
| 174 | - 'published': 1, | ||
| 175 | - 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 176 | - 'requires_shipping': 1, | ||
| 177 | - 'taxable': 0, | ||
| 178 | - 'fake_sales': 0, | ||
| 179 | - 'display_fake_sales': 0, | ||
| 180 | - 'mixed_wholesale': 0, | ||
| 181 | - 'need_variant_image': 0, | ||
| 182 | - 'has_only_default_variant': 0, | ||
| 183 | - 'tags': tags, | ||
| 184 | - 'note': '', | ||
| 185 | - 'category': csv_data['categoryName'] or '', | ||
| 186 | - 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 187 | - 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 188 | - 'tenant_id': tenant_id, | ||
| 189 | - 'creator': '1', | ||
| 190 | - 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 191 | - 'updater': '1', | ||
| 192 | - 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 193 | - 'deleted': 0 | ||
| 194 | - } | ||
| 195 | - | ||
| 196 | - return spu | ||
| 197 | - | ||
| 198 | - | ||
| 199 | -def generate_sku_data(csv_data: dict, spu_id: int, sku_id: int, tenant_id: str = "2") -> dict: | ||
| 200 | - """ | ||
| 201 | - Generate SKU data from CSV row. | ||
| 202 | - | ||
| 203 | - Args: | ||
| 204 | - csv_data: Parsed CSV row data | ||
| 205 | - spu_id: Associated SPU ID | ||
| 206 | - sku_id: SKU ID (from CSV skuId) | ||
| 207 | - tenant_id: Tenant ID (default: "2") | ||
| 208 | - | ||
| 209 | - Returns: | ||
| 210 | - SKU data dictionary | ||
| 211 | - """ | ||
| 212 | - # Parse create_time | ||
| 213 | - try: | ||
| 214 | - created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') | ||
| 215 | - except: | ||
| 216 | - created_at = datetime.now() - timedelta(days=random.randint(1, 365)) | ||
| 217 | - | ||
| 218 | - updated_at = created_at + timedelta(days=random.randint(0, 30)) | ||
| 219 | - | ||
| 220 | - # Generate random price | ||
| 221 | - price = round(random.uniform(50, 500), 2) | ||
| 222 | - compare_at_price = round(price * random.uniform(1.2, 1.5), 2) | ||
| 223 | - cost_price = round(price * 0.6, 2) | ||
| 224 | - | ||
| 225 | - # Generate random stock | ||
| 226 | - inventory_quantity = random.randint(0, 100) | ||
| 227 | - | ||
| 228 | - # Generate random weight | ||
| 229 | - weight = round(random.uniform(0.1, 5.0), 2) | ||
| 230 | - | ||
| 231 | - # Use ruSkuName as title, fallback to name | ||
| 232 | - title = csv_data['ruSkuName'] or csv_data['name'] or 'SKU' | ||
| 233 | - | ||
| 234 | - # Use skuId as SKU code | ||
| 235 | - sku_code = csv_data['skuId'] or f"SKU-{sku_id}" | ||
| 236 | - | ||
| 237 | - sku = { | ||
| 238 | - 'id': sku_id, | ||
| 239 | - 'spu_id': spu_id, | ||
| 240 | - 'shop_id': 1, | ||
| 241 | - 'shoplazza_id': f"sku-{sku_id}", | ||
| 242 | - 'shoplazza_product_id': csv_data['id'] or f"spu-{spu_id}", | ||
| 243 | - 'shoplazza_image_id': '', | ||
| 244 | - 'title': title, | ||
| 245 | - 'sku': sku_code, | ||
| 246 | - 'barcode': f"BAR{sku_id:08d}", | ||
| 247 | - 'position': 1, | ||
| 248 | - 'price': price, | ||
| 249 | - 'compare_at_price': compare_at_price, | ||
| 250 | - 'cost_price': cost_price, | ||
| 251 | - 'option1': '', | ||
| 252 | - 'option2': '', | ||
| 253 | - 'option3': '', | ||
| 254 | - 'inventory_quantity': inventory_quantity, | ||
| 255 | - 'weight': weight, | ||
| 256 | - 'weight_unit': 'kg', | ||
| 257 | - 'image_src': csv_data['imageUrl'] or '', | ||
| 258 | - 'wholesale_price': f'[{{"price": {round(price * 0.8, 2)}, "minQuantity": 10}}]', | ||
| 259 | - 'note': '', | ||
| 260 | - 'extend': None, # JSON field, use NULL | ||
| 261 | - 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 262 | - 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 263 | - 'tenant_id': tenant_id, | ||
| 264 | - 'creator': '1', | ||
| 265 | - 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 266 | - 'updater': '1', | ||
| 267 | - 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 268 | - 'deleted': 0 | ||
| 269 | - } | ||
| 270 | - | ||
| 271 | - return sku | ||
| 272 | - | ||
| 273 | - | ||
| 274 | -def read_csv_file(csv_file: str) -> list: | ||
| 275 | - """ | ||
| 276 | - Read CSV file and return list of parsed rows. | ||
| 277 | - | ||
| 278 | - Args: | ||
| 279 | - csv_file: Path to CSV file | ||
| 280 | - | ||
| 281 | - Returns: | ||
| 282 | - List of parsed CSV data dictionaries | ||
| 283 | - """ | ||
| 284 | - csv_data_list = [] | ||
| 285 | - | ||
| 286 | - with open(csv_file, 'r', encoding='utf-8') as f: | ||
| 287 | - # Use csv.DictReader to handle quoted fields properly | ||
| 288 | - reader = csv.DictReader(f) | ||
| 289 | - for row in reader: | ||
| 290 | - parsed = parse_csv_row(row) | ||
| 291 | - csv_data_list.append(parsed) | ||
| 292 | - | ||
| 293 | - return csv_data_list | ||
| 294 | - | ||
| 295 | - | ||
| 296 | -def generate_sql_inserts(spus: list, skus: list, output_file: str): | ||
| 297 | - """ | ||
| 298 | - Generate SQL INSERT statements. | ||
| 299 | - | ||
| 300 | - Args: | ||
| 301 | - spus: List of SPU data | ||
| 302 | - skus: List of SKU data | ||
| 303 | - output_file: Output file path | ||
| 304 | - """ | ||
| 305 | - with open(output_file, 'w', encoding='utf-8') as f: | ||
| 306 | - f.write("-- SPU Data from tenant2 CSV\n") | ||
| 307 | - f.write("INSERT INTO shoplazza_product_spu (\n") | ||
| 308 | - f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n") | ||
| 309 | - f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n") | ||
| 310 | - f.write(" image_src, image_width, image_height, image_path, image_alt,\n") | ||
| 311 | - f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n") | ||
| 312 | - f.write(" published, published_at, requires_shipping, taxable,\n") | ||
| 313 | - f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n") | ||
| 314 | - f.write(" has_only_default_variant, tags, note, category,\n") | ||
| 315 | - f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n") | ||
| 316 | - f.write(" creator, create_time, updater, update_time, deleted\n") | ||
| 317 | - f.write(") VALUES\n") | ||
| 318 | - | ||
| 319 | - for i, spu in enumerate(spus): | ||
| 320 | - values = ( | ||
| 321 | - f"({spu['id']}, {spu['shop_id']}, '{escape_sql_string(spu['shoplazza_id'])}', " | ||
| 322 | - f"'{escape_sql_string(spu['handle'])}', '{escape_sql_string(spu['title'])}', " | ||
| 323 | - f"'{escape_sql_string(spu['brief'])}', '{escape_sql_string(spu['description'])}', " | ||
| 324 | - f"'{escape_sql_string(spu['spu'])}', '{escape_sql_string(spu['vendor'])}', " | ||
| 325 | - f"'{escape_sql_string(spu['vendor_url'])}', '{escape_sql_string(spu['seo_title'])}', " | ||
| 326 | - f"'{escape_sql_string(spu['seo_description'])}', '{escape_sql_string(spu['seo_keywords'])}', " | ||
| 327 | - f"'{escape_sql_string(spu['image_src'])}', {spu['image_width']}, " | ||
| 328 | - f"{spu['image_height']}, '{escape_sql_string(spu['image_path'])}', " | ||
| 329 | - f"'{escape_sql_string(spu['image_alt'])}', '{escape_sql_string(spu['inventory_policy'])}', " | ||
| 330 | - f"{spu['inventory_quantity']}, '{escape_sql_string(spu['inventory_tracking'])}', " | ||
| 331 | - f"{spu['published']}, '{escape_sql_string(spu['published_at'])}', " | ||
| 332 | - f"{spu['requires_shipping']}, {spu['taxable']}, " | ||
| 333 | - f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, " | ||
| 334 | - f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, " | ||
| 335 | - f"'{escape_sql_string(spu['tags'])}', '{escape_sql_string(spu['note'])}', " | ||
| 336 | - f"'{escape_sql_string(spu['category'])}', '{escape_sql_string(spu['shoplazza_created_at'])}', " | ||
| 337 | - f"'{escape_sql_string(spu['shoplazza_updated_at'])}', '{escape_sql_string(spu['tenant_id'])}', " | ||
| 338 | - f"'{escape_sql_string(spu['creator'])}', '{escape_sql_string(spu['create_time'])}', " | ||
| 339 | - f"'{escape_sql_string(spu['updater'])}', '{escape_sql_string(spu['update_time'])}', " | ||
| 340 | - f"{spu['deleted']})" | ||
| 341 | - ) | ||
| 342 | - f.write(values) | ||
| 343 | - if i < len(spus) - 1: | ||
| 344 | - f.write(",\n") | ||
| 345 | - else: | ||
| 346 | - f.write(";\n\n") | ||
| 347 | - | ||
| 348 | - f.write("-- SKU Data from tenant2 CSV\n") | ||
| 349 | - f.write("INSERT INTO shoplazza_product_sku (\n") | ||
| 350 | - f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n") | ||
| 351 | - f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n") | ||
| 352 | - f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n") | ||
| 353 | - f.write(" image_src, wholesale_price, note, extend,\n") | ||
| 354 | - f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n") | ||
| 355 | - f.write(" creator, create_time, updater, update_time, deleted\n") | ||
| 356 | - f.write(") VALUES\n") | ||
| 357 | - | ||
| 358 | - for i, sku in enumerate(skus): | ||
| 359 | - # Handle extend field (JSON, can be NULL) | ||
| 360 | - extend_value = 'NULL' if sku['extend'] is None else f"'{escape_sql_string(sku['extend'])}'" | ||
| 361 | - | ||
| 362 | - values = ( | ||
| 363 | - f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{escape_sql_string(sku['shoplazza_id'])}', " | ||
| 364 | - f"'{escape_sql_string(sku['shoplazza_product_id'])}', '{escape_sql_string(sku['shoplazza_image_id'])}', " | ||
| 365 | - f"'{escape_sql_string(sku['title'])}', '{escape_sql_string(sku['sku'])}', " | ||
| 366 | - f"'{escape_sql_string(sku['barcode'])}', {sku['position']}, " | ||
| 367 | - f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, " | ||
| 368 | - f"'{escape_sql_string(sku['option1'])}', '{escape_sql_string(sku['option2'])}', " | ||
| 369 | - f"'{escape_sql_string(sku['option3'])}', {sku['inventory_quantity']}, {sku['weight']}, " | ||
| 370 | - f"'{escape_sql_string(sku['weight_unit'])}', '{escape_sql_string(sku['image_src'])}', " | ||
| 371 | - f"'{escape_sql_string(sku['wholesale_price'])}', '{escape_sql_string(sku['note'])}', " | ||
| 372 | - f"{extend_value}, '{escape_sql_string(sku['shoplazza_created_at'])}', " | ||
| 373 | - f"'{escape_sql_string(sku['shoplazza_updated_at'])}', '{escape_sql_string(sku['tenant_id'])}', " | ||
| 374 | - f"'{escape_sql_string(sku['creator'])}', '{escape_sql_string(sku['create_time'])}', " | ||
| 375 | - f"'{escape_sql_string(sku['updater'])}', '{escape_sql_string(sku['update_time'])}', " | ||
| 376 | - f"{sku['deleted']})" | ||
| 377 | - ) | ||
| 378 | - f.write(values) | ||
| 379 | - if i < len(skus) - 1: | ||
| 380 | - f.write(",\n") | ||
| 381 | - else: | ||
| 382 | - f.write(";\n") | ||
| 383 | - | ||
| 384 | - | ||
| 385 | -def get_max_ids_from_db(db_config=None): | ||
| 386 | - """ | ||
| 387 | - Get maximum IDs from database to avoid primary key conflicts. | ||
| 388 | - | ||
| 389 | - Args: | ||
| 390 | - db_config: Optional database config dict with keys: host, port, database, username, password | ||
| 391 | - | ||
| 392 | - Returns: | ||
| 393 | - tuple: (max_spu_id, max_sku_id) or (0, 0) if cannot connect | ||
| 394 | - """ | ||
| 395 | - if not db_config: | ||
| 396 | - return 0, 0 | ||
| 397 | - | ||
| 398 | - try: | ||
| 399 | - from utils.db_connector import create_db_connection | ||
| 400 | - from sqlalchemy import text | ||
| 401 | - | ||
| 402 | - db_engine = create_db_connection( | ||
| 403 | - host=db_config['host'], | ||
| 404 | - port=db_config['port'], | ||
| 405 | - database=db_config['database'], | ||
| 406 | - username=db_config['username'], | ||
| 407 | - password=db_config['password'] | ||
| 408 | - ) | ||
| 409 | - | ||
| 410 | - with db_engine.connect() as conn: | ||
| 411 | - result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_spu')) | ||
| 412 | - max_spu_id = result.scalar() or 0 | ||
| 413 | - | ||
| 414 | - result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_sku')) | ||
| 415 | - max_sku_id = result.scalar() or 0 | ||
| 416 | - | ||
| 417 | - return max_spu_id, max_sku_id | ||
| 418 | - except Exception as e: | ||
| 419 | - print(f"Warning: Could not get max IDs from database: {e}") | ||
| 420 | - return 0, 0 | ||
| 421 | - | ||
| 422 | - | ||
| 423 | -def main(): | ||
| 424 | - parser = argparse.ArgumentParser(description='Import tenant2 CSV data into MySQL Shoplazza tables') | ||
| 425 | - parser.add_argument('--csv-file', required=True, help='CSV file path') | ||
| 426 | - parser.add_argument('--tenant-id', default='2', help='Tenant ID (default: 2)') | ||
| 427 | - parser.add_argument('--start-spu-id', type=int, default=None, help='Starting SPU ID (default: auto-calculate from DB)') | ||
| 428 | - parser.add_argument('--output', default='tenant2_data.sql', help='Output SQL file (default: tenant2_data.sql)') | ||
| 429 | - parser.add_argument('--db-host', help='Database host (for auto-calculating start IDs)') | ||
| 430 | - parser.add_argument('--db-port', type=int, default=3306, help='Database port (default: 3306)') | ||
| 431 | - parser.add_argument('--db-database', help='Database name (for auto-calculating start IDs)') | ||
| 432 | - parser.add_argument('--db-username', help='Database username (for auto-calculating start IDs)') | ||
| 433 | - parser.add_argument('--db-password', help='Database password (for auto-calculating start IDs)') | ||
| 434 | - | ||
| 435 | - args = parser.parse_args() | ||
| 436 | - | ||
| 437 | - print(f"Reading CSV file: {args.csv_file}") | ||
| 438 | - csv_data_list = read_csv_file(args.csv_file) | ||
| 439 | - print(f"Read {len(csv_data_list)} rows from CSV") | ||
| 440 | - | ||
| 441 | - # Auto-calculate start IDs if not provided and DB config available | ||
| 442 | - start_spu_id = args.start_spu_id | ||
| 443 | - if start_spu_id is None and args.db_host and args.db_database and args.db_username and args.db_password: | ||
| 444 | - print("Auto-calculating start IDs from database...") | ||
| 445 | - db_config = { | ||
| 446 | - 'host': args.db_host, | ||
| 447 | - 'port': args.db_port, | ||
| 448 | - 'database': args.db_database, | ||
| 449 | - 'username': args.db_username, | ||
| 450 | - 'password': args.db_password | ||
| 451 | - } | ||
| 452 | - max_spu_id, max_sku_id = get_max_ids_from_db(db_config) | ||
| 453 | - start_spu_id = max_spu_id + 1 | ||
| 454 | - print(f" Max SPU ID in DB: {max_spu_id}") | ||
| 455 | - print(f" Using start SPU ID: {start_spu_id}") | ||
| 456 | - elif start_spu_id is None: | ||
| 457 | - start_spu_id = 1 | ||
| 458 | - print(f"Using default start SPU ID: {start_spu_id}") | ||
| 459 | - | ||
| 460 | - # Generate SPU and SKU data | ||
| 461 | - print(f"Generating SPU and SKU data (tenant_id={args.tenant_id})...") | ||
| 462 | - spus = [] | ||
| 463 | - skus = [] | ||
| 464 | - spu_id = start_spu_id | ||
| 465 | - | ||
| 466 | - for csv_data in csv_data_list: | ||
| 467 | - # Generate SPU | ||
| 468 | - spu = generate_spu_data(csv_data, spu_id, args.tenant_id) | ||
| 469 | - spus.append(spu) | ||
| 470 | - | ||
| 471 | - # Generate SKU - use skuId from CSV as SKU ID | ||
| 472 | - try: | ||
| 473 | - sku_id = int(csv_data['skuId']) | ||
| 474 | - except: | ||
| 475 | - # If skuId is not valid, use a generated ID | ||
| 476 | - sku_id = 1000000 + spu_id | ||
| 477 | - | ||
| 478 | - sku = generate_sku_data(csv_data, spu_id, sku_id, args.tenant_id) | ||
| 479 | - skus.append(sku) | ||
| 480 | - | ||
| 481 | - spu_id += 1 | ||
| 482 | - | ||
| 483 | - print(f"Generated {len(spus)} SPUs and {len(skus)} SKUs") | ||
| 484 | - | ||
| 485 | - # Generate SQL file | ||
| 486 | - print(f"Generating SQL file: {args.output}") | ||
| 487 | - generate_sql_inserts(spus, skus, args.output) | ||
| 488 | - print(f"SQL file generated: {args.output}") | ||
| 489 | - print(f" - SPUs: {len(spus)}") | ||
| 490 | - print(f" - SKUs: {len(skus)}") | ||
| 491 | - | ||
| 492 | - | ||
| 493 | -if __name__ == '__main__': | ||
| 494 | - main() | ||
| 495 | - |
scripts/indexer__old_2025_11/import_test_data.py deleted
| @@ -1,277 +0,0 @@ | @@ -1,277 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -Import test data into MySQL Shoplazza tables. | ||
| 4 | - | ||
| 5 | -Reads SQL file generated by generate_test_data.py and imports into MySQL. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -import sys | ||
| 9 | -import os | ||
| 10 | -import argparse | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -# Add parent directory to path | ||
| 14 | -sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 15 | - | ||
| 16 | -from utils.db_connector import create_db_connection, test_connection | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -def import_sql_file(db_engine, sql_file: str): | ||
| 20 | - """ | ||
| 21 | - Import SQL file into database using MySQL client (more reliable for large files). | ||
| 22 | - | ||
| 23 | - Args: | ||
| 24 | - db_engine: SQLAlchemy database engine (used to get connection info) | ||
| 25 | - sql_file: Path to SQL file | ||
| 26 | - """ | ||
| 27 | - import subprocess | ||
| 28 | - import os | ||
| 29 | - from pathlib import Path | ||
| 30 | - | ||
| 31 | - # Get connection info from engine URL | ||
| 32 | - engine_url = str(db_engine.url) | ||
| 33 | - # Parse: mysql+pymysql://user:pass@host:port/database | ||
| 34 | - import re | ||
| 35 | - match = re.match(r'mysql\+pymysql://([^:]+):([^@]+)@([^:]+):(\d+)/(.+)', engine_url) | ||
| 36 | - if not match: | ||
| 37 | - raise ValueError(f"Cannot parse database URL: {engine_url}") | ||
| 38 | - | ||
| 39 | - username, password, host, port, database = match.groups() | ||
| 40 | - | ||
| 41 | - # Use MySQL client to execute SQL file (more reliable) | ||
| 42 | - sql_file_path = Path(sql_file).absolute() | ||
| 43 | - | ||
| 44 | - # Build mysql command | ||
| 45 | - mysql_cmd = [ | ||
| 46 | - 'mysql', | ||
| 47 | - f'-h{host}', | ||
| 48 | - f'-P{port}', | ||
| 49 | - f'-u{username}', | ||
| 50 | - f'-p{password}', | ||
| 51 | - database | ||
| 52 | - ] | ||
| 53 | - | ||
| 54 | - print(f"Executing SQL file using MySQL client...") | ||
| 55 | - print(f" File: {sql_file_path}") | ||
| 56 | - print(f" Database: {host}:{port}/{database}") | ||
| 57 | - | ||
| 58 | - try: | ||
| 59 | - with open(sql_file_path, 'r', encoding='utf-8') as f: | ||
| 60 | - result = subprocess.run( | ||
| 61 | - mysql_cmd, | ||
| 62 | - stdin=f, | ||
| 63 | - capture_output=True, | ||
| 64 | - text=True, | ||
| 65 | - timeout=300 # 5 minute timeout | ||
| 66 | - ) | ||
| 67 | - | ||
| 68 | - if result.returncode != 0: | ||
| 69 | - error_msg = result.stderr or result.stdout | ||
| 70 | - print(f"ERROR: MySQL execution failed") | ||
| 71 | - print(f"Error output: {error_msg[:500]}") | ||
| 72 | - raise Exception(f"MySQL execution failed: {error_msg[:200]}") | ||
| 73 | - | ||
| 74 | - print("SQL file executed successfully") | ||
| 75 | - return True | ||
| 76 | - | ||
| 77 | - except FileNotFoundError: | ||
| 78 | - # Fallback to SQLAlchemy if mysql client not available | ||
| 79 | - print("MySQL client not found, falling back to SQLAlchemy...") | ||
| 80 | - return import_sql_file_sqlalchemy(db_engine, sql_file) | ||
| 81 | - except subprocess.TimeoutExpired: | ||
| 82 | - raise Exception("SQL execution timed out after 5 minutes") | ||
| 83 | - except Exception as e: | ||
| 84 | - print(f"Error using MySQL client: {e}") | ||
| 85 | - print("Falling back to SQLAlchemy...") | ||
| 86 | - return import_sql_file_sqlalchemy(db_engine, sql_file) | ||
| 87 | - | ||
| 88 | - | ||
| 89 | -def import_sql_file_sqlalchemy(db_engine, sql_file: str): | ||
| 90 | - """ | ||
| 91 | - Fallback method: Import SQL file using SQLAlchemy (for when mysql client unavailable). | ||
| 92 | - """ | ||
| 93 | - from sqlalchemy import text | ||
| 94 | - | ||
| 95 | - with open(sql_file, 'r', encoding='utf-8') as f: | ||
| 96 | - sql_content = f.read() | ||
| 97 | - | ||
| 98 | - # Remove comment lines | ||
| 99 | - lines = sql_content.split('\n') | ||
| 100 | - cleaned_lines = [] | ||
| 101 | - for line in lines: | ||
| 102 | - stripped = line.lstrip() | ||
| 103 | - if stripped.startswith('--'): | ||
| 104 | - continue | ||
| 105 | - cleaned_lines.append(line) | ||
| 106 | - | ||
| 107 | - sql_content = '\n'.join(cleaned_lines) | ||
| 108 | - | ||
| 109 | - # Split by semicolon - but we need to handle strings properly | ||
| 110 | - # Use a state machine to track string boundaries | ||
| 111 | - statements = [] | ||
| 112 | - current = [] | ||
| 113 | - in_string = False | ||
| 114 | - i = 0 | ||
| 115 | - | ||
| 116 | - while i < len(sql_content): | ||
| 117 | - char = sql_content[i] | ||
| 118 | - | ||
| 119 | - if char == "'": | ||
| 120 | - # Check for escaped quote (two single quotes) | ||
| 121 | - if i + 1 < len(sql_content) and sql_content[i+1] == "'": | ||
| 122 | - current.append("''") | ||
| 123 | - i += 1 # Skip next quote | ||
| 124 | - elif not in_string: | ||
| 125 | - in_string = True | ||
| 126 | - current.append(char) | ||
| 127 | - else: | ||
| 128 | - in_string = False | ||
| 129 | - current.append(char) | ||
| 130 | - else: | ||
| 131 | - current.append(char) | ||
| 132 | - | ||
| 133 | - # Split on semicolon only if not in string | ||
| 134 | - if char == ';' and not in_string: | ||
| 135 | - stmt = ''.join(current).strip() | ||
| 136 | - if stmt and stmt.upper().startswith('INSERT INTO'): | ||
| 137 | - statements.append(stmt) | ||
| 138 | - current = [] | ||
| 139 | - | ||
| 140 | - i += 1 | ||
| 141 | - | ||
| 142 | - # Handle last statement | ||
| 143 | - if current: | ||
| 144 | - stmt = ''.join(current).strip() | ||
| 145 | - if stmt and stmt.upper().startswith('INSERT INTO'): | ||
| 146 | - statements.append(stmt) | ||
| 147 | - | ||
| 148 | - print(f"Parsed {len(statements)} SQL statements") | ||
| 149 | - print(f"Executing {len(statements)} SQL statements...") | ||
| 150 | - | ||
| 151 | - # Use raw connection to avoid SQLAlchemy parameter parsing | ||
| 152 | - raw_conn = db_engine.raw_connection() | ||
| 153 | - try: | ||
| 154 | - cursor = raw_conn.cursor() | ||
| 155 | - try: | ||
| 156 | - for i, statement in enumerate(statements, 1): | ||
| 157 | - try: | ||
| 158 | - # Execute raw SQL directly using pymysql cursor | ||
| 159 | - cursor.execute(statement) | ||
| 160 | - raw_conn.commit() | ||
| 161 | - if i % 1000 == 0 or i == len(statements): | ||
| 162 | - print(f" [{i}/{len(statements)}] Executed successfully") | ||
| 163 | - except Exception as e: | ||
| 164 | - print(f" [{i}/{len(statements)}] ERROR: {e}") | ||
| 165 | - error_start = max(0, statement.find('VALUES') - 100) | ||
| 166 | - error_end = min(len(statement), error_start + 500) | ||
| 167 | - print(f" Statement context: ...{statement[error_start:error_end]}...") | ||
| 168 | - raise | ||
| 169 | - finally: | ||
| 170 | - cursor.close() | ||
| 171 | - finally: | ||
| 172 | - raw_conn.close() | ||
| 173 | - | ||
| 174 | - return True | ||
| 175 | - | ||
| 176 | - | ||
| 177 | -def verify_import(db_engine, tenant_id: str): | ||
| 178 | - """ | ||
| 179 | - Verify imported data. | ||
| 180 | - | ||
| 181 | - Args: | ||
| 182 | - db_engine: SQLAlchemy database engine | ||
| 183 | - tenant_id: Tenant ID to verify | ||
| 184 | - """ | ||
| 185 | - from sqlalchemy import text | ||
| 186 | - | ||
| 187 | - with db_engine.connect() as conn: | ||
| 188 | - # Count SPUs | ||
| 189 | - result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_spu WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id}) | ||
| 190 | - spu_count = result.scalar() | ||
| 191 | - | ||
| 192 | - # Count SKUs | ||
| 193 | - result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_sku WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id}) | ||
| 194 | - sku_count = result.scalar() | ||
| 195 | - | ||
| 196 | - print(f"\nVerification:") | ||
| 197 | - print(f" SPUs: {spu_count}") | ||
| 198 | - print(f" SKUs: {sku_count}") | ||
| 199 | - | ||
| 200 | - return spu_count, sku_count | ||
| 201 | - | ||
| 202 | - | ||
| 203 | -def main(): | ||
| 204 | - parser = argparse.ArgumentParser(description='Import test data into MySQL') | ||
| 205 | - | ||
| 206 | - # Database connection | ||
| 207 | - parser.add_argument('--db-host', required=True, help='MySQL host') | ||
| 208 | - parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)') | ||
| 209 | - parser.add_argument('--db-database', required=True, help='MySQL database name') | ||
| 210 | - parser.add_argument('--db-username', required=True, help='MySQL username') | ||
| 211 | - parser.add_argument('--db-password', required=True, help='MySQL password') | ||
| 212 | - | ||
| 213 | - # Import options | ||
| 214 | - parser.add_argument('--sql-file', required=True, help='SQL file to import') | ||
| 215 | - parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)') | ||
| 216 | - | ||
| 217 | - args = parser.parse_args() | ||
| 218 | - | ||
| 219 | - print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}") | ||
| 220 | - | ||
| 221 | - # Connect to database | ||
| 222 | - try: | ||
| 223 | - db_engine = create_db_connection( | ||
| 224 | - host=args.db_host, | ||
| 225 | - port=args.db_port, | ||
| 226 | - database=args.db_database, | ||
| 227 | - username=args.db_username, | ||
| 228 | - password=args.db_password | ||
| 229 | - ) | ||
| 230 | - except Exception as e: | ||
| 231 | - print(f"ERROR: Failed to connect to MySQL: {e}") | ||
| 232 | - return 1 | ||
| 233 | - | ||
| 234 | - # Test connection | ||
| 235 | - if not test_connection(db_engine): | ||
| 236 | - print("ERROR: Database connection test failed") | ||
| 237 | - return 1 | ||
| 238 | - | ||
| 239 | - print("Database connection successful") | ||
| 240 | - | ||
| 241 | - # Clean existing data if tenant_id provided | ||
| 242 | - if args.tenant_id: | ||
| 243 | - print(f"\nCleaning existing data for tenant_id: {args.tenant_id}") | ||
| 244 | - from sqlalchemy import text | ||
| 245 | - try: | ||
| 246 | - with db_engine.connect() as conn: | ||
| 247 | - # Delete SKUs first (foreign key constraint) | ||
| 248 | - conn.execute(text(f"DELETE FROM shoplazza_product_sku WHERE tenant_id = '{args.tenant_id}'")) | ||
| 249 | - # Delete SPUs | ||
| 250 | - conn.execute(text(f"DELETE FROM shoplazza_product_spu WHERE tenant_id = '{args.tenant_id}'")) | ||
| 251 | - conn.commit() | ||
| 252 | - print("✓ Existing data cleaned") | ||
| 253 | - except Exception as e: | ||
| 254 | - print(f"⚠ Warning: Failed to clean existing data: {e}") | ||
| 255 | - # Continue anyway | ||
| 256 | - | ||
| 257 | - # Import SQL file | ||
| 258 | - print(f"\nImporting SQL file: {args.sql_file}") | ||
| 259 | - try: | ||
| 260 | - import_sql_file(db_engine, args.sql_file) | ||
| 261 | - print("Import completed successfully") | ||
| 262 | - except Exception as e: | ||
| 263 | - print(f"ERROR: Failed to import SQL file: {e}") | ||
| 264 | - import traceback | ||
| 265 | - traceback.print_exc() | ||
| 266 | - return 1 | ||
| 267 | - | ||
| 268 | - # Verify import if tenant_id provided | ||
| 269 | - if args.tenant_id: | ||
| 270 | - verify_import(db_engine, args.tenant_id) | ||
| 271 | - | ||
| 272 | - return 0 | ||
| 273 | - | ||
| 274 | - | ||
| 275 | -if __name__ == '__main__': | ||
| 276 | - sys.exit(main()) | ||
| 277 | - |
scripts/indexer__old_2025_11/ingest.sh deleted
| @@ -1,92 +0,0 @@ | @@ -1,92 +0,0 @@ | ||
| 1 | -#!/bin/bash | ||
| 2 | - | ||
| 3 | -# Unified data ingestion script for saas-search | ||
| 4 | -# Ingests data from MySQL to Elasticsearch | ||
| 5 | -# | ||
| 6 | -# [LEGACY] 此脚本仅保留用于历史兼容,不建议新流程继续使用。 | ||
| 7 | -# 推荐改用: | ||
| 8 | -# 1) ./scripts/create_tenant_index.sh <tenant_id> | ||
| 9 | -# 2) POST /indexer/reindex | ||
| 10 | - | ||
| 11 | -cd "$(dirname "$0")/.." | ||
| 12 | -source /home/tw/miniconda3/etc/profile.d/conda.sh | ||
| 13 | -conda activate searchengine | ||
| 14 | - | ||
| 15 | -GREEN='\033[0;32m' | ||
| 16 | -YELLOW='\033[1;33m' | ||
| 17 | -RED='\033[0;31m' | ||
| 18 | -NC='\033[0m' | ||
| 19 | - | ||
| 20 | -echo -e "${GREEN}========================================${NC}" | ||
| 21 | -echo -e "${GREEN}数据灌入脚本${NC}" | ||
| 22 | -echo -e "${GREEN}========================================${NC}" | ||
| 23 | - | ||
| 24 | -# Load config from .env file if it exists | ||
| 25 | -if [ -f .env ]; then | ||
| 26 | - set -a | ||
| 27 | - source .env | ||
| 28 | - set +a | ||
| 29 | -fi | ||
| 30 | - | ||
| 31 | -# Parameters | ||
| 32 | -TENANT_ID=${1:-""} | ||
| 33 | -RECREATE_INDEX=${2:-"false"} | ||
| 34 | - | ||
| 35 | -DB_HOST=${DB_HOST:-"120.79.247.228"} | ||
| 36 | -DB_PORT=${DB_PORT:-"3316"} | ||
| 37 | -DB_DATABASE=${DB_DATABASE:-"saas"} | ||
| 38 | -DB_USERNAME=${DB_USERNAME:-"saas"} | ||
| 39 | -DB_PASSWORD=${DB_PASSWORD:-"P89cZHS5d7dFyc9R"} | ||
| 40 | -ES_HOST=${ES_HOST:-"http://localhost:9200"} | ||
| 41 | -BATCH_SIZE=${BATCH_SIZE:-500} | ||
| 42 | - | ||
| 43 | -echo -e "\n${YELLOW}Configuration:${NC}" | ||
| 44 | -echo " Tenant ID: $TENANT_ID" | ||
| 45 | -echo " Recreate Index: $RECREATE_INDEX" | ||
| 46 | -echo " MySQL: $DB_HOST:$DB_PORT/$DB_DATABASE" | ||
| 47 | -echo " Elasticsearch: $ES_HOST" | ||
| 48 | -echo " Batch Size: $BATCH_SIZE" | ||
| 49 | - | ||
| 50 | -# Validate parameters | ||
| 51 | -if [ -z "$TENANT_ID" ]; then | ||
| 52 | - echo -e "${RED}ERROR: Tenant ID is required${NC}" | ||
| 53 | - echo "Usage: $0 <tenant_id> [recreate_index]" | ||
| 54 | - echo " tenant_id: Required, tenant ID" | ||
| 55 | - echo " recreate_index: Optional, recreate index if exists (true/false, default: false)" | ||
| 56 | - exit 1 | ||
| 57 | -fi | ||
| 58 | - | ||
| 59 | -if [ -z "$DB_PASSWORD" ]; then | ||
| 60 | - echo -e "${RED}ERROR: DB_PASSWORD未设置,请检查.env文件或环境变量${NC}" | ||
| 61 | - exit 1 | ||
| 62 | -fi | ||
| 63 | - | ||
| 64 | -# Build command | ||
| 65 | -CMD="python scripts/ingest_shoplazza.py \ | ||
| 66 | - --db-host $DB_HOST \ | ||
| 67 | - --db-port $DB_PORT \ | ||
| 68 | - --db-database $DB_DATABASE \ | ||
| 69 | - --db-username $DB_USERNAME \ | ||
| 70 | - --db-password $DB_PASSWORD \ | ||
| 71 | - --tenant-id $TENANT_ID \ | ||
| 72 | - --es-host $ES_HOST \ | ||
| 73 | - --batch-size $BATCH_SIZE" | ||
| 74 | - | ||
| 75 | -if [ "$RECREATE_INDEX" = "true" ] || [ "$RECREATE_INDEX" = "1" ]; then | ||
| 76 | - CMD="$CMD --recreate" | ||
| 77 | - echo -e "\n${YELLOW}Warning: Index will be deleted and recreated!${NC}" | ||
| 78 | -fi | ||
| 79 | - | ||
| 80 | -echo -e "\n${YELLOW}Starting data ingestion...${NC}" | ||
| 81 | -eval $CMD | ||
| 82 | - | ||
| 83 | -if [ $? -eq 0 ]; then | ||
| 84 | - echo -e "\n${GREEN}========================================${NC}" | ||
| 85 | - echo -e "${GREEN}数据灌入完成!${NC}" | ||
| 86 | - echo -e "${GREEN}========================================${NC}" | ||
| 87 | -else | ||
| 88 | - echo -e "\n${RED}========================================${NC}" | ||
| 89 | - echo -e "${RED}数据灌入失败!${NC}" | ||
| 90 | - echo -e "${RED}========================================${NC}" | ||
| 91 | - exit 1 | ||
| 92 | -fi |
scripts/indexer__old_2025_11/ingest_shoplazza.py deleted
| @@ -1,146 +0,0 @@ | @@ -1,146 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -Shoplazza data ingestion script. | ||
| 4 | - | ||
| 5 | -Loads SPU and SKU data from MySQL and indexes into Elasticsearch using SPU transformer. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -import sys | ||
| 9 | -import os | ||
| 10 | -import argparse | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -# Add parent directory to path | ||
| 14 | -sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 15 | - | ||
| 16 | -from utils.db_connector import create_db_connection | ||
| 17 | -from utils.es_client import ESClient | ||
| 18 | -from indexer.spu_transformer import SPUTransformer | ||
| 19 | -from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME | ||
| 20 | -from indexer.bulk_indexer import BulkIndexer | ||
| 21 | - | ||
| 22 | - | ||
| 23 | -def main(): | ||
| 24 | - parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch') | ||
| 25 | - | ||
| 26 | - # Database connection | ||
| 27 | - parser.add_argument('--db-host', required=True, help='MySQL host') | ||
| 28 | - parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)') | ||
| 29 | - parser.add_argument('--db-database', required=True, help='MySQL database name') | ||
| 30 | - parser.add_argument('--db-username', required=True, help='MySQL username') | ||
| 31 | - parser.add_argument('--db-password', required=True, help='MySQL password') | ||
| 32 | - | ||
| 33 | - # Tenant and index | ||
| 34 | - parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)') | ||
| 35 | - parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host') | ||
| 36 | - | ||
| 37 | - # Options | ||
| 38 | - parser.add_argument('--recreate', action='store_true', help='Recreate index if exists') | ||
| 39 | - parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)') | ||
| 40 | - | ||
| 41 | - args = parser.parse_args() | ||
| 42 | - | ||
| 43 | - print(f"Starting Shoplazza data ingestion for tenant: {args.tenant_id}") | ||
| 44 | - | ||
| 45 | - # Load mapping from JSON file | ||
| 46 | - try: | ||
| 47 | - mapping = load_mapping() | ||
| 48 | - print(f"Loaded mapping configuration") | ||
| 49 | - except Exception as e: | ||
| 50 | - print(f"ERROR: Failed to load mapping: {e}") | ||
| 51 | - return 1 | ||
| 52 | - | ||
| 53 | - index_name = DEFAULT_INDEX_NAME | ||
| 54 | - | ||
| 55 | - # Connect to MySQL | ||
| 56 | - print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}") | ||
| 57 | - try: | ||
| 58 | - db_engine = create_db_connection( | ||
| 59 | - host=args.db_host, | ||
| 60 | - port=args.db_port, | ||
| 61 | - database=args.db_database, | ||
| 62 | - username=args.db_username, | ||
| 63 | - password=args.db_password | ||
| 64 | - ) | ||
| 65 | - except Exception as e: | ||
| 66 | - print(f"ERROR: Failed to connect to MySQL: {e}") | ||
| 67 | - return 1 | ||
| 68 | - | ||
| 69 | - # Connect to Elasticsearch | ||
| 70 | - es_host = args.es_host | ||
| 71 | - es_username = os.environ.get('ES_USERNAME') | ||
| 72 | - es_password = os.environ.get('ES_PASSWORD') | ||
| 73 | - | ||
| 74 | - print(f"Connecting to Elasticsearch: {es_host}") | ||
| 75 | - if es_username and es_password: | ||
| 76 | - print(f"Using authentication: {es_username}") | ||
| 77 | - es_client = ESClient(hosts=[es_host], username=es_username, password=es_password) | ||
| 78 | - else: | ||
| 79 | - es_client = ESClient(hosts=[es_host]) | ||
| 80 | - | ||
| 81 | - if not es_client.ping(): | ||
| 82 | - print(f"ERROR: Cannot connect to Elasticsearch at {es_host}") | ||
| 83 | - return 1 | ||
| 84 | - | ||
| 85 | - # Create index if needed | ||
| 86 | - if args.recreate: | ||
| 87 | - if es_client.index_exists(index_name): | ||
| 88 | - print(f"Deleting existing index: {index_name}") | ||
| 89 | - if not es_client.delete_index(index_name): | ||
| 90 | - print(f"ERROR: Failed to delete index '{index_name}'") | ||
| 91 | - return 1 | ||
| 92 | - | ||
| 93 | - if not es_client.index_exists(index_name): | ||
| 94 | - print(f"Creating index: {index_name}") | ||
| 95 | - if not es_client.create_index(index_name, mapping): | ||
| 96 | - print(f"ERROR: Failed to create index '{index_name}'") | ||
| 97 | - print("Please check the mapping configuration and try again.") | ||
| 98 | - return 1 | ||
| 99 | - else: | ||
| 100 | - print(f"Using existing index: {index_name}") | ||
| 101 | - | ||
| 102 | - # Initialize SPU transformer | ||
| 103 | - print(f"Initializing SPU transformer for tenant: {args.tenant_id}") | ||
| 104 | - transformer = SPUTransformer(db_engine, args.tenant_id) | ||
| 105 | - | ||
| 106 | - # Transform data | ||
| 107 | - print("Transforming SPU and SKU data...") | ||
| 108 | - try: | ||
| 109 | - documents = transformer.transform_batch() | ||
| 110 | - print(f"Transformed {len(documents)} SPU documents") | ||
| 111 | - except Exception as e: | ||
| 112 | - print(f"ERROR: Failed to transform data: {e}") | ||
| 113 | - import traceback | ||
| 114 | - traceback.print_exc() | ||
| 115 | - return 1 | ||
| 116 | - | ||
| 117 | - if not documents: | ||
| 118 | - print("WARNING: No documents to index") | ||
| 119 | - return 0 | ||
| 120 | - | ||
| 121 | - # Bulk index | ||
| 122 | - print(f"Indexing {len(documents)} documents (batch size: {args.batch_size})...") | ||
| 123 | - indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size) | ||
| 124 | - | ||
| 125 | - try: | ||
| 126 | - results = indexer.index_documents(documents, id_field="spu_id", show_progress=True) | ||
| 127 | - print(f"\nIngestion complete:") | ||
| 128 | - print(f" Success: {results['success']}") | ||
| 129 | - print(f" Failed: {results['failed']}") | ||
| 130 | - print(f" Time: {results.get('elapsed_time', 0):.2f}s") | ||
| 131 | - | ||
| 132 | - if results['failed'] > 0: | ||
| 133 | - print(f"\nWARNING: {results['failed']} documents failed to index") | ||
| 134 | - return 1 | ||
| 135 | - | ||
| 136 | - return 0 | ||
| 137 | - except Exception as e: | ||
| 138 | - print(f"ERROR: Failed to index documents: {e}") | ||
| 139 | - import traceback | ||
| 140 | - traceback.print_exc() | ||
| 141 | - return 1 | ||
| 142 | - | ||
| 143 | - | ||
| 144 | -if __name__ == '__main__': | ||
| 145 | - sys.exit(main()) | ||
| 146 | - |
scripts/indexer__old_2025_11/recreate_and_import.py deleted
| @@ -1,184 +0,0 @@ | @@ -1,184 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -重建索引并导入数据的脚本。 | ||
| 4 | - | ||
| 5 | -清除旧索引,使用新的mapping重建索引,然后导入数据。 | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -import sys | ||
| 9 | -import os | ||
| 10 | -import argparse | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -# Add parent directory to path | ||
| 14 | -sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 15 | - | ||
| 16 | -from utils.db_connector import create_db_connection | ||
| 17 | -from utils.es_client import ESClient | ||
| 18 | -from indexer.mapping_generator import load_mapping, delete_index_if_exists, DEFAULT_INDEX_NAME | ||
| 19 | -from indexer.spu_transformer import SPUTransformer | ||
| 20 | -from indexer.bulk_indexer import BulkIndexer | ||
| 21 | - | ||
| 22 | - | ||
| 23 | -def main(): | ||
| 24 | - parser = argparse.ArgumentParser(description='重建ES索引并导入数据') | ||
| 25 | - | ||
| 26 | - # Database connection | ||
| 27 | - parser.add_argument('--db-host', help='MySQL host (或使用环境变量 DB_HOST)') | ||
| 28 | - parser.add_argument('--db-port', type=int, help='MySQL port (或使用环境变量 DB_PORT, 默认: 3306)') | ||
| 29 | - parser.add_argument('--db-database', help='MySQL database (或使用环境变量 DB_DATABASE)') | ||
| 30 | - parser.add_argument('--db-username', help='MySQL username (或使用环境变量 DB_USERNAME)') | ||
| 31 | - parser.add_argument('--db-password', help='MySQL password (或使用环境变量 DB_PASSWORD)') | ||
| 32 | - | ||
| 33 | - # Tenant and ES | ||
| 34 | - parser.add_argument('--tenant-id', required=True, help='Tenant ID (必需)') | ||
| 35 | - parser.add_argument('--es-host', help='Elasticsearch host (或使用环境变量 ES_HOST, 默认: http://localhost:9200)') | ||
| 36 | - | ||
| 37 | - # Options | ||
| 38 | - parser.add_argument('--batch-size', type=int, default=500, help='批量导入大小 (默认: 500)') | ||
| 39 | - parser.add_argument('--skip-delete', action='store_true', help='跳过删除旧索引步骤') | ||
| 40 | - | ||
| 41 | - args = parser.parse_args() | ||
| 42 | - | ||
| 43 | - print("=" * 60) | ||
| 44 | - print("重建ES索引并导入数据") | ||
| 45 | - print("=" * 60) | ||
| 46 | - | ||
| 47 | - # 加载mapping | ||
| 48 | - print("\n[1/4] 加载mapping配置...") | ||
| 49 | - try: | ||
| 50 | - mapping = load_mapping() | ||
| 51 | - print(f"✓ 成功加载mapping配置") | ||
| 52 | - except Exception as e: | ||
| 53 | - print(f"✗ 加载mapping失败: {e}") | ||
| 54 | - return 1 | ||
| 55 | - | ||
| 56 | - index_name = DEFAULT_INDEX_NAME | ||
| 57 | - print(f"索引名称: {index_name}") | ||
| 58 | - | ||
| 59 | - # 连接Elasticsearch | ||
| 60 | - print("\n[2/4] 连接Elasticsearch...") | ||
| 61 | - es_host = args.es_host or os.environ.get('ES_HOST', 'http://localhost:9200') | ||
| 62 | - es_username = os.environ.get('ES_USERNAME') | ||
| 63 | - es_password = os.environ.get('ES_PASSWORD') | ||
| 64 | - | ||
| 65 | - print(f"ES地址: {es_host}") | ||
| 66 | - if es_username: | ||
| 67 | - print(f"ES用户名: {es_username}") | ||
| 68 | - | ||
| 69 | - try: | ||
| 70 | - if es_username and es_password: | ||
| 71 | - es_client = ESClient(hosts=[es_host], username=es_username, password=es_password) | ||
| 72 | - else: | ||
| 73 | - es_client = ESClient(hosts=[es_host]) | ||
| 74 | - | ||
| 75 | - if not es_client.ping(): | ||
| 76 | - print(f"✗ 无法连接到Elasticsearch: {es_host}") | ||
| 77 | - return 1 | ||
| 78 | - print("✓ Elasticsearch连接成功") | ||
| 79 | - except Exception as e: | ||
| 80 | - print(f"✗ 连接Elasticsearch失败: {e}") | ||
| 81 | - return 1 | ||
| 82 | - | ||
| 83 | - # 删除旧索引 | ||
| 84 | - if not args.skip_delete: | ||
| 85 | - print("\n[3/4] 删除旧索引...") | ||
| 86 | - if es_client.index_exists(index_name): | ||
| 87 | - print(f"发现已存在的索引: {index_name}") | ||
| 88 | - if delete_index_if_exists(es_client, index_name): | ||
| 89 | - print(f"✓ 成功删除索引: {index_name}") | ||
| 90 | - else: | ||
| 91 | - print(f"✗ 删除索引失败: {index_name}") | ||
| 92 | - return 1 | ||
| 93 | - else: | ||
| 94 | - print(f"索引不存在,跳过删除: {index_name}") | ||
| 95 | - else: | ||
| 96 | - print("\n[3/4] 跳过删除旧索引步骤") | ||
| 97 | - | ||
| 98 | - # 创建新索引 | ||
| 99 | - print("\n[4/4] 创建新索引...") | ||
| 100 | - try: | ||
| 101 | - if es_client.index_exists(index_name): | ||
| 102 | - print(f"✓ 索引已存在: {index_name},跳过创建") | ||
| 103 | - else: | ||
| 104 | - print(f"创建索引: {index_name}") | ||
| 105 | - if es_client.create_index(index_name, mapping): | ||
| 106 | - print(f"✓ 成功创建索引: {index_name}") | ||
| 107 | - else: | ||
| 108 | - print(f"✗ 创建索引失败: {index_name}") | ||
| 109 | - return 1 | ||
| 110 | - except Exception as e: | ||
| 111 | - print(f"✗ 创建索引失败: {e}") | ||
| 112 | - import traceback | ||
| 113 | - traceback.print_exc() | ||
| 114 | - return 1 | ||
| 115 | - | ||
| 116 | - # 连接MySQL | ||
| 117 | - print("\n[5/5] 连接MySQL...") | ||
| 118 | - db_host = args.db_host or os.environ.get('DB_HOST') | ||
| 119 | - db_port = args.db_port or int(os.environ.get('DB_PORT', 3306)) | ||
| 120 | - db_database = args.db_database or os.environ.get('DB_DATABASE') | ||
| 121 | - db_username = args.db_username or os.environ.get('DB_USERNAME') | ||
| 122 | - db_password = args.db_password or os.environ.get('DB_PASSWORD') | ||
| 123 | - | ||
| 124 | - if not all([db_host, db_database, db_username, db_password]): | ||
| 125 | - print("✗ MySQL连接参数不完整") | ||
| 126 | - print("请提供 --db-host, --db-database, --db-username, --db-password") | ||
| 127 | - print("或设置环境变量: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD") | ||
| 128 | - return 1 | ||
| 129 | - | ||
| 130 | - print(f"MySQL: {db_host}:{db_port}/{db_database}") | ||
| 131 | - try: | ||
| 132 | - db_engine = create_db_connection( | ||
| 133 | - host=db_host, | ||
| 134 | - port=db_port, | ||
| 135 | - database=db_database, | ||
| 136 | - username=db_username, | ||
| 137 | - password=db_password | ||
| 138 | - ) | ||
| 139 | - print("✓ MySQL连接成功") | ||
| 140 | - except Exception as e: | ||
| 141 | - print(f"✗ 连接MySQL失败: {e}") | ||
| 142 | - return 1 | ||
| 143 | - | ||
| 144 | - # 导入数据 | ||
| 145 | - print("\n[6/6] 导入数据...") | ||
| 146 | - print(f"Tenant ID: {args.tenant_id}") | ||
| 147 | - print(f"批量大小: {args.batch_size}") | ||
| 148 | - | ||
| 149 | - try: | ||
| 150 | - transformer = SPUTransformer(db_engine, args.tenant_id) | ||
| 151 | - print("正在转换数据...") | ||
| 152 | - documents = transformer.transform_batch() | ||
| 153 | - print(f"✓ 转换完成: {len(documents)} 个文档") | ||
| 154 | - | ||
| 155 | - if not documents: | ||
| 156 | - print("⚠ 没有数据需要导入") | ||
| 157 | - return 0 | ||
| 158 | - | ||
| 159 | - print(f"正在导入数据到ES (批量大小: {args.batch_size})...") | ||
| 160 | - indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size) | ||
| 161 | - results = indexer.index_documents(documents, id_field="spu_id", show_progress=True) | ||
| 162 | - | ||
| 163 | - print(f"\n{'='*60}") | ||
| 164 | - print("导入完成!") | ||
| 165 | - print(f"{'='*60}") | ||
| 166 | - print(f"成功: {results['success']}") | ||
| 167 | - print(f"失败: {results['failed']}") | ||
| 168 | - print(f"耗时: {results.get('elapsed_time', 0):.2f}秒") | ||
| 169 | - | ||
| 170 | - if results['failed'] > 0: | ||
| 171 | - print(f"\n⚠ 警告: {results['failed']} 个文档导入失败") | ||
| 172 | - return 1 | ||
| 173 | - | ||
| 174 | - return 0 | ||
| 175 | - except Exception as e: | ||
| 176 | - print(f"✗ 导入数据失败: {e}") | ||
| 177 | - import traceback | ||
| 178 | - traceback.print_exc() | ||
| 179 | - return 1 | ||
| 180 | - | ||
| 181 | - | ||
| 182 | -if __name__ == '__main__': | ||
| 183 | - sys.exit(main()) | ||
| 184 | - |
scripts/install_server_deps.sh deleted
| @@ -1,14 +0,0 @@ | @@ -1,14 +0,0 @@ | ||
| 1 | -#!/bin/bash | ||
| 2 | - | ||
| 3 | -echo "Installing server security dependencies..." | ||
| 4 | - | ||
| 5 | -# Check if we're in a conda environment | ||
| 6 | -if [ -z "$CONDA_DEFAULT_ENV" ]; then | ||
| 7 | - echo "Warning: No conda environment detected. Installing with pip..." | ||
| 8 | - pip install slowapi>=0.1.9 anyio>=3.7.0 | ||
| 9 | -else | ||
| 10 | - echo "Installing in conda environment: $CONDA_DEFAULT_ENV" | ||
| 11 | - pip install slowapi>=0.1.9 anyio>=3.7.0 | ||
| 12 | -fi | ||
| 13 | - | ||
| 14 | -echo "Dependencies installed successfully!" | ||
| 15 | \ No newline at end of file | 0 | \ No newline at end of file |
scripts/reindex_from_remote_tenant_170_to_0.sh deleted
| @@ -1,99 +0,0 @@ | @@ -1,99 +0,0 @@ | ||
| 1 | -#!/bin/bash | ||
| 2 | -# | ||
| 3 | -# 从远程 ES 的 search_products_tenant_170 同步 10000 条到本机 search_products_tenant_0。 | ||
| 4 | -# 请求发往本机 ES,由本机去拉远程数据;需在本机 elasticsearch.yml 配置 reindex.remote.whitelist。 | ||
| 5 | -# | ||
| 6 | -# 用法: | ||
| 7 | -# ./scripts/reindex_from_remote_tenant_170_to_0.sh | ||
| 8 | -# | ||
| 9 | -# 环境变量(可选): | ||
| 10 | -# LOCAL_ES_HOST 本机 ES 地址,用于创建索引和发送 _reindex(默认从 .env 的 ES_HOST 读取,应为本机) | ||
| 11 | -# REMOTE_ES_HOST 远程 ES 地址(默认 http://120.76.41.98:9200) | ||
| 12 | -# REMOTE_ES_USER 远程 ES 用户名(默认 essa) | ||
| 13 | -# REMOTE_ES_PASS 远程 ES 密码(默认 4hOaLaf41y2VuI8y) | ||
| 14 | -# MAX_DOCS 同步条数(默认 10000) | ||
| 15 | -# | ||
| 16 | - | ||
| 17 | -set -e | ||
| 18 | - | ||
| 19 | -cd "$(dirname "$0")/.." | ||
| 20 | -PROJECT_ROOT="$(pwd)" | ||
| 21 | - | ||
| 22 | -# 加载 .env | ||
| 23 | -# shellcheck source=scripts/lib/load_env.sh | ||
| 24 | -source "${PROJECT_ROOT}/scripts/lib/load_env.sh" | ||
| 25 | -load_env_file "${PROJECT_ROOT}/.env" | ||
| 26 | - | ||
| 27 | -# 本机 ES(发 _reindex 请求的目标) | ||
| 28 | -LOCAL_ES_HOST="${LOCAL_ES_HOST:-${ES_HOST:-http://localhost:9200}}" | ||
| 29 | -ES_USERNAME="${ES_USERNAME:-}" | ||
| 30 | -ES_PASSWORD="${ES_PASSWORD:-}" | ||
| 31 | -ES_INDEX_NAMESPACE="${ES_INDEX_NAMESPACE:-}" | ||
| 32 | - | ||
| 33 | -# 远程 ES(数据源) | ||
| 34 | -REMOTE_ES_HOST="${REMOTE_ES_HOST:-http://120.76.41.98:9200}" | ||
| 35 | -REMOTE_ES_USER="${REMOTE_ES_USER:-essa}" | ||
| 36 | -REMOTE_ES_PASS="${REMOTE_ES_PASS:-4hOaLaf41y2VuI8y}" | ||
| 37 | - | ||
| 38 | -MAX_DOCS="${MAX_DOCS:-10000}" | ||
| 39 | -SOURCE_INDEX="search_products_tenant_170" | ||
| 40 | -DEST_INDEX="${ES_INDEX_NAMESPACE}search_products_tenant_0" | ||
| 41 | -MAPPING_FILE="${PROJECT_ROOT}/mappings/search_products.json" | ||
| 42 | - | ||
| 43 | -# 本机 curl 认证 | ||
| 44 | -AUTH_PARAM="" | ||
| 45 | -if [ -n "$ES_USERNAME" ] && [ -n "$ES_PASSWORD" ]; then | ||
| 46 | - AUTH_PARAM="-u ${ES_USERNAME}:${ES_PASSWORD}" | ||
| 47 | -fi | ||
| 48 | - | ||
| 49 | -echo "本机 ES: $LOCAL_ES_HOST" | ||
| 50 | -echo "远程 ES: $REMOTE_ES_HOST" | ||
| 51 | -echo "源索引: $SOURCE_INDEX" | ||
| 52 | -echo "目标索引: $DEST_INDEX" | ||
| 53 | -echo "同步条数: $MAX_DOCS" | ||
| 54 | -echo "" | ||
| 55 | - | ||
| 56 | -# 1. 若目标索引不存在,则创建 | ||
| 57 | -if ! curl -s $AUTH_PARAM "${LOCAL_ES_HOST}/${DEST_INDEX}" -o /dev/null -w "%{http_code}" | grep -q 200; then | ||
| 58 | - echo "创建目标索引: $DEST_INDEX" | ||
| 59 | - if [ ! -f "$MAPPING_FILE" ]; then | ||
| 60 | - echo "错误: mapping 文件不存在: $MAPPING_FILE" | ||
| 61 | - exit 1 | ||
| 62 | - fi | ||
| 63 | - curl -X PUT "${LOCAL_ES_HOST}/${DEST_INDEX}" \ | ||
| 64 | - -H "Content-Type: application/json" \ | ||
| 65 | - $AUTH_PARAM \ | ||
| 66 | - -d @"${MAPPING_FILE}" \ | ||
| 67 | - -w "\nHTTP: %{http_code}\n" -s | tail -1 | ||
| 68 | - echo "" | ||
| 69 | -else | ||
| 70 | - echo "目标索引已存在: $DEST_INDEX,将写入数据(可能覆盖同 id 文档)" | ||
| 71 | -fi | ||
| 72 | - | ||
| 73 | -# 2. Reindex from remote(JSON 中的密码用 env 传入,避免 shell 转义) | ||
| 74 | -echo "执行 Reindex from remote(最多 $MAX_DOCS 条)..." | ||
| 75 | -export REMOTE_ES_HOST REMOTE_ES_USER REMOTE_ES_PASS SOURCE_INDEX DEST_INDEX MAX_DOCS | ||
| 76 | -# ES 9.x 将 wait_for_completion 放在 query 参数,不在 body | ||
| 77 | -curl -X POST "${LOCAL_ES_HOST}/_reindex?wait_for_completion=true&pretty" \ | ||
| 78 | - -H "Content-Type: application/json" \ | ||
| 79 | - $AUTH_PARAM \ | ||
| 80 | - -d @- <<EOF | ||
| 81 | -{ | ||
| 82 | - "max_docs": ${MAX_DOCS}, | ||
| 83 | - "source": { | ||
| 84 | - "remote": { | ||
| 85 | - "host": "${REMOTE_ES_HOST}", | ||
| 86 | - "username": "${REMOTE_ES_USER}", | ||
| 87 | - "password": "${REMOTE_ES_PASS}" | ||
| 88 | - }, | ||
| 89 | - "index": "${SOURCE_INDEX}", | ||
| 90 | - "size": 500 | ||
| 91 | - }, | ||
| 92 | - "dest": { | ||
| 93 | - "index": "${DEST_INDEX}" | ||
| 94 | - } | ||
| 95 | -} | ||
| 96 | -EOF | ||
| 97 | - | ||
| 98 | -echo "" | ||
| 99 | -echo "完成。校验条数: curl $AUTH_PARAM '${LOCAL_ES_HOST}/${DEST_INDEX}/_count?pretty' -H 'Content-Type: application/json' -d '{\"query\":{\"match_all\":{}}}'" |
scripts/start.sh deleted
scripts/test_build_docs_api.py renamed to tests/manual/test_build_docs_api.py
| @@ -4,9 +4,9 @@ | @@ -4,9 +4,9 @@ | ||
| 4 | 4 | ||
| 5 | 用法: | 5 | 用法: |
| 6 | 1. 先启动 Indexer 服务: ./scripts/start_indexer.sh (或 uvicorn api.indexer_app:app --port 6004) | 6 | 1. 先启动 Indexer 服务: ./scripts/start_indexer.sh (或 uvicorn api.indexer_app:app --port 6004) |
| 7 | - 2. 执行: python scripts/test_build_docs_api.py | 7 | + 2. 执行: python tests/manual/test_build_docs_api.py |
| 8 | 8 | ||
| 9 | - 也可指定地址: INDEXER_URL=http://localhost:6004 python scripts/test_build_docs_api.py | 9 | + 也可指定地址: INDEXER_URL=http://localhost:6004 python tests/manual/test_build_docs_api.py |
| 10 | """ | 10 | """ |
| 11 | 11 | ||
| 12 | import json | 12 | import json |
| @@ -15,7 +15,7 @@ import sys | @@ -15,7 +15,7 @@ import sys | ||
| 15 | from datetime import datetime, timezone | 15 | from datetime import datetime, timezone |
| 16 | 16 | ||
| 17 | # 项目根目录 | 17 | # 项目根目录 |
| 18 | -ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | 18 | +ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 19 | sys.path.insert(0, ROOT) | 19 | sys.path.insert(0, ROOT) |
| 20 | 20 | ||
| 21 | # 默认使用 requests 调真实服务;若未安装则回退到 TestClient | 21 | # 默认使用 requests 调真实服务;若未安装则回退到 TestClient |
| @@ -122,7 +122,7 @@ def main(): | @@ -122,7 +122,7 @@ def main(): | ||
| 122 | print("\n[错误] 无法连接 Indexer 服务:", e) | 122 | print("\n[错误] 无法连接 Indexer 服务:", e) |
| 123 | print("请先启动: ./scripts/start_indexer.sh 或 uvicorn api.indexer_app:app --port 6004") | 123 | print("请先启动: ./scripts/start_indexer.sh 或 uvicorn api.indexer_app:app --port 6004") |
| 124 | if HAS_REQUESTS: | 124 | if HAS_REQUESTS: |
| 125 | - print("或使用进程内测试: USE_TEST_CLIENT=1 python scripts/test_build_docs_api.py") | 125 | + print("或使用进程内测试: USE_TEST_CLIENT=1 python tests/manual/test_build_docs_api.py") |
| 126 | sys.exit(1) | 126 | sys.exit(1) |
| 127 | else: | 127 | else: |
| 128 | if not use_http and not HAS_REQUESTS: | 128 | if not use_http and not HAS_REQUESTS: |
translation/README.md
| @@ -12,8 +12,8 @@ | @@ -12,8 +12,8 @@ | ||
| 12 | - 启动脚本:[`scripts/start_translator.sh`](/data/saas-search/scripts/start_translator.sh) | 12 | - 启动脚本:[`scripts/start_translator.sh`](/data/saas-search/scripts/start_translator.sh) |
| 13 | - 虚拟环境:[`scripts/setup_translator_venv.sh`](/data/saas-search/scripts/setup_translator_venv.sh) | 13 | - 虚拟环境:[`scripts/setup_translator_venv.sh`](/data/saas-search/scripts/setup_translator_venv.sh) |
| 14 | - 模型下载:[`scripts/download_translation_models.py`](/data/saas-search/scripts/download_translation_models.py) | 14 | - 模型下载:[`scripts/download_translation_models.py`](/data/saas-search/scripts/download_translation_models.py) |
| 15 | -- 本地模型压测:[`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) | ||
| 16 | -- 聚焦压测脚本:[`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) | 15 | +- 本地模型压测:[`benchmarks/translation/benchmark_translation_local_models.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models.py) |
| 16 | +- 聚焦压测脚本:[`benchmarks/translation/benchmark_translation_local_models_focus.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models_focus.py) | ||
| 17 | - 基线性能报告:[`perf_reports/20260318/translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) | 17 | - 基线性能报告:[`perf_reports/20260318/translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) |
| 18 | - CT2 扩展报告:[`perf_reports/20260318/translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) | 18 | - CT2 扩展报告:[`perf_reports/20260318/translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) |
| 19 | - CT2 聚焦调优报告:[`perf_reports/20260318/translation_local_models_ct2_focus/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/README.md) | 19 | - CT2 聚焦调优报告:[`perf_reports/20260318/translation_local_models_ct2_focus/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/README.md) |
| @@ -550,8 +550,8 @@ curl -X POST http://127.0.0.1:6006/translate \ | @@ -550,8 +550,8 @@ curl -X POST http://127.0.0.1:6006/translate \ | ||
| 550 | - 切换到 CTranslate2 后需要重新跑一轮基准,尤其关注 `nllb-200-distilled-600m` 的单条延迟、并发 tail latency 和 `opus-mt-*` 的 batch throughput。 | 550 | - 切换到 CTranslate2 后需要重新跑一轮基准,尤其关注 `nllb-200-distilled-600m` 的单条延迟、并发 tail latency 和 `opus-mt-*` 的 batch throughput。 |
| 551 | 551 | ||
| 552 | 性能脚本: | 552 | 性能脚本: |
| 553 | -- [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) | ||
| 554 | -- [`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) | 553 | +- [`benchmarks/translation/benchmark_translation_local_models.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models.py) |
| 554 | +- [`benchmarks/translation/benchmark_translation_local_models_focus.py`](/data/saas-search/benchmarks/translation/benchmark_translation_local_models_focus.py) | ||
| 555 | 555 | ||
| 556 | 数据集: | 556 | 数据集: |
| 557 | - [`products_analyzed.csv`](/data/saas-search/products_analyzed.csv) | 557 | - [`products_analyzed.csv`](/data/saas-search/products_analyzed.csv) |
| @@ -601,14 +601,14 @@ curl -X POST http://127.0.0.1:6006/translate \ | @@ -601,14 +601,14 @@ curl -X POST http://127.0.0.1:6006/translate \ | ||
| 601 | 601 | ||
| 602 | ```bash | 602 | ```bash |
| 603 | cd /data/saas-search | 603 | cd /data/saas-search |
| 604 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py | 604 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py |
| 605 | ``` | 605 | ``` |
| 606 | 606 | ||
| 607 | 本轮扩展压测复现命令: | 607 | 本轮扩展压测复现命令: |
| 608 | 608 | ||
| 609 | ```bash | 609 | ```bash |
| 610 | cd /data/saas-search | 610 | cd /data/saas-search |
| 611 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | 611 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py \ |
| 612 | --suite extended \ | 612 | --suite extended \ |
| 613 | --disable-cache \ | 613 | --disable-cache \ |
| 614 | --serial-items-per-case 256 \ | 614 | --serial-items-per-case 256 \ |
| @@ -620,7 +620,7 @@ cd /data/saas-search | @@ -620,7 +620,7 @@ cd /data/saas-search | ||
| 620 | 单模型扩展压测示例: | 620 | 单模型扩展压测示例: |
| 621 | 621 | ||
| 622 | ```bash | 622 | ```bash |
| 623 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | 623 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py \ |
| 624 | --single \ | 624 | --single \ |
| 625 | --suite extended \ | 625 | --suite extended \ |
| 626 | --model opus-mt-zh-en \ | 626 | --model opus-mt-zh-en \ |
| @@ -639,7 +639,7 @@ cd /data/saas-search | @@ -639,7 +639,7 @@ cd /data/saas-search | ||
| 639 | 单条请求延迟复现: | 639 | 单条请求延迟复现: |
| 640 | 640 | ||
| 641 | ```bash | 641 | ```bash |
| 642 | -./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | 642 | +./.venv-translator/bin/python benchmarks/translation/benchmark_translation_local_models.py \ |
| 643 | --single \ | 643 | --single \ |
| 644 | --suite extended \ | 644 | --suite extended \ |
| 645 | --model nllb-200-distilled-600m \ | 645 | --model nllb-200-distilled-600m \ |