From b85ffc9a672c014483b2d21cf5dfe7f7d0dee516 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 16 Apr 2026 10:55:44 +0800 Subject: [PATCH] Add test env TEI GPU overrides examples --- .env | 8 ++++---- .env.example | 8 ++++++++ .env.test.example | 40 ++++++++++++++++++++++++++++++++++++++++ .gitignore | 2 ++ docs/TEST_ENV_BRANCH_GUIDE.md | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/start_tei_service.sh | 17 ++++++++--------- 6 files changed, 292 insertions(+), 13 deletions(-) create mode 100644 .env.test.example create mode 100644 docs/TEST_ENV_BRANCH_GUIDE.md diff --git a/.env b/.env index 3b4a563..97a1d65 100644 --- a/.env +++ b/.env @@ -68,13 +68,13 @@ DB_DATABASE=saas DB_USERNAME=saas DB_PASSWORD=pcjY7iwX1C6le1oz # ===== test env embedding alignment overrides (2026-04-12 23:xx) ===== -TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:cpu-1.9 TEI_PORT=8080 -TEI_DEVICE=cpu -TEI_DTYPE=float32 +TEI_DEVICE=cuda +TEI_DTYPE=float16 +TEI_IMAGE_REPO=ghcr.m.daocloud.io/huggingface/text-embeddings-inference TEI_MODEL_ID=/data/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181 TEI_HEALTH_TIMEOUT_SEC=240 CNCLIP_DEVICE=cpu CNCLIP_MODEL_NAME=CN-CLIP/ViT-L-14 EMBEDDING_VENV=/home/tw/saas-search/.venv-cnclip -TRANSLATOR_VENV=/home/tw/saas-search/.venv +TRANSLATOR_VENV=/home/tw/saas-search/.venv-translator diff --git a/.env.example b/.env.example index f248b88..39e0ae2 100644 --- a/.env.example +++ b/.env.example @@ -46,6 +46,14 @@ EMBEDDING_BACKEND=tei TEI_BASE_URL=http://127.0.0.1:8080 TEI_DEVICE=cuda TEI_VERSION=1.9 +# Optional: override TEI docker image repository (useful for mirrors). +# TEI_IMAGE_REPO=ghcr.m.daocloud.io/huggingface/text-embeddings-inference +# +# Optional: pin an explicit TEI image tag. 
+# - For Tesla T4 (compute capability 7.5), prefer the `turing-*` image tag, e.g.: +# TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9 +# - For Ampere+ GPUs, prefer the `cuda-*` image tag, e.g.: +# TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:cuda-1.9 TEI_MAX_BATCH_TOKENS=2048 TEI_MAX_CLIENT_BATCH_SIZE=8 TEI_HEALTH_TIMEOUT_SEC=300 diff --git a/.env.test.example b/.env.test.example new file mode 100644 index 0000000..28ab2da --- /dev/null +++ b/.env.test.example @@ -0,0 +1,40 @@ +# Test environment overrides example (no secrets). +# +# Usage: +# cp .env.example .env +# cat .env.test.example >> .env +# +# Notes: +# - This repo is multi-service; values below focus on local test deployment. +# - Keep real credentials (Redis/MySQL/ES passwords) out of VCS. + +# ===== runtime / namespace ===== +RUNTIME_ENV=test +ES_INDEX_NAMESPACE=test_ + +# ===== Elasticsearch (example: local docker on non-default port) ===== +ES_HOST=http://127.0.0.1:19200 +ES_USERNAME= +ES_PASSWORD= +ES_DOCKER_HTTP_PORT=19200 +ES_DOCKER_CONTAINER_NAME=saas-search-es9-test + +# ===== HuggingFace cache ===== +HF_CACHE_DIR=/data/tw/.cache/huggingface + +# ===== TEI (text embeddings inference) ===== +# Service port exposed by container (host:8080 -> container:80) +TEI_PORT=8080 +# Run TEI on GPU (the start script exits if no NVIDIA GPU is available; no CPU fallback) +TEI_DEVICE=cuda +# Use float16 for performance on GPU +TEI_DTYPE=float16 +# IMPORTANT for Tesla T4 (compute capability 7.5): use the `turing-*` image tag +TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9 +# Example pinned model snapshot path (update per-machine) +TEI_MODEL_ID=/data/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181 +TEI_MAX_BATCH_TOKENS=2048 +TEI_MAX_CLIENT_BATCH_SIZE=8 +TEI_HEALTH_TIMEOUT_SEC=240 +TEI_CONTAINER_NAME=saas-search-tei-test + diff --git a/.gitignore b/.gitignore index 36f6814..e602d63 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,5 @@ model_cache/ 
artifacts/search_evaluation/*.sqlite3 artifacts/search_evaluation/batch_reports/ artifacts/search_evaluation/tuning_runs/ + +.tmp/ \ No newline at end of file diff --git a/docs/TEST_ENV_BRANCH_GUIDE.md b/docs/TEST_ENV_BRANCH_GUIDE.md new file mode 100644 index 0000000..7dccc0d --- /dev/null +++ b/docs/TEST_ENV_BRANCH_GUIDE.md @@ -0,0 +1,230 @@ +# 测试环境分支说明:`test/small-gpu-es9` + +本文用于帮助开发同学理解 **测试环境部署分支** `test/small-gpu-es9` 与 `master` 的差异、部署要点,以及未来如何更容易与主干同步。 + +> 重要原则(目标态) +> 该分支应尽量做到:**只引入部署配置差异,不引入业务/代码逻辑差异**。 +> 但历史上该分支在 `master` 分离后曾引入过多处代码改动,见下方“分离以来变动汇总”。 + +--- + +## 1. 分支定位与分离点 + +- **分支名**:`test/small-gpu-es9` +- **远程**:`origin/test/small-gpu-es9` +- **相对 `origin/master` 的 merge-base(分离点)**: + +```bash +git fetch origin --prune +git merge-base HEAD origin/master +``` + +在当前仓库状态下,merge-base 为: + +- `dc22700d04fd8b40978710b6ca6d3d016a66318b` + +> 说明:`origin/master` 已继续向前推进,测试分支需要定期同步主干,否则会长期落后并产生更大合并成本。 + +--- + +## 2. 如何一键查看“从分离点以来”的全部差异(推荐命令) + +### 2.1 仅看测试分支新增的 commits(不含 master 后续前进) + +```bash +BASE=$(git merge-base HEAD origin/master) +git log --oneline ${BASE}..HEAD +``` + +### 2.2 仅看测试分支相对分离点的文件差异(最关键) + +```bash +BASE=$(git merge-base HEAD origin/master) +git diff --name-status ${BASE}..HEAD +``` + +### 2.3 看 master 已经有、但测试分支还没有的 commits(用于评估落后程度) + +```bash +git log --oneline HEAD..origin/master +``` + +--- + +## 3. 
从分离点以来变动汇总(高层) + +### 3.1 分支新增提交(当前共 5 个) + +```text +e8d3bbb Add test env TEI GPU overrides examples +b3ffdc7 Sync legacy frontend entrypoint from 0a440fb +89fa3f3 Sync master portability fixes from f07947a +778c299 测试环境redis配置 +b423bf4 测试环境配置:关闭reranker,其余的都打开,对接本机es docker内的19200 +``` + +### 3.2 文件层面的差异(按类别归纳) + +> 下列列表来自 `git diff --name-status ${BASE}..HEAD` 的结果归类;不在本文粘贴文件内容,避免将本地敏感信息写入文档。 + +#### A) 测试环境配置 / 示例文件 + +- `.env.example`(M):补充 TEI 镜像 tag 选择说明(T4 用 `turing-*`) +- `.env.test.example`(A):新增“测试环境 override 示例”(不含密钥) +- `config/environments/test.yaml`(A):新增测试环境覆盖项(ES host/embedding/rerank 开关) + +#### B) 翻译服务与本地模型相关 + +- `translation/`、`api/translator_app.py`、`requirements_translator_service.txt`、若干测试用例等(M/A) +- 这部分属于**代码逻辑差异**,与“仅配置差异”的目标不一致,未来同步主干时需要重点关注与取舍。 + +#### C) service_ctl / scripts / frontend / indexer 等 + +- `scripts/service_ctl.sh`(M) +- `scripts/*`(M/A) +- `frontend/static/js/app.js`(M) +- `indexer/product_enrich.py`(M) +- `models`(D) + +以上同样属于**代码差异**(历史原因),并非纯部署配置差异。 + +#### D) `.env` 与备份文件(敏感风险提示) + +- `.env`(M)以及 `.env.bak.*` / `.env.backup.*`(A) + +这些文件通常会包含敏感配置(如 DB/Redis 密码、token 等)。**原则上不应提交到版本库**。 +如果这些文件当前已经被 git 跟踪,建议后续做一次清理:将其从版本控制中移除,仅保留 `.env.example` / `.env.*.example` 这类无密钥模板。 + +--- + +## 4. 
测试环境部署要点(与 master 的“常见差异点”) + +### 4.1 Elasticsearch:使用本机 Docker 暴露的 19200 端口 + +测试环境常见配置为: + +- `ES_HOST=http://127.0.0.1:19200` + +对应文件(示例/覆盖): + +- `.env.test.example` +- `config/environments/test.yaml`(`infrastructure.elasticsearch.host`) + +### 4.2 Embedding(6005)后端走 TEI(8080) + +链路: + +- `embedding` 服务:`http://127.0.0.1:6005`(对外) +- TEI 服务:`http://127.0.0.1:8080`(embedding 的 text backend) + +验证命令: + +```bash +curl -sSf http://127.0.0.1:8080/health +curl -sSf http://127.0.0.1:6005/health +curl -sSf -X POST http://127.0.0.1:6005/embed/text \ + -H "Content-Type: application/json" \ + -d '{"inputs":["health check"],"normalize":true}' \ + | python -c 'import sys,json; d=json.load(sys.stdin); print(len(d["embeddings"][0]))' +``` + +预期输出维度:`1024`。 + +### 4.3 TEI GPU:Tesla T4 需要 `turing-*` 镜像 + +在 Tesla T4(compute capability 7.5)上,TEI 若误用 `cuda-*` 镜像可能出现类似错误: + +- `compute cap 75 is not compatible with compile time compute cap 80` + +因此测试环境建议显式 pin: + +- `TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9` +- `TEI_DEVICE=cuda` +- `TEI_DTYPE=float16` + +这些已在 `.env.test.example` 中给出。 + +验证是否真的跑在 GPU: + +```bash +nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader +``` + +应能看到 `text-embeddings-router` 占用显存。 + +### 4.4 Reranker / Fine-rank:测试环境通常关闭 + +测试环境覆盖(见 `config/environments/test.yaml`): + +- `rerank.enabled: false` +- `fine_rank.enabled: false` + +这可减少 GPU/资源占用,并避免未配置时服务启动失败。 + +--- + +## 5. 测试环境部署建议流程(不含密钥) + +1) 生成本地 `.env`(不要提交): + +```bash +cp .env.example .env +cat .env.test.example >> .env +``` + +2) 根据测试机实际情况补齐敏感项(仅本机): + +- `REDIS_PASSWORD` / `DB_PASSWORD` / `ES_PASSWORD` / `DEEPL_AUTH_KEY` 等 + +3) 拉起服务(按需): + +```bash +./scripts/service_ctl.sh up tei embedding backend indexer frontend eval-web +``` + +4) 逐项健康检查: + +```bash +./scripts/service_ctl.sh status tei embedding backend indexer +``` + +--- + +## 6. 
与主干同步策略(建议做法) + +### 6.1 目标态:创建“干净的测试部署分支” + +因为当前 `test/small-gpu-es9` 含有历史代码差异,后续主干同步成本会持续升高。推荐做法: + +1. 从最新 `origin/master` 新建分支(例如 `test/deploy-es9-gpu`)。 +2. 只 cherry-pick 与测试部署相关的**配置类提交**(例如 `.env.*.example`、`config/environments/test.yaml` 这类)。 +3. 严格禁止提交 `.env`、`.env.bak*` 等包含敏感信息/本机路径的文件。 + +### 6.2 若继续使用当前分支:同步主干的基本步骤 + +```bash +git fetch origin --prune +git checkout test/small-gpu-es9 +git merge origin/master +``` + +合并时重点关注冲突/回归风险区域: + +- `translation/`(本地模型/依赖/服务行为) +- `scripts/`、`scripts/service_ctl.sh` +- `frontend/`、`indexer/` + +--- + +## 7. 附:常用对照表(端口与服务) + +| 服务 | 端口 | 说明 | +|------|-----:|------| +| Elasticsearch(测试 docker) | 19200 | `ES_HOST=http://127.0.0.1:19200` | +| TEI(text embeddings backend) | 8080 | `TEI_BASE_URL=http://127.0.0.1:8080` | +| embedding(text) | 6005 | `/embed/text`,后端转发 TEI | +| backend | 6002 | 搜索 API | +| indexer | 6004 | 索引 API | +| frontend | 6003 | 调试 UI | +| eval-web | 6010 | 评估 UI | + diff --git a/scripts/start_tei_service.sh b/scripts/start_tei_service.sh index 666d11e..6bd4839 100755 --- a/scripts/start_tei_service.sh +++ b/scripts/start_tei_service.sh @@ -26,6 +26,7 @@ TEI_MAX_CLIENT_BATCH_SIZE="${TEI_MAX_CLIENT_BATCH_SIZE:-24}" TEI_DTYPE="${TEI_DTYPE:-float16}" HF_CACHE_DIR="${HF_CACHE_DIR:-$HOME/.cache/huggingface}" TEI_HEALTH_TIMEOUT_SEC="${TEI_HEALTH_TIMEOUT_SEC:-300}" +TEI_IMAGE_REPO="${TEI_IMAGE_REPO:-ghcr.m.daocloud.io/huggingface/text-embeddings-inference}" TEI_DEVICE_RAW="${TEI_DEVICE:-cuda}" TEI_DEVICE="$(echo "${TEI_DEVICE_RAW}" | tr '[:upper:]' '[:lower:]')" @@ -40,9 +41,9 @@ detect_gpu_tei_image() { compute_cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)" major="${compute_cap%%.*}" if [[ -n "${major}" && "${major}" -lt 8 ]]; then - echo "ghcr.io/huggingface/text-embeddings-inference:turing-${TEI_VERSION}" + echo "${TEI_IMAGE_REPO}:turing-${TEI_VERSION}" else - echo "ghcr.io/huggingface/text-embeddings-inference:cuda-${TEI_VERSION}" + echo 
"${TEI_IMAGE_REPO}:cuda-${TEI_VERSION}" fi } @@ -51,16 +52,14 @@ if [[ "${TEI_DEVICE}" == "cuda" ]]; then echo "ERROR: TEI_DEVICE=cuda but NVIDIA GPU is not available. No CPU fallback." >&2 exit 1 fi - if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q 'nvidia'; then - echo "ERROR: TEI_DEVICE=cuda but Docker nvidia runtime is not configured." >&2 - echo "Install and configure nvidia-container-toolkit, then restart Docker." >&2 - exit 1 - fi + # Note: modern Docker setups can support `--gpus all` without exposing an explicit "nvidia" runtime + # in `docker info`. We rely on `docker run` failure to surface misconfiguration instead of + # blocking here with a false negative. TEI_IMAGE="${TEI_IMAGE:-$(detect_gpu_tei_image)}" GPU_ARGS=(--gpus all) TEI_MODE="cuda" else - TEI_IMAGE="${TEI_IMAGE:-ghcr.io/huggingface/text-embeddings-inference:${TEI_VERSION}}" + TEI_IMAGE="${TEI_IMAGE:-${TEI_IMAGE_REPO}:${TEI_VERSION}}" GPU_ARGS=() TEI_MODE="cpu" fi @@ -146,7 +145,7 @@ for probe_idx in 1 2; do exit 1 fi # Detect non-finite-like payloads (observed as null/NaN on incompatible CUDA image + GPU). - if echo "${probe_resp}" | rg -qi '(null|nan|inf)'; then + if echo "${probe_resp}" | grep -Eqi '(null|nan|inf)'; then echo "ERROR: TEI probe ${probe_idx} detected invalid embedding values (null/NaN/Inf)." >&2 echo "Response preview: $(echo "${probe_resp}" | head -c 220)" >&2 docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true -- libgit2 0.21.2