Commit b85ffc9a672c014483b2d21cf5dfe7f7d0dee516
1 parent
b3ffdc72
Add test env TEI GPU overrides examples
This branch is intended to differ from master only by deployment configuration for the test machine.
- Add `.env.test.example` as a secrets-free override snippet to be appended onto `.env`.
- Pin TEI to GPU mode (`TEI_DEVICE=cuda`) with `float16` for performance.
- Pin a Tesla T4-compatible TEI image (`text-embeddings-inference:turing-1.9`) to avoid
compute-capability mismatch errors (the T4 is sm_75, while non-Turing images are compiled for sm_80).
- Keep TEI request limits aligned with current service settings (`TEI_MAX_BATCH_TOKENS=2048`,
`TEI_MAX_CLIENT_BATCH_SIZE=8`) and provide an example BGE-M3 snapshot path.
- Extend `.env.example` with guidance on selecting the correct TEI image tag (`turing-*` for T4,
`cuda-*` for Ampere+) and optional mirror repository override.
No credentials are committed; `.env` remains local-only.
Made-with: Cursor
Showing
6 changed files
with
292 additions
and
13 deletions
Show diff stats
| ... | ... | @@ -68,13 +68,13 @@ DB_DATABASE=saas |
| 68 | 68 | DB_USERNAME=saas |
| 69 | 69 | DB_PASSWORD=pcjY7iwX1C6le1oz |
| 70 | 70 | # ===== test env embedding alignment overrides (2026-04-12 23:xx) ===== |
| 71 | -TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:cpu-1.9 | |
| 72 | 71 | TEI_PORT=8080 |
| 73 | -TEI_DEVICE=cpu | |
| 74 | -TEI_DTYPE=float32 | |
| 72 | +TEI_DEVICE=cuda | |
| 73 | +TEI_DTYPE=float16 | |
| 74 | +TEI_IMAGE_REPO=ghcr.m.daocloud.io/huggingface/text-embeddings-inference | |
| 75 | 75 | TEI_MODEL_ID=/data/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181 |
| 76 | 76 | TEI_HEALTH_TIMEOUT_SEC=240 |
| 77 | 77 | CNCLIP_DEVICE=cpu |
| 78 | 78 | CNCLIP_MODEL_NAME=CN-CLIP/ViT-L-14 |
| 79 | 79 | EMBEDDING_VENV=/home/tw/saas-search/.venv-cnclip |
| 80 | -TRANSLATOR_VENV=/home/tw/saas-search/.venv | |
| 80 | +TRANSLATOR_VENV=/home/tw/saas-search/.venv-translator | ... | ... |
.env.example
| ... | ... | @@ -46,6 +46,14 @@ EMBEDDING_BACKEND=tei |
| 46 | 46 | TEI_BASE_URL=http://127.0.0.1:8080 |
| 47 | 47 | TEI_DEVICE=cuda |
| 48 | 48 | TEI_VERSION=1.9 |
| 49 | +# Optional: override TEI docker image repository (useful for mirrors). | |
| 50 | +# TEI_IMAGE_REPO=ghcr.m.daocloud.io/huggingface/text-embeddings-inference | |
| 51 | +# | |
| 52 | +# Optional: pin an explicit TEI image tag. | |
| 53 | +# - For Tesla T4 (compute capability 7.5), prefer the `turing-*` image tag, e.g.: | |
| 54 | +# TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9 | |
| 55 | +# - For Ampere+ GPUs, prefer `cuda-*` image tag, e.g.: | |
| 56 | +# TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:cuda-1.9 | |
| 49 | 57 | TEI_MAX_BATCH_TOKENS=2048 |
| 50 | 58 | TEI_MAX_CLIENT_BATCH_SIZE=8 |
| 51 | 59 | TEI_HEALTH_TIMEOUT_SEC=300 | ... | ... |
| ... | ... | @@ -0,0 +1,40 @@ |
| 1 | +# Test environment overrides example (no secrets). | |
| 2 | +# | |
| 3 | +# Usage: | |
| 4 | +# cp .env.example .env | |
| 5 | +# cat .env.test.example >> .env | |
| 6 | +# | |
| 7 | +# Notes: | |
| 8 | +# - This repo is multi-service; values below focus on local test deployment. | |
| 9 | +# - Keep real credentials (Redis/MySQL/ES passwords) out of VCS. | |
| 10 | + | |
| 11 | +# ===== runtime / namespace ===== | |
| 12 | +RUNTIME_ENV=test | |
| 13 | +ES_INDEX_NAMESPACE=test_ | |
| 14 | + | |
| 15 | +# ===== Elasticsearch (example: local docker on non-default port) ===== | |
| 16 | +ES_HOST=http://127.0.0.1:19200 | |
| 17 | +ES_USERNAME= | |
| 18 | +ES_PASSWORD= | |
| 19 | +ES_DOCKER_HTTP_PORT=19200 | |
| 20 | +ES_DOCKER_CONTAINER_NAME=saas-search-es9-test | |
| 21 | + | |
| 22 | +# ===== HuggingFace cache ===== | |
| 23 | +HF_CACHE_DIR=/data/tw/.cache/huggingface | |
| 24 | + | |
| 25 | +# ===== TEI (text embeddings inference) ===== | |
| 26 | +# Service port exposed by container (host:8080 -> container:80) | |
| 27 | +TEI_PORT=8080 | |
| 28 | +# Use GPU when available | |
| 29 | +TEI_DEVICE=cuda | |
| 30 | +# Use float16 for performance on GPU | |
| 31 | +TEI_DTYPE=float16 | |
| 32 | +# IMPORTANT for Tesla T4 (compute capability 7.5): use turing image tag | |
| 33 | +TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9 | |
| 34 | +# Example pinned model snapshot path (update per-machine) | |
| 35 | +TEI_MODEL_ID=/data/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181 | |
| 36 | +TEI_MAX_BATCH_TOKENS=2048 | |
| 37 | +TEI_MAX_CLIENT_BATCH_SIZE=8 | |
| 38 | +TEI_HEALTH_TIMEOUT_SEC=240 | |
| 39 | +TEI_CONTAINER_NAME=saas-search-tei-test | |
| 40 | + | ... | ... |
.gitignore
| ... | ... | @@ -0,0 +1,230 @@ |
| 1 | +# 测试环境分支说明:`test/small-gpu-es9` | |
| 2 | + | |
| 3 | +本文用于帮助开发同学理解 **测试环境部署分支** `test/small-gpu-es9` 与 `master` 的差异、部署要点,以及未来如何更容易与主干同步。 | |
| 4 | + | |
| 5 | +> 重要原则(目标态) | |
| 6 | +> 该分支应尽量做到:**只引入部署配置差异,不引入业务/代码逻辑差异**。 | |
| 7 | +> 但历史上该分支在 `master` 分离后曾引入过多处代码改动,见下方“分离以来变动汇总”。 | |
| 8 | + | |
| 9 | +--- | |
| 10 | + | |
| 11 | +## 1. 分支定位与分离点 | |
| 12 | + | |
| 13 | +- **分支名**:`test/small-gpu-es9` | |
| 14 | +- **远程**:`origin/test/small-gpu-es9` | |
| 15 | +- **相对 `origin/master` 的 merge-base(分离点)**: | |
| 16 | + | |
| 17 | +```bash | |
| 18 | +git fetch origin --prune | |
| 19 | +git merge-base HEAD origin/master | |
| 20 | +``` | |
| 21 | + | |
| 22 | +在当前仓库状态下,merge-base 为: | |
| 23 | + | |
| 24 | +- `dc22700d04fd8b40978710b6ca6d3d016a66318b` | |
| 25 | + | |
| 26 | +> 说明:`origin/master` 已继续向前推进,测试分支需要定期同步主干,否则会长期落后并产生更大合并成本。 | |
| 27 | + | |
| 28 | +--- | |
| 29 | + | |
| 30 | +## 2. 如何一键查看“从分离点以来”的全部差异(推荐命令) | |
| 31 | + | |
| 32 | +### 2.1 仅看测试分支新增的 commits(不含 master 后续前进) | |
| 33 | + | |
| 34 | +```bash | |
| 35 | +BASE=$(git merge-base HEAD origin/master) | |
| 36 | +git log --oneline ${BASE}..HEAD | |
| 37 | +``` | |
| 38 | + | |
| 39 | +### 2.2 仅看测试分支相对分离点的文件差异(最关键) | |
| 40 | + | |
| 41 | +```bash | |
| 42 | +BASE=$(git merge-base HEAD origin/master) | |
| 43 | +git diff --name-status ${BASE}..HEAD | |
| 44 | +``` | |
| 45 | + | |
| 46 | +### 2.3 看 master 已经有、但测试分支还没有的 commits(用于评估落后程度) | |
| 47 | + | |
| 48 | +```bash | |
| 49 | +git log --oneline HEAD..origin/master | |
| 50 | +``` | |
| 51 | + | |
| 52 | +--- | |
| 53 | + | |
| 54 | +## 3. 从分离点以来变动汇总(高层) | |
| 55 | + | |
| 56 | +### 3.1 分支新增提交(当前共 5 个) | |
| 57 | + | |
| 58 | +```text | |
| 59 | +e8d3bbb Add test env TEI GPU overrides examples | |
| 60 | +b3ffdc7 Sync legacy frontend entrypoint from 0a440fb | |
| 61 | +89fa3f3 Sync master portability fixes from f07947a | |
| 62 | +778c299 测试环境redis配置 | |
| 63 | +b423bf4 测试环境配置:关闭reranker,其余的都打开,对接本机es docker内的19200 | |
| 64 | +``` | |
| 65 | + | |
| 66 | +### 3.2 文件层面的差异(按类别归纳) | |
| 67 | + | |
| 68 | +> 下列列表来自 `git diff --name-status ${BASE}..HEAD` 的结果归类;不在本文粘贴文件内容,避免将本地敏感信息写入文档。 | |
| 69 | + | |
| 70 | +#### A) 测试环境配置 / 示例文件 | |
| 71 | + | |
| 72 | +- `.env.example`(M):补充 TEI 镜像 tag 选择说明(T4 用 `turing-*`) | |
| 73 | +- `.env.test.example`(A):新增“测试环境 override 示例”(不含密钥) | |
| 74 | +- `config/environments/test.yaml`(A):新增测试环境覆盖项(ES host/embedding/rerank 开关) | |
| 75 | + | |
| 76 | +#### B) 翻译服务与本地模型相关 | |
| 77 | + | |
| 78 | +- `translation/`、`api/translator_app.py`、`requirements_translator_service.txt`、若干测试用例等(M/A) | |
| 79 | +- 这部分属于**代码逻辑差异**,与“仅配置差异”的目标不一致,未来同步主干时需要重点关注与取舍。 | |
| 80 | + | |
| 81 | +#### C) service_ctl / scripts / frontend / indexer 等 | |
| 82 | + | |
| 83 | +- `scripts/service_ctl.sh`(M) | |
| 84 | +- `scripts/*`(M/A) | |
| 85 | +- `frontend/static/js/app.js`(M) | |
| 86 | +- `indexer/product_enrich.py`(M) | |
| 87 | +- `models`(D) | |
| 88 | + | |
| 89 | +以上同样属于**代码差异**(历史原因),并非纯部署配置差异。 | |
| 90 | + | |
| 91 | +#### D) `.env` 与备份文件(敏感风险提示) | |
| 92 | + | |
| 93 | +- `.env`(M)以及 `.env.bak.*` / `.env.backup.*`(A) | |
| 94 | + | |
| 95 | +这些文件通常会包含敏感配置(如 DB/Redis 密码、token 等)。**原则上不应提交到版本库**。 | |
| 96 | +如果这些文件当前已经被 git 跟踪,建议后续做一次清理:将其从版本控制中移除,仅保留 `.env.example` / `.env.*.example` 这类无密钥模板。 | |
| 97 | + | |
| 98 | +--- | |
| 99 | + | |
| 100 | +## 4. 测试环境部署要点(与 master 的“常见差异点”) | |
| 101 | + | |
| 102 | +### 4.1 Elasticsearch:使用本机 Docker 暴露的 19200 端口 | |
| 103 | + | |
| 104 | +测试环境常见配置为: | |
| 105 | + | |
| 106 | +- `ES_HOST=http://127.0.0.1:19200` | |
| 107 | + | |
| 108 | +对应文件(示例/覆盖): | |
| 109 | + | |
| 110 | +- `.env.test.example` | |
| 111 | +- `config/environments/test.yaml`(`infrastructure.elasticsearch.host`) | |
| 112 | + | |
| 113 | +### 4.2 Embedding(6005)后端走 TEI(8080) | |
| 114 | + | |
| 115 | +链路: | |
| 116 | + | |
| 117 | +- `embedding` 服务:`http://127.0.0.1:6005`(对外) | |
| 118 | +- TEI 服务:`http://127.0.0.1:8080`(embedding 的 text backend) | |
| 119 | + | |
| 120 | +验证命令: | |
| 121 | + | |
| 122 | +```bash | |
| 123 | +curl -sSf http://127.0.0.1:8080/health | |
| 124 | +curl -sSf http://127.0.0.1:6005/health | |
| 125 | +curl -sSf -X POST http://127.0.0.1:6005/embed/text \ | |
| 126 | + -H "Content-Type: application/json" \ | |
| 127 | + -d '{"inputs":["health check"],"normalize":true}' \ | |
| 128 | + | python -c 'import sys,json; d=json.load(sys.stdin); print(len(d["embeddings"][0]))' | |
| 129 | +``` | |
| 130 | + | |
| 131 | +预期输出维度:`1024`。 | |
| 132 | + | |
| 133 | +### 4.3 TEI GPU:Tesla T4 需要 `turing-*` 镜像 | |
| 134 | + | |
| 135 | +在 Tesla T4(compute capability 7.5)上,TEI 若误用 `cuda-*` 镜像可能出现类似错误: | |
| 136 | + | |
| 137 | +- `compute cap 75 is not compatible with compile time compute cap 80` | |
| 138 | + | |
| 139 | +因此测试环境建议显式 pin: | |
| 140 | + | |
| 141 | +- `TEI_IMAGE=ghcr.m.daocloud.io/huggingface/text-embeddings-inference:turing-1.9` | |
| 142 | +- `TEI_DEVICE=cuda` | |
| 143 | +- `TEI_DTYPE=float16` | |
| 144 | + | |
| 145 | +这些已在 `.env.test.example` 中给出。 | |
| 146 | + | |
| 147 | +验证是否真的跑在 GPU: | |
| 148 | + | |
| 149 | +```bash | |
| 150 | +nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader | |
| 151 | +``` | |
| 152 | + | |
| 153 | +应能看到 `text-embeddings-router` 占用显存。 | |
| 154 | + | |
| 155 | +### 4.4 Reranker / Fine-rank:测试环境通常关闭 | |
| 156 | + | |
| 157 | +测试环境覆盖(见 `config/environments/test.yaml`): | |
| 158 | + | |
| 159 | +- `rerank.enabled: false` | |
| 160 | +- `fine_rank.enabled: false` | |
| 161 | + | |
| 162 | +这可减少 GPU/资源占用,并避免未配置时服务启动失败。 | |
| 163 | + | |
| 164 | +--- | |
| 165 | + | |
| 166 | +## 5. 测试环境部署建议流程(不含密钥) | |
| 167 | + | |
| 168 | +1) 生成本地 `.env`(不要提交): | |
| 169 | + | |
| 170 | +```bash | |
| 171 | +cp .env.example .env | |
| 172 | +cat .env.test.example >> .env | |
| 173 | +``` | |
| 174 | + | |
| 175 | +2) 根据测试机实际情况补齐敏感项(仅本机): | |
| 176 | + | |
| 177 | +- `REDIS_PASSWORD` / `DB_PASSWORD` / `ES_PASSWORD` / `DEEPL_AUTH_KEY` 等 | |
| 178 | + | |
| 179 | +3) 拉起服务(按需): | |
| 180 | + | |
| 181 | +```bash | |
| 182 | +./scripts/service_ctl.sh up tei embedding backend indexer frontend eval-web | |
| 183 | +``` | |
| 184 | + | |
| 185 | +4) 逐项健康检查: | |
| 186 | + | |
| 187 | +```bash | |
| 188 | +./scripts/service_ctl.sh status tei embedding backend indexer | |
| 189 | +``` | |
| 190 | + | |
| 191 | +--- | |
| 192 | + | |
| 193 | +## 6. 与主干同步策略(建议做法) | |
| 194 | + | |
| 195 | +### 6.1 目标态:创建“干净的测试部署分支” | |
| 196 | + | |
| 197 | +因为当前 `test/small-gpu-es9` 含有历史代码差异,后续主干同步成本会持续升高。推荐做法: | |
| 198 | + | |
| 199 | +1. 从最新 `origin/master` 新建分支(例如 `test/deploy-es9-gpu`)。 | |
| 200 | +2. 只 cherry-pick 与测试部署相关的**配置类提交**(例如 `.env.*.example`、`config/environments/test.yaml` 这类)。 | |
| 201 | +3. 严格禁止提交 `.env`、`.env.bak*` 等包含敏感信息/本机路径的文件。 | |
| 202 | + | |
| 203 | +### 6.2 若继续使用当前分支:同步主干的基本步骤 | |
| 204 | + | |
| 205 | +```bash | |
| 206 | +git fetch origin --prune | |
| 207 | +git checkout test/small-gpu-es9 | |
| 208 | +git merge origin/master | |
| 209 | +``` | |
| 210 | + | |
| 211 | +合并时重点关注冲突/回归风险区域: | |
| 212 | + | |
| 213 | +- `translation/`(本地模型/依赖/服务行为) | |
| 214 | +- `scripts/`、`scripts/service_ctl.sh` | |
| 215 | +- `frontend/`、`indexer/` | |
| 216 | + | |
| 217 | +--- | |
| 218 | + | |
| 219 | +## 7. 附:常用对照表(端口与服务) | |
| 220 | + | |
| 221 | +| 服务 | 端口 | 说明 | | |
| 222 | +|------|-----:|------| | |
| 223 | +| Elasticsearch(测试 docker) | 19200 | `ES_HOST=http://127.0.0.1:19200` | | |
| 224 | +| TEI(text embeddings backend) | 8080 | `TEI_BASE_URL=http://127.0.0.1:8080` | | |
| 225 | +| embedding(text) | 6005 | `/embed/text`,后端转发 TEI | | |
| 226 | +| backend | 6002 | 搜索 API | | |
| 227 | +| indexer | 6004 | 索引 API | | |
| 228 | +| frontend | 6003 | 调试 UI | | |
| 229 | +| eval-web | 6010 | 评估 UI | | |
| 230 | + | ... | ... |
scripts/start_tei_service.sh
| ... | ... | @@ -26,6 +26,7 @@ TEI_MAX_CLIENT_BATCH_SIZE="${TEI_MAX_CLIENT_BATCH_SIZE:-24}" |
| 26 | 26 | TEI_DTYPE="${TEI_DTYPE:-float16}" |
| 27 | 27 | HF_CACHE_DIR="${HF_CACHE_DIR:-$HOME/.cache/huggingface}" |
| 28 | 28 | TEI_HEALTH_TIMEOUT_SEC="${TEI_HEALTH_TIMEOUT_SEC:-300}" |
| 29 | +TEI_IMAGE_REPO="${TEI_IMAGE_REPO:-ghcr.m.daocloud.io/huggingface/text-embeddings-inference}" | |
| 29 | 30 | |
| 30 | 31 | TEI_DEVICE_RAW="${TEI_DEVICE:-cuda}" |
| 31 | 32 | TEI_DEVICE="$(echo "${TEI_DEVICE_RAW}" | tr '[:upper:]' '[:lower:]')" |
| ... | ... | @@ -40,9 +41,9 @@ detect_gpu_tei_image() { |
| 40 | 41 | compute_cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)" |
| 41 | 42 | major="${compute_cap%%.*}" |
| 42 | 43 | if [[ -n "${major}" && "${major}" -lt 8 ]]; then |
| 43 | - echo "ghcr.io/huggingface/text-embeddings-inference:turing-${TEI_VERSION}" | |
| 44 | + echo "${TEI_IMAGE_REPO}:turing-${TEI_VERSION}" | |
| 44 | 45 | else |
| 45 | - echo "ghcr.io/huggingface/text-embeddings-inference:cuda-${TEI_VERSION}" | |
| 46 | + echo "${TEI_IMAGE_REPO}:cuda-${TEI_VERSION}" | |
| 46 | 47 | fi |
| 47 | 48 | } |
| 48 | 49 | |
| ... | ... | @@ -51,16 +52,14 @@ if [[ "${TEI_DEVICE}" == "cuda" ]]; then |
| 51 | 52 | echo "ERROR: TEI_DEVICE=cuda but NVIDIA GPU is not available. No CPU fallback." >&2 |
| 52 | 53 | exit 1 |
| 53 | 54 | fi |
| 54 | - if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q 'nvidia'; then | |
| 55 | - echo "ERROR: TEI_DEVICE=cuda but Docker nvidia runtime is not configured." >&2 | |
| 56 | - echo "Install and configure nvidia-container-toolkit, then restart Docker." >&2 | |
| 57 | - exit 1 | |
| 58 | - fi | |
| 55 | + # Note: modern Docker setups can support `--gpus all` without exposing an explicit "nvidia" runtime | |
| 56 | + # in `docker info`. We rely on `docker run` failure to surface misconfiguration instead of | |
| 57 | + # blocking here with a false negative. | |
| 59 | 58 | TEI_IMAGE="${TEI_IMAGE:-$(detect_gpu_tei_image)}" |
| 60 | 59 | GPU_ARGS=(--gpus all) |
| 61 | 60 | TEI_MODE="cuda" |
| 62 | 61 | else |
| 63 | - TEI_IMAGE="${TEI_IMAGE:-ghcr.io/huggingface/text-embeddings-inference:${TEI_VERSION}}" | |
| 62 | + TEI_IMAGE="${TEI_IMAGE:-${TEI_IMAGE_REPO}:${TEI_VERSION}}" | |
| 64 | 63 | GPU_ARGS=() |
| 65 | 64 | TEI_MODE="cpu" |
| 66 | 65 | fi |
| ... | ... | @@ -146,7 +145,7 @@ for probe_idx in 1 2; do |
| 146 | 145 | exit 1 |
| 147 | 146 | fi |
| 148 | 147 | # Detect non-finite-like payloads (observed as null/NaN on incompatible CUDA image + GPU). |
| 149 | - if echo "${probe_resp}" | rg -qi '(null|nan|inf)'; then | |
| 148 | + if echo "${probe_resp}" | grep -Eqi '(null|nan|inf)'; then | |
| 150 | 149 | echo "ERROR: TEI probe ${probe_idx} detected invalid embedding values (null/NaN/Inf)." >&2 |
| 151 | 150 | echo "Response preview: $(echo "${probe_resp}" | head -c 220)" >&2 |
| 152 | 151 | docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true | ... | ... |