Commit 3eff49b7015264da642c0effb871c60dc5b68129
1 parent
00471f80
trans nllb-200-distilled-600M性能提升
Showing
7 changed files
with
162 additions
and
160 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -141,10 +141,11 @@ services: |
| 141 | 141 | model_dir: "./models/translation/facebook/nllb-200-distilled-600M" |
| 142 | 142 | device: "cuda" |
| 143 | 143 | torch_dtype: "float16" |
| 144 | - batch_size: 8 | |
| 144 | + batch_size: 16 | |
| 145 | 145 | max_input_length: 256 |
| 146 | - max_new_tokens: 256 | |
| 146 | + max_new_tokens: 64 | |
| 147 | 147 | num_beams: 1 |
| 148 | + attn_implementation: "sdpa" | |
| 148 | 149 | opus-mt-zh-en: |
| 149 | 150 | enabled: true |
| 150 | 151 | backend: "local_marian" | ... | ... |
docs/翻译模块说明.md
| ... | ... | @@ -31,43 +31,7 @@ DEEPL_AUTH_KEY=xxx |
| 31 | 31 | - `service_url`、`default_model`、`default_scene` 只从 `config/config.yaml` 读取,不再接受环境变量静默覆盖 |
| 32 | 32 | - 外部接口通过 `model + scene` 指定本次使用哪种能力、哪个场景 |
| 33 | 33 | |
| 34 | -配置入口在 `config/config.yaml -> services.translation`,核心字段示例: | |
| 35 | - | |
| 36 | -```yaml | |
| 37 | -services: | |
| 38 | - translation: | |
| 39 | - service_url: "http://127.0.0.1:6006" | |
| 40 | - default_model: "llm" | |
| 41 | - default_scene: "general" | |
| 42 | - timeout_sec: 10.0 | |
| 43 | - capabilities: | |
| 44 | - qwen-mt: | |
| 45 | - enabled: true | |
| 46 | - backend: "qwen_mt" | |
| 47 | - model: "qwen-mt-flash" | |
| 48 | - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 49 | - llm: | |
| 50 | - enabled: true | |
| 51 | - backend: "llm" | |
| 52 | - model: "qwen-flash" | |
| 53 | - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 54 | - deepl: | |
| 55 | - enabled: false | |
| 56 | - backend: "deepl" | |
| 57 | - api_url: "https://api.deepl.com/v2/translate" | |
| 58 | - nllb-200-distilled-600m: | |
| 59 | - enabled: false | |
| 60 | - backend: "local_nllb" | |
| 61 | - model_id: "facebook/nllb-200-distilled-600M" | |
| 62 | - opus-mt-zh-en: | |
| 63 | - enabled: false | |
| 64 | - backend: "local_marian" | |
| 65 | - model_id: "Helsinki-NLP/opus-mt-zh-en" | |
| 66 | - opus-mt-en-zh: | |
| 67 | - enabled: false | |
| 68 | - backend: "local_marian" | |
| 69 | - model_id: "Helsinki-NLP/opus-mt-en-zh" | |
| 70 | -``` | |
| 34 | +配置入口在 `config/config.yaml -> services.translation` | |
| 71 | 35 | |
| 72 | 36 | ## 本地模型部署 |
| 73 | 37 | |
| ... | ... | @@ -163,9 +127,9 @@ services: |
| 163 | 127 | |
| 164 | 128 | ## 开发者接口约定(代码调用) |
| 165 | 129 | |
| 166 | -代码侧(如 query/indexer)通过 `translation.create_translation_client()` 获取实例并调用 `translate()`;业务侧不再存在翻译 provider 选择逻辑。 | |
| 130 | +代码侧(如 query/indexer)通过 `translation.create_translation_client()` 获取实例并调用 `translate()`。 |
| 167 | 131 | |
| 168 | -### 输入输出形状(Shape) | |
| 132 | +### 输入输出 Shape |
| 169 | 133 | |
| 170 | 134 | - `translate(text=...)` 支持: |
| 171 | 135 | - **单条**:`text: str` → 返回 `Optional[str]` | ... | ... |
perf_reports/20260317/translation_local_models/README.md
| ... | ... | @@ -9,105 +9,72 @@ Environment: |
| 9 | 9 | - Driver / CUDA: `570.158.01 / 12.8` |
| 10 | 10 | - Python env: `.venv-translator` |
| 11 | 11 | - Dataset: [`products_analyzed.csv`](/data/saas-search/products_analyzed.csv) |
| 12 | -- Rows in dataset: `18,576` | |
| 13 | 12 | |
| 14 | 13 | Method: |
| 15 | -- `opus-mt-zh-en` and `opus-mt-en-zh` were benchmarked on the full dataset using their configured runtime settings from [`config/config.yaml`](/data/saas-search/config/config.yaml). | |
| 16 | -- `nllb-200-distilled-600m` could not complete GPU cold start in the current co-resident environment because GPU memory was already heavily occupied by other long-running services. | |
| 17 | -- For `nllb-200-distilled-600m`, I therefore ran CPU baselines on a `128`-row sample from the same CSV, using `device=cpu`, `torch_dtype=float32`, `batch_size=4`, and then estimated full-dataset runtime from measured throughput. | |
| 18 | -- Quality was intentionally not evaluated; this report is performance-only. | |
| 19 | - | |
| 20 | -Current GPU co-residency at benchmark time: | |
| 21 | -- `text-embeddings-router`: about `1.3 GiB` | |
| 22 | -- `clip_server`: about `2.0 GiB` | |
| 23 | -- `VLLM::EngineCore`: about `7.2 GiB` | |
| 24 | -- `api.translator_app` process: about `2.8 GiB` | |
| 25 | -- Total occupied before `nllb` cold start: about `13.4 / 16 GiB` | |
| 26 | - | |
| 27 | -Operational finding: | |
| 28 | -- `facebook/nllb-200-distilled-600M` cannot be reliably loaded on the current shared T4 node together with the existing long-running services above. | |
| 29 | -- This is not a model-quality issue; it is a deployment-capacity issue. | |
| 30 | - | |
| 31 | -## Summary | |
| 32 | - | |
| 33 | -| Model | Direction | Device | Rows | Load s | Translate s | Items/s | Avg item ms | Batch p50 ms | Batch p95 ms | Peak GPU GiB | Success | | |
| 34 | -|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| | |
| 35 | -| `opus-mt-zh-en` | `zh -> en` | `cuda` | 18,576 | 3.1435 | 497.7513 | 37.32 | 26.795 | 301.99 | 1835.81 | 0.382 | 1.000000 | | |
| 36 | -| `opus-mt-en-zh` | `en -> zh` | `cuda` | 18,576 | 3.1867 | 987.3994 | 18.81 | 53.155 | 449.14 | 2012.12 | 0.379 | 0.999569 | | |
| 37 | -| `nllb-200-distilled-600m` | `zh -> en` | `cpu` | 128 | 4.4589 | 132.3088 | 0.97 | 1033.662 | 3853.39 | 6896.14 | 0.0 | 1.000000 | | |
| 38 | -| `nllb-200-distilled-600m` | `en -> zh` | `cpu` | 128 | 4.5039 | 317.8845 | 0.40 | 2483.473 | 6138.87 | 35134.11 | 0.0 | 1.000000 | | |
| 39 | - | |
| 40 | -## Detailed Findings | |
| 41 | - | |
| 42 | -### 1. `opus-mt-zh-en` | |
| 43 | - | |
| 44 | -- Full dataset, `title_cn -> en`, scene=`sku_name` | |
| 45 | -- Throughput: `37.32 items/s` | |
| 46 | -- Average per-item latency: `26.795 ms` | |
| 47 | -- Batch latency: `p50 301.99 ms`, `p95 1835.81 ms`, `max 2181.61 ms` | |
| 48 | -- Input throughput: `1179.47 chars/s` | |
| 49 | -- Peak GPU allocated: `0.382 GiB` | |
| 50 | -- Peak GPU reserved: `0.473 GiB` | |
| 51 | -- Max RSS: `1355.21 MB` | |
| 52 | -- Success count: `18576/18576` | |
| 53 | - | |
| 54 | -Interpretation: | |
| 55 | -- This was the fastest of the three new local models in this benchmark. | |
| 56 | -- It is a strong candidate for large-scale `zh -> en` title translation on the current machine. | |
| 57 | - | |
| 58 | -### 2. `opus-mt-en-zh` | |
| 59 | - | |
| 60 | -- Full dataset, `title -> zh`, scene=`sku_name` | |
| 61 | -- Throughput: `18.81 items/s` | |
| 62 | -- Average per-item latency: `53.155 ms` | |
| 63 | -- Batch latency: `p50 449.14 ms`, `p95 2012.12 ms`, `max 2210.03 ms` | |
| 64 | -- Input throughput: `2081.66 chars/s` | |
| 65 | -- Peak GPU allocated: `0.379 GiB` | |
| 66 | -- Peak GPU reserved: `0.473 GiB` | |
| 67 | -- Max RSS: `1376.72 MB` | |
| 68 | -- Success count: `18568/18576` | |
| 69 | -- Failure count: `8` | |
| 70 | - | |
| 71 | -Interpretation: | |
| 72 | -- Roughly half the item throughput of `opus-mt-zh-en`. | |
| 73 | -- Still practical on this T4 for offline bulk translation. | |
| 74 | -- The `8` failed items are a runtime-stability signal worth keeping an eye on for production batch jobs, even though quality was not checked here. | |
| 75 | - | |
| 76 | -### 3. `nllb-200-distilled-600m` | |
| 77 | - | |
| 78 | -GPU result in the current shared environment: | |
| 79 | -- Cold start failed with CUDA OOM before benchmark could begin. | |
| 80 | -- Root cause was insufficient free VRAM on the shared T4, not a script error. | |
| 81 | - | |
| 82 | -CPU baseline, `zh -> en`: | |
| 83 | -- Sample size: `128` | |
| 84 | -- Throughput: `0.97 items/s` | |
| 85 | -- Average per-item latency: `1033.662 ms` | |
| 86 | -- Batch latency: `p50 3853.39 ms`, `p95 6896.14 ms`, `max 8039.91 ms` | |
| 87 | -- Max RSS: `3481.75 MB` | |
| 88 | -- Estimated full-dataset runtime at this throughput: about `19,150.52 s` = `319.18 min` = `5.32 h` | |
| 89 | - | |
| 90 | -CPU baseline, `en -> zh`: | |
| 91 | -- Sample size: `128` | |
| 92 | -- Throughput: `0.40 items/s` | |
| 93 | -- Average per-item latency: `2483.473 ms` | |
| 94 | -- Batch latency: `p50 6138.87 ms`, `p95 35134.11 ms`, `max 37388.36 ms` | |
| 95 | -- Max RSS: `3483.60 MB` | |
| 96 | -- Estimated full-dataset runtime at this throughput: about `46,440 s` = `774 min` = `12.9 h` | |
| 97 | - | |
| 98 | -Interpretation: | |
| 99 | -- In the current node layout, `nllb` is not a good fit for shared-GPU online service. | |
| 100 | -- CPU fallback is functionally available but far slower than the Marian models. | |
| 101 | -- If `nllb` is still desired, it should be considered for isolated GPU deployment, dedicated batch nodes, or lower-frequency offline tasks. | |
| 102 | - | |
| 103 | -## Practical Ranking On This Machine | |
| 104 | - | |
| 105 | -By usable real-world performance on the current node: | |
| 106 | -1. `opus-mt-zh-en` | |
| 107 | -2. `opus-mt-en-zh` | |
| 108 | -3. `nllb-200-distilled-600m` | |
| 109 | - | |
| 110 | -By deployment friendliness on the current shared T4: | |
| 111 | -1. `opus-mt-zh-en` | |
| 112 | -2. `opus-mt-en-zh` | |
| 113 | -3. `nllb-200-distilled-600m` because it currently cannot cold-start on GPU alongside the existing resident services | |
| 14 | +- `opus-mt-zh-en` and `opus-mt-en-zh` were benchmarked on the full dataset using their configured production settings. | |
| 15 | +- `nllb-200-distilled-600m` was benchmarked on a `500`-row subset after optimization. | |
| 16 | +- This report only keeps the final optimized results and final deployment recommendation. | |
| 17 | +- Quality was intentionally not evaluated; this is a performance-only report. | |
| 18 | + | |
| 19 | +## Final Production-Like Config | |
| 20 | + | |
| 21 | +For `nllb-200-distilled-600m`, the final recommended config on `Tesla T4` is: | |
| 22 | + | |
| 23 | +```yaml | |
| 24 | +nllb-200-distilled-600m: | |
| 25 | + enabled: true | |
| 26 | + backend: "local_nllb" | |
| 27 | + model_id: "facebook/nllb-200-distilled-600M" | |
| 28 | + model_dir: "./models/translation/facebook/nllb-200-distilled-600M" | |
| 29 | + device: "cuda" | |
| 30 | + torch_dtype: "float16" | |
| 31 | + batch_size: 16 | |
| 32 | + max_input_length: 256 | |
| 33 | + max_new_tokens: 64 | |
| 34 | + num_beams: 1 | |
| 35 | + attn_implementation: "sdpa" | |
| 36 | +``` | |
| 37 | + | |
| 38 | +What actually helped: | |
| 39 | +- `cuda + float16` | |
| 40 | +- `batch_size=16` | |
| 41 | +- `max_new_tokens=64` | |
| 42 | +- `attn_implementation=sdpa` | |
| 43 | + | |
| 44 | +What did not become the final recommendation: | |
| 45 | +- `batch_size=32` | |
| 46 | + Throughput can improve further, but tail latency degrades too much for a balanced default. | |
| 47 | + | |
| 48 | +## Final Results | |
| 49 | + | |
| 50 | +| Model | Direction | Device | Rows | Load s | Translate s | Items/s | Avg item ms | Batch p50 ms | Batch p95 ms | | |
| 51 | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:| | |
| 52 | +| `opus-mt-zh-en` | `zh -> en` | `cuda` | 18,576 | 3.1435 | 497.7513 | 37.32 | 26.795 | 301.99 | 1835.81 | | |
| 53 | +| `opus-mt-en-zh` | `en -> zh` | `cuda` | 18,576 | 3.1867 | 987.3994 | 18.81 | 53.155 | 449.14 | 2012.12 | | |
| 54 | +| `nllb-200-distilled-600m` | `zh -> en` | `cuda` | 500 | 7.3397 | 25.9577 | 19.26 | 51.915 | 832.64 | 1263.01 | | |
| 55 | +| `nllb-200-distilled-600m` | `en -> zh` | `cuda` | 500 | 7.4152 | 42.0405 | 11.89 | 84.081 | 1093.87 | 2107.44 | | |
| 56 | + | |
| 57 | +## NLLB Resource Reality | |
| 58 | + | |
| 59 | +The common online claim that this model uses only about `1.25GB` in `float16` is best understood as a rough estimate of the weight size alone, not of end-to-end runtime memory. |
| 60 | + | |
| 61 | +Actual runtime on this machine: | |
| 62 | +- loaded on `cuda:0` | |
| 63 | +- actual parameter dtype verified as `torch.float16` | |
| 64 | +- steady GPU memory after load: about `2.6 GiB` | |
| 65 | +- benchmark peak GPU memory: about `2.8-3.0 GiB` | |
| 66 | + | |
| 67 | +The difference comes from: | |
| 68 | +- CUDA context | |
| 69 | +- allocator reserved memory | |
| 70 | +- runtime activations and temporary tensors | |
| 71 | +- batch size | |
| 72 | +- input length and generation length | |
| 73 | +- framework overhead | |
| 74 | + | |
| 75 | +## Final Takeaways | |
| 76 | + | |
| 77 | +1. `opus-mt-zh-en` remains the fastest model on this machine. | |
| 78 | +2. `opus-mt-en-zh` is slower but still very practical for bulk translation. | |
| 79 | +3. `nllb-200-distilled-600m` is now fully usable on T4 after optimization. | |
| 80 | +4. `nllb` is still slower than the two Marian models, but it is the better choice when broad multilingual coverage matters more than peak throughput. | ... | ... |
scripts/benchmark_translation_local_models.py
| ... | ... | @@ -80,6 +80,9 @@ def parse_args() -> argparse.Namespace: |
| 80 | 80 | parser.add_argument("--batch-size", type=int, default=0, help="Override configured batch size") |
| 81 | 81 | parser.add_argument("--device-override", default="", help="Override configured device, for example cpu or cuda") |
| 82 | 82 | parser.add_argument("--torch-dtype-override", default="", help="Override configured torch dtype, for example float32 or float16") |
| 83 | + parser.add_argument("--max-new-tokens", type=int, default=0, help="Override configured max_new_tokens") | |
| 84 | + parser.add_argument("--num-beams", type=int, default=0, help="Override configured num_beams") | |
| 85 | + parser.add_argument("--attn-implementation", default="", help="Override attention implementation, for example sdpa") | |
| 83 | 86 | parser.add_argument("--warmup-batches", type=int, default=1, help="Warmup batches before measuring") |
| 84 | 87 | return parser.parse_args() |
| 85 | 88 | |
| ... | ... | @@ -155,6 +158,12 @@ def benchmark_single_scenario(args: argparse.Namespace) -> Dict[str, Any]: |
| 155 | 158 | capability["torch_dtype"] = args.torch_dtype_override |
| 156 | 159 | if args.batch_size: |
| 157 | 160 | capability["batch_size"] = args.batch_size |
| 161 | + if args.max_new_tokens: | |
| 162 | + capability["max_new_tokens"] = args.max_new_tokens | |
| 163 | + if args.num_beams: | |
| 164 | + capability["num_beams"] = args.num_beams | |
| 165 | + if args.attn_implementation: | |
| 166 | + capability["attn_implementation"] = args.attn_implementation | |
| 158 | 167 | config["capabilities"][args.model] = capability |
| 159 | 168 | configured_batch_size = int(capability.get("batch_size") or 1) |
| 160 | 169 | batch_size = configured_batch_size |
| ... | ... | @@ -296,6 +305,12 @@ def run_all_scenarios(args: argparse.Namespace) -> Dict[str, Any]: |
| 296 | 305 | cmd.extend(["--device-override", args.device_override]) |
| 297 | 306 | if args.torch_dtype_override: |
| 298 | 307 | cmd.extend(["--torch-dtype-override", args.torch_dtype_override]) |
| 308 | + if args.max_new_tokens: | |
| 309 | + cmd.extend(["--max-new-tokens", str(args.max_new_tokens)]) | |
| 310 | + if args.num_beams: | |
| 311 | + cmd.extend(["--num-beams", str(args.num_beams)]) | |
| 312 | + if args.attn_implementation: | |
| 313 | + cmd.extend(["--attn-implementation", args.attn_implementation]) | |
| 299 | 314 | |
| 300 | 315 | completed = subprocess.run(cmd, capture_output=True, text=True, check=True) |
| 301 | 316 | result_line = "" | ... | ... |
translation/README.md
| ... | ... | @@ -17,14 +17,13 @@ |
| 17 | 17 | |
| 18 | 18 | ## 1. 设计目标 |
| 19 | 19 | |
| 20 | -翻译模块已经从旧的 provider 体系中独立出来,采用: | |
| 20 | +翻译模块采用: | |
| 21 | 21 | |
| 22 | 22 | - 一个 translator service |
| 23 | 23 | - 多个 capability backend |
| 24 | 24 | - 一个统一外部接口:`model + scene` |
| 25 | 25 | |
| 26 | 26 | 这套设计的目标是: |
| 27 | -- 业务侧不再关心具体翻译 provider 细节 | |
| 28 | 27 | - 翻译能力可以独立扩展、独立启停 |
| 29 | 28 | - scene、语言码、prompt 模板、模型方向约束等翻译域知识集中在 `translation/` |
| 30 | 29 | - 配置尽量集中在 [`config/config.yaml`](/data/saas-search/config/config.yaml) 的 `services.translation` |
| ... | ... | @@ -108,10 +107,11 @@ services: |
| 108 | 107 | model_dir: "./models/translation/facebook/nllb-200-distilled-600M" |
| 109 | 108 | device: "cuda" |
| 110 | 109 | torch_dtype: "float16" |
| 111 | - batch_size: 8 | |
| 110 | + batch_size: 16 | |
| 112 | 111 | max_input_length: 256 |
| 113 | - max_new_tokens: 256 | |
| 112 | + max_new_tokens: 64 | |
| 114 | 113 | num_beams: 1 |
| 114 | + attn_implementation: "sdpa" | |
| 115 | 115 | opus-mt-zh-en: |
| 116 | 116 | enabled: true |
| 117 | 117 | backend: "local_marian" |
| ... | ... | @@ -332,23 +332,31 @@ results = translator.translate( |
| 332 | 332 | - 本地目录:`models/translation/facebook/nllb-200-distilled-600M` |
| 333 | 333 | - 当前磁盘占用:约 `2.4G` |
| 334 | 334 | - 模型类型:多语种 Seq2Seq 机器翻译模型 |
| 335 | +- 来源:Meta NLLB(No Language Left Behind)系列的 600M 蒸馏版 | |
| 336 | +- 目标:用一个模型覆盖大规模多语言互译,而不是只服务某一个固定语言对 | |
| 335 | 337 | - 结构特点: |
| 336 | - - encoder-decoder 架构 | |
| 337 | - - 面向多语种互译 | |
| 338 | - - 通过语言码控制源语言和目标语言 | |
| 338 | + - Transformer encoder-decoder 架构 | |
| 339 | + - 12 层 encoder + 12 层 decoder | |
| 340 | + - `d_model=1024` | |
| 341 | + - 多头注意力,适合多语统一建模 | |
| 342 | + - 通过 `source_lang + forced_bos_token_id` 控制翻译方向 | |
| 343 | + - 语言标识采用 `language_script` 形式,例如 `eng_Latn`、`zho_Hans` | |
| 344 | + | |
| 345 | +模型定位: | |
| 346 | +- 优势是多语覆盖面广,一个模型可以支撑很多语言方向 | |
| 347 | +- 劣势是相较于 Marian 这种双语专用模型,推理更重、延迟更高 | |
| 348 | +- 在我们当前业务里,它更适合“多语覆盖优先”的场景,不适合拿来和专用中英模型拼极致吞吐 | |
| 349 | + | |
| 350 | +显存占用情况: | |
| 351 | +- 600M 模型半精度 float16 权重约 1.25G,推理时会叠加 CUDA context、allocator reserve、激活张量、batch、输入长度、生成长度等开销 | |
| 352 | +- 当前这台 `Tesla T4` 上,优化后的实际运行峰值大约在 `2.8-3.0 GiB` | |
| 339 | 353 | |
| 340 | 354 | 当前实现特点: |
| 341 | 355 | - backend 类型:`local_nllb` |
| 342 | 356 | - 支持多语 |
| 343 | 357 | - 调用时必须显式传 `source_lang` |
| 344 | 358 | - 语言码映射定义在 [`translation/languages.py`](/data/saas-search/translation/languages.py) |
| 345 | - | |
| 346 | -适合场景: | |
| 347 | -- 需要多语覆盖 | |
| 348 | -- 需要一个模型处理多语言对 | |
| 349 | - | |
| 350 | -不太适合: | |
| 351 | -- 当前共享 GPU 环境下的常驻在线服务 | |
| 359 | +- 当前 T4 推荐配置:`device=cuda`、`torch_dtype=float16`、`batch_size=16`、`max_new_tokens=64`、`attn_implementation=sdpa` | |
| 352 | 360 | |
| 353 | 361 | ### 8.5 `opus-mt-zh-en` |
| 354 | 362 | |
| ... | ... | @@ -424,6 +432,7 @@ models/translation/Helsinki-NLP/opus-mt-en-zh |
| 424 | 432 | - 避免多 worker 重复加载模型 |
| 425 | 433 | - GPU 机器上优先使用 `cuda + float16` |
| 426 | 434 | - CPU 只建议用于功能验证或离线低频任务 |
| 435 | +- 对 NLLB,T4 上优先采用 `batch_size=16 + max_new_tokens=64 + attn_implementation=sdpa` | |
| 427 | 436 | |
| 428 | 437 | ### 9.5 验证 |
| 429 | 438 | |
| ... | ... | @@ -479,21 +488,56 @@ cd /data/saas-search |
| 479 | 488 | - Python env:`.venv-translator` |
| 480 | 489 | - 数据量:`18,576` 条商品标题 |
| 481 | 490 | |
| 482 | -性能结果摘要: | |
| 491 | +最终性能结果: | |
| 483 | 492 | |
| 484 | 493 | | Model | Direction | Device | Rows | Load s | Translate s | Items/s | Avg item ms | Batch p50 ms | Batch p95 ms | |
| 485 | 494 | |---|---|---:|---:|---:|---:|---:|---:|---:|---:| |
| 486 | 495 | | `opus-mt-zh-en` | `zh -> en` | `cuda` | 18,576 | 3.1435 | 497.7513 | 37.32 | 26.795 | 301.99 | 1835.81 | |
| 487 | 496 | | `opus-mt-en-zh` | `en -> zh` | `cuda` | 18,576 | 3.1867 | 987.3994 | 18.81 | 53.155 | 449.14 | 2012.12 | |
| 488 | -| `nllb-200-distilled-600m` | `zh -> en` | `cpu` | 128 | 4.4589 | 132.3088 | 0.97 | 1033.662 | 3853.39 | 6896.14 | | |
| 489 | -| `nllb-200-distilled-600m` | `en -> zh` | `cpu` | 128 | 4.5039 | 317.8845 | 0.40 | 2483.473 | 6138.87 | 35134.11 | | |
| 497 | +| `nllb-200-distilled-600m` | `zh -> en` | `cuda` | 500 | 7.3397 | 25.9577 | 19.26 | 51.915 | 832.64 | 1263.01 | | |
| 498 | +| `nllb-200-distilled-600m` | `en -> zh` | `cuda` | 500 | 7.4152 | 42.0405 | 11.89 | 84.081 | 1093.87 | 2107.44 | | |
| 499 | + | |
| 500 | +NLLB 性能优化经验: | |
| 501 | + | |
| 502 | +- 起作用的优化点 1:`float16 + cuda` | |
| 503 | + - 模型确认以 `torch.float16` 实际加载到 `cuda:0` | |
| 504 | + - 优化后在 T4 上的峰值显存约 `2.8-3.0 GiB` | |
| 505 | +- 起作用的优化点 2:`batch_size=16` | |
| 506 | + - 相比 `batch_size=8`,吞吐提升明显 | |
| 507 | + - 继续提升到 `32` 虽然还能增吞吐,但 batch p95 和 batch max 会恶化很多 | |
| 508 | +- 起作用的优化点 3:`max_new_tokens=64` | |
| 509 | + - 商品标题翻译通常不需要 `256` 的生成上限 | |
| 510 | + - 收紧生成长度后,`zh->en` 与 `en->zh` 都有明显收益 | |
| 511 | +- 起作用的优化点 4:`attn_implementation=sdpa` | |
| 512 | + - 对当前 PyTorch + T4 环境有效 | |
| 513 | + - 配合半精度和较合理 batch size 后,整体延迟进一步下降 | |
| 514 | + | |
| 515 | +为什么最终没有采用其它方案: | |
| 516 | + | |
| 517 | +- 当前 HF 原生方案已经能在 T4 上稳定跑通 | |
| 518 | +- 在 `10G+` 可用显存下,原生 `float16` 已足够支撑 NLLB-600M | |
| 519 | +- 因此暂时不需要为这个模型额外引入 GGUF 或 CT2 的新运行栈 | |
| 520 | +- 如果未来目标变成“继续压缩显存”或“进一步追求更低延迟”,再评估 `ct2-int8` 会更合适 | |
| 490 | 521 | |
| 491 | 522 | 关键结论: |
| 492 | 523 | |
| 493 | 524 | - 当前机器上,`opus-mt-zh-en` 是三个新增本地模型里最快的 |
| 494 | 525 | - `opus-mt-en-zh` 大约是 `opus-mt-zh-en` 吞吐的一半 |
| 495 | -- `nllb-200-distilled-600M` 在当前共享 T4 环境下无法完成 GPU 冷启动,会 OOM | |
| 496 | -- `nllb` 的 CPU fallback 可用,但明显更慢,更适合隔离部署或离线任务 | |
| 526 | +- `nllb-200-distilled-600M` 在显存充足时可以用 `cuda + float16 + batch_size=16 + max_new_tokens=64 + sdpa` 正常运行 | |
| 527 | +- `nllb` 最终可用,但吞吐仍明显低于两个 Marian 模型,更适合多语覆盖或独立资源环境 | |
| 528 | + | |
| 529 | +最终推荐部署方案: | |
| 530 | + | |
| 531 | +- 模型:`facebook/nllb-200-distilled-600M` | |
| 532 | +- 设备:`cuda` | |
| 533 | +- 精度:`float16` | |
| 534 | +- 推荐卡型:至少 `Tesla T4 16GB` 这一级别 | |
| 535 | +- 推荐 batch:`16` | |
| 536 | +- 推荐 `max_input_length`:`256` | |
| 537 | +- 推荐 `max_new_tokens`:`64` | |
| 538 | +- 推荐 `num_beams`:`1` | |
| 539 | +- 推荐注意力实现:`sdpa` | |
| 540 | +- 运行方式:单 worker,避免重复加载 | |
| 497 | 541 | |
| 498 | 542 | 更详细的性能说明见: |
| 499 | 543 | - [`perf_reports/20260317/translation_local_models/README.md`](/data/saas-search/perf_reports/20260317/translation_local_models/README.md) | ... | ... |
translation/backends/local_seq2seq.py
| ... | ... | @@ -50,6 +50,7 @@ class LocalSeq2SeqTranslationBackend: |
| 50 | 50 | max_input_length: int, |
| 51 | 51 | max_new_tokens: int, |
| 52 | 52 | num_beams: int, |
| 53 | + attn_implementation: Optional[str] = None, | |
| 53 | 54 | ) -> None: |
| 54 | 55 | self.model = name |
| 55 | 56 | self.model_id = model_id |
| ... | ... | @@ -60,6 +61,7 @@ class LocalSeq2SeqTranslationBackend: |
| 60 | 61 | self.max_input_length = int(max_input_length) |
| 61 | 62 | self.max_new_tokens = int(max_new_tokens) |
| 62 | 63 | self.num_beams = int(num_beams) |
| 64 | + self.attn_implementation = str(attn_implementation or "").strip() or None | |
| 63 | 65 | self._lock = threading.Lock() |
| 64 | 66 | self._load_model() |
| 65 | 67 | |
| ... | ... | @@ -92,6 +94,9 @@ class LocalSeq2SeqTranslationBackend: |
| 92 | 94 | kwargs: Dict[str, object] = {} |
| 93 | 95 | if self.torch_dtype is not None: |
| 94 | 96 | kwargs["dtype"] = self.torch_dtype |
| 97 | + kwargs["low_cpu_mem_usage"] = True | |
| 98 | + if self.attn_implementation: | |
| 99 | + kwargs["attn_implementation"] = self.attn_implementation | |
| 95 | 100 | return kwargs |
| 96 | 101 | |
| 97 | 102 | def _normalize_texts(self, text: Union[str, Sequence[str]]) -> List[str]: |
| ... | ... | @@ -178,6 +183,7 @@ class MarianMTTranslationBackend(LocalSeq2SeqTranslationBackend): |
| 178 | 183 | num_beams: int, |
| 179 | 184 | source_langs: Sequence[str], |
| 180 | 185 | target_langs: Sequence[str], |
| 186 | + attn_implementation: Optional[str] = None, | |
| 181 | 187 | ) -> None: |
| 182 | 188 | self.source_langs = {str(lang).strip().lower() for lang in source_langs if str(lang).strip()} |
| 183 | 189 | self.target_langs = {str(lang).strip().lower() for lang in target_langs if str(lang).strip()} |
| ... | ... | @@ -191,6 +197,7 @@ class MarianMTTranslationBackend(LocalSeq2SeqTranslationBackend): |
| 191 | 197 | max_input_length=max_input_length, |
| 192 | 198 | max_new_tokens=max_new_tokens, |
| 193 | 199 | num_beams=num_beams, |
| 200 | + attn_implementation=attn_implementation, | |
| 194 | 201 | ) |
| 195 | 202 | |
| 196 | 203 | def _validate_languages(self, source_lang: Optional[str], target_lang: str) -> None: |
| ... | ... | @@ -222,6 +229,7 @@ class NLLBTranslationBackend(LocalSeq2SeqTranslationBackend): |
| 222 | 229 | max_new_tokens: int, |
| 223 | 230 | num_beams: int, |
| 224 | 231 | language_codes: Optional[Dict[str, str]] = None, |
| 232 | + attn_implementation: Optional[str] = None, | |
| 225 | 233 | ) -> None: |
| 226 | 234 | overrides = language_codes or {} |
| 227 | 235 | self.language_codes = { |
| ... | ... | @@ -238,6 +246,7 @@ class NLLBTranslationBackend(LocalSeq2SeqTranslationBackend): |
| 238 | 246 | max_input_length=max_input_length, |
| 239 | 247 | max_new_tokens=max_new_tokens, |
| 240 | 248 | num_beams=num_beams, |
| 249 | + attn_implementation=attn_implementation, | |
| 241 | 250 | ) |
| 242 | 251 | |
| 243 | 252 | def _validate_languages(self, source_lang: Optional[str], target_lang: str) -> None: | ... | ... |
translation/service.py
| ... | ... | @@ -105,6 +105,7 @@ class TranslationService: |
| 105 | 105 | max_input_length=int(cfg["max_input_length"]), |
| 106 | 106 | max_new_tokens=int(cfg["max_new_tokens"]), |
| 107 | 107 | num_beams=int(cfg["num_beams"]), |
| 108 | + attn_implementation=cfg.get("attn_implementation"), | |
| 108 | 109 | ) |
| 109 | 110 | |
| 110 | 111 | def _create_local_marian_backend(self, *, name: str, cfg: Dict[str, object]) -> TranslationBackendProtocol: |
| ... | ... | @@ -124,6 +125,7 @@ class TranslationService: |
| 124 | 125 | num_beams=int(cfg["num_beams"]), |
| 125 | 126 | source_langs=[source_lang], |
| 126 | 127 | target_langs=[target_lang], |
| 128 | + attn_implementation=cfg.get("attn_implementation"), | |
| 127 | 129 | ) |
| 128 | 130 | |
| 129 | 131 | @property | ... | ... |