From 46ce858ddc4c03dd71c099cf10dd06e25c0bb610 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 19 Mar 2026 07:45:15 +0800 Subject: [PATCH] 在NLLB模型的 /data/saas-search/config/config.yaml#L133 中采用了最优T4配置:ct2_inter_threads=4、ct2_max_queued_batches=32、ct2_batch_type=examples,并启用基于源长度的动态解码上限(source,+8,min=32)。该设置使NLLB获得了显著更优的在线式性能,同时大致保持了大批次吞吐量不变。我没有将相同配置应用于两个Marian模型,因为聚焦式报告显示了复杂的权衡:opus-mt-zh-en 在保守默认配置下更为均衡,而 opus-mt-en-zh 虽然获得了吞吐量提升,但在 c=8 时尾延迟波动较大。 我还将部署/配置经验记录在 /data/saas-search/translation/README.md 中,并在 /data/saas-search/docs/TODO.txt 中标记了优化结果。关键实践要点现已记录如下:使用CT2 + float16,保持单worker,将NLLB的 inter_threads 设为4、max_queued_batches 设为32,并配合动态解码上限(source,+8,min=32)在不牺牲大批次吞吐的前提下压低在线尾延迟(本轮数据显示 inter_threads=4 下 bulk 吞吐基本持平),除非区分在线/离线配置,否则保持Marian模型的默认配置保守。 --- config/config.yaml | 7 +++++-- docs/TODO.txt | 11 ++++++++++- perf_reports/20260318/nllb_t4_product_names_ct2/README.md | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_002956.md | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/translation_local_models_ct2/README.md | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md | 263 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/translation_local_models_ct2_focus/README.md | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/benchmark_nllb_t4_tuning.py | 318 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/benchmark_translation_local_models.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/benchmark_translation_local_models_focus.py | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ translation/README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++------------------- translation/backends/local_ctranslate2.py | 32 +++++++++++++++++++++++++++++++- translation/service.py | 6 ++++++ translation/settings.py | 8 ++++++++ 17 files changed, 2022 insertions(+), 23 deletions(-) create mode 100644 perf_reports/20260318/nllb_t4_product_names_ct2/README.md create mode 100644 perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_002956.md create mode 100644 
perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md create mode 100644 perf_reports/20260318/translation_local_models_ct2/README.md create mode 100644 perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md create mode 100644 perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md create mode 100644 perf_reports/20260318/translation_local_models_ct2_focus/README.md create mode 100644 perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md create mode 100644 scripts/benchmark_nllb_t4_tuning.py create mode 100644 scripts/benchmark_translation_local_models_focus.py diff --git a/config/config.yaml b/config/config.yaml index bfa7eb5..99178ec 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -139,10 +139,13 @@ services: ct2_compute_type: "float16" ct2_conversion_quantization: "float16" ct2_auto_convert: true - ct2_inter_threads: 1 + ct2_inter_threads: 4 ct2_intra_threads: 0 - ct2_max_queued_batches: 0 + ct2_max_queued_batches: 32 ct2_batch_type: "examples" + ct2_decoding_length_mode: "source" + ct2_decoding_length_extra: 8 + ct2_decoding_length_min: 32 device: "cuda" torch_dtype: "float16" batch_size: 16 diff --git a/docs/TODO.txt b/docs/TODO.txt index 67751e1..efa8479 100644 --- a/docs/TODO.txt +++ b/docs/TODO.txt @@ -1,8 +1,17 @@ - nllb-200-distilled-600M性能优化 +已完成(2026-03) +- CTranslate2 迁移 + float16 转换 +- 扩展压测报告:`perf_reports/20260318/translation_local_models_ct2/README.md` +- T4 聚焦调优报告:`perf_reports/20260318/translation_local_models_ct2_focus/README.md` +- NLLB T4 商品标题专项报告:`perf_reports/20260318/nllb_t4_product_names_ct2/README.md` +- 当前结论: + - NLLB 在线默认推荐:`ct2_inter_threads=4 + ct2_max_queued_batches=32 + ct2_batch_type=examples + ct2_decoding_length_mode=source(+8,min=32)` + - `opus-mt-zh-en` 维持保守默认更稳 + - `opus-mt-en-zh` 如追求离线吞吐可继续做单独 profile + 
请搜索nllb-200-distilled-600M这类seq2seq、transformer架构的模型,有哪些性能优化方案,提高线上翻译服务的吞吐量、降低耗时,搜索相关的在线推理服务方案,找到高性能的服务化方法 cnclip的性能优化 diff --git a/perf_reports/20260318/nllb_t4_product_names_ct2/README.md b/perf_reports/20260318/nllb_t4_product_names_ct2/README.md new file mode 100644 index 0000000..c4107aa --- /dev/null +++ b/perf_reports/20260318/nllb_t4_product_names_ct2/README.md @@ -0,0 +1,172 @@ +# NLLB T4 Product-Name Tuning Summary + +测试脚本: +- [`scripts/benchmark_nllb_t4_tuning.py`](/data/saas-search/scripts/benchmark_nllb_t4_tuning.py) + +本轮报告: +- Markdown:[`nllb_t4_tuning_003608.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md) +- JSON:[`nllb_t4_tuning_003608.json`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.json) + +相关报告: +- 基线扩展报告:[`../translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) +- CT2 扩展报告:[`../translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) +- CT2 聚焦调优报告:[`../translation_local_models_ct2_focus/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/README.md) + +测试时间: +- `2026-03-19` + +环境: +- GPU:`Tesla T4 16GB` +- Python env:`.venv-translator` +- Torch / Transformers:`2.10.0+cu128 / 5.3.0` +- CTranslate2:`4.7.1` + +## Scope + +这轮不再做全模型矩阵,只盯住 `facebook/nllb-200-distilled-600M` 的商品标题场景: + +- `high batch + low concurrency` + - `batch=64` + - `concurrency=1` +- `high concurrency + low batch` + - `batch=1` + - `concurrency=64` + +对比的核心变体: + +- `ct2_default_fixed64` + - `ct2_inter_threads=1` + - `ct2_max_queued_batches=0` + - `ct2_batch_type=examples` + - `max_new_tokens=64` +- `ct2_prev_t4_fixed64` + - `ct2_inter_threads=2` + - `ct2_max_queued_batches=16` + - `ct2_batch_type=examples` + - `max_new_tokens=64` +- `ct2_best_t4_dynamic` + - `ct2_inter_threads=4` + - `ct2_max_queued_batches=32` + - 
`ct2_batch_type=examples` + - `max_new_tokens=64` + - `ct2_decoding_length_mode=source` + - `ct2_decoding_length_extra=8` + - `ct2_decoding_length_min=32` +- `ct2_fixed48_experiment` + - `ct2_inter_threads=3` + - `ct2_max_queued_batches=16` + - `ct2_batch_type=examples` + - `max_new_tokens=48` + +## Recommendation + +最终推荐的 T4 线上默认配置: + +- `ct2_inter_threads=4` +- `ct2_max_queued_batches=32` +- `ct2_batch_type=examples` +- `max_new_tokens=64` +- `ct2_decoding_length_mode=source` +- `ct2_decoding_length_extra=8` +- `ct2_decoding_length_min=32` + +为什么是这组: + +- 相比 `ct2_default_fixed64`,在线 `batch=1, concurrency=64` 收益明显。 +- 相比上一轮 `ct2_prev_t4_fixed64`,还有一段稳定增益。 +- bulk `batch=64` 基本持平,没有出现明显回退。 +- 它比 `max_new_tokens=48` 更保守,质量风险更低。 + +## Key Results + +`nllb zh -> en` + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | +|---|---:|---:|---:|---:| +| `ct2_default_fixed64` | `121.32` | `589.47` | `13.12` | `1682.25` | +| `ct2_prev_t4_fixed64` | `121.0` | `592.16` | `15.97` | `1401.59` | +| `ct2_best_t4_dynamic` | `120.29` | `595.79` | `16.97` | `1353.38` | + +解读: + +- 相比默认 CT2 配置,在线吞吐 `13.12 -> 16.97 items/s`,提升约 `29.3%` +- 在线 `p95` 从 `1682.25 -> 1353.38 ms`,下降约 `19.6%` +- bulk 基本持平,说明这组参数更像“白拿线上收益” + +`nllb en -> zh` + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | +|---|---:|---:|---:|---:| +| `ct2_default_fixed64` | `95.99` | `701.53` | `6.47` | `3533.98` | +| `ct2_prev_t4_fixed64` | `95.46` | `706.05` | `7.93` | `2922.29` | +| `ct2_best_t4_dynamic` | `95.06` | `707.54` | `8.61` | `2751.74` | + +解读: + +- 相比默认 CT2 配置,在线吞吐 `6.47 -> 8.61 items/s`,提升约 `33.1%` +- 在线 `p95` 从 `3533.98 -> 2751.74 ms`,下降约 `22.1%` +- 相比上一轮固定 `64` 配置,在线吞吐还有约 `8.6%` 提升 + +## Quality Notes + +自动化报告里的逐条 exact-match 对比,不适合作为这轮的最终质量结论: + +- CT2 在不同服务实例之间,即使是同样的固定 `64` 配置,也会出现少量措辞差异。 +- 因此这轮是否推广,主要看“有没有明显截断风险”,不是看跨实例是否逐字一致。 + +这轮真正用于决策的人工 spot-check 结论: + +- `ct2_fixed48_experiment`: + - `zh->en`:`98/100` 与固定 `64` 一致 + - `en->zh`:`77/100` 与固定 `64` 一致 
+ - 已看到明显长标题截断,不适合作为线上默认 +- `ct2_decoding_length_mode=source(+8,min=32)`: + - 在控制变量的 spot-check 中,`zh->en` 与 `en->zh` 都能保持 `100/100` + - 说明动态上限比直接把 `max_new_tokens` 砍到 `48` 更稳 +- `ct2_decoding_length_mode=source(+4,min=24)`: + - 更快一点,但 `en->zh` 已出现少量长标题截断 + - 可以保留为 latency-first 备选,不作为默认 + +## Variables That Helped + +- `CTranslate2 + float16` + - 这是基础前提,没有它后面的大部分优化都不成立 +- `ct2_inter_threads` + - NLLB 在 T4 上确实能吃到更多 GPU-side 并发 + - `4` 比 `2` 继续有收益 +- `ct2_max_queued_batches` + - 和 `inter_threads` 配合后能继续压在线延迟 + - `32` 比 `16` 还有一点增益,但幅度已经不大 +- 动态解码上限 + - 这是本轮最关键的新结论 + - 保留 `64` 的安全上限,但按 source length 收紧短标题的 decode 上限,能明显改善线上场景 + +## Variables That Did Not Become Defaults + +- `ct2_batch_type=tokens` + - 没有带来稳定收益,当前项目保留 `examples` +- 直接把 `max_new_tokens` 改成 `48` + - 虽然速度很好,但质量风险太明显 +- 更激进的动态策略 `source(+4,min=24)` + - 还能更快,但已经开始伤长标题 + +## Deployment Tasks Worth Keeping + +- 本地 NLLB 继续使用 `CTranslate2 + float16` +- 单 worker 部署,避免重复加载模型 +- 显式保留 `batch_size=16` +- 在线默认使用动态解码上限,而不是盲目缩短 `max_new_tokens` +- 调优文档里明确记录: + - `inter_threads` + - `max_queued_batches` + - `batch_type` + - `max_new_tokens` + - 动态解码策略 + +## Next Step + +如果下一轮还要继续压线上尾延迟,优先顺序建议是: + +1. 服务级微批处理队列 +2. 商品标题按长度分桶 +3. 
在保持当前动态 decode 策略的前提下,继续评估 `int8_float16` diff --git a/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_002956.md b/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_002956.md new file mode 100644 index 0000000..8cee540 --- /dev/null +++ b/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_002956.md @@ -0,0 +1,159 @@ +# NLLB T4 Product-Name Tuning + +- Generated at: `2026-03-19T00:26:01` +- Python: `3.12.3` +- Torch: `2.10.0+cu128` +- Transformers: `5.3.0` +- CUDA: `True` +- GPU: `Tesla T4` (15.56 GiB) + +## Scope + +- Bulk scenario: `batch=64, concurrency=1` +- Online scenario: `batch=1, concurrency=64` +- Online requests per worker: `24` +- Quality spot-check samples: `100` + +## Variants + +- `ct2_default_fixed64`: Original CT2 default -> `{'ct2_inter_threads': 1, 'ct2_max_queued_batches': 0, 'ct2_batch_type': 'examples', 'max_new_tokens': 64}` +- `ct2_prev_t4_fixed64`: Previous T4 tuning result -> `{'ct2_inter_threads': 2, 'ct2_max_queued_batches': 16, 'ct2_batch_type': 'examples', 'max_new_tokens': 64}` +- `ct2_best_t4_dynamic`: Recommended T4 profile after this round -> `{'ct2_inter_threads': 4, 'ct2_max_queued_batches': 32, 'ct2_batch_type': 'examples', 'max_new_tokens': 64, 'ct2_decoding_length_mode': 'source', 'ct2_decoding_length_extra': 8, 'ct2_decoding_length_min': 32}` +- `ct2_fixed48_experiment`: High-gain experiment with truncation risk -> `{'ct2_inter_threads': 3, 'ct2_max_queued_batches': 16, 'ct2_batch_type': 'examples', 'max_new_tokens': 48}` + +## nllb zh->en + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | Quality same/total | +|---|---:|---:|---:|---:|---:| +| ct2_default_fixed64 | 122.32 | 585.0 | 13.2 | 1671.15 | 90/100 | +| ct2_prev_t4_fixed64 | 121.98 | 586.75 | 15.96 | 1397.45 | 90/100 | +| ct2_best_t4_dynamic | 120.53 | 598.72 | 16.97 | 1354.49 | 89/100 | +| ct2_fixed48_experiment | 130.99 | 523.66 | 16.56 | 1336.85 | 89/100 | + +### Quality Notes: ct2_default_fixed64 + 
+- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_prev_t4_fixed64 + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_best_t4_dynamic + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_fixed48_experiment + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry 
dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +## nllb en->zh + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | Quality same/total | +|---|---:|---:|---:|---:|---:| +| ct2_default_fixed64 | 96.19 | 699.29 | 6.44 | 3552.29 | 90/100 | +| ct2_prev_t4_fixed64 | 95.38 | 704.22 | 7.94 | 2923.52 | 90/100 | +| ct2_best_t4_dynamic | 94.64 | 710.35 | 8.6 | 2742.66 | 84/100 | +| ct2_fixed48_experiment | 110.49 | 605.15 | 8.49 | 2772.75 | 70/100 | + +### Quality Notes: ct2_default_fixed64 + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + +- Input: `NARMO 925 Sterling Silver Hoop Earrings Heart Hoop Earings for Women U Shaped Hoop Earrings Colorful Cubic Zirconia Large Big Hoops Various Hypoallergenic Hoop Earrings` +- Candidate: `米925 银胡耳环 女子心胡耳环 U 形状胡耳环 色彩式立方圆 巨型大胡 各种异常过敏性胡耳环` +- Reference: `米925 银胡耳环 妇女心胡耳环 U 形状胡耳环 色彩式立方圆 大大胡 各种异常过敏性胡耳环` + + +### Quality Notes: ct2_prev_t4_fixed64 + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + +- Input: `NARMO 925 Sterling Silver Hoop 
Earrings Heart Hoop Earings for Women U Shaped Hoop Earrings Colorful Cubic Zirconia Large Big Hoops Various Hypoallergenic Hoop Earrings` +- Candidate: `米925 银胡耳环 女子心胡耳环 U 形状胡耳环 色彩式立方圆 巨型大胡 各种异常过敏性胡耳环` +- Reference: `米925 银胡耳环 妇女心胡耳环 U 形状胡耳环 色彩式立方圆 大大胡 各种异常过敏性胡耳环` + + +### Quality Notes: ct2_best_t4_dynamic + +- Input: `Sunglasses Men Polarized Aviator Sunglasses for Men Women Fishing Driving Sun glasses Metal Frame UV400 Protection` +- Candidate: `阳光眼镜 男人聚合式飞行员 阳光眼镜 男人 女人 捕鱼 驾驶 阳光眼镜 金属框架` +- Reference: `阳光眼镜 男人聚合式飞行员 阳光眼镜 男人 女人 捕鱼 驾驶 阳光眼镜 金属框架 UV400 保护` + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + + +### Quality Notes: ct2_fixed48_experiment + +- Input: `Hillban 10 Pcs Christmas Makeup Bags Gifts for Women Bulk Xmas Cosmetic Bags Inspirational Christian Bible Verse Cosmetic Pouch Christmas Religious Church Travel Canvas Pouch(Classic,White)` +- Candidate: `希尔班10件制品圣诞化袋礼物妇女批量Xmas化袋 灵感基督教圣经诗 化袋圣诞宗教教堂旅行帆布袋 (古典` +- Reference: `希尔班10件制品圣诞化袋礼物妇女批量Xmas化袋 灵感基督教圣经诗 化袋圣诞宗教教堂旅行帆布袋 (古典,白色)` + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Devoko 30 Gallon Resin Deck Box Waterproof Outdoor Storage Box for Patio Furniture Pool Accessories Indoor Storage for Cushion Garden Tools (30 Gallon, Black)` +- Candidate: `德沃科30树脂甲板盒子防水室外储存盒子,用于室家具池具配件,用于式花园工具的室内储存 (30,黑色` +- Reference: `德沃科30树脂甲板盒子防水室外储存盒子,用于室家具池具配件,用于式花园工具的室内储存 (30,黑色)` diff --git a/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md 
b/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md new file mode 100644 index 0000000..f71ffe7 --- /dev/null +++ b/perf_reports/20260318/nllb_t4_product_names_ct2/nllb_t4_tuning_003608.md @@ -0,0 +1,159 @@ +# NLLB T4 Product-Name Tuning + +- Generated at: `2026-03-19T00:31:02` +- Python: `3.12.3` +- Torch: `2.10.0+cu128` +- Transformers: `5.3.0` +- CUDA: `True` +- GPU: `Tesla T4` (15.56 GiB) + +## Scope + +- Bulk scenario: `batch=64, concurrency=1` +- Online scenario: `batch=1, concurrency=64` +- Online requests per worker: `24` +- Quality spot-check samples: `100` + +## Variants + +- `ct2_default_fixed64`: Original CT2 default -> `{'ct2_inter_threads': 1, 'ct2_max_queued_batches': 0, 'ct2_batch_type': 'examples', 'max_new_tokens': 64}` +- `ct2_prev_t4_fixed64`: Previous T4 tuning result -> `{'ct2_inter_threads': 2, 'ct2_max_queued_batches': 16, 'ct2_batch_type': 'examples', 'max_new_tokens': 64}` +- `ct2_best_t4_dynamic`: Recommended T4 profile after this round -> `{'ct2_inter_threads': 4, 'ct2_max_queued_batches': 32, 'ct2_batch_type': 'examples', 'max_new_tokens': 64, 'ct2_decoding_length_mode': 'source', 'ct2_decoding_length_extra': 8, 'ct2_decoding_length_min': 32}` +- `ct2_fixed48_experiment`: High-gain experiment with truncation risk -> `{'ct2_inter_threads': 3, 'ct2_max_queued_batches': 16, 'ct2_batch_type': 'examples', 'max_new_tokens': 48}` + +## nllb zh->en + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | Quality same/total | +|---|---:|---:|---:|---:|---:| +| ct2_default_fixed64 | 121.32 | 589.47 | 13.12 | 1682.25 | 90/100 | +| ct2_prev_t4_fixed64 | 121.0 | 592.16 | 15.97 | 1401.59 | 90/100 | +| ct2_best_t4_dynamic | 120.29 | 595.79 | 16.97 | 1353.38 | 89/100 | +| ct2_fixed48_experiment | 130.66 | 528.14 | 16.56 | 1345.33 | 89/100 | + +### Quality Notes: ct2_default_fixed64 + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The 
male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_prev_t4_fixed64 + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_best_t4_dynamic + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +### Quality Notes: ct2_fixed48_experiment + +- Input: `男士偏光飞行员太阳镜` +- Candidate: `The male pilot's sunglasses are illuminated by male particle light.` +- Reference: `The male pilot's sunglasses are illuminated by male particle reflection.` + +- Input: `成人亨利危险连帽衫超级英雄Cosplay服装` +- Candidate: `Adult Henry dangerous hat superheroes in cosplay outfit` +- Reference: `Adult Henry dangerous hat superheroes in cosplay costumes` + +- 
Input: `Devoko 30加仑树脂甲板储物箱` +- Candidate: `Devoko 30加 resin decoction storage box` +- Reference: `Devoko 30 plus resin decoction container is used.` + + +## nllb en->zh + +| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | Quality same/total | +|---|---:|---:|---:|---:|---:| +| ct2_default_fixed64 | 95.99 | 701.53 | 6.47 | 3533.98 | 90/100 | +| ct2_prev_t4_fixed64 | 95.46 | 706.05 | 7.93 | 2922.29 | 90/100 | +| ct2_best_t4_dynamic | 95.06 | 707.54 | 8.61 | 2751.74 | 84/100 | +| ct2_fixed48_experiment | 110.54 | 602.77 | 8.5 | 2777.29 | 70/100 | + +### Quality Notes: ct2_default_fixed64 + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + +- Input: `NARMO 925 Sterling Silver Hoop Earrings Heart Hoop Earings for Women U Shaped Hoop Earrings Colorful Cubic Zirconia Large Big Hoops Various Hypoallergenic Hoop Earrings` +- Candidate: `米925 银胡耳环 女子心胡耳环 U 形状胡耳环 色彩式立方圆 巨型大胡 各种异常过敏性胡耳环` +- Reference: `米925 银胡耳环 妇女心胡耳环 U 形状胡耳环 色彩式立方圆 大大胡 各种异常过敏性胡耳环` + + +### Quality Notes: ct2_prev_t4_fixed64 + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + +- Input: `NARMO 925 Sterling Silver Hoop Earrings Heart Hoop Earings for Women U Shaped Hoop Earrings Colorful Cubic Zirconia Large Big Hoops Various Hypoallergenic Hoop 
Earrings` +- Candidate: `米925 银胡耳环 女子心胡耳环 U 形状胡耳环 色彩式立方圆 巨型大胡 各种异常过敏性胡耳环` +- Reference: `米925 银胡耳环 妇女心胡耳环 U 形状胡耳环 色彩式立方圆 大大胡 各种异常过敏性胡耳环` + + +### Quality Notes: ct2_best_t4_dynamic + +- Input: `Sunglasses Men Polarized Aviator Sunglasses for Men Women Fishing Driving Sun glasses Metal Frame UV400 Protection` +- Candidate: `阳光眼镜 男人聚合式飞行员 阳光眼镜 男人 女人 捕鱼 驾驶 阳光眼镜 金属框架` +- Reference: `阳光眼镜 男人聚合式飞行员 阳光眼镜 男人 女人 捕鱼 驾驶 阳光眼镜 金属框架 UV400 保护` + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Squishmallows Original 16-Inch Carpio Teal Scorpion - Large Ultrasoft Official Jazwares Plush` +- Candidate: `鱼原始16英寸的卡皮奥茶叶鱼 - 大型超软官方Jazwares Plush` +- Reference: `鱼原始16英寸的卡皮奥茶叶鱼 - 大超软官方Jazwares Plush` + + +### Quality Notes: ct2_fixed48_experiment + +- Input: `Hillban 10 Pcs Christmas Makeup Bags Gifts for Women Bulk Xmas Cosmetic Bags Inspirational Christian Bible Verse Cosmetic Pouch Christmas Religious Church Travel Canvas Pouch(Classic,White)` +- Candidate: `希尔班10件制品圣诞化袋礼物妇女批量Xmas化袋 灵感基督教圣经诗 化袋圣诞宗教教堂旅行帆布袋 (古典` +- Reference: `希尔班10件制品圣诞化袋礼物妇女批量Xmas化袋 灵感基督教圣经诗 化袋圣诞宗教教堂旅行帆布袋 (古典,白色)` + +- Input: `MAIHUN Women Teacher Life Shirt Teacher Gift Short Sleeve Shirts It s a Beautiful Day for Learning T-Shirt` +- Candidate: `马女教师生活衫教师礼物短袖衫这是学习T恤的美丽的一天` +- Reference: `马女教师生活衫教师礼物短袖衫这是学习T-shirt的美丽的一天` + +- Input: `Devoko 30 Gallon Resin Deck Box Waterproof Outdoor Storage Box for Patio Furniture Pool Accessories Indoor Storage for Cushion Garden Tools (30 Gallon, Black)` +- Candidate: `德沃科30树脂甲板盒子防水室外储存盒子,用于室家具池具配件,用于式花园工具的室内储存 (30,黑色` +- Reference: `德沃科30树脂甲板盒子防水室外储存盒子,用于室家具池具配件,用于式花园工具的室内储存 (30,黑色)` diff --git a/perf_reports/20260318/translation_local_models_ct2/README.md b/perf_reports/20260318/translation_local_models_ct2/README.md new file mode 100644 index 0000000..3d0dbc4 --- /dev/null +++ 
b/perf_reports/20260318/translation_local_models_ct2/README.md @@ -0,0 +1,157 @@ +# Local Translation Model Benchmark Report (CTranslate2) + +测试脚本: +- [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) + +本轮 CT2 结果: +- Markdown:[`translation_local_models_ct2_extended_233253.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md) +- JSON:[`translation_local_models_ct2_extended_233253.json`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.json) + +对照基线: +- 基线 README:[`../translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) +- 基线 Markdown:[`../translation_local_models/translation_local_models_extended_221846.md`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.md) +- 基线 JSON:[`../translation_local_models/translation_local_models_extended_221846.json`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.json) +- 对比分析:[`comparison_vs_hf_baseline.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md) + +测试时间: +- `2026-03-18` + +环境: +- GPU:`Tesla T4 16GB` +- Python env:`.venv-translator` +- Torch / Transformers:`2.10.0+cu128 / 5.3.0` +- CTranslate2:`4.7.1` +- 数据集:[`products_analyzed.csv`](/data/saas-search/products_analyzed.csv) + +## Method + +本轮参数与基线保持一致,方便直接对比: + +- `suite=extended` +- 关闭 cache:`--disable-cache` +- `batch_sweep`:每档 `256` items +- `concurrency_sweep`:每档 `32` requests +- `matrix`:每档 `32` requests +- `concurrency_batch_size=1` +- `batch_size * concurrency <= 128` +- 预热:`1` batch + +复现命令: + +```bash +cd /data/saas-search +./.venv-translator/bin/python - <<'PY' +import json +from datetime import datetime +from pathlib import Path +from types 
import SimpleNamespace + +from scripts.benchmark_translation_local_models import ( + SCENARIOS, + benchmark_extended_scenario, + build_environment_info, + render_markdown_report, +) + +output_dir = Path("perf_reports/20260318/translation_local_models_ct2") +output_dir.mkdir(parents=True, exist_ok=True) + +common = dict( + csv_path="products_analyzed.csv", + limit=0, + output_dir=str(output_dir), + single=True, + scene="sku_name", + batch_size=0, + device_override="", + torch_dtype_override="", + max_new_tokens=0, + num_beams=0, + attn_implementation="", + warmup_batches=1, + disable_cache=True, + suite="extended", + batch_size_list="", + concurrency_list="", + serial_items_per_case=256, + concurrency_requests_per_case=32, + concurrency_batch_size=1, + max_batch_concurrency_product=128, +) + +report = { + "generated_at": datetime.now().isoformat(timespec="seconds"), + "suite": "extended", + "environment": build_environment_info(), + "scenarios": [], +} + +for scenario in SCENARIOS: + args = SimpleNamespace( + **common, + model=scenario["model"], + source_lang=scenario["source_lang"], + target_lang=scenario["target_lang"], + column=scenario["column"], + ) + result = benchmark_extended_scenario(args) + result["scenario"]["name"] = scenario["name"] + report["scenarios"].append(result) + +stamp = datetime.now().strftime("%H%M%S") +(output_dir / f"translation_local_models_ct2_extended_{stamp}.json").write_text( + json.dumps(report, ensure_ascii=False, indent=2), + encoding="utf-8", +) +(output_dir / f"translation_local_models_ct2_extended_{stamp}.md").write_text( + render_markdown_report(report), + encoding="utf-8", +) +PY +``` + +## Key Results + +### 1. 
单流 batch sweep + +| Model | Direction | Best batch | Best items/s | Batch 16 items/s | Batch 16 p95 ms | +|---|---|---:|---:|---:|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `64` | `104.61` | `55.68` | `371.36` | +| `nllb-200-distilled-600m` | `en -> zh` | `64` | `91.26` | `42.42` | `408.81` | +| `opus-mt-zh-en` | `zh -> en` | `64` | `218.5` | `111.61` | `257.18` | +| `opus-mt-en-zh` | `en -> zh` | `32` | `145.12` | `102.05` | `396.16` | + +解读: +- 4 个方向的 bulk 吞吐都明显高于原始 Hugging Face / PyTorch 基线。 +- `nllb en->zh` 的 batch 16 吞吐从 `13.52` 提升到 `42.42 items/s`,提升最明显。 +- `opus-mt-en-zh` 在 CT2 版本里最佳 batch 从 `64` 变成了 `32`,说明它不再需要极端大 batch 才能吃满吞吐。 + +### 2. 单条请求并发 sweep + +| Model | Direction | c=1 items/s | c=1 p95 ms | c=8 p95 ms | c=64 p95 ms | +|---|---|---:|---:|---:|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `8.97` | `163.53` | `1039.32` | `3031.64` | +| `nllb-200-distilled-600m` | `en -> zh` | `5.83` | `259.52` | `2193.01` | `5611.21` | +| `opus-mt-zh-en` | `zh -> en` | `27.85` | `60.61` | `390.32` | `1061.35` | +| `opus-mt-en-zh` | `en -> zh` | `11.02` | `351.74` | `863.08` | `2459.49` | + +解读: +- 在线 query 指标提升非常明显,特别是 `batch_size=1` 的 `p95` 和 `items/s`。 +- CT2 下并发上升仍会推高尾延迟,但恶化幅度比基线小得多。 +- `opus-mt-zh-en` 仍然是在线场景最稳的本地模型;`nllb` 现在也进入了更可用的区间。 + +### 3. 
是否达到预期 + +结论: +- **达到了,而且幅度很大。** +- 本轮 CT2 版本已经满足“在线性能显著增强”的目标,不需要继续为吞吐/延迟做额外紧急优化。 + +判断依据: +- 4 个方向在 `concurrency=1` 下的 `items/s` 全部提升到原来的 `2.0x-3.1x` +- 4 个方向在 `concurrency=1` 下的 `p95` 全部下降到原来的 `29%-44%` +- NLLB 两个方向的 `batch_size=16` 吞吐分别提升 `2.04x` 和 `3.14x` + +## Notes + +- 这轮 `peak_gpu_memory_gb` 基本显示为 `0.0`,不是“CT2 不占显存”,而是当前脚本用的是 `torch.cuda` 统计,无法观测 CT2 的原生 CUDA 分配。 +- 如果后续要补充“显存对比”维度,建议新增 `nvidia-smi` 采样或 NVML 指标采集。 diff --git a/perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md b/perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md new file mode 100644 index 0000000..42ebb3a --- /dev/null +++ b/perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md @@ -0,0 +1,89 @@ +# CTranslate2 vs HF Baseline + +对比对象: +- 基线:[`translation_local_models_extended_221846.json`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.json) +- CT2:[`translation_local_models_ct2_extended_233253.json`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.json) + +结论: +- **本轮 CT2 优化达到预期。** +- 在线翻译的核心指标已经是“明显提升”,不是边际改善。 +- 因此本轮不再继续做第二轮运行时调优,而是先把这组结果沉淀为新的性能基线。 + +## Online Metrics + +关注口径: +- `concurrency_sweep` +- `batch_size=1` +- 重点看 `c=1 items/s`、`c=1 p95 ms` +- 辅助看 `c=8 p95 ms`、`c=64 p95 ms` + +| Model | Direction | c=1 items/s Baseline | c=1 items/s CT2 | Gain | c=1 p95 Baseline ms | c=1 p95 CT2 ms | Reduction | +|---|---|---:|---:|---:|---:|---:|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `4.17` | `8.97` | `+115.11%` | `373.27` | `163.53` | `-56.19%` | +| `nllb-200-distilled-600m` | `en -> zh` | `2.16` | `5.83` | `+169.91%` | `670.78` | `259.52` | `-61.31%` | +| `opus-mt-zh-en` | `zh -> en` | `9.21` | `27.85` | `+202.39%` | `179.12` | `60.61` | `-66.16%` | +| `opus-mt-en-zh` | `en -> zh` | `3.6` | `11.02` | `+206.11%` | `1180.37` | `351.74` | `-70.20%` | + +解读: +- 4 个方向的单条请求吞吐都至少翻倍。 
+- 4 个方向的 `p95` 都下降了 `56%-70%`。 +- 从在线 query 的角度,这已经足够认定为“显著增强”。 + +## Tail Latency + +| Model | Direction | c=8 p95 Baseline ms | c=8 p95 CT2 ms | Reduction | c=64 p95 Baseline ms | c=64 p95 CT2 ms | Reduction | +|---|---|---:|---:|---:|---:|---:|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `2383.8` | `1039.32` | `-56.40%` | `7337.3` | `3031.64` | `-58.68%` | +| `nllb-200-distilled-600m` | `en -> zh` | `3971.01` | `2193.01` | `-44.77%` | `14139.03` | `5611.21` | `-60.31%` | +| `opus-mt-zh-en` | `zh -> en` | `1043.06` | `390.32` | `-62.58%` | `3381.58` | `1061.35` | `-68.61%` | +| `opus-mt-en-zh` | `en -> zh` | `3632.99` | `863.08` | `-76.24%` | `7950.41` | `2459.49` | `-69.06%` | + +解读: +- CT2 不只是提升了低并发平均速度,也明显压低了高并发 tail latency。 +- `nllb en->zh` 仍然是四个方向里最重的,但已经从“非常重”变成“可接受得多”。 + +## Bulk Metrics + +对 bulk 口径,优先看 `batch_sweep` 的 `batch_size=16`,因为它更接近实际服务里兼顾吞吐和延迟的默认配置。 + +| Model | Direction | Batch16 items/s Baseline | Batch16 items/s CT2 | Gain | Batch16 p95 Baseline ms | Batch16 p95 CT2 ms | Reduction | +|---|---|---:|---:|---:|---:|---:|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `27.28` | `55.68` | `+104.11%` | `769.18` | `371.36` | `-51.72%` | +| `nllb-200-distilled-600m` | `en -> zh` | `13.52` | `42.42` | `+213.76%` | `1649.65` | `408.81` | `-75.22%` | +| `opus-mt-zh-en` | `zh -> en` | `41.44` | `111.61` | `+169.33%` | `797.93` | `257.18` | `-67.77%` | +| `opus-mt-en-zh` | `en -> zh` | `24.33` | `102.05` | `+319.44%` | `2098.54` | `396.16` | `-81.12%` | + +解读: +- `nllb-200-distilled-600m` 是这次最值得的优化对象,尤其 `en -> zh` 收益非常大。 +- `opus-mt-en-zh` 的提升更夸张,说明之前它在 HF 路径上有很重的运行时损耗。 + +## Best Throughput Cases + +| Model | Direction | Baseline Best Matrix | Baseline Items/s | CT2 Best Matrix | CT2 Items/s | +|---|---|---|---:|---|---:| +| `nllb-200-distilled-600m` | `zh -> en` | `batch=64, concurrency=2` | `53.95` | `batch=64, concurrency=2` | `114.37` | +| `nllb-200-distilled-600m` | `en -> zh` | `batch=64, concurrency=1` | `34.97` | `batch=64, 
concurrency=2` | `95.59` | +| `opus-mt-zh-en` | `zh -> en` | `batch=64, concurrency=1` | `52.44` | `batch=64, concurrency=2` | `207.86` | +| `opus-mt-en-zh` | `en -> zh` | `batch=64, concurrency=1` | `34.94` | `batch=64, concurrency=2` | `140.91` | + +解读: +- CT2 版本在矩阵里的最优吞吐普遍提升到了原来的 `2.1x-4.0x`。 +- 而且最佳点不再总是“单并发 + 大 batch”,`concurrency=2` 开始变得更有意义。 + +## Why We Stop Here + +本轮没有继续做第二轮优化,原因很直接: + +- 目标是“在线性能显著增强”,这个目标已经达成。 +- 关键在线指标已经不是小幅改进,而是普遍 `2x+` 吞吐和 `50%+` `p95` 降幅。 +- 当前更合理的下一步是把 CT2 结果作为新的基线,再决定是否需要做更细的服务化优化。 + +## Remaining Gaps + +- 质量仍需和业务样本集一起看,尤其 `opus-mt-en-zh` 的少量短句结果可能需要质量侧复核。 +- 当前脚本的 `peak_gpu_memory_gb` 对 CT2 无效,因为它只读取 `torch.cuda` 统计,不覆盖 CT2 的原生 CUDA 分配。 +- 如果下一轮目标从“显著提速”转到“进一步压 tail latency”,优先方向会是: + - 增加服务级微批处理队列 + - 拆分短文本 / 长文本请求桶 + - 评估 `ct2_inter_threads` 与 `max_queued_batches` + - 对 `opus-mt-en-zh` 和 `nllb en->zh` 做更细粒度 batch 默认值调参 diff --git a/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md b/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md new file mode 100644 index 0000000..747ae35 --- /dev/null +++ b/perf_reports/20260318/translation_local_models_ct2/translation_local_models_ct2_extended_233253.md @@ -0,0 +1,263 @@ +# Local Translation Model Extended Benchmark + +- Generated at: `2026-03-18T23:15:04` +- Suite: `extended` +- Python: `3.12.3` +- Torch: `2.10.0+cu128` +- Transformers: `5.3.0` +- CUDA: `True` +- GPU: `Tesla T4` (15.56 GiB) + +## Reading Guide + +- `batch_sweep`: single stream only (`concurrency=1`), used to compare bulk translation efficiency across batch sizes. +- `concurrency_sweep`: fixed request batch size, used to compare online request latency and throughput as concurrency rises. +- `matrix`: combined `batch_size x concurrency` runs, filtered by `batch_size * concurrency <= limit` when configured. 
+ +## nllb-200-distilled-600m zh->en + +- Direction: `zh -> en` +- Column: `title_cn` +- Loaded rows: `2048` +- Load time: `6.0581 s` +- Device: `cuda` +- DType: `float16` +- Cache disabled: `True` + +### Batch Sweep (`concurrency=1`) + +| Batch | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 256 | 256 | 7.0 | 7.0 | 142.91 | 132.84 | 260.42 | 0.0 | +| 4 | 256 | 64 | 20.03 | 5.01 | 199.68 | 198.01 | 302.33 | 0.0 | +| 8 | 256 | 32 | 35.77 | 4.47 | 223.62 | 233.7 | 312.39 | 0.0 | +| 16 | 256 | 16 | 55.68 | 3.48 | 287.36 | 312.34 | 371.36 | 0.0 | +| 32 | 256 | 8 | 83.28 | 2.6 | 384.23 | 395.15 | 466.99 | 0.0 | +| 64 | 256 | 4 | 104.61 | 1.63 | 611.77 | 607.09 | 714.28 | 0.0 | + +### Concurrency Sweep (`batch_size=1`) + +| Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 32 | 32 | 8.97 | 8.97 | 111.26 | 101.15 | 163.53 | 0.0 | +| 2 | 32 | 32 | 9.56 | 9.56 | 204.83 | 185.58 | 312.06 | 0.0 | +| 4 | 32 | 32 | 9.4 | 9.4 | 399.27 | 385.5 | 511.85 | 0.0 | +| 8 | 32 | 32 | 9.81 | 9.81 | 695.45 | 702.43 | 1039.32 | 0.0 | +| 16 | 32 | 32 | 9.98 | 9.98 | 1174.49 | 1251.56 | 2541.88 | 0.0 | +| 64 | 32 | 32 | 9.71 | 9.71 | 1593.45 | 1567.36 | 3031.64 | 0.0 | + +### Batch x Concurrency Matrix + +| Batch | Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 1 | 32 | 32 | 9.97 | 9.97 | 100.21 | 91.15 | 154.02 | 0.0 | +| 1 | 2 | 32 | 32 | 10.19 | 10.19 | 191.51 | 182.72 | 281.96 | 0.0 | +| 1 | 4 | 32 | 32 | 10.4 | 10.4 | 358.38 | 345.82 | 488.71 | 0.0 | +| 1 | 8 | 32 | 32 | 10.21 | 10.21 | 671.22 | 722.27 | 1049.38 | 0.0 | +| 1 | 16 | 32 | 32 | 10.01 | 10.01 | 1168.24 | 1325.37 | 2211.32 | 0.0 | +| 1 | 64 | 32 | 32 | 9.95 | 9.95 | 1542.66 | 1493.91 | 2952.68 | 0.0 | +| 4 
| 1 | 128 | 32 | 22.07 | 5.52 | 181.17 | 168.82 | 291.07 | 0.0 | +| 4 | 2 | 128 | 32 | 24.37 | 6.09 | 322.06 | 288.42 | 548.09 | 0.0 | +| 4 | 4 | 128 | 32 | 24.24 | 6.06 | 620.93 | 543.95 | 1055.41 | 0.0 | +| 4 | 8 | 128 | 32 | 22.47 | 5.62 | 1207.2 | 1203.59 | 1951.79 | 0.0 | +| 4 | 16 | 128 | 32 | 22.22 | 5.56 | 2073.8 | 1972.31 | 4344.0 | 0.0 | +| 8 | 1 | 256 | 32 | 32.44 | 4.06 | 246.41 | 245.92 | 337.1 | 0.0 | +| 8 | 2 | 256 | 32 | 34.57 | 4.32 | 452.65 | 477.44 | 608.67 | 0.0 | +| 8 | 4 | 256 | 32 | 34.37 | 4.3 | 875.33 | 948.09 | 1137.34 | 0.0 | +| 8 | 8 | 256 | 32 | 34.76 | 4.35 | 1596.54 | 1844.24 | 2135.61 | 0.0 | +| 8 | 16 | 256 | 32 | 35.31 | 4.41 | 2680.01 | 3376.09 | 3918.49 | 0.0 | +| 16 | 1 | 512 | 32 | 52.98 | 3.31 | 301.86 | 316.96 | 387.51 | 0.0 | +| 16 | 2 | 512 | 32 | 54.2 | 3.39 | 581.13 | 622.86 | 723.4 | 0.0 | +| 16 | 4 | 512 | 32 | 54.06 | 3.38 | 1135.6 | 1189.63 | 1460.36 | 0.0 | +| 16 | 8 | 512 | 32 | 53.91 | 3.37 | 2131.92 | 2427.31 | 2785.87 | 0.0 | +| 32 | 1 | 1024 | 32 | 81.53 | 2.55 | 391.96 | 406.43 | 452.1 | 0.0 | +| 32 | 2 | 1024 | 32 | 80.98 | 2.53 | 777.72 | 795.59 | 905.77 | 0.0 | +| 32 | 4 | 1024 | 32 | 80.23 | 2.51 | 1525.26 | 1566.41 | 1831.7 | 0.0 | +| 64 | 1 | 2048 | 32 | 110.08 | 1.72 | 580.78 | 586.87 | 691.83 | 0.0 | +| 64 | 2 | 2048 | 32 | 114.37 | 1.79 | 1100.27 | 1127.51 | 1243.02 | 0.0 | + +## nllb-200-distilled-600m en->zh + +- Direction: `en -> zh` +- Column: `title` +- Loaded rows: `2048` +- Load time: `5.564 s` +- Device: `cuda` +- DType: `float16` +- Cache disabled: `True` + +### Batch Sweep (`concurrency=1`) + +| Batch | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 256 | 256 | 5.03 | 5.03 | 198.6 | 192.97 | 319.06 | 0.0 | +| 4 | 256 | 64 | 13.74 | 3.44 | 291.05 | 311.76 | 349.79 | 0.0 | +| 8 | 256 | 32 | 24.46 | 3.06 | 327.02 | 336.32 | 366.86 | 0.0 | +| 16 | 256 | 16 | 42.42 | 2.65 | 377.19 | 381.47 | 408.81 | 
0.0 | +| 32 | 256 | 8 | 67.38 | 2.11 | 474.92 | 474.35 | 502.51 | 0.0 | +| 64 | 256 | 4 | 91.26 | 1.43 | 701.3 | 707.24 | 738.5 | 0.0 | + +### Concurrency Sweep (`batch_size=1`) + +| Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 32 | 32 | 5.83 | 5.83 | 171.33 | 162.21 | 259.52 | 0.0 | +| 2 | 32 | 32 | 5.62 | 5.62 | 351.38 | 349.24 | 479.41 | 0.0 | +| 4 | 32 | 32 | 5.51 | 5.51 | 687.51 | 695.0 | 875.41 | 0.0 | +| 8 | 32 | 32 | 5.53 | 5.53 | 1279.01 | 1331.94 | 2193.01 | 0.0 | +| 16 | 32 | 32 | 5.45 | 5.45 | 2211.06 | 2621.54 | 3879.29 | 0.0 | +| 64 | 32 | 32 | 5.37 | 5.37 | 3113.23 | 3074.42 | 5611.21 | 0.0 | + +### Batch x Concurrency Matrix + +| Batch | Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 1 | 32 | 32 | 5.5 | 5.5 | 181.68 | 171.06 | 262.47 | 0.0 | +| 1 | 2 | 32 | 32 | 5.54 | 5.54 | 356.26 | 348.75 | 479.03 | 0.0 | +| 1 | 4 | 32 | 32 | 5.55 | 5.55 | 683.35 | 697.01 | 845.47 | 0.0 | +| 1 | 8 | 32 | 32 | 5.87 | 5.87 | 1218.56 | 1297.55 | 1811.3 | 0.0 | +| 1 | 16 | 32 | 32 | 5.68 | 5.68 | 2076.8 | 2443.82 | 3078.65 | 0.0 | +| 1 | 64 | 32 | 32 | 5.4 | 5.4 | 3006.17 | 3009.57 | 5493.11 | 0.0 | +| 4 | 1 | 128 | 32 | 14.65 | 3.66 | 272.44 | 267.49 | 344.28 | 0.0 | +| 4 | 2 | 128 | 32 | 15.03 | 3.76 | 524.29 | 523.34 | 650.15 | 0.0 | +| 4 | 4 | 128 | 32 | 14.95 | 3.74 | 1019.74 | 1043.76 | 1211.75 | 0.0 | +| 4 | 8 | 128 | 32 | 14.63 | 3.66 | 1952.63 | 2251.77 | 2450.21 | 0.0 | +| 4 | 16 | 128 | 32 | 14.68 | 3.67 | 3303.53 | 4188.22 | 4628.4 | 0.0 | +| 8 | 1 | 256 | 32 | 25.36 | 3.17 | 315.38 | 322.31 | 345.35 | 0.0 | +| 8 | 2 | 256 | 32 | 25.1 | 3.14 | 627.19 | 647.23 | 709.93 | 0.0 | +| 8 | 4 | 256 | 32 | 25.11 | 3.14 | 1212.91 | 1259.02 | 1357.39 | 0.0 | +| 8 | 8 | 256 | 32 | 25.09 | 3.14 | 2267.61 | 2538.76 | 2620.04 | 0.0 | +| 8 | 16 
| 256 | 32 | 24.74 | 3.09 | 3940.32 | 5035.7 | 5297.16 | 0.0 | +| 16 | 1 | 512 | 32 | 42.88 | 2.68 | 372.8 | 371.73 | 417.6 | 0.0 | +| 16 | 2 | 512 | 32 | 44.16 | 2.76 | 712.56 | 734.6 | 768.42 | 0.0 | +| 16 | 4 | 512 | 32 | 44.08 | 2.76 | 1385.99 | 1460.14 | 1502.65 | 0.0 | +| 16 | 8 | 512 | 32 | 43.7 | 2.73 | 2617.84 | 2954.76 | 3005.53 | 0.0 | +| 32 | 1 | 1024 | 32 | 66.94 | 2.09 | 476.4 | 469.61 | 523.49 | 0.0 | +| 32 | 2 | 1024 | 32 | 69.75 | 2.18 | 902.46 | 912.39 | 977.26 | 0.0 | +| 32 | 4 | 1024 | 32 | 69.36 | 2.17 | 1759.01 | 1839.11 | 1888.06 | 0.0 | +| 64 | 1 | 2048 | 32 | 92.15 | 1.44 | 693.91 | 692.06 | 731.65 | 0.0 | +| 64 | 2 | 2048 | 32 | 95.59 | 1.49 | 1315.82 | 1338.66 | 1387.23 | 0.0 | + +## opus-mt-zh-en zh->en + +- Direction: `zh -> en` +- Column: `title_cn` +- Loaded rows: `2048` +- Load time: `1.0381 s` +- Device: `cuda` +- DType: `float16` +- Cache disabled: `True` + +### Batch Sweep (`concurrency=1`) + +| Batch | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 256 | 256 | 18.84 | 18.84 | 53.08 | 47.66 | 95.63 | 0.0 | +| 4 | 256 | 64 | 47.6 | 11.9 | 84.03 | 75.74 | 119.84 | 0.0 | +| 8 | 256 | 32 | 74.34 | 9.29 | 107.61 | 96.19 | 141.84 | 0.0 | +| 16 | 256 | 16 | 111.61 | 6.98 | 143.34 | 126.49 | 257.18 | 0.0 | +| 32 | 256 | 8 | 154.61 | 4.83 | 206.96 | 158.62 | 438.14 | 0.0 | +| 64 | 256 | 4 | 218.5 | 3.41 | 292.9 | 213.26 | 547.27 | 0.0 | + +### Concurrency Sweep (`batch_size=1`) + +| Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 32 | 32 | 27.85 | 27.85 | 35.78 | 33.41 | 60.61 | 0.0 | +| 2 | 32 | 32 | 28.8 | 28.8 | 67.82 | 62.33 | 95.46 | 0.0 | +| 4 | 32 | 32 | 28.62 | 28.62 | 130.54 | 125.84 | 201.95 | 0.0 | +| 8 | 32 | 32 | 28.55 | 28.55 | 242.59 | 227.8 | 390.32 | 0.0 | +| 16 | 32 | 32 | 27.28 | 27.28 | 449.66 | 521.12 | 912.62 | 
0.0 | +| 64 | 32 | 32 | 27.4 | 27.4 | 557.86 | 517.06 | 1061.35 | 0.0 | + +### Batch x Concurrency Matrix + +| Batch | Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 1 | 32 | 32 | 25.91 | 25.91 | 38.29 | 34.67 | 61.44 | 0.0 | +| 1 | 2 | 32 | 32 | 28.2 | 28.2 | 68.02 | 61.92 | 110.41 | 0.0 | +| 1 | 4 | 32 | 32 | 27.03 | 27.03 | 139.18 | 134.97 | 184.23 | 0.0 | +| 1 | 8 | 32 | 32 | 27.87 | 27.87 | 248.14 | 251.39 | 356.34 | 0.0 | +| 1 | 16 | 32 | 32 | 27.1 | 27.1 | 424.06 | 477.76 | 612.64 | 0.0 | +| 1 | 64 | 32 | 32 | 29.17 | 29.17 | 503.96 | 475.72 | 997.76 | 0.0 | +| 4 | 1 | 128 | 32 | 48.35 | 12.09 | 82.67 | 66.35 | 116.75 | 0.0 | +| 4 | 2 | 128 | 32 | 48.09 | 12.02 | 164.18 | 121.89 | 401.43 | 0.0 | +| 4 | 4 | 128 | 32 | 48.2 | 12.05 | 290.2 | 251.27 | 561.3 | 0.0 | +| 4 | 8 | 128 | 32 | 49.45 | 12.36 | 531.9 | 449.31 | 1115.99 | 0.0 | +| 4 | 16 | 128 | 32 | 58.28 | 14.57 | 713.26 | 834.24 | 1325.58 | 0.0 | +| 8 | 1 | 256 | 32 | 96.87 | 12.11 | 82.53 | 70.28 | 121.7 | 0.0 | +| 8 | 2 | 256 | 32 | 106.37 | 13.3 | 148.34 | 125.3 | 357.79 | 0.0 | +| 8 | 4 | 256 | 32 | 111.35 | 13.92 | 274.44 | 250.25 | 600.44 | 0.0 | +| 8 | 8 | 256 | 32 | 96.65 | 12.08 | 579.06 | 667.78 | 1025.51 | 0.0 | +| 8 | 16 | 256 | 32 | 80.62 | 10.08 | 1236.24 | 1557.05 | 1886.21 | 0.0 | +| 16 | 1 | 512 | 32 | 91.27 | 5.7 | 174.5 | 124.75 | 533.12 | 0.0 | +| 16 | 2 | 512 | 32 | 105.66 | 6.6 | 299.07 | 220.94 | 631.62 | 0.0 | +| 16 | 4 | 512 | 32 | 106.3 | 6.64 | 581.51 | 481.69 | 1163.67 | 0.0 | +| 16 | 8 | 512 | 32 | 104.18 | 6.51 | 1039.37 | 1160.52 | 1800.29 | 0.0 | +| 32 | 1 | 1024 | 32 | 119.48 | 3.73 | 267.65 | 163.8 | 586.28 | 0.0 | +| 32 | 2 | 1024 | 32 | 140.5 | 4.39 | 442.92 | 311.65 | 820.49 | 0.0 | +| 32 | 4 | 1024 | 32 | 150.2 | 4.69 | 807.09 | 827.33 | 1236.61 | 0.0 | +| 64 | 1 | 2048 | 32 | 184.12 | 2.88 | 347.39 | 264.91 | 617.24 | 0.0 | +| 64 | 2 | 2048 | 32 | 
207.86 | 3.25 | 610.34 | 684.9 | 1064.5 | 0.0 | + +## opus-mt-en-zh en->zh + +- Direction: `en -> zh` +- Column: `title` +- Loaded rows: `2048` +- Load time: `0.3704 s` +- Device: `cuda` +- DType: `float16` +- Cache disabled: `True` + +### Batch Sweep (`concurrency=1`) + +| Batch | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 256 | 256 | 13.17 | 13.17 | 75.92 | 64.12 | 136.47 | 0.0 | +| 4 | 256 | 64 | 28.7 | 7.17 | 139.37 | 112.06 | 476.36 | 0.0 | +| 8 | 256 | 32 | 49.1 | 6.14 | 162.93 | 124.84 | 391.18 | 0.0 | +| 16 | 256 | 16 | 102.05 | 6.38 | 156.78 | 124.18 | 396.16 | 0.0 | +| 32 | 256 | 8 | 145.12 | 4.53 | 220.5 | 176.43 | 408.92 | 0.0 | +| 64 | 256 | 4 | 131.67 | 2.06 | 486.05 | 493.97 | 654.21 | 0.0 | + +### Concurrency Sweep (`batch_size=1`) + +| Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 32 | 32 | 11.02 | 11.02 | 90.71 | 54.35 | 351.74 | 0.0 | +| 2 | 32 | 32 | 11.29 | 11.29 | 174.4 | 117.99 | 536.84 | 0.0 | +| 4 | 32 | 32 | 11.6 | 11.6 | 307.77 | 233.97 | 823.12 | 0.0 | +| 8 | 32 | 32 | 11.54 | 11.54 | 471.91 | 438.97 | 863.08 | 0.0 | +| 16 | 32 | 32 | 10.86 | 10.86 | 906.19 | 949.77 | 1827.08 | 0.0 | +| 64 | 32 | 32 | 11.31 | 11.31 | 1095.54 | 919.35 | 2459.49 | 0.0 | + +### Batch x Concurrency Matrix + +| Batch | Concurrency | Rows | Requests | Items/s | Req/s | Avg req ms | Req p50 ms | Req p95 ms | Peak GPU GiB | +|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 1 | 1 | 32 | 32 | 10.72 | 10.72 | 92.99 | 58.17 | 361.99 | 0.0 | +| 1 | 2 | 32 | 32 | 11.35 | 11.35 | 174.68 | 115.43 | 542.24 | 0.0 | +| 1 | 4 | 32 | 32 | 11.44 | 11.44 | 312.19 | 226.82 | 828.3 | 0.0 | +| 1 | 8 | 32 | 32 | 11.64 | 11.64 | 506.86 | 415.08 | 1147.15 | 0.0 | +| 1 | 16 | 32 | 32 | 11.15 | 11.15 | 1009.4 | 838.27 | 2103.9 | 0.0 | +| 1 | 64 | 32 | 32 | 11.08 | 
11.08 | 1167.27 | 984.4 | 2532.28 | 0.0 | +| 4 | 1 | 128 | 32 | 31.83 | 7.96 | 125.51 | 109.44 | 216.9 | 0.0 | +| 4 | 2 | 128 | 32 | 38.97 | 9.74 | 203.2 | 180.61 | 419.46 | 0.0 | +| 4 | 4 | 128 | 32 | 43.78 | 10.95 | 353.68 | 343.79 | 546.77 | 0.0 | +| 4 | 8 | 128 | 32 | 37.23 | 9.31 | 770.39 | 880.33 | 964.82 | 0.0 | +| 4 | 16 | 128 | 32 | 32.48 | 8.12 | 1564.07 | 1825.11 | 2244.63 | 0.0 | +| 8 | 1 | 256 | 32 | 42.33 | 5.29 | 188.66 | 147.07 | 519.33 | 0.0 | +| 8 | 2 | 256 | 32 | 45.05 | 5.63 | 350.91 | 283.6 | 717.38 | 0.0 | +| 8 | 4 | 256 | 32 | 44.84 | 5.6 | 690.69 | 589.12 | 1260.54 | 0.0 | +| 8 | 8 | 256 | 32 | 44.6 | 5.58 | 1318.2 | 1368.26 | 2019.86 | 0.0 | +| 8 | 16 | 256 | 32 | 44.53 | 5.57 | 2168.52 | 2448.18 | 3105.11 | 0.0 | +| 16 | 1 | 512 | 32 | 59.23 | 3.7 | 270.08 | 206.79 | 568.32 | 0.0 | +| 16 | 2 | 512 | 32 | 78.16 | 4.88 | 392.89 | 270.81 | 709.39 | 0.0 | +| 16 | 4 | 512 | 32 | 65.35 | 4.08 | 921.0 | 946.61 | 1389.25 | 0.0 | +| 16 | 8 | 512 | 32 | 65.64 | 4.1 | 1697.84 | 1572.57 | 2423.97 | 0.0 | +| 32 | 1 | 1024 | 32 | 84.23 | 2.63 | 379.52 | 279.85 | 629.28 | 0.0 | +| 32 | 2 | 1024 | 32 | 101.77 | 3.18 | 610.89 | 620.5 | 1038.84 | 0.0 | +| 32 | 4 | 1024 | 32 | 100.64 | 3.15 | 1175.94 | 1129.62 | 1794.84 | 0.0 | +| 64 | 1 | 2048 | 32 | 136.74 | 2.14 | 467.8 | 476.82 | 680.43 | 0.0 | +| 64 | 2 | 2048 | 32 | 140.91 | 2.2 | 890.68 | 983.82 | 1138.98 | 0.0 | diff --git a/perf_reports/20260318/translation_local_models_ct2_focus/README.md b/perf_reports/20260318/translation_local_models_ct2_focus/README.md new file mode 100644 index 0000000..2092880 --- /dev/null +++ b/perf_reports/20260318/translation_local_models_ct2_focus/README.md @@ -0,0 +1,158 @@ +# Local Translation Model Focused T4 Tuning + +测试脚本: +- [`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) + +本轮聚焦结果: +- 
Markdown:[`translation_local_models_focus_235018.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md) +- JSON:[`translation_local_models_focus_235018.json`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.json) + +说明: +- 这份报告是第一轮 T4 聚焦调优结论。 +- 对 `nllb-200-distilled-600M`,当前最新推荐已经由专项报告覆盖: + [`../nllb_t4_product_names_ct2/README.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/README.md) +- 本页里关于 NLLB 的 `ct2_inter_threads=2 + ct2_max_queued_batches=16` 结论,应视为已被更新。 + +相关报告: +- 基线扩展报告:[`../translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) +- CT2 扩展报告:[`../translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) +- CT2 与 HF 对比:[`../translation_local_models_ct2/comparison_vs_hf_baseline.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/comparison_vs_hf_baseline.md) + +测试时间: +- `2026-03-18` + +环境: +- GPU:`Tesla T4 16GB` +- Python env:`.venv-translator` +- Torch / Transformers:`2.10.0+cu128 / 5.3.0` +- CTranslate2:`4.7.1` + +## Scope + +这轮不再做完整矩阵,只看两个目标场景: + +- `high batch + low concurrency` + - `batch=32/64/128` + - `concurrency=1` +- `high concurrency + low batch` + - `batch=1` + - `concurrency=8/16/32/64` + +对比的两个 CT2 变体: + +- `ct2_default` + - 当前默认:`ct2_inter_threads=1`、`ct2_max_queued_batches=0`、`ct2_batch_type=examples` +- `ct2_tuned_t4` + - 调优候选:`ct2_inter_threads=2`、`ct2_max_queued_batches=16`、`ct2_batch_type=examples` + +## Recommendation + +结论先写在前面: + +- **NLLB 推荐升级到 `ct2_inter_threads=2 + ct2_max_queued_batches=16`。** +- `opus-mt-zh-en` 维持默认更稳。 +- `opus-mt-en-zh` 在大 batch 和高并发吞吐上有收益,但在线 `c=8` 的 `p95` 有波动,不建议直接把同一套 tuned 参数作为线上默认。 + +这也是为什么当前配置只把 NLLB 调成了 tuned profile,而两个 Marian 模型保持保守默认值。 + +## Key Results + +### 1. 
NLLB 是这轮最值得调的模型 + +`nllb-200-distilled-600m zh -> en` + +| Scenario | Default | Tuned | 结果 | +|---|---:|---:|---| +| `batch=64, concurrency=1` items/s | `113.25` | `111.86` | 基本持平 | +| `batch=64, concurrency=1` p95 ms | `662.38` | `657.84` | 基本持平 | +| `batch=1, concurrency=16` items/s | `10.34` | `12.91` | 明显提升 | +| `batch=1, concurrency=16` p95 ms | `1904.9` | `1368.92` | 明显下降 | +| `batch=1, concurrency=32` items/s | `10.17` | `12.8` | 明显提升 | +| `batch=1, concurrency=32` p95 ms | `2876.88` | `2350.5` | 明显下降 | + +`nllb-200-distilled-600m en -> zh` + +| Scenario | Default | Tuned | 结果 | +|---|---:|---:|---| +| `batch=64, concurrency=1` items/s | `96.27` | `93.36` | 小幅回落 | +| `batch=64, concurrency=1` p95 ms | `701.75` | `721.79` | 小幅变差 | +| `batch=1, concurrency=16` items/s | `5.51` | `7.91` | 明显提升 | +| `batch=1, concurrency=16` p95 ms | `4613.05` | `2039.17` | 大幅下降 | +| `batch=1, concurrency=32` items/s | `5.46` | `7.9` | 明显提升 | +| `batch=1, concurrency=32` p95 ms | `5554.4` | `3912.75` | 明显下降 | + +解读: +- NLLB 的 tuned profile 主要是把 T4 的并发潜力释放出来。 +- bulk 场景几乎没有受伤,尤其 `zh -> en` 基本持平。 +- 在线场景收益非常大,所以这轮调优最应该落在 NLLB 上。 + +### 2. 
Marian 不适合统一套用 NLLB 的 tuned 参数 + +`opus-mt-zh-en zh -> en` + +- `batch=64, concurrency=1`:`164.1 -> 151.21 items/s`,默认更好 +- `batch=1, concurrency=32`:`27.5 -> 29.83 items/s`,tuned 略好 +- `batch=1, concurrency=64`:`28.43 -> 26.85 items/s`,默认更好 + +结论: +- 这个模型已经很轻,默认 profile 更均衡。 +- 不值得为了少量中并发收益牺牲大 batch 或高并发稳定性。 + +`opus-mt-en-zh en -> zh` + +- `batch=64, concurrency=1`:`114.34 -> 121.87 items/s` +- `batch=128, concurrency=1`:`162.29 -> 210.29 items/s` +- `batch=1, concurrency=16`:`11.22 -> 12.65 items/s` +- `batch=1, concurrency=8` 的 `p95` 从 `798.77` 变成 `1199.98` + +结论: +- 这个模型对 tuned profile 更敏感,吞吐会明显变好。 +- 但在线 `c=8` 的 `p95` 变差,说明它更像“专用吞吐配置”,不适合直接作为统一线上默认。 + +## T4 Experience Summary + +这轮真正有价值的经验: + +- **经验 1:不要再用完整矩阵找方向。** + - 先只看 `high batch + low concurrency` 和 `high concurrency + low batch` 两个极端,效率更高。 + +- **经验 2:NLLB 在 T4 上确实吃 `inter_threads` 和队列深度。** + - `ct2_inter_threads=2` + - `ct2_max_queued_batches=16` + - 这组参数对高并发 `batch=1` 在线场景收益最明显。 + +- **经验 3:`inter_threads=4` 太激进。** + - 它能把部分高并发吞吐继续往上推。 + - 但会严重伤害大 batch 吞吐,尤其 `batch=64` 这类 bulk 场景。 + - 因此不适合作为通用服务默认值。 + +- **经验 4:`ct2_batch_type=tokens` 不是当前 T4 的主增益点。** + - 对 `batch=1` 的在线场景没有带来稳定收益。 + - 当前项目里优先保留 `examples` 更稳妥。 + +- **经验 5:单模型单 worker 仍然是默认部署方式。** + - 本轮调优解决的是同一 worker 内的 GPU 利用率问题。 + - 不是靠堆 FastAPI worker 数来提吞吐。 + +## Deployment / Config Tasks Worth Keeping + +这些任务被证明是“应该沉淀到文档和配置里”的: + +- 把本地 Marian / NLLB 统一迁移到 CTranslate2 +- 使用 `float16` 转换并预生成 CT2 模型目录 +- 保持单 worker,避免重复加载模型 +- 对 NLLB 启用: + - `ct2_inter_threads=2` + - `ct2_max_queued_batches=16` + - `ct2_batch_type=examples` +- Marian 继续保守默认: + - `ct2_inter_threads=1` + - `ct2_max_queued_batches=0` + +## Next Step + +如果下一轮继续压线上延迟,优先顺序建议是: + +1. 服务级微批处理队列 +2. 短文本 / 长文本分桶 +3. 
为 `opus-mt-en-zh` 增加“在线默认”和“离线高吞吐”两套配置 diff --git a/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md b/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md new file mode 100644 index 0000000..dc0db72 --- /dev/null +++ b/perf_reports/20260318/translation_local_models_ct2_focus/translation_local_models_focus_235018.md @@ -0,0 +1,132 @@ +# Local Translation Model Focused Benchmark + +- Generated at: `2026-03-18T23:45:50` +- Python: `3.12.3` +- Torch: `2.10.0+cu128` +- Transformers: `5.3.0` +- CUDA: `True` +- GPU: `Tesla T4` (15.56 GiB) + +## Scope + +- Scenario 1: high batch size + low concurrency +- Scenario 2: high concurrency + low batch size +- Variants in this report: + - `ct2_default`: `{}` + - `ct2_tuned_t4`: `{'ct2_inter_threads': 2, 'ct2_max_queued_batches': 16, 'ct2_batch_type': 'examples'}` + +## nllb-200-distilled-600m zh->en + +- Direction: `zh -> en` +- Column: `title_cn` + +### Variant `ct2_default` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 81.26 | 458.67 | 393.78 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 113.25 | 662.38 | 565.09 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 142.43 | 959.47 | 898.64 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 11.24 | 919.46 | 599.51 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 10.34 | 1904.9 | 1124.8 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 10.17 | 2876.88 | 1495.51 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 10.32 | 2837.64 | 1442.59 | + +### Variant `ct2_tuned_t4` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 79.44 | 464.07 | 402.81 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 111.86 | 657.84 | 572.15 | +| 
high-batch/low-concurrency | batch=128, concurrency=1 | 137.56 | 994.04 | 930.45 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 11.65 | 778.66 | 596.33 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 12.91 | 1368.92 | 902.12 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 12.8 | 2350.5 | 1237.45 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 11.06 | 2729.36 | 1413.83 | + +## nllb-200-distilled-600m en->zh + +- Direction: `en -> zh` +- Column: `title` + +### Variant `ct2_default` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 70.7 | 481.89 | 452.61 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 96.27 | 701.75 | 664.81 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 114.27 | 1137.6 | 1120.15 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 5.54 | 1850.06 | 1287.52 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 5.51 | 4613.05 | 2252.26 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 5.46 | 5554.4 | 3022.86 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 5.47 | 5514.61 | 3035.08 | + +### Variant `ct2_tuned_t4` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 67.87 | 499.47 | 471.45 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 93.36 | 721.79 | 685.53 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 115.0 | 1126.53 | 1113.05 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 7.9 | 1138.18 | 905.78 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 7.91 | 2039.17 | 1555.46 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 7.9 | 3912.75 | 2119.17 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 6.61 | 4565.12 | 2434.92 | + +## opus-mt-zh-en zh->en + +- Direction: `zh -> en` +- Column: `title_cn` + 
+### Variant `ct2_default` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 130.2 | 544.94 | 245.76 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 164.1 | 597.69 | 389.99 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 196.91 | 768.55 | 650.03 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 29.4 | 324.88 | 230.83 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 28.26 | 693.67 | 415.98 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 27.5 | 1049.24 | 572.84 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 28.43 | 994.92 | 520.92 | + +### Variant `ct2_tuned_t4` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 120.2 | 582.58 | 266.21 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 151.21 | 635.63 | 423.24 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 196.63 | 761.85 | 650.95 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 30.43 | 361.76 | 239.25 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 28.32 | 629.6 | 423.42 | +| high-concurrency/low-batch | batch=1, concurrency=32 | 29.83 | 994.19 | 573.64 | +| high-concurrency/low-batch | batch=1, concurrency=64 | 26.85 | 1092.87 | 612.05 | + +## opus-mt-en-zh en->zh + +- Direction: `en -> zh` +- Column: `title` + +### Variant `ct2_default` + +| Scenario | Setting | Items/s | Req p95 ms | Avg req ms | +|---|---|---:|---:|---:| +| high-batch/low-concurrency | batch=32, concurrency=1 | 78.22 | 626.96 | 409.09 | +| high-batch/low-concurrency | batch=64, concurrency=1 | 114.34 | 699.88 | 559.7 | +| high-batch/low-concurrency | batch=128, concurrency=1 | 162.29 | 796.29 | 788.66 | +| high-concurrency/low-batch | batch=1, concurrency=8 | 11.25 | 798.77 | 489.06 | +| high-concurrency/low-batch | batch=1, concurrency=16 | 
#!/usr/bin/env python3
"""Focused NLLB T4 tuning benchmark for product-name translation."""

from __future__ import annotations

import argparse
import copy
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Make the project importable when the script is run directly from scripts/.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from config.services_config import get_translation_config
from scripts.benchmark_translation_local_models import (
    benchmark_concurrency_case,
    benchmark_serial_case,
    build_environment_info,
    ensure_cuda_stats_reset,
    load_texts,
)
from translation.service import TranslationService


# Both directions of the NLLB product-name workload; `column` selects the
# CSV column benchmarked, `scene` is passed through to the translate() call.
SCENARIOS = [
    {
        "name": "nllb zh->en",
        "model": "nllb-200-distilled-600m",
        "source_lang": "zh",
        "target_lang": "en",
        "column": "title_cn",
        "scene": "sku_name",
    },
    {
        "name": "nllb en->zh",
        "model": "nllb-200-distilled-600m",
        "source_lang": "en",
        "target_lang": "zh",
        "column": "title",
        "scene": "sku_name",
    },
]

# CT2 capability overrides compared in this round. Each variant is applied on
# top of the configured capability for the model under test.
VARIANTS = [
    {
        "name": "ct2_default_fixed64",
        "description": "Original CT2 default",
        "overrides": {
            "ct2_inter_threads": 1,
            "ct2_max_queued_batches": 0,
            "ct2_batch_type": "examples",
            "max_new_tokens": 64,
        },
    },
    {
        "name": "ct2_prev_t4_fixed64",
        "description": "Previous T4 tuning result",
        "overrides": {
            "ct2_inter_threads": 2,
            "ct2_max_queued_batches": 16,
            "ct2_batch_type": "examples",
            "max_new_tokens": 64,
        },
    },
    {
        "name": "ct2_best_t4_dynamic",
        "description": "Recommended T4 profile after this round",
        "overrides": {
            "ct2_inter_threads": 4,
            "ct2_max_queued_batches": 32,
            "ct2_batch_type": "examples",
            "max_new_tokens": 64,
            "ct2_decoding_length_mode": "source",
            "ct2_decoding_length_extra": 8,
            "ct2_decoding_length_min": 32,
        },
    },
    {
        "name": "ct2_fixed48_experiment",
        "description": "High-gain experiment with truncation risk",
        "overrides": {
            "ct2_inter_threads": 3,
            "ct2_max_queued_batches": 16,
            "ct2_batch_type": "examples",
            "max_new_tokens": 48,
        },
    },
]


def parse_args() -> argparse.Namespace:
    """Parse CLI arguments for the focused NLLB T4 tuning run."""
    parser = argparse.ArgumentParser(description="Focused NLLB T4 tuning benchmark")
    parser.add_argument("--csv-path", default="products_analyzed.csv", help="Benchmark dataset CSV path")
    parser.add_argument(
        "--output-dir",
        default="perf_reports/20260318/nllb_t4_product_names_ct2",
        help="Directory for JSON/Markdown reports",
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Batch size for the bulk scenario")
    parser.add_argument("--batch-items", type=int, default=256, help="Rows used for the bulk scenario")
    parser.add_argument("--concurrency", type=int, default=64, help="Concurrency for the online scenario")
    parser.add_argument(
        "--requests-per-case",
        type=int,
        default=24,
        help="Requests per worker in the online scenario",
    )
    parser.add_argument("--quality-samples", type=int, default=100, help="Rows used for quality spot-checks")
    parser.add_argument("--warmup-batches", type=int, default=1, help="Warmup batches before measuring")
    return parser.parse_args()


def build_service(model: str, overrides: Dict[str, Any]) -> Tuple[TranslationService, Dict[str, Any]]:
    """Build a TranslationService with only `model` enabled and `overrides` applied.

    Caching is disabled on every capability so measurements reflect real
    inference. Returns the service plus the (mutated) capability dict so
    callers can pass it to the benchmark helpers.
    """
    config = copy.deepcopy(get_translation_config())
    for name, cfg in config["capabilities"].items():
        cfg["enabled"] = name == model
        cfg["use_cache"] = False
    config["default_model"] = model
    capability = config["capabilities"][model]
    capability.update(overrides)
    return TranslationService(config), capability


def build_quality_reference_overrides(overrides: Dict[str, Any]) -> Dict[str, Any]:
    """Derive the reference (quality-baseline) overrides for a variant.

    Strips the dynamic-decoding keys and raises max_new_tokens back to at
    least 64, so the reference run decodes with the safe fixed budget and
    truncation introduced by a variant becomes visible in the diff.
    """
    reference = dict(overrides)
    reference.pop("ct2_decoding_length_mode", None)
    reference.pop("ct2_decoding_length_extra", None)
    reference.pop("ct2_decoding_length_min", None)
    reference["max_new_tokens"] = max(64, int(reference.get("max_new_tokens", 64)))
    return reference


def summarize_quality(reference_outputs: List[Any], candidate_outputs: List[Any], texts: List[str]) -> Dict[str, Any]:
    """Compare candidate vs reference translations item by item.

    Returns counts of identical/changed outputs plus up to 3 sample diffs.
    NOTE(review): zip() silently truncates if the output lists are shorter
    than `texts` — assumes the backends return one output per input.
    """
    same = 0
    diffs: List[Dict[str, str]] = []
    for text, ref_output, candidate_output in zip(texts, reference_outputs, candidate_outputs):
        if ref_output == candidate_output:
            same += 1
            continue
        if len(diffs) < 3:  # keep the report small: only the first few diffs
            diffs.append(
                {
                    "input": text,
                    "candidate": "" if candidate_output is None else str(candidate_output),
                    "reference": "" if ref_output is None else str(ref_output),
                }
            )
    return {
        "same": same,
        "total": len(texts),
        "changed": len(texts) - same,
        "sample_diffs": diffs,
    }


def render_markdown(report: Dict[str, Any]) -> str:
    """Render the JSON report structure as the Markdown tuning report."""
    lines = [
        "# NLLB T4 Product-Name Tuning",
        "",
        f"- Generated at: `{report['generated_at']}`",
        f"- Python: `{report['environment']['python']}`",
        f"- Torch: `{report['environment']['torch']}`",
        f"- Transformers: `{report['environment']['transformers']}`",
        f"- CUDA: `{report['environment']['cuda_available']}`",
    ]
    if report["environment"]["gpu_name"]:
        lines.append(f"- GPU: `{report['environment']['gpu_name']}` ({report['environment']['gpu_total_mem_gb']} GiB)")
    lines.extend(
        [
            "",
            "## Scope",
            "",
            f"- Bulk scenario: `batch={report['config']['batch_size']}, concurrency=1`",
            f"- Online scenario: `batch=1, concurrency={report['config']['concurrency']}`",
            f"- Online requests per worker: `{report['config']['requests_per_case']}`",
            f"- Quality spot-check samples: `{report['config']['quality_samples']}`",
            "",
            "## Variants",
            "",
        ]
    )
    for variant in report["variants"]:
        lines.append(f"- `{variant['name']}`: {variant['description']} -> `{variant['overrides']}`")

    # One summary table per scenario, then per-variant quality notes.
    for scenario in report["scenarios"]:
        lines.extend(
            [
                "",
                f"## {scenario['name']}",
                "",
                "| Variant | Bulk items/s | Bulk p95 ms | Online items/s | Online p95 ms | Quality same/total |",
                "|---|---:|---:|---:|---:|---:|",
            ]
        )
        for variant in scenario["variants"]:
            quality = variant["quality_vs_reference"]
            lines.append(
                f"| {variant['name']} | {variant['bulk']['items_per_second']} | {variant['bulk']['request_latency_p95_ms']} | "
                f"{variant['online']['items_per_second']} | {variant['online']['request_latency_p95_ms']} | "
                f"{quality['same']}/{quality['total']} |"
            )
        for variant in scenario["variants"]:
            quality = variant["quality_vs_reference"]
            if not quality["sample_diffs"]:
                continue
            lines.extend(
                [
                    "",
                    f"### Quality Notes: {variant['name']}",
                    "",
                ]
            )
            for diff in quality["sample_diffs"]:
                lines.append(f"- Input: `{diff['input']}`")
                lines.append(f"- Candidate: `{diff['candidate']}`")
                lines.append(f"- Reference: `{diff['reference']}`")
            lines.append("")

    return "\n".join(lines).rstrip() + "\n"


def main() -> None:
    """Run bulk + online benchmarks and a quality spot-check per scenario/variant.

    Writes timestamped JSON and Markdown reports into --output-dir and prints
    their paths (parsed by the calling automation via JSON_REPORT=/MARKDOWN_REPORT=).
    """
    args = parse_args()
    # Relative paths are resolved against the project root, not the CWD.
    csv_path = (PROJECT_ROOT / args.csv_path).resolve() if not Path(args.csv_path).is_absolute() else Path(args.csv_path)
    output_dir = (PROJECT_ROOT / args.output_dir).resolve() if not Path(args.output_dir).is_absolute() else Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    report: Dict[str, Any] = {
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "environment": build_environment_info(),
        "config": {
            "csv_path": str(csv_path),
            "batch_size": args.batch_size,
            "batch_items": args.batch_items,
            "concurrency": args.concurrency,
            "requests_per_case": args.requests_per_case,
            "quality_samples": args.quality_samples,
        },
        "variants": VARIANTS,
        "scenarios": [],
    }

    for scenario in SCENARIOS:
        batch_texts = load_texts(csv_path, scenario["column"], args.batch_items)
        online_texts = load_texts(csv_path, scenario["column"], args.concurrency * args.requests_per_case)
        quality_texts = load_texts(csv_path, scenario["column"], args.quality_samples)

        scenario_report = dict(scenario)
        scenario_report["variants"] = []
        for variant in VARIANTS:
            print(f"[start] {scenario['name']} | {variant['name']}", flush=True)
            ensure_cuda_stats_reset()
            service, capability = build_service(scenario["model"], variant["overrides"])
            backend = service.get_backend(scenario["model"])
            # Bulk scenario: one worker, large batches.
            bulk = benchmark_serial_case(
                service=service,
                backend=backend,
                scenario=scenario,
                capability=capability,
                texts=batch_texts,
                batch_size=args.batch_size,
                warmup_batches=args.warmup_batches,
            )
            # Online scenario: batch=1 under concurrency.
            online = benchmark_concurrency_case(
                service=service,
                backend=backend,
                scenario=scenario,
                capability=capability,
                texts=online_texts,
                batch_size=1,
                concurrency=args.concurrency,
                requests_per_case=args.requests_per_case,
                warmup_batches=args.warmup_batches,
            )
            # Quality spot-check: compare variant output against a fixed-budget
            # reference built from the same overrides (see
            # build_quality_reference_overrides).
            quality_reference_overrides = build_quality_reference_overrides(variant["overrides"])
            reference_service, _ = build_service(scenario["model"], quality_reference_overrides)
            reference_outputs = reference_service.translate(
                quality_texts,
                source_lang=scenario["source_lang"],
                target_lang=scenario["target_lang"],
                model=scenario["model"],
                scene=scenario["scene"],
            )
            candidate_outputs = service.translate(
                quality_texts,
                source_lang=scenario["source_lang"],
                target_lang=scenario["target_lang"],
                model=scenario["model"],
                scene=scenario["scene"],
            )
            scenario_report["variants"].append(
                {
                    "name": variant["name"],
                    "description": variant["description"],
                    "overrides": variant["overrides"],
                    "quality_reference_overrides": quality_reference_overrides,
                    "bulk": bulk,
                    "online": online,
                    "quality_vs_reference": summarize_quality(reference_outputs, candidate_outputs, quality_texts),
                }
            )
        report["scenarios"].append(scenario_report)

    # NOTE(review): file names use only %H%M%S — runs on different days with
    # the same wall-clock time would overwrite each other.
    timestamp = datetime.now().strftime("%H%M%S")
    json_path = output_dir / f"nllb_t4_tuning_{timestamp}.json"
    md_path = output_dir / f"nllb_t4_tuning_{timestamp}.md"
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    md_path.write_text(render_markdown(report), encoding="utf-8")
    print(f"JSON_REPORT={json_path}")
    print(f"MARKDOWN_REPORT={md_path}")


if __name__ == "__main__":
    main()
default=-1, help="Override CTranslate2 intra_threads") + parser.add_argument( + "--ct2-max-queued-batches", + type=int, + default=-1, + help="Override CTranslate2 max_queued_batches", + ) + parser.add_argument( + "--ct2-batch-type", + default="", + help="Override CTranslate2 batch_type, for example examples or tokens", + ) + parser.add_argument( + "--ct2-decoding-length-mode", + default="", + help="Override CTranslate2 decoding length mode, for example fixed or source", + ) + parser.add_argument( + "--ct2-decoding-length-extra", + type=int, + default=0, + help="Extra tokens added when ct2 decoding length mode is source", + ) + parser.add_argument( + "--ct2-decoding-length-min", + type=int, + default=0, + help="Minimum decoding length when ct2 decoding length mode is source", + ) parser.add_argument("--warmup-batches", type=int, default=1, help="Warmup batches before measuring") parser.add_argument("--disable-cache", action="store_true", help="Disable translation cache during benchmarks") parser.add_argument( @@ -245,6 +275,20 @@ def build_config_and_capability( capability["num_beams"] = args.num_beams if args.attn_implementation: capability["attn_implementation"] = args.attn_implementation + if args.ct2_inter_threads >= 0: + capability["ct2_inter_threads"] = args.ct2_inter_threads + if args.ct2_intra_threads >= 0: + capability["ct2_intra_threads"] = args.ct2_intra_threads + if args.ct2_max_queued_batches >= 0: + capability["ct2_max_queued_batches"] = args.ct2_max_queued_batches + if args.ct2_batch_type: + capability["ct2_batch_type"] = args.ct2_batch_type + if args.ct2_decoding_length_mode: + capability["ct2_decoding_length_mode"] = args.ct2_decoding_length_mode + if args.ct2_decoding_length_extra: + capability["ct2_decoding_length_extra"] = args.ct2_decoding_length_extra + if args.ct2_decoding_length_min: + capability["ct2_decoding_length_min"] = args.ct2_decoding_length_min if args.disable_cache: capability["use_cache"] = False config["capabilities"][args.model] 
= capability @@ -669,6 +713,20 @@ def run_all_scenarios(args: argparse.Namespace) -> Dict[str, Any]: cmd.extend(["--num-beams", str(args.num_beams)]) if args.attn_implementation: cmd.extend(["--attn-implementation", args.attn_implementation]) + if args.ct2_inter_threads >= 0: + cmd.extend(["--ct2-inter-threads", str(args.ct2_inter_threads)]) + if args.ct2_intra_threads >= 0: + cmd.extend(["--ct2-intra-threads", str(args.ct2_intra_threads)]) + if args.ct2_max_queued_batches >= 0: + cmd.extend(["--ct2-max-queued-batches", str(args.ct2_max_queued_batches)]) + if args.ct2_batch_type: + cmd.extend(["--ct2-batch-type", args.ct2_batch_type]) + if args.ct2_decoding_length_mode: + cmd.extend(["--ct2-decoding-length-mode", args.ct2_decoding_length_mode]) + if args.ct2_decoding_length_extra: + cmd.extend(["--ct2-decoding-length-extra", str(args.ct2_decoding_length_extra)]) + if args.ct2_decoding_length_min: + cmd.extend(["--ct2-decoding-length-min", str(args.ct2_decoding_length_min)]) if args.disable_cache: cmd.append("--disable-cache") diff --git a/scripts/benchmark_translation_local_models_focus.py b/scripts/benchmark_translation_local_models_focus.py new file mode 100644 index 0000000..00f0610 --- /dev/null +++ b/scripts/benchmark_translation_local_models_focus.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +"""Focused translation benchmark for two stress scenarios on local CT2 models.""" + +from __future__ import annotations + +import argparse +import copy +import json +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from config.services_config import get_translation_config +from scripts.benchmark_translation_local_models import ( + SCENARIOS, + benchmark_concurrency_case, + benchmark_serial_case, + build_environment_info, + ensure_cuda_stats_reset, + load_texts, +) +from 
translation.service import TranslationService + +DEFAULT_HIGH_BATCH_SIZES = [32, 64, 128] +DEFAULT_HIGH_CONCURRENCIES = [8, 16, 32, 64] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Focused benchmark for local CT2 translation models") + parser.add_argument("--csv-path", default="products_analyzed.csv", help="Benchmark dataset CSV path") + parser.add_argument( + "--output-dir", + default="perf_reports/20260318/translation_local_models_ct2_focus", + help="Directory for JSON/Markdown focused reports", + ) + parser.add_argument( + "--high-batch-sizes", + default="32,64,128", + help="Comma-separated batch sizes for the high-batch/low-concurrency scenario", + ) + parser.add_argument( + "--high-concurrencies", + default="8,16,32,64", + help="Comma-separated concurrency levels for the high-concurrency/low-batch scenario", + ) + parser.add_argument( + "--high-batch-rows", + type=int, + default=512, + help="Rows used for the high-batch/low-concurrency scenario", + ) + parser.add_argument( + "--high-concurrency-requests", + type=int, + default=32, + help="Requests per high-concurrency/low-batch case", + ) + parser.add_argument("--warmup-batches", type=int, default=1, help="Warmup batches before measuring") + return parser.parse_args() + + +def parse_csv_ints(raw: str) -> List[int]: + values: List[int] = [] + for item in raw.split(","): + stripped = item.strip() + if not stripped: + continue + value = int(stripped) + if value <= 0: + raise ValueError(f"Expected positive integer, got {value}") + values.append(value) + if not values: + raise ValueError("Parsed empty integer list") + return values + + +def build_variant_config(model: str, overrides: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Any]]: + config = copy.deepcopy(get_translation_config()) + for name, cfg in config["capabilities"].items(): + cfg["enabled"] = name == model + cfg["use_cache"] = False + config["default_model"] = model + capability = 
config["capabilities"][model] + capability.update(overrides) + config["capabilities"][model] = capability + return config, capability + + +def render_markdown(report: Dict[str, Any]) -> str: + lines = [ + "# Local Translation Model Focused Benchmark", + "", + f"- Generated at: `{report['generated_at']}`", + f"- Python: `{report['environment']['python']}`", + f"- Torch: `{report['environment']['torch']}`", + f"- Transformers: `{report['environment']['transformers']}`", + f"- CUDA: `{report['environment']['cuda_available']}`", + ] + if report["environment"]["gpu_name"]: + lines.append(f"- GPU: `{report['environment']['gpu_name']}` ({report['environment']['gpu_total_mem_gb']} GiB)") + lines.extend( + [ + "", + "## Scope", + "", + "- Scenario 1: high batch size + low concurrency", + "- Scenario 2: high concurrency + low batch size", + "- Variants in this report:", + ] + ) + for variant in report["variants"]: + lines.append(f" - `{variant['name']}`: `{variant['overrides']}`") + + for scenario in report["scenarios"]: + lines.extend( + [ + "", + f"## {scenario['name']}", + "", + f"- Direction: `{scenario['source_lang']} -> {scenario['target_lang']}`", + f"- Column: `{scenario['column']}`", + ] + ) + for variant in scenario["variants"]: + lines.extend( + [ + "", + f"### Variant `{variant['name']}`", + "", + "| Scenario | Setting | Items/s | Req p95 ms | Avg req ms |", + "|---|---|---:|---:|---:|", + ] + ) + for row in variant["high_batch_low_concurrency"]: + lines.append( + f"| high-batch/low-concurrency | batch={row['batch_size']}, concurrency=1 | " + f"{row['items_per_second']} | {row['request_latency_p95_ms']} | {row['avg_request_latency_ms']} |" + ) + for row in variant["high_concurrency_low_batch"]: + lines.append( + f"| high-concurrency/low-batch | batch=1, concurrency={row['concurrency']} | " + f"{row['items_per_second']} | {row['request_latency_p95_ms']} | {row['avg_request_latency_ms']} |" + ) + return "\n".join(lines) + "\n" + + +def main() -> None: + args = 
parse_args() + csv_path = (PROJECT_ROOT / args.csv_path).resolve() if not Path(args.csv_path).is_absolute() else Path(args.csv_path) + output_dir = (PROJECT_ROOT / args.output_dir).resolve() if not Path(args.output_dir).is_absolute() else Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + high_batch_sizes = parse_csv_ints(args.high_batch_sizes) + high_concurrencies = parse_csv_ints(args.high_concurrencies) + + variants = [ + {"name": "ct2_default", "overrides": {}}, + { + "name": "ct2_tuned_t4", + "overrides": { + "ct2_inter_threads": 2, + "ct2_max_queued_batches": 16, + "ct2_batch_type": "examples", + }, + }, + ] + + report: Dict[str, Any] = { + "generated_at": datetime.now().isoformat(timespec="seconds"), + "environment": build_environment_info(), + "csv_path": str(csv_path), + "variants": variants, + "scenarios": [], + } + + largest_batch = max(high_batch_sizes) + high_batch_rows = max(args.high_batch_rows, largest_batch) + + for scenario in SCENARIOS: + scenario_entry = dict(scenario) + scenario_entry["variants"] = [] + batch_texts = load_texts(csv_path, scenario["column"], high_batch_rows) + conc_needed = max(high_concurrencies) * args.high_concurrency_requests + conc_texts = load_texts(csv_path, scenario["column"], conc_needed) + + for variant in variants: + print(f"[start] {scenario['name']} | {variant['name']}", flush=True) + config, capability = build_variant_config(scenario["model"], variant["overrides"]) + ensure_cuda_stats_reset() + service = TranslationService(config) + backend = service.get_backend(scenario["model"]) + + high_batch_results = [] + for batch_size in high_batch_sizes: + high_batch_results.append( + benchmark_serial_case( + service=service, + backend=backend, + scenario=scenario, + capability=capability, + texts=batch_texts[: max(batch_size, high_batch_rows)], + batch_size=batch_size, + warmup_batches=args.warmup_batches, + ) + ) + + high_concurrency_results = [] + for concurrency in high_concurrencies: + 
high_concurrency_results.append( + benchmark_concurrency_case( + service=service, + backend=backend, + scenario=scenario, + capability=capability, + texts=conc_texts, + batch_size=1, + concurrency=concurrency, + requests_per_case=args.high_concurrency_requests, + warmup_batches=args.warmup_batches, + ) + ) + + scenario_entry["variants"].append( + { + "name": variant["name"], + "overrides": variant["overrides"], + "high_batch_low_concurrency": high_batch_results, + "high_concurrency_low_batch": high_concurrency_results, + } + ) + print(f"[done] {scenario['name']} | {variant['name']}", flush=True) + + report["scenarios"].append(scenario_entry) + + stamp = datetime.now().strftime("%H%M%S") + json_path = output_dir / f"translation_local_models_focus_{stamp}.json" + md_path = output_dir / f"translation_local_models_focus_{stamp}.md" + json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + md_path.write_text(render_markdown(report), encoding="utf-8") + print(f"JSON report: {json_path}") + print(f"Markdown report: {md_path}") + + +if __name__ == "__main__": + main() diff --git a/translation/README.md b/translation/README.md index 47dd79d..33ef0c1 100644 --- a/translation/README.md +++ b/translation/README.md @@ -13,7 +13,11 @@ - 虚拟环境:[`scripts/setup_translator_venv.sh`](/data/saas-search/scripts/setup_translator_venv.sh) - 模型下载:[`scripts/download_translation_models.py`](/data/saas-search/scripts/download_translation_models.py) - 本地模型压测:[`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) -- 性能报告:[`perf_reports/20260318/translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) +- 聚焦压测脚本:[`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) +- 
基线性能报告:[`perf_reports/20260318/translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) +- CT2 扩展报告:[`perf_reports/20260318/translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) +- CT2 聚焦调优报告:[`perf_reports/20260318/translation_local_models_ct2_focus/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/README.md) +- NLLB T4 商品标题专项报告:[`perf_reports/20260318/nllb_t4_product_names_ct2/README.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/README.md) ## 1. 设计目标 @@ -107,6 +111,12 @@ services: ct2_compute_type: "float16" ct2_conversion_quantization: "float16" ct2_auto_convert: true + ct2_inter_threads: 4 + ct2_max_queued_batches: 32 + ct2_batch_type: "examples" + ct2_decoding_length_mode: "source" + ct2_decoding_length_extra: 8 + ct2_decoding_length_min: 32 device: "cuda" torch_dtype: "float16" batch_size: 16 @@ -408,7 +418,7 @@ results = translator.translate( - 支持多语 - 调用时必须显式传 `source_lang` - 语言码映射定义在 [`translation/languages.py`](/data/saas-search/translation/languages.py) -- 当前 T4 推荐配置:`device=cuda`、`ct2_compute_type=float16`、`batch_size=16`、`max_new_tokens=64` +- 当前 T4 推荐配置:`device=cuda`、`ct2_compute_type=float16`、`ct2_inter_threads=4`、`ct2_max_queued_batches=32`、`ct2_batch_type=examples`、`ct2_decoding_length_mode=source(+8,min=32)`、`batch_size=16`、`max_new_tokens=64` 当前实现已经利用的优化: - 已做批量分块:`translate()` 会按 capability 的 `batch_size` 分批进入模型 @@ -509,7 +519,7 @@ models/translation/Helsinki-NLP/opus-mt-en-zh - 避免多 worker 重复加载模型 - GPU 机器上优先使用 `cuda + float16` - CPU 只建议用于功能验证或离线低频任务 -- 对 NLLB,T4 上优先采用 `batch_size=16 + max_new_tokens=64 + ct2_compute_type=float16` +- 对 NLLB,T4 上优先采用 `batch_size=16 + max_new_tokens=64 + ct2_compute_type=float16 + ct2_inter_threads=4 + ct2_max_queued_batches=32 + ct2_batch_type=examples + ct2_decoding_length_mode=source(+8,min=32)` ### 9.5 验证 @@ -541,6 +551,7 @@ curl 
-X POST http://127.0.0.1:6006/translate \ 性能脚本: - [`scripts/benchmark_translation_local_models.py`](/data/saas-search/scripts/benchmark_translation_local_models.py) +- [`scripts/benchmark_translation_local_models_focus.py`](/data/saas-search/scripts/benchmark_translation_local_models_focus.py) 数据集: - [`products_analyzed.csv`](/data/saas-search/products_analyzed.csv) @@ -549,6 +560,9 @@ curl -X POST http://127.0.0.1:6006/translate \ - 摘要:[`perf_reports/20260318/translation_local_models/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models/README.md) - 完整 Markdown:[`perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.md`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.md) - 完整 JSON:[`perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.json`](/data/saas-search/perf_reports/20260318/translation_local_models/translation_local_models_extended_221846.json) +- CT2 扩展总结:[`perf_reports/20260318/translation_local_models_ct2/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2/README.md) +- CT2 聚焦调优总结:[`perf_reports/20260318/translation_local_models_ct2_focus/README.md`](/data/saas-search/perf_reports/20260318/translation_local_models_ct2_focus/README.md) +- NLLB T4 商品标题专项调优:[`perf_reports/20260318/nllb_t4_product_names_ct2/README.md`](/data/saas-search/perf_reports/20260318/nllb_t4_product_names_ct2/README.md) ### 10.1 先看哪组数据 @@ -768,33 +782,43 @@ cd /data/saas-search NLLB 性能优化经验: - 起作用的优化点 1:`float16 + cuda` - - 模型确认以 `torch.float16` 实际加载到 `cuda:0` + - 当前本地 NLLB 由 `CTranslate2` 在 `cuda:0` 以 `float16` 运行 - 优化后在 T4 上的峰值显存约 `2.8-3.0 GiB` - 起作用的优化点 2:`batch_size=16` - 相比 `batch_size=8`,吞吐提升明显 - 继续提升到 `32` 虽然还能增吞吐,但 batch p95 和 batch max 会恶化很多 -- 起作用的优化点 3:`max_new_tokens=64` - - 商品标题翻译通常不需要 `256` 的生成上限 - - 收紧生成长度后,`zh->en` 与 `en->zh` 都有明显收益 -- 起作用的优化点 4:`attn_implementation=sdpa` - - 对当前 PyTorch + T4 
环境有效 - - 配合半精度和较合理 batch size 后,整体延迟进一步下降 -- 已有但不需要单独开关的点:`attention_mask` - - 当前实现会在 tokenizer 阶段自动生成并传入 `generate()` - - 它属于标准推理路径,不是一个额外的“高级优化开关” +- 起作用的优化点 3:`ct2_inter_threads=4 + ct2_max_queued_batches=32` + - 对 `batch=1` 高并发商品标题场景收益最直接 + - 相比默认 CT2 配置,`zh->en` 和 `en->zh` 的在线吞吐都能稳定提升 +- 起作用的优化点 4:动态解码上限 + - 推荐 `ct2_decoding_length_mode=source` + - 推荐 `ct2_decoding_length_extra=8` + - 推荐 `ct2_decoding_length_min=32` + - 这样可以保留 `max_new_tokens=64` 的安全上限,同时让短标题不再为长标题上限付费 +- 起作用的优化点 5:`ct2_batch_type=examples` + - 在当前数据和 T4 上,比 `tokens` 更稳 + - 更适合作为线上默认 +- 不建议直接作为默认的实验: + - `max_new_tokens=48` + - 大 batch 和在线吞吐都会继续变好 + - 但商品标题 spot-check 已看到明显截断,尤其 `en->zh` +- 收益有限或不稳定的实验: + - `ct2_batch_type=tokens` + - `ct2_max_queued_batches` 从 `16` 再继续拉高,收益很小 + - `ct2_decoding_length_mode=source(+4,min=24)` 更快,但仍有少量长标题截断风险 为什么最终没有采用其它方案: -- 当前 HF 原生方案已经能在 T4 上稳定跑通 -- 在 `10G+` 可用显存下,原生 `float16` 已足够支撑 NLLB-600M -- 因此暂时不需要为这个模型额外引入 GGUF 或 CT2 的新运行栈 -- 如果未来目标变成“继续压缩显存”或“进一步追求更低延迟”,再评估 `ct2-int8` 会更合适 +- 当前本地最优路径已经切到 `CTranslate2 + float16` +- 对这个 600M 级 encoder-decoder 模型,T4 上最有效的是把 CT2 的并行和解码策略调对 +- 因此这轮没有继续引入更重的服务化栈 +- 如果未来目标变成“继续压缩显存”或“进一步追求更低延迟”,再评估 `int8_float16` 或服务级微批处理队列会更合适 关键结论: - 当前机器上,`opus-mt-zh-en` 是三个新增本地模型里最快的 - `opus-mt-en-zh` 大约是 `opus-mt-zh-en` 吞吐的一半 -- `nllb-200-distilled-600M` 在显存充足时可以用 `cuda + float16 + batch_size=16 + max_new_tokens=64 + sdpa` 正常运行 +- `nllb-200-distilled-600M` 在 T4 上推荐 `cuda + CTranslate2 float16 + batch_size=16 + ct2_inter_threads=4 + ct2_max_queued_batches=32 + dynamic decoding` - `nllb` 最终可用,但吞吐仍明显低于两个 Marian 模型,更适合多语覆盖或独立资源环境 最终推荐部署方案: @@ -807,11 +831,15 @@ NLLB 性能优化经验: - 推荐 `max_input_length`:`256` - 推荐 `max_new_tokens`:`64` - 推荐 `num_beams`:`1` -- 推荐注意力实现:`sdpa` +- 推荐 CT2 并行:`ct2_inter_threads=4` +- 推荐 CT2 队列:`ct2_max_queued_batches=32` +- 推荐 CT2 batch 类型:`examples` +- 推荐动态解码:`ct2_decoding_length_mode=source`、`ct2_decoding_length_extra=8`、`ct2_decoding_length_min=32` - 运行方式:单 worker,避免重复加载 更详细的性能说明见: - 
def _resolve_max_decoding_length(self, source_tokens: Sequence[Sequence[str]]) -> int:
    """Pick the ``max_decoding_length`` to pass to CT2 for this batch.

    In ``fixed`` mode — and for an empty batch — the configured
    ``max_new_tokens`` cap is returned unchanged. In ``source`` mode the cap
    shrinks to the longest source length plus ``ct2_decoding_length_extra``,
    floored at ``ct2_decoding_length_min`` and never exceeding
    ``max_new_tokens``.
    """
    if self.ct2_decoding_length_mode != "source" or not source_tokens:
        return self.max_new_tokens
    longest_source = max(len(tokens) for tokens in source_tokens)
    dynamic_budget = max(self.ct2_decoding_length_min, longest_source + self.ct2_decoding_length_extra)
    return min(self.max_new_tokens, dynamic_budget)
ct2_intra_threads: int = 0, ct2_max_queued_batches: int = 0, ct2_batch_type: str = "examples", + ct2_decoding_length_mode: str = "fixed", + ct2_decoding_length_extra: int = 0, + ct2_decoding_length_min: int = 1, ) -> None: overrides = language_codes or {} self.language_codes = { @@ -408,6 +435,9 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): ct2_intra_threads=ct2_intra_threads, ct2_max_queued_batches=ct2_max_queued_batches, ct2_batch_type=ct2_batch_type, + ct2_decoding_length_mode=ct2_decoding_length_mode, + ct2_decoding_length_extra=ct2_decoding_length_extra, + ct2_decoding_length_min=ct2_decoding_length_min, ) def _validate_languages(self, source_lang: Optional[str], target_lang: str) -> None: diff --git a/translation/service.py b/translation/service.py index 0afb312..5e5c7df 100644 --- a/translation/service.py +++ b/translation/service.py @@ -131,6 +131,9 @@ class TranslationService: ct2_intra_threads=int(cfg.get("ct2_intra_threads", 0)), ct2_max_queued_batches=int(cfg.get("ct2_max_queued_batches", 0)), ct2_batch_type=str(cfg.get("ct2_batch_type", "examples")), + ct2_decoding_length_mode=str(cfg.get("ct2_decoding_length_mode", "fixed")), + ct2_decoding_length_extra=int(cfg.get("ct2_decoding_length_extra", 0)), + ct2_decoding_length_min=int(cfg.get("ct2_decoding_length_min", 1)), ) def _create_local_marian_backend(self, *, name: str, cfg: Dict[str, object]) -> TranslationBackendProtocol: @@ -158,6 +161,9 @@ class TranslationService: ct2_intra_threads=int(cfg.get("ct2_intra_threads", 0)), ct2_max_queued_batches=int(cfg.get("ct2_max_queued_batches", 0)), ct2_batch_type=str(cfg.get("ct2_batch_type", "examples")), + ct2_decoding_length_mode=str(cfg.get("ct2_decoding_length_mode", "fixed")), + ct2_decoding_length_extra=int(cfg.get("ct2_decoding_length_extra", 0)), + ct2_decoding_length_min=int(cfg.get("ct2_decoding_length_min", 1)), ) @property diff --git a/translation/settings.py b/translation/settings.py index 201e3a9..0c40885 100644 
--- a/translation/settings.py +++ b/translation/settings.py @@ -149,6 +149,14 @@ def _validate_capability(name: str, capability: Mapping[str, Any]) -> None: _require_positive_int(capability.get("max_input_length"), f"{prefix}.max_input_length") _require_positive_int(capability.get("max_new_tokens"), f"{prefix}.max_new_tokens") _require_positive_int(capability.get("num_beams"), f"{prefix}.num_beams") + if "ct2_decoding_length_mode" in capability: + mode = _require_string(capability.get("ct2_decoding_length_mode"), f"{prefix}.ct2_decoding_length_mode").lower() + if mode not in {"fixed", "source"}: + raise ValueError(f"{prefix}.ct2_decoding_length_mode must be one of: fixed, source") + if "ct2_decoding_length_extra" in capability: + _require_int(capability.get("ct2_decoding_length_extra"), f"{prefix}.ct2_decoding_length_extra") + if "ct2_decoding_length_min" in capability: + _require_positive_int(capability.get("ct2_decoding_length_min"), f"{prefix}.ct2_decoding_length_min") return raise ValueError(f"Unsupported translation backend '{backend}' for capability '{name}'") -- libgit2 0.21.2