Commit 1d6727ac9b460bef4f096489b771bc54194d4760
1 parent
3eff49b7
trans
Showing
4 changed files
with
115 additions
and
152 deletions
Show diff stats
docs/翻译模块说明.md deleted
| @@ -1,145 +0,0 @@ | @@ -1,145 +0,0 @@ | ||
| 1 | -# 翻译模块 | ||
| 2 | - | ||
| 3 | -**快速上手**:见 `docs/QUICKSTART.md` 第 3.4 节。 | ||
| 4 | - | ||
| 5 | -## 环境变量 | ||
| 6 | - | ||
| 7 | -```bash | ||
| 8 | -# Qwen(默认) | ||
| 9 | -DASHSCOPE_API_KEY=sk-xxx | ||
| 10 | - | ||
| 11 | -# DeepL | ||
| 12 | -DEEPL_AUTH_KEY=xxx | ||
| 13 | -``` | ||
| 14 | - | ||
| 15 | -> **重要限速说明(Qwen 机翻)** | ||
| 16 | -> 当前默认的 Qwen 翻译后端使用 `qwen-mt-flash` 云端模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**。 | ||
| 17 | -> - 推荐通过 Redis 翻译缓存复用结果,避免对相同文本重复打云端 | ||
| 18 | -> - 高并发场景需要在调用端做限流 / 去抖,或改为离线批量翻译 | ||
| 19 | -> - 如需更高吞吐,可考虑 DeepL 或自建翻译服务 | ||
| 20 | - | ||
| 21 | -## 配置模型 | ||
| 22 | - | ||
| 23 | -翻译已改为“一个翻译服务 + 多种翻译能力”的结构: | ||
| 24 | - | ||
| 25 | -- 业务侧(`QueryParser` / indexer)统一调用 `http://127.0.0.1:6006` | ||
| 26 | -- 服务内按 `services.translation.capabilities` 加载并管理各翻译能力 | ||
| 27 | -- 已启用 capability 统一注册,后端实例按首次调用懒加载,避免多个本地模型在启动阶段一次性占满显存 | ||
| 28 | -- `config.yaml` 只保留部署相关配置;scene 规则、语言码映射、prompt 模板、模型方向约束等翻译域知识统一收口在 `translation/` 内部 | ||
| 29 | -- 每种能力独立配置 `enabled`、`model`、`base_url/api_url`、`timeout`、本地模型运行参数等部署项 | ||
| 30 | -- 每种能力显式声明 `backend` 类型,例如 `qwen_mt`、`llm`、`deepl`、`local_nllb`、`local_marian` | ||
| 31 | -- `service_url`、`default_model`、`default_scene` 只从 `config/config.yaml` 读取,不再接受环境变量静默覆盖 | ||
| 32 | -- 外部接口通过 `model + scene` 指定本次使用哪种能力、哪个场景 | ||
| 33 | - | ||
| 34 | -配置入口在 `config/config.yaml -> services.translation` | ||
| 35 | - | ||
| 36 | -## 本地模型部署 | ||
| 37 | - | ||
| 38 | -本仓库已内置 3 个本地机翻 capability: | ||
| 39 | - | ||
| 40 | -- `nllb-200-distilled-600m` | ||
| 41 | -- `opus-mt-zh-en` | ||
| 42 | -- `opus-mt-en-zh` | ||
| 43 | - | ||
| 44 | -推荐流程: | ||
| 45 | - | ||
| 46 | -1. 创建独立运行环境:`./scripts/setup_translator_venv.sh` | ||
| 47 | -2. 下载本地模型:`./.venv-translator/bin/python scripts/download_translation_models.py --all-local` | ||
| 48 | -3. 在 `config/config.yaml` 中把对应 capability 的 `enabled` 改为 `true` | ||
| 49 | -4. 启动服务:`./scripts/start_translator.sh` | ||
| 50 | - | ||
| 51 | -默认模型目录: | ||
| 52 | - | ||
| 53 | -- `models/translation/facebook/nllb-200-distilled-600M` | ||
| 54 | -- `models/translation/Helsinki-NLP/opus-mt-zh-en` | ||
| 55 | -- `models/translation/Helsinki-NLP/opus-mt-en-zh` | ||
| 56 | - | ||
| 57 | -说明: | ||
| 58 | - | ||
| 59 | -- 目前只支持 3 个标准 scene:`general`、`sku_name`、`ecommerce_search_query` | ||
| 60 | -- `nllb-200-distilled-600m` 支持多语,但依赖明确的 `source_lang` | ||
| 61 | -- 两个 OPUS 模型分别只支持 `zh -> en` 与 `en -> zh` | ||
| 62 | -- 本地模型建议单 worker 运行,避免重复加载占用显存 | ||
| 63 | - | ||
| 64 | -## HTTP 接口契约(translator service,端口 6006) | ||
| 65 | - | ||
| 66 | -服务默认监听 `http://localhost:6006`,提供: | ||
| 67 | - | ||
| 68 | -- `POST /translate`: 文本翻译(支持所有已启用 capability) | ||
| 69 | -- `GET /health`: 健康检查 | ||
| 70 | - | ||
| 71 | -### `POST /translate` | ||
| 72 | - | ||
| 73 | -**请求体**: | ||
| 74 | - | ||
| 75 | -```json | ||
| 76 | -{ | ||
| 77 | - "text": "商品名称", | ||
| 78 | - "target_lang": "en", | ||
| 79 | - "source_lang": "zh", | ||
| 80 | - "model": "qwen-mt", | ||
| 81 | - "scene": "sku_name" | ||
| 82 | -} | ||
| 83 | -``` | ||
| 84 | - | ||
| 85 | -- `text` 支持两种形式: | ||
| 86 | - - 单条:`string` | ||
| 87 | - - 批量:`string[]`(等长返回,顺序对应) | ||
| 88 | - | ||
| 89 | -**响应体**(单条): | ||
| 90 | - | ||
| 91 | -```json | ||
| 92 | -{ | ||
| 93 | - "text": "商品名称", | ||
| 94 | - "target_lang": "en", | ||
| 95 | - "source_lang": "zh", | ||
| 96 | - "translated_text": "Product name", | ||
| 97 | - "status": "success", | ||
| 98 | - "model": "qwen-mt", | ||
| 99 | - "scene": "sku_name" | ||
| 100 | -} | ||
| 101 | -``` | ||
| 102 | - | ||
| 103 | -**响应体**(批量): | ||
| 104 | - | ||
| 105 | -```json | ||
| 106 | -{ | ||
| 107 | - "text": ["商品名称1", "商品名称2"], | ||
| 108 | - "target_lang": "en", | ||
| 109 | - "source_lang": "zh", | ||
| 110 | - "translated_text": ["Product name 1", null], | ||
| 111 | - "status": "success", | ||
| 112 | - "model": "qwen-mt", | ||
| 113 | - "scene": "sku_name" | ||
| 114 | -} | ||
| 115 | -``` | ||
| 116 | - | ||
| 117 | -批量模式下,**单条失败用 `null` 占位**(即 `translated_text[i] = null`),保证长度与顺序一一对应,避免部分失败导致整批报错。 | ||
| 118 | - | ||
| 119 | -说明: | ||
| 120 | - | ||
| 121 | -- `scene` 是标准字段 | ||
| 122 | -- `prompt` 不属于外部接口;LLM prompt 由 translator service 内部根据 `scene` 生成 | ||
| 123 | -- `model` 只能选择已在 `services.translation.capabilities` 中启用的能力 | ||
| 124 | -- `/health` 会返回 `default_model`、`default_scene`、`enabled_capabilities` 与 `loaded_models` | ||
| 125 | - | ||
| 126 | ---- | ||
| 127 | - | ||
| 128 | -## 开发者接口约定(代码调用) | ||
| 129 | - | ||
| 130 | -代码侧(如 query/indexer)通过 `translation.create_translation_client()` 获取实例并调用 `translate()`; | ||
| 131 | - | ||
| 132 | -### 输入输出Shape | ||
| 133 | - | ||
| 134 | -- `translate(text=...)` 支持: | ||
| 135 | - - **单条**:`text: str` → 返回 `Optional[str]` | ||
| 136 | - - **批量**:`text: List[str]` → 返回 `List[Optional[str]]` | ||
| 137 | -- **批量语义**:返回列表必须与输入 **等长且顺序对应**;某条翻译失败时,对应位置为 `None`(HTTP JSON 中表现为 `null`)。 | ||
| 138 | - | ||
| 139 | -### 批量能力标识(supports_batch) | ||
| 140 | - | ||
| 141 | -服务客户端与服务内后端都可以暴露 `supports_batch`。若后端不支持批量,服务端会逐条拆分并保持 shape。 | ||
| 142 | - | ||
| 143 | -为便于上层(如 `api/translator_app.py`)做最优调用,client / backend 可暴露: | ||
| 144 | - | ||
| 145 | -- `supports_batch: bool`(property) |
perf_reports/20260317/translation_local_models/README.md
| @@ -13,6 +13,7 @@ Environment: | @@ -13,6 +13,7 @@ Environment: | ||
| 13 | Method: | 13 | Method: |
| 14 | - `opus-mt-zh-en` and `opus-mt-en-zh` were benchmarked on the full dataset using their configured production settings. | 14 | - `opus-mt-zh-en` and `opus-mt-en-zh` were benchmarked on the full dataset using their configured production settings. |
| 15 | - `nllb-200-distilled-600m` was benchmarked on a `500`-row subset after optimization. | 15 | - `nllb-200-distilled-600m` was benchmarked on a `500`-row subset after optimization. |
| 16 | +- `nllb-200-distilled-600m` was also benchmarked with `batch_size=1` on a `100`-row subset to approximate online query translation latency. | ||
| 16 | - This report only keeps the final optimized results and final deployment recommendation. | 17 | - This report only keeps the final optimized results and final deployment recommendation. |
| 17 | - Quality was intentionally not evaluated; this is a performance-only report. | 18 | - Quality was intentionally not evaluated; this is a performance-only report. |
| 18 | 19 | ||
| @@ -54,6 +55,34 @@ What did not become the final recommendation: | @@ -54,6 +55,34 @@ What did not become the final recommendation: | ||
| 54 | | `nllb-200-distilled-600m` | `zh -> en` | `cuda` | 500 | 7.3397 | 25.9577 | 19.26 | 51.915 | 832.64 | 1263.01 | | 55 | | `nllb-200-distilled-600m` | `zh -> en` | `cuda` | 500 | 7.3397 | 25.9577 | 19.26 | 51.915 | 832.64 | 1263.01 | |
| 55 | | `nllb-200-distilled-600m` | `en -> zh` | `cuda` | 500 | 7.4152 | 42.0405 | 11.89 | 84.081 | 1093.87 | 2107.44 | | 56 | | `nllb-200-distilled-600m` | `en -> zh` | `cuda` | 500 | 7.4152 | 42.0405 | 11.89 | 84.081 | 1093.87 | 2107.44 | |
| 56 | 57 | ||
| 58 | +## Single-Request Latency | ||
| 59 | + | ||
| 60 | +To model online search query translation, we reran NLLB with `batch_size=1`. In this mode, batch latency is request latency. | ||
| 61 | + | ||
| 62 | +| Model | Direction | Rows | Load s | Translate s | Avg req ms | Req p50 ms | Req p95 ms | Req max ms | Items/s | | ||
| 63 | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:| | ||
| 64 | +| `nllb-200-distilled-600m` | `zh -> en` | 100 | 6.8390 | 32.1909 | 321.909 | 292.54 | 624.12 | 819.67 | 3.11 | | ||
| 65 | +| `nllb-200-distilled-600m` | `en -> zh` | 100 | 6.8249 | 54.2470 | 542.470 | 481.61 | 1171.71 | 1751.85 | 1.84 | | ||
| 66 | + | ||
| 67 | +Command used: | ||
| 68 | + | ||
| 69 | +```bash | ||
| 70 | +./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | ||
| 71 | + --single \ | ||
| 72 | + --model nllb-200-distilled-600m \ | ||
| 73 | + --source-lang zh \ | ||
| 74 | + --target-lang en \ | ||
| 75 | + --column title_cn \ | ||
| 76 | + --scene sku_name \ | ||
| 77 | + --batch-size 1 \ | ||
| 78 | + --limit 100 | ||
| 79 | +``` | ||
| 80 | + | ||
| 81 | +Takeaways for online use: | ||
| 82 | +- `batch_size=1` can be treated as single-request latency for the current service path. | ||
| 83 | +- `zh -> en` is materially faster than `en -> zh` on this machine. | ||
| 84 | +- NLLB is usable for online query translation, but it is not a low-latency model by search-serving standards. | ||
| 85 | + | ||
| 57 | ## NLLB Resource Reality | 86 | ## NLLB Resource Reality |
| 58 | 87 | ||
| 59 | The common online claim that this model uses only about `1.25GB` in `float16` is best understood as a rough weight-size level, not end-to-end runtime memory. | 88 | The common online claim that this model uses only about `1.25GB` in `float16` is best understood as a rough weight-size level, not end-to-end runtime memory. |
translation/README.md
| @@ -286,6 +286,12 @@ results = translator.translate( | @@ -286,6 +286,12 @@ results = translator.translate( | ||
| 286 | ) | 286 | ) |
| 287 | ``` | 287 | ``` |
| 288 | 288 | ||
| 289 | +接口 shape 约定: | ||
| 290 | +- `translate(text="...")` 返回 `Optional[str]` | ||
| 291 | +- `translate(text=[...])` 返回 `List[Optional[str]]` | ||
| 292 | +- 批量模式始终保持“等长、同序返回”;某条失败时对应位置为 `None` | ||
| 293 | +- backend/client 可通过 `supports_batch` 暴露是否支持原生批量;服务端会在必要时自动逐条拆分并保持返回 shape 不变 | ||
| 294 | + | ||
| 289 | ## 8. 具体实现说明 | 295 | ## 8. 具体实现说明 |
| 290 | 296 | ||
| 291 | ### 8.1 Qwen-MT | 297 | ### 8.1 Qwen-MT |
| @@ -329,26 +335,51 @@ results = translator.translate( | @@ -329,26 +335,51 @@ results = translator.translate( | ||
| 329 | 335 | ||
| 330 | 模型信息: | 336 | 模型信息: |
| 331 | - Hugging Face 名称:`facebook/nllb-200-distilled-600M` | 337 | - Hugging Face 名称:`facebook/nllb-200-distilled-600M` |
| 338 +- 简介:多语种翻译:覆盖约 200 种语言。作为 NLLB-200 系列的蒸馏版本,该模型通过知识蒸馏技术从 54.5B(545 亿)参数的 MoE 教师模型压缩至 600M,在大幅缩小规模的同时保留了大部分翻译质量。 | ||
| 332 | - 本地目录:`models/translation/facebook/nllb-200-distilled-600M` | 339 | - 本地目录:`models/translation/facebook/nllb-200-distilled-600M` |
| 333 | - 当前磁盘占用:约 `2.4G` | 340 | - 当前磁盘占用:约 `2.4G` |
| 334 | - 模型类型:多语种 Seq2Seq 机器翻译模型 | 341 | - 模型类型:多语种 Seq2Seq 机器翻译模型 |
| 335 | - 来源:Meta NLLB(No Language Left Behind)系列的 600M 蒸馏版 | 342 | - 来源:Meta NLLB(No Language Left Behind)系列的 600M 蒸馏版 |
| 336 | -- 目标:用一个模型覆盖大规模多语言互译,而不是只服务某一个固定语言对 | ||
| 337 | - 结构特点: | 343 | - 结构特点: |
| 338 | - Transformer encoder-decoder 架构 | 344 | - Transformer encoder-decoder 架构 |
| 339 | - 12 层 encoder + 12 层 decoder | 345 | - 12 层 encoder + 12 层 decoder |
| 340 | - `d_model=1024` | 346 | - `d_model=1024` |
| 341 | - - 多头注意力,适合多语统一建模 | ||
| 342 | - 通过 `source_lang + forced_bos_token_id` 控制翻译方向 | 347 | - 通过 `source_lang + forced_bos_token_id` 控制翻译方向 |
| 343 | - 语言标识采用 `language_script` 形式,例如 `eng_Latn`、`zho_Hans` | 348 | - 语言标识采用 `language_script` 形式,例如 `eng_Latn`、`zho_Hans` |
| 349 + - 改良 encoder-decoder(含嵌入层缩放 `scale_embedding`、正弦位置编码等) | ||
| 350 | + | ||
| 351 | +核心配置如下: | ||
| 352 | + | ||
| 353 | +| 配置项 | 参数值 | 备注 | | ||
| 354 | +| --- | --- | --- | | ||
| 355 | +| 隐藏层维度(`d_model`) | 1024 | | | ||
| 356 | +| 编码器 / 解码器层数 | 12 / 12 | | | ||
| 357 | +| 注意力头数 | 16 | | | ||
| 358 | +| FFN 维度 | 4096 | | | ||
| 359 | +| 词表大小 | 256,206 | 多语统一词表 | | ||
| 360 | +| 最大序列长度 | 1024 tokens | 满足长文本翻译 | | ||
| 361 | + | ||
| 362 | +`config.json` 片段(示意): | ||
| 363 | + | ||
| 364 | +```json | ||
| 365 | +{ | ||
| 366 | + "d_model": 1024, | ||
| 367 | + "encoder_layers": 12, | ||
| 368 | + "decoder_layers": 12, | ||
| 369 | + "attention_dropout": 0.1, | ||
| 370 | + "use_cache": true, | ||
| 371 | + "torch_dtype": "float32", | ||
| 372 | + "max_length": 200 | ||
| 373 | +} | ||
| 374 | +``` | ||
| 344 | 375 | ||
| 345 | 模型定位: | 376 | 模型定位: |
| 346 | - 优势是多语覆盖面广,一个模型可以支撑很多语言方向 | 377 | - 优势是多语覆盖面广,一个模型可以支撑很多语言方向 |
| 347 | - 劣势是相较于 Marian 这种双语专用模型,推理更重、延迟更高 | 378 | - 劣势是相较于 Marian 这种双语专用模型,推理更重、延迟更高 |
| 348 | -- 在我们当前业务里,它更适合“多语覆盖优先”的场景,不适合拿来和专用中英模型拼极致吞吐 | 379 | +- 更适合做**索引翻译**(离线 / 批量),不建议作为在线 query 翻译的默认方案 |
| 349 | 380 | ||
| 350 | 显存占用情况: | 381 | 显存占用情况: |
| 351 | -- 600M模型半float16权重约1.25G,推理时会叠加 CUDA context、allocator reserve、激活张量、batch、输入长度、生成长度等开销 | 382 | +- 600M 模型半精度(float16)权重约 `~1.25G`;推理还会叠加 CUDA context、allocator reserve、激活张量、batch、输入/生成长度等开销 |
| 352 | - 当前这台 `Tesla T4` 上,优化后的实际运行峰值大约在 `2.8-3.0 GiB` | 383 | - 当前这台 `Tesla T4` 上,优化后的实际运行峰值大约在 `2.8-3.0 GiB` |
| 353 | 384 | ||
| 354 | 当前实现特点: | 385 | 当前实现特点: |
| @@ -358,6 +389,25 @@ results = translator.translate( | @@ -358,6 +389,25 @@ results = translator.translate( | ||
| 358 | - 语言码映射定义在 [`translation/languages.py`](/data/saas-search/translation/languages.py) | 389 | - 语言码映射定义在 [`translation/languages.py`](/data/saas-search/translation/languages.py) |
| 359 | - 当前 T4 推荐配置:`device=cuda`、`torch_dtype=float16`、`batch_size=16`、`max_new_tokens=64`、`attn_implementation=sdpa` | 390 | - 当前 T4 推荐配置:`device=cuda`、`torch_dtype=float16`、`batch_size=16`、`max_new_tokens=64`、`attn_implementation=sdpa` |
| 360 | 391 | ||
| 392 | +当前实现已经利用的优化: | ||
| 393 | +- 已做批量分块:`translate()` 会按 capability 的 `batch_size` 分批进入模型 | ||
| 394 | +- 已做动态 padding:tokenizer 使用 `padding=True`、`truncation=True` | ||
| 395 | +- 已传入 `attention_mask`:由 tokenizer 生成并随 `generate()` 一起送入模型 | ||
| 396 | +- 已设置方向控制:NLLB 通过 `tokenizer.src_lang` 和 `forced_bos_token_id` 指定语言对 | ||
| 397 | +- 已启用推理态:`torch.inference_mode()` + `model.eval()` | ||
| 398 | +- 已启用半精度和更优注意力实现:当前配置为 `float16 + sdpa` | ||
| 399 | +- 已关闭高开销搜索:默认 `num_beams=1`,更接近线上低延迟设置 | ||
| 400 | + | ||
| 401 +与常见的 HF 批处理推理示例对照: | ||
| 402 | +- 核心思路已经覆盖,现有实现与 `tokenizer(batch) -> model.generate(...) -> batch_decode(...)` 一致 | ||
| 403 | +- 差异在于服务端额外做了语言校验、统一 chunking、输入长度约束和单条/批量 shape 保持 | ||
| 404 | +- “预计算 attention mask” 目前没有单独缓存层;现状是每个 batch 在 tokenizer 阶段实时生成 `attention_mask`,这也是 HF 常规推理路径 | ||
| 405 | + | ||
| 406 | +优化空间(按场景): | ||
| 407 | +- **线上 query**:优先补测 `batch_size=1` 的真实延迟与 tail latency,而不是继续拉大 batch。 | ||
| 408 | +- **离线批量**:可再尝试更激进的 batching / 长度分桶 / 独立批处理队列(吞吐更高,但会增加在线尾延迟风险)。 | ||
| 409 | +- **进一步降显存 / 提速**:可评估 `ctranslate2` / int8;当前仓库尚未引入该运行栈。 | ||
| 410 | + | ||
| 361 | ### 8.5 `opus-mt-zh-en` | 411 | ### 8.5 `opus-mt-zh-en` |
| 362 | 412 | ||
| 363 | 实现文件: | 413 | 实现文件: |
| @@ -483,6 +533,29 @@ cd /data/saas-search | @@ -483,6 +533,29 @@ cd /data/saas-search | ||
| 483 | --scene sku_name | 533 | --scene sku_name |
| 484 | ``` | 534 | ``` |
| 485 | 535 | ||
| 536 | +单条请求延迟复现: | ||
| 537 | + | ||
| 538 | +```bash | ||
| 539 | +./.venv-translator/bin/python scripts/benchmark_translation_local_models.py \ | ||
| 540 | + --single \ | ||
| 541 | + --model nllb-200-distilled-600m \ | ||
| 542 | + --source-lang zh \ | ||
| 543 | + --target-lang en \ | ||
| 544 | + --column title_cn \ | ||
| 545 | + --scene sku_name \ | ||
| 546 | + --batch-size 1 \ | ||
| 547 | + --limit 100 | ||
| 548 | +``` | ||
| 549 | + | ||
| 550 | +说明: | ||
| 551 | +- 对当前脚本和本地 backend 来说,“单条请求”可以直接等价理解为 `batch_size=1` | ||
| 552 | +- 此时脚本里的 `batch_latency_*`,就可以直接视为“单次请求延迟”指标 | ||
| 553 | +- 线上搜索 query 翻译更应该关注这组数据,而不是大 batch 吞吐 | ||
| 554 | + | ||
| 555 | +当前单条请求实测(`Tesla T4`,`limit=100`): | ||
| 556 | +- `nllb-200-distilled-600m zh->en`:p50 约 `292.54 ms`,p95 约 `624.12 ms`,平均约 `321.91 ms` | ||
| 557 | +- `nllb-200-distilled-600m en->zh`:p50 约 `481.61 ms`,p95 约 `1171.71 ms`,平均约 `542.47 ms` | ||
| 558 | + | ||
| 486 | 当前压测环境: | 559 | 当前压测环境: |
| 487 | - GPU:`Tesla T4 16GB` | 560 | - GPU:`Tesla T4 16GB` |
| 488 | - Python env:`.venv-translator` | 561 | - Python env:`.venv-translator` |
| @@ -511,6 +584,9 @@ NLLB 性能优化经验: | @@ -511,6 +584,9 @@ NLLB 性能优化经验: | ||
| 511 | - 起作用的优化点 4:`attn_implementation=sdpa` | 584 | - 起作用的优化点 4:`attn_implementation=sdpa` |
| 512 | - 对当前 PyTorch + T4 环境有效 | 585 | - 对当前 PyTorch + T4 环境有效 |
| 513 | - 配合半精度和较合理 batch size 后,整体延迟进一步下降 | 586 | - 配合半精度和较合理 batch size 后,整体延迟进一步下降 |
| 587 | +- 已有但不需要单独开关的点:`attention_mask` | ||
| 588 | + - 当前实现会在 tokenizer 阶段自动生成并传入 `generate()` | ||
| 589 | + - 它属于标准推理路径,不是一个额外的“高级优化开关” | ||
| 514 | 590 | ||
| 515 | 为什么最终没有采用其它方案: | 591 | 为什么最终没有采用其它方案: |
| 516 | 592 | ||
| @@ -569,7 +645,7 @@ NLLB 性能优化经验: | @@ -569,7 +645,7 @@ NLLB 性能优化经验: | ||
| 569 | 645 | ||
| 570 | ## 13. 相关文档 | 646 | ## 13. 相关文档 |
| 571 | 647 | ||
| 572 | -- [`docs/翻译模块说明.md`](/data/saas-search/docs/翻译模块说明.md) | 648 | +- `docs/翻译模块说明.md`(原文件已在本次提交中删除,内容已收口到本 README) |
| 573 | - [`docs/QUICKSTART.md`](/data/saas-search/docs/QUICKSTART.md) | 649 | - [`docs/QUICKSTART.md`](/data/saas-search/docs/QUICKSTART.md) |
| 574 | - [`docs/DEVELOPER_GUIDE.md`](/data/saas-search/docs/DEVELOPER_GUIDE.md) | 650 | - [`docs/DEVELOPER_GUIDE.md`](/data/saas-search/docs/DEVELOPER_GUIDE.md) |
| 575 | - [`docs/搜索API对接指南.md`](/data/saas-search/docs/搜索API对接指南.md) | 651 | - [`docs/搜索API对接指南.md`](/data/saas-search/docs/搜索API对接指南.md) |
translation/backends/local_seq2seq.py
| @@ -93,7 +93,7 @@ class LocalSeq2SeqTranslationBackend: | @@ -93,7 +93,7 @@ class LocalSeq2SeqTranslationBackend: | ||
| 93 | def _model_kwargs(self) -> Dict[str, object]: | 93 | def _model_kwargs(self) -> Dict[str, object]: |
| 94 | kwargs: Dict[str, object] = {} | 94 | kwargs: Dict[str, object] = {} |
| 95 | if self.torch_dtype is not None: | 95 | if self.torch_dtype is not None: |
| 96 | - kwargs["dtype"] = self.torch_dtype | 96 | + kwargs["torch_dtype"] = self.torch_dtype |
| 97 | kwargs["low_cpu_mem_usage"] = True | 97 | kwargs["low_cpu_mem_usage"] = True |
| 98 | if self.attn_implementation: | 98 | if self.attn_implementation: |
| 99 | kwargs["attn_implementation"] = self.attn_implementation | 99 | kwargs["attn_implementation"] = self.attn_implementation |
| @@ -134,7 +134,10 @@ class LocalSeq2SeqTranslationBackend: | @@ -134,7 +134,10 @@ class LocalSeq2SeqTranslationBackend: | ||
| 134 | max_length=self.max_input_length, | 134 | max_length=self.max_input_length, |
| 135 | **tokenizer_kwargs, | 135 | **tokenizer_kwargs, |
| 136 | ) | 136 | ) |
| 137 | - encoded = {key: value.to(self.device) for key, value in encoded.items()} | 137 | + encoded = { |
| 138 | + key: value.to(self.device, non_blocking=self.device.startswith("cuda")) | ||
| 139 | + for key, value in encoded.items() | ||
| 140 | + } | ||
| 138 | generate_kwargs = self._build_generate_kwargs(source_lang, target_lang) | 141 | generate_kwargs = self._build_generate_kwargs(source_lang, target_lang) |
| 139 | input_ids = encoded.get("input_ids") | 142 | input_ids = encoded.get("input_ids") |
| 140 | if input_ids is not None and "max_length" not in generate_kwargs: | 143 | if input_ids is not None and "max_length" not in generate_kwargs: |