Commit 74116f0573b1333975cc6a0eae2e8f58cd4972cf
1 parent
971a0851
jina-reranker-v3性能测试和参数优化
Showing
5 changed files
with
55 additions
and
13 deletions
Show diff stats
config/config.yaml
| @@ -395,8 +395,11 @@ services: | @@ -395,8 +395,11 @@ services: | ||
| 395 | jina_reranker_v3: | 395 | jina_reranker_v3: |
| 396 | model_name: "jinaai/jina-reranker-v3" | 396 | model_name: "jinaai/jina-reranker-v3" |
| 397 | device: null | 397 | device: null |
| 398 | - dtype: "auto" | 398 | + dtype: "float16" |
| 399 | batch_size: 64 | 399 | batch_size: 64 |
| 400 | + max_doc_length: 160 | ||
| 401 | + max_query_length: 64 | ||
| 402 | + sort_by_doc_length: true | ||
| 400 | cache_dir: "./model_cache" | 403 | cache_dir: "./model_cache" |
| 401 | trust_remote_code: true | 404 | trust_remote_code: true |
| 402 | qwen3_vllm: | 405 | qwen3_vllm: |
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
| @@ -190,6 +190,16 @@ curl "http://localhost:6008/ready" | @@ -190,6 +190,16 @@ curl "http://localhost:6008/ready" | ||
| 190 | 190 | ||
| 191 | 说明:默认后端为 `qwen3_vllm`(`Qwen/Qwen3-Reranker-0.6B`),需要可用 GPU 显存。 | 191 | 说明:默认后端为 `qwen3_vllm`(`Qwen/Qwen3-Reranker-0.6B`),需要可用 GPU 显存。 |
| 192 | 192 | ||
| 193 | +补充:若切换到 `jina_reranker_v3`,在当前 `Tesla T4` 上建议使用: | ||
| 194 | + | ||
| 195 | +- `dtype: float16` | ||
| 196 | +- `batch_size: 64` | ||
| 197 | +- `max_doc_length: 160` | ||
| 198 | +- `max_query_length: 64` | ||
| 199 | +- `sort_by_doc_length: true` | ||
| 200 | + | ||
| 201 | +原因:`jina_reranker_v3` 的 `dtype: auto` 在当前机器上会被解析为 `bfloat16`,推理性能明显差于 `float16`;此外,它的 listwise 架构在 T4 上对上下文长度更敏感,过大的 batch 会显著拉长延迟。 | ||
| 202 | + | ||
| 193 | 补充:`docs` 的请求大小与模型推理 `batch size` 解耦。即使一次传入 1000 条文档,服务端也会按 `services.rerank.backends.qwen3_vllm.infer_batch_size` 自动拆分。 | 203 | 补充:`docs` 的请求大小与模型推理 `batch size` 解耦。即使一次传入 1000 条文档,服务端也会按 `services.rerank.backends.qwen3_vllm.infer_batch_size` 自动拆分。 |
| 194 | 204 | ||
| 195 | #### 7.2.1 `POST /rerank` — 结果重排 | 205 | #### 7.2.1 `POST /rerank` — 结果重排 |
| @@ -439,4 +449,3 @@ curl "http://localhost:6006/health" | @@ -439,4 +449,3 @@ curl "http://localhost:6006/health" | ||
| 439 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 | 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 |
| 440 | 450 | ||
| 441 | --- | 451 | --- |
| 442 | - |
requirements_reranker_jina_reranker_v3.txt
| 1 | # Isolated dependencies for jina_reranker_v3 reranker backend. | 1 | # Isolated dependencies for jina_reranker_v3 reranker backend. |
| 2 | +# | ||
| 3 | +# Keep this stack aligned with the validated CUDA runtime on our hosts. | ||
| 4 | +# On this machine, torch 2.11.0 + cu130 fails CUDA init, while torch 2.10.0 + cu128 works. | ||
| 5 | +# Cap transformers <5 to stay on the same family as the known-good reranker envs. | ||
| 2 | 6 | ||
| 3 | -r requirements_reranker_base.txt | 7 | -r requirements_reranker_base.txt |
| 4 | -torch>=2.0.0 | ||
| 5 | -transformers>=4.51.0 | 8 | +torch==2.10.0 |
| 9 | +transformers>=4.51.0,<5 |
reranker/README.md
| @@ -152,12 +152,22 @@ services: | @@ -152,12 +152,22 @@ services: | ||
| 152 | jina_reranker_v3: | 152 | jina_reranker_v3: |
| 153 | model_name: "jinaai/jina-reranker-v3" | 153 | model_name: "jinaai/jina-reranker-v3" |
| 154 | device: null | 154 | device: null |
| 155 | - dtype: "auto" | 155 | + dtype: "float16" |
| 156 | batch_size: 64 | 156 | batch_size: 64 |
| 157 | + max_doc_length: 160 | ||
| 158 | + max_query_length: 64 | ||
| 159 | + sort_by_doc_length: true | ||
| 157 | cache_dir: "./model_cache" | 160 | cache_dir: "./model_cache" |
| 158 | trust_remote_code: true | 161 | trust_remote_code: true |
| 159 | ``` | 162 | ``` |
| 160 | 163 | ||
| 164 | +T4 实测建议: | ||
| 165 | + | ||
| 166 | +- `dtype` 优先使用 `float16`;在当前机器上 `auto` 会加载成 `bfloat16`,明显更慢 | ||
| 167 | +- 在线短文本商品重排建议从 `batch_size: 64` 起步;较小的 block 比更大的 listwise block 更快,但文档会被拆成多个 block 分别打分,因此会牺牲一部分“完整 listwise”排序一致性 | ||
| 168 | +- 若你更看重接近完整 listwise 的排序结果,可提高到 `batch_size: 125`,代价是延迟明显上升 | ||
| 169 | +- `max_doc_length: 160`、`max_query_length: 64` 更适合当前商品标题 / 短 query 场景 | ||
| 170 | + | ||
| 161 | ## 当前最优方案:`qwen3_vllm_score` | 171 | ## 当前最优方案:`qwen3_vllm_score` |
| 162 | 172 | ||
| 163 | 173 |
reranker/backends/jina_reranker_v3.py
| @@ -35,19 +35,26 @@ class JinaRerankerV3Backend: | @@ -35,19 +35,26 @@ class JinaRerankerV3Backend: | ||
| 35 | self._config.get("model_name") or "jinaai/jina-reranker-v3" | 35 | self._config.get("model_name") or "jinaai/jina-reranker-v3" |
| 36 | ) | 36 | ) |
| 37 | self._cache_dir = self._config.get("cache_dir") or "./model_cache" | 37 | self._cache_dir = self._config.get("cache_dir") or "./model_cache" |
| 38 | - self._dtype = str(self._config.get("dtype") or "auto") | 38 | + self._dtype = str(self._config.get("dtype") or "float16") |
| 39 | self._device = self._config.get("device") | 39 | self._device = self._config.get("device") |
| 40 | self._batch_size = max(1, int(self._config.get("batch_size", 64))) | 40 | self._batch_size = max(1, int(self._config.get("batch_size", 64))) |
| 41 | + self._max_doc_length = max(1, int(self._config.get("max_doc_length", 160))) | ||
| 42 | + self._max_query_length = max(1, int(self._config.get("max_query_length", 64))) | ||
| 43 | + self._sort_by_doc_length = bool(self._config.get("sort_by_doc_length", True)) | ||
| 41 | self._return_embeddings = bool(self._config.get("return_embeddings", False)) | 44 | self._return_embeddings = bool(self._config.get("return_embeddings", False)) |
| 42 | self._trust_remote_code = bool(self._config.get("trust_remote_code", True)) | 45 | self._trust_remote_code = bool(self._config.get("trust_remote_code", True)) |
| 43 | self._lock = threading.Lock() | 46 | self._lock = threading.Lock() |
| 44 | 47 | ||
| 45 | logger.info( | 48 | logger.info( |
| 46 | - "[Jina_Reranker_V3] Loading model %s (dtype=%s, device=%s, batch=%s)", | 49 | + "[Jina_Reranker_V3] Loading model %s (dtype=%s, device=%s, batch=%s, " |
| 50 | + "max_doc_length=%s, max_query_length=%s, sort_by_doc_length=%s)", | ||
| 47 | self._model_name, | 51 | self._model_name, |
| 48 | self._dtype, | 52 | self._dtype, |
| 49 | self._device, | 53 | self._device, |
| 50 | self._batch_size, | 54 | self._batch_size, |
| 55 | + self._max_doc_length, | ||
| 56 | + self._max_query_length, | ||
| 57 | + self._sort_by_doc_length, | ||
| 51 | ) | 58 | ) |
| 52 | 59 | ||
| 53 | load_kwargs: Dict[str, Any] = { | 60 | load_kwargs: Dict[str, Any] = { |
| @@ -116,7 +123,6 @@ class JinaRerankerV3Backend: | @@ -116,7 +123,6 @@ class JinaRerankerV3Backend: | ||
| 116 | } | 123 | } |
| 117 | 124 | ||
| 118 | unique_texts: List[str] = [] | 125 | unique_texts: List[str] = [] |
| 119 | - unique_first_indices: List[int] = [] | ||
| 120 | text_to_unique_idx: Dict[str, int] = {} | 126 | text_to_unique_idx: Dict[str, int] = {} |
| 121 | for orig_idx, text in indexed: | 127 | for orig_idx, text in indexed: |
| 122 | unique_idx = text_to_unique_idx.get(text) | 128 | unique_idx = text_to_unique_idx.get(text) |
| @@ -124,7 +130,6 @@ class JinaRerankerV3Backend: | @@ -124,7 +130,6 @@ class JinaRerankerV3Backend: | ||
| 124 | unique_idx = len(unique_texts) | 130 | unique_idx = len(unique_texts) |
| 125 | text_to_unique_idx[text] = unique_idx | 131 | text_to_unique_idx[text] = unique_idx |
| 126 | unique_texts.append(text) | 132 | unique_texts.append(text) |
| 127 | - unique_first_indices.append(orig_idx) | ||
| 128 | 133 | ||
| 129 | effective_top_n = min(top_n, len(unique_texts)) if top_n is not None else None | 134 | effective_top_n = min(top_n, len(unique_texts)) if top_n is not None else None |
| 130 | 135 | ||
| @@ -151,6 +156,9 @@ class JinaRerankerV3Backend: | @@ -151,6 +156,9 @@ class JinaRerankerV3Backend: | ||
| 151 | "device": self._device, | 156 | "device": self._device, |
| 152 | "dtype": self._dtype, | 157 | "dtype": self._dtype, |
| 153 | "batch_size": self._batch_size, | 158 | "batch_size": self._batch_size, |
| 159 | + "max_doc_length": self._max_doc_length, | ||
| 160 | + "max_query_length": self._max_query_length, | ||
| 161 | + "sort_by_doc_length": self._sort_by_doc_length, | ||
| 154 | "normalize": normalize, | 162 | "normalize": normalize, |
| 155 | "normalize_note": "jina_reranker_v3 returns model relevance scores directly", | 163 | "normalize_note": "jina_reranker_v3 returns model relevance scores directly", |
| 156 | } | 164 | } |
| @@ -172,11 +180,15 @@ class JinaRerankerV3Backend: | @@ -172,11 +180,15 @@ class JinaRerankerV3Backend: | ||
| 172 | if not docs: | 180 | if not docs: |
| 173 | return [] | 181 | return [] |
| 174 | 182 | ||
| 175 | - unique_scores: List[float] = [0.0] * len(docs) | 183 | + ordered_indices = list(range(len(docs))) |
| 184 | + if self._sort_by_doc_length and len(ordered_indices) > 1: | ||
| 185 | + ordered_indices.sort(key=lambda idx: len(docs[idx])) | ||
| 176 | 186 | ||
| 187 | + unique_scores: List[float] = [0.0] * len(docs) | ||
| 177 | with self._lock: | 188 | with self._lock: |
| 178 | - for start in range(0, len(docs), self._batch_size): | ||
| 179 | - batch_docs = docs[start : start + self._batch_size] | 189 | + for start in range(0, len(ordered_indices), self._batch_size): |
| 190 | + batch_indices = ordered_indices[start : start + self._batch_size] | ||
| 191 | + batch_docs = [docs[idx] for idx in batch_indices] | ||
| 180 | batch_top_n = None | 192 | batch_top_n = None |
| 181 | if top_n is not None and len(docs) <= self._batch_size: | 193 | if top_n is not None and len(docs) <= self._batch_size: |
| 182 | batch_top_n = min(top_n, len(batch_docs)) | 194 | batch_top_n = min(top_n, len(batch_docs)) |
| @@ -185,9 +197,13 @@ class JinaRerankerV3Backend: | @@ -185,9 +197,13 @@ class JinaRerankerV3Backend: | ||
| 185 | batch_docs, | 197 | batch_docs, |
| 186 | top_n=batch_top_n, | 198 | top_n=batch_top_n, |
| 187 | return_embeddings=self._return_embeddings, | 199 | return_embeddings=self._return_embeddings, |
| 200 | + max_doc_length=self._max_doc_length, | ||
| 201 | + max_query_length=self._max_query_length, | ||
| 188 | ) | 202 | ) |
| 189 | for item in results: | 203 | for item in results: |
| 190 | batch_index = int(item["index"]) | 204 | batch_index = int(item["index"]) |
| 191 | - unique_scores[start + batch_index] = float(item["relevance_score"]) | 205 | + unique_scores[batch_indices[batch_index]] = float( |
| 206 | + item["relevance_score"] | ||
| 207 | + ) | ||
| 192 | 208 | ||
| 193 | return unique_scores | 209 | return unique_scores |