Commit 74116f0573b1333975cc6a0eae2e8f58cd4972cf

Authored by tangwang
1 parent 971a0851

jina-reranker-v3性能测试和参数优化

config/config.yaml
@@ -395,8 +395,11 @@ services:
395 jina_reranker_v3: 395 jina_reranker_v3:
396 model_name: "jinaai/jina-reranker-v3" 396 model_name: "jinaai/jina-reranker-v3"
397 device: null 397 device: null
398 - dtype: "auto" 398 + dtype: "float16"
399 batch_size: 64 399 batch_size: 64
  400 + max_doc_length: 160
  401 + max_query_length: 64
  402 + sort_by_doc_length: true
400 cache_dir: "./model_cache" 403 cache_dir: "./model_cache"
401 trust_remote_code: true 404 trust_remote_code: true
402 qwen3_vllm: 405 qwen3_vllm:
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
@@ -190,6 +190,16 @@ curl "http://localhost:6008/ready"
190 190
191 说明:默认后端为 `qwen3_vllm`(`Qwen/Qwen3-Reranker-0.6B`),需要可用 GPU 显存。 191 说明:默认后端为 `qwen3_vllm`(`Qwen/Qwen3-Reranker-0.6B`),需要可用 GPU 显存。
192 192
  193 +补充:若切换到 `jina_reranker_v3`,在当前 `Tesla T4` 上建议使用:
  194 +
  195 +- `dtype: float16`
  196 +- `batch_size: 64`
  197 +- `max_doc_length: 160`
  198 +- `max_query_length: 64`
  199 +- `sort_by_doc_length: true`
  200 +
  201 +原因:`jina_reranker_v3` 的 `auto` 在当前机器上会落到 `bfloat16`,性能明显差于 `float16`;而它的 listwise 架构在 T4 上对上下文长度更敏感,过大的 batch 会显著拉长延迟。
  202 +
193 补充:`docs` 的请求大小与模型推理 `batch size` 解耦。即使一次传入 1000 条文档,服务端也会按 `services.rerank.backends.qwen3_vllm.infer_batch_size` 自动拆分。 203 补充:`docs` 的请求大小与模型推理 `batch size` 解耦。即使一次传入 1000 条文档,服务端也会按 `services.rerank.backends.qwen3_vllm.infer_batch_size` 自动拆分。
194 204
195 #### 7.2.1 `POST /rerank` — 结果重排 205 #### 7.2.1 `POST /rerank` — 结果重排
@@ -439,4 +449,3 @@ curl "http://localhost:6006/health"
439 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 449 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。
440 450
441 --- 451 ---
442 -  
requirements_reranker_jina_reranker_v3.txt
1 # Isolated dependencies for jina_reranker_v3 reranker backend. 1 # Isolated dependencies for jina_reranker_v3 reranker backend.
  2 +#
  3 +# Keep this stack aligned with the validated CUDA runtime on our hosts.
  4 +# On this machine, torch 2.11.0 + cu130 fails CUDA init, while torch 2.10.0 + cu128 works.
  5 +# Cap transformers <5 to stay on the same family as the known-good reranker envs.
2 6
3 -r requirements_reranker_base.txt 7 -r requirements_reranker_base.txt
4 -torch>=2.0.0  
5 -transformers>=4.51.0 8 +torch==2.10.0
  9 +transformers>=4.51.0,<5
reranker/README.md
@@ -152,12 +152,22 @@ services:
152 jina_reranker_v3: 152 jina_reranker_v3:
153 model_name: "jinaai/jina-reranker-v3" 153 model_name: "jinaai/jina-reranker-v3"
154 device: null 154 device: null
155 - dtype: "auto" 155 + dtype: "float16"
156 batch_size: 64 156 batch_size: 64
  157 + max_doc_length: 160
  158 + max_query_length: 64
  159 + sort_by_doc_length: true
157 cache_dir: "./model_cache" 160 cache_dir: "./model_cache"
158 trust_remote_code: true 161 trust_remote_code: true
159 ``` 162 ```
160 163
  164 +T4 实测建议:
  165 +
  166 +- `dtype` 优先使用 `float16`;在当前机器上 `auto` 会加载成 `bfloat16`,明显更慢
  167 +- 在线短文本商品重排建议从 `batch_size: 64` 起步;它比更大的 listwise block 更快,但会牺牲一部分“完整 listwise”排序一致性
  168 +- 若你更看重接近完整 listwise 的排序结果,可提高到 `batch_size: 125`,代价是延迟明显上升
  169 +- `max_doc_length: 160`、`max_query_length: 64` 更适合当前商品标题 / 短 query 场景
  170 +
161 ## 当前最优方案:`qwen3_vllm_score` 171 ## 当前最优方案:`qwen3_vllm_score`
162 172
163 173
reranker/backends/jina_reranker_v3.py
@@ -35,19 +35,26 @@ class JinaRerankerV3Backend:
35 self._config.get("model_name") or "jinaai/jina-reranker-v3" 35 self._config.get("model_name") or "jinaai/jina-reranker-v3"
36 ) 36 )
37 self._cache_dir = self._config.get("cache_dir") or "./model_cache" 37 self._cache_dir = self._config.get("cache_dir") or "./model_cache"
38 - self._dtype = str(self._config.get("dtype") or "auto") 38 + self._dtype = str(self._config.get("dtype") or "float16")
39 self._device = self._config.get("device") 39 self._device = self._config.get("device")
40 self._batch_size = max(1, int(self._config.get("batch_size", 64))) 40 self._batch_size = max(1, int(self._config.get("batch_size", 64)))
  41 + self._max_doc_length = max(1, int(self._config.get("max_doc_length", 160)))
  42 + self._max_query_length = max(1, int(self._config.get("max_query_length", 64)))
  43 + self._sort_by_doc_length = bool(self._config.get("sort_by_doc_length", True))
41 self._return_embeddings = bool(self._config.get("return_embeddings", False)) 44 self._return_embeddings = bool(self._config.get("return_embeddings", False))
42 self._trust_remote_code = bool(self._config.get("trust_remote_code", True)) 45 self._trust_remote_code = bool(self._config.get("trust_remote_code", True))
43 self._lock = threading.Lock() 46 self._lock = threading.Lock()
44 47
45 logger.info( 48 logger.info(
46 - "[Jina_Reranker_V3] Loading model %s (dtype=%s, device=%s, batch=%s)", 49 + "[Jina_Reranker_V3] Loading model %s (dtype=%s, device=%s, batch=%s, "
  50 + "max_doc_length=%s, max_query_length=%s, sort_by_doc_length=%s)",
47 self._model_name, 51 self._model_name,
48 self._dtype, 52 self._dtype,
49 self._device, 53 self._device,
50 self._batch_size, 54 self._batch_size,
  55 + self._max_doc_length,
  56 + self._max_query_length,
  57 + self._sort_by_doc_length,
51 ) 58 )
52 59
53 load_kwargs: Dict[str, Any] = { 60 load_kwargs: Dict[str, Any] = {
@@ -116,7 +123,6 @@ class JinaRerankerV3Backend:
116 } 123 }
117 124
118 unique_texts: List[str] = [] 125 unique_texts: List[str] = []
119 - unique_first_indices: List[int] = []  
120 text_to_unique_idx: Dict[str, int] = {} 126 text_to_unique_idx: Dict[str, int] = {}
121 for orig_idx, text in indexed: 127 for orig_idx, text in indexed:
122 unique_idx = text_to_unique_idx.get(text) 128 unique_idx = text_to_unique_idx.get(text)
@@ -124,7 +130,6 @@ class JinaRerankerV3Backend:
124 unique_idx = len(unique_texts) 130 unique_idx = len(unique_texts)
125 text_to_unique_idx[text] = unique_idx 131 text_to_unique_idx[text] = unique_idx
126 unique_texts.append(text) 132 unique_texts.append(text)
127 - unique_first_indices.append(orig_idx)  
128 133
129 effective_top_n = min(top_n, len(unique_texts)) if top_n is not None else None 134 effective_top_n = min(top_n, len(unique_texts)) if top_n is not None else None
130 135
@@ -151,6 +156,9 @@ class JinaRerankerV3Backend:
151 "device": self._device, 156 "device": self._device,
152 "dtype": self._dtype, 157 "dtype": self._dtype,
153 "batch_size": self._batch_size, 158 "batch_size": self._batch_size,
  159 + "max_doc_length": self._max_doc_length,
  160 + "max_query_length": self._max_query_length,
  161 + "sort_by_doc_length": self._sort_by_doc_length,
154 "normalize": normalize, 162 "normalize": normalize,
155 "normalize_note": "jina_reranker_v3 returns model relevance scores directly", 163 "normalize_note": "jina_reranker_v3 returns model relevance scores directly",
156 } 164 }
@@ -172,11 +180,15 @@ class JinaRerankerV3Backend:
172 if not docs: 180 if not docs:
173 return [] 181 return []
174 182
175 - unique_scores: List[float] = [0.0] * len(docs) 183 + ordered_indices = list(range(len(docs)))
  184 + if self._sort_by_doc_length and len(ordered_indices) > 1:
  185 + ordered_indices.sort(key=lambda idx: len(docs[idx]))
176 186
  187 + unique_scores: List[float] = [0.0] * len(docs)
177 with self._lock: 188 with self._lock:
178 - for start in range(0, len(docs), self._batch_size):  
179 - batch_docs = docs[start : start + self._batch_size] 189 + for start in range(0, len(ordered_indices), self._batch_size):
  190 + batch_indices = ordered_indices[start : start + self._batch_size]
  191 + batch_docs = [docs[idx] for idx in batch_indices]
180 batch_top_n = None 192 batch_top_n = None
181 if top_n is not None and len(docs) <= self._batch_size: 193 if top_n is not None and len(docs) <= self._batch_size:
182 batch_top_n = min(top_n, len(batch_docs)) 194 batch_top_n = min(top_n, len(batch_docs))
@@ -185,9 +197,13 @@ class JinaRerankerV3Backend:
185 batch_docs, 197 batch_docs,
186 top_n=batch_top_n, 198 top_n=batch_top_n,
187 return_embeddings=self._return_embeddings, 199 return_embeddings=self._return_embeddings,
  200 + max_doc_length=self._max_doc_length,
  201 + max_query_length=self._max_query_length,
188 ) 202 )
189 for item in results: 203 for item in results:
190 batch_index = int(item["index"]) 204 batch_index = int(item["index"])
191 - unique_scores[start + batch_index] = float(item["relevance_score"]) 205 + unique_scores[batch_indices[batch_index]] = float(
  206 + item["relevance_score"]
  207 + )
192 208
193 return unique_scores 209 return unique_scores