From 971a085177e6b04b92dc5ca6fcad888121774e06 Mon Sep 17 00:00:00 2001
From: tangwang
Date: Thu, 26 Mar 2026 22:47:40 +0800
Subject: [PATCH] Add reranker-jina backend; explore the advantages of listwise reranking

---
 config/config.yaml                          |  11 +++++++++--
 requirements_reranker_jina_reranker_v3.txt  |   5 +++++
 reranker/README.md                          |  50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 reranker/backends/__init__.py               |   5 ++++-
 reranker/backends/jina_reranker_v3.py       | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 reranker/server.py                          |   2 +-
 scripts/lib/reranker_backend_env.sh         |   2 ++
 7 files changed, 262 insertions(+), 6 deletions(-)
 create mode 100644 requirements_reranker_jina_reranker_v3.txt
 create mode 100644 reranker/backends/jina_reranker_v3.py

diff --git a/config/config.yaml b/config/config.yaml
index 143fcb3..7a8b268 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -238,7 +238,7 @@ services:
   translation:
     service_url: "http://127.0.0.1:6006"
     # default_model: "nllb-200-distilled-600m"
-    default_model: "deepl"
+    default_model: "nllb-200-distilled-600m"
     default_scene: "general"
     timeout_sec: 10.0
     cache:
@@ -382,7 +382,7 @@ services:
     max_docs: 1000
     normalize: true
     # 服务内后端(reranker 进程启动时读取)
-    backend: "qwen3_vllm_score"  # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
+    backend: "qwen3_vllm_score"  # bge | jina_reranker_v3 | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -392,6 +392,13 @@
         max_length: 160
         cache_dir: "./model_cache"
         enable_warmup: true
+      jina_reranker_v3:
+        model_name: "jinaai/jina-reranker-v3"
+        device: null
+        dtype: "auto"
+        batch_size: 64
+        cache_dir: "./model_cache"
+        trust_remote_code: true
       qwen3_vllm:
         model_name: "Qwen/Qwen3-Reranker-0.6B"
         engine: "vllm"
diff --git a/requirements_reranker_jina_reranker_v3.txt b/requirements_reranker_jina_reranker_v3.txt
new file mode 100644
index 0000000..9bfce7f
--- /dev/null
+++ b/requirements_reranker_jina_reranker_v3.txt
@@ -0,0 +1,5 @@
+# Isolated dependencies for jina_reranker_v3 reranker backend.
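+# Installed into .venv-reranker-jina via ./scripts/setup_reranker_venv.sh jina_reranker_v3.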
+
+-r requirements_reranker_base.txt
+torch>=2.0.0
+transformers>=4.51.0
diff --git a/reranker/README.md b/reranker/README.md
index 93a4f61..91ed711 100644
--- a/reranker/README.md
+++ b/reranker/README.md
@@ -4,7 +4,7 @@
 
 ---
 
-Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。
+Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Jina Reranker v3、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。
 
 ## 当前结论
 
@@ -26,6 +26,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe
 |------|----------|------|
 | `qwen3_vllm_score` | 主推荐 | 走 vLLM **`LLM.score()`** 的 **pooling / classify** 路径:对每条 (query, doc) **直接产出相关分**,不经 causal LM 的整步 **generate**。相对 **`qwen3_vllm`**(`generate(max_tokens=1)` + **yes/no** 的 logprob 推导),**省去**每对样本上**大词表 softmax / 采样约束**那一层的常规开销,语义与 cross-encoder 式 rerank 更一致;在当前栈与 T4 上延迟表现最好 |
 | `qwen3_vllm` | 次推荐 | 稳定、成熟、好排障,是很好的 fallback 和对照组 |
+| `jina_reranker_v3` | 新增本地方案 | 按官方推荐使用 `AutoModel(..., trust_remote_code=True)` + `model.rerank(query, docs)`,更接近 Jina 原生 listwise rerank 用法 |
 | `qwen3_transformers` | 兼容方案 | |
 | `qwen3_transformers_packed` | 特定场景方案 | T可能实现还有问题,没调好 |
 | `qwen3_gguf` / `qwen3_gguf_06b` | 低显存 / 功能兜底 | 更适合资源受限场景,不适合作为当前主在线方案 |
@@ -36,6 +37,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe
 - `reranker/server.py`:FastAPI 服务,启动时按配置加载一个后端
 - `reranker/backends/`:后端实现与工厂
   - `backends/__init__.py`:`get_rerank_backend(name, config)`
+  - `backends/jina_reranker_v3.py`:Jina 官方 `model.rerank(...)` 接法
   - `backends/qwen3_vllm_score.py`:当前最优的本地 GPU reranker
   - `backends/qwen3_vllm.py`:次优的本地 GPU reranker
   - `backends/qwen3_transformers.py`:Transformers 基线实现
@@ -64,6 +66,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe
 
 - `qwen3_vllm` -> `.venv-reranker`
 - `qwen3_vllm_score` -> `.venv-reranker-score`
+- `jina_reranker_v3` -> `.venv-reranker-jina`
 - `qwen3_transformers` -> `.venv-reranker-transformers`
 - `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed`
 - `qwen3_gguf` -> `.venv-reranker-gguf`
@@ -91,6 +94,12 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe
 ./scripts/setup_reranker_venv.sh qwen3_vllm
 ```
 
+`jina_reranker_v3`:
+
+```bash
+./scripts/setup_reranker_venv.sh jina_reranker_v3
+```
+
 ### 2. 基础检查
 
 ```bash
@@ -112,6 +121,43 @@ nvidia-smi
 PYTHONPATH=.
 ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py --gpu-memory-utilization 0.2
 ```
 
+## `jina_reranker_v3`
+
+该后端参考 Jina 官方模型卡接入,使用:
+
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained(
+    "jinaai/jina-reranker-v3",
+    dtype="auto",
+    trust_remote_code=True,
+)
+results = model.rerank(query, documents)
+```
+
+服务内实现补了几件工程化工作:
+
+- 统一适配 `/rerank` 协议,返回与输入 docs 对齐的 `scores`
+- 对空文档与重复文档做预处理,避免重复推理
+- 支持 `top_n` hint,并保留原始输入顺序输出
+- 保留 `cache_dir` / `device` / `dtype` / `batch_size` 等配置项
+
+推荐配置:
+
+```yaml
+services:
+  rerank:
+    backends:
+      jina_reranker_v3:
+        model_name: "jinaai/jina-reranker-v3"
+        device: null
+        dtype: "auto"
+        batch_size: 64
+        cache_dir: "./model_cache"
+        trust_remote_code: true
+```
+
 ## 当前最优方案:`qwen3_vllm_score`
 
@@ -238,4 +284,4 @@ ll tests/reranker_performance/
 curl1.sh
 curl1_simple.sh
 rerank_performance_compare.sh
-```
\ No newline at end of file
+```
diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py
index 7d23091..5ffcc25 100644
--- a/reranker/backends/__init__.py
+++ b/reranker/backends/__init__.py
@@ -40,6 +40,9 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
     if name == "bge":
         from reranker.backends.bge import BGERerankerBackend
         return BGERerankerBackend(config)
+    if name == "jina_reranker_v3":
+        from reranker.backends.jina_reranker_v3 import JinaRerankerV3Backend
+        return JinaRerankerV3Backend(config)
     if name == "qwen3_vllm":
         from reranker.backends.qwen3_vllm import Qwen3VLLMRerankerBackend
         return Qwen3VLLMRerankerBackend(config)
@@ -68,7 +71,7 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
         from reranker.backends.dashscope_rerank import DashScopeRerankBackend
         return DashScopeRerankBackend(config)
     raise ValueError(
-        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_transformers_packed, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
+        f"Unknown rerank backend: {name!r}. Supported: bge, jina_reranker_v3, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_transformers_packed, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
     )
 
 
diff --git a/reranker/backends/jina_reranker_v3.py b/reranker/backends/jina_reranker_v3.py
new file mode 100644
index 0000000..0551e1e
--- /dev/null
+++ b/reranker/backends/jina_reranker_v3.py
@@ -0,0 +1,193 @@
+"""
+Jina reranker v3 backend using the model card's recommended AutoModel API.
+
+Reference: https://huggingface.co/jinaai/jina-reranker-v3
+Requires: transformers, torch.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from typing import Any, Dict, List, Tuple
+
+import torch
+from transformers import AutoModel
+
+logger = logging.getLogger("reranker.backends.jina_reranker_v3")
+
+
+class JinaRerankerV3Backend:
+    """
+    jina-reranker-v3 backend using `AutoModel(..., trust_remote_code=True)`.
+
+    The official model card recommends calling:
+        model = AutoModel.from_pretrained(..., trust_remote_code=True)
+        model.rerank(query, documents, top_n=...)
+
+    Config from services.rerank.backends.jina_reranker_v3.
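+    Recognized config keys (all optional; read in __init__ below): model_name,
+    cache_dir, dtype, device, batch_size, return_embeddings, trust_remote_code.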
+ """ + + def __init__(self, config: Dict[str, Any]) -> None: + self._config = config or {} + self._model_name = str( + self._config.get("model_name") or "jinaai/jina-reranker-v3" + ) + self._cache_dir = self._config.get("cache_dir") or "./model_cache" + self._dtype = str(self._config.get("dtype") or "auto") + self._device = self._config.get("device") + self._batch_size = max(1, int(self._config.get("batch_size", 64))) + self._return_embeddings = bool(self._config.get("return_embeddings", False)) + self._trust_remote_code = bool(self._config.get("trust_remote_code", True)) + self._lock = threading.Lock() + + logger.info( + "[Jina_Reranker_V3] Loading model %s (dtype=%s, device=%s, batch=%s)", + self._model_name, + self._dtype, + self._device, + self._batch_size, + ) + + load_kwargs: Dict[str, Any] = { + "trust_remote_code": self._trust_remote_code, + "cache_dir": self._cache_dir, + "dtype": self._dtype, + } + self._model = AutoModel.from_pretrained(self._model_name, **load_kwargs) + self._model.eval() + + if self._device is not None: + self._model = self._model.to(self._device) + elif torch.cuda.is_available(): + self._device = "cuda" + self._model = self._model.to(self._device) + else: + self._device = "cpu" + + logger.info( + "[Jina_Reranker_V3] Model ready | model=%s device=%s", + self._model_name, + self._device, + ) + + def score_with_meta( + self, + query: str, + docs: List[str], + normalize: bool = True, + ) -> Tuple[List[float], Dict[str, Any]]: + return self.score_with_meta_topn(query, docs, normalize=normalize, top_n=None) + + def score_with_meta_topn( + self, + query: str, + docs: List[str], + normalize: bool = True, + top_n: int | None = None, + ) -> Tuple[List[float], Dict[str, Any]]: + start_ts = time.time() + total_docs = len(docs) if docs else 0 + output_scores: List[float] = [0.0] * total_docs + + query = "" if query is None else str(query).strip() + indexed: List[Tuple[int, str]] = [] + for i, doc in enumerate(docs or []): + if doc is None: + continue + text = str(doc).strip() + if not text: + continue + indexed.append((i, text)) + + if not query or not indexed: + elapsed_ms = (time.time() - start_ts) * 1000.0 + return output_scores, { + "input_docs": total_docs, + "usable_docs": len(indexed), + "unique_docs": 0, + "dedup_ratio": 0.0, + "elapsed_ms": round(elapsed_ms, 3), + "model": self._model_name, + "backend": "jina_reranker_v3", + "normalize": normalize, + "normalize_note": "jina_reranker_v3 returns model relevance scores directly", + } + + unique_texts: List[str] = [] + unique_first_indices: List[int] = [] + text_to_unique_idx: Dict[str, int] = {} + for orig_idx, text in indexed: + unique_idx = text_to_unique_idx.get(text) + if unique_idx is None: + unique_idx = len(unique_texts) + text_to_unique_idx[text] = unique_idx + unique_texts.append(text) + unique_first_indices.append(orig_idx) + + effective_top_n = min(top_n, len(unique_texts)) if top_n is not None else None + + unique_scores = self._rerank_unique( + query=query, + docs=unique_texts, + top_n=effective_top_n, + ) + + for orig_idx, text in indexed: + unique_idx = text_to_unique_idx[text] + output_scores[orig_idx] = float(unique_scores[unique_idx]) + + elapsed_ms = (time.time() - start_ts) * 1000.0 + dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed))) if indexed else 0.0 + meta = { + "input_docs": total_docs, + "usable_docs": len(indexed), + "unique_docs": len(unique_texts), + "dedup_ratio": round(dedup_ratio, 4), + "elapsed_ms": round(elapsed_ms, 3), + "model": self._model_name, + "backend": 
"jina_reranker_v3", + "device": self._device, + "dtype": self._dtype, + "batch_size": self._batch_size, + "normalize": normalize, + "normalize_note": "jina_reranker_v3 returns model relevance scores directly", + } + if effective_top_n is not None: + meta["top_n"] = effective_top_n + if len(unique_texts) > self._batch_size: + meta["top_n_note"] = ( + "Applied as a request hint only; full scores were computed because " + "global top_n across multiple local batches would be lossy." + ) + return output_scores, meta + + def _rerank_unique( + self, + query: str, + docs: List[str], + top_n: int | None, + ) -> List[float]: + if not docs: + return [] + + unique_scores: List[float] = [0.0] * len(docs) + + with self._lock: + for start in range(0, len(docs), self._batch_size): + batch_docs = docs[start : start + self._batch_size] + batch_top_n = None + if top_n is not None and len(docs) <= self._batch_size: + batch_top_n = min(top_n, len(batch_docs)) + results = self._model.rerank( + query, + batch_docs, + top_n=batch_top_n, + return_embeddings=self._return_embeddings, + ) + for item in results: + batch_index = int(item["index"]) + unique_scores[start + batch_index] = float(item["relevance_score"]) + + return unique_scores diff --git a/reranker/server.py b/reranker/server.py index 48ebb9f..15b8918 100644 --- a/reranker/server.py +++ b/reranker/server.py @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional Response: { "scores": [float], "meta": {...} } Backend selected via config: services.rerank.backend -(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. +(bge | jina_reranker_v3 | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. """ import logging diff --git a/scripts/lib/reranker_backend_env.sh b/scripts/lib/reranker_backend_env.sh index 343b6c8..d3b50c8 100644 --- a/scripts/lib/reranker_backend_env.sh +++ b/scripts/lib/reranker_backend_env.sh @@ -40,6 +40,7 @@ reranker_backend_venv_dir() { case "${backend}" in qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; qwen3_vllm_score) printf '%s/.venv-reranker-score\n' "${project_root}" ;; + jina_reranker_v3) printf '%s/.venv-reranker-jina\n' "${project_root}" ;; qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; @@ -57,6 +58,7 @@ reranker_backend_requirements_file() { case "${backend}" in qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm_score.txt\n' "${project_root}" ;; + jina_reranker_v3) printf '%s/requirements_reranker_jina_reranker_v3.txt\n' "${project_root}" ;; qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; -- libgit2 0.21.2