diff --git a/.env b/.env
deleted file mode 100644
index 3aafe7d..0000000
--- a/.env
+++ /dev/null
@@ -1,41 +0,0 @@
-# Elasticsearch Configuration
-ES_HOST=http://localhost:9200
-ES_USERNAME=saas
-ES_PASSWORD=4hOaLaf41y2VuI8y
-
-# Redis Configuration (Optional) - AI production 10.200.16.14:6479
-REDIS_HOST=10.200.16.14
-REDIS_PORT=6479
-REDIS_PASSWORD=dxEkegEZ@C5SXWKv
-
-# DeepL Translation API
-DEEPL_AUTH_KEY=c9293ab4-ad25-479b-919f-ab4e63b429ed
-
-# API Service Configuration
-API_HOST=0.0.0.0
-API_PORT=6002
-
-# MySQL Database Configuration (Shoplazza) - AI production 10.200.16.14:3316
-DB_HOST=10.200.16.14
-DB_PORT=3316
-DB_DATABASE=saas
-DB_USERNAME=root
-DB_PASSWORD=qY8tgodLoA&KT#yQ
-
-# Model Directories
-TEXT_MODEL_DIR=/data/tw/models/bge-m3  # now served via web requests; local model no longer used
-IMAGE_MODEL_DIR=/data/tw/models/cn-clip  # now served via web requests; local model no longer used
-
-# Cache Directory
-CACHE_DIR=.cache
-
-# Frontend API Base URL
-API_BASE_URL=http://43.166.252.75:6002
-
-
-# China (domestic)
-DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b
-# US
-DASHSCOPE_API_KEY=sk-482cc3ff37a8467dab134a7a46830556
-
-OPENAI_API_KEY=sk-HvmTMKtuznibZ75l7L2uF2jiaYocCthqd8Cbdkl09KTE7Ft0
diff --git a/reranker/README.md b/reranker/README.md
index 9a59731..c217140 100644
--- a/reranker/README.md
+++ b/reranker/README.md
@@ -4,10 +4,10 @@
 
 ---
 
-The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM). Callers access it over HTTP and do not need to know which backend is in use.
+The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers). Callers access it over HTTP and do not need to know which backend is in use.
 
 **Features**
-- Multiple backends: `qwen3_vllm` (default, Qwen3-Reranker-0.6B + vLLM), `bge` (kept for compatibility)
+- Multiple backends: `qwen3_vllm` (default, Qwen3-Reranker-0.6B + vLLM), `qwen3_transformers` (pure Transformers, no vLLM required), `bge` (kept for compatibility)
 - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.`
 - Document deduplication, scores aligned with input order, FP16/GPU support (backend-dependent)
 
@@ -17,18 +17,20 @@
 - `backends/__init__.py`: `get_rerank_backend(name, config)`
 - `backends/bge.py`: BGE backend
 - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
+- `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage recipe)
 - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
 - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend parameters live in config.yaml)
 
 ## Dependencies
 - Common: `torch`, `modelscope`, `fastapi`, `uvicorn` (see the project `requirements.txt` / `requirements_ml.txt`)
-- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers` (optional; install only when using `backend: qwen3_vllm`)
+- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM is only required when using `backend: qwen3_vllm`)
+- **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM; suitable for CPU or small-VRAM GPUs)
 
 ```bash
 ./scripts/setup_reranker_venv.sh
 ```
 
 ## Configuration
-- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `bge`), or the `RERANK_BACKEND` environment variable.
+- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `bge`), or the `RERANK_BACKEND` environment variable.
 - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example:
 ```yaml
@@ -47,6 +49,12 @@ services:
     qwen3_vllm:
       model_name: "Qwen/Qwen3-Reranker-0.6B"
       max_model_len: 8192
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.8
       enable_prefix_caching: true
+    qwen3_transformers:
+      model_name: "Qwen/Qwen3-Reranker-0.6B"
+      instruction: "Given a web search query, retrieve relevant passages that answer the query"
+      max_length: 8192
+      batch_size: 64
+      use_fp16: true
@@ -111,3 +119,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info
 - No request-level caching; inputs are deduplicated by string before inference, then scores are filled back in the original input order.
 - Empty or null docs are skipped and scored as 0.
 - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); requires a GPU and a fair amount of VRAM. Compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching).
+- **Qwen3-Transformers**: follows the official Transformers Usage recipe, no vLLM required; suitable for CPU or small-VRAM GPUs, optionally accelerated with `attn_implementation: "flash_attention_2"`.
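For context on the configuration described in the README above: resolving the configured backend name (with the `RERANK_BACKEND` environment-variable override) and constructing it through the factory could look roughly like the sketch below. The `build_backend` helper and the exact override precedence are illustrative assumptions; the service's real config loader is not part of this patch.

```python
# Illustrative sketch only -- not the service's actual startup code.
# Resolves the rerank backend from a dict shaped like config/config.yaml,
# letting the RERANK_BACKEND environment variable override the file setting.
import os
from typing import Any, Dict

from reranker.backends import get_rerank_backend


def build_backend(config: Dict[str, Any]):
    rerank_cfg = config.get("services", {}).get("rerank", {})
    name = os.environ.get("RERANK_BACKEND") or rerank_cfg.get("backend", "qwen3_vllm")
    backend_cfg = rerank_cfg.get("backends", {}).get(name, {})
    return get_rerank_backend(name, backend_cfg)
```

Setting `RERANK_BACKEND=qwen3_transformers` would then select the new backend without editing `config/config.yaml`.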
diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py
index 164fba2..7edd115 100644
--- a/reranker/backends/__init__.py
+++ b/reranker/backends/__init__.py
@@ -43,7 +43,12 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
     if name == "qwen3_vllm":
         from reranker.backends.qwen3_vllm import Qwen3VLLMRerankerBackend
         return Qwen3VLLMRerankerBackend(config)
-    raise ValueError(f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm")
+    if name == "qwen3_transformers":
+        from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend
+        return Qwen3TransformersRerankerBackend(config)
+    raise ValueError(
+        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers"
+    )
 
 
 __all__ = ["RerankBackendProtocol", "get_rerank_backend"]
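To illustrate the factory change above, a caller exercising the new branch end to end might look like the following sketch. The query and document strings are made up, the config values mirror the README example, and the first call will download `Qwen/Qwen3-Reranker-0.6B` from Hugging Face unless it is already cached locally.

```python
# Hypothetical usage of the new factory branch. score_with_meta returns
# (scores, meta), with scores aligned to the input document order.
from reranker.backends import get_rerank_backend

backend = get_rerank_backend(
    "qwen3_transformers",
    {
        "model_name": "Qwen/Qwen3-Reranker-0.6B",
        "max_length": 8192,
        "batch_size": 64,
        "use_fp16": True,
    },
)

scores, meta = backend.score_with_meta(
    query="What is the capital of China?",
    docs=["The capital of China is Beijing.", "Gravity decreases with altitude."],
)
print(scores)               # two floats in [0, 1]; higher means more relevant
print(meta["backend"])      # "qwen3_transformers"
print(meta["unique_docs"])  # 2
```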
diff --git a/reranker/backends/qwen3_transformers.py b/reranker/backends/qwen3_transformers.py
new file mode 100644
index 0000000..beb2b18
--- /dev/null
+++ b/reranker/backends/qwen3_transformers.py
@@ -0,0 +1,204 @@
+"""
+Qwen3-Reranker-0.6B backend using Transformers (direct usage). No vLLM required.
+
+Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+Requires: transformers>=4.51.0, torch.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any, Dict, List, Tuple
+
+logger = logging.getLogger("reranker.backends.qwen3_transformers")
+
+try:
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+except ImportError as e:
+    raise ImportError(
+        "Qwen3-Transformers reranker backend requires transformers>=4.51.0 and torch. "
+        "Install with: pip install transformers>=4.51.0 torch"
+    ) from e
+
+
+def _format_instruction(instruction: str, query: str, doc: str) -> str:
+    """Format (query, doc) pair per official Qwen3-Reranker spec."""
+    return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+        instruction=instruction, query=query, doc=doc
+    )
+
+
+class Qwen3TransformersRerankerBackend:
+    """
+    Qwen3-Reranker-0.6B with Transformers (AutoModelForCausalLM) inference.
+    Config from services.rerank.backends.qwen3_transformers.
+    No vLLM dependency; lighter than qwen3_vllm, suitable for CPU or small GPU.
+    """
+
+    def __init__(self, config: Dict[str, Any]) -> None:
+        self._config = config or {}
+        model_name = str(self._config.get("model_name") or "Qwen/Qwen3-Reranker-0.6B")
+        self._instruction = str(
+            self._config.get("instruction")
+            or "Given a web search query, retrieve relevant passages that answer the query"
+        )
+        max_length = int(self._config.get("max_length", 8192))
+        batch_size = int(self._config.get("batch_size", 64))
+        use_fp16 = bool(self._config.get("use_fp16", True))
+        device = self._config.get("device")
+        attn_impl = self._config.get("attn_implementation")  # e.g. "flash_attention_2"
+
+        self._model_name = model_name
+        self._batch_size = batch_size
+
+        logger.info(
+            "[Qwen3_Transformers] Loading model %s (max_length=%s, batch=%s, fp16=%s)",
+            model_name,
+            max_length,
+            batch_size,
+            use_fp16,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+        self._tokenizer.pad_token = self._tokenizer.eos_token
+
+        # Prefix/suffix from official reference
+        prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+        self._prefix_tokens = self._tokenizer.encode(prefix, add_special_tokens=False)
+        self._suffix_tokens = self._tokenizer.encode(suffix, add_special_tokens=False)
+        self._max_length = max_length
+        self._effective_max_len = max_length - len(self._prefix_tokens) - len(self._suffix_tokens)
+
+        self._token_true_id = self._tokenizer.convert_tokens_to_ids("yes")
+        self._token_false_id = self._tokenizer.convert_tokens_to_ids("no")
+
+        kwargs = {}
+        if use_fp16 and torch.cuda.is_available():
+            kwargs["torch_dtype"] = torch.float16
+        if attn_impl:
+            kwargs["attn_implementation"] = attn_impl
+
+        self._model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).eval()
+        if device is not None:
+            self._model = self._model.to(device)
+        elif torch.cuda.is_available():
+            self._model = self._model.cuda()
+
+        logger.info(
+            "[Qwen3_Transformers] Model ready | model=%s device=%s",
+            model_name,
+            next(self._model.parameters()).device,
+        )
+
+    def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
+        """Tokenize pairs and add prefix/suffix tokens. Returns batched tensors on model device."""
+        inputs = self._tokenizer(
+            pairs,
+            padding=False,
+            truncation="longest_first",
+            return_attention_mask=False,
+            max_length=self._effective_max_len,
+        )
+        for i, ele in enumerate(inputs["input_ids"]):
+            inputs["input_ids"][i] = self._prefix_tokens + ele + self._suffix_tokens
+        inputs = self._tokenizer.pad(
+            inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        for key in inputs:
+            inputs[key] = inputs[key].to(self._model.device)
+        return inputs
+
+    @torch.no_grad()
+    def _compute_scores(self, pairs: List[str]) -> List[float]:
+        """Run forward pass and compute yes/no probability per pair."""
+        if not pairs:
+            return []
+        inputs = self._process_inputs(pairs)
+        outputs = self._model(**inputs)
+        batch_scores = outputs.logits[:, -1, :]
+        true_vector = batch_scores[:, self._token_true_id]
+        false_vector = batch_scores[:, self._token_false_id]
+        batch_scores = torch.stack([false_vector, true_vector], dim=1)
+        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+        scores = batch_scores[:, 1].exp().tolist()
+        return scores
+
+    def score_with_meta(
+        self,
+        query: str,
+        docs: List[str],
+        normalize: bool = True,
+    ) -> Tuple[List[float], Dict[str, Any]]:
+        start_ts = time.time()
+        total_docs = len(docs) if docs else 0
+        output_scores: List[float] = [0.0] * total_docs
+
+        query = "" if query is None else str(query).strip()
+        indexed: List[Tuple[int, str]] = []
+        for i, doc in enumerate(docs or []):
+            if doc is None:
+                continue
+            text = str(doc).strip()
+            if not text:
+                continue
+            indexed.append((i, text))
+
+        if not query or not indexed:
+            elapsed_ms = (time.time() - start_ts) * 1000.0
+            return output_scores, {
+                "input_docs": total_docs,
+                "usable_docs": len(indexed),
+                "unique_docs": 0,
+                "dedup_ratio": 0.0,
+                "elapsed_ms": round(elapsed_ms, 3),
+                "model": self._model_name,
+                "backend": "qwen3_transformers",
+                "normalize": normalize,
+            }
+
+        # Deduplicate by text, keep mapping to original indices
+        unique_texts: List[str] = []
+        position_to_unique: List[int] = []
+        text_to_unique: Dict[str, int] = {}
+        for _idx, text in indexed:
+            if text not in text_to_unique:
+                text_to_unique[text] = len(unique_texts)
+                unique_texts.append(text)
+            position_to_unique.append(text_to_unique[text])
+
+        pairs = [
+            _format_instruction(self._instruction, query, t)
+            for t in unique_texts
+        ]
+
+        # Batch inference
+        unique_scores: List[float] = []
+        for i in range(0, len(pairs), self._batch_size):
+            batch = pairs[i : i + self._batch_size]
+            batch_scores = self._compute_scores(batch)
+            unique_scores.extend(batch_scores)
+
+        for (orig_idx, _), unique_idx in zip(indexed, position_to_unique):
+            output_scores[orig_idx] = float(unique_scores[unique_idx])
+
+        elapsed_ms = (time.time() - start_ts) * 1000.0
+        dedup_ratio = 0.0
+        if indexed:
+            dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed)))
+
+        meta = {
+            "input_docs": total_docs,
+            "usable_docs": len(indexed),
+            "unique_docs": len(unique_texts),
+            "dedup_ratio": round(dedup_ratio, 4),
+            "elapsed_ms": round(elapsed_ms, 3),
+            "model": self._model_name,
+            "backend": "qwen3_transformers",
+            "normalize": normalize,
+        }
+        return output_scores, meta
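As a numerical aside on `_compute_scores` above: the backend takes the final-position logits for the "no" and "yes" tokens and converts them into a single relevance score, which is just the softmax probability of "yes". The snippet below reproduces that arithmetic on made-up logit values.

```python
# Made-up logits; shows that log_softmax followed by exp over the
# [no, yes] pair equals a plain softmax, keeping P("yes").
import torch

no_logit = torch.tensor([1.2])
yes_logit = torch.tensor([3.4])
pair = torch.stack([no_logit, yes_logit], dim=1)               # shape (1, 2)
score = torch.nn.functional.log_softmax(pair, dim=1)[:, 1].exp()
assert torch.allclose(score, torch.softmax(pair, dim=1)[:, 1])
print(round(score.item(), 3))  # ~0.9: this (query, doc) pair would rank as relevant
```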
--
libgit2 0.21.2