Commit 749d78c8351a81bdf33f7b2a1644448fbb0958d1

Authored by tangwang
1 parent 4823f463

支持 reranker精简instruction

config/config.yaml
... ... @@ -402,6 +402,8 @@ services:
402 402 enforce_eager: false
403 403 infer_batch_size: 100
404 404 sort_by_doc_length: true
  405 + # 与 reranker/backends/qwen3_vllm.py 一致:standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
  406 + instruction_format: compact
405 407 # instruction: "Given a query, score the product for relevance"
406 408 # "rank products by given query" 比 "Given a query, score the product for relevance" 更好点
407 409 # instruction: "rank products by given query, category match first"
... ... @@ -433,6 +435,9 @@ services:
433 435 enforce_eager: false
434 436 infer_batch_size: 100
435 437 sort_by_doc_length: true
  438 + # 与 qwen3_vllm 同名项语义一致;默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
  439 + # instruction_format: standard
  440 + instruction_format: compact
436 441 instruction: "Rank products by query with category & style match prioritized"
437 442 qwen3_transformers:
438 443 model_name: "Qwen/Qwen3-Reranker-0.6B"
... ...
requirements_reranker_qwen3_transformers_packed.txt
1 1 # Isolated dependencies for qwen3_transformers_packed reranker backend.
  2 +#
  3 +# Keep this stack aligned with the validated CUDA runtime on our hosts.
  4 +# On this machine, torch 2.11.0 + cu130 fails CUDA init, while torch 2.10.0 + cu128 works.
  5 +# We also cap transformers <5 to stay on the same family as the working vLLM score env.
2 6  
3 7 -r requirements_reranker_qwen3_transformers.txt
  8 +torch==2.10.0
  9 +transformers>=4.51.0,<5
... ...
reranker/backends/qwen3_vllm.py
... ... @@ -45,6 +45,19 @@ def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]:
45 45 return unique_texts, position_to_unique
46 46  
47 47  
  48 +def _format_instruction__standard(instruction: str, query: str, doc: str) -> List[Dict[str, str]]:
  49 + """Build chat messages for one (query, doc) pair."""
  50 + return [
  51 + {
  52 + "role": "system",
  53 + "content": "Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".",
  54 + },
  55 + {
  56 + "role": "user",
  57 + "content": f"<Instruct>: {instruction}\n\n<Query>: {query}\n\n<Document>: {doc}",
  58 + },
  59 + ]
  60 +
48 61 def _format_instruction(instruction: str, query: str, doc: str) -> List[Dict[str, str]]:
49 62 """Build chat messages for one (query, doc) pair."""
50 63 return [
... ... @@ -54,11 +67,10 @@ def _format_instruction(instruction: str, query: str, doc: str) -> List[Dict[str
54 67 },
55 68 {
56 69 "role": "user",
57   - "content": f"<Query>: {query}\n\n<Document>: {doc}",
  70 + "content": f"<Instruct>: {instruction}\n\n<Query>: {query}\n\n<Document>: {doc}",
58 71 },
59 72 ]
60 73  
61   -
62 74 class Qwen3VLLMRerankerBackend:
63 75 """
64 76 Qwen3-Reranker-0.6B with vLLM inference.
... ... @@ -78,6 +90,17 @@ class Qwen3VLLMRerankerBackend:
78 90 self._config.get("instruction")
79 91 or "Given a query, score the product for relevance"
80 92 )
  93 + _fmt = str(self._config.get("instruction_format") or "compact").strip().lower()
  94 + if _fmt not in {"standard", "compact"}:
  95 + raise ValueError(
  96 + f"instruction_format must be 'standard' or 'compact', got {_fmt!r}"
  97 + )
  98 + self._instruction_format = _fmt
  99 + self._format_messages = (
  100 + _format_instruction__standard
  101 + if self._instruction_format == "standard"
  102 + else _format_instruction
  103 + )
81 104 infer_batch_size = os.getenv("RERANK_VLLM_INFER_BATCH_SIZE") or self._config.get("infer_batch_size", 64)
82 105 sort_by_doc_length = os.getenv("RERANK_VLLM_SORT_BY_DOC_LENGTH")
83 106 if sort_by_doc_length is None:
... ... @@ -95,13 +118,15 @@ class Qwen3VLLMRerankerBackend:
95 118 )
96 119  
97 120 logger.info(
98   - "[Qwen3_VLLM] Loading model %s (max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s)",
  121 + "[Qwen3_VLLM] Loading model %s (max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s, "
  122 + "instruction_format=%s)",
99 123 model_name,
100 124 max_model_len,
101 125 tensor_parallel_size,
102 126 gpu_memory_utilization,
103 127 dtype,
104 128 enable_prefix_caching,
  129 + self._instruction_format,
105 130 )
106 131  
107 132 self._llm = LLM(
... ... @@ -145,7 +170,7 @@ class Qwen3VLLMRerankerBackend:
145 170 ) -> List[TokensPrompt]:
146 171 """Build tokenized prompts for vLLM from (query, doc) pairs. Batch apply_chat_template."""
147 172 messages_batch = [
148   - _format_instruction(self._instruction, q, d) for q, d in pairs
  173 + self._format_messages(self._instruction, q, d) for q, d in pairs
149 174 ]
150 175 tokenized = self._tokenizer.apply_chat_template(
151 176 messages_batch,
... ... @@ -242,6 +267,7 @@ class Qwen3VLLMRerankerBackend:
242 267 "infer_batch_size": self._infer_batch_size,
243 268 "inference_batches": 0,
244 269 "sort_by_doc_length": self._sort_by_doc_length,
  270 + "instruction_format": self._instruction_format,
245 271 }
246 272  
247 273 # Deduplicate globally by text, keep mapping to original indices.
... ... @@ -289,6 +315,7 @@ class Qwen3VLLMRerankerBackend:
289 315 "normalize": normalize,
290 316 "infer_batch_size": self._infer_batch_size,
291 317 "inference_batches": inference_batches,
292   - "sort_by_doc_length": self._sort_by_doc_length
  318 + "sort_by_doc_length": self._sort_by_doc_length,
  319 + "instruction_format": self._instruction_format,
293 320 }
294 321 return output_scores, meta
... ...
reranker/backends/qwen3_vllm_score.py
... ... @@ -37,6 +37,8 @@ _DEFAULT_PREFIX = (
37 37 _DEFAULT_SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
38 38 _DEFAULT_QUERY_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
39 39 _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}"
  40 +# compact:与 qwen3_vllm._format_instruction 一致(instruction 作 system,user 内重复 Instruct)
  41 +_IM_USER_START = "<|im_end|>\n<|im_start|>user\n"
40 42  
41 43  
42 44 def _resolve_vllm_attention_config(config: Dict[str, Any]) -> Dict[str, Any] | None:
... ... @@ -99,6 +101,12 @@ class Qwen3VLLMScoreRerankerBackend:
99 101 self._config.get("instruction")
100 102 or "Given a query, score the product for relevance"
101 103 )
  104 + _fmt = str(self._config.get("instruction_format") or "standard").strip().lower()
  105 + if _fmt not in {"standard", "compact"}:
  106 + raise ValueError(
  107 + f"instruction_format must be 'standard' or 'compact', got {_fmt!r}"
  108 + )
  109 + self._instruction_format = _fmt
102 110 self._prefix = str(self._config.get("prompt_prefix") or _DEFAULT_PREFIX)
103 111 self._suffix = str(self._config.get("prompt_suffix") or _DEFAULT_SUFFIX)
104 112 self._query_template = str(self._config.get("query_template") or _DEFAULT_QUERY_TEMPLATE)
... ... @@ -142,7 +150,8 @@ class Qwen3VLLMScoreRerankerBackend:
142 150  
143 151 logger.info(
144 152 "[Qwen3_VLLM_SCORE] Loading model %s (LLM.score API, runner=%s, convert=%s, "
145   - "hf_overrides=%s, max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s)",
  153 + "hf_overrides=%s, max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s, "
  154 + "instruction_format=%s)",
146 155 model_name,
147 156 runner,
148 157 convert,
... ... @@ -152,6 +161,7 @@ class Qwen3VLLMScoreRerankerBackend:
152 161 gpu_memory_utilization,
153 162 dtype,
154 163 enable_prefix_caching,
  164 + self._instruction_format,
155 165 )
156 166  
157 167 # vLLM 0.17+ uses runner/convert instead of LLM(..., task="score"). With the official
... ... @@ -190,6 +200,14 @@ class Qwen3VLLMScoreRerankerBackend:
190 200 logger.info("[Qwen3_VLLM_SCORE] Model ready | model=%s", model_name)
191 201  
192 202 def _format_pair(self, query: str, doc: str) -> Tuple[str, str]:
  203 + if self._instruction_format == "compact":
  204 + # Align with reranker.backends.qwen3_vllm._format_instruction query/doc split for LLM.score().
  205 + compact_prefix = f"<|im_start|>system\n{self._instruction}{_IM_USER_START}"
  206 + q_text = (
  207 + f"{compact_prefix}<Instruct>: {self._instruction}\n\n<Query>: {query}\n"
  208 + )
  209 + d_text = f"\n<Document>: {doc}{self._suffix}"
  210 + return q_text, d_text
193 211 q_text = self._query_template.format(
194 212 prefix=self._prefix,
195 213 instruction=self._instruction,
... ... @@ -255,6 +273,7 @@ class Qwen3VLLMScoreRerankerBackend:
255 273 "infer_batch_size": self._infer_batch_size,
256 274 "inference_batches": 0,
257 275 "sort_by_doc_length": self._sort_by_doc_length,
  276 + "instruction_format": self._instruction_format,
258 277 }
259 278  
260 279 indexed_texts = [text for _, text in indexed]
... ... @@ -299,5 +318,6 @@ class Qwen3VLLMScoreRerankerBackend:
299 318 "infer_batch_size": self._infer_batch_size,
300 319 "inference_batches": inference_batches,
301 320 "sort_by_doc_length": self._sort_by_doc_length,
  321 + "instruction_format": self._instruction_format,
302 322 }
303 323 return output_scores, meta
... ...