Blame view

embeddings/qwen3_model.py 2.33 KB
7bfb9946   tangwang   向量化模块
1
  """
950a640e   tangwang   embeddings
2
  Qwen3-Embedding-0.6B local text embedding implementation.
7bfb9946   tangwang   向量化模块
3
4
5
6
7
8
9
10
11
  
  Internal model implementation used by the embedding service.
  """
  
  import threading
  from typing import List, Union
  
  import numpy as np
  from sentence_transformers import SentenceTransformer
efd435cf   tangwang   tei性能调优:
12
  import torch
7bfb9946   tangwang   向量化模块
13
14
  
  
950a640e   tangwang   embeddings
15
  class Qwen3TextModel:
      """
      Process-wide singleton text encoder backed by a local SentenceTransformer
      model (Qwen3-Embedding-0.6B by default).

      The underlying model is loaded the first time the class is instantiated;
      every later ``Qwen3TextModel(...)`` call returns that same instance, so a
      ``model_id`` passed after the first successful construction is ignored.
      Encoding calls are serialised through a per-instance lock.
      """

      # The one shared instance; stays ``None`` until a load fully succeeds.
      _instance = None
      # Guards creation of ``_instance`` across threads.
      _lock = threading.Lock()

      def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
          """
          Return the shared instance, loading the model on first use.

          Args:
              model_id: Model name or path handed to ``SentenceTransformer``.
                  Only honoured by the call that actually creates the instance.

          Raises:
              Whatever ``SentenceTransformer`` raises while loading. In that case
              no instance is cached, so a later call attempts the load again.
          """
          with cls._lock:
              if cls._instance is None:
                  # Build on a local first: if loading fails we must not leave a
                  # half-initialised object published as the singleton.
                  instance = super().__new__(cls)
                  # NOTE: trust_remote_code=True lets the model repository run
                  # its own Python code at load time; only use trusted model ids.
                  instance.model = SentenceTransformer(model_id, trust_remote_code=True)
                  # Device the model was last moved to; None = never moved yet.
                  instance._current_device = None
                  # Allows a single in-flight encode() call at a time.
                  instance._encode_lock = threading.Lock()
                  cls._instance = instance
          return cls._instance

      def _ensure_device(self, device: Union[str, None]) -> str:
          """
          Normalise ``device`` and move the model there if it is not already.

          ``None``, an empty string or a blank string mean ``"cpu"``. ``"gpu"``
          is accepted as an alias for ``"cuda"``. Any CUDA target (``"cuda"`` or
          ``"cuda:<index>"``) falls back to ``"cpu"`` when
          ``torch.cuda.is_available()`` is false.

          Called by ``encode`` while it holds ``_encode_lock``.

          Args:
              device: Requested device name, case-insensitive.

          Returns:
              The device name the model resides on after the call.
          """
          target = (device or "").strip().lower() or "cpu"
          if target == "gpu":
              target = "cuda"
          wants_cuda = target == "cuda" or target.startswith("cuda:")
          if wants_cuda and not torch.cuda.is_available():
              target = "cpu"
          # Skip the .to() hop when the model is already on the target device.
          if target != self._current_device:
              self.model = self.model.to(target)
              self._current_device = target
          return target

      def encode(
          self,
          sentences: Union[str, List[str]],
          normalize_embeddings: bool = True,
          device: str = "cuda",
          batch_size: int = 32,
      ) -> np.ndarray:
          """
          Embed one sentence or a list of sentences.

          Args:
              sentences: A single string or a list of strings to embed.
              normalize_embeddings: Forwarded to ``SentenceTransformer.encode``.
              device: Requested device; resolved by ``_ensure_device`` (so it may
                  fall back to CPU when CUDA is unavailable).
              batch_size: Forwarded to ``SentenceTransformer.encode``.

          Returns:
              Whatever ``SentenceTransformer.encode`` returns for the input
              (annotated as ``np.ndarray``).
          """
          # SentenceTransformer + CUDA inference is not thread-safe in our usage;
          # keep one in-flight encode call while avoiding repeated .to(device) hops.
          with self._encode_lock:
              run_device = self._ensure_device(device)
              return self.model.encode(
                  sentences,
                  normalize_embeddings=normalize_embeddings,
                  device=run_device,
                  show_progress_bar=False,
                  batch_size=batch_size,
              )

      def encode_batch(
          self,
          texts: List[str],
          batch_size: int = 32,
          device: str = "cuda",
          normalize_embeddings: bool = True,
      ) -> np.ndarray:
          """
          Embed a list of texts; a list-only convenience wrapper around ``encode``.

          Args:
              texts: Strings to embed.
              batch_size: Forwarded to ``encode``.
              device: Forwarded to ``encode``.
              normalize_embeddings: Forwarded to ``encode``.

          Returns:
              The result of ``encode`` for ``texts``.
          """
          return self.encode(
              texts,
              batch_size=batch_size,
              device=device,
              normalize_embeddings=normalize_embeddings,
          )