Blame view

embeddings/qwen3_model.py 2.64 KB
7bfb9946   tangwang   向量化模块
1
  """
950a640e   tangwang   embeddings
2
  Qwen3-Embedding-0.6B local text embedding implementation.
7bfb9946   tangwang   向量化模块
3
4
5
6
7
8
9
10
11
  
  Internal model implementation used by the embedding service.
  """
  
  import threading
  from typing import List, Union
  
  import numpy as np
  from sentence_transformers import SentenceTransformer
7bfb9946   tangwang   向量化模块
12
13
  
  
950a640e   tangwang   embeddings
14
  class Qwen3TextModel(object):
      """
      Process-wide singleton text encoder using Qwen3-Embedding-0.6B (local inference).

      Construction is serialized by a class-level lock, so the underlying
      ``SentenceTransformer`` is loaded at most once. Only the first successful
      construction uses ``model_id``; later calls return the existing instance
      and ignore the argument.

      NOTE(review): ``encode`` moves the shared model between devices without
      holding a lock, so only instance creation is synchronized here. Confirm
      that callers do not run concurrent encodes targeting different devices.
      """

      _instance = None
      _lock = threading.Lock()

      # Below this many bytes of estimated free CUDA memory, encode on CPU (1 GiB).
      _MIN_FREE_CUDA_BYTES = 1024 * 1024 * 1024

      def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
          """
          Return the shared instance, loading the model on first use.

          Args:
              model_id: Model identifier passed to ``SentenceTransformer``.
                  Ignored once an instance already exists.

          Raises:
              Exception: Whatever ``SentenceTransformer`` raises while loading.
                  In that case no instance is cached, so a later call retries.
          """
          with cls._lock:
              if cls._instance is None:
                  instance = super(Qwen3TextModel, cls).__new__(cls)
                  instance.model = SentenceTransformer(model_id, trust_remote_code=True)
                  # Publish only after the model loaded; otherwise a failed load
                  # would leave a singleton without a ``model`` attribute behind.
                  cls._instance = instance
          return cls._instance

      @classmethod
      def _resolve_device(cls, device: str) -> str:
          """Return ``device``, downgraded to ``"cpu"`` when CUDA is unusable."""
          if device != "cuda":
              return device

          import torch

          if not torch.cuda.is_available():
              return "cpu"

          # NOTE(review): this estimates headroom from device 0's total memory
          # minus what torch reports as allocated; presumably memory held by
          # other processes is not counted - confirm this is acceptable.
          free_memory = (
              torch.cuda.get_device_properties(0).total_memory
              - torch.cuda.memory_allocated()
          )
          if free_memory < cls._MIN_FREE_CUDA_BYTES:
              return "cpu"
          return device

      def _encode_on(
          self,
          sentences: Union[str, List[str]],
          normalize_embeddings: bool,
          device: str,
          batch_size: int,
      ) -> np.ndarray:
          """Move the shared model to ``device`` and encode ``sentences`` there."""
          self.model = self.model.to(device)
          return self.model.encode(
              sentences,
              normalize_embeddings=normalize_embeddings,
              device=device,
              show_progress_bar=False,
              batch_size=batch_size,
          )

      def encode(
          self,
          sentences: Union[str, List[str]],
          normalize_embeddings: bool = True,
          device: str = "cuda",
          batch_size: int = 32,
      ) -> np.ndarray:
          """
          Encode one sentence or a list of sentences into embeddings.

          Args:
              sentences: A single string or a list of strings to embed.
              normalize_embeddings: Forwarded to the model's ``encode``.
              device: Target device; ``"gpu"`` is accepted as an alias for
                  ``"cuda"``. A ``"cuda"`` request is downgraded to ``"cpu"``
                  when CUDA is unavailable or estimated free memory is under 1 GiB.
              batch_size: Forwarded to the model's ``encode``.

          Returns:
              The value returned by the underlying model's ``encode``.

          Raises:
              Exception: Re-raised unchanged when encoding fails on CPU. A failure
                  on any other device triggers a single retry on CPU instead.
          """
          if device == "gpu":
              device = "cuda"

          # Try requested device, fallback to CPU if CUDA is unavailable/insufficient.
          try:
              # Resolved inside the ``try`` so that a failing torch import or
              # CUDA query also ends up on the CPU fallback path below.
              device = self._resolve_device(device)
              return self._encode_on(sentences, normalize_embeddings, device, batch_size)
          except Exception:
              if device == "cpu":
                  raise

              import logging

              # Keep the original failure visible instead of silently retrying.
              logging.getLogger(__name__).warning(
                  "Encoding on device %r failed; retrying on CPU", device, exc_info=True
              )
              return self._encode_on(sentences, normalize_embeddings, "cpu", batch_size)

      def encode_batch(
          self,
          texts: List[str],
          batch_size: int = 32,
          device: str = "cuda",
          normalize_embeddings: bool = True,
      ) -> np.ndarray:
          """
          Encode a list of texts; a thin wrapper that delegates to ``encode``.

          Args:
              texts: Strings to embed.
              batch_size: Forwarded to ``encode``.
              device: Forwarded to ``encode``.
              normalize_embeddings: Forwarded to ``encode``.

          Returns:
              The value returned by ``encode`` for ``texts``.
          """
          return self.encode(
              texts,
              batch_size=batch_size,
              device=device,
              normalize_embeddings=normalize_embeddings,
          )
7bfb9946   tangwang   向量化模块