"""
Qwen3-Embedding-0.6B local text embedding implementation.

Internal model implementation used by the embedding service.
"""
import threading
from typing import List, Union

import numpy as np
from sentence_transformers import SentenceTransformer


class Qwen3TextModel:
    """Thread-safe singleton text encoder using Qwen3-Embedding-0.6B (local inference)."""

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
        # Double-checked locking: skip the lock on the common already-created path.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    # Load the model BEFORE publishing the instance. The previous
                    # order assigned cls._instance first, so a failed model load
                    # left a permanently broken singleton (no .model attribute)
                    # that every later call would return.
                    model = SentenceTransformer(model_id, trust_remote_code=True)
                    instance = super().__new__(cls)
                    instance.model = model
                    cls._instance = instance
        return cls._instance

    def encode(
        self,
        sentences: Union[str, List[str]],
        normalize_embeddings: bool = True,
        device: str = "cuda",
        batch_size: int = 32,
    ) -> np.ndarray:
        """Encode text into embedding vectors.

        Args:
            sentences: A single string or a list of strings to embed.
            normalize_embeddings: L2-normalize the output vectors when True.
            device: Target device: "cuda" (or its alias "gpu") or "cpu".
                Falls back to CPU when CUDA is unavailable or has < 1 GiB free.
            batch_size: Encoding batch size.

        Returns:
            np.ndarray of embeddings produced by the underlying model.

        Raises:
            Exception: Re-raised from the underlying model when encoding fails
                and a CPU retry is not possible (already on CPU) or also fails.
        """
        if device == "gpu":  # accept "gpu" as an alias for CUDA
            device = "cuda"

        # Try the requested device; fall back to CPU when CUDA is unavailable
        # or does not have enough free memory.
        try:
            if device == "cuda":
                import torch

                if torch.cuda.is_available():
                    # Query device 0 on BOTH sides of the subtraction. The
                    # original called memory_allocated() with no argument,
                    # which reads the *current* device and can disagree with
                    # get_device_properties(0) on multi-GPU hosts.
                    # NOTE(review): memory_allocated only counts this process's
                    # PyTorch allocations, so this is an approximation of free
                    # memory, preserved from the original heuristic.
                    free_memory = (
                        torch.cuda.get_device_properties(0).total_memory
                        - torch.cuda.memory_allocated(0)
                    )
                    if free_memory < 1024 * 1024 * 1024:  # < 1 GiB free
                        device = "cpu"
                else:
                    device = "cpu"

            # Move the (shared singleton) model, then encode on that device.
            self.model = self.model.to(device)
            return self.model.encode(
                sentences,
                normalize_embeddings=normalize_embeddings,
                device=device,
                show_progress_bar=False,
                batch_size=batch_size,
            )
        except Exception:
            # Last-resort CPU retry; if we were already targeting CPU, re-raise.
            if device != "cpu":
                self.model = self.model.to("cpu")
                return self.model.encode(
                    sentences,
                    normalize_embeddings=normalize_embeddings,
                    device="cpu",
                    show_progress_bar=False,
                    batch_size=batch_size,
                )
            raise

    def encode_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        device: str = "cuda",
        normalize_embeddings: bool = True,
    ) -> np.ndarray:
        """Encode a batch of texts; thin convenience wrapper around encode()."""
        return self.encode(
            texts,
            batch_size=batch_size,
            device=device,
            normalize_embeddings=normalize_embeddings,
        )