# qwen3_model.py

"""
Qwen3-Embedding-0.6B local text embedding implementation.

Internal model implementation used by the embedding service.
"""

import threading
from typing import List, Union

import numpy as np
from sentence_transformers import SentenceTransformer


class Qwen3TextModel(object):
    """
    Singleton text encoder using Qwen3-Embedding-0.6B (local inference).

    Instance creation is serialized by a class-level lock, so concurrent
    constructor calls share a single loaded model. The first successful
    construction fixes the model: a ``model_id`` passed to any later call is
    ignored. ``encode`` itself does not take the lock.
    """

    _instance = None
    _lock = threading.Lock()

    # Estimated free CUDA memory (bytes) below which encode() uses the CPU.
    _MIN_FREE_CUDA_BYTES = 1024 * 1024 * 1024  # 1GB

    def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
        """
        Return the shared instance, loading the model on first use.

        Args:
            model_id: Model name or path handed to ``SentenceTransformer``.
                Only used by the call that actually creates the singleton.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
                In that case nothing is cached, so a later call retries the
                load instead of receiving an instance without ``model``.
        """
        with cls._lock:
            if cls._instance is None:
                instance = super().__new__(cls)
                instance.model = SentenceTransformer(model_id, trust_remote_code=True)
                # Publish only after the model loaded successfully.
                cls._instance = instance
        return cls._instance

    @classmethod
    def _resolve_device(cls, device: str) -> str:
        """Downgrade a ``"cuda"`` request to ``"cpu"`` when CUDA is unusable."""
        if device != "cuda":
            return device

        import torch

        if not torch.cuda.is_available():
            return "cpu"

        # Query the same (current) device for both numbers; model.to("cuda")
        # targets the current device as well.
        index = torch.cuda.current_device()
        free_memory = (
            torch.cuda.get_device_properties(index).total_memory
            - torch.cuda.memory_allocated(index)
        )
        if free_memory < cls._MIN_FREE_CUDA_BYTES:
            return "cpu"
        return "cuda"

    def _encode_on(
        self,
        sentences: Union[str, List[str]],
        device: str,
        normalize_embeddings: bool,
        batch_size: int,
    ):
        """Move the model to ``device`` and run one encode pass there."""
        self.model = self.model.to(device)
        return self.model.encode(
            sentences,
            normalize_embeddings=normalize_embeddings,
            device=device,
            show_progress_bar=False,
            batch_size=batch_size,
        )

    def encode(
        self,
        sentences: Union[str, List[str]],
        normalize_embeddings: bool = True,
        device: str = "cuda",
        batch_size: int = 32,
    ) -> np.ndarray:
        """
        Encode one sentence or a list of sentences into embeddings.

        Args:
            sentences: A single string or a list of strings.
            normalize_embeddings: Forwarded to the model's ``encode``.
            device: Target device; ``"gpu"`` is accepted as an alias for
                ``"cuda"``. A ``"cuda"`` request is downgraded to ``"cpu"``
                when CUDA is unavailable or less than 1GB is estimated free.
            batch_size: Forwarded to the model's ``encode``.

        Returns:
            The value returned by the underlying model's ``encode``.

        Raises:
            Exception: Re-raised when encoding fails on the CPU. A failure on
                any other device is logged and retried once on the CPU.
        """
        if device == "gpu":
            device = "cuda"

        # Try requested device, fallback to CPU if CUDA is unavailable/insufficient.
        target = device
        try:
            # Resolution stays inside the try so a failing torch import or
            # CUDA query also takes the CPU fallback below.
            target = self._resolve_device(device)
            return self._encode_on(sentences, target, normalize_embeddings, batch_size)
        except Exception:
            if target == "cpu":
                raise

            import logging

            # Keep the original failure visible instead of swallowing it.
            logging.getLogger(__name__).warning(
                "Encoding on device %r failed; retrying on CPU.",
                target,
                exc_info=True,
            )
            return self._encode_on(sentences, "cpu", normalize_embeddings, batch_size)

    def encode_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        device: str = "cuda",
        normalize_embeddings: bool = True,
    ) -> np.ndarray:
        """
        Encode a list of texts; thin wrapper around :meth:`encode`.

        Args:
            texts: Strings to encode.
            batch_size: Forwarded to :meth:`encode`.
            device: Forwarded to :meth:`encode`.
            normalize_embeddings: Forwarded to :meth:`encode`.

        Returns:
            The value returned by :meth:`encode`.
        """
        return self.encode(
            texts,
            batch_size=batch_size,
            device=device,
            normalize_embeddings=normalize_embeddings,
        )