# qwen3_model.py

"""
Qwen3-Embedding-0.6B local text embedding implementation.

Internal model implementation used by the embedding service.
"""

import threading
from typing import List, Union

import numpy as np
from sentence_transformers import SentenceTransformer


class Qwen3TextModel:
    """
    Singleton text encoder using Qwen3-Embedding-0.6B (local inference).

    Construction is serialized by a class-level lock, so concurrent callers
    share one loaded model. Every ``Qwen3TextModel(...)`` call returns the
    same object; ``model_id`` is only honoured by the call that actually
    loads the model and is ignored afterwards.

    NOTE: ``encode`` moves the shared model to the requested device without
    holding a lock, so concurrent calls that request *different* devices
    can interfere with each other.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
        """
        Return the shared instance, loading the model on first use.

        Args:
            model_id: Model name or path handed to ``SentenceTransformer``.
                Only used when no instance exists yet.

        Raises:
            Whatever ``SentenceTransformer`` raises while loading. In that
            case no instance is cached, so a later call retries the load.
        """
        with cls._lock:
            if cls._instance is None:
                # Build the object completely before publishing it: if the
                # model load fails, `_instance` must stay None instead of
                # pointing at an object that has no `.model` attribute.
                instance = super().__new__(cls)
                # NOTE(review): trust_remote_code=True lets code shipped with
                # the model repository run locally; only pass trusted ids.
                instance.model = SentenceTransformer(model_id, trust_remote_code=True)
                cls._instance = instance
        return cls._instance

    def encode(
        self,
        sentences: Union[str, List[str]],
        normalize_embeddings: bool = True,
        device: str = "cuda",
        batch_size: int = 32,
    ) -> np.ndarray:
        """
        Embed one sentence or a list of sentences.

        Args:
            sentences: A single string or a list of strings to embed.
            normalize_embeddings: Forwarded to the underlying model's
                ``encode``.
            device: Target device. ``"gpu"`` is accepted as an alias for
                ``"cuda"``.
            batch_size: Forwarded to the underlying model's ``encode``.

        Returns:
            Whatever the underlying model's ``encode`` returns (annotated
            as ``np.ndarray``).
        """
        if device == "gpu":
            device = "cuda"
        # The model is shared by all callers, so this move affects them too.
        self.model = self.model.to(device)
        embeddings = self.model.encode(
            sentences,
            normalize_embeddings=normalize_embeddings,
            device=device,
            show_progress_bar=False,
            batch_size=batch_size,
        )
        return embeddings

    def encode_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        device: str = "cuda",
        normalize_embeddings: bool = True,
    ) -> np.ndarray:
        """
        Embed a list of texts; a thin wrapper that delegates to ``encode``.

        Args:
            texts: Strings to embed.
            batch_size: Forwarded to ``encode``.
            device: Forwarded to ``encode`` (``"gpu"`` alias applies there).
            normalize_embeddings: Forwarded to ``encode``.

        Returns:
            The result of ``encode`` for ``texts``.
        """
        return self.encode(
            texts,
            batch_size=batch_size,
            device=device,
            normalize_embeddings=normalize_embeddings,
        )