"""
Qwen3-Embedding-0.6B local text embedding implementation.

Internal model implementation used by the embedding service.
"""

import threading
from typing import List, Union

import numpy as np
from sentence_transformers import SentenceTransformer
import torch


class Qwen3TextModel:
    """
    Process-wide singleton text encoder backed by a local SentenceTransformer
    model (Qwen3-Embedding-0.6B by default).

    The model is loaded once, on the first construction. Every later
    ``Qwen3TextModel(...)`` call returns that same instance; a different
    ``model_id`` passed on a later call is ignored.

    Construction is serialized by a class-level lock, and ``encode`` calls are
    serialized by a per-instance lock, so at most one encode runs at a time.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_id: str = "Qwen/Qwen3-Embedding-0.6B"):
        """
        Return the shared instance, loading the model on first use.

        Args:
            model_id: Model name or path handed to ``SentenceTransformer``.
                Only honoured by the call that actually creates the singleton.

        Raises:
            Whatever ``SentenceTransformer`` raises while loading. A failed
            load leaves no singleton behind, so a later call retries.
        """
        with cls._lock:
            if cls._instance is None:
                # Build on a local first and publish only once every attribute
                # is set: if the model load raises, ``cls._instance`` must stay
                # None instead of pointing at a half-initialised object.
                instance = super().__new__(cls)
                # NOTE(review): trust_remote_code=True lets the model repository
                # supply code that runs locally — only use trusted model ids.
                instance.model = SentenceTransformer(model_id, trust_remote_code=True)
                instance._current_device = None
                instance._encode_lock = threading.Lock()
                cls._instance = instance
        return cls._instance

    def _ensure_device(self, device: str) -> str:
        """
        Normalize ``device`` and move the model there if it is not already.

        ``None``/empty means ``"cpu"``; ``"gpu"`` is an alias for ``"cuda"``.
        Any CUDA target (``"cuda"``, ``"cuda:0"``, ...) falls back to ``"cpu"``
        when ``torch.cuda.is_available()`` is false.

        Callers are expected to hold ``self._encode_lock``; this method does no
        locking of its own.

        Returns:
            The device string the model is on after the call.
        """
        target = (device or "cpu").strip().lower()
        if target == "gpu":
            target = "cuda"
        # Covers indexed devices such as "cuda:0" too, so a CPU-only host never
        # attempts a CUDA transfer.
        if target.startswith("cuda") and not torch.cuda.is_available():
            target = "cpu"
        if target != self._current_device:
            self.model = self.model.to(target)
            self._current_device = target
        return target

    def encode(
        self,
        sentences: Union[str, List[str]],
        normalize_embeddings: bool = True,
        device: str = "cuda",
        batch_size: int = 32,
    ) -> np.ndarray:
        """
        Embed one sentence or a list of sentences.

        Args:
            sentences: A single string or a list of strings to embed.
            normalize_embeddings: Forwarded to ``SentenceTransformer.encode``.
            device: Requested device (``"cpu"``, ``"cuda"``, ``"gpu"``, ...);
                resolved by ``_ensure_device``, which may fall back to CPU.
            batch_size: Forwarded to ``SentenceTransformer.encode``.

        Returns:
            Whatever ``SentenceTransformer.encode`` returns for these arguments
            (annotated as ``np.ndarray``).
        """
        # SentenceTransformer + CUDA inference is not thread-safe in our usage;
        # keep one in-flight encode call while avoiding repeated .to(device) hops.
        with self._encode_lock:
            run_device = self._ensure_device(device)
            return self.model.encode(
                sentences,
                normalize_embeddings=normalize_embeddings,
                device=run_device,
                show_progress_bar=False,
                batch_size=batch_size,
            )