# suggestion/builder.py

"""
Suggestion index builder (Phase 2).
Capabilities:
- Full rebuild to versioned index
- Atomic alias publish
- Incremental update from query logs with watermark
"""
import json
import logging
import math
import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterator, List, Optional, Tuple
from sqlalchemy import text
from config.loader import get_app_config
from config.tenant_config_loader import get_tenant_config_loader
from suggestion.mapping import build_suggestion_mapping
from utils.es_client import ESClient
logger = logging.getLogger(__name__)
def _index_prefix() -> str:
    """Return the configured index namespace prefix, or "" when none is set."""
    namespace = get_app_config().runtime.index_namespace
    return namespace if namespace else ""
def get_suggestion_alias_name(tenant_id: str) -> str:
    """Return the alias name through which a tenant's suggestion index is read."""
    prefix = _index_prefix()
    return f"{prefix}search_suggestions_tenant_{tenant_id}_current"
def get_suggestion_versioned_index_name(tenant_id: str, build_at: Optional[datetime] = None) -> str:
    """Return a versioned index name whose suffix is a second-resolution timestamp.

    Uses ``build_at`` when given, otherwise the current UTC time.
    """
    moment = build_at if build_at else datetime.now(timezone.utc)
    stamp = moment.strftime("%Y%m%d%H%M%S")
    prefix = _index_prefix()
    return f"{prefix}search_suggestions_tenant_{tenant_id}_v{stamp}"
def get_suggestion_versioned_index_pattern(tenant_id: str) -> str:
    """Return the wildcard pattern matching every versioned suggestion index of a tenant."""
    prefix = _index_prefix()
    return f"{prefix}search_suggestions_tenant_{tenant_id}_v*"
def get_suggestion_meta_index_name() -> str:
    """Return the name of the index that stores per-tenant build metadata."""
    prefix = _index_prefix()
    return f"{prefix}search_suggestions_meta"
@dataclass
class SuggestionCandidate:
    """Accumulator for one suggestion text in one language during a full build."""

    # Display text and its normalized form.
    text: str
    text_norm: str
    lang: str
    # Names of every source that contributed to this candidate.
    sources: set = field(default_factory=set)
    # Distinct product ids that contributed via title / qanchor.
    title_spu_ids: set = field(default_factory=set)
    qanchor_spu_ids: set = field(default_factory=set)
    query_count_7d: int = 0
    query_count_30d: int = 0
    lang_confidence: float = 1.0
    lang_source: str = "default"
    lang_conflict: bool = False

    def add_product(self, source: str, spu_id: str) -> None:
        """Record ``source`` and, for "title"/"qanchor", remember the product id."""
        self.sources.add(source)
        bucket = {
            "title": self.title_spu_ids,
            "qanchor": self.qanchor_spu_ids,
        }.get(source)
        if bucket is not None:
            bucket.add(spu_id)

    def add_query_log(self, is_7d: bool) -> None:
        """Count one query-log hit; the 7-day counter only moves when ``is_7d`` is true."""
        self.sources.add("query_log")
        self.query_count_30d += 1
        self.query_count_7d += 1 if is_7d else 0
@dataclass
class QueryDelta:
    """Query-log counts gathered for one (lang, normalized text) pair of a tenant."""

    tenant_id: str
    lang: str
    # Display text and its normalized form.
    text: str
    text_norm: str
    # Increments to apply to the stored 7-day / 30-day query counters.
    delta_7d: int = 0
    delta_30d: int = 0
    # Language-resolution details carried along with the counts.
    lang_confidence: float = 1.0
    lang_source: str = "default"
    lang_conflict: bool = False
class SuggestionIndexBuilder:
    """Build and update suggestion index."""
    def __init__(self, es_client: ESClient, db_engine: Any):
        self.es_client = es_client
        self.db_engine = db_engine
    @staticmethod
    def _to_utc(dt: Any) -> Optional[datetime]:
        if dt is None:
            return None
        if isinstance(dt, datetime):
            if dt.tzinfo is None:
                return dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc)
        return None
    @staticmethod
    def _normalize_text(value: str) -> str:
        text_value = unicodedata.normalize("NFKC", (value or "")).strip().lower()
        text_value = re.sub(r"\s+", " ", text_value)
        return text_value
    @staticmethod
    def _prepare_title_for_suggest(title: str, max_len: int = 120) -> str:
        """
        Keep title-derived suggestions concise:
        - keep raw title when short enough
        - for long titles, keep the leading phrase before common separators
        - fallback to hard truncate
        """
        raw = str(title or "").strip()
        if not raw:
            return ""
        if len(raw) <= max_len:
            return raw
        head = re.split(r"[，,;；|/\\\\(（\\[【]", raw, maxsplit=1)[0].strip()
        if 1 < len(head) <= max_len:
            return head
        truncated = raw[:max_len].rstrip(" ，,;；|/\\\\-—–()（）[]【】")
        return truncated or raw[:max_len]
    @staticmethod
    def _split_qanchors(value: Any) -> List[str]:
        if value is None:
            return []
        if isinstance(value, list):
            return [str(x).strip() for x in value if str(x).strip()]
        raw = str(value).strip()
        if not raw:
            return []
        parts = re.split(r"[,;|/\n\t]+", raw)
        out = [p.strip() for p in parts if p and p.strip()]
        if not out:
            return [raw]
        return out
    @staticmethod
    def _looks_noise(text_value: str) -> bool:
        if not text_value:
            return True
        if len(text_value) > 120:
            return True
        if re.fullmatch(r"[\W_]+", text_value):
            return True
        return False
    @staticmethod
    def _normalize_lang(lang: Optional[str]) -> Optional[str]:
        if not lang:
            return None
        token = str(lang).strip().lower().replace("-", "_")
        if not token:
            return None
        if token in {"zh_tw", "pt_br"}:
            return token
        return token.split("_")[0]
    @staticmethod
    def _parse_request_params_language(raw: Any) -> Optional[str]:
        if raw is None:
            return None
        if isinstance(raw, dict):
            return raw.get("language")
        text_raw = str(raw).strip()
        if not text_raw:
            return None
        try:
            obj = json.loads(text_raw)
            if isinstance(obj, dict):
                return obj.get("language")
        except Exception:
            return None
        return None
    @staticmethod
    def _detect_script_language(query: str) -> Tuple[Optional[str], float, str]:
        if re.search(r"[\u4e00-\u9fff]", query):
            return "zh", 0.98, "script"
        if re.search(r"[\u0600-\u06FF]", query):
            return "ar", 0.98, "script"
        if re.search(r"[\u0400-\u04FF]", query):
            return "ru", 0.95, "script"
        if re.search(r"[\u0370-\u03FF]", query):
            return "el", 0.95, "script"
        if re.search(r"[a-zA-Z]", query):
            return "en", 0.55, "model"
        return None, 0.0, "default"
    def _resolve_query_language(
        self,
        query: str,
        log_language: Optional[str],
        request_params: Any,
        index_languages: List[str],
        primary_language: str,
    ) -> Tuple[str, float, str, bool]:
        """Resolve lang with priority: log field > request_params > script/model."""
        langs_set = set(index_languages or [])
        primary = self._normalize_lang(primary_language) or "en"
        if primary not in langs_set and langs_set:
            primary = index_languages[0]
        log_lang = self._normalize_lang(log_language)
        req_lang = self._normalize_lang(self._parse_request_params_language(request_params))
        conflict = bool(log_lang and req_lang and log_lang != req_lang)
        if log_lang and (not langs_set or log_lang in langs_set):
            return log_lang, 1.0, "log_field", conflict
        if req_lang and (not langs_set or req_lang in langs_set):
            return req_lang, 1.0, "request_params", conflict
        detected_lang, conf, source = self._detect_script_language(query)
        if detected_lang and (not langs_set or detected_lang in langs_set):
            return detected_lang, conf, source, conflict
        return primary, 0.3, "default", conflict
    @staticmethod
    def _compute_rank_score(query_count_30d: int, query_count_7d: int, qanchor_doc_count: int, title_doc_count: int) -> float:
        return (
            1.8 * math.log1p(max(query_count_30d, 0))
            + 1.2 * math.log1p(max(query_count_7d, 0))
            + 1.0 * math.log1p(max(qanchor_doc_count, 0))
            + 0.6 * math.log1p(max(title_doc_count, 0))
        )
    @classmethod
    def _compute_rank_score_from_candidate(cls, c: SuggestionCandidate) -> float:
        return cls._compute_rank_score(
            query_count_30d=c.query_count_30d,
            query_count_7d=c.query_count_7d,
            qanchor_doc_count=len(c.qanchor_spu_ids),
            title_doc_count=len(c.title_spu_ids),
        )
    def _iter_products(self, tenant_id: str, batch_size: int = 500) -> Iterator[Dict[str, Any]]:
        """Stream product docs from tenant index using search_after."""
        from indexer.mapping_generator import get_tenant_index_name
        index_name = get_tenant_index_name(tenant_id)
        search_after: Optional[List[Any]] = None
        while True:
            body: Dict[str, Any] = {
                "size": batch_size,
                "_source": ["id", "spu_id", "title", "qanchors"],
                "sort": [
                    {"spu_id": {"order": "asc", "missing": "_last"}},
                ],
                "query": {"match_all": {}},
            }
            if search_after is not None:
                body["search_after"] = search_after
            resp = self.es_client.client.search(index=index_name, body=body)
            hits = resp.get("hits", {}).get("hits", []) or []
            if not hits:
                break
            for hit in hits:
                yield hit
            search_after = hits[-1].get("sort")
            if len(hits) < batch_size:
                break
    def _iter_query_log_rows(
        self,
        tenant_id: str,
        since: datetime,
        until: datetime,
        fetch_size: int = 2000,
    ) -> Iterator[Any]:
        """Stream search logs from MySQL with bounded time range."""
        query_sql = text(
            """
            SELECT query, language, request_params, create_time
            FROM shoplazza_search_log
            WHERE tenant_id = :tenant_id
              AND deleted = 0
              AND query IS NOT NULL
              AND query <> ''
              AND create_time >= :since_time
              AND create_time < :until_time
            ORDER BY create_time ASC
            """
        )
        with self.db_engine.connect().execution_options(stream_results=True) as conn:
            result = conn.execute(
                query_sql,
                {
                    "tenant_id": int(tenant_id),
                    "since_time": since,
                    "until_time": until,
                },
            )
            while True:
                rows = result.fetchmany(fetch_size)
                if not rows:
                    break
                for row in rows:
                    yield row
    def _ensure_meta_index(self) -> str:
        meta_index = get_suggestion_meta_index_name()
        if self.es_client.index_exists(meta_index):
            return meta_index
        body = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "refresh_interval": "1s",
            },
            "mappings": {
                "properties": {
                    "tenant_id": {"type": "keyword"},
                    "active_alias": {"type": "keyword"},
                    "active_index": {"type": "keyword"},
                    "last_full_build_at": {"type": "date"},
                    "last_incremental_build_at": {"type": "date"},
                    "last_incremental_watermark": {"type": "date"},
                    "updated_at": {"type": "date"},
                }
            },
        }
        if not self.es_client.create_index(meta_index, body):
            raise RuntimeError(f"Failed to create suggestion meta index: {meta_index}")
        return meta_index
    def _get_meta(self, tenant_id: str) -> Dict[str, Any]:
        meta_index = self._ensure_meta_index()
        try:
            resp = self.es_client.client.get(index=meta_index, id=str(tenant_id))
            return resp.get("_source", {}) or {}
        except Exception:
            return {}
    def _upsert_meta(self, tenant_id: str, patch: Dict[str, Any]) -> None:
        meta_index = self._ensure_meta_index()
        current = self._get_meta(tenant_id)
        now_iso = datetime.now(timezone.utc).isoformat()
        merged = {
            "tenant_id": str(tenant_id),
            **current,
            **patch,
            "updated_at": now_iso,
        }
        self.es_client.client.index(index=meta_index, id=str(tenant_id), document=merged, refresh="wait_for")
    def _cleanup_old_versions(self, tenant_id: str, keep_versions: int, protected_indices: Optional[List[str]] = None) -> List[str]:
        if keep_versions < 1:
            keep_versions = 1
        protected = set(protected_indices or [])
        pattern = get_suggestion_versioned_index_pattern(tenant_id)
        all_indices = self.es_client.list_indices(pattern)
        if len(all_indices) <= keep_versions:
            return []
        # Names are timestamp-ordered by suffix; keep newest N.
        kept = set(sorted(all_indices)[-keep_versions:])
        dropped: List[str] = []
        for idx in sorted(all_indices):
            if idx in kept or idx in protected:
                continue
            if self.es_client.delete_index(idx):
                dropped.append(idx)
        return dropped
    def _publish_alias(self, tenant_id: str, index_name: str, keep_versions: int = 2) -> Dict[str, Any]:
        alias_name = get_suggestion_alias_name(tenant_id)
        current_indices = self.es_client.get_alias_indices(alias_name)
        actions: List[Dict[str, Any]] = []
        for idx in current_indices:
            actions.append({"remove": {"index": idx, "alias": alias_name}})
        actions.append({"add": {"index": index_name, "alias": alias_name}})
        if not self.es_client.update_aliases(actions):
            raise RuntimeError(f"Failed to publish alias {alias_name} -> {index_name}")
        dropped = self._cleanup_old_versions(
            tenant_id=tenant_id,
            keep_versions=keep_versions,
            protected_indices=[index_name],
        )
        self._upsert_meta(
            tenant_id,
            {
                "active_alias": alias_name,
                "active_index": index_name,
            },
        )
        return {
            "alias": alias_name,
            "previous_indices": current_indices,
            "current_index": index_name,
            "dropped_old_indices": dropped,
        }
    def _resolve_incremental_target_index(self, tenant_id: str) -> Optional[str]:
        """Resolve active suggestion index for incremental updates (alias only)."""
        alias_name = get_suggestion_alias_name(tenant_id)
        aliased = self.es_client.get_alias_indices(alias_name)
        if aliased:
            # alias should map to one index in this design
            return sorted(aliased)[-1]
        return None
    def _build_full_candidates(
        self,
        tenant_id: str,
        index_languages: List[str],
        primary_language: str,
        days: int,
        batch_size: int,
        min_query_len: int,
    ) -> Dict[Tuple[str, str], SuggestionCandidate]:
        key_to_candidate: Dict[Tuple[str, str], SuggestionCandidate] = {}
        # Step 1: product title/qanchors
        for hit in self._iter_products(tenant_id, batch_size=batch_size):
            src = hit.get("_source", {}) or {}
            product_id = str(src.get("spu_id") or src.get("id") or hit.get("_id") or "")
            if not product_id:
                continue
            title_obj = src.get("title") or {}
            qanchor_obj = src.get("qanchors") or {}
            for lang in index_languages:
                title = ""
                if isinstance(title_obj, dict):
                    title = self._prepare_title_for_suggest(title_obj.get(lang) or "")
                if title:
                    text_norm = self._normalize_text(title)
                    if not self._looks_noise(text_norm):
                        key = (lang, text_norm)
                        c = key_to_candidate.get(key)
                        if c is None:
                            c = SuggestionCandidate(text=title, text_norm=text_norm, lang=lang)
                            key_to_candidate[key] = c
                        c.add_product("title", spu_id=product_id)
                q_raw = None
                if isinstance(qanchor_obj, dict):
                    q_raw = qanchor_obj.get(lang)
                for q_text in self._split_qanchors(q_raw):
                    text_norm = self._normalize_text(q_text)
                    if self._looks_noise(text_norm):
                        continue
                    key = (lang, text_norm)
                    c = key_to_candidate.get(key)
                    if c is None:
                        c = SuggestionCandidate(text=q_text, text_norm=text_norm, lang=lang)
                        key_to_candidate[key] = c
                    c.add_product("qanchor", spu_id=product_id)
        # Step 2: query logs
        now = datetime.now(timezone.utc)
        since = now - timedelta(days=days)
        since_7d = now - timedelta(days=7)
        for row in self._iter_query_log_rows(tenant_id=tenant_id, since=since, until=now):
            q = str(row.query or "").strip()
            if len(q) < min_query_len:
                continue
            lang, conf, source, conflict = self._resolve_query_language(
                query=q,
                log_language=getattr(row, "language", None),
                request_params=getattr(row, "request_params", None),
                index_languages=index_languages,
                primary_language=primary_language,
            )
            text_norm = self._normalize_text(q)
            if self._looks_noise(text_norm):
                continue
            key = (lang, text_norm)
            c = key_to_candidate.get(key)
            if c is None:
                c = SuggestionCandidate(text=q, text_norm=text_norm, lang=lang)
                key_to_candidate[key] = c
            c.lang_confidence = max(c.lang_confidence, conf)
            c.lang_source = source if c.lang_source == "default" else c.lang_source
            c.lang_conflict = c.lang_conflict or conflict
            created_at = self._to_utc(getattr(row, "create_time", None))
            is_7d = bool(created_at and created_at >= since_7d)
            c.add_query_log(is_7d=is_7d)
        return key_to_candidate
    def _candidate_to_doc(self, tenant_id: str, c: SuggestionCandidate, now_iso: str) -> Dict[str, Any]:
        rank_score = self._compute_rank_score_from_candidate(c)
        completion_obj = {c.lang: {"input": [c.text], "weight": int(max(rank_score, 1.0) * 100)}}
        sat_obj = {c.lang: c.text}
        return {
            "_id": f"{tenant_id}|{c.lang}|{c.text_norm}",
            "tenant_id": str(tenant_id),
            "lang": c.lang,
            "text": c.text,
            "text_norm": c.text_norm,
            "sources": sorted(c.sources),
            "title_doc_count": len(c.title_spu_ids),
            "qanchor_doc_count": len(c.qanchor_spu_ids),
            "query_count_7d": c.query_count_7d,
            "query_count_30d": c.query_count_30d,
            "rank_score": float(rank_score),
            "lang_confidence": float(c.lang_confidence),
            "lang_source": c.lang_source,
            "lang_conflict": bool(c.lang_conflict),
            "status": 1,
            "updated_at": now_iso,
            "completion": completion_obj,
            "sat": sat_obj,
        }
    def rebuild_tenant_index(
        self,
        tenant_id: str,
        days: int = 365,
        batch_size: int = 500,
        min_query_len: int = 1,
        publish_alias: bool = True,
        keep_versions: int = 2,
    ) -> Dict[str, Any]:
        """
        Full rebuild.
        Phase2 default behavior:
        - write to versioned index
        - atomically publish alias
        """
        tenant_loader = get_tenant_config_loader()
        tenant_cfg = tenant_loader.get_tenant_config(tenant_id)
        index_languages: List[str] = tenant_cfg.get("index_languages") or ["en", "zh"]
        primary_language: str = tenant_cfg.get("primary_language") or "en"
        # Always write to a fresh versioned index; legacy concrete index is no longer supported.
        index_name = get_suggestion_versioned_index_name(tenant_id)
        if self.es_client.index_exists(index_name):
            raise RuntimeError(f"Target suggestion index already exists: {index_name}")
        mapping = build_suggestion_mapping(index_languages=index_languages)
        if not self.es_client.create_index(index_name, mapping):
            raise RuntimeError(f"Failed to create suggestion index: {index_name}")
        key_to_candidate = self._build_full_candidates(
            tenant_id=tenant_id,
            index_languages=index_languages,
            primary_language=primary_language,
            days=days,
            batch_size=batch_size,
            min_query_len=min_query_len,
        )
        now_iso = datetime.now(timezone.utc).isoformat()
        docs = [self._candidate_to_doc(tenant_id, c, now_iso) for c in key_to_candidate.values()]
        if docs:
            bulk_result = self.es_client.bulk_index(index_name=index_name, docs=docs)
            self.es_client.refresh(index_name)
        else:
            bulk_result = {"success": 0, "failed": 0, "errors": []}
        alias_publish: Optional[Dict[str, Any]] = None
        if publish_alias:
            alias_publish = self._publish_alias(
                tenant_id=tenant_id,
                index_name=index_name,
                keep_versions=keep_versions,
            )
        now_utc = datetime.now(timezone.utc).isoformat()
        meta_patch: Dict[str, Any] = {
            "last_full_build_at": now_utc,
            "last_incremental_watermark": now_utc,
        }
        if publish_alias:
            meta_patch["active_index"] = index_name
            meta_patch["active_alias"] = get_suggestion_alias_name(tenant_id)
        self._upsert_meta(tenant_id, meta_patch)
        return {
            "mode": "full",
            "tenant_id": str(tenant_id),
            "index_name": index_name,
            "alias_published": bool(alias_publish),
            "alias_publish": alias_publish,
            "total_candidates": len(key_to_candidate),
            "indexed_docs": len(docs),
            "bulk_result": bulk_result,
        }
    def _build_incremental_deltas(
        self,
        tenant_id: str,
        index_languages: List[str],
        primary_language: str,
        since: datetime,
        until: datetime,
        min_query_len: int,
    ) -> Dict[Tuple[str, str], QueryDelta]:
        now = datetime.now(timezone.utc)
        since_7d = now - timedelta(days=7)
        deltas: Dict[Tuple[str, str], QueryDelta] = {}
        for row in self._iter_query_log_rows(tenant_id=tenant_id, since=since, until=until):
            q = str(row.query or "").strip()
            if len(q) < min_query_len:
                continue
            lang, conf, source, conflict = self._resolve_query_language(
                query=q,
                log_language=getattr(row, "language", None),
                request_params=getattr(row, "request_params", None),
                index_languages=index_languages,
                primary_language=primary_language,
            )
            text_norm = self._normalize_text(q)
            if self._looks_noise(text_norm):
                continue
            key = (lang, text_norm)
            item = deltas.get(key)
            if item is None:
                item = QueryDelta(
                    tenant_id=str(tenant_id),
                    lang=lang,
                    text=q,
                    text_norm=text_norm,
                    lang_confidence=conf,
                    lang_source=source,
                    lang_conflict=conflict,
                )
                deltas[key] = item
            created_at = self._to_utc(getattr(row, "create_time", None))
            item.delta_30d += 1
            if created_at and created_at >= since_7d:
                item.delta_7d += 1
            if conf > item.lang_confidence:
                item.lang_confidence = conf
                item.lang_source = source
            item.lang_conflict = item.lang_conflict or conflict
        return deltas
    def _delta_to_upsert_doc(self, delta: QueryDelta, now_iso: str) -> Dict[str, Any]:
        rank_score = self._compute_rank_score(
            query_count_30d=delta.delta_30d,
            query_count_7d=delta.delta_7d,
            qanchor_doc_count=0,
            title_doc_count=0,
        )
        return {
            "tenant_id": delta.tenant_id,
            "lang": delta.lang,
            "text": delta.text,
            "text_norm": delta.text_norm,
            "sources": ["query_log"],
            "title_doc_count": 0,
            "qanchor_doc_count": 0,
            "query_count_7d": delta.delta_7d,
            "query_count_30d": delta.delta_30d,
            "rank_score": float(rank_score),
            "lang_confidence": float(delta.lang_confidence),
            "lang_source": delta.lang_source,
            "lang_conflict": bool(delta.lang_conflict),
            "status": 1,
            "updated_at": now_iso,
            "completion": {
                delta.lang: {
                    "input": [delta.text],
                    "weight": int(max(rank_score, 1.0) * 100),
                }
            },
            "sat": {delta.lang: delta.text},
        }
    @staticmethod
    def _build_incremental_update_script() -> str:
        return """
            if (ctx._source == null || ctx._source.isEmpty()) {
                ctx._source = params.upsert;
                return;
            }
            if (ctx._source.query_count_30d == null) { ctx._source.query_count_30d = 0; }
            if (ctx._source.query_count_7d == null) { ctx._source.query_count_7d = 0; }
            if (ctx._source.qanchor_doc_count == null) { ctx._source.qanchor_doc_count = 0; }
            if (ctx._source.title_doc_count == null) { ctx._source.title_doc_count = 0; }
            ctx._source.query_count_30d += params.delta_30d;
            ctx._source.query_count_7d += params.delta_7d;
            if (ctx._source.sources == null) { ctx._source.sources = new ArrayList(); }
            if (!ctx._source.sources.contains('query_log')) { ctx._source.sources.add('query_log'); }
            if (ctx._source.lang_conflict == null) { ctx._source.lang_conflict = false; }
            ctx._source.lang_conflict = ctx._source.lang_conflict || params.lang_conflict;
            if (ctx._source.lang_confidence == null || params.lang_confidence > ctx._source.lang_confidence) {
                ctx._source.lang_confidence = params.lang_confidence;
                ctx._source.lang_source = params.lang_source;
            }
            int q30 = ctx._source.query_count_30d;
            int q7 = ctx._source.query_count_7d;
            int qa = ctx._source.qanchor_doc_count;
            int td = ctx._source.title_doc_count;
            double score = 1.8 * Math.log(1 + q30)
                         + 1.2 * Math.log(1 + q7)
                         + 1.0 * Math.log(1 + qa)
                         + 0.6 * Math.log(1 + td);
            ctx._source.rank_score = score;
            ctx._source.status = 1;
            ctx._source.updated_at = params.now_iso;
            ctx._source.text = params.text;
            ctx._source.lang = params.lang;
            ctx._source.text_norm = params.text_norm;
            if (ctx._source.completion == null) { ctx._source.completion = new HashMap(); }
            Map c = new HashMap();
            c.put('input', params.completion_input);
            c.put('weight', params.completion_weight);
            ctx._source.completion.put(params.lang, c);
            if (ctx._source.sat == null) { ctx._source.sat = new HashMap(); }
            ctx._source.sat.put(params.lang, params.text);
        """
    def _build_incremental_actions(self, target_index: str, deltas: Dict[Tuple[str, str], QueryDelta]) -> List[Dict[str, Any]]:
        now_iso = datetime.now(timezone.utc).isoformat()
        script_source = self._build_incremental_update_script()
        actions: List[Dict[str, Any]] = []
        for delta in deltas.values():
            upsert_doc = self._delta_to_upsert_doc(delta=delta, now_iso=now_iso)
            upsert_rank = float(upsert_doc.get("rank_score") or 0.0)
            action = {
                "_op_type": "update",
                "_index": target_index,
                "_id": f"{delta.tenant_id}|{delta.lang}|{delta.text_norm}",
                "scripted_upsert": True,
                "script": {
                    "lang": "painless",
                    "source": script_source,
                    "params": {
                        "delta_30d": int(delta.delta_30d),
                        "delta_7d": int(delta.delta_7d),
                        "lang_confidence": float(delta.lang_confidence),
                        "lang_source": delta.lang_source,
                        "lang_conflict": bool(delta.lang_conflict),
                        "now_iso": now_iso,
                        "lang": delta.lang,
                        "text": delta.text,
                        "text_norm": delta.text_norm,
                        "completion_input": [delta.text],
                        "completion_weight": int(max(upsert_rank, 1.0) * 100),
                        "upsert": upsert_doc,
                    },
                },
                "upsert": upsert_doc,
            }
            actions.append(action)
        return actions
    def incremental_update_tenant_index(
        self,
        tenant_id: str,
        min_query_len: int = 1,
        fallback_days: int = 7,
        overlap_minutes: int = 30,
        bootstrap_if_missing: bool = True,
        bootstrap_days: int = 30,
        batch_size: int = 500,
    ) -> Dict[str, Any]:
        tenant_loader = get_tenant_config_loader()
        tenant_cfg = tenant_loader.get_tenant_config(tenant_id)
        index_languages: List[str] = tenant_cfg.get("index_languages") or ["en", "zh"]
        primary_language: str = tenant_cfg.get("primary_language") or "en"
        target_index = self._resolve_incremental_target_index(tenant_id)
        if not target_index:
            if not bootstrap_if_missing:
                raise RuntimeError(
                    f"No active suggestion index for tenant={tenant_id}. "
                    "Run full rebuild first or enable bootstrap_if_missing."
                )
            full_result = self.rebuild_tenant_index(
                tenant_id=tenant_id,
                days=bootstrap_days,
                batch_size=batch_size,
                min_query_len=min_query_len,
                publish_alias=True
            )
            return {
                "mode": "incremental",
                "tenant_id": str(tenant_id),
                "bootstrapped": True,
                "bootstrap_result": full_result,
            }
        meta = self._get_meta(tenant_id)
        watermark_raw = meta.get("last_incremental_watermark") or meta.get("last_full_build_at")
        now = datetime.now(timezone.utc)
        default_since = now - timedelta(days=fallback_days)
        since = None
        if isinstance(watermark_raw, str) and watermark_raw.strip():
            try:
                since = self._to_utc(datetime.fromisoformat(watermark_raw.replace("Z", "+00:00")))
            except Exception:
                since = None
        if since is None:
            since = default_since
        since = since - timedelta(minutes=max(overlap_minutes, 0))
        if since < default_since:
            since = default_since
        deltas = self._build_incremental_deltas(
            tenant_id=tenant_id,
            index_languages=index_languages,
            primary_language=primary_language,
            since=since,
            until=now,
            min_query_len=min_query_len,
        )
        actions = self._build_incremental_actions(target_index=target_index, deltas=deltas)
        bulk_result = self.es_client.bulk_actions(actions)
        self.es_client.refresh(target_index)
        now_iso = now.isoformat()
        self._upsert_meta(
            tenant_id,
            {
                "last_incremental_build_at": now_iso,
                "last_incremental_watermark": now_iso,
                "active_index": target_index,
                "active_alias": get_suggestion_alias_name(tenant_id),
            },
        )
        return {
            "mode": "incremental",
            "tenant_id": str(tenant_id),
            "target_index": target_index,
            "query_window": {
                "since": since.isoformat(),
                "until": now_iso,
                "overlap_minutes": int(overlap_minutes),
            },
            "updated_terms": len(deltas),
            "bulk_result": bulk_result,
        }