Blame view

query/style_intent.py 8.96 KB
cda1cd62   tangwang   意图分析&应用 baseline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
  """
  Style intent detection for query understanding.
  """
  
  from __future__ import annotations
  
  from dataclasses import dataclass, field
  from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple
  
  from .tokenization import TokenizedText, normalize_query_text, tokenize_text
  
  
  @dataclass(frozen=True)
  class StyleIntentDefinition:
      """Vocabulary for one style-intent dimension.

      Each term group is a row of synonyms whose first entry is the
      canonical value; ``synonym_to_canonical`` maps every normalized
      synonym back to that canonical value.
      """

      intent_type: str
      term_groups: Tuple[Tuple[str, ...], ...]
      dimension_aliases: Tuple[str, ...]
      synonym_to_canonical: Dict[str, str]
      max_term_ngram: int = 3

      @classmethod
      def from_rows(
          cls,
          intent_type: str,
          rows: Sequence[Sequence[str]],
          dimension_aliases: Sequence[str],
      ) -> "StyleIntentDefinition":
          """Build a definition from raw synonym rows and dimension aliases.

          Terms are normalized; empty results and in-row duplicates are
          dropped while first-seen order is preserved.
          """
          groups: List[Tuple[str, ...]] = []
          mapping: Dict[str, str] = {}
          longest_ngram = 1

          for row in rows:
              # Normalize, drop empties, dedupe within the row keeping order.
              cleaned = list(
                  dict.fromkeys(
                      term
                      for term in (normalize_query_text(raw) for raw in row)
                      if term
                  )
              )
              if not cleaned:
                  continue

              groups.append(tuple(cleaned))
              canonical = cleaned[0]  # first term of the row is canonical
              for term in cleaned:
                  mapping[term] = canonical
                  longest_ngram = max(longest_ngram, len(term.split()))

          alias_tuple = tuple(
              dict.fromkeys(
                  cleaned_alias
                  for cleaned_alias in (
                      normalize_query_text(alias) for alias in dimension_aliases
                  )
                  if cleaned_alias
              )
          )

          return cls(
              intent_type=intent_type,
              term_groups=tuple(groups),
              dimension_aliases=alias_tuple,
              synonym_to_canonical=mapping,
              max_term_ngram=longest_ngram,
          )

      def match_candidates(self, candidates: Iterable[str]) -> Set[str]:
          """Return the canonical values for all candidates found in the vocabulary."""
          lookup = self.synonym_to_canonical.get
          return {
              hit
              for hit in (lookup(normalize_query_text(c)) for c in candidates)
              if hit
          }

      def match_text(
          self,
          text: str,
          *,
          tokenizer: Optional[Callable[[str], Any]] = None,
      ) -> Set[str]:
          """Tokenize *text* and return the canonical values it matches."""
          tokenized = tokenize_text(
              text, tokenizer=tokenizer, max_ngram=self.max_term_ngram
          )
          return self.match_candidates(tokenized.candidates)
  
  
  @dataclass(frozen=True)
  class DetectedStyleIntent:
      """A single detected style intent: what matched, where, and its aliases."""

      intent_type: str
      canonical_value: str
      matched_term: str
      matched_query_text: str
      dimension_aliases: Tuple[str, ...]

      def to_dict(self) -> Dict[str, Any]:
          """Serialize to a JSON-friendly dict (alias tuple becomes a list)."""
          payload: Dict[str, Any] = {
              "intent_type": self.intent_type,
              "canonical_value": self.canonical_value,
              "matched_term": self.matched_term,
              "matched_query_text": self.matched_query_text,
          }
          payload["dimension_aliases"] = list(self.dimension_aliases)
          return payload
  
  
  @dataclass(frozen=True)
  class StyleIntentProfile:
      """Aggregated style-intent detection result for one query."""

      query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple)
      intents: Tuple[DetectedStyleIntent, ...] = field(default_factory=tuple)

      @property
      def is_active(self) -> bool:
          """True when at least one intent was detected."""
          return len(self.intents) > 0

      def get_intents(self, intent_type: Optional[str] = None) -> List[DetectedStyleIntent]:
          """Return intents, optionally filtered by (normalized) intent type."""
          if intent_type is None:
              return [*self.intents]
          wanted = normalize_query_text(intent_type)
          return [item for item in self.intents if item.intent_type == wanted]

      def get_canonical_values(self, intent_type: str) -> Set[str]:
          """Return the set of canonical values detected for *intent_type*."""
          values: Set[str] = set()
          for item in self.get_intents(intent_type):
              values.add(item.canonical_value)
          return values

      def to_dict(self) -> Dict[str, Any]:
          """Serialize the profile (intents and tokenized variants) to plain dicts."""
          variant_dicts = []
          for variant in self.query_variants:
              variant_dicts.append(
                  {
                      "text": variant.text,
                      "normalized_text": variant.normalized_text,
                      "fine_tokens": list(variant.fine_tokens),
                      "coarse_tokens": list(variant.coarse_tokens),
                      "candidates": list(variant.candidates),
                  }
              )
          return {
              "active": self.is_active,
              "intents": [item.to_dict() for item in self.intents],
              "query_variants": variant_dicts,
          }
  
  
  class StyleIntentRegistry:
      """Holds style intent vocabularies and matching helpers."""

      def __init__(
          self,
          definitions: Dict[str, StyleIntentDefinition],
          *,
          enabled: bool = True,
      ) -> None:
          # Definitions are keyed by normalized intent type.
          self.definitions = definitions
          self.enabled = True if enabled else False

      @classmethod
      def from_query_config(cls, query_config: Any) -> "StyleIntentRegistry":
          """Build a registry from a query-config object's style-intent fields.

          Intent types with empty vocabularies are skipped.
          """
          terms_by_type = getattr(query_config, "style_intent_terms", {}) or {}
          aliases_by_type = getattr(query_config, "style_intent_dimension_aliases", {}) or {}

          definitions: Dict[str, StyleIntentDefinition] = {}
          for raw_type, rows in terms_by_type.items():
              built = StyleIntentDefinition.from_rows(
                  intent_type=normalize_query_text(raw_type),
                  rows=rows or [],
                  dimension_aliases=aliases_by_type.get(raw_type, []),
              )
              if not built.synonym_to_canonical:
                  continue  # nothing to match against; drop this intent type
              definitions[built.intent_type] = built

          enabled_flag = getattr(query_config, "style_intent_enabled", True)
          return cls(definitions, enabled=bool(enabled_flag))

      def get_definition(self, intent_type: str) -> Optional[StyleIntentDefinition]:
          """Look up a definition by intent type (normalized before lookup)."""
          key = normalize_query_text(intent_type)
          return self.definitions.get(key)

      def get_dimension_aliases(self, intent_type: str) -> Tuple[str, ...]:
          """Return the dimension aliases for *intent_type*, or () if unknown."""
          definition = self.get_definition(intent_type)
          if definition is None:
              return tuple()
          return definition.dimension_aliases
  
  
  class StyleIntentDetector:
      """Detects style intents from parsed query variants."""

      def __init__(
          self,
          registry: StyleIntentRegistry,
          *,
          tokenizer: Optional[Callable[[str], Any]] = None,
      ) -> None:
          self.registry = registry
          self.tokenizer = tokenizer

      def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]:
          """Collect, dedupe and tokenize the textual variants of a query.

          Variants come from the original/normalized/rewritten query plus any
          translation values; duplicates (after normalization) are dropped
          while first-seen order is kept.
          """
          texts = [
              getattr(parsed_query, "original_query", None),
              getattr(parsed_query, "query_normalized", None),
              getattr(parsed_query, "rewritten_query", None),
          ]

          translations = getattr(parsed_query, "translations", {}) or {}
          if isinstance(translations, dict):
              texts.extend(translations.values())

          # Hoisted out of the loop: the max n-gram size depends only on the
          # registry vocabularies, not on the variant being tokenized, so
          # there is no need to rescan all definitions per variant.
          max_ngram = max(
              (definition.max_term_ngram for definition in self.registry.definitions.values()),
              default=3,
          )

          seen = set()
          variants: List[TokenizedText] = []
          for raw_text in texts:
              text = str(raw_text or "").strip()
              if not text:
                  continue
              normalized = normalize_query_text(text)
              if not normalized or normalized in seen:
                  continue
              seen.add(normalized)
              variants.append(
                  tokenize_text(text, tokenizer=self.tokenizer, max_ngram=max_ngram)
              )

          return tuple(variants)

      def detect(self, parsed_query: Any) -> StyleIntentProfile:
          """Detect style intents across all query variants.

          Returns an empty profile when detection is disabled or no
          vocabularies are configured. Each (intent_type, canonical) pair is
          reported at most once, attributed to the first variant/candidate
          that produced it.
          """
          if not self.registry.enabled or not self.registry.definitions:
              return StyleIntentProfile()

          query_variants = self._build_query_variants(parsed_query)
          detected: List[DetectedStyleIntent] = []
          seen_pairs = set()

          for variant in query_variants:
              for intent_type, definition in self.registry.definitions.items():
                  matched_canonicals = definition.match_candidates(variant.candidates)
                  if not matched_canonicals:
                      continue

                  # NOTE(review): the `break` below records only the first
                  # new canonical per (variant, definition), even when
                  # several canonical values matched this variant —
                  # presumably "first hit wins" is intentional; confirm if
                  # multi-value intents per variant are ever needed.
                  for candidate in variant.candidates:
                      normalized_candidate = normalize_query_text(candidate)
                      canonical = definition.synonym_to_canonical.get(normalized_candidate)
                      if not canonical or canonical not in matched_canonicals:
                          continue
                      pair = (intent_type, canonical)
                      if pair in seen_pairs:
                          continue
                      seen_pairs.add(pair)
                      detected.append(
                          DetectedStyleIntent(
                              intent_type=intent_type,
                              canonical_value=canonical,
                              matched_term=normalized_candidate,
                              matched_query_text=variant.text,
                              dimension_aliases=definition.dimension_aliases,
                          )
                      )
                      break

          return StyleIntentProfile(
              query_variants=query_variants,
              intents=tuple(detected),
          )