""" Style intent detection for query understanding. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple from .tokenization import TokenizedText, normalize_query_text, tokenize_text @dataclass(frozen=True) class StyleIntentDefinition: intent_type: str term_groups: Tuple[Tuple[str, ...], ...] dimension_aliases: Tuple[str, ...] synonym_to_canonical: Dict[str, str] max_term_ngram: int = 3 @classmethod def from_rows( cls, intent_type: str, rows: Sequence[Sequence[str]], dimension_aliases: Sequence[str], ) -> "StyleIntentDefinition": term_groups: List[Tuple[str, ...]] = [] synonym_to_canonical: Dict[str, str] = {} max_ngram = 1 for row in rows: normalized_terms: List[str] = [] for raw_term in row: term = normalize_query_text(raw_term) if not term or term in normalized_terms: continue normalized_terms.append(term) if not normalized_terms: continue canonical = normalized_terms[0] term_groups.append(tuple(normalized_terms)) for term in normalized_terms: synonym_to_canonical[term] = canonical max_ngram = max(max_ngram, len(term.split())) aliases = tuple( dict.fromkeys( term for term in ( normalize_query_text(alias) for alias in dimension_aliases ) if term ) ) return cls( intent_type=intent_type, term_groups=tuple(term_groups), dimension_aliases=aliases, synonym_to_canonical=synonym_to_canonical, max_term_ngram=max_ngram, ) def match_candidates(self, candidates: Iterable[str]) -> Set[str]: matched: Set[str] = set() for candidate in candidates: canonical = self.synonym_to_canonical.get(normalize_query_text(candidate)) if canonical: matched.add(canonical) return matched def match_text( self, text: str, *, tokenizer: Optional[Callable[[str], Any]] = None, ) -> Set[str]: bundle = tokenize_text(text, tokenizer=tokenizer, max_ngram=self.max_term_ngram) return self.match_candidates(bundle.candidates) @dataclass(frozen=True) class DetectedStyleIntent: intent_type: str canonical_value: str matched_term: str matched_query_text: str dimension_aliases: Tuple[str, ...] def to_dict(self) -> Dict[str, Any]: return { "intent_type": self.intent_type, "canonical_value": self.canonical_value, "matched_term": self.matched_term, "matched_query_text": self.matched_query_text, "dimension_aliases": list(self.dimension_aliases), } @dataclass(frozen=True) class StyleIntentProfile: query_variants: Tuple[TokenizedText, ...] = field(default_factory=tuple) intents: Tuple[DetectedStyleIntent, ...] = field(default_factory=tuple) @property def is_active(self) -> bool: return bool(self.intents) def get_intents(self, intent_type: Optional[str] = None) -> List[DetectedStyleIntent]: if intent_type is None: return list(self.intents) normalized = normalize_query_text(intent_type) return [intent for intent in self.intents if intent.intent_type == normalized] def get_canonical_values(self, intent_type: str) -> Set[str]: return {intent.canonical_value for intent in self.get_intents(intent_type)} def to_dict(self) -> Dict[str, Any]: return { "active": self.is_active, "intents": [intent.to_dict() for intent in self.intents], "query_variants": [ { "text": variant.text, "normalized_text": variant.normalized_text, "fine_tokens": list(variant.fine_tokens), "coarse_tokens": list(variant.coarse_tokens), "candidates": list(variant.candidates), } for variant in self.query_variants ], } class StyleIntentRegistry: """Holds style intent vocabularies and matching helpers.""" def __init__( self, definitions: Dict[str, StyleIntentDefinition], *, enabled: bool = True, ) -> None: self.definitions = definitions self.enabled = bool(enabled) @classmethod def from_query_config(cls, query_config: Any) -> "StyleIntentRegistry": style_terms = getattr(query_config, "style_intent_terms", {}) or {} dimension_aliases = getattr(query_config, "style_intent_dimension_aliases", {}) or {} definitions: Dict[str, StyleIntentDefinition] = {} for intent_type, rows in style_terms.items(): definition = StyleIntentDefinition.from_rows( intent_type=normalize_query_text(intent_type), rows=rows or [], dimension_aliases=dimension_aliases.get(intent_type, []), ) if definition.synonym_to_canonical: definitions[definition.intent_type] = definition return cls( definitions, enabled=bool(getattr(query_config, "style_intent_enabled", True)), ) def get_definition(self, intent_type: str) -> Optional[StyleIntentDefinition]: return self.definitions.get(normalize_query_text(intent_type)) def get_dimension_aliases(self, intent_type: str) -> Tuple[str, ...]: definition = self.get_definition(intent_type) return definition.dimension_aliases if definition else tuple() class StyleIntentDetector: """Detects style intents from parsed query variants.""" def __init__( self, registry: StyleIntentRegistry, *, tokenizer: Optional[Callable[[str], Any]] = None, ) -> None: self.registry = registry self.tokenizer = tokenizer def _build_query_variants(self, parsed_query: Any) -> Tuple[TokenizedText, ...]: seen = set() variants: List[TokenizedText] = [] texts = [ getattr(parsed_query, "original_query", None), getattr(parsed_query, "query_normalized", None), getattr(parsed_query, "rewritten_query", None), ] translations = getattr(parsed_query, "translations", {}) or {} if isinstance(translations, dict): texts.extend(translations.values()) for raw_text in texts: text = str(raw_text or "").strip() if not text: continue normalized = normalize_query_text(text) if not normalized or normalized in seen: continue seen.add(normalized) variants.append( tokenize_text( text, tokenizer=self.tokenizer, max_ngram=max( (definition.max_term_ngram for definition in self.registry.definitions.values()), default=3, ), ) ) return tuple(variants) def detect(self, parsed_query: Any) -> StyleIntentProfile: if not self.registry.enabled or not self.registry.definitions: return StyleIntentProfile() query_variants = self._build_query_variants(parsed_query) detected: List[DetectedStyleIntent] = [] seen_pairs = set() for variant in query_variants: for intent_type, definition in self.registry.definitions.items(): matched_canonicals = definition.match_candidates(variant.candidates) if not matched_canonicals: continue for candidate in variant.candidates: normalized_candidate = normalize_query_text(candidate) canonical = definition.synonym_to_canonical.get(normalized_candidate) if not canonical or canonical not in matched_canonicals: continue pair = (intent_type, canonical) if pair in seen_pairs: continue seen_pairs.add(pair) detected.append( DetectedStyleIntent( intent_type=intent_type, canonical_value=canonical, matched_term=normalized_candidate, matched_query_text=variant.text, dimension_aliases=definition.dimension_aliases, ) ) break return StyleIntentProfile( query_variants=query_variants, intents=tuple(detected), )