""" Boolean expression parser for search queries. Supports: AND, OR, RANK, ANDNOT operators with parentheses. Precedence (high to low): (), ANDNOT, AND, OR, RANK """ import re from typing import List, Tuple, Optional from dataclasses import dataclass @dataclass class QueryNode: """Represents a node in the parsed query tree.""" operator: str # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM' terms: List['QueryNode'] = None # Child nodes for operators value: str = None # Value for leaf nodes (TERM) def __repr__(self): if self.operator == 'TERM': return f"TERM({self.value})" else: return f"{self.operator}({', '.join(str(t) for t in self.terms)})" class BooleanParser: """ Parser for boolean search expressions. Operator precedence (high to low): 1. () - Parentheses 2. ANDNOT - AND NOT (exclusion) 3. AND - All terms must match 4. OR - Any term must match 5. RANK - Scoring boost (like OR but affects ranking) """ OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'} PRECEDENCE = { 'ANDNOT': 3, 'AND': 2, 'OR': 1, 'RANK': 0 } def __init__(self): """Initialize boolean parser.""" pass def parse(self, expression: str) -> QueryNode: """ Parse boolean expression into query tree. Args: expression: Boolean expression string Example: "laptop AND (gaming OR professional) ANDNOT cheap" Returns: Root QueryNode of parsed tree """ if not expression or not expression.strip(): return QueryNode(operator='TERM', value='') # Tokenize tokens = self._tokenize(expression) if not tokens: return QueryNode(operator='TERM', value='') # Parse with precedence return self._parse_expression(tokens) def _tokenize(self, expression: str) -> List[str]: """ Tokenize expression into terms and operators. Args: expression: Expression string Returns: List of tokens """ # Pattern to match: operators, parentheses, or terms (with domain prefix support) pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+' tokens = [] for match in re.finditer(pattern, expression): token = match.group().strip() if token: tokens.append(token) return tokens def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]: """ Parse expression with operator precedence. Args: tokens: List of tokens start: Starting index Returns: Tuple of (QueryNode, next_index) """ # Start with lowest precedence (RANK) return self._parse_rank(tokens, start) def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: """Parse RANK operator (lowest precedence).""" left, pos = self._parse_or(tokens, start) while pos < len(tokens) and tokens[pos] == 'RANK': pos += 1 # Skip 'RANK' right, pos = self._parse_or(tokens, pos) left = QueryNode(operator='RANK', terms=[left, right]) return left, pos def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: """Parse OR operator.""" left, pos = self._parse_and(tokens, start) while pos < len(tokens) and tokens[pos] == 'OR': pos += 1 # Skip 'OR' right, pos = self._parse_and(tokens, pos) left = QueryNode(operator='OR', terms=[left, right]) return left, pos def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: """Parse AND operator.""" left, pos = self._parse_andnot(tokens, start) while pos < len(tokens) and tokens[pos] == 'AND': pos += 1 # Skip 'AND' right, pos = self._parse_andnot(tokens, pos) left = QueryNode(operator='AND', terms=[left, right]) return left, pos def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: """Parse ANDNOT operator (highest precedence).""" left, pos = self._parse_primary(tokens, start) while pos < len(tokens) and tokens[pos] == 'ANDNOT': pos += 1 # Skip 'ANDNOT' right, pos = self._parse_primary(tokens, pos) left = QueryNode(operator='ANDNOT', terms=[left, right]) return left, pos def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]: """Parse primary expression (terms or parentheses).""" if start >= len(tokens): return QueryNode(operator='TERM', value=''), start token = tokens[start] # Handle parentheses if token == '(': # Find matching closing parenthesis depth = 1 pos = start + 1 while pos < len(tokens) and depth > 0: if tokens[pos] == '(': depth += 1 elif tokens[pos] == ')': depth -= 1 pos += 1 # Parse contents of parentheses inner_tokens = tokens[start + 1:pos - 1] if inner_tokens: node, _ = self._parse_expression(inner_tokens, 0) return node, pos else: return QueryNode(operator='TERM', value=''), pos # Handle term if token not in self.OPERATORS and token not in ['(', ')']: return QueryNode(operator='TERM', value=token), start + 1 # Unexpected token return QueryNode(operator='TERM', value=''), start + 1 def is_simple_query(self, expression: str) -> bool: """ Check if query is simple (no boolean operators). Args: expression: Query expression Returns: True if simple query (no operators) """ tokens = self._tokenize(expression) for token in tokens: if token in self.OPERATORS: return False return True