boolean_parser.py 6.1 KB
"""
Boolean expression parser for search queries.

Supports: AND, OR, RANK, ANDNOT operators with parentheses.
Precedence (high to low): (), ANDNOT, AND, OR, RANK
"""

import re
from typing import List, Tuple, Optional
from dataclasses import dataclass


@dataclass
class QueryNode:
    """Represents a node in the parsed query tree."""
    operator: str  # 'AND', 'OR', 'RANK', 'ANDNOT', 'TERM'
    terms: List['QueryNode'] = None  # Child nodes for operators
    value: str = None  # Value for leaf nodes (TERM)

    def __repr__(self):
        if self.operator == 'TERM':
            return f"TERM({self.value})"
        else:
            return f"{self.operator}({', '.join(str(t) for t in self.terms)})"


class BooleanParser:
    """
    Parser for boolean search expressions.

    Operator precedence (high to low):
    1. () - Parentheses
    2. ANDNOT - AND NOT (exclusion)
    3. AND - All terms must match
    4. OR - Any term must match
    5. RANK - Scoring boost (like OR but affects ranking)
    """

    OPERATORS = {'AND', 'OR', 'RANK', 'ANDNOT'}
    PRECEDENCE = {
        'ANDNOT': 3,
        'AND': 2,
        'OR': 1,
        'RANK': 0
    }

    def __init__(self):
        """Initialize boolean parser."""
        pass

    def parse(self, expression: str) -> QueryNode:
        """
        Parse boolean expression into query tree.

        Args:
            expression: Boolean expression string
                       Example: "laptop AND (gaming OR professional) ANDNOT cheap"

        Returns:
            Root QueryNode of parsed tree
        """
        if not expression or not expression.strip():
            return QueryNode(operator='TERM', value='')

        # Tokenize
        tokens = self._tokenize(expression)

        if not tokens:
            return QueryNode(operator='TERM', value='')

        # Parse with precedence
        return self._parse_expression(tokens)

    def _tokenize(self, expression: str) -> List[str]:
        """
        Tokenize expression into terms and operators.

        Args:
            expression: Expression string

        Returns:
            List of tokens
        """
        # Pattern to match: operators, parentheses, or terms (with domain prefix support)
        pattern = r'\b(AND|OR|RANK|ANDNOT)\b|[()]|(?:\w+:)?[^\s()]+'

        tokens = []
        for match in re.finditer(pattern, expression):
            token = match.group().strip()
            if token:
                tokens.append(token)

        return tokens

    def _parse_expression(self, tokens: List[str], start: int = 0) -> Tuple[QueryNode, int]:
        """
        Parse expression with operator precedence.

        Args:
            tokens: List of tokens
            start: Starting index

        Returns:
            Tuple of (QueryNode, next_index)
        """
        # Start with lowest precedence (RANK)
        return self._parse_rank(tokens, start)

    def _parse_rank(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
        """Parse RANK operator (lowest precedence)."""
        left, pos = self._parse_or(tokens, start)

        while pos < len(tokens) and tokens[pos] == 'RANK':
            pos += 1  # Skip 'RANK'
            right, pos = self._parse_or(tokens, pos)
            left = QueryNode(operator='RANK', terms=[left, right])

        return left, pos

    def _parse_or(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
        """Parse OR operator."""
        left, pos = self._parse_and(tokens, start)

        while pos < len(tokens) and tokens[pos] == 'OR':
            pos += 1  # Skip 'OR'
            right, pos = self._parse_and(tokens, pos)
            left = QueryNode(operator='OR', terms=[left, right])

        return left, pos

    def _parse_and(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
        """Parse AND operator."""
        left, pos = self._parse_andnot(tokens, start)

        while pos < len(tokens) and tokens[pos] == 'AND':
            pos += 1  # Skip 'AND'
            right, pos = self._parse_andnot(tokens, pos)
            left = QueryNode(operator='AND', terms=[left, right])

        return left, pos

    def _parse_andnot(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
        """Parse ANDNOT operator (highest precedence)."""
        left, pos = self._parse_primary(tokens, start)

        while pos < len(tokens) and tokens[pos] == 'ANDNOT':
            pos += 1  # Skip 'ANDNOT'
            right, pos = self._parse_primary(tokens, pos)
            left = QueryNode(operator='ANDNOT', terms=[left, right])

        return left, pos

    def _parse_primary(self, tokens: List[str], start: int) -> Tuple[QueryNode, int]:
        """Parse primary expression (terms or parentheses)."""
        if start >= len(tokens):
            return QueryNode(operator='TERM', value=''), start

        token = tokens[start]

        # Handle parentheses
        if token == '(':
            # Find matching closing parenthesis
            depth = 1
            pos = start + 1
            while pos < len(tokens) and depth > 0:
                if tokens[pos] == '(':
                    depth += 1
                elif tokens[pos] == ')':
                    depth -= 1
                pos += 1

            # Parse contents of parentheses
            inner_tokens = tokens[start + 1:pos - 1]
            if inner_tokens:
                node, _ = self._parse_expression(inner_tokens, 0)
                return node, pos
            else:
                return QueryNode(operator='TERM', value=''), pos

        # Handle term
        if token not in self.OPERATORS and token not in ['(', ')']:
            return QueryNode(operator='TERM', value=token), start + 1

        # Unexpected token
        return QueryNode(operator='TERM', value=''), start + 1

    def is_simple_query(self, expression: str) -> bool:
        """
        Check if query is simple (no boolean operators).

        Args:
            expression: Query expression

        Returns:
            True if simple query (no operators)
        """
        tokens = self._tokenize(expression)
        for token in tokens:
            if token in self.OPERATORS:
                return False
        return True