# language_detector.py

"""
Language detection utility.

Detects the language of a query string.
"""

from typing import Optional
import re


class LanguageDetector:
    """Simple rule-based language detector for common e-commerce languages.

    Detection is purely script based: every character of the input is mapped
    to at most one script bucket (by Unicode code point), and the share of
    each bucket among all *recognised* characters decides the result.
    Characters that belong to none of the known scripts (digits, whitespace,
    punctuation, accented Latin letters, ...) are ignored entirely.
    """

    # Unicode ranges for different scripts
    CJK_RANGES = [
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0x3400, 0x4DBF),   # CJK Extension A
        (0x20000, 0x2A6DF), # CJK Extension B
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
    ]

    CYRILLIC_RANGE = (0x0400, 0x04FF)
    ARABIC_RANGE = (0x0600, 0x06FF)
    # NOTE: this span also covers the punctuation U+005B..U+0060 that sits
    # between 'Z' and 'a'; detect() therefore tests the two ASCII letter
    # ranges explicitly instead of using this constant.
    LATIN_RANGE = (0x0041, 0x007A)

    # Hiragana and Katakana are contiguous, so one span covers both blocks.
    _KANA_RANGE = (0x3040, 0x30FF)

    def __init__(self):
        """Initialize language detector.

        The compiled patterns are kept as instance attributes; detect()
        itself works on code points and does not use them.
        """
        self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
        self.russian_pattern = re.compile(r'[\u0400-\u04ff]+')
        self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+')
        self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+')

    def _script_of(self, code_point: int) -> Optional[str]:
        """Return the script bucket for one code point, or None if unknown.

        Each code point maps to exactly one bucket. Kana is tested before the
        broader CJK ranges because CJK_RANGES also lists Hiragana/Katakana;
        without this ordering a kana character would be counted as both
        'japanese' and 'chinese' and skew every percentage.
        """
        if self._KANA_RANGE[0] <= code_point <= self._KANA_RANGE[1]:
            return 'japanese'
        if any(start <= code_point <= end for start, end in self.CJK_RANGES):
            return 'chinese'
        if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]:
            return 'russian'
        if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]:
            return 'arabic'
        # ASCII letters only: A-Z and a-z.
        if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A):
            return 'latin'
        return None

    def detect(self, text: str) -> str:
        """
        Detect language of text.

        Args:
            text: Input text. Empty, whitespace-only or None input yields
                'unknown'.

        Returns:
            Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown'.
            'unknown' is also returned when the text contains no character
            from a recognised script or when no script passes its threshold.
        """
        if not text or not text.strip():
            return 'unknown'

        # Count characters in each script (one bucket per character).
        char_counts = dict.fromkeys(
            ('chinese', 'russian', 'arabic', 'japanese', 'latin'), 0
        )
        for char in text.strip():
            script = self._script_of(ord(char))
            if script is not None:
                char_counts[script] += 1

        # Shares are relative to recognised characters only.
        total_chars = sum(char_counts.values())
        if total_chars == 0:
            return 'unknown'

        percentages = {
            script: count / total_chars
            for script, count in char_counts.items()
        }

        # Japanese text mixes kana with Han characters, so a fairly small
        # kana share is already decisive; this must be checked before 'zh'.
        if percentages['japanese'] > 0.1:
            return 'ja'

        # Russian (Cyrillic)
        if percentages['russian'] > 0.5:
            return 'ru'

        # Arabic
        if percentages['arabic'] > 0.5:
            return 'ar'

        # Chinese (Han characters; kana is counted as Japanese only)
        if percentages['chinese'] > 0.3:
            return 'zh'

        # English/Latin
        if percentages['latin'] > 0.5:
            return 'en'

        return 'unknown'

    def is_chinese(self, text: str) -> bool:
        """Check if text is primarily Chinese."""
        return self.detect(text) == 'zh'

    def is_english(self, text: str) -> bool:
        """Check if text is primarily English."""
        return self.detect(text) == 'en'

    def is_russian(self, text: str) -> bool:
        """Check if text is primarily Russian."""
        return self.detect(text) == 'ru'

    def is_arabic(self, text: str) -> bool:
        """Check if text is primarily Arabic."""
        return self.detect(text) == 'ar'

    def is_japanese(self, text: str) -> bool:
        """Check if text is primarily Japanese."""
        return self.detect(text) == 'ja'