""" Field type definitions for the search engine configuration system. This module defines all supported field types, analyzers, and their corresponding Elasticsearch mapping configurations. """ from enum import Enum from typing import Dict, Any, Optional from dataclasses import dataclass class FieldType(Enum): """Supported field types in the search engine.""" TEXT = "text" KEYWORD = "keyword" TEXT_EMBEDDING = "text_embedding" IMAGE_EMBEDDING = "image_embedding" INT = "int" LONG = "long" FLOAT = "float" DOUBLE = "double" DATE = "date" BOOLEAN = "boolean" JSON = "json" class AnalyzerType(Enum): """Supported analyzer types for text fields.""" # E-commerce general analysis - Chinese CHINESE_ECOMMERCE = "index_ansj" CHINESE_ECOMMERCE_QUERY = "query_ansj" # Standard language analyzers ENGLISH = "english" ARABIC = "arabic" SPANISH = "spanish" RUSSIAN = "russian" JAPANESE = "japanese" # Standard analyzers STANDARD = "standard" KEYWORD = "keyword" class SimilarityType(Enum): """Supported similarity algorithms for text fields.""" BM25 = "BM25" BM25_CUSTOM = "BM25_custom" # Modified BM25 with b=0.0, k1=0.0 @dataclass class FieldConfig: """Configuration for a single field.""" name: str field_type: FieldType analyzer: Optional[AnalyzerType] = None search_analyzer: Optional[AnalyzerType] = None required: bool = False multi_language: bool = False # If true, field has language variants languages: Optional[list] = None # ['zh', 'en', 'ru'] boost: float = 1.0 store: bool = False index: bool = True # For embedding fields embedding_dims: int = 1024 embedding_similarity: str = "dot_product" # dot_product, cosine, l2_norm # For nested fields (like image embeddings) nested: bool = False nested_properties: Optional[Dict[str, Any]] = None def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: """ Generate Elasticsearch mapping configuration for a field. Args: field_config: Field configuration object Returns: Dictionary containing ES mapping for the field """ mapping = {} if field_config.field_type == FieldType.TEXT: mapping = { "type": "text", "store": field_config.store, "index": field_config.index } if field_config.analyzer: if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE: mapping["analyzer"] = "index_ansj" mapping["search_analyzer"] = "query_ansj" else: mapping["analyzer"] = field_config.analyzer.value if field_config.search_analyzer: mapping["search_analyzer"] = field_config.search_analyzer.value elif field_config.field_type == FieldType.KEYWORD: mapping = { "type": "keyword", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.TEXT_EMBEDDING: mapping = { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity } elif field_config.field_type == FieldType.IMAGE_EMBEDDING: if field_config.nested: mapping = { "type": "nested", "properties": { "vector": { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity }, "url": { "type": "keyword" } } } else: # Simple vector field mapping = { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity } elif field_config.field_type in [FieldType.INT, FieldType.LONG]: mapping = { "type": "long", "store": field_config.store, "index": field_config.index } elif field_config.field_type in [FieldType.FLOAT, FieldType.DOUBLE]: mapping = { "type": "float", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.DATE: mapping = { "type": "date", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.BOOLEAN: mapping = { "type": "boolean", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.JSON: if field_config.nested and field_config.nested_properties: # Nested type with properties (e.g., variants) mapping = { "type": "nested", "properties": {} } # Generate mappings for nested properties for prop_name, prop_config in field_config.nested_properties.items(): prop_type = prop_config.get("type", "keyword") prop_mapping = {"type": prop_type} # Add analyzer for text fields if prop_type == "text" and "analyzer" in prop_config: prop_mapping["analyzer"] = prop_config["analyzer"] # Add other properties if "index" in prop_config: prop_mapping["index"] = prop_config["index"] if "store" in prop_config: prop_mapping["store"] = prop_config["store"] mapping["properties"][prop_name] = prop_mapping else: # Simple object type mapping = { "type": "object", "enabled": True } return mapping def get_default_analyzers() -> Dict[str, Any]: """ Get default analyzer definitions for the index. Returns: Dictionary of analyzer configurations """ return { "analysis": { "analyzer": { "index_ansj": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] }, "query_ansj": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] } } } } def get_default_similarity() -> Dict[str, Any]: """ Get default similarity configuration (modified BM25). Returns: Dictionary of similarity configurations """ return { "similarity": { "default": { "type": "BM25", "b": 0.0, "k1": 0.0 } } } # Mapping of field type strings to FieldType enum FIELD_TYPE_MAP = { "text": FieldType.TEXT, "TEXT": FieldType.TEXT, "keyword": FieldType.KEYWORD, "KEYWORD": FieldType.KEYWORD, "LITERAL": FieldType.KEYWORD, "text_embedding": FieldType.TEXT_EMBEDDING, "TEXT_EMBEDDING": FieldType.TEXT_EMBEDDING, "EMBEDDING": FieldType.TEXT_EMBEDDING, "image_embedding": FieldType.IMAGE_EMBEDDING, "IMAGE_EMBEDDING": FieldType.IMAGE_EMBEDDING, "int": FieldType.INT, "INT": FieldType.INT, "long": FieldType.LONG, "LONG": FieldType.LONG, "float": FieldType.FLOAT, "FLOAT": FieldType.FLOAT, "double": FieldType.DOUBLE, "DOUBLE": FieldType.DOUBLE, "date": FieldType.DATE, "DATE": FieldType.DATE, "boolean": FieldType.BOOLEAN, "BOOLEAN": FieldType.BOOLEAN, "json": FieldType.JSON, "JSON": FieldType.JSON, } # Mapping of analyzer strings to AnalyzerType enum ANALYZER_MAP = { "chinese": AnalyzerType.CHINESE_ECOMMERCE, "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE, "index_ansj": AnalyzerType.CHINESE_ECOMMERCE, "english": AnalyzerType.ENGLISH, "arabic": AnalyzerType.ARABIC, "spanish": AnalyzerType.SPANISH, "russian": AnalyzerType.RUSSIAN, "japanese": AnalyzerType.JAPANESE, "standard": AnalyzerType.STANDARD, "keyword": AnalyzerType.KEYWORD, }