""" Field type definitions for the search engine configuration system. This module defines all supported field types, analyzers, and their corresponding Elasticsearch mapping configurations. """ from enum import Enum from typing import Dict, Any, Optional from dataclasses import dataclass class FieldType(Enum): """Supported field types in the search engine.""" TEXT = "text" KEYWORD = "keyword" TEXT_EMBEDDING = "text_embedding" IMAGE_EMBEDDING = "image_embedding" INT = "int" LONG = "long" FLOAT = "float" DOUBLE = "double" DATE = "date" BOOLEAN = "boolean" JSON = "json" class AnalyzerType(Enum): """Supported analyzer types for text fields.""" # E-commerce general analysis - Chinese CHINESE_ECOMMERCE = "index_ansj" CHINESE_ECOMMERCE_QUERY = "query_ansj" # Standard language analyzers ENGLISH = "english" ARABIC = "arabic" SPANISH = "spanish" RUSSIAN = "russian" JAPANESE = "japanese" # Standard analyzers STANDARD = "standard" KEYWORD = "keyword" class SimilarityType(Enum): """Supported similarity algorithms for text fields.""" BM25 = "BM25" BM25_CUSTOM = "BM25_custom" # Modified BM25 with b=0.0, k1=0.0 @dataclass class FieldConfig: """Configuration for a single field.""" name: str field_type: FieldType analyzer: Optional[AnalyzerType] = None search_analyzer: Optional[AnalyzerType] = None required: bool = False multi_language: bool = False # If true, field has language variants languages: Optional[list] = None # ['zh', 'en', 'ru'] boost: float = 1.0 store: bool = False index: bool = True return_in_source: bool = True # Whether to include this field in search result _source # For embedding fields embedding_dims: int = 1024 embedding_similarity: str = "dot_product" # dot_product, cosine, l2_norm # For nested fields (like image embeddings) nested: bool = False nested_properties: Optional[Dict[str, Any]] = None # Hybrid Keyword Text (HKText) support keyword_subfield: bool = False keyword_ignore_above: int = 256 keyword_normalizer: Optional[str] = None # For keyword subfield normalizer (e.g., "lowercase") def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: """ Generate Elasticsearch mapping configuration for a field. Args: field_config: Field configuration object Returns: Dictionary containing ES mapping for the field """ mapping = {} if field_config.field_type == FieldType.TEXT: mapping = { "type": "text", "store": field_config.store, "index": field_config.index } if field_config.analyzer: if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE: mapping["analyzer"] = "index_ansj" mapping["search_analyzer"] = "query_ansj" elif field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: # If search_analyzer is explicitly set to CHINESE_ECOMMERCE_QUERY mapping["analyzer"] = "index_ansj" mapping["search_analyzer"] = "query_ansj" else: mapping["analyzer"] = field_config.analyzer.value if field_config.search_analyzer: if field_config.search_analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: mapping["search_analyzer"] = "query_ansj" else: mapping["search_analyzer"] = field_config.search_analyzer.value if field_config.keyword_subfield: mapping.setdefault("fields", {}) keyword_field = { "type": "keyword", "ignore_above": field_config.keyword_ignore_above } if field_config.keyword_normalizer: keyword_field["normalizer"] = field_config.keyword_normalizer mapping["fields"]["keyword"] = keyword_field elif field_config.field_type == FieldType.KEYWORD: mapping = { "type": "keyword", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.TEXT_EMBEDDING: mapping = { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity } elif field_config.field_type == FieldType.IMAGE_EMBEDDING: if field_config.nested: mapping = { "type": "nested", "properties": { "vector": { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity }, "url": { "type": "text" } } } else: # Simple vector field mapping = { "type": "dense_vector", "dims": field_config.embedding_dims, "index": True, "similarity": field_config.embedding_similarity } elif field_config.field_type in [FieldType.INT, FieldType.LONG]: mapping = { "type": "long", "store": field_config.store, "index": field_config.index } elif field_config.field_type in [FieldType.FLOAT, FieldType.DOUBLE]: mapping = { "type": "float", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.DATE: mapping = { "type": "date", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.BOOLEAN: mapping = { "type": "boolean", "store": field_config.store, "index": field_config.index } elif field_config.field_type == FieldType.JSON: if field_config.nested and field_config.nested_properties: # Nested type with properties (e.g., variants) mapping = { "type": "nested", "properties": {} } # Generate mappings for nested properties for prop_name, prop_config in field_config.nested_properties.items(): prop_type = prop_config.get("type", "keyword") prop_mapping = {"type": prop_type} # Add analyzer for text fields if prop_type == "text" and "analyzer" in prop_config: analyzer_str = prop_config["analyzer"] # Convert chinese_ecommerce to index_ansj/query_ansj if analyzer_str == "chinese_ecommerce": prop_mapping["analyzer"] = "index_ansj" prop_mapping["search_analyzer"] = "query_ansj" else: prop_mapping["analyzer"] = analyzer_str # Add other properties if "index" in prop_config: prop_mapping["index"] = prop_config["index"] if "store" in prop_config: prop_mapping["store"] = prop_config["store"] mapping["properties"][prop_name] = prop_mapping else: # Simple object type mapping = { "type": "object", "enabled": True } return mapping def get_default_analyzers() -> Dict[str, Any]: """ Get default analyzer definitions for the index. Returns: Dictionary of analyzer configurations """ return { "analysis": { "analyzer": { "index_ansj": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] }, "query_ansj": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] }, "hanlp_index": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] }, "hanlp_standard": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] } }, "normalizer": { "lowercase": { "type": "custom", "filter": ["lowercase"] } } } } def get_default_similarity() -> Dict[str, Any]: """ Get default similarity configuration (modified BM25). Returns: Dictionary of similarity configurations """ return { "similarity": { "default": { "type": "BM25", "b": 0.0, "k1": 0.0 } } } # Mapping of field type strings to FieldType enum FIELD_TYPE_MAP = { "text": FieldType.TEXT, "TEXT": FieldType.TEXT, "HKText": FieldType.TEXT, "hktext": FieldType.TEXT, "HKTEXT": FieldType.TEXT, "keyword": FieldType.KEYWORD, "KEYWORD": FieldType.KEYWORD, "LITERAL": FieldType.KEYWORD, "text_embedding": FieldType.TEXT_EMBEDDING, "TEXT_EMBEDDING": FieldType.TEXT_EMBEDDING, "EMBEDDING": FieldType.TEXT_EMBEDDING, "image_embedding": FieldType.IMAGE_EMBEDDING, "IMAGE_EMBEDDING": FieldType.IMAGE_EMBEDDING, "int": FieldType.INT, "INT": FieldType.INT, "long": FieldType.LONG, "LONG": FieldType.LONG, "float": FieldType.FLOAT, "FLOAT": FieldType.FLOAT, "double": FieldType.DOUBLE, "DOUBLE": FieldType.DOUBLE, "date": FieldType.DATE, "DATE": FieldType.DATE, "boolean": FieldType.BOOLEAN, "BOOLEAN": FieldType.BOOLEAN, "json": FieldType.JSON, "JSON": FieldType.JSON, } # Mapping of analyzer strings to AnalyzerType enum ANALYZER_MAP = { "chinese": AnalyzerType.CHINESE_ECOMMERCE, "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE, "index_ansj": AnalyzerType.CHINESE_ECOMMERCE, "hanlp_index": AnalyzerType.CHINESE_ECOMMERCE, # Alias for index_ansj "hanlp_standard": AnalyzerType.CHINESE_ECOMMERCE_QUERY, # Alias for query_ansj "query_ansj": AnalyzerType.CHINESE_ECOMMERCE_QUERY, "english": AnalyzerType.ENGLISH, "arabic": AnalyzerType.ARABIC, "spanish": AnalyzerType.SPANISH, "russian": AnalyzerType.RUSSIAN, "japanese": AnalyzerType.JAPANESE, "standard": AnalyzerType.STANDARD, "keyword": AnalyzerType.KEYWORD, }