""" 索引工具函数。 提取公共逻辑,避免代码重复。 """ import logging from typing import Dict, Any, Optional from sqlalchemy import Engine, text from config import ConfigLoader from config.tenant_config_loader import get_tenant_config_loader from indexer.document_transformer import SPUDocumentTransformer logger = logging.getLogger(__name__) def load_category_mapping(db_engine: Engine) -> Dict[str, str]: """ 加载分类ID到名称的映射(全局,所有租户共享)。 Args: db_engine: SQLAlchemy database engine Returns: Dictionary mapping category_id to category_name """ query = text(""" SELECT DISTINCT category_id, category FROM shoplazza_product_spu WHERE deleted = 0 AND category_id IS NOT NULL """) mapping = {} try: with db_engine.connect() as conn: result = conn.execute(query) for row in result: category_id = str(int(row.category_id)) category_name = row.category if not category_name or not category_name.strip(): logger.warning(f"Category ID {category_id} has empty name, skipping") continue mapping[category_id] = category_name except Exception as e: logger.error(f"Failed to load category mapping: {e}", exc_info=True) raise RuntimeError("Failed to load category mapping from MySQL") from e return mapping def create_document_transformer( category_id_to_name: Dict[str, str], tenant_id: str, searchable_option_dimensions: Optional[list] = None, translator: Optional[Any] = None, encoder: Optional[Any] = None, enable_title_embedding: bool = True, image_encoder: Optional[Any] = None, enable_image_embedding: bool = False, config: Optional[Any] = None, ) -> SPUDocumentTransformer: """ 创建文档转换器(统一初始化逻辑)。 Args: category_id_to_name: 分类ID到名称的映射 tenant_id: 租户ID searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) translator: 翻译器实例(如果为None则根据配置初始化) encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) enable_title_embedding: 是否启用标题向量化(默认True) image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) enable_image_embedding: 是否启用 image_embedding 填充(默认False) Returns: SPUDocumentTransformer实例 """ # 加载租户配置 tenant_config_loader = get_tenant_config_loader() tenant_config = tenant_config_loader.get_tenant_config(tenant_id) # 加载搜索配置(如果需要) if ( searchable_option_dimensions is None or translator is None or (encoder is None and enable_title_embedding) or config is None ): if config is None: config_loader = ConfigLoader() config = config_loader.load_config() if searchable_option_dimensions is None: searchable_option_dimensions = config.spu_config.searchable_option_dimensions index_langs = tenant_config.get("index_languages") or [] need_translator = len(index_langs) > 1 if translator is None and need_translator: from providers import create_translation_provider translator = create_translation_provider(config.query_config) # 初始化encoder(如果启用标题向量化且未提供encoder) if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: from embeddings.text_encoder import TextEmbeddingEncoder encoder = TextEmbeddingEncoder() logger.info("TextEmbeddingEncoder initialized for title embedding") return SPUDocumentTransformer( category_id_to_name=category_id_to_name, searchable_option_dimensions=searchable_option_dimensions, tenant_config=tenant_config, translator=translator, encoder=encoder, enable_title_embedding=enable_title_embedding, image_encoder=image_encoder, enable_image_embedding=enable_image_embedding, )