"""
Indexing utility functions.

Shared helpers extracted here to avoid duplicating logic across indexers.
"""

import logging
from typing import Dict, Any, Optional

from sqlalchemy import Engine, text

from config import ConfigLoader
from config.tenant_config_loader import get_tenant_config_loader
from indexer.document_transformer import SPUDocumentTransformer

logger = logging.getLogger(__name__)


def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
    """
    Load the category-ID -> category-name mapping.

    The query is not filtered by tenant, so the resulting mapping is global.
    Loading is best-effort: a database failure is logged and whatever was
    collected so far (possibly an empty dict) is returned instead of raising.

    Args:
        db_engine: SQLAlchemy database engine.

    Returns:
        Dictionary mapping category_id (as a decimal string) to category name.
        Rows with a non-integer ID or a blank name are skipped.
    """
    query = text("""
        SELECT DISTINCT
            category_id,
            category
        FROM shoplazza_product_spu
        WHERE deleted = 0 AND category_id IS NOT NULL
    """)

    mapping: Dict[str, str] = {}
    try:
        with db_engine.connect() as conn:
            for row in conn.execute(query):
                # Normalise the ID per row so that one malformed value only
                # drops that row rather than aborting the whole load.
                try:
                    category_id = str(int(row.category_id))
                except (TypeError, ValueError, OverflowError):
                    logger.warning(
                        "Category ID %r is not a valid integer, skipping",
                        row.category_id,
                    )
                    continue

                category_name = row.category
                if not category_name or not category_name.strip():
                    logger.warning(
                        "Category ID %s has empty name, skipping", category_id
                    )
                    continue

                mapping[category_id] = category_name
    except Exception as e:
        # Best-effort boundary: callers receive a partial/empty mapping.
        logger.error("Failed to load category mapping: %s", e, exc_info=True)

    return mapping


def create_document_transformer(
    category_id_to_name: Dict[str, str],
    tenant_id: str,
    searchable_option_dimensions: Optional[list] = None,
    translator: Optional[Any] = None,
    translation_prompts: Optional[Dict[str, str]] = None,
    encoder: Optional[Any] = None,
    enable_title_embedding: bool = True
) -> SPUDocumentTransformer:
    """
    Create an SPUDocumentTransformer with unified initialisation logic.

    Any optional dependency that is not supplied is filled in from the
    application config. The config is only loaded when at least one of them
    is missing.

    Args:
        category_id_to_name: Mapping of category ID to category name.
        tenant_id: Tenant ID used to look up the tenant configuration.
        searchable_option_dimensions: Searchable option dimensions; loaded
            from config when None.
        translator: Translator instance; when None, one is created only if
            the tenant config enables translate_to_en or translate_to_zh.
        translation_prompts: Translation prompt configuration; loaded from
            config when None.
        encoder: Text encoder instance; when None and enable_title_embedding
            is True, a BgeEncoder is created if the config enables text
            embedding.
        enable_title_embedding: Whether title embedding is enabled
            (default True). Forced to False if the encoder fails to
            initialise.

    Returns:
        A configured SPUDocumentTransformer instance.
    """
    # Load the tenant configuration.
    tenant_config_loader = get_tenant_config_loader()
    tenant_config = tenant_config_loader.get_tenant_config(tenant_id)

    # The search config is only needed when the caller left something out.
    needs_config = (
        searchable_option_dimensions is None
        or translator is None
        or translation_prompts is None
        or (encoder is None and enable_title_embedding)
    )

    if needs_config:
        try:
            config = ConfigLoader().load_config()

            if searchable_option_dimensions is None:
                searchable_option_dimensions = (
                    config.spu_config.searchable_option_dimensions
                )

            # A translator is needed as soon as either translation direction
            # is enabled for this tenant.
            translate_to_en = bool(tenant_config.get("translate_to_en"))
            translate_to_zh = bool(tenant_config.get("translate_to_zh"))

            if translator is None and (translate_to_en or translate_to_zh):
                # Guarded separately (like the encoder below) so a translator
                # failure is reported as such and does not discard the
                # config values that loaded successfully.
                try:
                    from query.translator import Translator
                    translator = Translator(
                        api_key=config.query_config.translation_api_key,
                        use_cache=True,
                        glossary_id=config.query_config.translation_glossary_id,
                        translation_context=config.query_config.translation_context
                    )
                except Exception as e:
                    logger.warning(
                        "Failed to initialize Translator: %s, "
                        "translation will be disabled",
                        e,
                    )
                    translator = None

            if translation_prompts is None:
                translation_prompts = config.query_config.translation_prompts

            # Initialise the encoder when title embedding is requested and
            # the caller did not provide one.
            if (
                encoder is None
                and enable_title_embedding
                and config.query_config.enable_text_embedding
            ):
                try:
                    from embeddings.text_encoder import BgeEncoder
                    encoder = BgeEncoder()
                    logger.info("BgeEncoder initialized for title embedding")
                except Exception as e:
                    logger.warning(
                        "Failed to initialize BgeEncoder: %s, "
                        "title embedding will be disabled",
                        e,
                    )
                    enable_title_embedding = False
        except Exception as e:
            # Config could not be loaded/read: fall back to built-in defaults
            # for whatever is still missing.
            logger.warning("Failed to load config, using defaults: %s", e)
            if searchable_option_dimensions is None:
                searchable_option_dimensions = ['option1', 'option2', 'option3']
            if translation_prompts is None:
                translation_prompts = {}

    # NOTE(review): encoder may still be None here while
    # enable_title_embedding is True (e.g. text embedding disabled in
    # config) - confirm SPUDocumentTransformer tolerates that combination.
    return SPUDocumentTransformer(
        category_id_to_name=category_id_to_name,
        searchable_option_dimensions=searchable_option_dimensions,
        tenant_config=tenant_config,
        translator=translator,
        translation_prompts=translation_prompts,
        encoder=encoder,
        enable_title_embedding=enable_title_embedding
    )