indexing_utils.py 4.32 KB
"""
索引工具函数。

提取公共逻辑,避免代码重复。
"""

import logging
from typing import Dict, Any, Optional
from sqlalchemy import Engine, text
from config import ConfigLoader
from config.tenant_config_loader import get_tenant_config_loader
from indexer.document_transformer import SPUDocumentTransformer

logger = logging.getLogger(__name__)


def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
    """
    Load the category-ID -> category-name lookup table.

    The query carries no tenant filter, so the resulting mapping is global
    and shared by all tenants. Only rows with ``deleted = 0`` and a non-NULL
    ``category_id`` are read from ``shoplazza_product_spu``.

    Args:
        db_engine: SQLAlchemy engine used to open the connection.

    Returns:
        Dictionary keyed by the category ID (normalized through ``int`` and
        rendered as a string) whose values are the category names exactly as
        stored. Rows whose name is empty or whitespace-only are skipped with
        a warning.

    Raises:
        RuntimeError: If anything fails while querying or processing rows;
            the underlying exception is attached as the cause.
    """
    category_sql = text("""
        SELECT DISTINCT
            category_id,
            category
        FROM shoplazza_product_spu
        WHERE deleted = 0 AND category_id IS NOT NULL
    """)

    id_to_name: Dict[str, str] = {}
    try:
        with db_engine.connect() as connection:
            for record in connection.execute(category_sql):
                # Normalize numeric IDs (e.g. Decimal / float) to a plain integer string.
                key = str(int(record.category_id))
                name = record.category

                if name and name.strip():
                    id_to_name[key] = name
                else:
                    logger.warning(f"Category ID {key} has empty name, skipping")
    except Exception as exc:
        logger.error(f"Failed to load category mapping: {exc}", exc_info=True)
        raise RuntimeError("Failed to load category mapping from MySQL") from exc

    return id_to_name


def create_document_transformer(
    category_id_to_name: Dict[str, str],
    tenant_id: str,
    searchable_option_dimensions: Optional[list] = None,
    translator: Optional[Any] = None,
    encoder: Optional[Any] = None,
    enable_title_embedding: bool = True,
    image_encoder: Optional[Any] = None,
    enable_image_embedding: bool = False,
    config: Optional[Any] = None,
) -> SPUDocumentTransformer:
    """
    Create an SPUDocumentTransformer with unified initialization logic.

    Dependencies the caller does not supply are derived from the search
    configuration. The configuration itself is loaded lazily: only when
    ``config`` was not passed in AND at least one value actually has to be
    read from it.

    Args:
        category_id_to_name: Mapping from category ID to category name.
        tenant_id: Tenant ID used to look up the tenant configuration.
        searchable_option_dimensions: Searchable option dimensions. If None,
            taken from ``config.spu_config.searchable_option_dimensions``.
        translator: Translator instance. If None, one is created from
            ``config.query_config`` only when the tenant's
            ``index_languages`` contains more than one entry; otherwise it
            stays None.
        encoder: Text encoder instance. If None and ``enable_title_embedding``
            is True, a TextEmbeddingEncoder is created only when
            ``config.query_config.enable_text_embedding`` is truthy.
        enable_title_embedding: Whether title embedding is enabled
            (default True).
        image_encoder: Optional image encoder (expected to implement
            ``encode_image_urls(urls)``).
        enable_image_embedding: Whether to populate image embeddings
            (default False).
        config: Pre-loaded search configuration. If None, it is loaded via
            ``ConfigLoader().load_config()`` when needed.

    Returns:
        The configured SPUDocumentTransformer instance.
    """
    # Load the tenant configuration.
    tenant_config = get_tenant_config_loader().get_tenant_config(tenant_id)

    # Work out which dependencies are genuinely missing before touching the
    # search configuration, so it is not loaded when nothing will read it.
    needs_translator = (
        translator is None
        and len(tenant_config.get("index_languages") or []) > 1
    )
    needs_encoder_decision = encoder is None and enable_title_embedding
    needs_config = (
        searchable_option_dimensions is None
        or needs_translator
        or needs_encoder_decision
    )

    if needs_config and config is None:
        config = ConfigLoader().load_config()

    if searchable_option_dimensions is None:
        searchable_option_dimensions = config.spu_config.searchable_option_dimensions

    if needs_translator:
        # Imported lazily so the provider module is only required when used.
        from providers import create_translation_provider

        translator = create_translation_provider(config.query_config)

    # Initialize the encoder only if title embedding is requested, no encoder
    # was supplied, and text embedding is enabled in the configuration.
    if needs_encoder_decision and config.query_config.enable_text_embedding:
        from embeddings.text_encoder import TextEmbeddingEncoder

        encoder = TextEmbeddingEncoder()
        logger.info("TextEmbeddingEncoder initialized for title embedding")

    return SPUDocumentTransformer(
        category_id_to_name=category_id_to_name,
        searchable_option_dimensions=searchable_option_dimensions,
        tenant_config=tenant_config,
        translator=translator,
        encoder=encoder,
        enable_title_embedding=enable_title_embedding,
        image_encoder=image_encoder,
        enable_image_embedding=enable_image_embedding,
    )