Blame view

indexer/indexing_utils.py 4.32 KB
3c1f8031   tangwang   api/routes/indexe...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
  """
  索引工具函数。
  
  提取公共逻辑,避免代码重复。
  """
  
  import logging
  from typing import Dict, Any, Optional
  from sqlalchemy import Engine, text
  from config import ConfigLoader
  from config.tenant_config_loader import get_tenant_config_loader
  from indexer.document_transformer import SPUDocumentTransformer
  
  logger = logging.getLogger(__name__)
  
  
  def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
      """
      Load the category-ID-to-name mapping (global, shared by all tenants).

      Selects the distinct (category_id, category) pairs from the
      non-deleted rows of ``shoplazza_product_spu`` whose category_id is
      not NULL. Rows whose category name is empty or whitespace-only are
      skipped with a warning. If the same category_id appears with several
      names, the last row returned by the query wins.

      Args:
          db_engine: SQLAlchemy database engine.

      Returns:
          Dictionary mapping category_id (as a string) to category name.

      Raises:
          RuntimeError: If connecting, querying, or processing a row fails;
              the underlying exception is chained as the cause.
      """
      query = text("""
          SELECT DISTINCT
              category_id,
              category
          FROM shoplazza_product_spu
          WHERE deleted = 0 AND category_id IS NOT NULL
      """)

      mapping: Dict[str, str] = {}
      try:
          with db_engine.connect() as conn:
              result = conn.execute(query)
              for row in result:
                  # Round-trip through int() before str(); presumably to get one
                  # canonical key form regardless of the driver's numeric type
                  # -- TODO confirm.
                  category_id = str(int(row.category_id))
                  category_name = row.category

                  # Skip rows without a usable name; the stored name itself is
                  # kept unstripped.
                  if not category_name or not category_name.strip():
                      logger.warning(f"Category ID {category_id} has empty name, skipping")
                      continue

                  mapping[category_id] = category_name
      except Exception as e:
          # Log with traceback, then surface one exception type to callers.
          logger.error(f"Failed to load category mapping: {e}", exc_info=True)
          raise RuntimeError("Failed to load category mapping from MySQL") from e

      return mapping
  
  
  def create_document_transformer(
      category_id_to_name: Dict[str, str],
      tenant_id: str,
      searchable_option_dimensions: Optional[list] = None,
      translator: Optional[Any] = None,
      encoder: Optional[Any] = None,
      enable_title_embedding: bool = True,
      image_encoder: Optional[Any] = None,
      enable_image_embedding: bool = False,
      config: Optional[Any] = None,
  ) -> SPUDocumentTransformer:
      """
      Create a document transformer (single, shared initialisation path).

      Args:
          category_id_to_name: Mapping from category ID to category name.
          tenant_id: Tenant ID used to look up the tenant configuration.
          searchable_option_dimensions: List of searchable option dimensions;
              when None it is read from ``config.spu_config``.
          translator: Translator instance; when None one is created from
              ``config.query_config``, but only if the tenant configuration
              lists more than one entry under ``index_languages``.
          encoder: Text encoder instance; when None and
              ``enable_title_embedding`` is True, a ``TextEmbeddingEncoder``
              is created provided ``config.query_config.enable_text_embedding``
              is truthy.
          enable_title_embedding: Whether title embedding is enabled
              (default True).
          image_encoder: Optional image encoder (expected to implement
              ``encode_image_urls(urls)``); passed through unchanged.
          enable_image_embedding: Whether to populate ``image_embedding``
              (default False).
          config: Application configuration; when None it is loaded through
              ``ConfigLoader``.

      Returns:
          SPUDocumentTransformer instance.
      """
      # Load the tenant configuration
      tenant_config_loader = get_tenant_config_loader()
      tenant_config = tenant_config_loader.get_tenant_config(tenant_id)

      # Fill in whatever the caller did not supply. Note that a missing
      # ``config`` alone is enough to enter this branch and load it.
      if (
          searchable_option_dimensions is None
          or translator is None
          or (encoder is None and enable_title_embedding)
          or config is None
      ):
          if config is None:
              config_loader = ConfigLoader()
              config = config_loader.load_config()

          if searchable_option_dimensions is None:
              searchable_option_dimensions = config.spu_config.searchable_option_dimensions

          # A translator is only created when the tenant indexes more than
          # one language.
          index_langs = tenant_config.get("index_languages") or []
          need_translator = len(index_langs) > 1
          if translator is None and need_translator:
              # Imported here rather than at module level, as in the original.
              from providers import create_translation_provider

              translator = create_translation_provider(config.query_config)

          # Initialise the encoder (title embedding enabled and none supplied)
          if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
              from embeddings.text_encoder import TextEmbeddingEncoder

              encoder = TextEmbeddingEncoder()
              logger.info("TextEmbeddingEncoder initialized for title embedding")

      return SPUDocumentTransformer(
          category_id_to_name=category_id_to_name,
          searchable_option_dimensions=searchable_option_dimensions,
          tenant_config=tenant_config,
          translator=translator,
          encoder=encoder,
          enable_title_embedding=enable_title_embedding,
          image_encoder=image_encoder,
          enable_image_embedding=enable_image_embedding,
      )