Blame view

indexer/indexing_utils.py 5.18 KB
3c1f8031   tangwang   api/routes/indexe...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
  """
  索引工具函数。
  
  提取公共逻辑,避免代码重复。
  """
  
  import logging
  from typing import Dict, Any, Optional
  from sqlalchemy import Engine, text
  from config import ConfigLoader
  from config.tenant_config_loader import get_tenant_config_loader
  from indexer.document_transformer import SPUDocumentTransformer
  
  logger = logging.getLogger(__name__)
  
  
  def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
      """
      Load the category-ID -> category-name mapping (global, shared by all tenants).

      Rows whose name is empty or whitespace-only are skipped with a warning.
      Any exception raised while querying or converting a row is logged and
      swallowed; whatever was collected up to that point is returned.

      Args:
          db_engine: SQLAlchemy database engine

      Returns:
          Dictionary mapping category_id (stringified integer) to category_name
      """
      query = text("""
          SELECT DISTINCT
              category_id,
              category
          FROM shoplazza_product_spu
          WHERE deleted = 0 AND category_id IS NOT NULL
      """)

      names_by_id: Dict[str, str] = {}
      try:
          with db_engine.connect() as connection:
              for record in connection.execute(query):
                  # Normalise the key to a plain integer string before the
                  # name check, so a malformed ID fails first.
                  category_id = str(int(record.category_id))
                  label = record.category

                  if label and label.strip():
                      names_by_id[category_id] = label
                  else:
                      logger.warning(f"Category ID {category_id} has empty name, skipping")
      except Exception as e:
          # Best effort: report the failure and return what was collected.
          logger.error(f"Failed to load category mapping: {e}", exc_info=True)

      return names_by_id
  
  
  def create_document_transformer(
      category_id_to_name: Dict[str, str],
      tenant_id: str,
      searchable_option_dimensions: Optional[list] = None,
      translator: Optional[Any] = None,
      translation_prompts: Optional[Dict[str, str]] = None,
      encoder: Optional[Any] = None,
      enable_title_embedding: bool = True
  ) -> SPUDocumentTransformer:
      """
      Create a document transformer (single, shared initialization path).

      The tenant configuration is always loaded. The global configuration is
      loaded only when at least one optional dependency was not supplied by the
      caller; values supplied by the caller are never overwritten.

      Args:
          category_id_to_name: Mapping from category ID to category name.
          tenant_id: Tenant ID used to look up the tenant configuration.
          searchable_option_dimensions: Searchable option dimensions. If None,
              taken from the configuration, or ['option1', 'option2', 'option3']
              when the configuration cannot be loaded.
          translator: Translator instance. If None, one is created only when the
              tenant configuration enables "translate_to_en" or "translate_to_zh".
          translation_prompts: Translation prompt configuration. If None, taken
              from the configuration, or {} when the configuration cannot be loaded.
          encoder: Text encoder instance. If None and enable_title_embedding is
              True, a BgeEncoder is created when the configuration enables text
              embedding.
          enable_title_embedding: Whether title embedding is enabled (default
              True). Forced to False if the BgeEncoder cannot be initialized.

      Returns:
          SPUDocumentTransformer instance
      """
      # Load the tenant configuration.
      tenant_config_loader = get_tenant_config_loader()
      tenant_config = tenant_config_loader.get_tenant_config(tenant_id)

      # Only touch the global configuration if something is actually missing.
      needs_config = (
          searchable_option_dimensions is None
          or translator is None
          or translation_prompts is None
          or (encoder is None and enable_title_embedding)
      )
      if needs_config:
          try:
              config_loader = ConfigLoader()
              config = config_loader.load_config()

              if searchable_option_dimensions is None:
                  searchable_option_dimensions = config.spu_config.searchable_option_dimensions

              # A translator is needed as soon as translation is enabled in
              # either direction for this tenant.
              translate_to_en = bool(tenant_config.get("translate_to_en"))
              translate_to_zh = bool(tenant_config.get("translate_to_zh"))

              if translator is None and (translate_to_en or translate_to_zh):
                  # Isolated like the encoder below: a translator failure must
                  # not be reported as a config failure, nor discard the
                  # configured prompts or skip encoder initialization.
                  try:
                      from query.translator import Translator
                      translator = Translator(
                          api_key=config.query_config.translation_api_key,
                          use_cache=True,
                          glossary_id=config.query_config.translation_glossary_id,
                          translation_context=config.query_config.translation_context
                      )
                  except Exception as e:
                      logger.warning(f"Failed to initialize Translator: {e}, continuing without translator")
                      translator = None

              if translation_prompts is None:
                  translation_prompts = config.query_config.translation_prompts

              # Initialize the encoder when title embedding is requested and
              # the caller did not provide one.
              if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
                  try:
                      from embeddings.text_encoder import BgeEncoder
                      encoder = BgeEncoder()
                      logger.info("BgeEncoder initialized for title embedding")
                  except Exception as e:
                      logger.warning(f"Failed to initialize BgeEncoder: {e}, title embedding will be disabled")
                      enable_title_embedding = False
          except Exception as e:
              # Best effort: fall back to defaults for anything still unset.
              logger.warning(f"Failed to load config, using defaults: {e}")
              if searchable_option_dimensions is None:
                  searchable_option_dimensions = ['option1', 'option2', 'option3']
              if translation_prompts is None:
                  translation_prompts = {}

      return SPUDocumentTransformer(
          category_id_to_name=category_id_to_name,
          searchable_option_dimensions=searchable_option_dimensions,
          tenant_config=tenant_config,
          translator=translator,
          translation_prompts=translation_prompts,
          encoder=encoder,
          enable_title_embedding=enable_title_embedding
      )