Commit 453992a87dd623affa8475b9d7fdbc7abaadf431

Authored by tangwang
1 parent b735cced

需求:

索引的两项功能:
1. 多语言。 店铺配置的语言如果不等于zh,那么要调用翻译 获得中文翻译结果,同时 如果不等于en,要翻译en的结果。
要缓存到redis。 先查询缓存,没命中缓存再调用翻译,然后存入redis缓存起来。
这些逻辑应该是 @query/translator.py 内部的,不需要调用的地方关心。但是现在是  DictCache,直接改掉,改为redis的缓存

2. 填充 标题的向量化字段。如果该店铺的标题向量化打开,那么应该请求向量化模型根据英文的title得到embedding。使用 BgeEncoder.

以上两个模块的缓存,过期时间都是 最近多长时间内没有访问过。

feat:
1. 更新 REDIS_CONFIG 配置
在 config/env_config.py 中添加了用户提供的配置项(snapshot_db, translation_cache_expire_days, translation_cache_prefix 等)
2. 修改 query/translator.py
将 DictCache 改为 Redis 缓存
实现了 translate_for_indexing 方法,自动处理多语言翻译:
如果店铺语言不等于 zh,自动翻译成 zh
如果店铺语言不等于 en,自动翻译成 en
翻译逻辑封装在 translator.py 内部,调用方无需关心
3. 修改 embeddings/text_encoder.py
在 BgeEncoder 中添加了 Redis 缓存
实现了滑动过期策略(每次访问时重置过期时间)
缓存逻辑参考了提供的 CacheManager 对象
4. 修改 indexer/document_transformer.py
添加了 encoder 和 enable_title_embedding 参数
实现了 _fill_title_embedding 方法,使用英文标题(title_en)生成 embedding
更新了 _fill_text_fields 方法,使用新的 translate_for_indexing 方法
5. 更新 indexer/indexing_utils.py
更新了 create_document_transformer 函数,支持新的 encoder 和 enable_title_embedding 参数
如果启用标题向量化且未提供 encoder,会自动初始化 BgeEncoder
config/env_config.py
@@ -26,7 +26,14 @@ ES_CONFIG = { @@ -26,7 +26,14 @@ ES_CONFIG = {
26 REDIS_CONFIG = { 26 REDIS_CONFIG = {
27 'host': os.getenv('REDIS_HOST', 'localhost'), 27 'host': os.getenv('REDIS_HOST', 'localhost'),
28 'port': int(os.getenv('REDIS_PORT', 6479)), 28 'port': int(os.getenv('REDIS_PORT', 6479)),
29 - 'password': os.getenv('REDIS_PASSWORD'), 29 + 'snapshot_db': int(os.getenv('REDIS_SNAPSHOT_DB', 0)),
  30 + 'password': os.getenv('REDIS_PASSWORD', 'BMfv5aI31kgHWtlx'),
  31 + 'socket_timeout': int(os.getenv('REDIS_SOCKET_TIMEOUT', 1)),
  32 + 'socket_connect_timeout': int(os.getenv('REDIS_SOCKET_CONNECT_TIMEOUT', 1)),
  33 + 'retry_on_timeout': os.getenv('REDIS_RETRY_ON_TIMEOUT', 'False').lower() == 'true',
  34 + 'cache_expire_days': int(os.getenv('REDIS_CACHE_EXPIRE_DAYS', 180)), # 6 months
  35 + 'translation_cache_expire_days': int(os.getenv('REDIS_TRANSLATION_CACHE_EXPIRE_DAYS', 360)),
  36 + 'translation_cache_prefix': os.getenv('REDIS_TRANSLATION_CACHE_PREFIX', 'trans'),
30 } 37 }
31 38
32 # DeepL API Key 39 # DeepL API Key
embeddings/text_encoder.py
@@ -9,11 +9,20 @@ import requests @@ -9,11 +9,20 @@ import requests
9 import time 9 import time
10 import threading 10 import threading
11 import numpy as np 11 import numpy as np
  12 +import pickle
  13 +import redis
  14 +from datetime import timedelta
  15 +from typing import List, Union, Dict, Any, Optional
12 import logging 16 import logging
13 -from typing import List, Union, Dict, Any  
14 17
15 logger = logging.getLogger(__name__) 18 logger = logging.getLogger(__name__)
16 19
  20 +# Try to import REDIS_CONFIG, but allow import to fail
  21 +try:
  22 + from config.env_config import REDIS_CONFIG
  23 +except ImportError:
  24 + REDIS_CONFIG = {}
  25 +
17 26
18 class BgeEncoder: 27 class BgeEncoder:
19 """ 28 """
@@ -31,6 +40,26 @@ class BgeEncoder: @@ -31,6 +40,26 @@ class BgeEncoder:
31 logger.info(f"Creating BgeEncoder instance with service URL: {service_url}") 40 logger.info(f"Creating BgeEncoder instance with service URL: {service_url}")
32 cls._instance.service_url = service_url 41 cls._instance.service_url = service_url
33 cls._instance.endpoint = f"{service_url}/embedding/generate_embeddings" 42 cls._instance.endpoint = f"{service_url}/embedding/generate_embeddings"
  43 +
  44 + # Initialize Redis cache
  45 + try:
  46 + cls._instance.redis_client = redis.Redis(
  47 + host=REDIS_CONFIG.get('host', 'localhost'),
  48 + port=REDIS_CONFIG.get('port', 6479),
  49 + password=REDIS_CONFIG.get('password'),
  50 + decode_responses=False, # Keep binary data as is
  51 + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
  52 + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
  53 + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
  54 + health_check_interval=10 # 避免复用坏连接
  55 + )
  56 + # Test connection
  57 + cls._instance.redis_client.ping()
  58 + cls._instance.expire_time = timedelta(days=REDIS_CONFIG.get('cache_expire_days', 180))
  59 + logger.info("Redis cache initialized for embeddings")
  60 + except Exception as e:
  61 + logger.warning(f"Failed to initialize Redis cache for embeddings: {e}, continuing without cache")
  62 + cls._instance.redis_client = None
34 return cls._instance 63 return cls._instance
35 64
36 def _call_service(self, request_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 65 def _call_service(self, request_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -63,7 +92,7 @@ class BgeEncoder: @@ -63,7 +92,7 @@ class BgeEncoder:
63 batch_size: int = 32 92 batch_size: int = 32
64 ) -> np.ndarray: 93 ) -> np.ndarray:
65 """ 94 """
66 - Encode text into embeddings via network service. 95 + Encode text into embeddings via network service with Redis caching.
67 96
68 Args: 97 Args:
69 sentences: Single string or list of strings to encode 98 sentences: Single string or list of strings to encode
@@ -78,11 +107,24 @@ class BgeEncoder: @@ -78,11 +107,24 @@ class BgeEncoder:
78 if isinstance(sentences, str): 107 if isinstance(sentences, str):
79 sentences = [sentences] 108 sentences = [sentences]
80 109
81 - # Prepare request data  
82 - request_data = [] 110 + # Check cache first
  111 + cached_embeddings = []
  112 + uncached_indices = []
  113 + uncached_texts = []
  114 +
83 for i, text in enumerate(sentences): 115 for i, text in enumerate(sentences):
  116 + cached = self._get_cached_embedding(text, 'en') # Use 'en' as default language for title embedding
  117 + if cached is not None:
  118 + cached_embeddings.append((i, cached))
  119 + else:
  120 + uncached_indices.append(i)
  121 + uncached_texts.append(text)
  122 +
  123 + # Prepare request data for uncached texts
  124 + request_data = []
  125 + for i, text in enumerate(uncached_texts):
84 request_item = { 126 request_item = {
85 - "id": str(i), 127 + "id": str(uncached_indices[i]),
86 "name_zh": text 128 "name_zh": text
87 } 129 }
88 130
@@ -93,43 +135,58 @@ class BgeEncoder: @@ -93,43 +135,58 @@ class BgeEncoder:
93 135
94 request_data.append(request_item) 136 request_data.append(request_item)
95 137
96 - try:  
97 - # Call service  
98 - response_data = self._call_service(request_data)  
99 -  
100 - # Process response  
101 - embeddings = []  
102 - for i, text in enumerate(sentences):  
103 - # Find corresponding response by ID  
104 - response_item = None  
105 - for item in response_data:  
106 - if str(item.get("id")) == str(i):  
107 - response_item = item  
108 - break  
109 -  
110 - if response_item:  
111 - # Try Chinese embedding first, then English, then Russian  
112 - embedding = None  
113 - for lang in ["embedding_zh", "embedding_en", "embedding_ru"]:  
114 - if lang in response_item and response_item[lang] is not None:  
115 - embedding = response_item[lang] 138 + # Process response
  139 + embeddings = [None] * len(sentences)
  140 +
  141 + # Fill in cached embeddings
  142 + for idx, cached_emb in cached_embeddings:
  143 + embeddings[idx] = cached_emb
  144 +
  145 + # If there are uncached texts, call service
  146 + if uncached_texts:
  147 + try:
  148 + # Call service
  149 + response_data = self._call_service(request_data)
  150 +
  151 + # Process response
  152 + for i, text in enumerate(uncached_texts):
  153 + original_idx = uncached_indices[i]
  154 + # Find corresponding response by ID
  155 + response_item = None
  156 + for item in response_data:
  157 + if str(item.get("id")) == str(original_idx):
  158 + response_item = item
116 break 159 break
117 160
118 - if embedding is not None:  
119 - embeddings.append(embedding) 161 + if response_item:
  162 + # Try Chinese embedding first, then English, then Russian
  163 + embedding = None
  164 + for lang in ["embedding_zh", "embedding_en", "embedding_ru"]:
  165 + if lang in response_item and response_item[lang] is not None:
  166 + embedding = response_item[lang]
  167 + break
  168 +
  169 + if embedding is not None:
  170 + embedding_array = np.array(embedding, dtype=np.float32)
  171 + embeddings[original_idx] = embedding_array
  172 + # Cache the embedding
  173 + self._set_cached_embedding(text, 'en', embedding_array)
  174 + else:
  175 + logger.warning(f"No embedding found for text {original_idx}: {text[:50]}...")
  176 + embeddings[original_idx] = np.zeros(1024, dtype=np.float32)
120 else: 177 else:
121 - logger.warning(f"No embedding found for text {i}: {text[:50]}...")  
122 - embeddings.append([0.0] * 1024)  
123 - else:  
124 - logger.warning(f"No response found for text {i}")  
125 - embeddings.append([0.0] * 1024)  
126 -  
127 - return np.array(embeddings, dtype=np.float32)  
128 -  
129 - except Exception as e:  
130 - logger.error(f"Failed to encode texts: {e}", exc_info=True)  
131 - # Return zero embeddings as fallback  
132 - return np.zeros((len(sentences), 1024), dtype=np.float32) 178 + logger.warning(f"No response found for text {original_idx}")
  179 + embeddings[original_idx] = np.zeros(1024, dtype=np.float32)
  180 +
  181 + except Exception as e:
  182 + logger.error(f"Failed to encode texts: {e}", exc_info=True)
  183 + # Fill missing embeddings with zeros
  184 + for idx in uncached_indices:
  185 + if embeddings[idx] is None:
  186 + embeddings[idx] = np.zeros(1024, dtype=np.float32)
  187 +
  188 + # Convert to numpy array
  189 + return np.array(embeddings, dtype=np.float32)
133 190
134 def encode_batch( 191 def encode_batch(
135 self, 192 self,
@@ -149,3 +206,48 @@ class BgeEncoder: @@ -149,3 +206,48 @@ class BgeEncoder:
149 numpy array of embeddings 206 numpy array of embeddings
150 """ 207 """
151 return self.encode(texts, batch_size=batch_size, device=device) 208 return self.encode(texts, batch_size=batch_size, device=device)
  209 +
  210 + def _get_cache_key(self, query: str, language: str) -> str:
  211 + """Generate a cache key for the query"""
  212 + return f"embedding:{language}:{query}"
  213 +
  214 + def _get_cached_embedding(self, query: str, language: str) -> Optional[np.ndarray]:
  215 + """Get embedding from cache if exists (with sliding expiration)"""
  216 + if not self.redis_client:
  217 + return None
  218 +
  219 + try:
  220 + cache_key = self._get_cache_key(query, language)
  221 + cached_data = self.redis_client.get(cache_key)
  222 + if cached_data:
  223 + logger.debug(f"Cache hit for embedding: {query}")
  224 + # Update expiration time on access (sliding expiration)
  225 + self.redis_client.expire(cache_key, self.expire_time)
  226 + return pickle.loads(cached_data)
  227 + return None
  228 + except Exception as e:
  229 + logger.error(f"Error retrieving embedding from cache: {e}")
  230 + return None
  231 +
  232 + def _set_cached_embedding(self, query: str, language: str, embedding: np.ndarray) -> bool:
  233 + """Store embedding in cache"""
  234 + if not self.redis_client:
  235 + return False
  236 +
  237 + try:
  238 + cache_key = self._get_cache_key(query, language)
  239 + serialized_data = pickle.dumps(embedding)
  240 + self.redis_client.setex(
  241 + cache_key,
  242 + self.expire_time,
  243 + serialized_data
  244 + )
  245 + logger.debug(f"Successfully cached embedding for query: {query}")
  246 + return True
  247 + except (redis.exceptions.BusyLoadingError, redis.exceptions.ConnectionError,
  248 + redis.exceptions.TimeoutError, redis.exceptions.RedisError) as e:
  249 + logger.warning(f"Redis error storing embedding in cache: {e}")
  250 + return False
  251 + except Exception as e:
  252 + logger.error(f"Error storing embedding in cache: {e}")
  253 + return False
indexer/document_transformer.py
@@ -29,7 +29,9 @@ class SPUDocumentTransformer: @@ -29,7 +29,9 @@ class SPUDocumentTransformer:
29 searchable_option_dimensions: List[str], 29 searchable_option_dimensions: List[str],
30 tenant_config: Optional[Dict[str, Any]] = None, 30 tenant_config: Optional[Dict[str, Any]] = None,
31 translator: Optional[Any] = None, 31 translator: Optional[Any] = None,
32 - translation_prompts: Optional[Dict[str, str]] = None 32 + translation_prompts: Optional[Dict[str, str]] = None,
  33 + encoder: Optional[Any] = None,
  34 + enable_title_embedding: bool = True
33 ): 35 ):
34 """ 36 """
35 初始化文档转换器。 37 初始化文档转换器。
@@ -40,12 +42,16 @@ class SPUDocumentTransformer: @@ -40,12 +42,16 @@ class SPUDocumentTransformer:
40 tenant_config: 租户配置(包含主语言和翻译配置) 42 tenant_config: 租户配置(包含主语言和翻译配置)
41 translator: 翻译器实例(可选,如果提供则启用翻译功能) 43 translator: 翻译器实例(可选,如果提供则启用翻译功能)
42 translation_prompts: 翻译提示词配置(可选) 44 translation_prompts: 翻译提示词配置(可选)
  45 + encoder: 文本编码器实例(可选,用于生成title_embedding)
  46 + enable_title_embedding: 是否启用标题向量化(默认True)
43 """ 47 """
44 self.category_id_to_name = category_id_to_name 48 self.category_id_to_name = category_id_to_name
45 self.searchable_option_dimensions = searchable_option_dimensions 49 self.searchable_option_dimensions = searchable_option_dimensions
46 self.tenant_config = tenant_config or {} 50 self.tenant_config = tenant_config or {}
47 self.translator = translator 51 self.translator = translator
48 self.translation_prompts = translation_prompts or {} 52 self.translation_prompts = translation_prompts or {}
  53 + self.encoder = encoder
  54 + self.enable_title_embedding = enable_title_embedding
49 55
50 def transform_spu_to_doc( 56 def transform_spu_to_doc(
51 self, 57 self,
@@ -81,11 +87,13 @@ class SPUDocumentTransformer: @@ -81,11 +87,13 @@ class SPUDocumentTransformer:
81 87
82 # 获取租户配置 88 # 获取租户配置
83 primary_lang = self.tenant_config.get('primary_language', 'zh') 89 primary_lang = self.tenant_config.get('primary_language', 'zh')
84 - translate_to_en = self.tenant_config.get('translate_to_en', True)  
85 - translate_to_zh = self.tenant_config.get('translate_to_zh', False)  
86 90
87 - # 文本字段处理(根据主语言和翻译配置)  
88 - self._fill_text_fields(doc, spu_row, primary_lang, translate_to_en, translate_to_zh) 91 + # 文本字段处理(使用translator的内部逻辑自动处理多语言翻译)
  92 + self._fill_text_fields(doc, spu_row, primary_lang)
  93 +
  94 + # 标题向量化处理(如果启用)
  95 + if self.enable_title_embedding and self.encoder:
  96 + self._fill_title_embedding(doc)
89 97
90 # Tags 98 # Tags
91 if pd.notna(spu_row.get('tags')): 99 if pd.notna(spu_row.get('tags')):
@@ -160,114 +168,119 @@ class SPUDocumentTransformer: @@ -160,114 +168,119 @@ class SPUDocumentTransformer:
160 self, 168 self,
161 doc: Dict[str, Any], 169 doc: Dict[str, Any],
162 spu_row: pd.Series, 170 spu_row: pd.Series,
163 - primary_lang: str,  
164 - translate_to_en: bool,  
165 - translate_to_zh: bool 171 + primary_lang: str
166 ): 172 ):
167 - """填充文本字段(根据主语言和翻译配置)。"""  
168 - # 主语言字段  
169 - primary_suffix = '_zh' if primary_lang == 'zh' else '_en'  
170 - secondary_suffix = '_en' if primary_lang == 'zh' else '_zh'  
171 - 173 + """
  174 + 填充文本字段(根据主语言自动处理多语言翻译)。
  175 +
  176 + 翻译逻辑在translator内部处理:
  177 + - 如果店铺语言不等于zh,自动翻译成zh
  178 + - 如果店铺语言不等于en,自动翻译成en
  179 + """
172 # Title 180 # Title
173 if pd.notna(spu_row.get('title')): 181 if pd.notna(spu_row.get('title')):
174 title_text = str(spu_row['title']) 182 title_text = str(spu_row['title'])
175 - doc[f'title{primary_suffix}'] = title_text  
176 - # 如果需要翻译,调用翻译服务(同步模式)  
177 - if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):  
178 - if self.translator:  
179 - target_lang = 'en' if primary_lang == 'zh' else 'zh'  
180 - # 根据目标语言选择对应的提示词  
181 - if target_lang == 'zh':  
182 - prompt = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh')  
183 - else:  
184 - prompt = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en')  
185 - translated = self.translator.translate(  
186 - title_text,  
187 - target_lang=target_lang,  
188 - source_lang=primary_lang,  
189 - prompt=prompt  
190 - )  
191 - doc[f'title{secondary_suffix}'] = translated if translated else None  
192 - else:  
193 - doc[f'title{secondary_suffix}'] = None # 无翻译器,设为None 183 +
  184 + # 使用translator的translate_for_indexing方法,自动处理多语言翻译
  185 + if self.translator:
  186 + # 根据目标语言选择对应的提示词
  187 + prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh')
  188 + prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en')
  189 +
  190 + # 调用translate_for_indexing,自动处理翻译逻辑
  191 + translations = self.translator.translate_for_indexing(
  192 + title_text,
  193 + shop_language=primary_lang,
  194 + source_lang=primary_lang,
  195 + prompt=prompt_zh if primary_lang == 'zh' else prompt_en
  196 + )
  197 +
  198 + # 填充翻译结果
  199 + doc['title_zh'] = translations.get('zh') or (title_text if primary_lang == 'zh' else None)
  200 + doc['title_en'] = translations.get('en') or (title_text if primary_lang == 'en' else None)
194 else: 201 else:
195 - doc[f'title{secondary_suffix}'] = None 202 + # 无翻译器,只填充主语言字段
  203 + if primary_lang == 'zh':
  204 + doc['title_zh'] = title_text
  205 + doc['title_en'] = None
  206 + else:
  207 + doc['title_zh'] = None
  208 + doc['title_en'] = title_text
196 else: 209 else:
197 - doc[f'title{primary_suffix}'] = None  
198 - doc[f'title{secondary_suffix}'] = None 210 + doc['title_zh'] = None
  211 + doc['title_en'] = None
199 212
200 # Brief 213 # Brief
201 if pd.notna(spu_row.get('brief')): 214 if pd.notna(spu_row.get('brief')):
202 brief_text = str(spu_row['brief']) 215 brief_text = str(spu_row['brief'])
203 - doc[f'brief{primary_suffix}'] = brief_text  
204 - if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):  
205 - if self.translator:  
206 - target_lang = 'en' if primary_lang == 'zh' else 'zh'  
207 - # 根据目标语言选择对应的提示词  
208 - prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
209 - translated = self.translator.translate(  
210 - brief_text,  
211 - target_lang=target_lang,  
212 - source_lang=primary_lang,  
213 - prompt=prompt  
214 - )  
215 - doc[f'brief{secondary_suffix}'] = translated if translated else None  
216 - else:  
217 - doc[f'brief{secondary_suffix}'] = None 216 + if self.translator:
  217 + prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  218 + translations = self.translator.translate_for_indexing(
  219 + brief_text,
  220 + shop_language=primary_lang,
  221 + source_lang=primary_lang,
  222 + prompt=prompt
  223 + )
  224 + doc['brief_zh'] = translations.get('zh') or (brief_text if primary_lang == 'zh' else None)
  225 + doc['brief_en'] = translations.get('en') or (brief_text if primary_lang == 'en' else None)
218 else: 226 else:
219 - doc[f'brief{secondary_suffix}'] = None 227 + if primary_lang == 'zh':
  228 + doc['brief_zh'] = brief_text
  229 + doc['brief_en'] = None
  230 + else:
  231 + doc['brief_zh'] = None
  232 + doc['brief_en'] = brief_text
220 else: 233 else:
221 - doc[f'brief{primary_suffix}'] = None  
222 - doc[f'brief{secondary_suffix}'] = None 234 + doc['brief_zh'] = None
  235 + doc['brief_en'] = None
223 236
224 # Description 237 # Description
225 if pd.notna(spu_row.get('description')): 238 if pd.notna(spu_row.get('description')):
226 desc_text = str(spu_row['description']) 239 desc_text = str(spu_row['description'])
227 - doc[f'description{primary_suffix}'] = desc_text  
228 - if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):  
229 - if self.translator:  
230 - target_lang = 'en' if primary_lang == 'zh' else 'zh'  
231 - # 根据目标语言选择对应的提示词  
232 - prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
233 - translated = self.translator.translate(  
234 - desc_text,  
235 - target_lang=target_lang,  
236 - source_lang=primary_lang,  
237 - prompt=prompt  
238 - )  
239 - doc[f'description{secondary_suffix}'] = translated if translated else None  
240 - else:  
241 - doc[f'description{secondary_suffix}'] = None 240 + if self.translator:
  241 + prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  242 + translations = self.translator.translate_for_indexing(
  243 + desc_text,
  244 + shop_language=primary_lang,
  245 + source_lang=primary_lang,
  246 + prompt=prompt
  247 + )
  248 + doc['description_zh'] = translations.get('zh') or (desc_text if primary_lang == 'zh' else None)
  249 + doc['description_en'] = translations.get('en') or (desc_text if primary_lang == 'en' else None)
242 else: 250 else:
243 - doc[f'description{secondary_suffix}'] = None 251 + if primary_lang == 'zh':
  252 + doc['description_zh'] = desc_text
  253 + doc['description_en'] = None
  254 + else:
  255 + doc['description_zh'] = None
  256 + doc['description_en'] = desc_text
244 else: 257 else:
245 - doc[f'description{primary_suffix}'] = None  
246 - doc[f'description{secondary_suffix}'] = None 258 + doc['description_zh'] = None
  259 + doc['description_en'] = None
247 260
248 # Vendor 261 # Vendor
249 if pd.notna(spu_row.get('vendor')): 262 if pd.notna(spu_row.get('vendor')):
250 vendor_text = str(spu_row['vendor']) 263 vendor_text = str(spu_row['vendor'])
251 - doc[f'vendor{primary_suffix}'] = vendor_text  
252 - if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):  
253 - if self.translator:  
254 - target_lang = 'en' if primary_lang == 'zh' else 'zh'  
255 - # 根据目标语言选择对应的提示词  
256 - prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
257 - translated = self.translator.translate(  
258 - vendor_text,  
259 - target_lang=target_lang,  
260 - source_lang=primary_lang,  
261 - prompt=prompt  
262 - )  
263 - doc[f'vendor{secondary_suffix}'] = translated if translated else None  
264 - else:  
265 - doc[f'vendor{secondary_suffix}'] = None 264 + if self.translator:
  265 + prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  266 + translations = self.translator.translate_for_indexing(
  267 + vendor_text,
  268 + shop_language=primary_lang,
  269 + source_lang=primary_lang,
  270 + prompt=prompt
  271 + )
  272 + doc['vendor_zh'] = translations.get('zh') or (vendor_text if primary_lang == 'zh' else None)
  273 + doc['vendor_en'] = translations.get('en') or (vendor_text if primary_lang == 'en' else None)
266 else: 274 else:
267 - doc[f'vendor{secondary_suffix}'] = None 275 + if primary_lang == 'zh':
  276 + doc['vendor_zh'] = vendor_text
  277 + doc['vendor_en'] = None
  278 + else:
  279 + doc['vendor_zh'] = None
  280 + doc['vendor_en'] = vendor_text
268 else: 281 else:
269 - doc[f'vendor{primary_suffix}'] = None  
270 - doc[f'vendor{secondary_suffix}'] = None 282 + doc['vendor_zh'] = None
  283 + doc['vendor_en'] = None
271 284
272 def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): 285 def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series):
273 """填充类目相关字段。""" 286 """填充类目相关字段。"""
@@ -542,4 +555,36 @@ class SPUDocumentTransformer: @@ -542,4 +555,36 @@ class SPUDocumentTransformer:
542 sku_data['image_src'] = str(sku_row['image_src']) 555 sku_data['image_src'] = str(sku_row['image_src'])
543 556
544 return sku_data 557 return sku_data
  558 +
  559 + def _fill_title_embedding(self, doc: Dict[str, Any]) -> None:
  560 + """
  561 + 填充标题向量化字段。
  562 +
  563 + 使用英文标题(title_en)生成embedding。如果title_en不存在,则使用title_zh。
  564 +
  565 + Args:
  566 + doc: ES文档字典
  567 + """
  568 + # 优先使用英文标题,如果没有则使用中文标题
  569 + title_text = doc.get('title_en') or doc.get('title_zh')
  570 +
  571 + if not title_text or not title_text.strip():
  572 + logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}")
  573 + return
  574 +
  575 + try:
  576 + # 使用BgeEncoder生成embedding
  577 + # encode方法返回numpy数组,形状为(n, 1024)
  578 + embeddings = self.encoder.encode(title_text)
  579 +
  580 + if embeddings is not None and len(embeddings) > 0:
  581 + # 取第一个embedding(因为只传了一个文本)
  582 + embedding = embeddings[0]
  583 + # 转换为列表格式(ES需要)
  584 + doc['title_embedding'] = embedding.tolist()
  585 + logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...")
  586 + else:
  587 + logger.warning(f"Failed to generate embedding for title: {title_text[:50]}...")
  588 + except Exception as e:
  589 + logger.error(f"Error generating title_embedding for SPU {doc.get('spu_id')}: {e}", exc_info=True)
545 590
indexer/indexing_utils.py
@@ -56,7 +56,9 @@ def create_document_transformer( @@ -56,7 +56,9 @@ def create_document_transformer(
56 tenant_id: str, 56 tenant_id: str,
57 searchable_option_dimensions: Optional[list] = None, 57 searchable_option_dimensions: Optional[list] = None,
58 translator: Optional[Any] = None, 58 translator: Optional[Any] = None,
59 - translation_prompts: Optional[Dict[str, str]] = None 59 + translation_prompts: Optional[Dict[str, str]] = None,
  60 + encoder: Optional[Any] = None,
  61 + enable_title_embedding: bool = True
60 ) -> SPUDocumentTransformer: 62 ) -> SPUDocumentTransformer:
61 """ 63 """
62 创建文档转换器(统一初始化逻辑)。 64 创建文档转换器(统一初始化逻辑)。
@@ -67,6 +69,8 @@ def create_document_transformer( @@ -67,6 +69,8 @@ def create_document_transformer(
67 searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) 69 searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载)
68 translator: 翻译器实例(如果为None则根据配置初始化) 70 translator: 翻译器实例(如果为None则根据配置初始化)
69 translation_prompts: 翻译提示词配置(如果为None则从配置加载) 71 translation_prompts: 翻译提示词配置(如果为None则从配置加载)
  72 + encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化)
  73 + enable_title_embedding: 是否启用标题向量化(默认True)
70 74
71 Returns: 75 Returns:
72 SPUDocumentTransformer实例 76 SPUDocumentTransformer实例
@@ -76,7 +80,7 @@ def create_document_transformer( @@ -76,7 +80,7 @@ def create_document_transformer(
76 tenant_config = tenant_config_loader.get_tenant_config(tenant_id) 80 tenant_config = tenant_config_loader.get_tenant_config(tenant_id)
77 81
78 # 加载搜索配置(如果需要) 82 # 加载搜索配置(如果需要)
79 - if searchable_option_dimensions is None or translator is None or translation_prompts is None: 83 + if searchable_option_dimensions is None or translator is None or translation_prompts is None or (encoder is None and enable_title_embedding):
80 try: 84 try:
81 config_loader = ConfigLoader() 85 config_loader = ConfigLoader()
82 config = config_loader.load_config() 86 config = config_loader.load_config()
@@ -95,6 +99,16 @@ def create_document_transformer( @@ -95,6 +99,16 @@ def create_document_transformer(
95 99
96 if translation_prompts is None: 100 if translation_prompts is None:
97 translation_prompts = config.query_config.translation_prompts 101 translation_prompts = config.query_config.translation_prompts
  102 +
  103 + # 初始化encoder(如果启用标题向量化且未提供encoder)
  104 + if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
  105 + try:
  106 + from embeddings.text_encoder import BgeEncoder
  107 + encoder = BgeEncoder()
  108 + logger.info("BgeEncoder initialized for title embedding")
  109 + except Exception as e:
  110 + logger.warning(f"Failed to initialize BgeEncoder: {e}, title embedding will be disabled")
  111 + enable_title_embedding = False
98 except Exception as e: 112 except Exception as e:
99 logger.warning(f"Failed to load config, using defaults: {e}") 113 logger.warning(f"Failed to load config, using defaults: {e}")
100 if searchable_option_dimensions is None: 114 if searchable_option_dimensions is None:
@@ -107,6 +121,8 @@ def create_document_transformer( @@ -107,6 +121,8 @@ def create_document_transformer(
107 searchable_option_dimensions=searchable_option_dimensions, 121 searchable_option_dimensions=searchable_option_dimensions,
108 tenant_config=tenant_config, 122 tenant_config=tenant_config,
109 translator=translator, 123 translator=translator,
110 - translation_prompts=translation_prompts 124 + translation_prompts=translation_prompts,
  125 + encoder=encoder,
  126 + enable_title_embedding=enable_title_embedding
111 ) 127 )
112 128
query/translator.py
@@ -13,18 +13,20 @@ https://developers.deepl.com/api-reference/translate/request-translation @@ -13,18 +13,20 @@ https://developers.deepl.com/api-reference/translate/request-translation
13 13
14 import requests 14 import requests
15 import re 15 import re
  16 +import redis
16 from concurrent.futures import ThreadPoolExecutor 17 from concurrent.futures import ThreadPoolExecutor
  18 +from datetime import timedelta
17 from typing import Dict, List, Optional 19 from typing import Dict, List, Optional
18 -from utils.cache import DictCache  
19 import logging 20 import logging
20 21
21 logger = logging.getLogger(__name__) 22 logger = logging.getLogger(__name__)
22 23
23 -# Try to import DEEPL_AUTH_KEY, but allow import to fail 24 +# Try to import DEEPL_AUTH_KEY and REDIS_CONFIG, but allow import to fail
24 try: 25 try:
25 - from config.env_config import DEEPL_AUTH_KEY 26 + from config.env_config import DEEPL_AUTH_KEY, REDIS_CONFIG
26 except ImportError: 27 except ImportError:
27 DEEPL_AUTH_KEY = None 28 DEEPL_AUTH_KEY = None
  29 + REDIS_CONFIG = {}
28 30
29 31
30 class Translator: 32 class Translator:
@@ -74,9 +76,30 @@ class Translator: @@ -74,9 +76,30 @@ class Translator:
74 self.glossary_id = glossary_id 76 self.glossary_id = glossary_id
75 self.translation_context = translation_context or "e-commerce product search" 77 self.translation_context = translation_context or "e-commerce product search"
76 78
  79 + # Initialize Redis cache if enabled
77 if use_cache: 80 if use_cache:
78 - self.cache = DictCache(".cache/translations.json") 81 + try:
  82 + self.redis_client = redis.Redis(
  83 + host=REDIS_CONFIG.get('host', 'localhost'),
  84 + port=REDIS_CONFIG.get('port', 6479),
  85 + password=REDIS_CONFIG.get('password'),
  86 + decode_responses=True, # Return str instead of bytes
  87 + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
  88 + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
  89 + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
  90 + health_check_interval=10, # 避免复用坏连接
  91 + )
  92 + # Test connection
  93 + self.redis_client.ping()
  94 + self.expire_time = timedelta(days=REDIS_CONFIG.get('translation_cache_expire_days', 360))
  95 + self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
  96 + logger.info("Redis cache initialized for translations")
  97 + except Exception as e:
  98 + logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
  99 + self.redis_client = None
  100 + self.cache = None
79 else: 101 else:
  102 + self.redis_client = None
80 self.cache = None 103 self.cache = None
81 104
82 # Thread pool for async translation 105 # Thread pool for async translation
@@ -131,8 +154,8 @@ class Translator: @@ -131,8 +154,8 @@ class Translator:
131 cache_key = ':'.join(cache_key_parts) 154 cache_key = ':'.join(cache_key_parts)
132 155
133 # Check cache (include context and prompt in cache key for accuracy) 156 # Check cache (include context and prompt in cache key for accuracy)
134 - if self.use_cache:  
135 - cached = self.cache.get(cache_key, category="translations") 157 + if self.use_cache and self.redis_client:
  158 + cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
136 if cached: 159 if cached:
137 return cached 160 return cached
138 161
@@ -155,8 +178,8 @@ class Translator: @@ -155,8 +178,8 @@ class Translator:
155 result = text 178 result = text
156 179
157 # Cache result 180 # Cache result
158 - if result and self.use_cache:  
159 - self.cache.set(cache_key, result, category="translations") 181 + if result and self.use_cache and self.redis_client:
  182 + self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
160 183
161 return result 184 return result
162 185
@@ -395,16 +418,57 @@ class Translator: @@ -395,16 +418,57 @@ class Translator:
395 prompt: Optional[str] = None 418 prompt: Optional[str] = None
396 ) -> Optional[str]: 419 ) -> Optional[str]:
397 """Get translation from cache if available.""" 420 """Get translation from cache if available."""
398 - if not self.cache: 421 + if not self.redis_client:
  422 + return None
  423 + return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
  424 +
  425 + def _get_cached_translation_redis(
  426 + self,
  427 + text: str,
  428 + target_lang: str,
  429 + source_lang: Optional[str] = None,
  430 + context: Optional[str] = None,
  431 + prompt: Optional[str] = None
  432 + ) -> Optional[str]:
  433 + """Get translation from Redis cache with sliding expiration."""
  434 + if not self.redis_client:
399 return None 435 return None
400 436
401 - translation_context = context or self.translation_context  
402 - cache_key_parts = [source_lang or 'auto', target_lang, translation_context]  
403 - if prompt:  
404 - cache_key_parts.append(prompt)  
405 - cache_key_parts.append(text)  
406 - cache_key = ':'.join(cache_key_parts)  
407 - return self.cache.get(cache_key, category="translations") 437 + try:
  438 + # Build cache key: prefix:target_lang:text
  439 + # For simplicity, we use target_lang and text as key
  440 + # Context and prompt are not included in key to maximize cache hits
  441 + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
  442 + value = self.redis_client.get(cache_key)
  443 + if value:
  444 + # Sliding expiration: reset expiration time on access
  445 + self.redis_client.expire(cache_key, self.expire_time)
  446 + logger.debug(f"[Translator] Cache hit for translation: {text} -> {target_lang}")
  447 + return value
  448 + return None
  449 + except Exception as e:
  450 + logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}")
  451 + return None
  452 +
  453 + def _set_cached_translation_redis(
  454 + self,
  455 + text: str,
  456 + target_lang: str,
  457 + translation: str,
  458 + source_lang: Optional[str] = None,
  459 + context: Optional[str] = None,
  460 + prompt: Optional[str] = None
  461 + ) -> None:
  462 + """Store translation in Redis cache."""
  463 + if not self.redis_client:
  464 + return
  465 +
  466 + try:
  467 + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
  468 + self.redis_client.setex(cache_key, self.expire_time, translation)
  469 + logger.debug(f"[Translator] Cached translation: {text} -> {target_lang}: {translation}")
  470 + except Exception as e:
  471 + logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}")
408 472
409 def _translate_async( 473 def _translate_async(
410 self, 474 self,
@@ -507,6 +571,83 @@ class Translator: @@ -507,6 +571,83 @@ class Translator:
507 # The user can configure a glossary for better results 571 # The user can configure a glossary for better results
508 return translated_text 572 return translated_text
509 573
  574 + def translate_for_indexing(
  575 + self,
  576 + text: str,
  577 + shop_language: str,
  578 + source_lang: Optional[str] = None,
  579 + context: Optional[str] = None,
  580 + prompt: Optional[str] = None
  581 + ) -> Dict[str, Optional[str]]:
  582 + """
  583 + Translate text for indexing based on shop language configuration.
  584 +
  585 + This method automatically handles multi-language translation:
  586 + - If shop language is not 'zh', translate to Chinese (zh)
  587 + - If shop language is not 'en', translate to English (en)
  588 +
  589 + All translation logic is internal - callers don't need to worry about
  590 + which languages to translate to.
  591 +
  592 + Args:
  593 + text: Text to translate
  594 + shop_language: Shop's configured language (e.g., 'zh', 'en', 'ru')
  595 + source_lang: Source language code (optional, auto-detect if None)
  596 + context: Additional context for translation (optional)
  597 + prompt: Translation prompt/instruction (optional)
  598 +
  599 + Returns:
  600 + Dictionary with 'zh' and 'en' keys containing translated text (or None if not needed)
  601 + Example: {'zh': '中文翻译', 'en': 'English translation'}
  602 + """
  603 + if not text or not text.strip():
  604 + return {'zh': None, 'en': None}
  605 +
  606 + # Skip translation for symbol-only queries
  607 + if re.match(r'^[\d\s_-]+$', text):
  608 + logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
  609 + return {'zh': None, 'en': None}
  610 +
  611 + results = {'zh': None, 'en': None}
  612 + shop_lang_lower = shop_language.lower() if shop_language else ""
  613 +
  614 + # Determine which languages need translation
  615 + targets = []
  616 + if "zh" not in shop_lang_lower:
  617 + targets.append("zh")
  618 + if "en" not in shop_lang_lower:
  619 + targets.append("en")
  620 +
  621 + # If shop language is already zh and en, no translation needed
  622 + if not targets:
  623 + # Use original text for both languages
  624 + if "zh" in shop_lang_lower:
  625 + results['zh'] = text
  626 + if "en" in shop_lang_lower:
  627 + results['en'] = text
  628 + return results
  629 +
  630 + # Translate to each target language
  631 + for target_lang in targets:
  632 + # Check cache first
  633 + cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
  634 + if cached:
  635 + results[target_lang] = cached
  636 + logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
  637 + continue
  638 +
  639 + # Translate synchronously for indexing (we need the result immediately)
  640 + translated = self.translate(
  641 + text,
  642 + target_lang=target_lang,
  643 + source_lang=source_lang or shop_language,
  644 + context=context,
  645 + prompt=prompt
  646 + )
  647 + results[target_lang] = translated
  648 +
  649 + return results
  650 +
510 def get_translation_needs( 651 def get_translation_needs(
511 self, 652 self,
512 detected_lang: str, 653 detected_lang: str,
@@ -524,7 +665,7 @@ class Translator: @@ -524,7 +665,7 @@ class Translator:
524 """ 665 """
525 # If detected language is in supported list, translate to others 666 # If detected language is in supported list, translate to others
526 if detected_lang in supported_langs: 667 if detected_lang in supported_langs:
527 - return [lang for lang in supported_langs if lang != detected_lang] 668 + return [lang for lang in supported_langs if detected_lang != lang]
528 669
529 # Otherwise, translate to all supported languages 670 # Otherwise, translate to all supported languages
530 return supported_langs 671 return supported_langs