Compare View

switch
from
...
to
 
Commits (2)
  • 主要功能:
    1. 增量数据获取服务
       - 新增 IncrementalIndexerService 提供单个SPU数据获取
       - 新增 /indexer/spu/{spu_id} API接口
       - 服务启动时预加载分类映射等公共数据
       - 提取 SPUDocumentTransformer 统一全量和增量转换逻辑
       - 支持根据租户配置进行语言处理和翻译
    
    3. 租户配置系统
       - 租户配置合并到统一配置文件 config/config.yaml
       - 支持每个租户独立配置主语言和翻译选项
       - 租户162配置为翻译关闭(用于测试)
    
    4. 翻译功能集成
       - 翻译提示词作为DeepL API的context参数传递
       - 支持中英文提示词配置
       - 索引场景:同步翻译,使用缓存
       - 查询场景:异步翻译,立即返回
    
    测试:
    - 新增 indexer/test_indexing.py 和 query/test_translation.py
    - 验证租户162翻译关闭功能
    - 验证全量和增量索引功能
    tangwang
     
  • - 新增批量索引接口: POST /indexer/bulk - 全量索引功能
      - SPU接口改进: POST /indexer/spus - 支持批量获取SPU文档(最多100个)
    
    新增 全量索引服务
    indexer/bulk_indexing_service.py
    
    docs/搜索API对接指南.md
      - 新增索引接口文档: 详细的批量索引和SPU索引接口说明
      - 请求示例: 提供完整的curl命令示例
    tangwang
     
@@ -24,12 +24,15 @@ from slowapi.util import get_remote_address @@ -24,12 +24,15 @@ from slowapi.util import get_remote_address
24 from slowapi.errors import RateLimitExceeded 24 from slowapi.errors import RateLimitExceeded
25 25
26 # Configure logging with better formatting 26 # Configure logging with better formatting
  27 +import pathlib
  28 +log_dir = pathlib.Path('logs')
  29 +log_dir.mkdir(exist_ok=True)
27 logging.basicConfig( 30 logging.basicConfig(
28 level=logging.INFO, 31 level=logging.INFO,
29 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 32 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
30 handlers=[ 33 handlers=[
31 logging.StreamHandler(), 34 logging.StreamHandler(),
32 - logging.FileHandler('/tmp/search_engine_api.log', mode='a') 35 + logging.FileHandler(log_dir / 'api.log', mode='a', encoding='utf-8')
33 ] 36 ]
34 ) 37 )
35 logger = logging.getLogger(__name__) 38 logger = logging.getLogger(__name__)
@@ -40,17 +43,21 @@ limiter = Limiter(key_func=get_remote_address) @@ -40,17 +43,21 @@ limiter = Limiter(key_func=get_remote_address)
40 # Add parent directory to path 43 # Add parent directory to path
41 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 44 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
42 45
43 -from config.env_config import ES_CONFIG 46 +from config.env_config import ES_CONFIG, DB_CONFIG
44 from config import ConfigLoader 47 from config import ConfigLoader
45 from utils import ESClient 48 from utils import ESClient
  49 +from utils.db_connector import create_db_connection
46 from search import Searcher 50 from search import Searcher
47 from query import QueryParser 51 from query import QueryParser
  52 +from indexer.incremental_service import IncrementalIndexerService
48 53
49 # Global instances 54 # Global instances
50 _es_client: Optional[ESClient] = None 55 _es_client: Optional[ESClient] = None
51 _searcher: Optional[Searcher] = None 56 _searcher: Optional[Searcher] = None
52 _query_parser: Optional[QueryParser] = None 57 _query_parser: Optional[QueryParser] = None
53 _config = None 58 _config = None
  59 +_incremental_service: Optional[IncrementalIndexerService] = None
  60 +_bulk_indexing_service = None
54 61
55 62
56 def init_service(es_host: str = "http://localhost:9200"): 63 def init_service(es_host: str = "http://localhost:9200"):
@@ -60,7 +67,7 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -60,7 +67,7 @@ def init_service(es_host: str = "http://localhost:9200"):
60 Args: 67 Args:
61 es_host: Elasticsearch host URL 68 es_host: Elasticsearch host URL
62 """ 69 """
63 - global _es_client, _searcher, _query_parser, _config 70 + global _es_client, _searcher, _query_parser, _config, _incremental_service, _bulk_indexing_service
64 71
65 start_time = time.time() 72 start_time = time.time()
66 logger.info("Initializing search service (multi-tenant)") 73 logger.info("Initializing search service (multi-tenant)")
@@ -93,6 +100,44 @@ def init_service(es_host: str = "http://localhost:9200"): @@ -93,6 +100,44 @@ def init_service(es_host: str = "http://localhost:9200"):
93 logger.info("Initializing searcher...") 100 logger.info("Initializing searcher...")
94 _searcher = Searcher(_es_client, _config, _query_parser) 101 _searcher = Searcher(_es_client, _config, _query_parser)
95 102
  103 + # Initialize indexing services (if DB config is available)
  104 + try:
  105 + from utils.db_connector import create_db_connection
  106 + from indexer.incremental_service import IncrementalIndexerService
  107 + from indexer.bulk_indexing_service import BulkIndexingService
  108 +
  109 + db_host = os.getenv('DB_HOST')
  110 + db_port = int(os.getenv('DB_PORT', 3306))
  111 + db_database = os.getenv('DB_DATABASE')
  112 + db_username = os.getenv('DB_USERNAME')
  113 + db_password = os.getenv('DB_PASSWORD')
  114 +
  115 + if all([db_host, db_database, db_username, db_password]):
  116 + logger.info("Initializing database connection for indexing services...")
  117 + db_engine = create_db_connection(
  118 + host=db_host,
  119 + port=db_port,
  120 + database=db_database,
  121 + username=db_username,
  122 + password=db_password
  123 + )
  124 +
  125 + # Initialize incremental service
  126 + _incremental_service = IncrementalIndexerService(db_engine)
  127 + logger.info("Incremental indexer service initialized")
  128 +
  129 + # Initialize bulk indexing service
  130 + _bulk_indexing_service = BulkIndexingService(db_engine, _es_client)
  131 + logger.info("Bulk indexing service initialized")
  132 + else:
  133 + logger.warning("Database config incomplete, indexing services will not be available")
  134 + logger.warning("Required: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD")
  135 + except Exception as e:
  136 + logger.warning(f"Failed to initialize indexing services: {e}")
  137 + logger.warning("Indexing endpoints will not be available")
  138 + _incremental_service = None
  139 + _bulk_indexing_service = None
  140 +
96 elapsed = time.time() - start_time 141 elapsed = time.time() - start_time
97 logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}") 142 logger.info(f"Search service ready! (took {elapsed:.2f}s) | Index: {_config.es_index_name}")
98 143
@@ -127,6 +172,16 @@ def get_config(): @@ -127,6 +172,16 @@ def get_config():
127 return _config 172 return _config
128 173
129 174
  175 +def get_incremental_service() -> Optional[IncrementalIndexerService]:
  176 + """Get incremental indexer service instance."""
  177 + return _incremental_service
  178 +
  179 +
  180 +def get_bulk_indexing_service():
  181 + """Get bulk indexing service instance."""
  182 + return _bulk_indexing_service
  183 +
  184 +
130 # Create FastAPI app with enhanced configuration 185 # Create FastAPI app with enhanced configuration
131 app = FastAPI( 186 app = FastAPI(
132 title="E-Commerce Search API", 187 title="E-Commerce Search API",
@@ -172,15 +227,14 @@ app.add_middleware( @@ -172,15 +227,14 @@ app.add_middleware(
172 async def startup_event(): 227 async def startup_event():
173 """Initialize service on startup.""" 228 """Initialize service on startup."""
174 es_host = os.getenv("ES_HOST", "http://localhost:9200") 229 es_host = os.getenv("ES_HOST", "http://localhost:9200")
175 -  
176 logger.info("Starting E-Commerce Search API (Multi-Tenant)") 230 logger.info("Starting E-Commerce Search API (Multi-Tenant)")
177 logger.info(f"Elasticsearch Host: {es_host}") 231 logger.info(f"Elasticsearch Host: {es_host}")
178 - 232 +
179 try: 233 try:
180 init_service(es_host=es_host) 234 init_service(es_host=es_host)
181 logger.info("Service initialized successfully") 235 logger.info("Service initialized successfully")
182 except Exception as e: 236 except Exception as e:
183 - logger.error(f"Failed to initialize service: {e}") 237 + logger.error(f"Failed to initialize service: {e}", exc_info=True)
184 logger.warning("Service will start but may not function correctly") 238 logger.warning("Service will start but may not function correctly")
185 239
186 240
@@ -267,10 +321,11 @@ async def health_check(request: Request): @@ -267,10 +321,11 @@ async def health_check(request: Request):
267 321
268 322
269 # Include routers 323 # Include routers
270 -from .routes import search, admin 324 +from .routes import search, admin, indexer
271 325
272 app.include_router(search.router) 326 app.include_router(search.router)
273 app.include_router(admin.router) 327 app.include_router(admin.router)
  328 +app.include_router(indexer.router)
274 329
275 # Mount static files and serve frontend 330 # Mount static files and serve frontend
276 frontend_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "frontend") 331 frontend_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "frontend")
api/routes/indexer.py 0 → 100644
@@ -0,0 +1,110 @@ @@ -0,0 +1,110 @@
  1 +"""
  2 +索引API路由。
  3 +
  4 +提供全量和增量索引接口,供外部Java程序调用。
  5 +"""
  6 +
  7 +from fastapi import APIRouter, HTTPException
  8 +from typing import List
  9 +from pydantic import BaseModel
  10 +import logging
  11 +
  12 +logger = logging.getLogger(__name__)
  13 +
  14 +router = APIRouter(prefix="/indexer", tags=["indexer"])
  15 +
  16 +
  17 +class BulkIndexRequest(BaseModel):
  18 + tenant_id: str
  19 + recreate_index: bool = False
  20 + batch_size: int = 500
  21 +
  22 +
  23 +class BatchSpuRequest(BaseModel):
  24 + tenant_id: str
  25 + spu_ids: List[str]
  26 +
  27 +
  28 +@router.post("/bulk")
  29 +async def bulk_index(request: BulkIndexRequest):
  30 + """全量索引接口"""
  31 + try:
  32 + from ..app import get_bulk_indexing_service
  33 + service = get_bulk_indexing_service()
  34 + if service is None:
  35 + raise HTTPException(status_code=503, detail="Bulk indexing service is not initialized")
  36 + return service.bulk_index(
  37 + tenant_id=request.tenant_id,
  38 + recreate_index=request.recreate_index,
  39 + batch_size=request.batch_size
  40 + )
  41 + except HTTPException:
  42 + raise
  43 + except Exception as e:
  44 + logger.error(f"Error in bulk indexing for tenant_id={request.tenant_id}: {e}", exc_info=True)
  45 + raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
  46 +
  47 +
  48 +@router.post("/spus")
  49 +async def get_spu_documents(request: BatchSpuRequest):
  50 + """获取SPU文档接口(支持单个或批量)"""
  51 + try:
  52 + from ..app import get_incremental_service
  53 + if not request.spu_ids:
  54 + raise HTTPException(status_code=400, detail="spu_ids cannot be empty")
  55 + if len(request.spu_ids) > 100:
  56 + raise HTTPException(status_code=400, detail="Maximum 100 SPU IDs allowed per request")
  57 + service = get_incremental_service()
  58 + if service is None:
  59 + raise HTTPException(status_code=503, detail="Incremental indexer service is not initialized")
  60 + success_list, failed_list = [], []
  61 + for spu_id in request.spu_ids:
  62 + try:
  63 + doc = service.get_spu_document(tenant_id=request.tenant_id, spu_id=spu_id)
  64 + (success_list if doc else failed_list).append({
  65 + "spu_id": spu_id,
  66 + "document": doc
  67 + } if doc else {
  68 + "spu_id": spu_id,
  69 + "error": "SPU not found or deleted"
  70 + })
  71 + except Exception as e:
  72 + failed_list.append({"spu_id": spu_id, "error": str(e)})
  73 + return {
  74 + "success": success_list,
  75 + "failed": failed_list,
  76 + "total": len(request.spu_ids),
  77 + "success_count": len(success_list),
  78 + "failed_count": len(failed_list)
  79 + }
  80 + except HTTPException:
  81 + raise
  82 + except Exception as e:
  83 + logger.error(f"Error getting SPU documents for tenant_id={request.tenant_id}: {e}", exc_info=True)
  84 + raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
  85 +
  86 +
  87 +@router.get("/health")
  88 +async def indexer_health_check():
  89 + """检查索引服务健康状态"""
  90 + try:
  91 + from ..app import get_incremental_service
  92 + from sqlalchemy import text
  93 + service = get_incremental_service()
  94 + if service is None:
  95 + return {"status": "unavailable", "database": "unknown", "preloaded_data": {"category_mappings": 0}}
  96 + try:
  97 + with service.db_engine.connect() as conn:
  98 + conn.execute(text("SELECT 1"))
  99 + db_status = "connected"
  100 + except Exception as e:
  101 + db_status = f"disconnected: {str(e)}"
  102 + return {
  103 + "status": "available",
  104 + "database": db_status,
  105 + "preloaded_data": {"category_mappings": len(service.category_id_to_name)}
  106 + }
  107 + except Exception as e:
  108 + logger.error(f"Error checking indexer health: {e}", exc_info=True)
  109 + return {"status": "error", "message": str(e)}
  110 +
config/config.yaml
@@ -104,6 +104,18 @@ query_config: @@ -104,6 +104,18 @@ query_config:
104 translation_service: "deepl" 104 translation_service: "deepl"
105 translation_api_key: null # 通过环境变量设置 105 translation_api_key: null # 通过环境变量设置
106 106
  107 + # 翻译提示词配置(用于提高翻译质量,作为DeepL API的context参数)
  108 + translation_prompts:
  109 + # 商品标题翻译提示词
  110 + product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。"
  111 + product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."
  112 + # query翻译提示词
  113 + query_zh: "电商领域"
  114 + query_en: "e-commerce domain"
  115 + # 默认翻译用词
  116 + default_zh: "电商领域"
  117 + default_en: "e-commerce domain"
  118 +
107 # 返回字段配置(_source includes) 119 # 返回字段配置(_source includes)
108 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 120 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
109 source_fields: null 121 source_fields: null
@@ -133,3 +145,30 @@ spu_config: @@ -133,3 +145,30 @@ spu_config:
133 # 配置哪些option维度参与检索(进索引、以及在线搜索) 145 # 配置哪些option维度参与检索(进索引、以及在线搜索)
134 # 格式为list,选择option1/option2/option3中的一个或多个 146 # 格式为list,选择option1/option2/option3中的一个或多个
135 searchable_option_dimensions: ['option1', 'option2', 'option3'] 147 searchable_option_dimensions: ['option1', 'option2', 'option3']
  148 +
  149 +# 租户配置(Tenant Configuration)
  150 +# 每个租户可以配置主语言和翻译选项
  151 +tenant_config:
  152 + # 默认配置(未配置的租户使用此配置)
  153 + default:
  154 + primary_language: "zh"
  155 + translate_to_en: true
  156 + translate_to_zh: false
  157 + # 租户特定配置
  158 + tenants:
  159 + "1":
  160 + primary_language: "zh"
  161 + translate_to_en: true
  162 + translate_to_zh: false
  163 + "2":
  164 + primary_language: "en"
  165 + translate_to_en: false
  166 + translate_to_zh: true
  167 + "3":
  168 + primary_language: "zh"
  169 + translate_to_en: true
  170 + translate_to_zh: false
  171 + "162":
  172 + primary_language: "zh"
  173 + translate_to_en: false
  174 + translate_to_zh: false
config/config_loader.py
@@ -45,6 +45,7 @@ class QueryConfig: @@ -45,6 +45,7 @@ class QueryConfig:
45 translation_api_key: Optional[str] = None 45 translation_api_key: Optional[str] = None
46 translation_glossary_id: Optional[str] = None 46 translation_glossary_id: Optional[str] = None
47 translation_context: str = "e-commerce product search" 47 translation_context: str = "e-commerce product search"
  48 + translation_prompts: Dict[str, str] = field(default_factory=dict) # Translation prompts for different use cases
48 49
49 # Embedding field names 50 # Embedding field names
50 text_embedding_field: Optional[str] = "title_embedding" 51 text_embedding_field: Optional[str] = "title_embedding"
@@ -118,6 +119,11 @@ class SearchConfig: @@ -118,6 +119,11 @@ class SearchConfig:
118 119
119 # ES index settings 120 # ES index settings
120 es_index_name: str 121 es_index_name: str
  122 +
  123 + # Tenant configuration
  124 + tenant_config: Dict[str, Any] = field(default_factory=dict)
  125 +
  126 + # ES settings
121 es_settings: Dict[str, Any] = field(default_factory=dict) 127 es_settings: Dict[str, Any] = field(default_factory=dict)
122 128
123 129
@@ -232,6 +238,7 @@ class ConfigLoader: @@ -232,6 +238,7 @@ class ConfigLoader:
232 translation_service=query_config_data.get("translation_service") or "deepl", 238 translation_service=query_config_data.get("translation_service") or "deepl",
233 translation_glossary_id=query_config_data.get("translation_glossary_id"), 239 translation_glossary_id=query_config_data.get("translation_glossary_id"),
234 translation_context=query_config_data.get("translation_context") or "e-commerce product search", 240 translation_context=query_config_data.get("translation_context") or "e-commerce product search",
  241 + translation_prompts=query_config_data.get("translation_prompts", {}),
235 text_embedding_field=query_config_data.get("text_embedding_field"), 242 text_embedding_field=query_config_data.get("text_embedding_field"),
236 image_embedding_field=query_config_data.get("image_embedding_field"), 243 image_embedding_field=query_config_data.get("image_embedding_field"),
237 embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), 244 embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
@@ -271,6 +278,9 @@ class ConfigLoader: @@ -271,6 +278,9 @@ class ConfigLoader:
271 searchable_option_dimensions=spu_data.get("searchable_option_dimensions", ['option1', 'option2', 'option3']) 278 searchable_option_dimensions=spu_data.get("searchable_option_dimensions", ['option1', 'option2', 'option3'])
272 ) 279 )
273 280
  281 + # Parse tenant config
  282 + tenant_config_data = config_data.get("tenant_config", {})
  283 +
274 return SearchConfig( 284 return SearchConfig(
275 field_boosts=field_boosts, 285 field_boosts=field_boosts,
276 indexes=indexes, 286 indexes=indexes,
@@ -279,6 +289,7 @@ class ConfigLoader: @@ -279,6 +289,7 @@ class ConfigLoader:
279 function_score=function_score, 289 function_score=function_score,
280 rerank=rerank, 290 rerank=rerank,
281 spu_config=spu_config, 291 spu_config=spu_config,
  292 + tenant_config=tenant_config_data,
282 es_index_name=config_data.get("es_index_name", "search_products"), 293 es_index_name=config_data.get("es_index_name", "search_products"),
283 es_settings=config_data.get("es_settings", {}) 294 es_settings=config_data.get("es_settings", {})
284 ) 295 )
config/tenant_config_loader.py 0 → 100644
@@ -0,0 +1,90 @@ @@ -0,0 +1,90 @@
  1 +"""
  2 +租户配置加载器。
  3 +
  4 +从统一配置文件(config.yaml)加载租户配置,包括主语言和翻译配置。
  5 +"""
  6 +
  7 +import logging
  8 +from typing import Dict, Any, Optional
  9 +
  10 +logger = logging.getLogger(__name__)
  11 +
  12 +
  13 +class TenantConfigLoader:
  14 + """租户配置加载器。"""
  15 +
  16 + def __init__(self):
  17 + """初始化租户配置加载器。"""
  18 + self._config: Optional[Dict[str, Any]] = None
  19 +
  20 + def load_config(self) -> Dict[str, Any]:
  21 + """
  22 + 加载租户配置(从统一配置文件)。
  23 +
  24 + Returns:
  25 + 租户配置字典,格式:{"tenants": {...}, "default": {...}}
  26 + """
  27 + if self._config is not None:
  28 + return self._config
  29 +
  30 + try:
  31 + from config import ConfigLoader
  32 + config_loader = ConfigLoader()
  33 + search_config = config_loader.load_config()
  34 + self._config = search_config.tenant_config
  35 + logger.info("Loaded tenant config from unified config.yaml")
  36 + return self._config
  37 + except Exception as e:
  38 + logger.error(f"Failed to load tenant config: {e}", exc_info=True)
  39 + # 返回默认配置
  40 + self._config = {
  41 + "default": {
  42 + "primary_language": "zh",
  43 + "translate_to_en": True,
  44 + "translate_to_zh": False
  45 + },
  46 + "tenants": {}
  47 + }
  48 + return self._config
  49 +
  50 + def get_tenant_config(self, tenant_id: str) -> Dict[str, Any]:
  51 + """
  52 + 获取指定租户的配置。
  53 +
  54 + Args:
  55 + tenant_id: 租户ID
  56 +
  57 + Returns:
  58 + 租户配置字典,如果租户不存在则返回默认配置
  59 + """
  60 + config = self.load_config()
  61 + tenant_id_str = str(tenant_id)
  62 +
  63 + tenants = config.get("tenants", {})
  64 + if tenant_id_str in tenants:
  65 + return tenants[tenant_id_str]
  66 + else:
  67 + logger.debug(f"Tenant {tenant_id} not found in config, using default")
  68 + return config.get("default", {
  69 + "primary_language": "zh",
  70 + "translate_to_en": True,
  71 + "translate_to_zh": False
  72 + })
  73 +
  74 + def reload(self):
  75 + """重新加载配置(用于配置更新)。"""
  76 + self._config = None
  77 + return self.load_config()
  78 +
  79 +
  80 +# 全局实例
  81 +_tenant_config_loader: Optional[TenantConfigLoader] = None
  82 +
  83 +
  84 +def get_tenant_config_loader() -> TenantConfigLoader:
  85 + """获取全局租户配置加载器实例。"""
  86 + global _tenant_config_loader
  87 + if _tenant_config_loader is None:
  88 + _tenant_config_loader = TenantConfigLoader()
  89 + return _tenant_config_loader
  90 +
docs/搜索API对接指南.md
@@ -76,12 +76,191 @@ curl -X POST "http://120.76.41.98:6002/search/" \ @@ -76,12 +76,191 @@ curl -X POST "http://120.76.41.98:6002/search/" \
76 76
77 ## 接口概览 77 ## 接口概览
78 78
79 -| 接口 | HTTP Method | Endpoint |  
80 -|------|------|------|  
81 -| 搜索 | POST | `/search/` |  
82 -| 搜索建议(框架,暂未实现) | GET | `/search/suggestions` |  
83 -| 获取文档 | GET | `/search/{doc_id}` |  
84 -| 健康检查 | GET | `/admin/health` | 79 +| 接口 | HTTP Method | Endpoint | 说明 |
  80 +|------|------|------|------|
  81 +| 搜索 | POST | `/search/` | 执行搜索查询 |
  82 +| 全量索引 | POST | `/indexer/bulk` | 全量索引接口 |
  83 +| SPU索引 | POST | `/indexer/spus` | 获取SPU文档(支持单个或批量) |
  84 +| 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 |
  85 +| 搜索建议(框架,暂未实现) | GET | `/search/suggestions` | 搜索建议 |
  86 +| 获取文档 | GET | `/search/{doc_id}` | 获取单个文档 |
  87 +| 健康检查 | GET | `/admin/health` | 服务健康检查 |
  88 +
  89 +---
  90 +
  91 +## 索引接口
  92 +
  93 +### 全量索引接口
  94 +
  95 +- **端点**: `POST /indexer/bulk`
  96 +- **描述**: 将指定租户的所有SPU数据导入到ES索引
  97 +
  98 +#### 请求参数
  99 +
  100 +```json
  101 +{
  102 + "tenant_id": "162",
  103 + "recreate_index": false,
  104 + "batch_size": 500
  105 +}
  106 +```
  107 +
  108 +| 参数 | 类型 | 必填 | 默认值 | 说明 |
  109 +|------|------|------|--------|------|
  110 +| `tenant_id` | string | Y | - | 租户ID |
  111 +| `recreate_index` | boolean | N | false | 是否重建索引(删除旧索引后创建新索引) |
  112 +| `batch_size` | integer | N | 500 | 批量导入大小 |
  113 +
  114 +#### 响应格式
  115 +
  116 +**成功响应(200 OK)**:
  117 +```json
  118 +{
  119 + "success": true,
  120 + "total": 1000,
  121 + "indexed": 1000,
  122 + "failed": 0,
  123 + "elapsed_time": 12.34,
  124 + "index_name": "search_products",
  125 + "tenant_id": "162"
  126 +}
  127 +```
  128 +
  129 +**错误响应**:
  130 +- `400 Bad Request`: 参数错误
  131 +- `503 Service Unavailable`: 服务未初始化
  132 +
  133 +#### 请求示例
  134 +
  135 +**首次索引(重建索引)**:
  136 +```bash
  137 +curl -X POST "http://localhost:6002/indexer/bulk" \
  138 + -H "Content-Type: application/json" \
  139 + -d '{
  140 + "tenant_id": "162",
  141 + "recreate_index": true,
  142 + "batch_size": 500
  143 + }'
  144 +```
  145 +
  146 +**查看日志**:
  147 +```bash
  148 +# 查看API日志(包含索引操作日志)
  149 +tail -f logs/api.log
  150 +
  151 +# 或者查看所有日志文件
  152 +tail -f logs/*.log
  153 +```
  154 +
  155 +**全量导入(不重建索引,保留现有索引并重新导入该租户的全部SPU)**:
  156 +```bash
  157 +curl -X POST "http://localhost:6002/indexer/bulk" \
  158 + -H "Content-Type: application/json" \
  159 + -d '{
  160 + "tenant_id": "162",
  161 + "recreate_index": false,
  162 + "batch_size": 500
  163 + }'
  164 +```
  165 +
  166 +---
  167 +
  168 +### SPU索引接口
  169 +
  170 +- **端点**: `POST /indexer/spus`
  171 +- **描述**: 获取SPU的ES文档数据(支持单个或批量)
  172 +
  173 +#### 请求参数
  174 +
  175 +```json
  176 +{
  177 + "tenant_id": "162",
  178 + "spu_ids": ["123", "456", "789"]
  179 +}
  180 +```
  181 +
  182 +| 参数 | 类型 | 必填 | 说明 |
  183 +|------|------|------|------|
  184 +| `tenant_id` | string | Y | 租户ID |
  185 +| `spu_ids` | array[string] | Y | SPU ID列表(1-100个) |
  186 +
  187 +#### 响应格式
  188 +
  189 +```json
  190 +{
  191 + "success": [
  192 + {
  193 + "spu_id": "123",
  194 + "document": {
  195 + "tenant_id": "162",
  196 + "spu_id": "123",
  197 + "title_zh": "商品标题",
  198 + ...
  199 + }
  200 + },
  201 + {
  202 + "spu_id": "456",
  203 + "document": {...}
  204 + }
  205 + ],
  206 + "failed": [
  207 + {
  208 + "spu_id": "789",
  209 + "error": "SPU not found or deleted"
  210 + }
  211 + ],
  212 + "total": 3,
  213 + "success_count": 2,
  214 + "failed_count": 1
  215 +}
  216 +```
  217 +
  218 +#### 请求示例
  219 +
  220 +**单个SPU**:
  221 +```bash
  222 +curl -X POST "http://localhost:6002/indexer/spus" \
  223 + -H "Content-Type: application/json" \
  224 + -d '{
  225 + "tenant_id": "162",
  226 + "spu_ids": ["123"]
  227 + }'
  228 +```
  229 +
  230 +**批量SPU**:
  231 +```bash
  232 +curl -X POST "http://localhost:6002/indexer/spus" \
  233 + -H "Content-Type: application/json" \
  234 + -d '{
  235 + "tenant_id": "162",
  236 + "spu_ids": ["123", "456", "789"]
  237 + }'
  238 +```
  239 +
  240 +---
  241 +
  242 +### 索引健康检查接口
  243 +
  244 +- **端点**: `GET /indexer/health`
  245 +- **描述**: 检查索引服务的健康状态
  246 +
  247 +#### 响应格式
  248 +
  249 +```json
  250 +{
  251 + "status": "available",
  252 + "database": "connected",
  253 + "preloaded_data": {
  254 + "category_mappings": 150
  255 + }
  256 +}
  257 +```
  258 +
  259 +#### 请求示例
  260 +
  261 +```bash
  262 +curl -X GET "http://localhost:6002/indexer/health"
  263 +```
85 264
86 --- 265 ---
87 266
@@ -1163,8 +1342,10 @@ curl "http://localhost:6002/search/12345" @@ -1163,8 +1342,10 @@ curl "http://localhost:6002/search/12345"
1163 1342
1164 | 分析器 | 语言 | 描述 | 1343 | 分析器 | 语言 | 描述 |
1165 |--------|------|------| 1344 |--------|------|------|
1166 -| `hanlp_index` | 中文 | 中文索引分析器(用于中文字段) |  
1167 -| `hanlp_standard` | 中文 | 中文查询分析器(用于中文字段) | 1345 +| `index_ansj` | 中文 | 中文索引分析器(用于中文字段) |
  1346 +| `query_ansj` | 中文 | 中文查询分析器(用于中文字段) |
  1347 +| `hanlp_index`(暂不支持) | 中文 | 中文索引分析器(用于中文字段) |
  1348 +| `hanlp_standard`(暂不支持) | 中文 | 中文查询分析器(用于中文字段) |
1168 | `english` | 英文 | 标准英文分析器(用于英文字段) | 1349 | `english` | 英文 | 标准英文分析器(用于英文字段) |
1169 | `lowercase` | - | 小写标准化器(用于keyword子字段) | 1350 | `lowercase` | - | 小写标准化器(用于keyword子字段) |
1170 1351
@@ -1180,4 +1361,3 @@ curl "http://localhost:6002/search/12345" @@ -1180,4 +1361,3 @@ curl "http://localhost:6002/search/12345"
1180 | `date` | `date` | 日期时间 | 1361 | `date` | `date` | 日期时间 |
1181 | `nested` | `nested` | 嵌套对象(specifications, skus, image_embedding) | 1362 | `nested` | `nested` | 嵌套对象(specifications, skus, image_embedding) |
1182 | `dense_vector` | `dense_vector` | 向量字段(title_embedding,仅用于搜索) | 1363 | `dense_vector` | `dense_vector` | 向量字段(title_embedding,仅用于搜索) |
1183 -  
docs/相关性检索优化说明.md
@@ -54,7 +54,7 @@ @@ -54,7 +54,7 @@
54 "fields": ["title_en^3.0", ...], 54 "fields": ["title_en^3.0", ...],
55 "minimum_should_match": "75%", 55 "minimum_should_match": "75%",
56 "operator": "AND", 56 "operator": "AND",
57 - "query": "water sports (e.g. animals playing with water)", 57 + "query": "water sports",
58 "tie_breaker": 0.9 58 "tie_breaker": 0.9
59 } 59 }
60 }, 60 },
docs/索引数据接口文档.md 0 → 100644
@@ -0,0 +1,714 @@ @@ -0,0 +1,714 @@
  1 +# 索引数据接口文档
  2 +
  3 +本文档说明如何获取需要灌入ES索引的数据,包括全量导入脚本和增量数据获取接口。
  4 +
  5 +## 目录
  6 +
  7 +1. [租户配置说明](#租户配置说明)
  8 +2. [全量数据导入脚本](#全量数据导入脚本)
  9 +3. [增量数据获取接口](#增量数据获取接口)
  10 +4. [数据格式说明](#数据格式说明)
  11 +5. [使用示例](#使用示例)
  12 +
  13 +---
  14 +
  15 +## 租户配置说明
  16 +
  17 +### 配置文件位置
  18 +
  19 +租户配置存储在统一配置文件 `config/config.yaml` 中,与索引配置放在同一文件。
  20 +
  21 +### 配置结构
  22 +
  23 +在 `config/config.yaml` 中的 `tenant_config` 部分:
  24 +
  25 +```yaml
  26 +tenant_config:
  27 + # 默认配置(未配置的租户使用此配置)
  28 + default:
  29 + primary_language: "zh"
  30 + translate_to_en: true
  31 + translate_to_zh: false
  32 + # 租户特定配置
  33 + tenants:
  34 + "1":
  35 + primary_language: "zh"
  36 + translate_to_en: true
  37 + translate_to_zh: false
  38 + "162":
  39 + primary_language: "zh"
  40 + translate_to_en: false
  41 + translate_to_zh: false
  42 +```
  43 +
  44 +### 配置字段说明
  45 +
  46 +| 字段 | 类型 | 说明 | 可选值 |
  47 +|------|------|------|--------|
  48 +| `primary_language` | string | 主语言(SKU表中title等文本字段的语言) | `"zh"`(中文)或 `"en"`(英文) |
  49 +| `translate_to_en` | boolean | 是否需要翻译英文 | `true` 或 `false` |
  50 +| `translate_to_zh` | boolean | 是否需要翻译中文 | `true` 或 `false` |
  51 +
  52 +### 配置规则
  53 +
  54 +1. **主语言**:指定SKU表中 `title`、`brief`、`description`、`vendor` 等字段的语言。
  55 + - 如果主语言是 `zh`,这些字段的值会填充到 `title_zh`、`brief_zh` 等字段
  56 + - 如果主语言是 `en`,这些字段的值会填充到 `title_en`、`brief_en` 等字段
  57 +
  58 +2. **翻译配置**:
  59 + - `translate_to_en: true`:如果主语言是中文,则会将中文内容翻译为英文,填充到 `title_en` 等字段
  60 + - `translate_to_zh: true`:如果主语言是英文,则会将英文内容翻译为中文,填充到 `title_zh` 等字段
  61 + - **注意**:如果主语言本身就是目标语言,则不会触发翻译(例如主语言是英文,`translate_to_en: true` 不会触发翻译)
  62 +
  63 +3. **默认配置**:如果租户ID不在 `tenants` 中,则使用 `default` 配置。
  64 +
  65 +### 配置示例
  66 +
  67 +**示例1:中文主语言,需要翻译英文**
  68 +```json
  69 +{
  70 + "primary_language": "zh",
  71 + "translate_to_en": true,
  72 + "translate_to_zh": false
  73 +}
  74 +```
  75 +- SKU表的 `title` 字段(中文)→ `title_zh`
  76 +- 翻译服务将中文翻译为英文 → `title_en`
  77 +
  78 +**示例2:英文主语言,需要翻译中文**
  79 +```json
  80 +{
  81 + "primary_language": "en",
  82 + "translate_to_en": false,
  83 + "translate_to_zh": true
  84 +}
  85 +```
  86 +- SKU表的 `title` 字段(英文)→ `title_en`
  87 +- 翻译服务将英文翻译为中文 → `title_zh`
  88 +
  89 +**示例3:仅使用主语言,不翻译**
  90 +```json
  91 +{
  92 + "primary_language": "zh",
  93 + "translate_to_en": false,
  94 + "translate_to_zh": false
  95 +}
  96 +```
  97 +- SKU表的 `title` 字段(中文)→ `title_zh`
  98 +- `title_en` 保持为 `null`
  99 +
  100 +### 配置更新
  101 +
  102 +修改 `config/config.yaml` 中的 `tenant_config` 部分后,需要重启服务才能生效:`TenantConfigLoader` 会缓存首次加载的配置,只有显式调用其 `reload()` 方法才会重新读取配置文件(可用于不重启的热更新)。
  103 +
  104 +---
  105 +
  106 +## 全量数据导入脚本
  107 +
  108 +### 功能说明
  109 +
  110 +`scripts/recreate_and_import.py` 是一个全量数据导入脚本,用于:
  111 +- 重建ES索引(删除旧索引,使用新的mapping创建新索引)
  112 +- 从MySQL数据库批量读取指定租户的所有SPU数据
  113 +- 将数据转换为ES文档格式
  114 +- 批量导入到Elasticsearch
  115 +
  116 +### 使用方法
  117 +
  118 +#### 基本用法
  119 +
  120 +```bash
  121 +python scripts/recreate_and_import.py \
  122 + --tenant-id 1 \
  123 + --db-host 120.79.247.228 \
  124 + --db-port 3306 \
  125 + --db-database saas \
  126 + --db-username saas \
  127 + --db-password your_password \
  128 + --es-host http://localhost:9200 \
  129 + --batch-size 500
  130 +```
  131 +
  132 +#### 参数说明
  133 +
  134 +| 参数 | 说明 | 是否必需 | 默认值 |
  135 +|------|------|----------|--------|
  136 +| `--tenant-id` | 租户ID | **是** | - |
  137 +| `--db-host` | MySQL主机地址 | 否(可用环境变量) | 环境变量 `DB_HOST` |
  138 +| `--db-port` | MySQL端口 | 否(可用环境变量) | 环境变量 `DB_PORT` 或 3306 |
  139 +| `--db-database` | MySQL数据库名 | 否(可用环境变量) | 环境变量 `DB_DATABASE` |
  140 +| `--db-username` | MySQL用户名 | 否(可用环境变量) | 环境变量 `DB_USERNAME` |
  141 +| `--db-password` | MySQL密码 | 否(可用环境变量) | 环境变量 `DB_PASSWORD` |
  142 +| `--es-host` | Elasticsearch地址 | 否(可用环境变量) | 环境变量 `ES_HOST` 或 `http://localhost:9200` |
  143 +| `--batch-size` | 批量导入大小 | 否 | 500 |
  144 +| `--skip-delete` | 跳过删除旧索引步骤 | 否 | False |
  145 +
  146 +#### 环境变量配置
  147 +
  148 +可以通过环境变量设置数据库和ES连接信息,避免在命令行中暴露敏感信息:
  149 +
  150 +```bash
  151 +export DB_HOST=120.79.247.228
  152 +export DB_PORT=3306
  153 +export DB_DATABASE=saas
  154 +export DB_USERNAME=saas
  155 +export DB_PASSWORD=your_password
  156 +export ES_HOST=http://localhost:9200
  157 +
  158 +python scripts/recreate_and_import.py --tenant-id 1
  159 +```
  160 +
  161 +#### 执行流程
  162 +
  163 +脚本执行分为以下步骤:
  164 +
  165 +1. **加载mapping配置**:从 `mappings/search_products.json` 加载ES索引mapping
  166 +2. **连接Elasticsearch**:验证ES连接可用性
  167 +3. **删除旧索引**(可选):如果索引已存在,删除旧索引(可通过 `--skip-delete` 跳过)
  168 +4. **创建新索引**:使用新的mapping创建索引
  169 +5. **连接MySQL**:建立数据库连接
  170 +6. **数据转换和导入**:
  171 + - 从MySQL读取SPU、SKU、Option数据
  172 + - 转换为ES文档格式
  173 + - 批量导入到ES
  174 +
  175 +#### 输出示例
  176 +
  177 +```
  178 +============================================================
  179 +重建ES索引并导入数据
  180 +============================================================
  181 +
  182 +[1/4] 加载mapping配置...
  183 +✓ 成功加载mapping配置
  184 +索引名称: search_products
  185 +
  186 +[2/4] 连接Elasticsearch...
  187 +ES地址: http://localhost:9200
  188 +✓ Elasticsearch连接成功
  189 +
  190 +[3/4] 删除旧索引...
  191 +发现已存在的索引: search_products
  192 +✓ 成功删除索引: search_products
  193 +
  194 +[4/4] 创建新索引...
  195 +创建索引: search_products
  196 +✓ 成功创建索引: search_products
  197 +
  198 +[5/5] 连接MySQL...
  199 +MySQL: 120.79.247.228:3306/saas
  200 +✓ MySQL连接成功
  201 +
  202 +[6/6] 导入数据...
  203 +Tenant ID: 1
  204 +批量大小: 500
  205 +正在转换数据...
  206 +✓ 转换完成: 1000 个文档
  207 +正在导入数据到ES (批量大小: 500)...
  208 +✓ 导入完成
  209 +
  210 +============================================================
  211 +导入完成!
  212 +============================================================
  213 +成功: 1000
  214 +失败: 0
  215 +耗时: 12.34秒
  216 +```
  217 +
  218 +#### 注意事项
  219 +
  220 +1. **数据量**:全量导入适合数据量较小或首次导入的场景。对于大数据量,建议使用增量接口。
  221 +2. **索引重建**:默认会删除旧索引,请确保有数据备份。
  222 +3. **性能**:批量大小(`--batch-size`)影响导入性能,建议根据ES集群性能调整(默认500)。
  223 +4. **租户隔离**:每次只能导入一个租户的数据,需要为每个租户分别执行。
  224 +
  225 +---
  226 +
  227 +## 增量数据获取接口
  228 +
  229 +### 功能说明
  230 +
  231 +增量数据获取接口提供单个SPU的ES文档数据,用于增量更新ES索引。适用于:
  232 +- MySQL数据变更后,实时同步到ES
  233 +- 外部Java程序监听MySQL变更事件,调用接口获取数据后推送到ES
  234 +- 避免全量重建索引,提高更新效率
  235 +
  236 +### 接口地址
  237 +
  238 +```
  239 +GET /indexer/spu/{spu_id}?tenant_id={tenant_id}
  240 +```
  241 +
  242 +### 请求参数
  243 +
  244 +| 参数 | 位置 | 类型 | 说明 | 是否必需 |
  245 +|------|------|------|------|----------|
  246 +| `spu_id` | 路径参数 | string | SPU ID | **是** |
  247 +| `tenant_id` | 查询参数 | string | 租户ID | **是** |
  248 +
  249 +### 请求示例
  250 +
  251 +```bash
  252 +# cURL
  253 +curl -X GET "http://localhost:6002/indexer/spu/123?tenant_id=1"
  254 +
  255 +# Java (OkHttp)
  256 +OkHttpClient client = new OkHttpClient();
  257 +Request request = new Request.Builder()
  258 + .url("http://localhost:6002/indexer/spu/123?tenant_id=1")
  259 + .get()
  260 + .build();
  261 +Response response = client.newCall(request).execute();
  262 +String json = response.body().string();
  263 +```
  264 +
  265 +### 响应格式
  266 +
  267 +#### 成功响应(200 OK)
  268 +
  269 +返回完整的ES文档JSON对象,包含所有索引字段:
  270 +
  271 +```json
  272 +{
  273 + "tenant_id": "1",
  274 + "spu_id": "123",
  275 + "title_zh": "商品标题",
  276 + "title_en": null,
  277 + "brief_zh": "商品简介",
  278 + "brief_en": null,
  279 + "description_zh": "商品详细描述",
  280 + "description_en": null,
  281 + "vendor_zh": "供应商名称",
  282 + "vendor_en": null,
  283 + "tags": ["标签1", "标签2"],
  284 + "category_path_zh": "类目1/类目2/类目3",
  285 + "category_path_en": null,
  286 + "category_name_zh": "类目名称",
  287 + "category_name_en": null,
  288 + "category_id": "100",
  289 + "category_name": "类目名称",
  290 + "category_level": 3,
  291 + "category1_name": "类目1",
  292 + "category2_name": "类目2",
  293 + "category3_name": "类目3",
  294 + "option1_name": "颜色",
  295 + "option2_name": "尺寸",
  296 + "option3_name": null,
  297 + "option1_values": ["红色", "蓝色", "绿色"],
  298 + "option2_values": ["S", "M", "L"],
  299 + "option3_values": [],
  300 + "min_price": 99.99,
  301 + "max_price": 199.99,
  302 + "compare_at_price": 299.99,
  303 + "sku_prices": [99.99, 149.99, 199.99],
  304 + "sku_weights": [100, 150, 200],
  305 + "sku_weight_units": ["g"],
  306 + "total_inventory": 500,
  307 + "sales": 1000,
  308 + "image_url": "https://example.com/image.jpg",
  309 + "create_time": "2024-01-01T00:00:00",
  310 + "update_time": "2024-01-02T00:00:00",
  311 + "skus": [
  312 + {
  313 + "sku_id": "456",
  314 + "price": 99.99,
  315 + "compare_at_price": 149.99,
  316 + "sku_code": "SKU001",
  317 + "stock": 100,
  318 + "weight": 100.0,
  319 + "weight_unit": "g",
  320 + "option1_value": "红色",
  321 + "option2_value": "S",
  322 + "option3_value": null,
  323 + "image_src": "https://example.com/sku1.jpg"
  324 + }
  325 + ],
  326 + "specifications": [
  327 + {
  328 + "sku_id": "456",
  329 + "name": "颜色",
  330 + "value": "红色"
  331 + },
  332 + {
  333 + "sku_id": "456",
  334 + "name": "尺寸",
  335 + "value": "S"
  336 + }
  337 + ]
  338 +}
  339 +```
  340 +
  341 +#### 错误响应
  342 +
  343 +**404 Not Found** - SPU不存在或已删除:
  344 +```json
  345 +{
  346 + "detail": "SPU 123 not found for tenant_id=1 or has been deleted"
  347 +}
  348 +```
  349 +
  350 +**400 Bad Request** - 缺少必需参数:
  351 +```json
  352 +{
  353 + "detail": "tenant_id is required"
  354 +}
  355 +```
  356 +
  357 +**500 Internal Server Error** - 服务器内部错误:
  358 +```json
  359 +{
  360 + "detail": "Internal server error: ..."
  361 +}
  362 +```
  363 +
  364 +**503 Service Unavailable** - 服务未初始化:
  365 +```json
  366 +{
  367 + "detail": "Incremental indexer service is not initialized. Please check database connection."
  368 +}
  369 +```
  370 +
  371 +### 健康检查接口
  372 +
  373 +检查增量索引服务的健康状态:
  374 +
  375 +```
  376 +GET /indexer/health
  377 +```
  378 +
  379 +#### 响应示例
  380 +
  381 +```json
  382 +{
  383 + "status": "available",
  384 + "database": "connected",
  385 + "preloaded_data": {
  386 + "category_mappings": 150,
  387 + "searchable_option_dimensions": ["option1", "option2", "option3"]
  388 + }
  389 +}
  390 +```
  391 +
  392 +### 性能优化
  393 +
  394 +服务在启动时预加载以下公共数据,以提高查询性能:
  395 +
  396 +1. **分类映射**:所有租户共享的分类ID到名称映射
  397 +2. **配置信息**:搜索配置(如 `searchable_option_dimensions`)
  398 +
  399 +这些数据在服务启动时一次性加载,后续查询无需重复查询数据库,大幅提升响应速度。
  400 +
  401 +### 使用场景
  402 +
  403 +#### 场景1:MySQL变更监听
  404 +
  405 +外部Java程序使用Canal或Debezium监听MySQL binlog,当检测到商品数据变更时:
  406 +
  407 +```java
  408 +// 伪代码示例
  409 +@EventListener
  410 +public void onProductChange(ProductChangeEvent event) {
  411 + String tenantId = event.getTenantId();
  412 + String spuId = event.getSpuId();
  413 +
  414 + // 调用增量接口获取ES文档数据
  415 + String url = String.format("http://localhost:6002/indexer/spu/%s?tenant_id=%s", spuId, tenantId);
  416 + Map<String, Object> esDoc = httpClient.get(url);
  417 +
  418 + // 推送到ES
  419 + elasticsearchClient.index("search_products", esDoc);
  420 +}
  421 +```
  422 +
  423 +#### 场景2:定时同步
  424 +
  425 +定时任务扫描变更的商品,批量更新:
  426 +
  427 +```java
  428 +// 伪代码示例
  429 +List<String> changedSpuIds = getChangedSpuIds();
  430 +for (String spuId : changedSpuIds) {
  431 + String url = String.format("http://localhost:6002/indexer/spu/%s?tenant_id=%s", spuId, tenantId);
  432 + Map<String, Object> esDoc = httpClient.get(url);
  433 + elasticsearchClient.index("search_products", esDoc);
  434 +}
  435 +```
  436 +
  437 +### 注意事项
  438 +
  439 +1. **服务初始化**:确保API服务已启动,且数据库连接配置正确(`DB_HOST`, `DB_DATABASE`, `DB_USERNAME`, `DB_PASSWORD`)。
  440 +2. **数据一致性**:接口返回的是调用时刻的数据快照,如果MySQL数据在调用后立即变更,可能需要重新调用。
  441 +3. **错误处理**:建议实现重试机制,对于404错误(SPU已删除),应调用ES删除接口。
  442 +4. **性能**:接口已优化,单次查询通常在100ms以内。如需批量获取,建议并发调用。
  443 +
  444 +---
  445 +
  446 +## 数据格式说明
  447 +
  448 +### ES文档结构
  449 +
  450 +返回的ES文档结构完全符合 `mappings/search_products.json` 定义的索引结构。主要字段说明:
  451 +
  452 +| 字段类别 | 字段名 | 类型 | 说明 |
  453 +|---------|--------|------|------|
  454 +| 基础标识 | `tenant_id` | keyword | 租户ID |
  455 +| 基础标识 | `spu_id` | keyword | SPU ID |
  456 +| 文本字段 | `title_zh`, `title_en` | text | 标题(中英文) |
  457 +| 文本字段 | `brief_zh`, `brief_en` | text | 简介(中英文) |
  458 +| 文本字段 | `description_zh`, `description_en` | text | 描述(中英文) |
  459 +| 文本字段 | `vendor_zh`, `vendor_en` | text | 供应商(中英文) |
  460 +| 类目字段 | `category_path_zh`, `category_path_en` | text | 类目路径(中英文) |
  461 +| 类目字段 | `category1_name`, `category2_name`, `category3_name` | keyword | 分层类目名称 |
  462 +| 价格字段 | `min_price`, `max_price` | float | 价格范围 |
  463 +| 库存字段 | `total_inventory` | long | 总库存 |
  464 +| 销量字段 | `sales` | long | 销量 |
  465 +| 嵌套字段 | `skus` | nested | SKU列表 |
  466 +| 嵌套字段 | `specifications` | nested | 规格列表 |
  467 +
  468 +详细字段说明请参考:[索引字段说明v2.md](./索引字段说明v2.md)
  469 +
  470 +### SKU嵌套结构
  471 +
  472 +```json
  473 +{
  474 + "skus": [
  475 + {
  476 + "sku_id": "456",
  477 + "price": 99.99,
  478 + "compare_at_price": 149.99,
  479 + "sku_code": "SKU001",
  480 + "stock": 100,
  481 + "weight": 100.0,
  482 + "weight_unit": "g",
  483 + "option1_value": "红色",
  484 + "option2_value": "S",
  485 + "option3_value": null,
  486 + "image_src": "https://example.com/sku1.jpg"
  487 + }
  488 + ]
  489 +}
  490 +```
  491 +
  492 +### Specifications嵌套结构
  493 +
  494 +```json
  495 +{
  496 + "specifications": [
  497 + {
  498 + "sku_id": "456",
  499 + "name": "颜色",
  500 + "value": "红色"
  501 + },
  502 + {
  503 + "sku_id": "456",
  504 + "name": "尺寸",
  505 + "value": "S"
  506 + }
  507 + ]
  508 +}
  509 +```
  510 +
  511 +---
  512 +
  513 +## 使用示例
  514 +
  515 +### 示例1:全量导入
  516 +
  517 +```bash
  518 +# 设置环境变量
  519 +export DB_HOST=120.79.247.228
  520 +export DB_PORT=3306
  521 +export DB_DATABASE=saas
  522 +export DB_USERNAME=saas
  523 +export DB_PASSWORD=your_password
  524 +export ES_HOST=http://localhost:9200
  525 +
  526 +# 执行全量导入
  527 +python scripts/recreate_and_import.py --tenant-id 1 --batch-size 500
  528 +```
  529 +
  530 +### 示例2:增量更新(Java)
  531 +
  532 +```java
  533 +import okhttp3.OkHttpClient;
  534 +import okhttp3.Request;
  535 +import okhttp3.Response;
  536 +import com.fasterxml.jackson.databind.ObjectMapper;
  537 +import org.elasticsearch.client.RestHighLevelClient;
  538 +
  539 +public class IncrementalIndexer {
  540 + private static final String API_BASE_URL = "http://localhost:6002";
  541 + private static final OkHttpClient httpClient = new OkHttpClient();
  542 + private static final ObjectMapper objectMapper = new ObjectMapper();
  543 + private static final RestHighLevelClient esClient = createESClient();
  544 +
  545 + /**
  546 + * 获取SPU的ES文档数据并推送到ES
  547 + */
  548 + public void indexSpu(String tenantId, String spuId) throws Exception {
  549 + // 1. 调用增量接口获取数据
  550 + String url = String.format("%s/indexer/spu/%s?tenant_id=%s",
  551 + API_BASE_URL, spuId, tenantId);
  552 +
  553 + Request request = new Request.Builder()
  554 + .url(url)
  555 + .get()
  556 + .build();
  557 +
  558 + try (Response response = httpClient.newCall(request).execute()) {
  559 + if (response.code() == 404) {
  560 + // SPU已删除,从ES中删除
  561 + deleteFromES(tenantId, spuId);
  562 + return;
  563 + }
  564 +
  565 + if (!response.isSuccessful()) {
  566 + throw new RuntimeException("Failed to get SPU data: " + response.code());
  567 + }
  568 +
  569 + // 2. 解析JSON响应
  570 + String json = response.body().string();
  571 + Map<String, Object> esDoc = objectMapper.readValue(json, Map.class);
  572 +
  573 + // 3. 推送到ES
  574 + IndexRequest indexRequest = new IndexRequest("search_products")
  575 + .id(spuId)
  576 + .source(esDoc);
  577 +
  578 + esClient.index(indexRequest, RequestOptions.DEFAULT);
  579 + }
  580 + }
  581 +
  582 + /**
  583 + * 从ES中删除SPU
  584 + */
  585 + public void deleteFromES(String tenantId, String spuId) throws Exception {
  586 + DeleteRequest deleteRequest = new DeleteRequest("search_products", spuId);
  587 + esClient.delete(deleteRequest, RequestOptions.DEFAULT);
  588 + }
  589 +}
  590 +```
  591 +
  592 +### 示例3:批量增量更新
  593 +
  594 +```java
  595 +/**
  596 + * 批量更新多个SPU
  597 + */
  598 +public void batchIndexSpus(String tenantId, List<String> spuIds) {
  599 + ExecutorService executor = Executors.newFixedThreadPool(10);
  600 + List<Future<?>> futures = new ArrayList<>();
  601 +
  602 + for (String spuId : spuIds) {
  603 + Future<?> future = executor.submit(() -> {
  604 + try {
  605 + indexSpu(tenantId, spuId);
  606 + } catch (Exception e) {
  607 + log.error("Failed to index SPU: " + spuId, e);
  608 + }
  609 + });
  610 + futures.add(future);
  611 + }
  612 +
  613 + // 等待所有任务完成
  614 + for (Future<?> future : futures) {
  615 + try {
  616 + future.get();
  617 + } catch (Exception e) {
  618 + log.error("Task failed", e);
  619 + }
  620 + }
  621 +
  622 + executor.shutdown();
  623 +}
  624 +```
  625 +
  626 +### 示例4:监听MySQL变更(Canal)
  627 +
  628 +```java
  629 +@CanalEventListener
  630 +public class ProductChangeListener {
  631 +
  632 + @Autowired
  633 + private IncrementalIndexer indexer;
  634 +
  635 + @ListenPoint(
  636 + destination = "example",
  637 + schema = "saas",
  638 + table = {"shoplazza_product_spu", "shoplazza_product_sku"},
  639 + eventType = {CanalEntry.EventType.INSERT, CanalEntry.EventType.UPDATE, CanalEntry.EventType.DELETE}
  640 + )
  641 + public void onEvent(CanalEntry.Entry entry) {
  642 + String tableName = entry.getHeader().getTableName();
  643 + String tenantId = extractTenantId(entry);
  644 + String spuId = extractSpuId(entry, tableName);
  645 +
  646 + if (tableName.equals("shoplazza_product_spu")) {
  647 + if (entry.getHeader().getEventType() == CanalEntry.EventType.DELETE) {
  648 + // SPU删除,从ES删除
  649 + indexer.deleteFromES(tenantId, spuId);
  650 + } else {
  651 + // SPU新增或更新,重新索引
  652 + indexer.indexSpu(tenantId, spuId);
  653 + }
  654 + } else if (tableName.equals("shoplazza_product_sku")) {
  655 + // SKU变更,需要更新对应的SPU
  656 + indexer.indexSpu(tenantId, spuId);
  657 + }
  658 + }
  659 +}
  660 +```
  661 +
  662 +---
  663 +
  664 +## 常见问题
  665 +
  666 +### Q1: 全量导入和增量接口的区别?
  667 +
  668 +- **全量导入**:适合首次导入或数据重建,一次性导入所有数据,但耗时较长。
  669 +- **增量接口**:适合实时同步,按需获取单个SPU数据,响应快速。
  670 +
  671 +### Q2: 增量接口返回的数据是否包含向量字段?
  672 +
  673 +不包含。向量字段(`title_embedding`, `image_embedding`)需要单独生成,不在本接口返回范围内。如需向量字段,需要:
  674 +1. 调用本接口获取基础数据
  675 +2. 使用文本/图片编码服务生成向量
  676 +3. 将向量字段添加到文档后推送到ES
  677 +
  678 +### Q3: 如何处理SPU删除?
  679 +
  680 +当接口返回404时,表示SPU不存在或已删除。此时应从ES中删除对应文档:
  681 +
  682 +```java
  683 +if (response.code() == 404) {
  684 + DeleteRequest deleteRequest = new DeleteRequest("search_products", spuId);
  685 + esClient.delete(deleteRequest, RequestOptions.DEFAULT);
  686 +}
  687 +```
  688 +
  689 +### Q4: 服务启动失败,提示数据库连接错误?
  690 +
  691 +检查环境变量或配置文件中的数据库连接信息:
  692 +- `DB_HOST`
  693 +- `DB_PORT`
  694 +- `DB_DATABASE`
  695 +- `DB_USERNAME`
  696 +- `DB_PASSWORD`
  697 +
  698 +确保这些变量已正确设置,且数据库可访问。
  699 +
  700 +### Q5: 接口响应慢怎么办?
  701 +
  702 +1. 检查数据库连接池配置
  703 +2. 确认预加载数据是否成功(调用 `/indexer/health` 检查)
  704 +3. 检查数据库查询性能(SPU、SKU、Option表是否有索引)
  705 +4. 考虑使用连接池和缓存优化
  706 +
  707 +---
  708 +
  709 +## 相关文档
  710 +
  711 +- [索引字段说明v2.md](./索引字段说明v2.md) - ES索引字段详细说明
  712 +- [索引字段说明v2-参考表结构.md](./索引字段说明v2-参考表结构.md) - MySQL表结构参考
  713 +- [mappings/search_products.json](../mappings/search_products.json) - ES索引mapping定义
  714 +
docs/翻译功能测试说明.md 0 → 100644
@@ -0,0 +1,197 @@ @@ -0,0 +1,197 @@
  1 +# 翻译功能测试说明
  2 +
  3 +## 功能概述
  4 +
  5 +本次更新实现了以下功能:
  6 +
  7 +1. **翻译提示词配置**:支持中英文提示词,用于提高翻译质量
  8 +2. **DeepL Context参数**:提示词作为DeepL API的`context`参数传递(不参与翻译,仅提供上下文)
  9 +3. **同步/异步翻译**:
  10 + - 索引场景:同步翻译,等待结果返回
  11 + - 查询场景:异步翻译,立即返回缓存结果
  12 +4. **缓存机制**:翻译结果自动缓存,避免重复翻译
  13 +
  14 +## 配置说明
  15 +
  16 +### 配置文件位置
  17 +
  18 +`config/config.yaml`
  19 +
  20 +### 翻译提示词配置
  21 +
  22 +```yaml
  23 +translation_prompts:
  24 + # 商品标题翻译提示词
  25 + product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。"
  26 + product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."
  27 + # query翻译提示词
  28 + query_zh: "电商领域"
  29 + query_en: "e-commerce domain"
  30 + # 默认翻译提示词
  31 + default_zh: "电商领域"
  32 + default_en: "e-commerce domain"
  33 +```
  34 +
  35 +### 提示词使用规则
  36 +
  37 +1. **商品标题翻译**:
  38 + - 中文→英文:使用 `product_title_en`
  39 + - 英文→中文:使用 `product_title_zh`
  40 +
  41 +2. **其他字段翻译**(brief, description, vendor):
  42 + - 根据目标语言选择 `default_zh` 或 `default_en`
  43 +
  44 +3. **查询翻译**:
  45 + - 根据目标语言选择 `query_zh` 或 `query_en`
  46 +
  47 +## 测试方法
  48 +
  49 +### 1. 测试配置加载
  50 +
  51 +```python
  52 +from config import ConfigLoader
  53 +
  54 +config_loader = ConfigLoader()
  55 +config = config_loader.load_config()
  56 +
  57 +# 检查翻译提示词配置
  58 +print(config.query_config.translation_prompts)
  59 +```
  60 +
  61 +### 2. 测试同步翻译(索引场景)
  62 +
  63 +```python
  64 +from query.translator import Translator
  65 +from config import ConfigLoader
  66 +
  67 +config = ConfigLoader().load_config()
  68 +translator = Translator(
  69 + api_key=config.query_config.translation_api_key,
  70 + use_cache=True
  71 +)
  72 +
  73 +# 测试商品标题翻译
  74 +text = "蓝牙耳机"
  75 +prompt = config.query_config.translation_prompts.get('product_title_en')
  76 +result = translator.translate(
  77 + text,
  78 + target_lang='en',
  79 + source_lang='zh',
  80 + prompt=prompt
  81 +)
  82 +print(f"翻译结果: {result}")
  83 +```
  84 +
  85 +### 3. 测试异步翻译(查询场景)
  86 +
  87 +```python
  88 +# 异步模式(立即返回,后台翻译)
  89 +results = translator.translate_multi(
  90 + "手机",
  91 + target_langs=['en'],
  92 + source_lang='zh',
  93 + async_mode=True,
  94 + prompt=config.query_config.translation_prompts.get('query_en')
  95 +)
  96 +print(f"异步结果: {results}") # 可能包含None(后台翻译中)
  97 +
  98 +# 同步模式(等待完成)
  99 +results_sync = translator.translate_multi(
  100 + "手机",
  101 + target_langs=['en'],
  102 + source_lang='zh',
  103 + async_mode=False,
  104 + prompt=config.query_config.translation_prompts.get('query_en')
  105 +)
  106 +print(f"同步结果: {results_sync}")
  107 +```
  108 +
  109 +### 4. 测试文档转换器集成
  110 +
  111 +```python
  112 +from indexer.document_transformer import SPUDocumentTransformer
  113 +import pandas as pd
  114 +
  115 +# 创建模拟数据
  116 +spu_row = pd.Series({
  117 + 'id': 123,
  118 + 'tenant_id': '1',
  119 + 'title': '蓝牙耳机',
  120 + 'brief': '高品质无线蓝牙耳机',
  121 + 'description': '这是一款高品质的无线蓝牙耳机。',
  122 + 'vendor': '品牌A',
  123 + # ... 其他字段
  124 +})
  125 +
  126 +# 初始化转换器(带翻译器)
  127 +transformer = SPUDocumentTransformer(
  128 + category_id_to_name={},
  129 + searchable_option_dimensions=['option1', 'option2', 'option3'],
  130 + tenant_config={'primary_language': 'zh', 'translate_to_en': True},
  131 + translator=translator,
  132 + translation_prompts=config.query_config.translation_prompts
  133 +)
  134 +
  135 +# 转换文档
  136 +doc = transformer.transform_spu_to_doc(
  137 + tenant_id='1',
  138 + spu_row=spu_row,
  139 + skus=pd.DataFrame(),
  140 + options=pd.DataFrame()
  141 +)
  142 +
  143 +print(f"title_zh: {doc.get('title_zh')}")
  144 +print(f"title_en: {doc.get('title_en')}") # 应该包含翻译结果
  145 +```
  146 +
  147 +### 5. 测试缓存功能
  148 +
  149 +```python
  150 +# 第一次翻译(调用API)
  151 +result1 = translator.translate("测试文本", "en", "zh", prompt="电商领域")
  152 +
  153 +# 第二次翻译(使用缓存)
  154 +result2 = translator.translate("测试文本", "en", "zh", prompt="电商领域")
  155 +
  156 +assert result1 == result2 # 应该相同
  157 +```
  158 +
  159 +## DeepL API Context参数说明
  160 +
  161 +根据 [DeepL API文档](https://developers.deepl.com/api-reference/translate/request-translation):
  162 +
  163 +- `context` 参数:Additional context that can influence a translation but is not translated itself
  164 +- Context中的字符不计入计费
  165 +- Context用于提供翻译上下文,帮助提高翻译质量
  166 +
  167 +我们的实现:
  168 +- 将提示词作为 `context` 参数传递给DeepL API
  169 +- Context不参与翻译,仅提供上下文信息
  170 +- 不同场景使用不同的提示词(商品标题、查询、默认)
  171 +
  172 +## 运行完整测试
  173 +
  174 +```bash
  175 +# 激活环境
  176 +source /home/tw/miniconda3/etc/profile.d/conda.sh
  177 +conda activate searchengine
  178 +
  179 +# 运行测试脚本
  180 +python scripts/test_translation.py
  181 +```
  182 +
  183 +## 验证要点
  184 +
  185 +1. **配置加载**:确认所有提示词配置正确加载
  186 +2. **同步翻译**:索引时翻译结果正确填充到文档
  187 +3. **异步翻译**:查询时缓存命中立即返回,未命中后台翻译
  188 +4. **提示词使用**:不同场景使用正确的提示词
  189 +5. **缓存机制**:相同文本和提示词的翻译结果被缓存
  190 +
  191 +## 注意事项
  192 +
  193 +1. 需要配置 `DEEPL_AUTH_KEY` 环境变量或 `translation_api_key`
  194 +2. 如果没有API key,翻译器会返回原文(mock模式)
  195 +3. 缓存文件存储在 `.cache/translations.json`
  196 +4. Context参数中的字符不计入DeepL计费
  197 +
indexer/bulk_indexing_service.py 0 → 100644
@@ -0,0 +1,108 @@ @@ -0,0 +1,108 @@
  1 +"""
  2 +全量索引服务。
  3 +
  4 +提供全量索引功能,将指定租户的所有SPU数据导入到ES。
  5 +"""
  6 +
  7 +import logging
  8 +from typing import Dict, Any
  9 +from sqlalchemy import Engine
  10 +from utils.es_client import ESClient
  11 +from indexer.spu_transformer import SPUTransformer
  12 +from indexer.bulk_indexer import BulkIndexer
  13 +from indexer.mapping_generator import load_mapping, delete_index_if_exists, DEFAULT_INDEX_NAME
  14 +
  15 +logger = logging.getLogger(__name__)
  16 +
  17 +
class BulkIndexingService:
    """Full-indexing service: imports all SPU documents of one tenant into Elasticsearch."""

    def __init__(self, db_engine: Engine, es_client: ESClient):
        """
        Initialize the full-indexing service.

        Args:
            db_engine: SQLAlchemy database engine
            es_client: Elasticsearch client
        """
        self.db_engine = db_engine
        self.es_client = es_client
        self.index_name = DEFAULT_INDEX_NAME

    def bulk_index(self, tenant_id: str, recreate_index: bool = False, batch_size: int = 500) -> Dict[str, Any]:
        """
        Run a full index build for one tenant.

        Steps: load the mapping, optionally delete the existing index, create
        the index when it is missing, transform the tenant's data into
        documents and bulk-index them.

        Args:
            tenant_id: Tenant whose data is transformed and indexed.
            recreate_index: When True, an existing index named
                ``self.index_name`` is deleted before it is created again.
            batch_size: Batch size handed to ``BulkIndexer``.

        Returns:
            A dict with the keys ``success``, ``total``, ``indexed``,
            ``failed``, ``elapsed_time``, ``index_name`` and ``tenant_id``.
            When there is nothing to index, a ``message`` key is added as well.

        Raises:
            RuntimeError: The index could not be deleted or created.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        import time

        # Monotonic clock: elapsed time must not be affected by wall-clock adjustments.
        started_at = time.perf_counter()

        try:
            # 1. Load the mapping
            logger.info(f"[BulkIndexing] Loading mapping for tenant_id={tenant_id}")
            mapping = load_mapping()

            # 2. Prepare the index (delete-and-recreate, or create if missing)
            if recreate_index:
                logger.info(f"[BulkIndexing] Recreating index: {self.index_name}")
                if self.es_client.index_exists(self.index_name):
                    if delete_index_if_exists(self.es_client, self.index_name):
                        logger.info(f"[BulkIndexing] Deleted existing index: {self.index_name}")
                    else:
                        raise RuntimeError(f"Failed to delete index: {self.index_name}")

            if not self.es_client.index_exists(self.index_name):
                logger.info(f"[BulkIndexing] Creating index: {self.index_name}")
                if not self.es_client.create_index(self.index_name, mapping):
                    raise RuntimeError(f"Failed to create index: {self.index_name}")
                logger.info(f"[BulkIndexing] Created index: {self.index_name}")
            else:
                logger.info(f"[BulkIndexing] Index already exists: {self.index_name}")

            # 3. Transform the data
            logger.info(f"[BulkIndexing] Transforming data for tenant_id={tenant_id}")
            transformer = SPUTransformer(self.db_engine, tenant_id)
            documents = transformer.transform_batch()

            if not documents:
                logger.warning(f"[BulkIndexing] No documents to index for tenant_id={tenant_id}")
                # Same keys as the normal result so callers can read both uniformly.
                return {
                    "success": True,
                    "total": 0,
                    "indexed": 0,
                    "failed": 0,
                    "elapsed_time": time.perf_counter() - started_at,
                    "index_name": self.index_name,
                    "tenant_id": tenant_id,
                    "message": "No documents to index"
                }

            logger.info(f"[BulkIndexing] Transformed {len(documents)} documents")

            # 4. Bulk import
            logger.info(f"[BulkIndexing] Indexing {len(documents)} documents (batch_size={batch_size})")
            indexer = BulkIndexer(self.es_client, self.index_name, batch_size=batch_size)
            results = indexer.index_documents(
                documents,
                id_field="spu_id",
                show_progress=False  # no progress output when invoked through the API
            )

            elapsed_time = time.perf_counter() - started_at

            logger.info(
                f"[BulkIndexing] Completed for tenant_id={tenant_id}: "
                f"indexed={results['success']}, failed={results['failed']}, "
                f"elapsed={elapsed_time:.2f}s"
            )

            return {
                "success": results['failed'] == 0,
                "total": len(documents),
                "indexed": results['success'],
                "failed": results['failed'],
                "elapsed_time": elapsed_time,
                "index_name": self.index_name,
                "tenant_id": tenant_id
            }

        except Exception as e:
            logger.error(f"[BulkIndexing] Failed for tenant_id={tenant_id}: {e}", exc_info=True)
            raise
  108 +
indexer/document_transformer.py 0 → 100644
@@ -0,0 +1,545 @@ @@ -0,0 +1,545 @@
  1 +"""
  2 +SPU文档转换器 - 公共转换逻辑。
  3 +
  4 +提取全量和增量索引共用的文档转换逻辑,避免代码冗余。
  5 +"""
  6 +
  7 +import pandas as pd
  8 +import logging
  9 +from typing import Dict, Any, Optional, List
  10 +from config import ConfigLoader
  11 +
  12 +logger = logging.getLogger(__name__)
  13 +
  14 +# Try to import translator (optional dependency)
  15 +try:
  16 + from query.translator import Translator
  17 + TRANSLATOR_AVAILABLE = True
  18 +except ImportError:
  19 + TRANSLATOR_AVAILABLE = False
  20 + Translator = None
  21 +
  22 +
  23 +class SPUDocumentTransformer:
  24 + """SPU文档转换器,将SPU、SKU、Option数据转换为ES文档格式。"""
  25 +
  26 + def __init__(
  27 + self,
  28 + category_id_to_name: Dict[str, str],
  29 + searchable_option_dimensions: List[str],
  30 + tenant_config: Optional[Dict[str, Any]] = None,
  31 + translator: Optional[Any] = None,
  32 + translation_prompts: Optional[Dict[str, str]] = None
  33 + ):
  34 + """
  35 + 初始化文档转换器。
  36 +
  37 + Args:
  38 + category_id_to_name: 分类ID到名称的映射
  39 + searchable_option_dimensions: 可搜索的option维度列表
  40 + tenant_config: 租户配置(包含主语言和翻译配置)
  41 + translator: 翻译器实例(可选,如果提供则启用翻译功能)
  42 + translation_prompts: 翻译提示词配置(可选)
  43 + """
  44 + self.category_id_to_name = category_id_to_name
  45 + self.searchable_option_dimensions = searchable_option_dimensions
  46 + self.tenant_config = tenant_config or {}
  47 + self.translator = translator
  48 + self.translation_prompts = translation_prompts or {}
  49 +
  50 + def transform_spu_to_doc(
  51 + self,
  52 + tenant_id: str,
  53 + spu_row: pd.Series,
  54 + skus: pd.DataFrame,
  55 + options: pd.DataFrame
  56 + ) -> Optional[Dict[str, Any]]:
  57 + """
  58 + 将单个SPU行和其SKUs转换为ES文档。
  59 +
  60 + Args:
  61 + tenant_id: 租户ID
  62 + spu_row: SPU行数据
  63 + skus: SKU数据DataFrame
  64 + options: Option数据DataFrame
  65 +
  66 + Returns:
  67 + ES文档字典
  68 + """
  69 + doc = {}
  70 +
  71 + # Tenant ID (required)
  72 + doc['tenant_id'] = str(tenant_id)
  73 +
  74 + # SPU ID
  75 + spu_id = spu_row['id']
  76 + doc['spu_id'] = str(spu_id)
  77 +
  78 + # Validate required fields
  79 + if pd.isna(spu_row.get('title')) or not str(spu_row['title']).strip():
  80 + logger.error(f"SPU {spu_id} has no title, this may cause search issues")
  81 +
  82 + # 获取租户配置
  83 + primary_lang = self.tenant_config.get('primary_language', 'zh')
  84 + translate_to_en = self.tenant_config.get('translate_to_en', True)
  85 + translate_to_zh = self.tenant_config.get('translate_to_zh', False)
  86 +
  87 + # 文本字段处理(根据主语言和翻译配置)
  88 + self._fill_text_fields(doc, spu_row, primary_lang, translate_to_en, translate_to_zh)
  89 +
  90 + # Tags
  91 + if pd.notna(spu_row.get('tags')):
  92 + tags_str = str(spu_row['tags'])
  93 + doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()]
  94 +
  95 + # Category相关字段
  96 + self._fill_category_fields(doc, spu_row)
  97 +
  98 + # Option名称(从option表获取)
  99 + self._fill_option_names(doc, options)
  100 +
  101 + # Image URL
  102 + self._fill_image_url(doc, spu_row)
  103 +
  104 + # Sales (fake_sales)
  105 + if pd.notna(spu_row.get('fake_sales')):
  106 + try:
  107 + doc['sales'] = int(spu_row['fake_sales'])
  108 + except (ValueError, TypeError):
  109 + doc['sales'] = 0
  110 + else:
  111 + doc['sales'] = 0
  112 +
  113 + # Process SKUs and build specifications
  114 + skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications = \
  115 + self._process_skus(skus, options)
  116 +
  117 + doc['skus'] = skus_list
  118 + doc['specifications'] = specifications
  119 +
  120 + # 提取option值(根据配置的searchable_option_dimensions)
  121 + self._fill_option_values(doc, skus)
  122 +
  123 + # Calculate price ranges
  124 + if prices:
  125 + doc['min_price'] = float(min(prices))
  126 + doc['max_price'] = float(max(prices))
  127 + else:
  128 + doc['min_price'] = 0.0
  129 + doc['max_price'] = 0.0
  130 +
  131 + if compare_prices:
  132 + doc['compare_at_price'] = float(max(compare_prices))
  133 + else:
  134 + doc['compare_at_price'] = None
  135 +
  136 + # SKU扁平化字段
  137 + doc['sku_prices'] = sku_prices
  138 + doc['sku_weights'] = sku_weights
  139 + doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重
  140 + doc['total_inventory'] = total_inventory
  141 +
  142 + # Time fields - convert datetime to ISO format string for ES DATE type
  143 + if pd.notna(spu_row.get('create_time')):
  144 + create_time = spu_row['create_time']
  145 + if hasattr(create_time, 'isoformat'):
  146 + doc['create_time'] = create_time.isoformat()
  147 + else:
  148 + doc['create_time'] = str(create_time)
  149 +
  150 + if pd.notna(spu_row.get('update_time')):
  151 + update_time = spu_row['update_time']
  152 + if hasattr(update_time, 'isoformat'):
  153 + doc['update_time'] = update_time.isoformat()
  154 + else:
  155 + doc['update_time'] = str(update_time)
  156 +
  157 + return doc
  158 +
  159 + def _fill_text_fields(
  160 + self,
  161 + doc: Dict[str, Any],
  162 + spu_row: pd.Series,
  163 + primary_lang: str,
  164 + translate_to_en: bool,
  165 + translate_to_zh: bool
  166 + ):
  167 + """填充文本字段(根据主语言和翻译配置)。"""
  168 + # 主语言字段
  169 + primary_suffix = '_zh' if primary_lang == 'zh' else '_en'
  170 + secondary_suffix = '_en' if primary_lang == 'zh' else '_zh'
  171 +
  172 + # Title
  173 + if pd.notna(spu_row.get('title')):
  174 + title_text = str(spu_row['title'])
  175 + doc[f'title{primary_suffix}'] = title_text
  176 + # 如果需要翻译,调用翻译服务(同步模式)
  177 + if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):
  178 + if self.translator:
  179 + target_lang = 'en' if primary_lang == 'zh' else 'zh'
  180 + # 根据目标语言选择对应的提示词
  181 + if target_lang == 'zh':
  182 + prompt = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh')
  183 + else:
  184 + prompt = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en')
  185 + translated = self.translator.translate(
  186 + title_text,
  187 + target_lang=target_lang,
  188 + source_lang=primary_lang,
  189 + prompt=prompt
  190 + )
  191 + doc[f'title{secondary_suffix}'] = translated if translated else None
  192 + else:
  193 + doc[f'title{secondary_suffix}'] = None # 无翻译器,设为None
  194 + else:
  195 + doc[f'title{secondary_suffix}'] = None
  196 + else:
  197 + doc[f'title{primary_suffix}'] = None
  198 + doc[f'title{secondary_suffix}'] = None
  199 +
  200 + # Brief
  201 + if pd.notna(spu_row.get('brief')):
  202 + brief_text = str(spu_row['brief'])
  203 + doc[f'brief{primary_suffix}'] = brief_text
  204 + if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):
  205 + if self.translator:
  206 + target_lang = 'en' if primary_lang == 'zh' else 'zh'
  207 + # 根据目标语言选择对应的提示词
  208 + prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  209 + translated = self.translator.translate(
  210 + brief_text,
  211 + target_lang=target_lang,
  212 + source_lang=primary_lang,
  213 + prompt=prompt
  214 + )
  215 + doc[f'brief{secondary_suffix}'] = translated if translated else None
  216 + else:
  217 + doc[f'brief{secondary_suffix}'] = None
  218 + else:
  219 + doc[f'brief{secondary_suffix}'] = None
  220 + else:
  221 + doc[f'brief{primary_suffix}'] = None
  222 + doc[f'brief{secondary_suffix}'] = None
  223 +
  224 + # Description
  225 + if pd.notna(spu_row.get('description')):
  226 + desc_text = str(spu_row['description'])
  227 + doc[f'description{primary_suffix}'] = desc_text
  228 + if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):
  229 + if self.translator:
  230 + target_lang = 'en' if primary_lang == 'zh' else 'zh'
  231 + # 根据目标语言选择对应的提示词
  232 + prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  233 + translated = self.translator.translate(
  234 + desc_text,
  235 + target_lang=target_lang,
  236 + source_lang=primary_lang,
  237 + prompt=prompt
  238 + )
  239 + doc[f'description{secondary_suffix}'] = translated if translated else None
  240 + else:
  241 + doc[f'description{secondary_suffix}'] = None
  242 + else:
  243 + doc[f'description{secondary_suffix}'] = None
  244 + else:
  245 + doc[f'description{primary_suffix}'] = None
  246 + doc[f'description{secondary_suffix}'] = None
  247 +
  248 + # Vendor
  249 + if pd.notna(spu_row.get('vendor')):
  250 + vendor_text = str(spu_row['vendor'])
  251 + doc[f'vendor{primary_suffix}'] = vendor_text
  252 + if (primary_lang == 'zh' and translate_to_en) or (primary_lang == 'en' and translate_to_zh):
  253 + if self.translator:
  254 + target_lang = 'en' if primary_lang == 'zh' else 'zh'
  255 + # 根据目标语言选择对应的提示词
  256 + prompt = self.translation_prompts.get(f'default_{target_lang}') or self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
  257 + translated = self.translator.translate(
  258 + vendor_text,
  259 + target_lang=target_lang,
  260 + source_lang=primary_lang,
  261 + prompt=prompt
  262 + )
  263 + doc[f'vendor{secondary_suffix}'] = translated if translated else None
  264 + else:
  265 + doc[f'vendor{secondary_suffix}'] = None
  266 + else:
  267 + doc[f'vendor{secondary_suffix}'] = None
  268 + else:
  269 + doc[f'vendor{primary_suffix}'] = None
  270 + doc[f'vendor{secondary_suffix}'] = None
  271 +
  272 + def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series):
  273 + """填充类目相关字段。"""
  274 + if pd.notna(spu_row.get('category_path')):
  275 + category_path = str(spu_row['category_path'])
  276 +
  277 + # 解析category_path - 这是逗号分隔的类目ID列表
  278 + category_ids = [cid.strip() for cid in category_path.split(',') if cid.strip()]
  279 +
  280 + # 将ID映射为名称
  281 + category_names = []
  282 + for cid in category_ids:
  283 + if cid in self.category_id_to_name:
  284 + category_names.append(self.category_id_to_name[cid])
  285 + else:
  286 + logger.error(f"Category ID {cid} not found in mapping for SPU {spu_row['id']} (title: {spu_row.get('title', 'N/A')}), category_path={category_path}")
  287 + category_names.append(cid) # 使用ID作为备选
  288 +
  289 + # 构建类目路径字符串(用于搜索)
  290 + if category_names:
  291 + category_path_str = '/'.join(category_names)
  292 + doc['category_path_zh'] = category_path_str
  293 + doc['category_path_en'] = None # 暂时设为空
  294 +
  295 + # 填充分层类目名称
  296 + if len(category_names) > 0:
  297 + doc['category1_name'] = category_names[0]
  298 + if len(category_names) > 1:
  299 + doc['category2_name'] = category_names[1]
  300 + if len(category_names) > 2:
  301 + doc['category3_name'] = category_names[2]
  302 + elif pd.notna(spu_row.get('category')):
  303 + # 如果category_path为空,使用category字段作为category1_name的备选
  304 + category = str(spu_row['category'])
  305 + doc['category_name_zh'] = category
  306 + doc['category_name_en'] = None
  307 + doc['category_name'] = category
  308 +
  309 + # 尝试从category字段解析多级分类
  310 + if '/' in category:
  311 + path_parts = category.split('/')
  312 + if len(path_parts) > 0:
  313 + doc['category1_name'] = path_parts[0].strip()
  314 + if len(path_parts) > 1:
  315 + doc['category2_name'] = path_parts[1].strip()
  316 + if len(path_parts) > 2:
  317 + doc['category3_name'] = path_parts[2].strip()
  318 + else:
  319 + # 如果category不包含"/",直接作为category1_name
  320 + doc['category1_name'] = category.strip()
  321 +
  322 + if pd.notna(spu_row.get('category')):
  323 + # 确保category相关字段都被设置(如果前面没有设置)
  324 + category_name = str(spu_row['category'])
  325 + if 'category_name_zh' not in doc:
  326 + doc['category_name_zh'] = category_name
  327 + if 'category_name_en' not in doc:
  328 + doc['category_name_en'] = None
  329 + if 'category_name' not in doc:
  330 + doc['category_name'] = category_name
  331 +
  332 + if pd.notna(spu_row.get('category_id')):
  333 + doc['category_id'] = str(int(spu_row['category_id']))
  334 +
  335 + if pd.notna(spu_row.get('category_level')):
  336 + doc['category_level'] = int(spu_row['category_level'])
  337 +
  338 + def _fill_option_names(self, doc: Dict[str, Any], options: pd.DataFrame):
  339 + """填充Option名称字段。"""
  340 + if not options.empty:
  341 + # 按position排序获取option名称
  342 + sorted_options = options.sort_values('position')
  343 + if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')):
  344 + doc['option1_name'] = str(sorted_options.iloc[0]['name'])
  345 + if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')):
  346 + doc['option2_name'] = str(sorted_options.iloc[1]['name'])
  347 + if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')):
  348 + doc['option3_name'] = str(sorted_options.iloc[2]['name'])
  349 +
  350 + def _fill_image_url(self, doc: Dict[str, Any], spu_row: pd.Series):
  351 + """填充图片URL字段。"""
  352 + if pd.notna(spu_row.get('image_src')):
  353 + image_src = str(spu_row['image_src'])
  354 + if not image_src.startswith('http'):
  355 + image_src = f"//{image_src}" if image_src.startswith('//') else image_src
  356 + doc['image_url'] = image_src
  357 +
  358 + def _process_skus(
  359 + self,
  360 + skus: pd.DataFrame,
  361 + options: pd.DataFrame
  362 + ) -> tuple:
  363 + """处理SKU数据,返回处理结果。"""
  364 + skus_list = []
  365 + prices = []
  366 + compare_prices = []
  367 + sku_prices = []
  368 + sku_weights = []
  369 + sku_weight_units = []
  370 + total_inventory = 0
  371 + specifications = []
  372 +
  373 + # 构建option名称映射(position -> name)
  374 + option_name_map = {}
  375 + if not options.empty:
  376 + for _, opt_row in options.iterrows():
  377 + position = opt_row.get('position')
  378 + name = opt_row.get('name')
  379 + if pd.notna(position) and pd.notna(name):
  380 + option_name_map[int(position)] = str(name)
  381 +
  382 + for _, sku_row in skus.iterrows():
  383 + sku_data = self._transform_sku_row(sku_row, option_name_map)
  384 + if sku_data:
  385 + skus_list.append(sku_data)
  386 +
  387 + # 收集价格信息
  388 + if 'price' in sku_data and sku_data['price'] is not None:
  389 + try:
  390 + price_val = float(sku_data['price'])
  391 + prices.append(price_val)
  392 + sku_prices.append(price_val)
  393 + except (ValueError, TypeError):
  394 + pass
  395 +
  396 + if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None:
  397 + try:
  398 + compare_prices.append(float(sku_data['compare_at_price']))
  399 + except (ValueError, TypeError):
  400 + pass
  401 +
  402 + # 收集重量信息
  403 + if 'weight' in sku_data and sku_data['weight'] is not None:
  404 + try:
  405 + sku_weights.append(int(float(sku_data['weight'])))
  406 + except (ValueError, TypeError):
  407 + pass
  408 +
  409 + if 'weight_unit' in sku_data and sku_data['weight_unit']:
  410 + sku_weight_units.append(str(sku_data['weight_unit']))
  411 +
  412 + # 收集库存信息
  413 + if 'stock' in sku_data and sku_data['stock'] is not None:
  414 + try:
  415 + total_inventory += int(sku_data['stock'])
  416 + except (ValueError, TypeError):
  417 + pass
  418 +
  419 + # 构建specifications(从SKU的option值和option表的name)
  420 + sku_id = str(sku_row['id'])
  421 + if pd.notna(sku_row.get('option1')) and 1 in option_name_map:
  422 + specifications.append({
  423 + 'sku_id': sku_id,
  424 + 'name': option_name_map[1],
  425 + 'value': str(sku_row['option1'])
  426 + })
  427 + if pd.notna(sku_row.get('option2')) and 2 in option_name_map:
  428 + specifications.append({
  429 + 'sku_id': sku_id,
  430 + 'name': option_name_map[2],
  431 + 'value': str(sku_row['option2'])
  432 + })
  433 + if pd.notna(sku_row.get('option3')) and 3 in option_name_map:
  434 + specifications.append({
  435 + 'sku_id': sku_id,
  436 + 'name': option_name_map[3],
  437 + 'value': str(sku_row['option3'])
  438 + })
  439 +
  440 + return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications
  441 +
  442 + def _fill_option_values(self, doc: Dict[str, Any], skus: pd.DataFrame):
  443 + """填充option值字段。"""
  444 + option1_values = []
  445 + option2_values = []
  446 + option3_values = []
  447 +
  448 + for _, sku_row in skus.iterrows():
  449 + if pd.notna(sku_row.get('option1')):
  450 + option1_values.append(str(sku_row['option1']))
  451 + if pd.notna(sku_row.get('option2')):
  452 + option2_values.append(str(sku_row['option2']))
  453 + if pd.notna(sku_row.get('option3')):
  454 + option3_values.append(str(sku_row['option3']))
  455 +
  456 + # 去重并根据配置决定是否写入索引
  457 + if 'option1' in self.searchable_option_dimensions:
  458 + doc['option1_values'] = list(set(option1_values)) if option1_values else []
  459 + else:
  460 + doc['option1_values'] = []
  461 +
  462 + if 'option2' in self.searchable_option_dimensions:
  463 + doc['option2_values'] = list(set(option2_values)) if option2_values else []
  464 + else:
  465 + doc['option2_values'] = []
  466 +
  467 + if 'option3' in self.searchable_option_dimensions:
  468 + doc['option3_values'] = list(set(option3_values)) if option3_values else []
  469 + else:
  470 + doc['option3_values'] = []
  471 +
  472 + def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:
  473 + """
  474 + 将SKU行转换为SKU对象。
  475 +
  476 + Args:
  477 + sku_row: SKU行数据
  478 + option_name_map: position到option名称的映射
  479 +
  480 + Returns:
  481 + SKU字典
  482 + """
  483 + sku_data = {}
  484 +
  485 + # SKU ID
  486 + sku_data['sku_id'] = str(sku_row['id'])
  487 +
  488 + # Price
  489 + if pd.notna(sku_row.get('price')):
  490 + try:
  491 + sku_data['price'] = float(sku_row['price'])
  492 + except (ValueError, TypeError):
  493 + sku_data['price'] = None
  494 + else:
  495 + sku_data['price'] = None
  496 +
  497 + # Compare at price
  498 + if pd.notna(sku_row.get('compare_at_price')):
  499 + try:
  500 + sku_data['compare_at_price'] = float(sku_row['compare_at_price'])
  501 + except (ValueError, TypeError):
  502 + sku_data['compare_at_price'] = None
  503 + else:
  504 + sku_data['compare_at_price'] = None
  505 +
  506 + # SKU Code
  507 + if pd.notna(sku_row.get('sku')):
  508 + sku_data['sku_code'] = str(sku_row['sku'])
  509 +
  510 + # Stock
  511 + if pd.notna(sku_row.get('inventory_quantity')):
  512 + try:
  513 + sku_data['stock'] = int(sku_row['inventory_quantity'])
  514 + except (ValueError, TypeError):
  515 + sku_data['stock'] = 0
  516 + else:
  517 + sku_data['stock'] = 0
  518 +
  519 + # Weight
  520 + if pd.notna(sku_row.get('weight')):
  521 + try:
  522 + sku_data['weight'] = float(sku_row['weight'])
  523 + except (ValueError, TypeError):
  524 + sku_data['weight'] = None
  525 + else:
  526 + sku_data['weight'] = None
  527 +
  528 + # Weight unit
  529 + if pd.notna(sku_row.get('weight_unit')):
  530 + sku_data['weight_unit'] = str(sku_row['weight_unit'])
  531 +
  532 + # Option values
  533 + if pd.notna(sku_row.get('option1')):
  534 + sku_data['option1_value'] = str(sku_row['option1'])
  535 + if pd.notna(sku_row.get('option2')):
  536 + sku_data['option2_value'] = str(sku_row['option2'])
  537 + if pd.notna(sku_row.get('option3')):
  538 + sku_data['option3_value'] = str(sku_row['option3'])
  539 +
  540 + # Image src
  541 + if pd.notna(sku_row.get('image_src')):
  542 + sku_data['image_src'] = str(sku_row['image_src'])
  543 +
  544 + return sku_data
  545 +
indexer/incremental_service.py 0 → 100644
@@ -0,0 +1,125 @@ @@ -0,0 +1,125 @@
  1 +"""增量数据获取服务"""
  2 +
  3 +import pandas as pd
  4 +import logging
  5 +from typing import Dict, Any, Optional
  6 +from sqlalchemy import text
  7 +from indexer.indexing_utils import load_category_mapping, create_document_transformer
  8 +
  9 +# Configure logger
  10 +logger = logging.getLogger(__name__)
  11 +
  12 +
  13 +class IncrementalIndexerService:
  14 + """增量索引服务,提供SPU数据获取功能。"""
  15 +
  16 + def __init__(self, db_engine: Any):
  17 + """初始化增量索引服务"""
  18 + self.db_engine = db_engine
  19 +
  20 + # 预加载分类映射(全局,所有租户共享)
  21 + self.category_id_to_name = load_category_mapping(db_engine)
  22 + logger.info(f"Preloaded {len(self.category_id_to_name)} category mappings")
  23 +
  24 + def get_spu_document(self, tenant_id: str, spu_id: str) -> Optional[Dict[str, Any]]:
  25 + """获取SPU的ES文档数据"""
  26 + try:
  27 + # 加载SPU数据
  28 + spu_row = self._load_single_spu(tenant_id, spu_id)
  29 + if spu_row is None:
  30 + logger.warning(f"SPU {spu_id} not found for tenant_id={tenant_id}")
  31 + return None
  32 +
  33 + # 加载SKU数据
  34 + skus_df = self._load_skus_for_spu(tenant_id, spu_id)
  35 +
  36 + # 加载Option数据
  37 + options_df = self._load_options_for_spu(tenant_id, spu_id)
  38 +
  39 + # 创建文档转换器
  40 + transformer = create_document_transformer(
  41 + category_id_to_name=self.category_id_to_name,
  42 + tenant_id=tenant_id
  43 + )
  44 +
  45 + # 转换为ES文档
  46 + doc = transformer.transform_spu_to_doc(
  47 + tenant_id=tenant_id,
  48 + spu_row=spu_row,
  49 + skus=skus_df,
  50 + options=options_df
  51 + )
  52 +
  53 + if doc is None:
  54 + logger.warning(f"Failed to transform SPU {spu_id} for tenant_id={tenant_id}")
  55 + return None
  56 +
  57 + return doc
  58 +
  59 + except Exception as e:
  60 + logger.error(f"Error getting SPU document for tenant_id={tenant_id}, spu_id={spu_id}: {e}", exc_info=True)
  61 + raise
  62 +
  63 + def _load_single_spu(self, tenant_id: str, spu_id: str) -> Optional[pd.Series]:
  64 + """加载单个SPU数据"""
  65 + query = text("""
  66 + SELECT
  67 + id, shop_id, shoplazza_id, title, brief, description,
  68 + spu, vendor, vendor_url,
  69 + image_src, image_width, image_height, image_path, image_alt,
  70 + tags, note, category, category_id, category_google_id,
  71 + category_level, category_path,
  72 + fake_sales, display_fake_sales,
  73 + tenant_id, creator, create_time, updater, update_time, deleted
  74 + FROM shoplazza_product_spu
  75 + WHERE tenant_id = :tenant_id AND id = :spu_id AND deleted = 0
  76 + LIMIT 1
  77 + """)
  78 +
  79 + with self.db_engine.connect() as conn:
  80 + df = pd.read_sql(query, conn, params={"tenant_id": tenant_id, "spu_id": spu_id})
  81 +
  82 + if df.empty:
  83 + return None
  84 +
  85 + return df.iloc[0]
  86 +
  87 + def _load_skus_for_spu(self, tenant_id: str, spu_id: str) -> pd.DataFrame:
  88 + """加载指定SPU的所有SKU数据"""
  89 + query = text("""
  90 + SELECT
  91 + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
  92 + shoplazza_image_id, title, sku, barcode, position,
  93 + price, compare_at_price, cost_price,
  94 + option1, option2, option3,
  95 + inventory_quantity, weight, weight_unit, image_src,
  96 + wholesale_price, note, extend,
  97 + shoplazza_created_at, shoplazza_updated_at, tenant_id,
  98 + creator, create_time, updater, update_time, deleted
  99 + FROM shoplazza_product_sku
  100 + WHERE tenant_id = :tenant_id AND spu_id = :spu_id AND deleted = 0
  101 + """)
  102 +
  103 + with self.db_engine.connect() as conn:
  104 + df = pd.read_sql(query, conn, params={"tenant_id": tenant_id, "spu_id": spu_id})
  105 +
  106 + return df
  107 +
  108 + def _load_options_for_spu(self, tenant_id: str, spu_id: str) -> pd.DataFrame:
  109 + """加载指定SPU的所有Option数据"""
  110 + query = text("""
  111 + SELECT
  112 + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
  113 + position, name, `values`, tenant_id,
  114 + creator, create_time, updater, update_time, deleted
  115 + FROM shoplazza_product_option
  116 + WHERE tenant_id = :tenant_id AND spu_id = :spu_id AND deleted = 0
  117 + ORDER BY position
  118 + """)
  119 +
  120 + with self.db_engine.connect() as conn:
  121 + df = pd.read_sql(query, conn, params={"tenant_id": tenant_id, "spu_id": spu_id})
  122 +
  123 + return df
  124 +
  125 +
indexer/indexing_utils.py 0 → 100644
@@ -0,0 +1,112 @@ @@ -0,0 +1,112 @@
  1 +"""
  2 +索引工具函数。
  3 +
  4 +提取公共逻辑,避免代码重复。
  5 +"""
  6 +
  7 +import logging
  8 +from typing import Dict, Any, Optional
  9 +from sqlalchemy import Engine, text
  10 +from config import ConfigLoader
  11 +from config.tenant_config_loader import get_tenant_config_loader
  12 +from indexer.document_transformer import SPUDocumentTransformer
  13 +
  14 +logger = logging.getLogger(__name__)
  15 +
  16 +
  17 +def load_category_mapping(db_engine: Engine) -> Dict[str, str]:
  18 + """
  19 + 加载分类ID到名称的映射(全局,所有租户共享)。
  20 +
  21 + Args:
  22 + db_engine: SQLAlchemy database engine
  23 +
  24 + Returns:
  25 + Dictionary mapping category_id to category_name
  26 + """
  27 + query = text("""
  28 + SELECT DISTINCT
  29 + category_id,
  30 + category
  31 + FROM shoplazza_product_spu
  32 + WHERE deleted = 0 AND category_id IS NOT NULL
  33 + """)
  34 +
  35 + mapping = {}
  36 + try:
  37 + with db_engine.connect() as conn:
  38 + result = conn.execute(query)
  39 + for row in result:
  40 + category_id = str(int(row.category_id))
  41 + category_name = row.category
  42 +
  43 + if not category_name or not category_name.strip():
  44 + logger.warning(f"Category ID {category_id} has empty name, skipping")
  45 + continue
  46 +
  47 + mapping[category_id] = category_name
  48 + except Exception as e:
  49 + logger.error(f"Failed to load category mapping: {e}", exc_info=True)
  50 +
  51 + return mapping
  52 +
  53 +
  54 +def create_document_transformer(
  55 + category_id_to_name: Dict[str, str],
  56 + tenant_id: str,
  57 + searchable_option_dimensions: Optional[list] = None,
  58 + translator: Optional[Any] = None,
  59 + translation_prompts: Optional[Dict[str, str]] = None
  60 +) -> SPUDocumentTransformer:
  61 + """
  62 + 创建文档转换器(统一初始化逻辑)。
  63 +
  64 + Args:
  65 + category_id_to_name: 分类ID到名称的映射
  66 + tenant_id: 租户ID
  67 + searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载)
  68 + translator: 翻译器实例(如果为None则根据配置初始化)
  69 + translation_prompts: 翻译提示词配置(如果为None则从配置加载)
  70 +
  71 + Returns:
  72 + SPUDocumentTransformer实例
  73 + """
  74 + # 加载租户配置
  75 + tenant_config_loader = get_tenant_config_loader()
  76 + tenant_config = tenant_config_loader.get_tenant_config(tenant_id)
  77 +
  78 + # 加载搜索配置(如果需要)
  79 + if searchable_option_dimensions is None or translator is None or translation_prompts is None:
  80 + try:
  81 + config_loader = ConfigLoader()
  82 + config = config_loader.load_config()
  83 +
  84 + if searchable_option_dimensions is None:
  85 + searchable_option_dimensions = config.spu_config.searchable_option_dimensions
  86 +
  87 + if translator is None and config.query_config.enable_translation:
  88 + from query.translator import Translator
  89 + translator = Translator(
  90 + api_key=config.query_config.translation_api_key,
  91 + use_cache=True,
  92 + glossary_id=config.query_config.translation_glossary_id,
  93 + translation_context=config.query_config.translation_context
  94 + )
  95 +
  96 + if translation_prompts is None:
  97 + translation_prompts = config.query_config.translation_prompts
  98 + except Exception as e:
  99 + logger.warning(f"Failed to load config, using defaults: {e}")
  100 + if searchable_option_dimensions is None:
  101 + searchable_option_dimensions = ['option1', 'option2', 'option3']
  102 + if translation_prompts is None:
  103 + translation_prompts = {}
  104 +
  105 + return SPUDocumentTransformer(
  106 + category_id_to_name=category_id_to_name,
  107 + searchable_option_dimensions=searchable_option_dimensions,
  108 + tenant_config=tenant_config,
  109 + translator=translator,
  110 + translation_prompts=translation_prompts
  111 + )
  112 +
indexer/spu_transformer.py
@@ -5,12 +5,10 @@ Transforms SPU and SKU data from MySQL into SPU-level ES documents with nested s @@ -5,12 +5,10 @@ Transforms SPU and SKU data from MySQL into SPU-level ES documents with nested s
5 """ 5 """
6 6
7 import pandas as pd 7 import pandas as pd
8 -import numpy as np  
9 import logging 8 import logging
10 from typing import Dict, Any, List, Optional 9 from typing import Dict, Any, List, Optional
11 -from sqlalchemy import create_engine, text  
12 -from utils.db_connector import create_db_connection  
13 -from config import ConfigLoader 10 +from sqlalchemy import text
  11 +from indexer.indexing_utils import load_category_mapping, create_document_transformer
14 12
15 # Configure logger 13 # Configure logger
16 logger = logging.getLogger(__name__) 14 logger = logging.getLogger(__name__)
@@ -19,70 +17,19 @@ logger = logging.getLogger(__name__) @@ -19,70 +17,19 @@ logger = logging.getLogger(__name__)
19 class SPUTransformer: 17 class SPUTransformer:
20 """Transform SPU and SKU data into SPU-level ES documents.""" 18 """Transform SPU and SKU data into SPU-level ES documents."""
21 19
22 - def __init__(  
23 - self,  
24 - db_engine: Any,  
25 - tenant_id: str  
26 - ):  
27 - """  
28 - Initialize SPU transformer.  
29 -  
30 - Args:  
31 - db_engine: SQLAlchemy database engine  
32 - tenant_id: Tenant ID for filtering data  
33 - """ 20 + def __init__(self, db_engine: Any, tenant_id: str):
34 self.db_engine = db_engine 21 self.db_engine = db_engine
35 self.tenant_id = tenant_id 22 self.tenant_id = tenant_id
36 23
37 - # Load configuration to get searchable_option_dimensions  
38 - try:  
39 - config_loader = ConfigLoader()  
40 - config = config_loader.load_config()  
41 - self.searchable_option_dimensions = config.spu_config.searchable_option_dimensions  
42 - except Exception as e:  
43 - print(f"Warning: Failed to load config, using default searchable_option_dimensions: {e}")  
44 - self.searchable_option_dimensions = ['option1', 'option2', 'option3']  
45 -  
46 # Load category ID to name mapping 24 # Load category ID to name mapping
47 - self.category_id_to_name = self._load_category_mapping()  
48 -  
49 - def _load_category_mapping(self) -> Dict[str, str]:  
50 - """  
51 - Load category ID to name mapping from database. 25 + self.category_id_to_name = load_category_mapping(db_engine)
  26 + logger.info(f"Loaded {len(self.category_id_to_name)} category ID to name mappings")
52 27
53 - Returns:  
54 - Dictionary mapping category_id to category_name  
55 - """  
56 - query = text("""  
57 - SELECT DISTINCT  
58 - category_id,  
59 - category  
60 - FROM shoplazza_product_spu  
61 - WHERE deleted = 0 AND category_id IS NOT NULL  
62 - """)  
63 -  
64 - mapping = {}  
65 - with self.db_engine.connect() as conn:  
66 - result = conn.execute(query)  
67 - for row in result:  
68 - category_id = str(int(row.category_id))  
69 - category_name = row.category  
70 -  
71 - if not category_name or not category_name.strip():  
72 - logger.warning(f"Category ID {category_id} has empty name, skipping")  
73 - continue  
74 -  
75 - mapping[category_id] = category_name  
76 -  
77 - logger.info(f"Loaded {len(mapping)} category ID to name mappings")  
78 -  
79 - # Log all category mappings for debugging  
80 - if mapping:  
81 - logger.debug("Category ID mappings:")  
82 - for cid, name in sorted(mapping.items()):  
83 - logger.debug(f" {cid} -> {name}")  
84 -  
85 - return mapping 28 + # Initialize document transformer
  29 + self.document_transformer = create_document_transformer(
  30 + category_id_to_name=self.category_id_to_name,
  31 + tenant_id=tenant_id
  32 + )
86 33
87 def load_spu_data(self) -> pd.DataFrame: 34 def load_spu_data(self) -> pd.DataFrame:
88 """ 35 """
@@ -291,7 +238,12 @@ class SPUTransformer: @@ -291,7 +238,12 @@ class SPUTransformer:
291 logger.warning(f"SPU {spu_id} (title: {spu_row.get('title', 'N/A')}) has no SKUs") 238 logger.warning(f"SPU {spu_id} (title: {spu_row.get('title', 'N/A')}) has no SKUs")
292 239
293 # Transform to ES document 240 # Transform to ES document
294 - doc = self._transform_spu_to_doc(spu_row, skus, options) 241 + doc = self.document_transformer.transform_spu_to_doc(
  242 + tenant_id=self.tenant_id,
  243 + spu_row=spu_row,
  244 + skus=skus,
  245 + options=options
  246 + )
295 if doc: 247 if doc:
296 documents.append(doc) 248 documents.append(doc)
297 else: 249 else:
@@ -309,378 +261,4 @@ class SPUTransformer: @@ -309,378 +261,4 @@ class SPUTransformer:
309 261
310 return documents 262 return documents
311 263
312 - def _transform_spu_to_doc(  
313 - self,  
314 - spu_row: pd.Series,  
315 - skus: pd.DataFrame,  
316 - options: pd.DataFrame  
317 - ) -> Optional[Dict[str, Any]]:  
318 - """  
319 - Transform a single SPU row and its SKUs into an ES document.  
320 -  
321 - Args:  
322 - spu_row: SPU row from database  
323 - skus: DataFrame with SKUs for this SPU  
324 - options: DataFrame with options for this SPU  
325 -  
326 - Returns:  
327 - ES document or None if transformation fails  
328 - """  
329 - doc = {}  
330 -  
331 - # Tenant ID (required)  
332 - doc['tenant_id'] = str(self.tenant_id)  
333 -  
334 - # SPU ID  
335 - spu_id = spu_row['id']  
336 - doc['spu_id'] = str(spu_id)  
337 -  
338 - # Validate required fields  
339 - if pd.isna(spu_row.get('title')) or not str(spu_row['title']).strip():  
340 - logger.error(f"SPU {spu_id} has no title, this may cause search issues")  
341 -  
342 - # 文本相关性相关字段(中英文双语,暂时只填充中文)  
343 - if pd.notna(spu_row.get('title')):  
344 - doc['title_zh'] = str(spu_row['title'])  
345 - doc['title_en'] = None # 暂时设为空  
346 -  
347 - if pd.notna(spu_row.get('brief')):  
348 - doc['brief_zh'] = str(spu_row['brief'])  
349 - doc['brief_en'] = None  
350 -  
351 - if pd.notna(spu_row.get('description')):  
352 - doc['description_zh'] = str(spu_row['description'])  
353 - doc['description_en'] = None  
354 -  
355 - if pd.notna(spu_row.get('vendor')):  
356 - doc['vendor_zh'] = str(spu_row['vendor'])  
357 - doc['vendor_en'] = None  
358 -  
359 - # Tags  
360 - if pd.notna(spu_row.get('tags')):  
361 - # Tags是逗号分隔的字符串,需要转换为数组  
362 - tags_str = str(spu_row['tags'])  
363 - doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()]  
364 -  
365 - # Category相关字段  
366 - if pd.notna(spu_row.get('category_path')):  
367 - category_path = str(spu_row['category_path'])  
368 -  
369 - # 解析category_path - 这是逗号分隔的类目ID列表  
370 - category_ids = [cid.strip() for cid in category_path.split(',') if cid.strip()]  
371 -  
372 - # 将ID映射为名称  
373 - category_names = []  
374 - missing_category_ids = []  
375 - for cid in category_ids:  
376 - if cid in self.category_id_to_name:  
377 - category_names.append(self.category_id_to_name[cid])  
378 - else:  
379 - # 如果找不到映射,记录错误并使用ID作为备选  
380 - logger.error(f"Category ID {cid} not found in mapping for SPU {spu_row['id']} (title: {spu_row.get('title', 'N/A')}), category_path={category_path}")  
381 - missing_category_ids.append(cid)  
382 - category_names.append(cid) # 使用ID作为备选  
383 -  
384 - # 构建类目路径字符串(用于搜索)  
385 - if category_names:  
386 - category_path_str = '/'.join(category_names)  
387 - doc['category_path_zh'] = category_path_str  
388 - doc['category_path_en'] = None # 暂时设为空  
389 -  
390 - # 填充分层类目名称  
391 - if len(category_names) > 0:  
392 - doc['category1_name'] = category_names[0]  
393 - if len(category_names) > 1:  
394 - doc['category2_name'] = category_names[1]  
395 - if len(category_names) > 2:  
396 - doc['category3_name'] = category_names[2]  
397 - elif pd.notna(spu_row.get('category')):  
398 - # 如果category_path为空,使用category字段作为category1_name的备选  
399 - category = str(spu_row['category'])  
400 - doc['category_name_zh'] = category  
401 - doc['category_name_en'] = None  
402 - doc['category_name'] = category  
403 -  
404 - # 尝试从category字段解析多级分类  
405 - if '/' in category:  
406 - path_parts = category.split('/')  
407 - if len(path_parts) > 0:  
408 - doc['category1_name'] = path_parts[0].strip()  
409 - if len(path_parts) > 1:  
410 - doc['category2_name'] = path_parts[1].strip()  
411 - if len(path_parts) > 2:  
412 - doc['category3_name'] = path_parts[2].strip()  
413 - else:  
414 - # 如果category不包含"/",直接作为category1_name  
415 - doc['category1_name'] = category.strip()  
416 -  
417 - if pd.notna(spu_row.get('category')):  
418 - # 确保category相关字段都被设置(如果前面没有设置)  
419 - category_name = str(spu_row['category'])  
420 - if 'category_name_zh' not in doc:  
421 - doc['category_name_zh'] = category_name  
422 - if 'category_name_en' not in doc:  
423 - doc['category_name_en'] = None  
424 - if 'category_name' not in doc:  
425 - doc['category_name'] = category_name  
426 -  
427 - if pd.notna(spu_row.get('category_id')):  
428 - doc['category_id'] = str(int(spu_row['category_id']))  
429 -  
430 - if pd.notna(spu_row.get('category_level')):  
431 - doc['category_level'] = int(spu_row['category_level'])  
432 -  
433 - # Option名称(从option表获取)  
434 - if not options.empty:  
435 - # 按position排序获取option名称  
436 - sorted_options = options.sort_values('position')  
437 - if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')):  
438 - doc['option1_name'] = str(sorted_options.iloc[0]['name'])  
439 - if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')):  
440 - doc['option2_name'] = str(sorted_options.iloc[1]['name'])  
441 - if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')):  
442 - doc['option3_name'] = str(sorted_options.iloc[2]['name'])  
443 -  
444 - # Image URL  
445 - if pd.notna(spu_row.get('image_src')):  
446 - image_src = str(spu_row['image_src'])  
447 - if not image_src.startswith('http'):  
448 - image_src = f"//{image_src}" if image_src.startswith('//') else image_src  
449 - doc['image_url'] = image_src  
450 -  
451 - # Sales (fake_sales)  
452 - if pd.notna(spu_row.get('fake_sales')):  
453 - try:  
454 - doc['sales'] = int(spu_row['fake_sales'])  
455 - except (ValueError, TypeError):  
456 - doc['sales'] = 0  
457 - else:  
458 - doc['sales'] = 0  
459 -  
460 - # Process SKUs and build specifications  
461 - skus_list = []  
462 - prices = []  
463 - compare_prices = []  
464 - sku_prices = []  
465 - sku_weights = []  
466 - sku_weight_units = []  
467 - total_inventory = 0  
468 - specifications = []  
469 -  
470 - # 构建option名称映射(position -> name)  
471 - option_name_map = {}  
472 - if not options.empty:  
473 - for _, opt_row in options.iterrows():  
474 - position = opt_row.get('position')  
475 - name = opt_row.get('name')  
476 - if pd.notna(position) and pd.notna(name):  
477 - option_name_map[int(position)] = str(name)  
478 -  
479 - for _, sku_row in skus.iterrows():  
480 - sku_data = self._transform_sku_row(sku_row, option_name_map)  
481 - if sku_data:  
482 - skus_list.append(sku_data)  
483 -  
484 - # 收集价格信息  
485 - if 'price' in sku_data and sku_data['price'] is not None:  
486 - try:  
487 - price_val = float(sku_data['price'])  
488 - prices.append(price_val)  
489 - sku_prices.append(price_val)  
490 - except (ValueError, TypeError):  
491 - pass  
492 -  
493 - if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None:  
494 - try:  
495 - compare_prices.append(float(sku_data['compare_at_price']))  
496 - except (ValueError, TypeError):  
497 - pass  
498 -  
499 - # 收集重量信息  
500 - if 'weight' in sku_data and sku_data['weight'] is not None:  
501 - try:  
502 - sku_weights.append(int(float(sku_data['weight'])))  
503 - except (ValueError, TypeError):  
504 - pass  
505 -  
506 - if 'weight_unit' in sku_data and sku_data['weight_unit']:  
507 - sku_weight_units.append(str(sku_data['weight_unit']))  
508 -  
509 - # 收集库存信息  
510 - if 'stock' in sku_data and sku_data['stock'] is not None:  
511 - try:  
512 - total_inventory += int(sku_data['stock'])  
513 - except (ValueError, TypeError):  
514 - pass  
515 -  
516 - # 构建specifications(从SKU的option值和option表的name)  
517 - sku_id = str(sku_row['id'])  
518 - if pd.notna(sku_row.get('option1')) and 1 in option_name_map:  
519 - specifications.append({  
520 - 'sku_id': sku_id,  
521 - 'name': option_name_map[1],  
522 - 'value': str(sku_row['option1'])  
523 - })  
524 - if pd.notna(sku_row.get('option2')) and 2 in option_name_map:  
525 - specifications.append({  
526 - 'sku_id': sku_id,  
527 - 'name': option_name_map[2],  
528 - 'value': str(sku_row['option2'])  
529 - })  
530 - if pd.notna(sku_row.get('option3')) and 3 in option_name_map:  
531 - specifications.append({  
532 - 'sku_id': sku_id,  
533 - 'name': option_name_map[3],  
534 - 'value': str(sku_row['option3'])  
535 - })  
536 -  
537 - doc['skus'] = skus_list  
538 - doc['specifications'] = specifications  
539 -  
540 - # 提取option值(根据配置的searchable_option_dimensions)  
541 - # 从子SKU的option1_value, option2_value, option3_value中提取去重后的值  
542 - option1_values = []  
543 - option2_values = []  
544 - option3_values = []  
545 -  
546 - for _, sku_row in skus.iterrows():  
547 - if pd.notna(sku_row.get('option1')):  
548 - option1_values.append(str(sku_row['option1']))  
549 - if pd.notna(sku_row.get('option2')):  
550 - option2_values.append(str(sku_row['option2']))  
551 - if pd.notna(sku_row.get('option3')):  
552 - option3_values.append(str(sku_row['option3']))  
553 -  
554 - # 去重并根据配置决定是否写入索引  
555 - if 'option1' in self.searchable_option_dimensions:  
556 - doc['option1_values'] = list(set(option1_values)) if option1_values else []  
557 - else:  
558 - doc['option1_values'] = []  
559 -  
560 - if 'option2' in self.searchable_option_dimensions:  
561 - doc['option2_values'] = list(set(option2_values)) if option2_values else []  
562 - else:  
563 - doc['option2_values'] = []  
564 -  
565 - if 'option3' in self.searchable_option_dimensions:  
566 - doc['option3_values'] = list(set(option3_values)) if option3_values else []  
567 - else:  
568 - doc['option3_values'] = []  
569 -  
570 - # Calculate price ranges  
571 - if prices:  
572 - doc['min_price'] = float(min(prices))  
573 - doc['max_price'] = float(max(prices))  
574 - else:  
575 - doc['min_price'] = 0.0  
576 - doc['max_price'] = 0.0  
577 -  
578 - if compare_prices:  
579 - doc['compare_at_price'] = float(max(compare_prices))  
580 - else:  
581 - doc['compare_at_price'] = None  
582 -  
583 - # SKU扁平化字段  
584 - doc['sku_prices'] = sku_prices  
585 - doc['sku_weights'] = sku_weights  
586 - doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重  
587 - doc['total_inventory'] = total_inventory  
588 -  
589 - # Image URL  
590 - if pd.notna(spu_row.get('image_src')):  
591 - image_src = str(spu_row['image_src'])  
592 - if not image_src.startswith('http'):  
593 - image_src = f"//{image_src}" if image_src.startswith('//') else image_src  
594 - doc['image_url'] = image_src  
595 -  
596 - # Time fields - convert datetime to ISO format string for ES DATE type  
597 - if pd.notna(spu_row.get('create_time')):  
598 - create_time = spu_row['create_time']  
599 - if hasattr(create_time, 'isoformat'):  
600 - doc['create_time'] = create_time.isoformat()  
601 - else:  
602 - doc['create_time'] = str(create_time)  
603 -  
604 - if pd.notna(spu_row.get('update_time')):  
605 - update_time = spu_row['update_time']  
606 - if hasattr(update_time, 'isoformat'):  
607 - doc['update_time'] = update_time.isoformat()  
608 - else:  
609 - doc['update_time'] = str(update_time)  
610 -  
611 - return doc  
612 -  
613 - def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:  
614 - """  
615 - Transform a SKU row into a SKU object.  
616 -  
617 - Args:  
618 - sku_row: SKU row from database  
619 - option_name_map: Mapping from position to option name  
620 -  
621 - Returns:  
622 - SKU dictionary or None  
623 - """  
624 - sku_data = {}  
625 -  
626 - # SKU ID  
627 - sku_data['sku_id'] = str(sku_row['id'])  
628 -  
629 - # Price  
630 - if pd.notna(sku_row.get('price')):  
631 - try:  
632 - sku_data['price'] = float(sku_row['price'])  
633 - except (ValueError, TypeError):  
634 - sku_data['price'] = None  
635 - else:  
636 - sku_data['price'] = None  
637 -  
638 - # Compare at price  
639 - if pd.notna(sku_row.get('compare_at_price')):  
640 - try:  
641 - sku_data['compare_at_price'] = float(sku_row['compare_at_price'])  
642 - except (ValueError, TypeError):  
643 - sku_data['compare_at_price'] = None  
644 - else:  
645 - sku_data['compare_at_price'] = None  
646 -  
647 - # SKU Code  
648 - if pd.notna(sku_row.get('sku')):  
649 - sku_data['sku_code'] = str(sku_row['sku'])  
650 -  
651 - # Stock  
652 - if pd.notna(sku_row.get('inventory_quantity')):  
653 - try:  
654 - sku_data['stock'] = int(sku_row['inventory_quantity'])  
655 - except (ValueError, TypeError):  
656 - sku_data['stock'] = 0  
657 - else:  
658 - sku_data['stock'] = 0  
659 -  
660 - # Weight  
661 - if pd.notna(sku_row.get('weight')):  
662 - try:  
663 - sku_data['weight'] = float(sku_row['weight'])  
664 - except (ValueError, TypeError):  
665 - sku_data['weight'] = None  
666 - else:  
667 - sku_data['weight'] = None  
668 -  
669 - # Weight unit  
670 - if pd.notna(sku_row.get('weight_unit')):  
671 - sku_data['weight_unit'] = str(sku_row['weight_unit'])  
672 -  
673 - # Option values  
674 - if pd.notna(sku_row.get('option1')):  
675 - sku_data['option1_value'] = str(sku_row['option1'])  
676 - if pd.notna(sku_row.get('option2')):  
677 - sku_data['option2_value'] = str(sku_row['option2'])  
678 - if pd.notna(sku_row.get('option3')):  
679 - sku_data['option3_value'] = str(sku_row['option3'])  
680 -  
681 - # Image src  
682 - if pd.notna(sku_row.get('image_src')):  
683 - sku_data['image_src'] = str(sku_row['image_src'])  
684 -  
685 - return sku_data  
686 264
indexer/test_indexing.py 0 → 100755
@@ -0,0 +1,362 @@ @@ -0,0 +1,362 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +索引功能测试脚本。
  4 +
  5 +测试内容:
  6 +1. 全量索引(SPUTransformer)
  7 +2. 增量索引(IncrementalIndexerService)
  8 +3. 租户配置加载
  9 +4. 翻译功能集成(根据租户配置)
  10 +5. 文档转换器功能
  11 +"""
  12 +
  13 +import sys
  14 +import os
  15 +from pathlib import Path
  16 +
  17 +# Add parent directory to path
  18 +sys.path.insert(0, str(Path(__file__).parent.parent))
  19 +
  20 +from config import ConfigLoader
  21 +from config.tenant_config_loader import get_tenant_config_loader
  22 +from utils.db_connector import create_db_connection
  23 +from indexer.spu_transformer import SPUTransformer
  24 +from indexer.incremental_service import IncrementalIndexerService
  25 +import logging
  26 +
  27 +# Configure logging
  28 +logging.basicConfig(
  29 + level=logging.INFO,
  30 + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  31 +)
  32 +logger = logging.getLogger(__name__)
  33 +
  34 +
def test_tenant_config():
    """Check tenant configuration loading.

    Loads the configuration for tenants "999", "162" and "1", prints each,
    and asserts that tenant 162 has both translation flags off and tenant 1
    has English translation on.

    Returns:
        True when every assertion holds, False when anything raises.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试1: 租户配置加载")
    print(separator)

    try:
        loader = get_tenant_config_loader()

        # Tenant "999" is looked up to show the configuration it resolves to.
        fallback_cfg = loader.get_tenant_config("999")
        print(f"默认配置: {fallback_cfg}")

        # Tenant 162: both translation directions must be disabled.
        cfg_162 = loader.get_tenant_config("162")
        print(f"租户162配置: {cfg_162}")
        for flag in ('translate_to_en', 'translate_to_zh'):
            assert cfg_162[flag] == False, "租户162翻译应该关闭"
        print("✓ 租户162配置正确(翻译关闭)")

        # Tenant 1: English translation must be enabled.
        cfg_1 = loader.get_tenant_config("1")
        print(f"租户1配置: {cfg_1}")
        assert cfg_1['translate_to_en'] == True, "租户1应该启用英文翻译"
        print("✓ 租户1配置正确(翻译开启)")
    except Exception as e:
        print(f"✗ 租户配置测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False
    else:
        return True
  67 +
  68 +
def test_full_indexing(tenant_id: str = "162"):
    """Run the full-index transformation for one tenant and print a sample.

    Database settings are read from DB_HOST, DB_PORT (default 3306),
    DB_DATABASE, DB_USERNAME and DB_PASSWORD.

    Args:
        tenant_id: Tenant whose SPUs are transformed.

    Returns:
        False when the database settings are incomplete or an exception is
        raised; True otherwise (including when there is nothing to transform).
    """
    separator = "=" * 60
    print("\n" + separator)
    print(f"测试2: 全量索引(租户{tenant_id})")
    print(separator)

    # Collect the connection settings; only the port has a default.
    env = os.environ
    db_settings = {
        'host': env.get('DB_HOST'),
        'port': int(env.get('DB_PORT', 3306)),
        'database': env.get('DB_DATABASE'),
        'username': env.get('DB_USERNAME'),
        'password': env.get('DB_PASSWORD'),
    }
    mandatory = ('host', 'database', 'username', 'password')
    if not all(db_settings[name] for name in mandatory):
        print("✗ 跳过:数据库配置不完整")
        print(" 需要环境变量: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD")
        return False

    try:
        engine = create_db_connection(**db_settings)
        print(
            f"✓ 数据库连接成功: "
            f"{db_settings['host']}:{db_settings['port']}/{db_settings['database']}"
        )

        transformer = SPUTransformer(engine, tenant_id)
        print("✓ SPUTransformer初始化成功")

        # transform_batch() is called without arguments; only the first three
        # resulting documents are printed below.
        print(f"\n开始转换数据(租户{tenant_id})...")
        documents = transformer.transform_batch()

        if not documents:
            print("⚠ 没有数据需要转换")
            return True

        print(f"✓ 转换完成: {len(documents)} 个文档")

        for position, doc in enumerate(documents[:3], start=1):
            print(f"\n文档 {position}:")
            print(f" SPU ID: {doc.get('spu_id')}")
            print(f" Tenant ID: {doc.get('tenant_id')}")
            print(f" 标题 (中文): {doc.get('title_zh', 'N/A')}")
            print(f" 标题 (英文): {doc.get('title_en', 'N/A')}")

            # Only tenant 162 is expected to have translation switched off,
            # in which case title_en should be None.
            if tenant_id != "162":
                continue
            if doc.get('title_en') is None:
                print(" ✓ 翻译已关闭(title_en为None)")
            else:
                print(f" ⚠ 警告:翻译应该关闭,但title_en有值: {doc.get('title_en')}")

        return True

    except Exception as e:
        print(f"✗ 全量索引测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False
  135 +
  136 +
def test_incremental_indexing(tenant_id: str = "162"):
    """Fetch one SPU document through IncrementalIndexerService and print it.

    Database settings are read from DB_HOST, DB_PORT (default 3306),
    DB_DATABASE, DB_USERNAME and DB_PASSWORD. One non-deleted SPU id of the
    tenant is selected from shoplazza_product_spu and passed to
    get_spu_document().

    Args:
        tenant_id: Tenant to pick the SPU from.

    Returns:
        False when the database settings are incomplete, the document is
        None, or an exception is raised; True otherwise (including when the
        tenant has no rows).
    """
    separator = "=" * 60
    print("\n" + separator)
    print(f"测试3: 增量索引(租户{tenant_id})")
    print(separator)

    # Collect the connection settings; only the port has a default.
    env = os.environ
    db_settings = {
        'host': env.get('DB_HOST'),
        'port': int(env.get('DB_PORT', 3306)),
        'database': env.get('DB_DATABASE'),
        'username': env.get('DB_USERNAME'),
        'password': env.get('DB_PASSWORD'),
    }
    mandatory = ('host', 'database', 'username', 'password')
    if not all(db_settings[name] for name in mandatory):
        print("✗ 跳过:数据库配置不完整")
        return False

    try:
        engine = create_db_connection(**db_settings)

        service = IncrementalIndexerService(engine)
        print("✓ IncrementalIndexerService初始化成功")

        # Pick any one live SPU id of the tenant to drive the lookup.
        from sqlalchemy import text
        with engine.connect() as conn:
            pick_one_spu = text("""
                SELECT id FROM shoplazza_product_spu
                WHERE tenant_id = :tenant_id AND deleted = 0
                LIMIT 1
            """)
            first_row = conn.execute(pick_one_spu, {"tenant_id": tenant_id}).fetchone()
            if not first_row:
                print(f"⚠ 租户{tenant_id}没有数据,跳过增量测试")
                return True
            spu_id = str(first_row[0])

        print(f"\n测试SPU ID: {spu_id}")

        doc = service.get_spu_document(tenant_id=tenant_id, spu_id=spu_id)
        if doc is None:
            print(f"✗ SPU {spu_id} 文档获取失败")
            return False

        print("✓ SPU文档获取成功")
        print(f" SPU ID: {doc.get('spu_id')}")
        print(f" Tenant ID: {doc.get('tenant_id')}")
        print(f" 标题 (中文): {doc.get('title_zh', 'N/A')}")
        print(f" 标题 (英文): {doc.get('title_en', 'N/A')}")
        print(f" SKU数量: {len(doc.get('skus', []))}")
        print(f" 规格数量: {len(doc.get('specifications', []))}")

        # Only tenant 162 is expected to have translation switched off,
        # in which case title_en should be None.
        if tenant_id == "162":
            english_title = doc.get('title_en')
            if english_title is not None:
                print(f" ⚠ 警告:翻译应该关闭,但title_en有值: {english_title}")
            else:
                print(" ✓ 翻译已关闭(title_en为None)")

        return True

    except Exception as e:
        print(f"✗ 增量索引测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False
  214 +
  215 +
def test_document_transformer():
    """Transform one in-memory SPU with SPUDocumentTransformer and print it.

    Builds a mock SPU row, a single SKU and a single option as pandas
    objects, uses the tenant "162" configuration, and creates a Translator
    only when config.query_config.enable_translation is truthy.

    Returns:
        True when the transformer returns a truthy document; False when it
        returns a falsy one or an exception is raised.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试4: 文档转换器")
    print(separator)

    try:
        import pandas as pd
        from indexer.document_transformer import SPUDocumentTransformer
        from config import ConfigLoader

        config = ConfigLoader().load_config()
        query_cfg = config.query_config

        # Mock input data: one SPU, one SKU, one option definition.
        mock_spu = pd.Series({
            'id': 123,
            'tenant_id': '162',
            'title': '测试商品',
            'brief': '测试简介',
            'description': '测试描述',
            'vendor': '测试品牌',
            'category': '测试类目',
            'category_id': 100,
            'category_level': 1,
            'fake_sales': 1000,
            'image_src': 'https://example.com/image.jpg',
            'tags': '测试,标签',
            'create_time': pd.Timestamp.now(),
            'update_time': pd.Timestamp.now()
        })
        mock_skus = pd.DataFrame([{
            'id': 456,
            'price': 99.99,
            'compare_at_price': 149.99,
            'sku': 'SKU001',
            'inventory_quantity': 100,
            'option1': '黑色',
            'option2': None,
            'option3': None
        }])
        mock_options = pd.DataFrame([{
            'id': 1,
            'position': 1,
            'name': '颜色'
        }])

        tenant_config = get_tenant_config_loader().get_tenant_config('162')

        # A translator is only created when translation is enabled in config.
        translator = None
        if query_cfg.enable_translation:
            from query.translator import Translator
            translator = Translator(
                api_key=query_cfg.translation_api_key,
                use_cache=True
            )

        transformer = SPUDocumentTransformer(
            category_id_to_name={},
            searchable_option_dimensions=['option1', 'option2', 'option3'],
            tenant_config=tenant_config,
            translator=translator,
            translation_prompts=query_cfg.translation_prompts
        )

        doc = transformer.transform_spu_to_doc(
            tenant_id='162',
            spu_row=mock_spu,
            skus=mock_skus,
            options=mock_options
        )

        if not doc:
            print("✗ 文档转换失败")
            return False

        print("✓ 文档转换成功")
        print(f" title_zh: {doc.get('title_zh')}")
        print(f" title_en: {doc.get('title_en')}")
        print(f" SKU数量: {len(doc.get('skus', []))}")

        # With translation off for tenant 162, title_en is expected to be None.
        if doc.get('title_en') is not None:
            print(" ⚠ 警告:翻译应该关闭")
        else:
            print(" ✓ 翻译已关闭(符合租户162配置)")

        return True

    except Exception as e:
        print(f"✗ 文档转换器测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False
  316 +
  317 +
def main():
    """Run the four indexing checks in order and print a summary.

    Returns:
        0 when every check returned a truthy result, 1 otherwise.
    """
    separator = "=" * 60
    print(separator)
    print("索引功能完整测试")
    print(separator)

    # (label, check function, positional arguments), executed in this order.
    suite = (
        ("租户配置加载", test_tenant_config, ()),
        ("全量索引(租户162)", test_full_indexing, ("162",)),
        ("增量索引(租户162)", test_incremental_indexing, ("162",)),
        ("文档转换器", test_document_transformer, ()),
    )
    results = [(label, check(*args)) for label, check, args in suite]

    # Summary
    print("\n" + separator)
    print("测试总结")
    print(separator)

    total = len(results)
    passed = len([label for label, ok in results if ok])

    for label, ok in results:
        verdict = "✓ 通过" if ok else "✗ 失败"
        print(f"{verdict}: {label}")

    print(f"\n总计: {passed}/{total} 通过")

    if passed != total:
        print("✗ 部分测试失败")
        return 1

    print("✓ 所有测试通过")
    return 0


if __name__ == '__main__':
    sys.exit(main())
  362 +
query/query_parser.py
@@ -229,14 +229,19 @@ class QueryParser: @@ -229,14 +229,19 @@ class QueryParser:
229 229
230 if target_langs: 230 if target_langs:
231 # Use e-commerce context for better disambiguation 231 # Use e-commerce context for better disambiguation
232 - translation_context = 'e-commerce product search' 232 + translation_context = self.config.query_config.translation_context
  233 + # For query translation, we use a general prompt (not language-specific)
  234 + # Since translate_multi uses same prompt for all languages, we use default
  235 + query_prompt = self.config.query_config.translation_prompts.get('query_zh') or \
  236 + self.config.query_config.translation_prompts.get('default_zh')
233 # Use async mode: returns cached translations immediately, missing ones translated in background 237 # Use async mode: returns cached translations immediately, missing ones translated in background
234 translations = self.translator.translate_multi( 238 translations = self.translator.translate_multi(
235 query_text, 239 query_text,
236 target_langs, 240 target_langs,
237 source_lang=detected_lang, 241 source_lang=detected_lang,
238 context=translation_context, 242 context=translation_context,
239 - async_mode=True 243 + async_mode=True,
  244 + prompt=query_prompt
240 ) 245 )
241 # Filter out None values (missing translations that are being processed async) 246 # Filter out None values (missing translations that are being processed async)
242 translations = {k: v for k, v in translations.items() if v is not None} 247 translations = {k: v for k, v in translations.items() if v is not None}
query/test_translation.py 0 → 100755
@@ -0,0 +1,294 @@ @@ -0,0 +1,294 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +翻译功能测试脚本。
  4 +
  5 +测试内容:
  6 +1. 翻译提示词配置加载
  7 +2. 同步翻译(索引场景)
  8 +3. 异步翻译(查询场景)
  9 +4. 不同提示词的使用
  10 +5. 缓存功能
  11 +6. DeepL Context参数使用
  12 +"""
  13 +
  14 +import sys
  15 +import os
  16 +from pathlib import Path
  17 +
  18 +# Add parent directory to path
  19 +sys.path.insert(0, str(Path(__file__).parent.parent))
  20 +
  21 +from config import ConfigLoader
  22 +from query.translator import Translator
  23 +import logging
  24 +
  25 +# Configure logging
  26 +logging.basicConfig(
  27 + level=logging.INFO,
  28 + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  29 +)
  30 +logger = logging.getLogger(__name__)
  31 +
  32 +
def test_config_loading():
    """Load the configuration and print the translation settings.

    Prompt texts longer than 60 characters are printed truncated to 60
    characters followed by "...".

    Returns:
        The loaded config object, or None when loading or printing raises.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试1: 配置加载")
    print(separator)

    try:
        config = ConfigLoader().load_config()
        query_cfg = config.query_config

        print("✓ 配置加载成功")
        print(f" 翻译服务: {query_cfg.translation_service}")
        print(" 翻译提示词配置:")
        for key, value in query_cfg.translation_prompts.items():
            if len(value) > 60:
                print(f" {key}: {value[:60]}...")
            else:
                print(f" {key}: {value}")

        return config
    except Exception as e:
        print(f"✗ 配置加载失败: {e}")
        import traceback
        traceback.print_exc()
        return None
  55 +
  56 +
def test_translator_sync(config):
    """Exercise synchronous translation (the indexing scenario).

    Creates a Translator from the query config and translates two sample
    product titles, choosing the prompt by prompt type and target language.

    Args:
        config: Loaded configuration; a falsy value skips the test.

    Returns:
        The Translator instance, or None when skipped or on any exception.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试2: 同步翻译(索引场景)")
    print(separator)

    if not config:
        print("✗ 跳过:配置未加载")
        return None

    try:
        query_cfg = config.query_config
        translator = Translator(
            api_key=query_cfg.translation_api_key,
            use_cache=True,
            glossary_id=query_cfg.translation_glossary_id,
            translation_context=query_cfg.translation_context
        )

        # (text, source language, target language, prompt type)
        samples = [
            ("蓝牙耳机", "zh", "en", "product_title"),
            ("Wireless Headphones", "en", "zh", "product_title"),
        ]

        for text, source_lang, target_lang, prompt_type in samples:
            # Prompt keys are "<family>_<lang>"; any target other than "zh"
            # uses the "_en" prompt, any other prompt type uses "default".
            family = "product_title" if prompt_type == "product_title" else "default"
            lang_suffix = "zh" if target_lang == "zh" else "en"
            prompt = query_cfg.translation_prompts.get(f"{family}_{lang_suffix}")

            print("\n翻译测试:")
            print(f" 原文 ({source_lang}): {text}")
            print(f" 目标语言: {target_lang}")
            print(f" 提示词: {prompt[:50] if prompt else 'None'}...")

            result = translator.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang,
                prompt=prompt
            )

            if not result:
                print(" ⚠ 翻译返回None(可能是mock模式或无API key)")
                continue
            print(f" 结果: {result}")
            print(" ✓ 翻译成功")

        return translator

    except Exception as e:
        print(f"✗ 同步翻译测试失败: {e}")
        import traceback
        traceback.print_exc()
        return None
  118 +
  119 +
def test_translator_async(config, translator):
    """Exercise translate_multi in async mode, then in sync mode.

    Translates the same query with async_mode=True and then async_mode=False,
    printing each returned language/translation pair.

    Args:
        config: Loaded configuration; a falsy value skips the test.
        translator: Translator instance; a falsy value skips the test.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试3: 异步翻译(查询场景)")
    print(separator)

    if not (config and translator):
        print("✗ 跳过:配置或翻译器未初始化")
        return

    try:
        query_text = "手机"
        target_langs = ['en']
        source_lang = 'zh'

        query_prompt = config.query_config.translation_prompts.get('query_zh')

        print(f"查询文本: {query_text}")
        print(f"目标语言: {target_langs}")
        print(f"提示词: {query_prompt}")

        def _translate(async_mode):
            # Both passes share every argument except the mode flag.
            return translator.translate_multi(
                query_text,
                target_langs,
                source_lang=source_lang,
                context=config.query_config.translation_context,
                async_mode=async_mode,
                prompt=query_prompt
            )

        # Async pass: a falsy value is reported as still translating.
        async_results = _translate(True)

        print("\n异步翻译结果:")
        for lang, translation in async_results.items():
            if not translation:
                print(f" {lang}: None (后台翻译中...)")
            else:
                print(f" {lang}: {translation} (缓存命中)")

        # Sync pass.
        print("\n同步翻译(等待完成):")
        sync_results = _translate(False)

        for lang, translation in sync_results.items():
            print(f" {lang}: {translation}")

    except Exception as e:
        print(f"✗ 异步翻译测试失败: {e}")
        import traceback
        traceback.print_exc()
  176 +
  177 +
def test_cache():
    """Translate the same text twice and compare the two results.

    Uses a Translator created with use_cache=True; equal results are
    reported as the cache working, differing results as a possible problem.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试4: 缓存功能")
    print(separator)

    try:
        config = ConfigLoader().load_config()

        translator = Translator(
            api_key=config.query_config.translation_api_key,
            use_cache=True
        )

        sample_text = "测试文本"
        target_lang = "en"
        source_lang = "zh"
        prompt = config.query_config.translation_prompts.get('default_zh')

        # Two identical calls; only equality of the results is checked.
        print("第一次翻译(应该调用API或返回mock):")
        first = translator.translate(sample_text, target_lang, source_lang, prompt=prompt)
        print(f" 结果: {first}")

        print("\n第二次翻译(应该使用缓存):")
        second = translator.translate(sample_text, target_lang, source_lang, prompt=prompt)
        print(f" 结果: {second}")

        if first != second:
            print(" ⚠ 缓存可能有问题")
        else:
            print(" ✓ 缓存功能正常")

    except Exception as e:
        print(f"✗ 缓存测试失败: {e}")
        import traceback
        traceback.print_exc()
  215 +
  216 +
def test_context_parameter():
    """Translate one text with and without a prompt and print both results.

    Uses a Translator created with use_cache=False. The 'query_zh' prompt is
    passed in the first call and None in the second.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试5: DeepL Context参数")
    print(separator)

    try:
        config = ConfigLoader().load_config()

        translator = Translator(
            api_key=config.query_config.translation_api_key,
            use_cache=False  # cache disabled for this test
        )

        text = "手机"
        prompt = config.query_config.translation_prompts.get('query_zh')

        print(f"测试文本: {text}")
        print(f"提示词(作为context): {prompt}")

        # First pass uses the prompt, second pass passes prompt=None.
        variants = (
            ("\n带context翻译结果", prompt),
            ("不带context翻译结果", None),
        )
        for label, variant_prompt in variants:
            outcome = translator.translate(
                text,
                target_lang='en',
                source_lang='zh',
                prompt=variant_prompt
            )
            print(f"{label}: {outcome}")

        print("\n✓ Context参数测试完成")
        print(" 注意:根据DeepL API,context参数影响翻译但不参与翻译本身")

    except Exception as e:
        print(f"✗ Context参数测试失败: {e}")
        import traceback
        traceback.print_exc()
  264 +
  265 +
def main():
    """Run the five translation checks in order.

    The config from check 1 and the translator from check 2 are passed on to
    the later checks that take them.
    """
    separator = "=" * 60
    print(separator)
    print("翻译功能测试")
    print(separator)

    # Checks 1-3 are chained: config -> translator -> async translation.
    config = test_config_loading()
    translator = test_translator_sync(config)
    test_translator_async(config, translator)

    # Checks 4 and 5 load their own config and translator.
    for standalone_check in (test_cache, test_context_parameter):
        standalone_check()

    print("\n" + separator)
    print("测试完成")
    print(separator)


if __name__ == '__main__':
    main()
  294 +
query/translator.py
@@ -2,10 +2,16 @@ @@ -2,10 +2,16 @@
2 Translation service for multi-language query support. 2 Translation service for multi-language query support.
3 3
4 Supports DeepL API for high-quality translations. 4 Supports DeepL API for high-quality translations.
  5 +
  6 +
  7 +#### 官方文档:
  8 +https://developers.deepl.com/api-reference/translate/request-translation
  9 +#####
  10 +
  11 +
5 """ 12 """
6 13
7 import requests 14 import requests
8 -import threading  
9 from concurrent.futures import ThreadPoolExecutor 15 from concurrent.futures import ThreadPoolExecutor
10 from typing import Dict, List, Optional 16 from typing import Dict, List, Optional
11 from utils.cache import DictCache 17 from utils.cache import DictCache
@@ -74,25 +80,24 @@ class Translator: @@ -74,25 +80,24 @@ class Translator:
74 80
75 # Thread pool for async translation 81 # Thread pool for async translation
76 self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") 82 self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")
77 -  
78 - # Thread pool for async translation  
79 - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")  
80 83
81 def translate( 84 def translate(
82 self, 85 self,
83 text: str, 86 text: str,
84 target_lang: str, 87 target_lang: str,
85 source_lang: Optional[str] = None, 88 source_lang: Optional[str] = None,
86 - context: Optional[str] = None 89 + context: Optional[str] = None,
  90 + prompt: Optional[str] = None
87 ) -> Optional[str]: 91 ) -> Optional[str]:
88 """ 92 """
89 - Translate text to target language. 93 + Translate text to target language (synchronous mode).
90 94
91 Args: 95 Args:
92 text: Text to translate 96 text: Text to translate
93 target_lang: Target language code ('zh', 'en', 'ru', etc.) 97 target_lang: Target language code ('zh', 'en', 'ru', etc.)
94 source_lang: Source language code (optional, auto-detect if None) 98 source_lang: Source language code (optional, auto-detect if None)
95 context: Additional context for translation (overrides default context) 99 context: Additional context for translation (overrides default context)
  100 + prompt: Translation prompt/instruction (optional, for better translation quality)
96 101
97 Returns: 102 Returns:
98 Translated text or None if translation fails 103 Translated text or None if translation fails
@@ -107,35 +112,40 @@ class Translator: @@ -107,35 +112,40 @@ class Translator:
107 112
108 # Use provided context or default context 113 # Use provided context or default context
109 translation_context = context or self.translation_context 114 translation_context = context or self.translation_context
110 -  
111 - # Check cache (include context in cache key for accuracy) 115 +
  116 + # Build cache key (include prompt in cache key if provided)
  117 + cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
  118 + if prompt:
  119 + cache_key_parts.append(prompt)
  120 + cache_key_parts.append(text)
  121 + cache_key = ':'.join(cache_key_parts)
  122 +
  123 + # Check cache (include context and prompt in cache key for accuracy)
112 if self.use_cache: 124 if self.use_cache:
113 - cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}"  
114 cached = self.cache.get(cache_key, category="translations") 125 cached = self.cache.get(cache_key, category="translations")
115 if cached: 126 if cached:
116 return cached 127 return cached
117 128
118 # If no API key, return mock translation (for testing) 129 # If no API key, return mock translation (for testing)
119 if not self.api_key: 130 if not self.api_key:
120 - print(f"[Translator] No API key, returning original text (mock mode)") 131 + logger.debug(f"[Translator] No API key, returning original text (mock mode)")
121 return text 132 return text
122 133
123 # Translate using DeepL with fallback 134 # Translate using DeepL with fallback
124 - result = self._translate_deepl(text, target_lang, source_lang, translation_context) 135 + result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
125 136
126 # If translation failed, try fallback to free API 137 # If translation failed, try fallback to free API
127 if result is None and "api.deepl.com" in self.DEEPL_API_URL: 138 if result is None and "api.deepl.com" in self.DEEPL_API_URL:
128 - print(f"[Translator] Pro API failed, trying free API...")  
129 - result = self._translate_deepl_free(text, target_lang, source_lang, translation_context) 139 + logger.debug(f"[Translator] Pro API failed, trying free API...")
  140 + result = self._translate_deepl_free(text, target_lang, source_lang, translation_context, prompt)
130 141
131 # If still failed, return original text with warning 142 # If still failed, return original text with warning
132 if result is None: 143 if result is None:
133 - print(f"[Translator] Translation failed, returning original text") 144 + logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text")
134 result = text 145 result = text
135 146
136 # Cache result 147 # Cache result
137 if result and self.use_cache: 148 if result and self.use_cache:
138 - cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}"  
139 self.cache.set(cache_key, result, category="translations") 149 self.cache.set(cache_key, result, category="translations")
140 150
141 return result 151 return result
@@ -145,7 +155,8 @@ class Translator: @@ -145,7 +155,8 @@ class Translator:
145 text: str, 155 text: str,
146 target_lang: str, 156 target_lang: str,
147 source_lang: Optional[str], 157 source_lang: Optional[str],
148 - context: Optional[str] = None 158 + context: Optional[str] = None,
  159 + prompt: Optional[str] = None
149 ) -> Optional[str]: 160 ) -> Optional[str]:
150 """ 161 """
151 Translate using DeepL API with context and glossary support. 162 Translate using DeepL API with context and glossary support.
@@ -164,10 +175,14 @@ class Translator: @@ -164,10 +175,14 @@ class Translator:
164 "Content-Type": "application/json", 175 "Content-Type": "application/json",
165 } 176 }
166 177
167 - # Build text with context for better disambiguation 178 + # Use prompt as context parameter for DeepL API (not as text prefix)
  179 + # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
  180 + # If prompt is provided, use it as context; otherwise use the default context
  181 + api_context = prompt if prompt else context
  182 +
168 # For e-commerce, add context words to help DeepL understand the domain 183 # For e-commerce, add context words to help DeepL understand the domain
169 # This is especially important for single-word ambiguous terms like "车" (car vs rook) 184 # This is especially important for single-word ambiguous terms like "车" (car vs rook)
170 - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, context) 185 + text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)
171 186
172 payload = { 187 payload = {
173 "text": [text_to_translate], 188 "text": [text_to_translate],
@@ -178,15 +193,18 @@ class Translator: @@ -178,15 +193,18 @@ class Translator:
178 source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) 193 source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
179 payload["source_lang"] = source_code 194 payload["source_lang"] = source_code
180 195
  196 + # Add context parameter (prompt or default context)
  197 + # Context influences translation but is not translated itself
  198 + if api_context:
  199 + payload["context"] = api_context
  200 +
181 # Add glossary if configured 201 # Add glossary if configured
182 if self.glossary_id: 202 if self.glossary_id:
183 payload["glossary_id"] = self.glossary_id 203 payload["glossary_id"] = self.glossary_id
184 204
185 - # Note: DeepL API v2 doesn't have a direct "context" parameter,  
186 - # but we can improve translation by:  
187 - # 1. Using glossary for domain-specific terms (best solution)  
188 - # 2. Adding context words to the text (for single-word queries) - implemented in _add_ecommerce_context  
189 - # 3. Using more specific source language detection 205 + # Note: DeepL API v2 supports "context" parameter for additional context
  206 + # that influences translation but is not translated itself.
  207 + # We use prompt as context parameter when provided.
190 208
191 try: 209 try:
192 response = requests.post( 210 response = requests.post(
@@ -207,14 +225,14 @@ class Translator: @@ -207,14 +225,14 @@ class Translator:
207 ) 225 )
208 return translated_text 226 return translated_text
209 else: 227 else:
210 - print(f"[Translator] DeepL API error: {response.status_code} - {response.text}") 228 + logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}")
211 return None 229 return None
212 230
213 except requests.Timeout: 231 except requests.Timeout:
214 - print(f"[Translator] Translation request timed out") 232 + logger.warning(f"[Translator] Translation request timed out")
215 return None 233 return None
216 except Exception as e: 234 except Exception as e:
217 - print(f"[Translator] Translation failed: {e}") 235 + logger.error(f"[Translator] Translation failed: {e}", exc_info=True)
218 return None 236 return None
219 237
220 def _translate_deepl_free( 238 def _translate_deepl_free(
@@ -222,7 +240,8 @@ class Translator: @@ -222,7 +240,8 @@ class Translator:
222 text: str, 240 text: str,
223 target_lang: str, 241 target_lang: str,
224 source_lang: Optional[str], 242 source_lang: Optional[str],
225 - context: Optional[str] = None 243 + context: Optional[str] = None,
  244 + prompt: Optional[str] = None
226 ) -> Optional[str]: 245 ) -> Optional[str]:
227 """ 246 """
228 Translate using DeepL Free API. 247 Translate using DeepL Free API.
@@ -237,6 +256,9 @@ class Translator: @@ -237,6 +256,9 @@ class Translator:
237 "Content-Type": "application/json", 256 "Content-Type": "application/json",
238 } 257 }
239 258
  259 + # Use prompt as context parameter for DeepL API
  260 + api_context = prompt if prompt else context
  261 +
240 payload = { 262 payload = {
241 "text": [text], 263 "text": [text],
242 "target_lang": target_code, 264 "target_lang": target_code,
@@ -246,6 +268,10 @@ class Translator: @@ -246,6 +268,10 @@ class Translator:
246 source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) 268 source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
247 payload["source_lang"] = source_code 269 payload["source_lang"] = source_code
248 270
  271 + # Add context parameter
  272 + if api_context:
  273 + payload["context"] = api_context
  274 +
249 # Note: Free API typically doesn't support glossary_id 275 # Note: Free API typically doesn't support glossary_id
250 # But we can still use context hints in the text 276 # But we can still use context hints in the text
251 277
@@ -262,14 +288,14 @@ class Translator: @@ -262,14 +288,14 @@ class Translator:
262 if "translations" in data and len(data["translations"]) > 0: 288 if "translations" in data and len(data["translations"]) > 0:
263 return data["translations"][0]["text"] 289 return data["translations"][0]["text"]
264 else: 290 else:
265 - print(f"[Translator] DeepL Free API error: {response.status_code} - {response.text}") 291 + logger.error(f"[Translator] DeepL Free API error: {response.status_code} - {response.text}")
266 return None 292 return None
267 293
268 except requests.Timeout: 294 except requests.Timeout:
269 - print(f"[Translator] Free API request timed out") 295 + logger.warning(f"[Translator] Free API request timed out")
270 return None 296 return None
271 except Exception as e: 297 except Exception as e:
272 - print(f"[Translator] Free API translation failed: {e}") 298 + logger.error(f"[Translator] Free API translation failed: {e}", exc_info=True)
273 return None 299 return None
274 300
275 def translate_multi( 301 def translate_multi(
@@ -278,7 +304,8 @@ class Translator: @@ -278,7 +304,8 @@ class Translator:
278 target_langs: List[str], 304 target_langs: List[str],
279 source_lang: Optional[str] = None, 305 source_lang: Optional[str] = None,
280 context: Optional[str] = None, 306 context: Optional[str] = None,
281 - async_mode: bool = True 307 + async_mode: bool = True,
  308 + prompt: Optional[str] = None
282 ) -> Dict[str, Optional[str]]: 309 ) -> Dict[str, Optional[str]]:
283 """ 310 """
284 Translate text to multiple target languages. 311 Translate text to multiple target languages.
@@ -297,6 +324,7 @@ class Translator: @@ -297,6 +324,7 @@ class Translator:
297 source_lang: Source language code (optional) 324 source_lang: Source language code (optional)
298 context: Context hint for translation (optional) 325 context: Context hint for translation (optional)
299 async_mode: If True, return cached results immediately and translate missing ones async 326 async_mode: If True, return cached results immediately and translate missing ones async
  327 + prompt: Translation prompt/instruction (optional)
300 328
301 Returns: 329 Returns:
302 Dictionary mapping language code to translated text (only cached results in async mode) 330 Dictionary mapping language code to translated text (only cached results in async mode)
@@ -306,7 +334,7 @@ class Translator: @@ -306,7 +334,7 @@ class Translator:
306 334
307 # First, get cached translations 335 # First, get cached translations
308 for lang in target_langs: 336 for lang in target_langs:
309 - cached = self._get_cached_translation(text, lang, source_lang, context) 337 + cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
310 if cached is not None: 338 if cached is not None:
311 results[lang] = cached 339 results[lang] = cached
312 else: 340 else:
@@ -315,14 +343,14 @@ class Translator: @@ -315,14 +343,14 @@ class Translator:
315 # If async mode and there are missing translations, launch async tasks 343 # If async mode and there are missing translations, launch async tasks
316 if async_mode and missing_langs: 344 if async_mode and missing_langs:
317 for lang in missing_langs: 345 for lang in missing_langs:
318 - self._translate_async(text, lang, source_lang, context) 346 + self._translate_async(text, lang, source_lang, context, prompt)
319 # Return None for missing translations 347 # Return None for missing translations
320 for lang in missing_langs: 348 for lang in missing_langs:
321 results[lang] = None 349 results[lang] = None
322 else: 350 else:
323 # Synchronous mode: wait for all translations 351 # Synchronous mode: wait for all translations
324 for lang in missing_langs: 352 for lang in missing_langs:
325 - results[lang] = self.translate(text, lang, source_lang, context) 353 + results[lang] = self.translate(text, lang, source_lang, context, prompt)
326 354
327 return results 355 return results
328 356
@@ -331,14 +359,19 @@ class Translator: @@ -331,14 +359,19 @@ class Translator:
331 text: str, 359 text: str,
332 target_lang: str, 360 target_lang: str,
333 source_lang: Optional[str] = None, 361 source_lang: Optional[str] = None,
334 - context: Optional[str] = None 362 + context: Optional[str] = None,
  363 + prompt: Optional[str] = None
335 ) -> Optional[str]: 364 ) -> Optional[str]:
336 """Get translation from cache if available.""" 365 """Get translation from cache if available."""
337 if not self.cache: 366 if not self.cache:
338 return None 367 return None
339 368
340 translation_context = context or self.translation_context 369 translation_context = context or self.translation_context
341 - cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}" 370 + cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
  371 + if prompt:
  372 + cache_key_parts.append(prompt)
  373 + cache_key_parts.append(text)
  374 + cache_key = ':'.join(cache_key_parts)
342 return self.cache.get(cache_key, category="translations") 375 return self.cache.get(cache_key, category="translations")
343 376
344 def _translate_async( 377 def _translate_async(
@@ -346,12 +379,13 @@ class Translator: @@ -346,12 +379,13 @@ class Translator:
346 text: str, 379 text: str,
347 target_lang: str, 380 target_lang: str,
348 source_lang: Optional[str] = None, 381 source_lang: Optional[str] = None,
349 - context: Optional[str] = None 382 + context: Optional[str] = None,
  383 + prompt: Optional[str] = None
350 ): 384 ):
351 """Launch async translation task.""" 385 """Launch async translation task."""
352 def _do_translate(): 386 def _do_translate():
353 try: 387 try:
354 - result = self.translate(text, target_lang, source_lang, context) 388 + result = self.translate(text, target_lang, source_lang, context, prompt)
355 if result: 389 if result:
356 logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") 390 logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
357 except Exception as e: 391 except Exception as e: