Commit 1f6d15faa32291662f64d4e0e3f50269b8e104e5

Authored by tangwang
1 parent 06e79082

Refactor: SPU-level index, unified index architecture, and API response format cleanup

Main changes:
1. Data-source/application-structure configuration removed. The index is designed specifically for Shoplazza's SPU and SKU tables, and the ingestion flow is hard-coded (this only serves testing; a separate upstream application will later own full plus incremental ingestion). The search system itself focuses on adapting to external search requirements.
   There are currently two ingestion scripts: the earlier one, and a new one that reads the two Shoplazza tables (SPU + SKU) and organizes documents per SPU.
   - Configuration covers only ES search settings, which improves maintainability
   - Added a base configuration (shared Shoplazza configuration)

2. Index structure refactor (SPU level)
   - All customers share the search_products index, isolated via tenant_id
   - Nested variants field supported (array of SKU variants)
   - Added SPUTransformer for SPU data transformation

3. API response format cleanup
   - A dedicated search-result format is defined instead of exposing the raw ES doc structure (_id, _score, fields under _source); see the request sketch after this list
   - Added ProductResult and VariantResult models
   - Added suggestions and related_searches fields (reserved; logic not implemented yet)

4. Data import flow
   - Added Shoplazza ingestion script (ingest_shoplazza.py)
   - The pipeline layer decides the data source; configuration carries no data-source information
   - Added test-data generation and import scripts

5. Documentation updates
   - Updated the design docs to reflect the new architecture
   - Added the BASE_CONFIG_GUIDE.md usage guide
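
As a quick end-to-end check of items 1-3, here is a minimal request sketch (not part of the committed code), assuming the API runs locally at http://localhost:8000 and tenant "1" has already been ingested:

```python
import requests

# tenant_id is now mandatory: either the X-Tenant-ID header or a tenant_id query parameter.
resp = requests.post(
    "http://localhost:8000/search/",
    headers={"X-Tenant-ID": "1"},
    json={"query": "耳机", "size": 10, "from": 0},
    timeout=10,
)
resp.raise_for_status()
body = resp.json()

# The response uses the agreed external format rather than raw ES hits:
# results[*] carry product_id / title / variants / relevance_score, and
# suggestions / related_searches are present but currently empty placeholders.
for product in body["results"]:
    print(product["product_id"], product["title"], product["relevance_score"])
print("total:", body["total"], "took_ms:", body["took_ms"])
```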
api/models.py
@@ -172,11 +172,40 @@ class FacetResult(BaseModel):
172 total_count: Optional[int] = Field(None, description="该字段的总文档数") 172 total_count: Optional[int] = Field(None, description="该字段的总文档数")
173 173
174 174
  175 +class VariantResult(BaseModel):
  176 + """商品变体结果"""
  177 + variant_id: str = Field(..., description="变体ID")
  178 + title: Optional[str] = Field(None, description="变体标题")
  179 + price: Optional[float] = Field(None, description="价格")
  180 + compare_at_price: Optional[float] = Field(None, description="原价")
  181 + sku: Optional[str] = Field(None, description="SKU编码")
  182 + stock: int = Field(0, description="库存数量")
  183 + options: Optional[Dict[str, Any]] = Field(None, description="选项(颜色、尺寸等)")
  184 +
  185 +
  186 +class ProductResult(BaseModel):
  187 + """商品搜索结果"""
  188 + product_id: str = Field(..., description="商品ID")
  189 + title: Optional[str] = Field(None, description="商品标题")
  190 + handle: Optional[str] = Field(None, description="商品handle")
  191 + description: Optional[str] = Field(None, description="商品描述")
  192 + vendor: Optional[str] = Field(None, description="供应商/品牌")
  193 + product_type: Optional[str] = Field(None, description="商品类型")
  194 + tags: Optional[str] = Field(None, description="标签")
  195 + price: Optional[float] = Field(None, description="价格(min_price)")
  196 + compare_at_price: Optional[float] = Field(None, description="原价")
  197 + currency: str = Field("USD", description="货币单位")
  198 + image_url: Optional[str] = Field(None, description="主图URL")
  199 + in_stock: bool = Field(True, description="是否有库存")
  200 + variants: List[VariantResult] = Field(default_factory=list, description="变体列表")
  201 + relevance_score: float = Field(..., ge=0.0, le=1.0, description="相关性分数(0-1)")
  202 +
  203 +
175 class SearchResponse(BaseModel): 204 class SearchResponse(BaseModel):
176 - """搜索响应模型(重构版)""" 205 + """搜索响应模型(外部友好格式)"""
177 206
178 # 核心结果 207 # 核心结果
179 - hits: List[Dict[str, Any]] = Field(..., description="搜索结果列表") 208 + results: List[ProductResult] = Field(..., description="搜索结果列表")
180 total: int = Field(..., description="匹配的总文档数") 209 total: int = Field(..., description="匹配的总文档数")
181 max_score: float = Field(..., description="最高相关性分数") 210 max_score: float = Field(..., description="最高相关性分数")
182 211
@@ -192,8 +221,9 @@ class SearchResponse(BaseModel):
192 description="查询处理信息(原始查询、改写、语言检测、翻译等)" 221 description="查询处理信息(原始查询、改写、语言检测、翻译等)"
193 ) 222 )
194 223
195 - # 推荐与建议(预留)  
196 - related_queries: Optional[List[str]] = Field(None, description="相关搜索查询") 224 + # 推荐与建议
  225 + suggestions: List[str] = Field(default_factory=list, description="搜索建议")
  226 + related_searches: List[str] = Field(default_factory=list, description="相关搜索")
197 227
198 # 性能指标 228 # 性能指标
199 took_ms: int = Field(..., description="搜索总耗时(毫秒)") 229 took_ms: int = Field(..., description="搜索总耗时(毫秒)")
api/result_formatter.py 0 → 100644
@@ -0,0 +1,177 @@
  1 +"""
  2 +Result formatter for converting ES internal format to external-friendly format.
  3 +"""
  4 +
  5 +from typing import List, Dict, Any, Optional
  6 +from .models import ProductResult, VariantResult, FacetResult, FacetValue
  7 +
  8 +
  9 +class ResultFormatter:
  10 + """Formats ES search results to external-friendly format."""
  11 +
  12 + @staticmethod
  13 + def format_search_results(
  14 + es_hits: List[Dict[str, Any]],
  15 + max_score: float = 1.0
  16 + ) -> List[ProductResult]:
  17 + """
  18 + Convert ES hits to ProductResult list.
  19 +
  20 + Args:
  21 + es_hits: List of ES hit dictionaries (with _id, _score, _source)
  22 + max_score: Maximum score for normalization
  23 +
  24 + Returns:
  25 + List of ProductResult objects
  26 + """
  27 + results = []
  28 +
  29 + for hit in es_hits:
  30 + source = hit.get('_source', {})
  31 + score = hit.get('_score', 0.0)
  32 +
  33 + # Normalize relevance score to 0-1
  34 + if max_score > 0:
  35 + relevance_score = min(score / max_score, 1.0)
  36 + else:
  37 + relevance_score = 0.0
  38 +
  39 + # Extract variants
  40 + variants = []
  41 + variants_data = source.get('variants', [])
  42 + if isinstance(variants_data, list):
  43 + for variant_data in variants_data:
  44 + variant = VariantResult(
  45 + variant_id=str(variant_data.get('variant_id', '')),
  46 + title=variant_data.get('title'),
  47 + price=variant_data.get('price'),
  48 + compare_at_price=variant_data.get('compare_at_price'),
  49 + sku=variant_data.get('sku'),
  50 + stock=variant_data.get('stock', 0),
  51 + options=variant_data.get('options')
  52 + )
  53 + variants.append(variant)
  54 +
  55 + # Determine in_stock (any variant has stock > 0)
  56 + in_stock = any(v.stock > 0 for v in variants) if variants else True
  57 +
  58 + # Build ProductResult
  59 + product = ProductResult(
  60 + product_id=str(source.get('product_id', '')),
  61 + title=source.get('title'),
  62 + handle=source.get('handle'),
  63 + description=source.get('description'),
  64 + vendor=source.get('vendor'),
  65 + product_type=source.get('product_type'),
  66 + tags=source.get('tags'),
  67 + price=source.get('min_price'),
  68 + compare_at_price=source.get('compare_at_price'),
  69 + currency="USD", # Default currency
  70 + image_url=source.get('image_url'),
  71 + in_stock=in_stock,
  72 + variants=variants,
  73 + relevance_score=relevance_score
  74 + )
  75 +
  76 + results.append(product)
  77 +
  78 + return results
  79 +
  80 + @staticmethod
  81 + def format_facets(
  82 + es_aggregations: Dict[str, Any],
  83 + facet_configs: Optional[List[Any]] = None
  84 + ) -> List[FacetResult]:
  85 + """
  86 + Format ES aggregations to FacetResult list.
  87 +
  88 + Args:
  89 + es_aggregations: ES aggregations response
  90 + facet_configs: Facet configurations (optional)
  91 +
  92 + Returns:
  93 + List of FacetResult objects
  94 + """
  95 + facets = []
  96 +
  97 + for field_name, agg_data in es_aggregations.items():
  98 + # Handle terms aggregation
  99 + if 'buckets' in agg_data:
  100 + values = []
  101 + for bucket in agg_data['buckets']:
  102 + value = FacetValue(
  103 + value=bucket['key'],
  104 + label=bucket.get('key_as_string', str(bucket['key'])),
  105 + count=bucket['doc_count'],
  106 + selected=False
  107 + )
  108 + values.append(value)
  109 +
  110 + facet = FacetResult(
  111 + field=field_name,
  112 + label=field_name, # Can be enhanced with field labels
  113 + type="terms",
  114 + values=values,
  115 + total_count=agg_data.get('sum_other_doc_count', 0) + len(values)
  116 + )
  117 + facets.append(facet)
  118 +
  119 + # Handle range aggregation
  120 + elif 'buckets' in agg_data and any('from' in b or 'to' in b for b in agg_data['buckets']):
  121 + values = []
  122 + for bucket in agg_data['buckets']:
  123 + range_key = bucket.get('key', '')
  124 + value = FacetValue(
  125 + value=range_key,
  126 + label=range_key,
  127 + count=bucket['doc_count'],
  128 + selected=False
  129 + )
  130 + values.append(value)
  131 +
  132 + facet = FacetResult(
  133 + field=field_name,
  134 + label=field_name,
  135 + type="range",
  136 + values=values
  137 + )
  138 + facets.append(facet)
  139 +
  140 + return facets
  141 +
  142 + @staticmethod
  143 + def generate_suggestions(
  144 + query: str,
  145 + results: List[ProductResult]
  146 + ) -> List[str]:
  147 + """
  148 + Generate search suggestions.
  149 +
  150 + Args:
  151 + query: Original search query
  152 + results: Search results
  153 +
  154 + Returns:
  155 + List of suggestion strings (currently returns empty list)
  156 + """
  157 + # TODO: Implement suggestion generation logic
  158 + return []
  159 +
  160 + @staticmethod
  161 + def generate_related_searches(
  162 + query: str,
  163 + results: List[ProductResult]
  164 + ) -> List[str]:
  165 + """
  166 + Generate related searches.
  167 +
  168 + Args:
  169 + query: Original search query
  170 + results: Search results
  171 +
  172 + Returns:
  173 + List of related search strings (currently returns empty list)
  174 + """
  175 + # TODO: Implement related search generation logic
  176 + return []
  177 +
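
As a small illustration of the formatter above, a hand-built ES hit pushed through format_search_results (the hit itself is invented for the example):

```python
from api.result_formatter import ResultFormatter

# One ES hit in the internal shape the formatter expects (_id, _score, _source).
es_hits = [{
    "_id": "1",
    "_score": 7.5,
    "_source": {
        "product_id": "1",
        "title": "蓝牙耳机 Sony",
        "min_price": 199.99,
        "variants": [
            {"variant_id": "1", "title": "黑色", "price": 199.99, "sku": "SKU-1-1", "stock": 50},
        ],
    },
}]

products = ResultFormatter.format_search_results(es_hits, max_score=7.5)

product = products[0]
print(product.product_id, product.price, product.relevance_score)  # "1" 199.99 1.0 (score / max_score)
print(product.in_stock)                                            # True: at least one variant has stock > 0
```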
api/routes/search.py
@@ -33,7 +33,7 @@ def extract_request_info(request: Request) -> tuple[str, str]:
33 @router.post("/", response_model=SearchResponse) 33 @router.post("/", response_model=SearchResponse)
34 async def search(request: SearchRequest, http_request: Request): 34 async def search(request: SearchRequest, http_request: Request):
35 """ 35 """
36 - Execute text search query (重构版). 36 + Execute text search query (外部友好格式).
37 37
38 Supports: 38 Supports:
39 - Multi-language query processing 39 - Multi-language query processing
@@ -42,9 +42,27 @@ async def search(request: SearchRequest, http_request: Request):
42 - Custom ranking functions 42 - Custom ranking functions
43 - Exact match filters and range filters 43 - Exact match filters and range filters
44 - Faceted search 44 - Faceted search
  45 +
  46 + Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
45 """ 47 """
46 reqid, uid = extract_request_info(http_request) 48 reqid, uid = extract_request_info(http_request)
47 49
  50 + # Extract tenant_id (required)
  51 + tenant_id = http_request.headers.get('X-Tenant-ID')
  52 + if not tenant_id:
  53 + # Try to get from query string
  54 + from urllib.parse import parse_qs
  55 + query_string = http_request.url.query
  56 + if query_string:
  57 + params = parse_qs(query_string)
  58 + tenant_id = params.get('tenant_id', [None])[0]
  59 +
  60 + if not tenant_id:
  61 + raise HTTPException(
  62 + status_code=400,
  63 + detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
  64 + )
  65 +
48 # Create request context 66 # Create request context
49 context = create_request_context(reqid=reqid, uid=uid) 67 context = create_request_context(reqid=reqid, uid=uid)
50 68
@@ -54,7 +72,7 @@ async def search(request: SearchRequest, http_request: Request):
54 try: 72 try:
55 # Log request start 73 # Log request start
56 context.logger.info( 74 context.logger.info(
57 - f"收到搜索请求 | IP: {http_request.client.host if http_request.client else 'unknown'} | " 75 + f"收到搜索请求 | Tenant: {tenant_id} | IP: {http_request.client.host if http_request.client else 'unknown'} | "
58 f"用户代理: {http_request.headers.get('User-Agent', 'unknown')[:100]}", 76 f"用户代理: {http_request.headers.get('User-Agent', 'unknown')[:100]}",
59 extra={'reqid': context.reqid, 'uid': context.uid} 77 extra={'reqid': context.reqid, 'uid': context.uid}
60 ) 78 )
@@ -66,6 +84,7 @@ async def search(request: SearchRequest, http_request: Request):
66 # Execute search with context (using backend defaults from config) 84 # Execute search with context (using backend defaults from config)
67 result = searcher.search( 85 result = searcher.search(
68 query=request.query, 86 query=request.query,
  87 + tenant_id=tenant_id,
69 size=request.size, 88 size=request.size,
70 from_=request.from_, 89 from_=request.from_,
71 filters=request.filters, 90 filters=request.filters,
@@ -83,12 +102,14 @@ async def search(request: SearchRequest, http_request: Request):
83 102
84 # Convert to response model 103 # Convert to response model
85 return SearchResponse( 104 return SearchResponse(
86 - hits=result.hits, 105 + results=result.results,
87 total=result.total, 106 total=result.total,
88 max_score=result.max_score, 107 max_score=result.max_score,
89 took_ms=result.took_ms, 108 took_ms=result.took_ms,
90 facets=result.facets, 109 facets=result.facets,
91 query_info=result.query_info, 110 query_info=result.query_info,
  111 + suggestions=result.suggestions,
  112 + related_searches=result.related_searches,
92 performance_info=performance_summary, 113 performance_info=performance_summary,
93 debug_info=result.debug_info 114 debug_info=result.debug_info
94 ) 115 )
@@ -110,13 +131,30 @@ async def search(request: SearchRequest, http_request: Request):
110 @router.post("/image", response_model=SearchResponse) 131 @router.post("/image", response_model=SearchResponse)
111 async def search_by_image(request: ImageSearchRequest, http_request: Request): 132 async def search_by_image(request: ImageSearchRequest, http_request: Request):
112 """ 133 """
113 - Search by image similarity (重构版). 134 + Search by image similarity (外部友好格式).
114 135
115 Uses image embeddings to find visually similar products. 136 Uses image embeddings to find visually similar products.
116 Supports exact match filters and range filters. 137 Supports exact match filters and range filters.
  138 +
  139 + Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
117 """ 140 """
118 reqid, uid = extract_request_info(http_request) 141 reqid, uid = extract_request_info(http_request)
119 142
  143 + # Extract tenant_id (required)
  144 + tenant_id = http_request.headers.get('X-Tenant-ID')
  145 + if not tenant_id:
  146 + from urllib.parse import parse_qs
  147 + query_string = http_request.url.query
  148 + if query_string:
  149 + params = parse_qs(query_string)
  150 + tenant_id = params.get('tenant_id', [None])[0]
  151 +
  152 + if not tenant_id:
  153 + raise HTTPException(
  154 + status_code=400,
  155 + detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
  156 + )
  157 +
120 # Create request context 158 # Create request context
121 context = create_request_context(reqid=reqid, uid=uid) 159 context = create_request_context(reqid=reqid, uid=uid)
122 160
@@ -126,7 +164,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
126 try: 164 try:
127 # Log request start 165 # Log request start
128 context.logger.info( 166 context.logger.info(
129 - f"收到图片搜索请求 | 图片URL: {request.image_url} | " 167 + f"收到图片搜索请求 | Tenant: {tenant_id} | 图片URL: {request.image_url} | "
130 f"IP: {http_request.client.host if http_request.client else 'unknown'}", 168 f"IP: {http_request.client.host if http_request.client else 'unknown'}",
131 extra={'reqid': context.reqid, 'uid': context.uid} 169 extra={'reqid': context.reqid, 'uid': context.uid}
132 ) 170 )
@@ -137,6 +175,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
137 # Execute image search 175 # Execute image search
138 result = searcher.search_by_image( 176 result = searcher.search_by_image(
139 image_url=request.image_url, 177 image_url=request.image_url,
  178 + tenant_id=tenant_id,
140 size=request.size, 179 size=request.size,
141 filters=request.filters, 180 filters=request.filters,
142 range_filters=request.range_filters 181 range_filters=request.range_filters
@@ -146,12 +185,14 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
146 performance_summary = context.get_summary() if context else None 185 performance_summary = context.get_summary() if context else None
147 186
148 return SearchResponse( 187 return SearchResponse(
149 - hits=result.hits, 188 + results=result.results,
150 total=result.total, 189 total=result.total,
151 max_score=result.max_score, 190 max_score=result.max_score,
152 took_ms=result.took_ms, 191 took_ms=result.took_ms,
153 facets=result.facets, 192 facets=result.facets,
154 query_info=result.query_info, 193 query_info=result.query_info,
  194 + suggestions=result.suggestions,
  195 + related_searches=result.related_searches,
155 performance_info=performance_summary 196 performance_info=performance_summary
156 ) 197 )
157 198
@@ -226,7 +267,8 @@ async def search_suggestions(
226 @router.get("/instant", response_model=SearchResponse) 267 @router.get("/instant", response_model=SearchResponse)
227 async def instant_search( 268 async def instant_search(
228 q: str = Query(..., min_length=2, description="搜索查询"), 269 q: str = Query(..., min_length=2, description="搜索查询"),
229 - size: int = Query(5, ge=1, le=20, description="结果数量") 270 + size: int = Query(5, ge=1, le=20, description="结果数量"),
  271 + tenant_id: str = Query(..., description="租户ID")
230 ): 272 ):
231 """ 273 """
232 即时搜索(Instant Search)。 274 即时搜索(Instant Search)。
@@ -246,17 +288,20 @@ async def instant_search(
246 288
247 result = searcher.search( 289 result = searcher.search(
248 query=q, 290 query=q,
  291 + tenant_id=tenant_id,
249 size=size, 292 size=size,
250 from_=0 293 from_=0
251 ) 294 )
252 295
253 return SearchResponse( 296 return SearchResponse(
254 - hits=result.hits, 297 + results=result.results,
255 total=result.total, 298 total=result.total,
256 max_score=result.max_score, 299 max_score=result.max_score,
257 took_ms=result.took_ms, 300 took_ms=result.took_ms,
258 facets=result.facets, 301 facets=result.facets,
259 - query_info=result.query_info 302 + query_info=result.query_info,
  303 + suggestions=result.suggestions,
  304 + related_searches=result.related_searches
260 ) 305 )
261 306
262 307
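
For the GET endpoints the tenant travels as a query parameter instead of a header; a minimal sketch against /search/instant (assuming the default local deployment at http://localhost:8000):

```python
import requests

# q must be at least 2 characters, size is clamped to 1-20, and tenant_id is now required.
resp = requests.get(
    "http://localhost:8000/search/instant",
    params={"q": "耳机", "size": 5, "tenant_id": "1"},
    timeout=10,
)
resp.raise_for_status()
print([p["title"] for p in resp.json()["results"]])
```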
config/config_loader.py
@@ -90,9 +90,6 @@ class CustomerConfig:
90 customer_id: str 90 customer_id: str
91 customer_name: str 91 customer_name: str
92 92
93 - # Database settings  
94 - mysql_config: Dict[str, Any]  
95 -  
96 # Field definitions 93 # Field definitions
97 fields: List[FieldConfig] 94 fields: List[FieldConfig]
98 95
@@ -116,10 +113,6 @@ class CustomerConfig:
116 113
117 # ES index settings 114 # ES index settings
118 es_index_name: str 115 es_index_name: str
119 -  
120 - # Optional fields with defaults  
121 - main_table: str = "shoplazza_product_sku"  
122 - extension_table: Optional[str] = None  
123 es_settings: Dict[str, Any] = field(default_factory=dict) 116 es_settings: Dict[str, Any] = field(default_factory=dict)
124 117
125 118
@@ -272,9 +265,6 @@ class ConfigLoader:
272 return CustomerConfig( 265 return CustomerConfig(
273 customer_id=customer_id, 266 customer_id=customer_id,
274 customer_name=config_data.get("customer_name", customer_id), 267 customer_name=config_data.get("customer_name", customer_id),
275 - mysql_config=config_data.get("mysql_config", {}),  
276 - main_table=config_data.get("main_table", "shoplazza_product_sku"),  
277 - extension_table=config_data.get("extension_table"),  
278 fields=fields, 268 fields=fields,
279 indexes=indexes, 269 indexes=indexes,
280 query_config=query_config, 270 query_config=query_config,
@@ -310,8 +300,6 @@ class ConfigLoader:
310 return FieldConfig( 300 return FieldConfig(
311 name=name, 301 name=name,
312 field_type=field_type, 302 field_type=field_type,
313 - source_table=field_data.get("source_table"),  
314 - source_column=field_data.get("source_column", name),  
315 analyzer=analyzer, 303 analyzer=analyzer,
316 search_analyzer=search_analyzer, 304 search_analyzer=search_analyzer,
317 required=field_data.get("required", False), 305 required=field_data.get("required", False),
@@ -426,15 +414,17 @@ class ConfigLoader:
426 if field.embedding_similarity not in ["dot_product", "cosine", "l2_norm"]: 414 if field.embedding_similarity not in ["dot_product", "cosine", "l2_norm"]:
427 errors.append(f"Field '{field.name}': invalid embedding_similarity") 415 errors.append(f"Field '{field.name}': invalid embedding_similarity")
428 416
429 - # Validate MySQL config  
430 - if "host" not in config.mysql_config:  
431 - errors.append("MySQL configuration missing 'host'")  
432 - if "username" not in config.mysql_config:  
433 - errors.append("MySQL configuration missing 'username'")  
434 - if "password" not in config.mysql_config:  
435 - errors.append("MySQL configuration missing 'password'")  
436 - if "database" not in config.mysql_config:  
437 - errors.append("MySQL configuration missing 'database'") 417 + # Validate tenant_id field (required)
  418 + tenant_id_field = None
  419 + for field in config.fields:
  420 + if field.name == "tenant_id":
  421 + tenant_id_field = field
  422 + break
  423 +
  424 + if not tenant_id_field:
  425 + errors.append("Required field 'tenant_id' not found in fields")
  426 + elif not tenant_id_field.required:
  427 + errors.append("Field 'tenant_id' must be marked as required")
438 428
439 return errors 429 return errors
440 430
@@ -457,9 +447,6 @@ class ConfigLoader:
457 # Convert config back to dictionary format 447 # Convert config back to dictionary format
458 config_dict = { 448 config_dict = {
459 "customer_name": config.customer_name, 449 "customer_name": config.customer_name,
460 - "mysql_config": config.mysql_config,  
461 - "main_table": config.main_table,  
462 - "extension_table": config.extension_table,  
463 "es_index_name": config.es_index_name, 450 "es_index_name": config.es_index_name,
464 "es_settings": config.es_settings, 451 "es_settings": config.es_settings,
465 "fields": [self._field_to_dict(field) for field in config.fields], 452 "fields": [self._field_to_dict(field) for field in config.fields],
@@ -478,6 +465,16 @@ class ConfigLoader:
478 "expression": config.ranking.expression, 465 "expression": config.ranking.expression,
479 "description": config.ranking.description 466 "description": config.ranking.description
480 }, 467 },
  468 + "function_score": {
  469 + "score_mode": config.function_score.score_mode,
  470 + "boost_mode": config.function_score.boost_mode,
  471 + "functions": config.function_score.functions
  472 + },
  473 + "rerank": {
  474 + "enabled": config.rerank.enabled,
  475 + "expression": config.rerank.expression,
  476 + "description": config.rerank.description
  477 + },
481 "spu_config": { 478 "spu_config": {
482 "enabled": config.spu_config.enabled, 479 "enabled": config.spu_config.enabled,
483 "spu_field": config.spu_config.spu_field, 480 "spu_field": config.spu_config.spu_field,
@@ -512,8 +509,6 @@ class ConfigLoader:
512 result = { 509 result = {
513 "name": field.name, 510 "name": field.name,
514 "type": field.field_type.value, 511 "type": field.field_type.value,
515 - "source_table": field.source_table,  
516 - "source_column": field.source_column,  
517 "required": field.required, 512 "required": field.required,
518 "boost": field.boost, 513 "boost": field.boost,
519 "store": field.store, 514 "store": field.store,
config/field_types.py
@@ -54,8 +54,6 @@ class FieldConfig:
54 """Configuration for a single field.""" 54 """Configuration for a single field."""
55 name: str 55 name: str
56 field_type: FieldType 56 field_type: FieldType
57 - source_table: Optional[str] = None # 'main' or 'extension' or specific table name  
58 - source_column: Optional[str] = None  
59 analyzer: Optional[AnalyzerType] = None 57 analyzer: Optional[AnalyzerType] = None
60 search_analyzer: Optional[AnalyzerType] = None 58 search_analyzer: Optional[AnalyzerType] = None
61 required: bool = False 59 required: bool = False
@@ -172,10 +170,34 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
172 } 170 }
173 171
174 elif field_config.field_type == FieldType.JSON: 172 elif field_config.field_type == FieldType.JSON:
175 - mapping = {  
176 - "type": "object",  
177 - "enabled": True  
178 - } 173 + if field_config.nested and field_config.nested_properties:
  174 + # Nested type with properties (e.g., variants)
  175 + mapping = {
  176 + "type": "nested",
  177 + "properties": {}
  178 + }
  179 + # Generate mappings for nested properties
  180 + for prop_name, prop_config in field_config.nested_properties.items():
  181 + prop_type = prop_config.get("type", "keyword")
  182 + prop_mapping = {"type": prop_type}
  183 +
  184 + # Add analyzer for text fields
  185 + if prop_type == "text" and "analyzer" in prop_config:
  186 + prop_mapping["analyzer"] = prop_config["analyzer"]
  187 +
  188 + # Add other properties
  189 + if "index" in prop_config:
  190 + prop_mapping["index"] = prop_config["index"]
  191 + if "store" in prop_config:
  192 + prop_mapping["store"] = prop_config["store"]
  193 +
  194 + mapping["properties"][prop_name] = prop_mapping
  195 + else:
  196 + # Simple object type
  197 + mapping = {
  198 + "type": "object",
  199 + "enabled": True
  200 + }
179 201
180 return mapping 202 return mapping
181 203
config/schema/base/config.yaml
@@ -0,0 +1,271 @@
  1 +# Base Configuration for Shoplazza
  2 +# 店匠通用配置文件,所有使用店匠表的客户共用
  3 +# 注意:此配置不包含MySQL相关配置,只包含ES搜索相关配置
  4 +
  5 +customer_name: "Shoplazza Base Configuration"
  6 +
  7 +# Elasticsearch Index
  8 +es_index_name: "search_products"
  9 +
  10 +# ES Index Settings
  11 +es_settings:
  12 + number_of_shards: 1
  13 + number_of_replicas: 0
  14 + refresh_interval: "30s"
  15 +
  16 +# Field Definitions (SPU级别,只包含对搜索有帮助的字段)
  17 +fields:
  18 + # 租户隔离字段(必需)
  19 + - name: "tenant_id"
  20 + type: "KEYWORD"
  21 + required: true
  22 + index: true
  23 + store: true
  24 +
  25 + # 商品标识字段
  26 + - name: "product_id"
  27 + type: "KEYWORD"
  28 + required: true
  29 + index: true
  30 + store: true
  31 +
  32 + - name: "handle"
  33 + type: "KEYWORD"
  34 + index: true
  35 + store: true
  36 +
  37 + # 文本搜索字段
  38 + - name: "title"
  39 + type: "TEXT"
  40 + analyzer: "chinese_ecommerce"
  41 + boost: 3.0
  42 + index: true
  43 + store: true
  44 +
  45 + - name: "brief"
  46 + type: "TEXT"
  47 + analyzer: "chinese_ecommerce"
  48 + boost: 1.5
  49 + index: true
  50 + store: true
  51 +
  52 + - name: "description"
  53 + type: "TEXT"
  54 + analyzer: "chinese_ecommerce"
  55 + boost: 1.0
  56 + index: true
  57 + store: true
  58 +
  59 + # SEO字段(提升相关性)
  60 + - name: "seo_title"
  61 + type: "TEXT"
  62 + analyzer: "chinese_ecommerce"
  63 + boost: 2.0
  64 + index: true
  65 + store: true
  66 +
  67 + - name: "seo_description"
  68 + type: "TEXT"
  69 + analyzer: "chinese_ecommerce"
  70 + boost: 1.5
  71 + index: true
  72 + store: true
  73 +
  74 + - name: "seo_keywords"
  75 + type: "TEXT"
  76 + analyzer: "chinese_ecommerce"
  77 + boost: 2.0
  78 + index: true
  79 + store: true
  80 +
  81 + # 分类和标签字段(TEXT + KEYWORD双重索引)
  82 + - name: "vendor"
  83 + type: "TEXT"
  84 + analyzer: "chinese_ecommerce"
  85 + boost: 1.5
  86 + index: true
  87 + store: true
  88 +
  89 + - name: "vendor_keyword"
  90 + type: "KEYWORD"
  91 + index: true
  92 + store: false
  93 +
  94 + - name: "product_type"
  95 + type: "TEXT"
  96 + analyzer: "chinese_ecommerce"
  97 + boost: 1.5
  98 + index: true
  99 + store: true
  100 +
  101 + - name: "product_type_keyword"
  102 + type: "KEYWORD"
  103 + index: true
  104 + store: false
  105 +
  106 + - name: "tags"
  107 + type: "TEXT"
  108 + analyzer: "chinese_ecommerce"
  109 + boost: 1.0
  110 + index: true
  111 + store: true
  112 +
  113 + - name: "tags_keyword"
  114 + type: "KEYWORD"
  115 + index: true
  116 + store: false
  117 +
  118 + - name: "category"
  119 + type: "TEXT"
  120 + analyzer: "chinese_ecommerce"
  121 + boost: 1.5
  122 + index: true
  123 + store: true
  124 +
  125 + - name: "category_keyword"
  126 + type: "KEYWORD"
  127 + index: true
  128 + store: false
  129 +
  130 + # 价格字段(扁平化)
  131 + - name: "min_price"
  132 + type: "FLOAT"
  133 + index: true
  134 + store: true
  135 +
  136 + - name: "max_price"
  137 + type: "FLOAT"
  138 + index: true
  139 + store: true
  140 +
  141 + - name: "compare_at_price"
  142 + type: "FLOAT"
  143 + index: true
  144 + store: true
  145 +
  146 + # 图片字段(用于显示,不参与搜索)
  147 + - name: "image_url"
  148 + type: "KEYWORD"
  149 + index: false
  150 + store: true
  151 +
  152 + # 嵌套variants字段
  153 + - name: "variants"
  154 + type: "JSON"
  155 + nested: true
  156 + nested_properties:
  157 + variant_id:
  158 + type: "keyword"
  159 + index: true
  160 + store: true
  161 + title:
  162 + type: "text"
  163 + analyzer: "chinese_ecommerce"
  164 + index: true
  165 + store: true
  166 + price:
  167 + type: "float"
  168 + index: true
  169 + store: true
  170 + compare_at_price:
  171 + type: "float"
  172 + index: true
  173 + store: true
  174 + sku:
  175 + type: "keyword"
  176 + index: true
  177 + store: true
  178 + stock:
  179 + type: "long"
  180 + index: true
  181 + store: true
  182 + options:
  183 + type: "object"
  184 + enabled: true
  185 +
  186 +# Index Structure (Query Domains)
  187 +indexes:
  188 + - name: "default"
  189 + label: "默认索引"
  190 + fields:
  191 + - "title"
  192 + - "brief"
  193 + - "description"
  194 + - "seo_title"
  195 + - "seo_description"
  196 + - "seo_keywords"
  197 + - "vendor"
  198 + - "product_type"
  199 + - "tags"
  200 + - "category"
  201 + analyzer: "chinese_ecommerce"
  202 + boost: 1.0
  203 +
  204 + - name: "title"
  205 + label: "标题索引"
  206 + fields:
  207 + - "title"
  208 + - "seo_title"
  209 + analyzer: "chinese_ecommerce"
  210 + boost: 2.0
  211 +
  212 + - name: "vendor"
  213 + label: "品牌索引"
  214 + fields:
  215 + - "vendor"
  216 + analyzer: "chinese_ecommerce"
  217 + boost: 1.5
  218 +
  219 + - name: "category"
  220 + label: "类目索引"
  221 + fields:
  222 + - "category"
  223 + analyzer: "chinese_ecommerce"
  224 + boost: 1.5
  225 +
  226 + - name: "tags"
  227 + label: "标签索引"
  228 + fields:
  229 + - "tags"
  230 + - "seo_keywords"
  231 + analyzer: "chinese_ecommerce"
  232 + boost: 1.0
  233 +
  234 +# Query Configuration
  235 +query_config:
  236 + supported_languages:
  237 + - "zh"
  238 + - "en"
  239 + default_language: "zh"
  240 + enable_translation: true
  241 + enable_text_embedding: true
  242 + enable_query_rewrite: true
  243 +
  244 + # Translation API (DeepL)
  245 + translation_service: "deepl"
  246 + translation_api_key: null # Set via environment variable
  247 +
  248 +# Ranking Configuration
  249 +ranking:
  250 + expression: "bm25() + 0.2*text_embedding_relevance()"
  251 + description: "BM25 text relevance combined with semantic embedding similarity"
  252 +
  253 +# Function Score配置(ES层打分规则)
  254 +function_score:
  255 + score_mode: "sum"
  256 + boost_mode: "multiply"
  257 +
  258 + functions: []
  259 +
  260 +# Rerank配置(本地重排,当前禁用)
  261 +rerank:
  262 + enabled: false
  263 + expression: ""
  264 + description: "Local reranking (disabled, use ES function_score instead)"
  265 +
  266 +# SPU配置(已启用,使用嵌套variants)
  267 +spu_config:
  268 + enabled: true
  269 + spu_field: "product_id"
  270 + inner_hits_size: 10
  271 +
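
Combined with the nested handling added to config/field_types.py above, the variants definition in this config should expand into an ES mapping roughly like the following sketch (hand-derived, not tool output):

```python
# Expected output of get_es_mapping_for_field() for the "variants" field:
# nested type, one sub-mapping per declared property, analyzer kept for text.
variants_mapping = {
    "type": "nested",
    "properties": {
        "variant_id":       {"type": "keyword", "index": True, "store": True},
        "title":            {"type": "text", "analyzer": "chinese_ecommerce", "index": True, "store": True},
        "price":            {"type": "float", "index": True, "store": True},
        "compare_at_price": {"type": "float", "index": True, "store": True},
        "sku":              {"type": "keyword", "index": True, "store": True},
        "stock":            {"type": "long", "index": True, "store": True},
        # Only type/analyzer/index/store are copied over, so the "enabled: true"
        # flag on options is dropped by the current implementation.
        "options":          {"type": "object"},
    },
}
```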
docs/BASE_CONFIG_GUIDE.md 0 → 100644
@@ -0,0 +1,257 @@
  1 +# Base Configuration Guide
  2 +
  3 +店匠通用配置(Base Configuration)使用指南
  4 +
  5 +## 概述
  6 +
  7 +Base配置是店匠(Shoplazza)通用配置,适用于所有使用店匠标准表的客户。该配置采用SPU级别的索引结构,所有客户共享同一个Elasticsearch索引(`search_products`),通过`tenant_id`字段实现数据隔离。
  8 +
  9 +## 核心特性
  10 +
  11 +- **SPU级别索引**:每个ES文档代表一个SPU,包含嵌套的variants数组
  12 +- **统一索引**:所有客户共享`search_products`索引
  13 +- **租户隔离**:通过`tenant_id`字段实现数据隔离
  14 +- **配置简化**:配置只包含ES搜索相关配置,不包含MySQL数据源配置
  15 +- **外部友好格式**:API返回格式不包含ES内部字段(`_id`, `_score`, `_source`)
  16 +
  17 +## 配置说明
  18 +
  19 +### 配置文件位置
  20 +
  21 +`config/schema/base/config.yaml`
  22 +
  23 +### 配置内容
  24 +
  25 +Base配置**不包含**以下内容:
  26 +- `mysql_config` - MySQL数据库配置
  27 +- `main_table` - 主表配置
  28 +- `extension_table` - 扩展表配置
  29 +- `source_table` / `source_column` - 字段数据源映射
  30 +
  31 +Base配置**只包含**:
  32 +- ES字段定义(字段类型、分析器、boost等)
  33 +- 查询域(indexes)配置
  34 +- 查询处理配置(query_config)
  35 +- 排序和打分配置(function_score)
  36 +- SPU配置(spu_config)
  37 +
  38 +### 必需字段
  39 +
  40 +- `tenant_id` (KEYWORD, required) - 租户隔离字段
  41 +
  42 +### 主要字段
  43 +
  44 +- `product_id` - 商品ID
  45 +- `title`, `brief`, `description` - 文本搜索字段
  46 +- `seo_title`, `seo_description`, `seo_keywords` - SEO字段
  47 +- `vendor`, `product_type`, `tags`, `category` - 分类和标签字段
  48 +- `min_price`, `max_price`, `compare_at_price` - 价格字段
  49 +- `variants` (nested) - 嵌套变体数组
  50 +
  51 +## 数据导入流程
  52 +
  53 +### 1. 生成测试数据
  54 +
  55 +```bash
  56 +python scripts/generate_test_data.py \
  57 + --num-spus 100 \
  58 + --tenant-id "1" \
  59 + --start-spu-id 1 \
  60 + --start-sku-id 1 \
  61 + --output test_data.sql
  62 +```
  63 +
  64 +### 2. 导入测试数据到MySQL
  65 +
  66 +```bash
  67 +python scripts/import_test_data.py \
  68 + --db-host localhost \
  69 + --db-port 3306 \
  70 + --db-database saas \
  71 + --db-username root \
  72 + --db-password password \
  73 + --sql-file test_data.sql \
  74 + --tenant-id "1"
  75 +```
  76 +
  77 +### 3. 导入数据到Elasticsearch
  78 +
  79 +```bash
  80 +python scripts/ingest_shoplazza.py \
  81 + --db-host localhost \
  82 + --db-port 3306 \
  83 + --db-database saas \
  84 + --db-username root \
  85 + --db-password password \
  86 + --tenant-id "1" \
  87 + --config base \
  88 + --es-host http://localhost:9200 \
  89 + --recreate \
  90 + --batch-size 500
  91 +```
  92 +
  93 +## API使用
  94 +
  95 +### 搜索接口
  96 +
  97 +**端点**: `POST /search/`
  98 +
  99 +**请求头**:
  100 +```
  101 +X-Tenant-ID: 1
  102 +Content-Type: application/json
  103 +```
  104 +
  105 +**请求体**:
  106 +```json
  107 +{
  108 + "query": "耳机",
  109 + "size": 10,
  110 + "from": 0,
  111 + "filters": {
  112 + "category_keyword": "电子产品"
  113 + },
  114 + "facets": ["category_keyword", "vendor_keyword"]
  115 +}
  116 +```
  117 +
  118 +**响应格式**:
  119 +```json
  120 +{
  121 + "results": [
  122 + {
  123 + "product_id": "1",
  124 + "title": "蓝牙耳机 Sony",
  125 + "handle": "product-1",
  126 + "description": "高品质无线蓝牙耳机",
  127 + "vendor": "Sony",
  128 + "product_type": "电子产品",
  129 + "price": 199.99,
  130 + "compare_at_price": 299.99,
  131 + "currency": "USD",
  132 + "image_url": "//cdn.example.com/products/1.jpg",
  133 + "in_stock": true,
  134 + "variants": [
  135 + {
  136 + "variant_id": "1",
  137 + "title": "黑色",
  138 + "price": 199.99,
  139 + "compare_at_price": 299.99,
  140 + "sku": "SKU-1-1",
  141 + "stock": 50,
  142 + "options": {
  143 + "option1": "黑色"
  144 + }
  145 + }
  146 + ],
  147 + "relevance_score": 0.95
  148 + }
  149 + ],
  150 + "total": 10,
  151 + "max_score": 1.0,
  152 + "facets": [
  153 + {
  154 + "field": "category_keyword",
  155 + "label": "category_keyword",
  156 + "type": "terms",
  157 + "values": [
  158 + {
  159 + "value": "电子产品",
  160 + "label": "电子产品",
  161 + "count": 5,
  162 + "selected": false
  163 + }
  164 + ]
  165 + }
  166 + ],
  167 + "suggestions": [],
  168 + "related_searches": [],
  169 + "took_ms": 15,
  170 + "query_info": {}
  171 +}
  172 +```
  173 +
  174 +### 响应格式说明
  175 +
  176 +#### 主要变化
  177 +
  178 +1. **`results`替代`hits`**:返回字段从`hits`改为`results`
  179 +2. **结构化结果**:每个结果包含`product_id`, `title`, `variants`, `relevance_score`等字段
  180 +3. **无ES内部字段**:不包含`_id`, `_score`, `_source`等ES内部字段
  181 +4. **嵌套variants**:每个商品包含variants数组,每个variant包含完整的变体信息
  182 +5. **相关性分数**:`relevance_score`是0-1之间的归一化分数
  183 +
  184 +#### ProductResult字段
  185 +
  186 +- `product_id` - 商品ID
  187 +- `title` - 商品标题
  188 +- `handle` - 商品handle
  189 +- `description` - 商品描述
  190 +- `vendor` - 供应商/品牌
  191 +- `product_type` - 商品类型
  192 +- `tags` - 标签
  193 +- `price` - 最低价格(min_price)
  194 +- `compare_at_price` - 原价
  195 +- `currency` - 货币单位(默认USD)
  196 +- `image_url` - 主图URL
  197 +- `in_stock` - 是否有库存
  198 +- `variants` - 变体列表
  199 +- `relevance_score` - 相关性分数(0-1)
  200 +
  201 +#### VariantResult字段
  202 +
  203 +- `variant_id` - 变体ID
  204 +- `title` - 变体标题
  205 +- `price` - 价格
  206 +- `compare_at_price` - 原价
  207 +- `sku` - SKU编码
  208 +- `stock` - 库存数量
  209 +- `options` - 选项(颜色、尺寸等)
  210 +
  211 +## 测试
  212 +
  213 +### 运行测试脚本
  214 +
  215 +```bash
  216 +python scripts/test_base.py \
  217 + --api-url http://localhost:8000 \
  218 + --tenant-id "1" \
  219 + --test-tenant-2 "2"
  220 +```
  221 +
  222 +### 测试内容
  223 +
  224 +1. **基本搜索**:测试搜索API基本功能
  225 +2. **响应格式验证**:验证返回格式是否符合要求
  226 +3. **Facets聚合**:测试分面搜索功能
  227 +4. **租户隔离**:验证不同租户的数据隔离
  228 +
  229 +## 常见问题
  230 +
  231 +### Q: 为什么配置中没有MySQL相关配置?
  232 +
  233 +A: 数据源配置和数据导入流程是写死的脚本,不在搜索配置中。搜索配置只关注ES搜索相关的内容。
  234 +
  235 +### Q: 如何为新的租户导入数据?
  236 +
  237 +A: 使用`ingest_shoplazza.py`脚本,指定不同的`--tenant-id`参数即可。
  238 +
  239 +### Q: 如何验证租户隔离是否生效?
  240 +
  241 +A: 使用`test_base.py`脚本,指定两个不同的`--tenant-id`,检查搜索结果是否隔离。
  242 +
  243 +### Q: API返回格式中为什么没有`_id`和`_score`?
  244 +
  245 +A: 为了提供外部友好的API格式,我们移除了ES内部字段,使用`product_id`和`relevance_score`替代。
  246 +
  247 +### Q: 如何添加新的搜索字段?
  248 +
  249 +A: 在`config/schema/base/config.yaml`中添加字段定义,然后重新生成索引映射并重新导入数据。
  250 +
  251 +## 注意事项
  252 +
  253 +1. **tenant_id必需**:所有API请求必须提供`tenant_id`(通过请求头`X-Tenant-ID`或查询参数`tenant_id`)
  254 +2. **索引共享**:所有客户共享`search_products`索引,确保`tenant_id`字段正确设置
  255 +3. **数据导入**:数据导入脚本是写死的,不依赖配置中的MySQL设置
  256 +4. **配置分离**:搜索配置和数据源配置完全分离,提高可维护性
  257 +
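
The diff does not show the query builder itself, but the tenant isolation described above implies that every query against the shared search_products index is wrapped in a tenant_id filter; a rough sketch of that shape (field names and boosts taken from the base config, the DSL itself is illustrative):

```python
def tenant_scoped_query(tenant_id: str, relevance_query: dict) -> dict:
    """Wrap whatever relevance query the searcher builds in a tenant_id filter."""
    return {
        "bool": {
            "must": [relevance_query],
            "filter": [{"term": {"tenant_id": tenant_id}}],
        }
    }

# Example: default query domain over the boosted text fields from the base config.
body = {
    "query": tenant_scoped_query(
        "1",
        {"multi_match": {"query": "耳机", "fields": ["title^3", "seo_title^2", "brief^1.5", "description"]}},
    ),
    "size": 10,
}
```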
indexer/spu_transformer.py 0 → 100644
@@ -0,0 +1,288 @@
  1 +"""
  2 +SPU data transformer for Shoplazza products.
  3 +
  4 +Transforms SPU and SKU data from MySQL into SPU-level ES documents with nested variants.
  5 +"""
  6 +
  7 +import pandas as pd
  8 +import numpy as np
  9 +from typing import Dict, Any, List, Optional
  10 +from sqlalchemy import create_engine, text
  11 +from utils.db_connector import create_db_connection
  12 +
  13 +
  14 +class SPUTransformer:
  15 + """Transform SPU and SKU data into SPU-level ES documents."""
  16 +
  17 + def __init__(
  18 + self,
  19 + db_engine: Any,
  20 + tenant_id: str
  21 + ):
  22 + """
  23 + Initialize SPU transformer.
  24 +
  25 + Args:
  26 + db_engine: SQLAlchemy database engine
  27 + tenant_id: Tenant ID for filtering data
  28 + """
  29 + self.db_engine = db_engine
  30 + self.tenant_id = tenant_id
  31 +
  32 + def load_spu_data(self) -> pd.DataFrame:
  33 + """
  34 + Load SPU data from MySQL.
  35 +
  36 + Returns:
  37 + DataFrame with SPU data
  38 + """
  39 + query = text("""
  40 + SELECT
  41 + id, shop_id, shoplazza_id, handle, title, brief, description,
  42 + spu, vendor, vendor_url, seo_title, seo_description, seo_keywords,
  43 + image_src, image_width, image_height, image_path, image_alt,
  44 + tags, note, category,
  45 + shoplazza_created_at, shoplazza_updated_at, tenant_id,
  46 + creator, create_time, updater, update_time, deleted
  47 + FROM shoplazza_product_spu
  48 + WHERE tenant_id = :tenant_id AND deleted = 0
  49 + """)
  50 +
  51 + with self.db_engine.connect() as conn:
  52 + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})
  53 +
  54 + return df
  55 +
  56 + def load_sku_data(self) -> pd.DataFrame:
  57 + """
  58 + Load SKU data from MySQL.
  59 +
  60 + Returns:
  61 + DataFrame with SKU data
  62 + """
  63 + query = text("""
  64 + SELECT
  65 + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
  66 + shoplazza_image_id, title, sku, barcode, position,
  67 + price, compare_at_price, cost_price,
  68 + option1, option2, option3,
  69 + inventory_quantity, weight, weight_unit, image_src,
  70 + wholesale_price, note, extend,
  71 + shoplazza_created_at, shoplazza_updated_at, tenant_id,
  72 + creator, create_time, updater, update_time, deleted
  73 + FROM shoplazza_product_sku
  74 + WHERE tenant_id = :tenant_id AND deleted = 0
  75 + """)
  76 +
  77 + with self.db_engine.connect() as conn:
  78 + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})
  79 +
  80 + return df
  81 +
  82 + def transform_batch(self) -> List[Dict[str, Any]]:
  83 + """
  84 + Transform SPU and SKU data into ES documents.
  85 +
  86 + Returns:
  87 + List of SPU-level ES documents
  88 + """
  89 + # Load data
  90 + spu_df = self.load_spu_data()
  91 + sku_df = self.load_sku_data()
  92 +
  93 + if spu_df.empty:
  94 + return []
  95 +
  96 + # Group SKUs by SPU
  97 + sku_groups = sku_df.groupby('spu_id')
  98 +
  99 + documents = []
  100 + for _, spu_row in spu_df.iterrows():
  101 + spu_id = spu_row['id']
  102 +
  103 + # Get SKUs for this SPU
  104 + skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame()
  105 +
  106 + # Transform to ES document
  107 + doc = self._transform_spu_to_doc(spu_row, skus)
  108 + if doc:
  109 + documents.append(doc)
  110 +
  111 + return documents
  112 +
  113 + def _transform_spu_to_doc(
  114 + self,
  115 + spu_row: pd.Series,
  116 + skus: pd.DataFrame
  117 + ) -> Optional[Dict[str, Any]]:
  118 + """
  119 + Transform a single SPU row and its SKUs into an ES document.
  120 +
  121 + Args:
  122 + spu_row: SPU row from database
  123 + skus: DataFrame with SKUs for this SPU
  124 +
  125 + Returns:
  126 + ES document or None if transformation fails
  127 + """
  128 + doc = {}
  129 +
  130 + # Tenant ID (required)
  131 + doc['tenant_id'] = str(self.tenant_id)
  132 +
  133 + # Product ID
  134 + doc['product_id'] = str(spu_row['id'])
  135 +
  136 + # Handle
  137 + if pd.notna(spu_row.get('handle')):
  138 + doc['handle'] = str(spu_row['handle'])
  139 +
  140 + # Title
  141 + if pd.notna(spu_row.get('title')):
  142 + doc['title'] = str(spu_row['title'])
  143 +
  144 + # Brief
  145 + if pd.notna(spu_row.get('brief')):
  146 + doc['brief'] = str(spu_row['brief'])
  147 +
  148 + # Description
  149 + if pd.notna(spu_row.get('description')):
  150 + doc['description'] = str(spu_row['description'])
  151 +
  152 + # SEO fields
  153 + if pd.notna(spu_row.get('seo_title')):
  154 + doc['seo_title'] = str(spu_row['seo_title'])
  155 + if pd.notna(spu_row.get('seo_description')):
  156 + doc['seo_description'] = str(spu_row['seo_description'])
  157 + if pd.notna(spu_row.get('seo_keywords')):
  158 + doc['seo_keywords'] = str(spu_row['seo_keywords'])
  159 +
  160 + # Vendor
  161 + if pd.notna(spu_row.get('vendor')):
  162 + doc['vendor'] = str(spu_row['vendor'])
  163 + doc['vendor_keyword'] = str(spu_row['vendor'])
  164 +
  165 + # Product type (from category or tags)
  166 + if pd.notna(spu_row.get('category')):
  167 + doc['product_type'] = str(spu_row['category'])
  168 + doc['product_type_keyword'] = str(spu_row['category'])
  169 +
  170 + # Tags
  171 + if pd.notna(spu_row.get('tags')):
  172 + tags_str = str(spu_row['tags'])
  173 + doc['tags'] = tags_str
  174 + doc['tags_keyword'] = tags_str
  175 +
  176 + # Category
  177 + if pd.notna(spu_row.get('category')):
  178 + doc['category'] = str(spu_row['category'])
  179 + doc['category_keyword'] = str(spu_row['category'])
  180 +
  181 + # Image URL
  182 + if pd.notna(spu_row.get('image_src')):
  183 + image_src = str(spu_row['image_src'])
  184 + if not image_src.startswith('http'):
  185 + image_src = f"//{image_src}" if image_src.startswith('//') else image_src
  186 + doc['image_url'] = image_src
  187 +
  188 + # Process variants
  189 + variants = []
  190 + prices = []
  191 + compare_prices = []
  192 +
  193 + for _, sku_row in skus.iterrows():
  194 + variant = self._transform_sku_to_variant(sku_row)
  195 + if variant:
  196 + variants.append(variant)
  197 + if 'price' in variant and variant['price'] is not None:
  198 + try:
  199 + prices.append(float(variant['price']))
  200 + except (ValueError, TypeError):
  201 + pass
  202 + if 'compare_at_price' in variant and variant['compare_at_price'] is not None:
  203 + try:
  204 + compare_prices.append(float(variant['compare_at_price']))
  205 + except (ValueError, TypeError):
  206 + pass
  207 +
  208 + doc['variants'] = variants
  209 +
  210 + # Calculate price ranges
  211 + if prices:
  212 + doc['min_price'] = float(min(prices))
  213 + doc['max_price'] = float(max(prices))
  214 + else:
  215 + doc['min_price'] = 0.0
  216 + doc['max_price'] = 0.0
  217 +
  218 + if compare_prices:
  219 + doc['compare_at_price'] = float(max(compare_prices))
  220 + else:
  221 + doc['compare_at_price'] = None
  222 +
  223 + return doc
  224 +
  225 + def _transform_sku_to_variant(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]:
  226 + """
  227 + Transform a SKU row into a variant object.
  228 +
  229 + Args:
  230 + sku_row: SKU row from database
  231 +
  232 + Returns:
  233 + Variant dictionary or None
  234 + """
  235 + variant = {}
  236 +
  237 + # Variant ID
  238 + variant['variant_id'] = str(sku_row['id'])
  239 +
  240 + # Title
  241 + if pd.notna(sku_row.get('title')):
  242 + variant['title'] = str(sku_row['title'])
  243 +
  244 + # Price
  245 + if pd.notna(sku_row.get('price')):
  246 + try:
  247 + variant['price'] = float(sku_row['price'])
  248 + except (ValueError, TypeError):
  249 + variant['price'] = None
  250 + else:
  251 + variant['price'] = None
  252 +
  253 + # Compare at price
  254 + if pd.notna(sku_row.get('compare_at_price')):
  255 + try:
  256 + variant['compare_at_price'] = float(sku_row['compare_at_price'])
  257 + except (ValueError, TypeError):
  258 + variant['compare_at_price'] = None
  259 + else:
  260 + variant['compare_at_price'] = None
  261 +
  262 + # SKU
  263 + if pd.notna(sku_row.get('sku')):
  264 + variant['sku'] = str(sku_row['sku'])
  265 +
  266 + # Stock
  267 + if pd.notna(sku_row.get('inventory_quantity')):
  268 + try:
  269 + variant['stock'] = int(sku_row['inventory_quantity'])
  270 + except (ValueError, TypeError):
  271 + variant['stock'] = 0
  272 + else:
  273 + variant['stock'] = 0
  274 +
  275 + # Options (from option1, option2, option3)
  276 + options = {}
  277 + if pd.notna(sku_row.get('option1')):
  278 + options['option1'] = str(sku_row['option1'])
  279 + if pd.notna(sku_row.get('option2')):
  280 + options['option2'] = str(sku_row['option2'])
  281 + if pd.notna(sku_row.get('option3')):
  282 + options['option3'] = str(sku_row['option3'])
  283 +
  284 + if options:
  285 + variant['options'] = options
  286 +
  287 + return variant
  288 +
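
A minimal driver for SPUTransformer (connection string and driver are placeholders; in practice ingest_shoplazza.py wires this up from its CLI arguments):

```python
from sqlalchemy import create_engine

from indexer.spu_transformer import SPUTransformer

# Placeholder DSN matching the credentials used in the guide's examples.
engine = create_engine("mysql+pymysql://root:password@localhost:3306/saas")

transformer = SPUTransformer(db_engine=engine, tenant_id="1")
docs = transformer.transform_batch()  # one SPU-level document per product, SKUs nested under "variants"

print(f"{len(docs)} documents ready for bulk indexing into search_products")
if docs:
    print(docs[0]["product_id"], len(docs[0]["variants"]))
```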
scripts/generate_test_data.py 0 → 100644
@@ -0,0 +1,325 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Generate test data for Shoplazza SPU and SKU tables.
  4 +
  5 +Generates 100 SPU records with 1-5 SKU variants each.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import random
  11 +import argparse
  12 +from pathlib import Path
  13 +from datetime import datetime, timedelta
  14 +
  15 +# Add parent directory to path
  16 +sys.path.insert(0, str(Path(__file__).parent.parent))
  17 +
  18 +
  19 +def generate_spu_data(num_spus: int = 100, tenant_id: str = "1", start_id: int = 1):
  20 + """
  21 + Generate SPU test data.
  22 +
  23 + Args:
  24 + num_spus: Number of SPUs to generate
  25 + tenant_id: Tenant ID
  26 + start_id: Starting ID for SPUs
  27 +
  28 + Returns:
  29 + List of SPU data dictionaries
  30 + """
  31 + categories = ["电子产品", "服装", "家居用品", "美妆", "食品", "运动用品", "图书", "玩具"]
  32 + vendors = ["Sony", "Nike", "Apple", "Samsung", "华为", "小米", "美的", "海尔"]
  33 +
  34 + products = [
  35 + ("蓝牙耳机", "Bluetooth Headphone", "高品质无线蓝牙耳机", "High-quality wireless Bluetooth headphone"),
  36 + ("运动鞋", "Running Shoes", "舒适透气的运动鞋", "Comfortable and breathable running shoes"),
  37 + ("智能手机", "Smartphone", "高性能智能手机", "High-performance smartphone"),
  38 + ("笔记本电脑", "Laptop", "轻薄便携笔记本电脑", "Lightweight and portable laptop"),
  39 + ("智能手表", "Smart Watch", "多功能智能手表", "Multi-function smart watch"),
  40 + ("平板电脑", "Tablet", "高清平板电脑", "High-definition tablet"),
  41 + ("无线鼠标", "Wireless Mouse", "人体工学无线鼠标", "Ergonomic wireless mouse"),
  42 + ("机械键盘", "Mechanical Keyboard", "RGB背光机械键盘", "RGB backlit mechanical keyboard"),
  43 + ("显示器", "Monitor", "4K高清显示器", "4K high-definition monitor"),
  44 + ("音响", "Speaker", "蓝牙无线音响", "Bluetooth wireless speaker"),
  45 + ]
  46 +
  47 + spus = []
  48 + for i in range(num_spus):
  49 + spu_id = start_id + i
  50 + product = random.choice(products)
  51 + category = random.choice(categories)
  52 + vendor = random.choice(vendors)
  53 +
  54 + # Generate handle
  55 + handle = f"product-{spu_id}"
  56 +
  57 + # Generate title (Chinese)
  58 + title_zh = f"{product[0]} {vendor}"
  59 +
  60 + # Generate brief
  61 + brief_zh = product[2]
  62 +
  63 + # Generate description
  64 + description_zh = f"<p>{product[2]},来自{vendor}品牌。{product[3]}</p>"
  65 +
  66 + # Generate SEO fields
  67 + seo_title = f"{title_zh} - {category}"
  68 + seo_description = f"购买{vendor}{product[0]},{product[2]}"
  69 + seo_keywords = f"{product[0]},{vendor},{category}"
  70 +
  71 + # Generate tags
  72 + tags = f"{category},{vendor},{product[0]}"
  73 +
  74 + # Generate image
  75 + image_src = f"//cdn.example.com/products/{spu_id}.jpg"
  76 +
  77 + # Generate dates
  78 + created_at = datetime.now() - timedelta(days=random.randint(1, 365))
  79 + updated_at = created_at + timedelta(days=random.randint(0, 30))
  80 +
  81 + spu = {
  82 + 'id': spu_id,
  83 + 'shop_id': 1,
  84 + 'shoplazza_id': f"spu-{spu_id}",
  85 + 'handle': handle,
  86 + 'title': title_zh,
  87 + 'brief': brief_zh,
  88 + 'description': description_zh,
  89 + 'spu': '',
  90 + 'vendor': vendor,
  91 + 'vendor_url': f"https://{vendor.lower()}.com",
  92 + 'seo_title': seo_title,
  93 + 'seo_description': seo_description,
  94 + 'seo_keywords': seo_keywords,
  95 + 'image_src': image_src,
  96 + 'image_width': 800,
  97 + 'image_height': 600,
  98 + 'image_path': f"products/{spu_id}.jpg",
  99 + 'image_alt': title_zh,
  100 + 'inventory_policy': '',
  101 + 'inventory_quantity': 0,
  102 + 'inventory_tracking': '0',
  103 + 'published': 1,
  104 + 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
  105 + 'requires_shipping': 1,
  106 + 'taxable': 0,
  107 + 'fake_sales': 0,
  108 + 'display_fake_sales': 0,
  109 + 'mixed_wholesale': 0,
  110 + 'need_variant_image': 0,
  111 + 'has_only_default_variant': 0,
  112 + 'tags': tags,
  113 + 'note': '',
  114 + 'category': category,
  115 + 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
  116 + 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
  117 + 'tenant_id': tenant_id,
  118 + 'creator': '1',
  119 + 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
  120 + 'updater': '1',
  121 + 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
  122 + 'deleted': 0
  123 + }
  124 + spus.append(spu)
  125 +
  126 + return spus
  127 +
  128 +
  129 +def generate_sku_data(spus: list, start_sku_id: int = 1):
  130 + """
  131 + Generate SKU test data for SPUs.
  132 +
  133 + Args:
  134 + spus: List of SPU data
  135 + start_sku_id: Starting ID for SKUs
  136 +
  137 + Returns:
  138 + List of SKU data dictionaries
  139 + """
  140 + colors = ["黑色", "白色", "红色", "蓝色", "绿色", "灰色"]
  141 + sizes = ["S", "M", "L", "XL", "XXL"]
  142 +
  143 + skus = []
  144 + sku_id = start_sku_id
  145 +
  146 + for spu in spus:
  147 + spu_id = spu['id']
  148 + num_variants = random.randint(1, 5)
  149 +
  150 + # Base price
  151 + base_price = random.uniform(50, 500)
  152 +
  153 + for i in range(num_variants):
  154 + # Generate variant options
  155 + color = random.choice(colors) if num_variants > 1 else None
  156 + size = random.choice(sizes) if num_variants > 2 else None
  157 +
  158 + # Generate title
  159 + title_parts = []
  160 + if color:
  161 + title_parts.append(color)
  162 + if size:
  163 + title_parts.append(size)
  164 + title = " / ".join(title_parts) if title_parts else ""
  165 +
  166 + # Generate SKU
  167 + sku_code = f"SKU-{spu_id}-{i+1}"
  168 +
  169 + # Generate price (variation from base)
  170 + price = base_price + random.uniform(-20, 50)
  171 + compare_at_price = price * random.uniform(1.2, 1.5)
  172 +
  173 + # Generate stock
  174 + stock = random.randint(0, 100)
  175 +
  176 + # Generate dates
  177 + created_at = datetime.now() - timedelta(days=random.randint(1, 365))
  178 + updated_at = created_at + timedelta(days=random.randint(0, 30))
  179 +
  180 + sku = {
  181 + 'id': sku_id,
  182 + 'spu_id': spu_id,
  183 + 'shop_id': 1,
  184 + 'shoplazza_id': f"sku-{sku_id}",
  185 + 'shoplazza_product_id': spu['shoplazza_id'],
  186 + 'shoplazza_image_id': '',
  187 + 'title': title,
  188 + 'sku': sku_code,
  189 + 'barcode': f"BAR{sku_id:08d}",
  190 + 'position': i + 1,
  191 + 'price': round(price, 2),
  192 + 'compare_at_price': round(compare_at_price, 2),
  193 + 'cost_price': round(price * 0.6, 2),
  194 + 'option1': color if color else '',
  195 + 'option2': size if size else '',
  196 + 'option3': '',
  197 + 'inventory_quantity': stock,
  198 + 'weight': round(random.uniform(0.1, 5.0), 2),
  199 + 'weight_unit': 'kg',
  200 + 'image_src': '',
  201 + 'wholesale_price': '[{"price": ' + str(round(price * 0.8, 2)) + ', "minQuantity": 10}]',
  202 + 'note': '',
  203 + 'extend': '',
  204 + 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
  205 + 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
  206 + 'tenant_id': spu['tenant_id'],
  207 + 'creator': '1',
  208 + 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
  209 + 'updater': '1',
  210 + 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
  211 + 'deleted': 0
  212 + }
  213 + skus.append(sku)
  214 + sku_id += 1
  215 +
  216 + return skus
  217 +
  218 +
  219 +def generate_sql_inserts(spus: list, skus: list, output_file: str):
  220 + """
  221 + Generate SQL INSERT statements.
  222 +
  223 + Args:
  224 + spus: List of SPU data
  225 + skus: List of SKU data
  226 + output_file: Output file path
  227 + """
  228 + with open(output_file, 'w', encoding='utf-8') as f:
  229 + f.write("-- SPU Test Data\n")
  230 + f.write("INSERT INTO shoplazza_product_spu (\n")
  231 + f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n")
  232 + f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n")
  233 + f.write(" image_src, image_width, image_height, image_path, image_alt,\n")
  234 + f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n")
  235 + f.write(" published, published_at, requires_shipping, taxable,\n")
  236 + f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n")
  237 + f.write(" has_only_default_variant, tags, note, category,\n")
  238 + f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
  239 + f.write(" creator, create_time, updater, update_time, deleted\n")
  240 + f.write(") VALUES\n")
  241 +
  242 + for i, spu in enumerate(spus):
  243 + values = (
  244 + f"({spu['id']}, {spu['shop_id']}, '{spu['shoplazza_id']}', "
  245 + f"'{spu['handle']}', '{spu['title']}', '{spu['brief']}', "
  246 + f"'{spu['description']}', '{spu['spu']}', '{spu['vendor']}', "
  247 + f"'{spu['vendor_url']}', '{spu['seo_title']}', '{spu['seo_description']}', "
  248 + f"'{spu['seo_keywords']}', '{spu['image_src']}', {spu['image_width']}, "
  249 + f"{spu['image_height']}, '{spu['image_path']}', '{spu['image_alt']}', "
  250 + f"'{spu['inventory_policy']}', {spu['inventory_quantity']}, "
  251 + f"'{spu['inventory_tracking']}', {spu['published']}, "
  252 + f"'{spu['published_at']}', {spu['requires_shipping']}, {spu['taxable']}, "
  253 + f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, "
  254 + f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, "
  255 + f"'{spu['tags']}', '{spu['note']}', '{spu['category']}', "
  256 + f"'{spu['shoplazza_created_at']}', '{spu['shoplazza_updated_at']}', "
  257 + f"'{spu['tenant_id']}', '{spu['creator']}', '{spu['create_time']}', "
  258 + f"'{spu['updater']}', '{spu['update_time']}', {spu['deleted']})"
  259 + )
  260 + f.write(values)
  261 + if i < len(spus) - 1:
  262 + f.write(",\n")
  263 + else:
  264 + f.write(";\n\n")
  265 +
  266 + f.write("-- SKU Test Data\n")
  267 + f.write("INSERT INTO shoplazza_product_sku (\n")
  268 + f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n")
  269 + f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n")
  270 + f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n")
  271 + f.write(" image_src, wholesale_price, note, extend,\n")
  272 + f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
  273 + f.write(" creator, create_time, updater, update_time, deleted\n")
  274 + f.write(") VALUES\n")
  275 +
  276 + for i, sku in enumerate(skus):
  277 + values = (
  278 + f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{sku['shoplazza_id']}', "
  279 + f"'{sku['shoplazza_product_id']}', '{sku['shoplazza_image_id']}', "
  280 + f"'{sku['title']}', '{sku['sku']}', '{sku['barcode']}', {sku['position']}, "
  281 + f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, "
  282 + f"'{sku['option1']}', '{sku['option2']}', '{sku['option3']}', "
  283 + f"{sku['inventory_quantity']}, {sku['weight']}, '{sku['weight_unit']}', "
  284 + f"'{sku['image_src']}', '{sku['wholesale_price']}', '{sku['note']}', "
  285 + f"'{sku['extend']}', '{sku['shoplazza_created_at']}', "
  286 + f"'{sku['shoplazza_updated_at']}', '{sku['tenant_id']}', "
  287 + f"'{sku['creator']}', '{sku['create_time']}', '{sku['updater']}', "
  288 + f"'{sku['update_time']}', {sku['deleted']})"
  289 + )
  290 + f.write(values)
  291 + if i < len(skus) - 1:
  292 + f.write(",\n")
  293 + else:
  294 + f.write(";\n")
  295 +
  296 +
  297 +def main():
  298 + parser = argparse.ArgumentParser(description='Generate test data for Shoplazza tables')
  299 + parser.add_argument('--num-spus', type=int, default=100, help='Number of SPUs to generate')
  300 + parser.add_argument('--tenant-id', default='1', help='Tenant ID')
  301 + parser.add_argument('--start-spu-id', type=int, default=1, help='Starting SPU ID')
  302 + parser.add_argument('--start-sku-id', type=int, default=1, help='Starting SKU ID')
  303 + parser.add_argument('--output', default='test_data.sql', help='Output SQL file')
  304 +
  305 + args = parser.parse_args()
  306 +
  307 + print(f"Generating {args.num_spus} SPUs with variants...")
  308 +
  309 + # Generate SPU data
  310 + spus = generate_spu_data(args.num_spus, args.tenant_id, args.start_spu_id)
  311 + print(f"Generated {len(spus)} SPUs")
  312 +
  313 + # Generate SKU data
  314 + skus = generate_sku_data(spus, args.start_sku_id)
  315 + print(f"Generated {len(skus)} SKUs")
  316 +
  317 + # Generate SQL file
  318 + generate_sql_inserts(spus, skus, args.output)
  319 + print(f"SQL file generated: {args.output}")
  320 +
  321 +
  322 +if __name__ == '__main__':
  323 + main()
  325 +
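The INSERT builders above splice values into the SQL text with f-strings and rely on the generated strings never containing a single quote. A small quoting helper along these lines would keep the output valid if that assumption ever breaks; this is a defensive sketch, not part of the commit, and `sql_quote` is a hypothetical name:

```python
def sql_quote(value) -> str:
    """Render a Python value as a SQL literal, doubling single quotes in strings."""
    if value is None:
        return "NULL"
    if isinstance(value, (int, float)):
        return str(value)
    return "'" + str(value).replace("'", "''") + "'"


# f"({spu['id']}, {sql_quote(spu['title'])}, ...)" stays valid even if a title
# ever becomes "Kids' Running Shoes".
print(sql_quote("Kids' Running Shoes"))  # -> 'Kids'' Running Shoes'
```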
scripts/import_test_data.py 0 → 100644
@@ -0,0 +1,132 @@ @@ -0,0 +1,132 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Import test data into MySQL Shoplazza tables.
  4 +
  5 +Reads the SQL file generated by generate_test_data.py and imports it into MySQL.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +from pathlib import Path
  12 +
  13 +# Add parent directory to path
  14 +sys.path.insert(0, str(Path(__file__).parent.parent))
  15 +
  16 +from utils.db_connector import create_db_connection, test_connection
  17 +
  18 +
  19 +def import_sql_file(db_engine, sql_file: str):
  20 + """
  21 + Import SQL file into database.
  22 +
  23 + Args:
  24 + db_engine: SQLAlchemy database engine
  25 + sql_file: Path to SQL file
  26 + """
  27 + with open(sql_file, 'r', encoding='utf-8') as f:
  28 + sql_content = f.read()
  29 +
  30 + from sqlalchemy import text
  31 + # Strip "--" comment lines first, then split on semicolons to get executable statements
  32 + statements = [s.strip() for s in '\n'.join(line for line in sql_content.splitlines() if not line.strip().startswith('--')).split(';') if s.strip()]
  33 + print(f"Executing {len(statements)} SQL statements...")
  34 +
  35 + with db_engine.connect() as conn:
  36 + for i, statement in enumerate(statements, 1):
  37 + if statement:
  38 + try:
  39 + conn.execute(text(statement))
  40 + conn.commit()
  41 + print(f" [{i}/{len(statements)}] Executed successfully")
  42 + except Exception as e:
  43 + print(f" [{i}/{len(statements)}] ERROR: {e}")
  44 + print(f" Statement: {statement[:100]}...")
  45 + raise
  46 +
  47 +
  48 +def verify_import(db_engine, tenant_id: str):
  49 + """
  50 + Verify imported data.
  51 +
  52 + Args:
  53 + db_engine: SQLAlchemy database engine
  54 + tenant_id: Tenant ID to verify
  55 + """
  56 + from sqlalchemy import text
  57 +
  58 + with db_engine.connect() as conn:
  59 + # Count SPUs
  60 + result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_spu WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
  61 + spu_count = result.scalar()
  62 +
  63 + # Count SKUs
  64 + result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_sku WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
  65 + sku_count = result.scalar()
  66 +
  67 + print(f"\nVerification:")
  68 + print(f" SPUs: {spu_count}")
  69 + print(f" SKUs: {sku_count}")
  70 +
  71 + return spu_count, sku_count
  72 +
  73 +
  74 +def main():
  75 + parser = argparse.ArgumentParser(description='Import test data into MySQL')
  76 +
  77 + # Database connection
  78 + parser.add_argument('--db-host', required=True, help='MySQL host')
  79 + parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
  80 + parser.add_argument('--db-database', required=True, help='MySQL database name')
  81 + parser.add_argument('--db-username', required=True, help='MySQL username')
  82 + parser.add_argument('--db-password', required=True, help='MySQL password')
  83 +
  84 + # Import options
  85 + parser.add_argument('--sql-file', required=True, help='SQL file to import')
  86 + parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)')
  87 +
  88 + args = parser.parse_args()
  89 +
  90 + print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
  91 +
  92 + # Connect to database
  93 + try:
  94 + db_engine = create_db_connection(
  95 + host=args.db_host,
  96 + port=args.db_port,
  97 + database=args.db_database,
  98 + username=args.db_username,
  99 + password=args.db_password
  100 + )
  101 + except Exception as e:
  102 + print(f"ERROR: Failed to connect to MySQL: {e}")
  103 + return 1
  104 +
  105 + # Test connection
  106 + if not test_connection(db_engine):
  107 + print("ERROR: Database connection test failed")
  108 + return 1
  109 +
  110 + print("Database connection successful")
  111 +
  112 + # Import SQL file
  113 + print(f"\nImporting SQL file: {args.sql_file}")
  114 + try:
  115 + import_sql_file(db_engine, args.sql_file)
  116 + print("Import completed successfully")
  117 + except Exception as e:
  118 + print(f"ERROR: Failed to import SQL file: {e}")
  119 + import traceback
  120 + traceback.print_exc()
  121 + return 1
  122 +
  123 + # Verify import if tenant_id provided
  124 + if args.tenant_id:
  125 + verify_import(db_engine, args.tenant_id)
  126 +
  127 + return 0
  128 +
  129 +
  130 +if __name__ == '__main__':
  131 + sys.exit(main())
  132 +
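The same flow can also be driven from Python rather than the CLI. A sketch that reuses `create_db_connection` together with the `import_sql_file` and `verify_import` helpers defined above; the host, credentials and SQL path are placeholders:

```python
from utils.db_connector import create_db_connection

# Placeholders: point these at a real MySQL instance before running.
engine = create_db_connection(
    host="localhost",
    port=3306,
    database="saas",
    username="root",
    password="password",
)

import_sql_file(engine, "test_data.sql")   # executes the generated INSERT statements
verify_import(engine, tenant_id="1")       # prints SPU/SKU counts for that tenant
```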
scripts/ingest_shoplazza.py 0 → 100644
@@ -0,0 +1,148 @@ @@ -0,0 +1,148 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Shoplazza data ingestion script.
  4 +
  5 +Loads SPU and SKU data from MySQL and indexes it into Elasticsearch using the SPU transformer.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +from pathlib import Path
  12 +
  13 +# Add parent directory to path
  14 +sys.path.insert(0, str(Path(__file__).parent.parent))
  15 +
  16 +from utils.db_connector import create_db_connection
  17 +from utils.es_client import ESClient
  18 +from indexer.spu_transformer import SPUTransformer
  19 +from indexer.mapping_generator import MappingGenerator
  20 +from indexer.bulk_indexer import BulkIndexer
  21 +from config import ConfigLoader
  22 +
  23 +
  24 +def main():
  25 + parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch')
  26 +
  27 + # Database connection
  28 + parser.add_argument('--db-host', required=True, help='MySQL host')
  29 + parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
  30 + parser.add_argument('--db-database', required=True, help='MySQL database name')
  31 + parser.add_argument('--db-username', required=True, help='MySQL username')
  32 + parser.add_argument('--db-password', required=True, help='MySQL password')
  33 +
  34 + # Tenant and index
  35 + parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)')
  36 + parser.add_argument('--config', default='base', help='Configuration ID (default: base)')
  37 + parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
  38 +
  39 + # Options
  40 + parser.add_argument('--recreate', action='store_true', help='Recreate index if exists')
  41 + parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)')
  42 +
  43 + args = parser.parse_args()
  44 +
  45 + print(f"Starting Shoplazza data ingestion for tenant: {args.tenant_id}")
  46 +
  47 + # Load configuration
  48 + config_loader = ConfigLoader("config/schema")
  49 + try:
  50 + config = config_loader.load_customer_config(args.config)
  51 + print(f"Loaded configuration: {config.customer_name}")
  52 + except Exception as e:
  53 + print(f"ERROR: Failed to load configuration: {e}")
  54 + return 1
  55 +
  56 + # Validate tenant_id field exists
  57 + tenant_id_field = None
  58 + for field in config.fields:
  59 + if field.name == "tenant_id":
  60 + tenant_id_field = field
  61 + break
  62 +
  63 + if not tenant_id_field:
  64 + print("ERROR: Configuration must include 'tenant_id' field")
  65 + return 1
  66 +
  67 + # Connect to MySQL
  68 + print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
  69 + try:
  70 + db_engine = create_db_connection(
  71 + host=args.db_host,
  72 + port=args.db_port,
  73 + database=args.db_database,
  74 + username=args.db_username,
  75 + password=args.db_password
  76 + )
  77 + except Exception as e:
  78 + print(f"ERROR: Failed to connect to MySQL: {e}")
  79 + return 1
  80 +
  81 + # Connect to Elasticsearch
  82 + print(f"Connecting to Elasticsearch: {args.es_host}")
  83 + es_client = ESClient(hosts=[args.es_host])
  84 + if not es_client.ping():
  85 + print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}")
  86 + return 1
  87 +
  88 + # Generate and create index
  89 + mapping_gen = MappingGenerator(config)
  90 + mapping = mapping_gen.generate_mapping()
  91 + index_name = config.es_index_name
  92 +
  93 + if args.recreate:
  94 + if es_client.index_exists(index_name):
  95 + print(f"Deleting existing index: {index_name}")
  96 + es_client.delete_index(index_name)
  97 +
  98 + if not es_client.index_exists(index_name):
  99 + print(f"Creating index: {index_name}")
  100 + es_client.create_index(index_name, mapping)
  101 + else:
  102 + print(f"Using existing index: {index_name}")
  103 +
  104 + # Initialize SPU transformer
  105 + print(f"Initializing SPU transformer for tenant: {args.tenant_id}")
  106 + transformer = SPUTransformer(db_engine, args.tenant_id)
  107 +
  108 + # Transform data
  109 + print("Transforming SPU and SKU data...")
  110 + try:
  111 + documents = transformer.transform_batch()
  112 + print(f"Transformed {len(documents)} SPU documents")
  113 + except Exception as e:
  114 + print(f"ERROR: Failed to transform data: {e}")
  115 + import traceback
  116 + traceback.print_exc()
  117 + return 1
  118 +
  119 + if not documents:
  120 + print("WARNING: No documents to index")
  121 + return 0
  122 +
  123 + # Bulk index
  124 + print(f"Indexing {len(documents)} documents (batch size: {args.batch_size})...")
  125 + indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size)
  126 +
  127 + try:
  128 + results = indexer.index_documents(documents, id_field="product_id", show_progress=True)
  129 + print(f"\nIngestion complete:")
  130 + print(f" Success: {results['success']}")
  131 + print(f" Failed: {results['failed']}")
  132 + print(f" Time: {results.get('elapsed_time', 0):.2f}s")
  133 +
  134 + if results['failed'] > 0:
  135 + print(f"\nWARNING: {results['failed']} documents failed to index")
  136 + return 1
  137 +
  138 + return 0
  139 + except Exception as e:
  140 + print(f"ERROR: Failed to index documents: {e}")
  141 + import traceback
  142 + traceback.print_exc()
  143 + return 1
  144 +
  145 +
  146 +if __name__ == '__main__':
  147 + sys.exit(main())
  148 +
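`indexer/spu_transformer.py` itself is not part of this diff. As a reading aid, a minimal sketch of the SPU-level aggregation it is described as performing: group SKU rows under their SPU, build the nested `variants` array, and derive the flattened price fields. Field names follow the test-data generator and the design document; the committed transformer may differ:

```python
from collections import defaultdict
from typing import Any, Dict, List


def build_spu_documents(spus: List[Dict[str, Any]], skus: List[Dict[str, Any]],
                        tenant_id: str) -> List[Dict[str, Any]]:
    """Group SKU rows under their SPU and emit one ES document per SPU."""
    skus_by_spu = defaultdict(list)
    for sku in skus:
        skus_by_spu[sku["spu_id"]].append(sku)

    documents = []
    for spu in spus:
        variants = [
            {
                "variant_id": str(sku["id"]),
                "title": sku["title"],
                "price": sku["price"],
                "sku": sku["sku"],
                "stock": sku["inventory_quantity"],
            }
            for sku in skus_by_spu.get(spu["id"], [])
        ]
        prices = [v["price"] for v in variants] or [0.0]
        documents.append({
            "tenant_id": tenant_id,
            "product_id": str(spu["id"]),
            "title": spu["title"],
            "variants": variants,
            "min_price": min(prices),
            "max_price": max(prices),
        })
    return documents
```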
scripts/test_base.py 0 → 100644
@@ -0,0 +1,242 @@ @@ -0,0 +1,242 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Test script for base configuration.
  4 +
  5 +Tests data ingestion, search API, response format, and tenant isolation.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +import requests
  12 +import json
  13 +from pathlib import Path
  14 +
  15 +# Add parent directory to path
  16 +sys.path.insert(0, str(Path(__file__).parent.parent))
  17 +
  18 +
  19 +def test_search_api(base_url: str, tenant_id: str, query: str = "耳机"):
  20 + """
  21 + Test search API.
  22 +
  23 + Args:
  24 + base_url: API base URL
  25 + tenant_id: Tenant ID
  26 + query: Search query
  27 +
  28 + Returns:
  29 + Response JSON or None if failed
  30 + """
  31 + url = f"{base_url}/search/"
  32 + headers = {
  33 + "X-Tenant-ID": tenant_id,
  34 + "Content-Type": "application/json"
  35 + }
  36 + payload = {
  37 + "query": query,
  38 + "size": 10,
  39 + "from": 0
  40 + }
  41 +
  42 + print(f"\nTesting search API:")
  43 + print(f" URL: {url}")
  44 + print(f" Query: {query}")
  45 + print(f" Tenant ID: {tenant_id}")
  46 +
  47 + try:
  48 + response = requests.post(url, json=payload, headers=headers, timeout=30)
  49 + response.raise_for_status()
  50 + data = response.json()
  51 +
  52 + print(f" Status: {response.status_code}")
  53 + print(f" Total: {data.get('total', 0)}")
  54 + print(f" Results: {len(data.get('results', []))}")
  55 +
  56 + return data
  57 + except Exception as e:
  58 + print(f" ERROR: {e}")
  59 + return None
  60 +
  61 +
  62 +def validate_response_format(data: dict):
  63 + """
  64 + Validate response format.
  65 +
  66 + Args:
  67 + data: Response data
  68 +
  69 + Returns:
  70 + List of validation errors (empty if valid)
  71 + """
  72 + errors = []
  73 +
  74 + # Check for results field (not hits)
  75 + if 'hits' in data:
  76 + errors.append("Response contains 'hits' field (should be 'results')")
  77 +
  78 + if 'results' not in data:
  79 + errors.append("Response missing 'results' field")
  80 + else:
  81 + results = data['results']
  82 + if not isinstance(results, list):
  83 + errors.append("'results' should be a list")
  84 + else:
  85 + # Validate first result structure
  86 + if results:
  87 + result = results[0]
  88 + required_fields = ['product_id', 'title', 'variants', 'relevance_score']
  89 + for field in required_fields:
  90 + if field not in result:
  91 + errors.append(f"Result missing required field: {field}")
  92 +
  93 + # Check for ES internal fields
  94 + es_internal_fields = ['_id', '_score', '_source']
  95 + for field in es_internal_fields:
  96 + if field in result:
  97 + errors.append(f"Result contains ES internal field: {field}")
  98 +
  99 + # Validate variants
  100 + if 'variants' in result:
  101 + variants = result['variants']
  102 + if not isinstance(variants, list):
  103 + errors.append("'variants' should be a list")
  104 + elif variants:
  105 + variant = variants[0]
  106 + variant_required = ['variant_id', 'price', 'sku', 'stock']
  107 + for field in variant_required:
  108 + if field not in variant:
  109 + errors.append(f"Variant missing required field: {field}")
  110 +
  111 + # Check for suggestions and related_searches
  112 + if 'suggestions' not in data:
  113 + errors.append("Response missing 'suggestions' field")
  114 + if 'related_searches' not in data:
  115 + errors.append("Response missing 'related_searches' field")
  116 +
  117 + return errors
  118 +
  119 +
  120 +def test_facets(base_url: str, tenant_id: str):
  121 + """
  122 + Test facets aggregation.
  123 +
  124 + Args:
  125 + base_url: API base URL
  126 + tenant_id: Tenant ID
  127 +
  128 + Returns:
  129 + Response JSON or None if failed
  130 + """
  131 + url = f"{base_url}/search/"
  132 + headers = {
  133 + "X-Tenant-ID": tenant_id,
  134 + "Content-Type": "application/json"
  135 + }
  136 + payload = {
  137 + "query": "商品",
  138 + "size": 10,
  139 + "facets": ["category_keyword", "vendor_keyword"]
  140 + }
  141 +
  142 + print(f"\nTesting facets:")
  143 + print(f" Facets: {payload['facets']}")
  144 +
  145 + try:
  146 + response = requests.post(url, json=payload, headers=headers, timeout=30)
  147 + response.raise_for_status()
  148 + data = response.json()
  149 +
  150 + if 'facets' in data and data['facets']:
  151 + print(f" Facets returned: {len(data['facets'])}")
  152 + for facet in data['facets']:
  153 + print(f" - {facet.get('field')}: {len(facet.get('values', []))} values")
  154 + else:
  155 + print(" WARNING: No facets returned")
  156 +
  157 + return data
  158 + except Exception as e:
  159 + print(f" ERROR: {e}")
  160 + return None
  161 +
  162 +
  163 +def test_tenant_isolation(base_url: str, tenant_id_1: str, tenant_id_2: str):
  164 + """
  165 + Test tenant isolation.
  166 +
  167 + Args:
  168 + base_url: API base URL
  169 + tenant_id_1: First tenant ID
  170 + tenant_id_2: Second tenant ID
  171 + """
  172 + print(f"\nTesting tenant isolation:")
  173 + print(f" Tenant 1: {tenant_id_1}")
  174 + print(f" Tenant 2: {tenant_id_2}")
  175 +
  176 + # Search for tenant 1
  177 + data1 = test_search_api(base_url, tenant_id_1, "商品")
  178 + # Search for tenant 2
  179 + data2 = test_search_api(base_url, tenant_id_2, "商品")
  180 +
  181 + if data1 and data2:
  182 + results1 = set(r.get('product_id') for r in data1.get('results', []))
  183 + results2 = set(r.get('product_id') for r in data2.get('results', []))
  184 +
  185 + overlap = results1 & results2
  186 + if overlap:
  187 + print(f" WARNING: Found {len(overlap)} overlapping results between tenants")
  188 + else:
  189 + print(f" OK: No overlapping results (tenant isolation working)")
  190 +
  191 +
  192 +def main():
  193 + parser = argparse.ArgumentParser(description='Test base configuration')
  194 + parser.add_argument('--api-url', default='http://localhost:8000', help='API base URL')
  195 + parser.add_argument('--tenant-id', default='1', help='Tenant ID for testing')
  196 + parser.add_argument('--test-tenant-2', help='Second tenant ID for isolation test')
  197 +
  198 + args = parser.parse_args()
  199 +
  200 + print("=" * 60)
  201 + print("Base Configuration Test Suite")
  202 + print("=" * 60)
  203 +
  204 + # Test 1: Basic search
  205 + print("\n[Test 1] Basic Search")
  206 + data = test_search_api(args.api_url, args.tenant_id)
  207 + if not data:
  208 + print("FAILED: Basic search test")
  209 + return 1
  210 +
  211 + # Test 2: Response format validation
  212 + print("\n[Test 2] Response Format Validation")
  213 + errors = validate_response_format(data)
  214 + if errors:
  215 + print("FAILED: Response format validation")
  216 + for error in errors:
  217 + print(f" - {error}")
  218 + return 1
  219 + else:
  220 + print("PASSED: Response format is correct")
  221 +
  222 + # Test 3: Facets
  223 + print("\n[Test 3] Facets Aggregation")
  224 + facet_data = test_facets(args.api_url, args.tenant_id)
  225 + if not facet_data:
  226 + print("WARNING: Facets test failed (may be expected if no data)")
  227 +
  228 + # Test 4: Tenant isolation (if second tenant provided)
  229 + if args.test_tenant_2:
  230 + print("\n[Test 4] Tenant Isolation")
  231 + test_tenant_isolation(args.api_url, args.tenant_id, args.test_tenant_2)
  232 +
  233 + print("\n" + "=" * 60)
  234 + print("All tests completed")
  235 + print("=" * 60)
  236 +
  237 + return 0
  238 +
  239 +
  240 +if __name__ == '__main__':
  241 + sys.exit(main())
  242 +
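The validator above can also be exercised without a running API by feeding it a hand-built response in the agreed format; a quick sanity check, assuming `validate_response_format` is imported from this script:

```python
sample = {
    "results": [
        {
            "product_id": "123",
            "title": "蓝牙耳机",
            "variants": [
                {"variant_id": "456", "price": 199.99, "sku": "SKU-123-1", "stock": 50}
            ],
            "relevance_score": 0.95,
        }
    ],
    "total": 1,
    "suggestions": [],
    "related_searches": [],
}

assert validate_response_format(sample) == []        # well-formed payload passes
assert validate_response_format({"hits": []}) != []  # ES-style payload is flagged
```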
search/searcher.py
@@ -17,38 +17,45 @@ from .multilang_query_builder import MultiLanguageQueryBuilder @@ -17,38 +17,45 @@ from .multilang_query_builder import MultiLanguageQueryBuilder
17 from .rerank_engine import RerankEngine 17 from .rerank_engine import RerankEngine
18 from context.request_context import RequestContext, RequestContextStage, create_request_context 18 from context.request_context import RequestContext, RequestContextStage, create_request_context
19 from api.models import FacetResult, FacetValue 19 from api.models import FacetResult, FacetValue
  20 +from api.result_formatter import ResultFormatter
20 21
21 22
22 class SearchResult: 23 class SearchResult:
23 - """Container for search results (重构版).""" 24 + """Container for search results (外部友好格式)."""
24 25
25 def __init__( 26 def __init__(
26 self, 27 self,
27 - hits: List[Dict[str, Any]], 28 + results: List[Any], # List[ProductResult]
28 total: int, 29 total: int,
29 max_score: float, 30 max_score: float,
30 took_ms: int, 31 took_ms: int,
31 facets: Optional[List[FacetResult]] = None, 32 facets: Optional[List[FacetResult]] = None,
32 query_info: Optional[Dict[str, Any]] = None, 33 query_info: Optional[Dict[str, Any]] = None,
  34 + suggestions: Optional[List[str]] = None,
  35 + related_searches: Optional[List[str]] = None,
33 debug_info: Optional[Dict[str, Any]] = None 36 debug_info: Optional[Dict[str, Any]] = None
34 ): 37 ):
35 - self.hits = hits 38 + self.results = results
36 self.total = total 39 self.total = total
37 self.max_score = max_score 40 self.max_score = max_score
38 self.took_ms = took_ms 41 self.took_ms = took_ms
39 self.facets = facets 42 self.facets = facets
40 self.query_info = query_info or {} 43 self.query_info = query_info or {}
  44 + self.suggestions = suggestions or []
  45 + self.related_searches = related_searches or []
41 self.debug_info = debug_info 46 self.debug_info = debug_info
42 47
43 def to_dict(self) -> Dict[str, Any]: 48 def to_dict(self) -> Dict[str, Any]:
44 """Convert to dictionary representation.""" 49 """Convert to dictionary representation."""
45 result = { 50 result = {
46 - "hits": self.hits, 51 + "results": [r.model_dump() if hasattr(r, 'model_dump') else r for r in self.results],
47 "total": self.total, 52 "total": self.total,
48 "max_score": self.max_score, 53 "max_score": self.max_score,
49 "took_ms": self.took_ms, 54 "took_ms": self.took_ms,
50 "facets": [f.model_dump() for f in self.facets] if self.facets else None, 55 "facets": [f.model_dump() for f in self.facets] if self.facets else None,
51 - "query_info": self.query_info 56 + "query_info": self.query_info,
  57 + "suggestions": self.suggestions,
  58 + "related_searches": self.related_searches
52 } 59 }
53 if self.debug_info is not None: 60 if self.debug_info is not None:
54 result["debug_info"] = self.debug_info 61 result["debug_info"] = self.debug_info
@@ -106,6 +113,7 @@ class Searcher: @@ -106,6 +113,7 @@ class Searcher:
106 def search( 113 def search(
107 self, 114 self,
108 query: str, 115 query: str,
  116 + tenant_id: str,
109 size: int = 10, 117 size: int = 10,
110 from_: int = 0, 118 from_: int = 0,
111 filters: Optional[Dict[str, Any]] = None, 119 filters: Optional[Dict[str, Any]] = None,
@@ -118,10 +126,11 @@ class Searcher: @@ -118,10 +126,11 @@ class Searcher:
118 debug: bool = False 126 debug: bool = False
119 ) -> SearchResult: 127 ) -> SearchResult:
120 """ 128 """
121 - Execute search query (重构版). 129 + Execute search query (外部友好格式).
122 130
123 Args: 131 Args:
124 query: Search query string 132 query: Search query string
  133 + tenant_id: Tenant ID (required for filtering)
125 size: Number of results to return 134 size: Number of results to return
126 from_: Offset for pagination 135 from_: Offset for pagination
127 filters: Exact match filters 136 filters: Exact match filters
@@ -134,7 +143,7 @@ class Searcher: @@ -134,7 +143,7 @@ class Searcher:
134 debug: Enable debug information output 143 debug: Enable debug information output
135 144
136 Returns: 145 Returns:
137 - SearchResult object 146 + SearchResult object with formatted results
138 """ 147 """
139 # Create context if not provided (backward compatibility) 148 # Create context if not provided (backward compatibility)
140 if context is None: 149 if context is None:
@@ -248,6 +257,11 @@ class Searcher: @@ -248,6 +257,11 @@ class Searcher:
248 # Step 3: Query building 257 # Step 3: Query building
249 context.start_stage(RequestContextStage.QUERY_BUILDING) 258 context.start_stage(RequestContextStage.QUERY_BUILDING)
250 try: 259 try:
  260 + # Add tenant_id to filters (required)
  261 + if filters is None:
  262 + filters = {}
  263 + filters['tenant_id'] = tenant_id
  264 +
251 es_query = self.query_builder.build_multilang_query( 265 es_query = self.query_builder.build_multilang_query(
252 parsed_query=parsed_query, 266 parsed_query=parsed_query,
253 query_vector=parsed_query.query_vector if enable_embedding else None, 267 query_vector=parsed_query.query_vector if enable_embedding else None,
@@ -341,56 +355,10 @@ class Searcher: @@ -341,56 +355,10 @@ class Searcher:
341 # Step 5: Result processing 355 # Step 5: Result processing
342 context.start_stage(RequestContextStage.RESULT_PROCESSING) 356 context.start_stage(RequestContextStage.RESULT_PROCESSING)
343 try: 357 try:
344 - hits = []  
345 - raw_hits = []  
346 - 358 + # Extract ES hits
  359 + es_hits = []
347 if 'hits' in es_response and 'hits' in es_response['hits']: 360 if 'hits' in es_response and 'hits' in es_response['hits']:
348 - for hit in es_response['hits']['hits']:  
349 - raw_hits.append(hit)  
350 -  
351 - result_doc = {  
352 - '_id': hit['_id'],  
353 - '_score': hit.get('_score') or 0.0,  
354 - '_source': hit['_source']  
355 - }  
356 -  
357 - # 应用本地重排(仅当启用时)  
358 - if enable_rerank and self.rerank_engine.enabled:  
359 - base_score = hit.get('_score') or 0.0  
360 - knn_score = None  
361 -  
362 - # 检查是否使用了KNN(新结构:在function_score内部)  
363 - query_section = es_query.get('query', {})  
364 - if 'function_score' in query_section:  
365 - fs_query = query_section['function_score'].get('query', {})  
366 - outer_bool = fs_query.get('bool', {})  
367 - inner_bool_list = outer_bool.get('must', [])  
368 - if inner_bool_list and 'bool' in inner_bool_list[0]:  
369 - inner_should = inner_bool_list[0]['bool'].get('should', [])  
370 - if any('knn' in clause for clause in inner_should):  
371 - knn_score = base_score * 0.2  
372 -  
373 - custom_score = self.rerank_engine.calculate_score(  
374 - hit,  
375 - base_score,  
376 - knn_score  
377 - )  
378 - result_doc['_custom_score'] = custom_score  
379 - result_doc['_original_score'] = base_score  
380 -  
381 - hits.append(result_doc)  
382 -  
383 - # 重排序(仅当启用时)  
384 - if enable_rerank and self.rerank_engine.enabled:  
385 - hits.sort(key=lambda x: x.get('_custom_score', x['_score']), reverse=True)  
386 - context.logger.info(  
387 - f"本地重排完成 | 使用RerankEngine",  
388 - extra={'reqid': context.reqid, 'uid': context.uid}  
389 - )  
390 -  
391 - # Store intermediate results in context  
392 - context.store_intermediate_result('raw_hits', raw_hits)  
393 - context.store_intermediate_result('processed_hits', hits) 361 + es_hits = es_response['hits']['hits']
394 362
395 # Extract total and max_score 363 # Extract total and max_score
396 total = es_response.get('hits', {}).get('total', {}) 364 total = es_response.get('hits', {}).get('total', {})
@@ -401,16 +369,24 @@ class Searcher: @@ -401,16 +369,24 @@ class Searcher:
401 369
402 max_score = es_response.get('hits', {}).get('max_score') or 0.0 370 max_score = es_response.get('hits', {}).get('max_score') or 0.0
403 371
404 - # Standardize facets  
405 - standardized_facets = self._standardize_facets(  
406 - es_response.get('aggregations', {}),  
407 - facets,  
408 - filters  
409 - ) 372 + # Format results using ResultFormatter
  373 + formatted_results = ResultFormatter.format_search_results(es_hits, max_score)
  374 +
  375 + # Format facets
  376 + standardized_facets = None
  377 + if facets:
  378 + standardized_facets = ResultFormatter.format_facets(
  379 + es_response.get('aggregations', {}),
  380 + facets
  381 + )
  382 +
  383 + # Generate suggestions and related searches
  384 + query_text = parsed_query.original_query if parsed_query else query
  385 + suggestions = ResultFormatter.generate_suggestions(query_text, formatted_results)
  386 + related_searches = ResultFormatter.generate_related_searches(query_text, formatted_results)
410 387
411 context.logger.info( 388 context.logger.info(
412 - f"结果处理完成 | 返回: {len(hits)}条 | 总计: {total_value}条 | "  
413 - f"重排序: {'是' if enable_rerank else '否'}", 389 + f"结果处理完成 | 返回: {len(formatted_results)}条 | 总计: {total_value}条",
414 extra={'reqid': context.reqid, 'uid': context.uid} 390 extra={'reqid': context.reqid, 'uid': context.uid}
415 ) 391 )
416 392
@@ -459,12 +435,14 @@ class Searcher: @@ -459,12 +435,14 @@ class Searcher:
459 435
460 # Build result 436 # Build result
461 result = SearchResult( 437 result = SearchResult(
462 - hits=hits, 438 + results=formatted_results,
463 total=total_value, 439 total=total_value,
464 max_score=max_score, 440 max_score=max_score,
465 took_ms=int(total_duration), 441 took_ms=int(total_duration),
466 facets=standardized_facets, 442 facets=standardized_facets,
467 query_info=parsed_query.to_dict(), 443 query_info=parsed_query.to_dict(),
  444 + suggestions=suggestions,
  445 + related_searches=related_searches,
468 debug_info=debug_info 446 debug_info=debug_info
469 ) 447 )
470 448
@@ -476,21 +454,23 @@ class Searcher: @@ -476,21 +454,23 @@ class Searcher:
476 def search_by_image( 454 def search_by_image(
477 self, 455 self,
478 image_url: str, 456 image_url: str,
  457 + tenant_id: str,
479 size: int = 10, 458 size: int = 10,
480 filters: Optional[Dict[str, Any]] = None, 459 filters: Optional[Dict[str, Any]] = None,
481 range_filters: Optional[Dict[str, Any]] = None 460 range_filters: Optional[Dict[str, Any]] = None
482 ) -> SearchResult: 461 ) -> SearchResult:
483 """ 462 """
484 - Search by image similarity (重构版). 463 + Search by image similarity (外部友好格式).
485 464
486 Args: 465 Args:
487 image_url: URL of query image 466 image_url: URL of query image
  467 + tenant_id: Tenant ID (required for filtering)
488 size: Number of results 468 size: Number of results
489 filters: Exact match filters 469 filters: Exact match filters
490 range_filters: Range filters for numeric fields 470 range_filters: Range filters for numeric fields
491 471
492 Returns: 472 Returns:
493 - SearchResult object 473 + SearchResult object with formatted results
494 """ 474 """
495 if not self.image_embedding_field: 475 if not self.image_embedding_field:
496 raise ValueError("Image embedding field not configured") 476 raise ValueError("Image embedding field not configured")
@@ -503,6 +483,11 @@ class Searcher: @@ -503,6 +483,11 @@ class Searcher:
503 if image_vector is None: 483 if image_vector is None:
504 raise ValueError(f"Failed to encode image: {image_url}") 484 raise ValueError(f"Failed to encode image: {image_url}")
505 485
  486 + # Add tenant_id to filters (required)
  487 + if filters is None:
  488 + filters = {}
  489 + filters['tenant_id'] = tenant_id
  490 +
506 # Build KNN query 491 # Build KNN query
507 es_query = { 492 es_query = {
508 "size": size, 493 "size": size,
@@ -536,28 +521,32 @@ class Searcher: @@ -536,28 +521,32 @@ class Searcher:
536 size=size 521 size=size
537 ) 522 )
538 523
539 - # Process results (similar to text search)  
540 - hits = [] 524 + # Extract ES hits
  525 + es_hits = []
541 if 'hits' in es_response and 'hits' in es_response['hits']: 526 if 'hits' in es_response and 'hits' in es_response['hits']:
542 - for hit in es_response['hits']['hits']:  
543 - hits.append({  
544 - '_id': hit['_id'],  
545 - '_score': hit['_score'],  
546 - '_source': hit['_source']  
547 - }) 527 + es_hits = es_response['hits']['hits']
548 528
  529 + # Extract total and max_score
549 total = es_response.get('hits', {}).get('total', {}) 530 total = es_response.get('hits', {}).get('total', {})
550 if isinstance(total, dict): 531 if isinstance(total, dict):
551 total_value = total.get('value', 0) 532 total_value = total.get('value', 0)
552 else: 533 else:
553 total_value = total 534 total_value = total
554 535
  536 + max_score = es_response.get('hits', {}).get('max_score') or 0.0
  537 +
  538 + # Format results using ResultFormatter
  539 + formatted_results = ResultFormatter.format_search_results(es_hits, max_score)
  540 +
555 return SearchResult( 541 return SearchResult(
556 - hits=hits, 542 + results=formatted_results,
557 total=total_value, 543 total=total_value,
558 - max_score=es_response.get('hits', {}).get('max_score') or 0.0, 544 + max_score=max_score,
559 took_ms=es_response.get('took', 0), 545 took_ms=es_response.get('took', 0),
560 - query_info={'image_url': image_url, 'search_type': 'image_similarity'} 546 + facets=None,
  547 + query_info={'image_url': image_url, 'search_type': 'image_similarity'},
  548 + suggestions=[],
  549 + related_searches=[]
561 ) 550 )
562 551
563 def get_domain_summary(self) -> Dict[str, Any]: 552 def get_domain_summary(self) -> Dict[str, Any]:
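`api/result_formatter.py` is referenced above but not included in this diff. For orientation only, a sketch of the kind of mapping `format_search_results` presumably performs: lift `_source` fields onto `ProductResult`, convert nested variants to `VariantResult`, and normalise `_score` against `max_score` into the 0-1 `relevance_score`. It assumes both models live in `api.models` alongside `FacetResult`; the committed implementation may differ:

```python
from typing import Any, Dict, List

from api.models import ProductResult, VariantResult


def format_hits(es_hits: List[Dict[str, Any]], max_score: float) -> List[ProductResult]:
    """Illustrative hit -> ProductResult mapping; not the committed ResultFormatter."""
    results = []
    for hit in es_hits:
        source = hit.get("_source", {})
        score = hit.get("_score") or 0.0
        variants = [
            VariantResult(
                variant_id=str(v.get("variant_id", "")),
                title=v.get("title"),
                price=v.get("price"),
                sku=v.get("sku"),
                stock=v.get("stock", 0),
            )
            for v in source.get("variants", [])
        ]
        results.append(ProductResult(
            product_id=str(source.get("product_id", hit.get("_id", ""))),
            title=source.get("title"),
            variants=variants,
            relevance_score=min(score / max_score, 1.0) if max_score else 0.0,
        ))
    return results
```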
@@ -12,33 +12,57 @@ @@ -12,33 +12,57 @@
12 12
13 ## 1. 原始数据层的约定 13 ## 1. 原始数据层的约定
14 14
15 -所有租户共用主表、独立配置和扩展表,有自己独立的ES索引。  
16 -  
17 ### 1.1 店匠主表 15 ### 1.1 店匠主表
18 16
19 所有租户共用以下主表: 17 所有租户共用以下主表:
20 - `shoplazza_product_sku` - SKU级别商品数据 18 - `shoplazza_product_sku` - SKU级别商品数据
21 - `shoplazza_product_spu` - SPU级别商品数据 19 - `shoplazza_product_spu` - SPU级别商品数据
22 20
23 -### 1.2 每个租户的扩展表  
24 -  
25 -各个租户有自己的扩展表,不同的租户根据不同的业务需要、以及不同的数据源,来定制自己的扩展表:  
26 -- 自定义属性体系  
27 -- 多语言商品标题(中文、英文、俄文等)  
28 -- 品牌名、不同的类目和标签体系  
29 -- 业务过滤和聚合字段  
30 -- 权重(提权)字段  
31 -  
32 -**数据关联方式**:  
33 -- 入索引时,商品主表 `shoplazza_product_sku` 的 `id` + `shopid` 与租户扩展表关联  
34 -- 例如:`customer1_extension` 表存储 customer1 的自定义字段 21 +### 1.2 索引结构(SPU维度)
  22 +
  23 +**统一索引架构**:
  24 +- 所有客户共享同一个Elasticsearch索引:`search_products`
  25 +- 索引粒度:SPU级别(每个文档代表一个SPU)
  26 +- 数据隔离:通过`tenant_id`字段实现租户隔离
  27 +- 嵌套结构:每个SPU文档包含嵌套的`variants`数组(SKU变体)
  28 +
  29 +**索引文档结构**:
  30 +```json
  31 +{
  32 + "tenant_id": "1",
  33 + "product_id": "123",
  34 + "title": "蓝牙耳机",
  35 + "variants": [
  36 + {
  37 + "variant_id": "456",
  38 + "title": "黑色",
  39 + "price": 199.99,
  40 + "sku": "SKU-123-1",
  41 + "stock": 50
  42 + }
  43 + ],
  44 + "min_price": 199.99,
  45 + "max_price": 299.99
  46 +}
  47 +```
35 48
36 ### 1.3 配置化方案 49 ### 1.3 配置化方案
37 50
  51 +**配置分离原则**:
  52 +- **搜索配置**:只包含ES字段定义、查询域、排序规则等搜索相关配置
  53 +- **数据源配置**:不在搜索配置中,由Pipeline层(脚本)决定
  54 +- **数据导入流程**:写死的脚本,不依赖配置
  55 +
38 统一通过配置文件定义: 56 统一通过配置文件定义:
39 -1. ES 字段定义(字段类型、分析器、来源表/列 57 +1. ES 字段定义(字段类型、分析器、boost等)
40 2. ES mapping 结构生成 58 2. ES mapping 结构生成
41 -3. 数据入库映射关系 59 +3. 查询域配置(indexes)
  60 +4. 排序和打分配置(function_score)
  61 +
  62 +**注意**:配置中**不包含**以下内容:
  63 +- `mysql_config` - MySQL数据库配置
  64 +- `main_table` / `extension_table` - 数据表配置
  65 +- `source_table` / `source_column` - 字段数据源映射
42 66
43 --- 67 ---
44 68
@@ -72,62 +96,54 @@ @@ -72,62 +96,54 @@
72 - **standard**:标准分析器 96 - **standard**:标准分析器
73 - **keyword**:关键词分析器 97 - **keyword**:关键词分析器
74 98
75 -#### 字段配置示例 99 +#### 字段配置示例(Base配置)
76 100
77 ```yaml 101 ```yaml
78 fields: 102 fields:
79 - # 主键字段  
80 - - name: "skuId"  
81 - type: "LONG"  
82 - source_table: "main" # 主表  
83 - source_column: "id" 103 + # 租户隔离字段(必需)
  104 + - name: "tenant_id"
  105 + type: "KEYWORD"
84 required: true 106 required: true
85 index: true 107 index: true
86 store: true 108 store: true
87 109
88 - # 多语言文本字段  
89 - - name: "name"  
90 - type: "TEXT"  
91 - source_table: "extension" # 扩展表  
92 - source_column: "name"  
93 - analyzer: "chinese_ecommerce"  
94 - boost: 2.0 110 + # 商品标识字段
  111 + - name: "product_id"
  112 + type: "KEYWORD"
  113 + required: true
95 index: true 114 index: true
96 store: true 115 store: true
97 116
98 - - name: "enSpuName" 117 + # 文本搜索字段
  118 + - name: "title"
99 type: "TEXT" 119 type: "TEXT"
100 - source_table: "extension"  
101 - source_column: "enSpuName"  
102 - analyzer: "english"  
103 - boost: 2.0 120 + analyzer: "chinese_ecommerce"
  121 + boost: 3.0
  122 + index: true
  123 + store: true
104 124
105 - - name: "ruSkuName" 125 + - name: "seo_keywords"
106 type: "TEXT" 126 type: "TEXT"
107 - source_table: "extension"  
108 - source_column: "ruSkuName"  
109 - analyzer: "russian" 127 + analyzer: "chinese_ecommerce"
110 boost: 2.0 128 boost: 2.0
111 -  
112 - # 文本向量字段  
113 - - name: "name_embedding"  
114 - type: "TEXT_EMBEDDING"  
115 - source_table: "extension"  
116 - source_column: "name"  
117 - embedding_dims: 1024  
118 - embedding_similarity: "dot_product"  
119 index: true 129 index: true
  130 + store: true
120 131
121 - # 图片向量字段  
122 - - name: "image_embedding"  
123 - type: "IMAGE_EMBEDDING"  
124 - source_table: "extension"  
125 - source_column: "imageUrl"  
126 - embedding_dims: 1024  
127 - embedding_similarity: "dot_product"  
128 - nested: false 132 + # 嵌套variants字段
  133 + - name: "variants"
  134 + type: "JSON"
  135 + nested: true
  136 + nested_properties:
  137 + variant_id:
  138 + type: "keyword"
  139 + price:
  140 + type: "float"
  141 + sku:
  142 + type: "keyword"
129 ``` 143 ```
130 144
  145 +**注意**:配置中**不包含**`source_table`和`source_column`,数据源映射由Pipeline层决定。
  146 +
131 **实现模块**: 147 **实现模块**:
132 - `config/config_loader.py` - 配置加载器 148 - `config/config_loader.py` - 配置加载器
133 - `config/field_types.py` - 字段类型定义 149 - `config/field_types.py` - 字段类型定义
@@ -204,77 +220,69 @@ indexes: @@ -204,77 +220,69 @@ indexes:
204 220
205 --- 221 ---
206 222
207 -## 3. 测试数据灌入 223 +## 3. 数据导入流程
208 224
209 ### 3.1 数据源 225 ### 3.1 数据源
210 226
211 -**主表**:`shoplazza_product_sku`  
212 -- 所有租户共用  
213 -- 包含基础商品信息(id, shopid 等) 227 +**店匠标准表**(Base配置使用):
  228 +- `shoplazza_product_spu` - SPU级别商品数据
  229 +- `shoplazza_product_sku` - SKU级别商品数据
214 230
215 -**扩展表**:`customer1_extension`  
216 -- 每个租户独立  
217 -- 包含自定义字段和多语言字段 231 +**其他客户表**(customer1等):
  232 +- 使用各自的数据源表和扩展表
218 233
219 -### 3.2 数据灌入方式 234 +### 3.2 数据导入方式
220 235
221 -**实现情况**: 236 +**Pipeline层决定数据源**:
  237 +- 数据导入流程是写死的脚本,不依赖配置
  238 +- 配置只关注ES搜索相关的内容
  239 +- 数据源映射逻辑写死在转换器代码中
222 240
223 -#### 命令行工具  
224 -```bash  
225 -python main.py ingest \  
226 - --customer customer1 \  
227 - --csv-file data/customer1_data.csv \  
228 - --es-host http://localhost:9200 \  
229 - --recreate \  
230 - --batch-size 100  
231 -``` 241 +#### Base配置数据导入(店匠通用)
  242 +
  243 +**脚本**:`scripts/ingest_shoplazza.py`
232 244
233 -#### 数据流程  
234 -1. **数据加载**:从 CSV 文件或 MySQL 数据库加载数据  
235 -2. **数据转换**:  
236 - - 字段映射(根据配置将源字段映射到 ES 字段)  
237 - - 类型转换(字符串、数字、日期等)  
238 - - 向量生成(文本向量、图片向量)  
239 - - 向量缓存(避免重复计算) 245 +**数据流程**:
  246 +1. **数据加载**:从MySQL读取`shoplazza_product_spu`和`shoplazza_product_sku`表
  247 +2. **数据转换**(`indexer/spu_transformer.py`):
  248 + - 按`spu_id`和`tenant_id`关联SPU和SKU数据
  249 + - 将SKU数据聚合为嵌套的`variants`数组
  250 + - 计算扁平化价格字段(`min_price`, `max_price`, `compare_at_price`)
  251 + - 字段映射(写死在代码中,不依赖配置)
  252 + - 注入`tenant_id`字段
240 3. **索引创建**: 253 3. **索引创建**:
241 - - 根据配置生成 ES mapping  
242 - - 创建或更新索引 254 + - 根据配置生成ES mapping
  255 + - 创建或更新`search_products`索引
243 4. **批量入库**: 256 4. **批量入库**:
244 - - 批量写入 ES(默认每批 500 条) 257 + - 批量写入ES(默认每批500条)
245 - 错误处理和重试机制 258 - 错误处理和重试机制
246 259
247 -#### 配置映射示例  
248 -  
249 -**customer1_config.yaml** 配置:  
250 -```yaml  
251 -main_table: "shoplazza_product_sku"  
252 -extension_table: "customer1_extension"  
253 -es_index_name: "search_customer1"  
254 -  
255 -fields:  
256 - - name: "skuId"  
257 - source_table: "main"  
258 - source_column: "id"  
259 - - name: "name"  
260 - source_table: "extension"  
261 - source_column: "name"  
262 - - name: "enSpuName"  
263 - source_table: "extension"  
264 - source_column: "enSpuName" 260 +**命令行工具**:
  261 +```bash
  262 +python scripts/ingest_shoplazza.py \
  263 + --db-host localhost \
  264 + --db-port 3306 \
  265 + --db-database saas \
  266 + --db-username root \
  267 + --db-password password \
  268 + --tenant-id "1" \
  269 + --config base \
  270 + --es-host http://localhost:9200 \
  271 + --recreate \
  272 + --batch-size 500
265 ``` 273 ```
266 274
267 -**数据转换**:  
268 -- 主表字段:直接从 `shoplazza_product_sku` 表的 `id` 字段读取  
269 -- 扩展表字段:从 `customer1_extension` 表的对应列读取  
270 -- 向量字段:对源文本/图片生成向量并缓存 275 +#### 其他客户数据导入
  276 +
  277 +- 使用各自的数据转换器(如`indexer/data_transformer.py`)
  278 +- 数据源映射逻辑写死在各自的转换器中
  279 +- 共享`search_products`索引,通过`tenant_id`隔离
271 280
272 **实现模块**: 281 **实现模块**:
273 -- `indexer/data_transformer.py` - 数据转换器 282 +- `indexer/spu_transformer.py` - SPU数据转换器(Base配置)
  283 +- `indexer/data_transformer.py` - 通用数据转换器(其他客户)
274 - `indexer/bulk_indexer.py` - 批量索引器 284 - `indexer/bulk_indexer.py` - 批量索引器
275 -- `indexer/indexing_pipeline.py` - 索引流水线  
276 -- `embeddings/bge_encoder.py` - 文本向量编码器  
277 -- `embeddings/clip_image_encoder.py` - 图片向量编码器 285 +- `scripts/ingest_shoplazza.py` - 店匠数据导入脚本
278 286
279 --- 287 ---
280 288
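Step 4 above hands the transformed SPU documents to `BulkIndexer` with `product_id` as the document id. A minimal sketch of that kind of bulk write using the standard `elasticsearch` Python helpers; the real `BulkIndexer` adds batching, retries and progress reporting, and the host and index name here are placeholders:

```python
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")  # placeholder host
INDEX = "search_products"


def bulk_index(documents):
    """Write SPU documents in one bulk request, using product_id as the ES _id."""
    actions = (
        {"_index": INDEX, "_id": doc["product_id"], "_source": doc}
        for doc in documents
    )
    success, errors = helpers.bulk(es, actions, raise_on_error=False)
    return {"success": success, "failed": len(errors)}
```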
@@ -506,6 +514,14 @@ ranking: @@ -506,6 +514,14 @@ ranking:
506 - ✅ 搜索接口(文本搜索、图片搜索) 514 - ✅ 搜索接口(文本搜索、图片搜索)
507 - ✅ 文档查询接口 515 - ✅ 文档查询接口
508 - ✅ 前端界面(HTML + JavaScript) 516 - ✅ 前端界面(HTML + JavaScript)
  517 +- ✅ 租户隔离(tenant_id过滤)
  518 +
  519 +### 6.6 Base配置(店匠通用)
  520 +- ✅ SPU级别索引结构
  521 +- ✅ 嵌套variants字段
  522 +- ✅ 统一索引(search_products)
  523 +- ✅ 租户隔离(tenant_id)
  524 +- ✅ 配置简化(移除MySQL相关配置)
509 525
510 --- 526 ---
511 527
@@ -521,9 +537,55 @@ ranking: @@ -521,9 +537,55 @@ ranking:
521 537
522 --- 538 ---
523 539
524 -## 8. 配置文件示例 540 +## 8. API响应格式
  541 +
  542 +### 8.1 外部友好格式
  543 +
  544 +API返回格式不包含ES内部字段(`_id`, `_score`, `_source`),使用外部友好的格式:
  545 +
  546 +**响应结构**:
  547 +```json
  548 +{
  549 + "results": [
  550 + {
  551 + "product_id": "123",
  552 + "title": "蓝牙耳机",
  553 + "variants": [
  554 + {
  555 + "variant_id": "456",
  556 + "price": 199.99,
  557 + "sku": "SKU-123-1",
  558 + "stock": 50
  559 + }
  560 + ],
  561 + "relevance_score": 0.95
  562 + }
  563 + ],
  564 + "total": 10,
  565 + "facets": [...],
  566 + "suggestions": [],
  567 + "related_searches": []
  568 +}
  569 +```
  570 +
  571 +**主要变化**:
  572 +- 结构化结果(`ProductResult`和`VariantResult`)
  573 +- 嵌套variants数组
  574 +- 无ES内部字段
  575 +
  576 +### 8.2 租户隔离
  577 +
  578 +所有API请求必须提供`tenant_id`:
  579 +- 请求头:`X-Tenant-ID: 1`
  580 +- 或查询参数:`?tenant_id=1`
  581 +
  582 +搜索时自动添加`tenant_id`过滤,确保数据隔离。
  583 +
  584 +## 9. 配置文件示例
  585 +
  586 +**Base配置**(店匠通用):`config/schema/base/config.yaml`
525 587
526 -完整配置示例请参考:`config/schema/customer1_config.yaml` 588 +**其他客户配置**:`config/schema/customer1/config.yaml`
527 589
528 --- 590 ---
529 591