Commit 1f6d15faa32291662f64d4e0e3f50269b8e104e5

Authored by tangwang
1 parent 06e79082

重构:SPU级别索引、统一索引架构和API响应格式优化

主要变更:
1. 去掉数据源应用结构配置化:索引仅针对店匠的 SPU/SKU 表设计,数据灌入流程写死在脚本中(仅为满足测试需求,后续由外层应用负责数据的全量+增量灌入)。搜索系统主要关注如何适配外部搜索需求。
目前有两个数据灌入脚本:一个是之前的旧脚本,另一个是现在的新脚本——从店匠的 SPU 表和 SKU 表读取数据,并以 SPU 为单位组织 doc。
   - 配置只关注ES搜索相关配置,提高可维护性
   - 创建base配置(店匠通用配置)

2. 索引结构重构(SPU维度)
   - 所有客户共享search_products索引,通过tenant_id隔离
   - 支持嵌套variants字段(SKU变体数组)
   - 创建SPUTransformer用于SPU数据转换

3. API响应格式优化
   - 约定一套搜索结果的格式,而不是直接暴露 ES doc 的结构(`_id`、`_score`、`_source` 内的字段)
   - 添加ProductResult和VariantResult模型
   - 添加suggestions和related_searches字段 (预留接口,逻辑暂未实现)

4. 数据导入流程
   - 创建店匠数据导入脚本(ingest_shoplazza.py)
   - Pipeline层决定数据源,配置不包含数据源信息
   - 创建测试数据生成和导入脚本

5. 文档更新
   - 更新设计文档,反映新架构
   - 创建BASE_CONFIG_GUIDE.md使用指南
api/models.py
... ... @@ -172,11 +172,40 @@ class FacetResult(BaseModel):
172 172 total_count: Optional[int] = Field(None, description="该字段的总文档数")
173 173  
174 174  
  175 +class VariantResult(BaseModel):
  176 + """商品变体结果"""
  177 + variant_id: str = Field(..., description="变体ID")
  178 + title: Optional[str] = Field(None, description="变体标题")
  179 + price: Optional[float] = Field(None, description="价格")
  180 + compare_at_price: Optional[float] = Field(None, description="原价")
  181 + sku: Optional[str] = Field(None, description="SKU编码")
  182 + stock: int = Field(0, description="库存数量")
  183 + options: Optional[Dict[str, Any]] = Field(None, description="选项(颜色、尺寸等)")
  184 +
  185 +
  186 +class ProductResult(BaseModel):
  187 + """商品搜索结果"""
  188 + product_id: str = Field(..., description="商品ID")
  189 + title: Optional[str] = Field(None, description="商品标题")
  190 + handle: Optional[str] = Field(None, description="商品handle")
  191 + description: Optional[str] = Field(None, description="商品描述")
  192 + vendor: Optional[str] = Field(None, description="供应商/品牌")
  193 + product_type: Optional[str] = Field(None, description="商品类型")
  194 + tags: Optional[str] = Field(None, description="标签")
  195 + price: Optional[float] = Field(None, description="价格(min_price)")
  196 + compare_at_price: Optional[float] = Field(None, description="原价")
  197 + currency: str = Field("USD", description="货币单位")
  198 + image_url: Optional[str] = Field(None, description="主图URL")
  199 + in_stock: bool = Field(True, description="是否有库存")
  200 + variants: List[VariantResult] = Field(default_factory=list, description="变体列表")
  201 + relevance_score: float = Field(..., ge=0.0, le=1.0, description="相关性分数(0-1)")
  202 +
  203 +
175 204 class SearchResponse(BaseModel):
176   - """搜索响应模型(重构版)"""
  205 + """搜索响应模型(外部友好格式)"""
177 206  
178 207 # 核心结果
179   - hits: List[Dict[str, Any]] = Field(..., description="搜索结果列表")
  208 + results: List[ProductResult] = Field(..., description="搜索结果列表")
180 209 total: int = Field(..., description="匹配的总文档数")
181 210 max_score: float = Field(..., description="最高相关性分数")
182 211  
... ... @@ -192,8 +221,9 @@ class SearchResponse(BaseModel):
192 221 description="查询处理信息(原始查询、改写、语言检测、翻译等)"
193 222 )
194 223  
195   - # 推荐与建议(预留)
196   - related_queries: Optional[List[str]] = Field(None, description="相关搜索查询")
  224 + # 推荐与建议
  225 + suggestions: List[str] = Field(default_factory=list, description="搜索建议")
  226 + related_searches: List[str] = Field(default_factory=list, description="相关搜索")
197 227  
198 228 # 性能指标
199 229 took_ms: int = Field(..., description="搜索总耗时(毫秒)")
... ...
api/result_formatter.py 0 → 100644
... ... @@ -0,0 +1,177 @@
  1 +"""
  2 +Result formatter for converting ES internal format to external-friendly format.
  3 +"""
  4 +
  5 +from typing import List, Dict, Any, Optional
  6 +from .models import ProductResult, VariantResult, FacetResult, FacetValue
  7 +
  8 +
  9 +class ResultFormatter:
  10 + """Formats ES search results to external-friendly format."""
  11 +
  12 + @staticmethod
  13 + def format_search_results(
  14 + es_hits: List[Dict[str, Any]],
  15 + max_score: float = 1.0
  16 + ) -> List[ProductResult]:
  17 + """
  18 + Convert ES hits to ProductResult list.
  19 +
  20 + Args:
  21 + es_hits: List of ES hit dictionaries (with _id, _score, _source)
  22 + max_score: Maximum score for normalization
  23 +
  24 + Returns:
  25 + List of ProductResult objects
  26 + """
  27 + results = []
  28 +
  29 + for hit in es_hits:
  30 + source = hit.get('_source', {})
  31 + score = hit.get('_score', 0.0)
  32 +
  33 + # Normalize relevance score to 0-1
  34 + if max_score > 0:
  35 + relevance_score = min(score / max_score, 1.0)
  36 + else:
  37 + relevance_score = 0.0
  38 +
  39 + # Extract variants
  40 + variants = []
  41 + variants_data = source.get('variants', [])
  42 + if isinstance(variants_data, list):
  43 + for variant_data in variants_data:
  44 + variant = VariantResult(
  45 + variant_id=str(variant_data.get('variant_id', '')),
  46 + title=variant_data.get('title'),
  47 + price=variant_data.get('price'),
  48 + compare_at_price=variant_data.get('compare_at_price'),
  49 + sku=variant_data.get('sku'),
  50 + stock=variant_data.get('stock', 0),
  51 + options=variant_data.get('options')
  52 + )
  53 + variants.append(variant)
  54 +
  55 + # Determine in_stock (any variant has stock > 0)
  56 + in_stock = any(v.stock > 0 for v in variants) if variants else True
  57 +
  58 + # Build ProductResult
  59 + product = ProductResult(
  60 + product_id=str(source.get('product_id', '')),
  61 + title=source.get('title'),
  62 + handle=source.get('handle'),
  63 + description=source.get('description'),
  64 + vendor=source.get('vendor'),
  65 + product_type=source.get('product_type'),
  66 + tags=source.get('tags'),
  67 + price=source.get('min_price'),
  68 + compare_at_price=source.get('compare_at_price'),
  69 + currency="USD", # Default currency
  70 + image_url=source.get('image_url'),
  71 + in_stock=in_stock,
  72 + variants=variants,
  73 + relevance_score=relevance_score
  74 + )
  75 +
  76 + results.append(product)
  77 +
  78 + return results
  79 +
  80 + @staticmethod
  81 + def format_facets(
  82 + es_aggregations: Dict[str, Any],
  83 + facet_configs: Optional[List[Any]] = None
  84 + ) -> List[FacetResult]:
  85 + """
  86 + Format ES aggregations to FacetResult list.
  87 +
  88 + Args:
  89 + es_aggregations: ES aggregations response
  90 + facet_configs: Facet configurations (optional)
  91 +
  92 + Returns:
  93 + List of FacetResult objects
  94 + """
  95 + facets = []
  96 +
  97 + for field_name, agg_data in es_aggregations.items():
  98 + # Handle terms aggregation
  99 + if 'buckets' in agg_data:
  100 + values = []
  101 + for bucket in agg_data['buckets']:
  102 + value = FacetValue(
  103 + value=bucket['key'],
  104 + label=bucket.get('key_as_string', str(bucket['key'])),
  105 + count=bucket['doc_count'],
  106 + selected=False
  107 + )
  108 + values.append(value)
  109 +
  110 + facet = FacetResult(
  111 + field=field_name,
  112 + label=field_name, # Can be enhanced with field labels
  113 + type="terms",
  114 + values=values,
  115 + total_count=agg_data.get('sum_other_doc_count', 0) + len(values)
  116 + )
  117 + facets.append(facet)
  118 +
  119 + # Handle range aggregation
  120 + elif 'buckets' in agg_data and any('from' in b or 'to' in b for b in agg_data['buckets']):
  121 + values = []
  122 + for bucket in agg_data['buckets']:
  123 + range_key = bucket.get('key', '')
  124 + value = FacetValue(
  125 + value=range_key,
  126 + label=range_key,
  127 + count=bucket['doc_count'],
  128 + selected=False
  129 + )
  130 + values.append(value)
  131 +
  132 + facet = FacetResult(
  133 + field=field_name,
  134 + label=field_name,
  135 + type="range",
  136 + values=values
  137 + )
  138 + facets.append(facet)
  139 +
  140 + return facets
  141 +
  142 + @staticmethod
  143 + def generate_suggestions(
  144 + query: str,
  145 + results: List[ProductResult]
  146 + ) -> List[str]:
  147 + """
  148 + Generate search suggestions.
  149 +
  150 + Args:
  151 + query: Original search query
  152 + results: Search results
  153 +
  154 + Returns:
  155 + List of suggestion strings (currently returns empty list)
  156 + """
  157 + # TODO: Implement suggestion generation logic
  158 + return []
  159 +
  160 + @staticmethod
  161 + def generate_related_searches(
  162 + query: str,
  163 + results: List[ProductResult]
  164 + ) -> List[str]:
  165 + """
  166 + Generate related searches.
  167 +
  168 + Args:
  169 + query: Original search query
  170 + results: Search results
  171 +
  172 + Returns:
  173 + List of related search strings (currently returns empty list)
  174 + """
  175 + # TODO: Implement related search generation logic
  176 + return []
  177 +
... ...
api/routes/search.py
... ... @@ -33,7 +33,7 @@ def extract_request_info(request: Request) -> tuple[str, str]:
33 33 @router.post("/", response_model=SearchResponse)
34 34 async def search(request: SearchRequest, http_request: Request):
35 35 """
36   - Execute text search query (重构版).
  36 + Execute text search query (外部友好格式).
37 37  
38 38 Supports:
39 39 - Multi-language query processing
... ... @@ -42,9 +42,27 @@ async def search(request: SearchRequest, http_request: Request):
42 42 - Custom ranking functions
43 43 - Exact match filters and range filters
44 44 - Faceted search
  45 +
  46 + Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
45 47 """
46 48 reqid, uid = extract_request_info(http_request)
47 49  
  50 + # Extract tenant_id (required)
  51 + tenant_id = http_request.headers.get('X-Tenant-ID')
  52 + if not tenant_id:
  53 + # Try to get from query string
  54 + from urllib.parse import parse_qs
  55 + query_string = http_request.url.query
  56 + if query_string:
  57 + params = parse_qs(query_string)
  58 + tenant_id = params.get('tenant_id', [None])[0]
  59 +
  60 + if not tenant_id:
  61 + raise HTTPException(
  62 + status_code=400,
  63 + detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
  64 + )
  65 +
48 66 # Create request context
49 67 context = create_request_context(reqid=reqid, uid=uid)
50 68  
... ... @@ -54,7 +72,7 @@ async def search(request: SearchRequest, http_request: Request):
54 72 try:
55 73 # Log request start
56 74 context.logger.info(
57   - f"收到搜索请求 | IP: {http_request.client.host if http_request.client else 'unknown'} | "
  75 + f"收到搜索请求 | Tenant: {tenant_id} | IP: {http_request.client.host if http_request.client else 'unknown'} | "
58 76 f"用户代理: {http_request.headers.get('User-Agent', 'unknown')[:100]}",
59 77 extra={'reqid': context.reqid, 'uid': context.uid}
60 78 )
... ... @@ -66,6 +84,7 @@ async def search(request: SearchRequest, http_request: Request):
66 84 # Execute search with context (using backend defaults from config)
67 85 result = searcher.search(
68 86 query=request.query,
  87 + tenant_id=tenant_id,
69 88 size=request.size,
70 89 from_=request.from_,
71 90 filters=request.filters,
... ... @@ -83,12 +102,14 @@ async def search(request: SearchRequest, http_request: Request):
83 102  
84 103 # Convert to response model
85 104 return SearchResponse(
86   - hits=result.hits,
  105 + results=result.results,
87 106 total=result.total,
88 107 max_score=result.max_score,
89 108 took_ms=result.took_ms,
90 109 facets=result.facets,
91 110 query_info=result.query_info,
  111 + suggestions=result.suggestions,
  112 + related_searches=result.related_searches,
92 113 performance_info=performance_summary,
93 114 debug_info=result.debug_info
94 115 )
... ... @@ -110,13 +131,30 @@ async def search(request: SearchRequest, http_request: Request):
110 131 @router.post("/image", response_model=SearchResponse)
111 132 async def search_by_image(request: ImageSearchRequest, http_request: Request):
112 133 """
113   - Search by image similarity (重构版).
  134 + Search by image similarity (外部友好格式).
114 135  
115 136 Uses image embeddings to find visually similar products.
116 137 Supports exact match filters and range filters.
  138 +
  139 + Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
117 140 """
118 141 reqid, uid = extract_request_info(http_request)
119 142  
  143 + # Extract tenant_id (required)
  144 + tenant_id = http_request.headers.get('X-Tenant-ID')
  145 + if not tenant_id:
  146 + from urllib.parse import parse_qs
  147 + query_string = http_request.url.query
  148 + if query_string:
  149 + params = parse_qs(query_string)
  150 + tenant_id = params.get('tenant_id', [None])[0]
  151 +
  152 + if not tenant_id:
  153 + raise HTTPException(
  154 + status_code=400,
  155 + detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
  156 + )
  157 +
120 158 # Create request context
121 159 context = create_request_context(reqid=reqid, uid=uid)
122 160  
... ... @@ -126,7 +164,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
126 164 try:
127 165 # Log request start
128 166 context.logger.info(
129   - f"收到图片搜索请求 | 图片URL: {request.image_url} | "
  167 + f"收到图片搜索请求 | Tenant: {tenant_id} | 图片URL: {request.image_url} | "
130 168 f"IP: {http_request.client.host if http_request.client else 'unknown'}",
131 169 extra={'reqid': context.reqid, 'uid': context.uid}
132 170 )
... ... @@ -137,6 +175,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
137 175 # Execute image search
138 176 result = searcher.search_by_image(
139 177 image_url=request.image_url,
  178 + tenant_id=tenant_id,
140 179 size=request.size,
141 180 filters=request.filters,
142 181 range_filters=request.range_filters
... ... @@ -146,12 +185,14 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
146 185 performance_summary = context.get_summary() if context else None
147 186  
148 187 return SearchResponse(
149   - hits=result.hits,
  188 + results=result.results,
150 189 total=result.total,
151 190 max_score=result.max_score,
152 191 took_ms=result.took_ms,
153 192 facets=result.facets,
154 193 query_info=result.query_info,
  194 + suggestions=result.suggestions,
  195 + related_searches=result.related_searches,
155 196 performance_info=performance_summary
156 197 )
157 198  
... ... @@ -226,7 +267,8 @@ async def search_suggestions(
226 267 @router.get("/instant", response_model=SearchResponse)
227 268 async def instant_search(
228 269 q: str = Query(..., min_length=2, description="搜索查询"),
229   - size: int = Query(5, ge=1, le=20, description="结果数量")
  270 + size: int = Query(5, ge=1, le=20, description="结果数量"),
  271 + tenant_id: str = Query(..., description="租户ID")
230 272 ):
231 273 """
232 274 即时搜索(Instant Search)。
... ... @@ -246,17 +288,20 @@ async def instant_search(
246 288  
247 289 result = searcher.search(
248 290 query=q,
  291 + tenant_id=tenant_id,
249 292 size=size,
250 293 from_=0
251 294 )
252 295  
253 296 return SearchResponse(
254   - hits=result.hits,
  297 + results=result.results,
255 298 total=result.total,
256 299 max_score=result.max_score,
257 300 took_ms=result.took_ms,
258 301 facets=result.facets,
259   - query_info=result.query_info
  302 + query_info=result.query_info,
  303 + suggestions=result.suggestions,
  304 + related_searches=result.related_searches
260 305 )
261 306  
262 307  
... ...
config/config_loader.py
... ... @@ -90,9 +90,6 @@ class CustomerConfig:
90 90 customer_id: str
91 91 customer_name: str
92 92  
93   - # Database settings
94   - mysql_config: Dict[str, Any]
95   -
96 93 # Field definitions
97 94 fields: List[FieldConfig]
98 95  
... ... @@ -116,10 +113,6 @@ class CustomerConfig:
116 113  
117 114 # ES index settings
118 115 es_index_name: str
119   -
120   - # Optional fields with defaults
121   - main_table: str = "shoplazza_product_sku"
122   - extension_table: Optional[str] = None
123 116 es_settings: Dict[str, Any] = field(default_factory=dict)
124 117  
125 118  
... ... @@ -272,9 +265,6 @@ class ConfigLoader:
272 265 return CustomerConfig(
273 266 customer_id=customer_id,
274 267 customer_name=config_data.get("customer_name", customer_id),
275   - mysql_config=config_data.get("mysql_config", {}),
276   - main_table=config_data.get("main_table", "shoplazza_product_sku"),
277   - extension_table=config_data.get("extension_table"),
278 268 fields=fields,
279 269 indexes=indexes,
280 270 query_config=query_config,
... ... @@ -310,8 +300,6 @@ class ConfigLoader:
310 300 return FieldConfig(
311 301 name=name,
312 302 field_type=field_type,
313   - source_table=field_data.get("source_table"),
314   - source_column=field_data.get("source_column", name),
315 303 analyzer=analyzer,
316 304 search_analyzer=search_analyzer,
317 305 required=field_data.get("required", False),
... ... @@ -426,15 +414,17 @@ class ConfigLoader:
426 414 if field.embedding_similarity not in ["dot_product", "cosine", "l2_norm"]:
427 415 errors.append(f"Field '{field.name}': invalid embedding_similarity")
428 416  
429   - # Validate MySQL config
430   - if "host" not in config.mysql_config:
431   - errors.append("MySQL configuration missing 'host'")
432   - if "username" not in config.mysql_config:
433   - errors.append("MySQL configuration missing 'username'")
434   - if "password" not in config.mysql_config:
435   - errors.append("MySQL configuration missing 'password'")
436   - if "database" not in config.mysql_config:
437   - errors.append("MySQL configuration missing 'database'")
  417 + # Validate tenant_id field (required)
  418 + tenant_id_field = None
  419 + for field in config.fields:
  420 + if field.name == "tenant_id":
  421 + tenant_id_field = field
  422 + break
  423 +
  424 + if not tenant_id_field:
  425 + errors.append("Required field 'tenant_id' not found in fields")
  426 + elif not tenant_id_field.required:
  427 + errors.append("Field 'tenant_id' must be marked as required")
438 428  
439 429 return errors
440 430  
... ... @@ -457,9 +447,6 @@ class ConfigLoader:
457 447 # Convert config back to dictionary format
458 448 config_dict = {
459 449 "customer_name": config.customer_name,
460   - "mysql_config": config.mysql_config,
461   - "main_table": config.main_table,
462   - "extension_table": config.extension_table,
463 450 "es_index_name": config.es_index_name,
464 451 "es_settings": config.es_settings,
465 452 "fields": [self._field_to_dict(field) for field in config.fields],
... ... @@ -478,6 +465,16 @@ class ConfigLoader:
478 465 "expression": config.ranking.expression,
479 466 "description": config.ranking.description
480 467 },
  468 + "function_score": {
  469 + "score_mode": config.function_score.score_mode,
  470 + "boost_mode": config.function_score.boost_mode,
  471 + "functions": config.function_score.functions
  472 + },
  473 + "rerank": {
  474 + "enabled": config.rerank.enabled,
  475 + "expression": config.rerank.expression,
  476 + "description": config.rerank.description
  477 + },
481 478 "spu_config": {
482 479 "enabled": config.spu_config.enabled,
483 480 "spu_field": config.spu_config.spu_field,
... ... @@ -512,8 +509,6 @@ class ConfigLoader:
512 509 result = {
513 510 "name": field.name,
514 511 "type": field.field_type.value,
515   - "source_table": field.source_table,
516   - "source_column": field.source_column,
517 512 "required": field.required,
518 513 "boost": field.boost,
519 514 "store": field.store,
... ...
config/field_types.py
... ... @@ -54,8 +54,6 @@ class FieldConfig:
54 54 """Configuration for a single field."""
55 55 name: str
56 56 field_type: FieldType
57   - source_table: Optional[str] = None # 'main' or 'extension' or specific table name
58   - source_column: Optional[str] = None
59 57 analyzer: Optional[AnalyzerType] = None
60 58 search_analyzer: Optional[AnalyzerType] = None
61 59 required: bool = False
... ... @@ -172,10 +170,34 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
172 170 }
173 171  
174 172 elif field_config.field_type == FieldType.JSON:
175   - mapping = {
176   - "type": "object",
177   - "enabled": True
178   - }
  173 + if field_config.nested and field_config.nested_properties:
  174 + # Nested type with properties (e.g., variants)
  175 + mapping = {
  176 + "type": "nested",
  177 + "properties": {}
  178 + }
  179 + # Generate mappings for nested properties
  180 + for prop_name, prop_config in field_config.nested_properties.items():
  181 + prop_type = prop_config.get("type", "keyword")
  182 + prop_mapping = {"type": prop_type}
  183 +
  184 + # Add analyzer for text fields
  185 + if prop_type == "text" and "analyzer" in prop_config:
  186 + prop_mapping["analyzer"] = prop_config["analyzer"]
  187 +
  188 + # Add other properties
  189 + if "index" in prop_config:
  190 + prop_mapping["index"] = prop_config["index"]
  191 + if "store" in prop_config:
  192 + prop_mapping["store"] = prop_config["store"]
  193 +
  194 + mapping["properties"][prop_name] = prop_mapping
  195 + else:
  196 + # Simple object type
  197 + mapping = {
  198 + "type": "object",
  199 + "enabled": True
  200 + }
179 201  
180 202 return mapping
181 203  
... ...
config/schema/base/config.yaml
... ... @@ -0,0 +1,271 @@
  1 +# Base Configuration for Shoplazza
  2 +# 店匠通用配置文件,所有使用店匠表的客户共用
  3 +# 注意:此配置不包含MySQL相关配置,只包含ES搜索相关配置
  4 +
  5 +customer_name: "Shoplazza Base Configuration"
  6 +
  7 +# Elasticsearch Index
  8 +es_index_name: "search_products"
  9 +
  10 +# ES Index Settings
  11 +es_settings:
  12 + number_of_shards: 1
  13 + number_of_replicas: 0
  14 + refresh_interval: "30s"
  15 +
  16 +# Field Definitions (SPU级别,只包含对搜索有帮助的字段)
  17 +fields:
  18 + # 租户隔离字段(必需)
  19 + - name: "tenant_id"
  20 + type: "KEYWORD"
  21 + required: true
  22 + index: true
  23 + store: true
  24 +
  25 + # 商品标识字段
  26 + - name: "product_id"
  27 + type: "KEYWORD"
  28 + required: true
  29 + index: true
  30 + store: true
  31 +
  32 + - name: "handle"
  33 + type: "KEYWORD"
  34 + index: true
  35 + store: true
  36 +
  37 + # 文本搜索字段
  38 + - name: "title"
  39 + type: "TEXT"
  40 + analyzer: "chinese_ecommerce"
  41 + boost: 3.0
  42 + index: true
  43 + store: true
  44 +
  45 + - name: "brief"
  46 + type: "TEXT"
  47 + analyzer: "chinese_ecommerce"
  48 + boost: 1.5
  49 + index: true
  50 + store: true
  51 +
  52 + - name: "description"
  53 + type: "TEXT"
  54 + analyzer: "chinese_ecommerce"
  55 + boost: 1.0
  56 + index: true
  57 + store: true
  58 +
  59 + # SEO字段(提升相关性)
  60 + - name: "seo_title"
  61 + type: "TEXT"
  62 + analyzer: "chinese_ecommerce"
  63 + boost: 2.0
  64 + index: true
  65 + store: true
  66 +
  67 + - name: "seo_description"
  68 + type: "TEXT"
  69 + analyzer: "chinese_ecommerce"
  70 + boost: 1.5
  71 + index: true
  72 + store: true
  73 +
  74 + - name: "seo_keywords"
  75 + type: "TEXT"
  76 + analyzer: "chinese_ecommerce"
  77 + boost: 2.0
  78 + index: true
  79 + store: true
  80 +
  81 + # 分类和标签字段(TEXT + KEYWORD双重索引)
  82 + - name: "vendor"
  83 + type: "TEXT"
  84 + analyzer: "chinese_ecommerce"
  85 + boost: 1.5
  86 + index: true
  87 + store: true
  88 +
  89 + - name: "vendor_keyword"
  90 + type: "KEYWORD"
  91 + index: true
  92 + store: false
  93 +
  94 + - name: "product_type"
  95 + type: "TEXT"
  96 + analyzer: "chinese_ecommerce"
  97 + boost: 1.5
  98 + index: true
  99 + store: true
  100 +
  101 + - name: "product_type_keyword"
  102 + type: "KEYWORD"
  103 + index: true
  104 + store: false
  105 +
  106 + - name: "tags"
  107 + type: "TEXT"
  108 + analyzer: "chinese_ecommerce"
  109 + boost: 1.0
  110 + index: true
  111 + store: true
  112 +
  113 + - name: "tags_keyword"
  114 + type: "KEYWORD"
  115 + index: true
  116 + store: false
  117 +
  118 + - name: "category"
  119 + type: "TEXT"
  120 + analyzer: "chinese_ecommerce"
  121 + boost: 1.5
  122 + index: true
  123 + store: true
  124 +
  125 + - name: "category_keyword"
  126 + type: "KEYWORD"
  127 + index: true
  128 + store: false
  129 +
  130 + # 价格字段(扁平化)
  131 + - name: "min_price"
  132 + type: "FLOAT"
  133 + index: true
  134 + store: true
  135 +
  136 + - name: "max_price"
  137 + type: "FLOAT"
  138 + index: true
  139 + store: true
  140 +
  141 + - name: "compare_at_price"
  142 + type: "FLOAT"
  143 + index: true
  144 + store: true
  145 +
  146 + # 图片字段(用于显示,不参与搜索)
  147 + - name: "image_url"
  148 + type: "KEYWORD"
  149 + index: false
  150 + store: true
  151 +
  152 + # 嵌套variants字段
  153 + - name: "variants"
  154 + type: "JSON"
  155 + nested: true
  156 + nested_properties:
  157 + variant_id:
  158 + type: "keyword"
  159 + index: true
  160 + store: true
  161 + title:
  162 + type: "text"
  163 + analyzer: "chinese_ecommerce"
  164 + index: true
  165 + store: true
  166 + price:
  167 + type: "float"
  168 + index: true
  169 + store: true
  170 + compare_at_price:
  171 + type: "float"
  172 + index: true
  173 + store: true
  174 + sku:
  175 + type: "keyword"
  176 + index: true
  177 + store: true
  178 + stock:
  179 + type: "long"
  180 + index: true
  181 + store: true
  182 + options:
  183 + type: "object"
  184 + enabled: true
  185 +
  186 +# Index Structure (Query Domains)
  187 +indexes:
  188 + - name: "default"
  189 + label: "默认索引"
  190 + fields:
  191 + - "title"
  192 + - "brief"
  193 + - "description"
  194 + - "seo_title"
  195 + - "seo_description"
  196 + - "seo_keywords"
  197 + - "vendor"
  198 + - "product_type"
  199 + - "tags"
  200 + - "category"
  201 + analyzer: "chinese_ecommerce"
  202 + boost: 1.0
  203 +
  204 + - name: "title"
  205 + label: "标题索引"
  206 + fields:
  207 + - "title"
  208 + - "seo_title"
  209 + analyzer: "chinese_ecommerce"
  210 + boost: 2.0
  211 +
  212 + - name: "vendor"
  213 + label: "品牌索引"
  214 + fields:
  215 + - "vendor"
  216 + analyzer: "chinese_ecommerce"
  217 + boost: 1.5
  218 +
  219 + - name: "category"
  220 + label: "类目索引"
  221 + fields:
  222 + - "category"
  223 + analyzer: "chinese_ecommerce"
  224 + boost: 1.5
  225 +
  226 + - name: "tags"
  227 + label: "标签索引"
  228 + fields:
  229 + - "tags"
  230 + - "seo_keywords"
  231 + analyzer: "chinese_ecommerce"
  232 + boost: 1.0
  233 +
  234 +# Query Configuration
  235 +query_config:
  236 + supported_languages:
  237 + - "zh"
  238 + - "en"
  239 + default_language: "zh"
  240 + enable_translation: true
  241 + enable_text_embedding: true
  242 + enable_query_rewrite: true
  243 +
  244 + # Translation API (DeepL)
  245 + translation_service: "deepl"
  246 + translation_api_key: null # Set via environment variable
  247 +
  248 +# Ranking Configuration
  249 +ranking:
  250 + expression: "bm25() + 0.2*text_embedding_relevance()"
  251 + description: "BM25 text relevance combined with semantic embedding similarity"
  252 +
  253 +# Function Score配置(ES层打分规则)
  254 +function_score:
  255 + score_mode: "sum"
  256 + boost_mode: "multiply"
  257 +
  258 + functions: []
  259 +
  260 +# Rerank配置(本地重排,当前禁用)
  261 +rerank:
  262 + enabled: false
  263 + expression: ""
  264 + description: "Local reranking (disabled, use ES function_score instead)"
  265 +
  266 +# SPU配置(已启用,使用嵌套variants)
  267 +spu_config:
  268 + enabled: true
  269 + spu_field: "product_id"
  270 + inner_hits_size: 10
  271 +
... ...
docs/BASE_CONFIG_GUIDE.md 0 → 100644
... ... @@ -0,0 +1,257 @@
  1 +# Base Configuration Guide
  2 +
  3 +店匠通用配置(Base Configuration)使用指南
  4 +
  5 +## 概述
  6 +
  7 +Base配置是店匠(Shoplazza)通用配置,适用于所有使用店匠标准表的客户。该配置采用SPU级别的索引结构,所有客户共享同一个Elasticsearch索引(`search_products`),通过`tenant_id`字段实现数据隔离。
  8 +
  9 +## 核心特性
  10 +
  11 +- **SPU级别索引**:每个ES文档代表一个SPU,包含嵌套的variants数组
  12 +- **统一索引**:所有客户共享`search_products`索引
  13 +- **租户隔离**:通过`tenant_id`字段实现数据隔离
  14 +- **配置简化**:配置只包含ES搜索相关配置,不包含MySQL数据源配置
  15 +- **外部友好格式**:API返回格式不包含ES内部字段(`_id`, `_score`, `_source`)
  16 +
  17 +## 配置说明
  18 +
  19 +### 配置文件位置
  20 +
  21 +`config/schema/base/config.yaml`
  22 +
  23 +### 配置内容
  24 +
  25 +Base配置**不包含**以下内容:
  26 +- `mysql_config` - MySQL数据库配置
  27 +- `main_table` - 主表配置
  28 +- `extension_table` - 扩展表配置
  29 +- `source_table` / `source_column` - 字段数据源映射
  30 +
  31 +Base配置**只包含**:
  32 +- ES字段定义(字段类型、分析器、boost等)
  33 +- 查询域(indexes)配置
  34 +- 查询处理配置(query_config)
  35 +- 排序和打分配置(function_score)
  36 +- SPU配置(spu_config)
  37 +
  38 +### 必需字段
  39 +
  40 +- `tenant_id` (KEYWORD, required) - 租户隔离字段
  41 +
  42 +### 主要字段
  43 +
  44 +- `product_id` - 商品ID
  45 +- `title`, `brief`, `description` - 文本搜索字段
  46 +- `seo_title`, `seo_description`, `seo_keywords` - SEO字段
  47 +- `vendor`, `product_type`, `tags`, `category` - 分类和标签字段
  48 +- `min_price`, `max_price`, `compare_at_price` - 价格字段
  49 +- `variants` (nested) - 嵌套变体数组
  50 +
  51 +## 数据导入流程
  52 +
  53 +### 1. 生成测试数据
  54 +
  55 +```bash
  56 +python scripts/generate_test_data.py \
  57 + --num-spus 100 \
  58 + --tenant-id "1" \
  59 + --start-spu-id 1 \
  60 + --start-sku-id 1 \
  61 + --output test_data.sql
  62 +```
  63 +
  64 +### 2. 导入测试数据到MySQL
  65 +
  66 +```bash
  67 +python scripts/import_test_data.py \
  68 + --db-host localhost \
  69 + --db-port 3306 \
  70 + --db-database saas \
  71 + --db-username root \
  72 + --db-password password \
  73 + --sql-file test_data.sql \
  74 + --tenant-id "1"
  75 +```
  76 +
  77 +### 3. 导入数据到Elasticsearch
  78 +
  79 +```bash
  80 +python scripts/ingest_shoplazza.py \
  81 + --db-host localhost \
  82 + --db-port 3306 \
  83 + --db-database saas \
  84 + --db-username root \
  85 + --db-password password \
  86 + --tenant-id "1" \
  87 + --config base \
  88 + --es-host http://localhost:9200 \
  89 + --recreate \
  90 + --batch-size 500
  91 +```
  92 +
  93 +## API使用
  94 +
  95 +### 搜索接口
  96 +
  97 +**端点**: `POST /search/`
  98 +
  99 +**请求头**:
  100 +```
  101 +X-Tenant-ID: 1
  102 +Content-Type: application/json
  103 +```
  104 +
  105 +**请求体**:
  106 +```json
  107 +{
  108 + "query": "耳机",
  109 + "size": 10,
  110 + "from": 0,
  111 + "filters": {
  112 + "category_keyword": "电子产品"
  113 + },
  114 + "facets": ["category_keyword", "vendor_keyword"]
  115 +}
  116 +```
  117 +
  118 +**响应格式**:
  119 +```json
  120 +{
  121 + "results": [
  122 + {
  123 + "product_id": "1",
  124 + "title": "蓝牙耳机 Sony",
  125 + "handle": "product-1",
  126 + "description": "高品质无线蓝牙耳机",
  127 + "vendor": "Sony",
  128 + "product_type": "电子产品",
  129 + "price": 199.99,
  130 + "compare_at_price": 299.99,
  131 + "currency": "USD",
  132 + "image_url": "//cdn.example.com/products/1.jpg",
  133 + "in_stock": true,
  134 + "variants": [
  135 + {
  136 + "variant_id": "1",
  137 + "title": "黑色",
  138 + "price": 199.99,
  139 + "compare_at_price": 299.99,
  140 + "sku": "SKU-1-1",
  141 + "stock": 50,
  142 + "options": {
  143 + "option1": "黑色"
  144 + }
  145 + }
  146 + ],
  147 + "relevance_score": 0.95
  148 + }
  149 + ],
  150 + "total": 10,
  151 + "max_score": 1.0,
  152 + "facets": [
  153 + {
  154 + "field": "category_keyword",
  155 + "label": "category_keyword",
  156 + "type": "terms",
  157 + "values": [
  158 + {
  159 + "value": "电子产品",
  160 + "label": "电子产品",
  161 + "count": 5,
  162 + "selected": false
  163 + }
  164 + ]
  165 + }
  166 + ],
  167 + "suggestions": [],
  168 + "related_searches": [],
  169 + "took_ms": 15,
  170 + "query_info": {}
  171 +}
  172 +```
  173 +
  174 +### 响应格式说明
  175 +
  176 +#### 主要变化
  177 +
  178 +1. **`results`替代`hits`**:返回字段从`hits`改为`results`
  179 +2. **结构化结果**:每个结果包含`product_id`, `title`, `variants`, `relevance_score`等字段
  180 +3. **无ES内部字段**:不包含`_id`, `_score`, `_source`等ES内部字段
  181 +4. **嵌套variants**:每个商品包含variants数组,每个variant包含完整的变体信息
  182 +5. **相关性分数**:`relevance_score`是0-1之间的归一化分数
  183 +
  184 +#### ProductResult字段
  185 +
  186 +- `product_id` - 商品ID
  187 +- `title` - 商品标题
  188 +- `handle` - 商品handle
  189 +- `description` - 商品描述
  190 +- `vendor` - 供应商/品牌
  191 +- `product_type` - 商品类型
  192 +- `tags` - 标签
  193 +- `price` - 最低价格(min_price)
  194 +- `compare_at_price` - 原价
  195 +- `currency` - 货币单位(默认USD)
  196 +- `image_url` - 主图URL
  197 +- `in_stock` - 是否有库存
  198 +- `variants` - 变体列表
  199 +- `relevance_score` - 相关性分数(0-1)
  200 +
  201 +#### VariantResult字段
  202 +
  203 +- `variant_id` - 变体ID
  204 +- `title` - 变体标题
  205 +- `price` - 价格
  206 +- `compare_at_price` - 原价
  207 +- `sku` - SKU编码
  208 +- `stock` - 库存数量
  209 +- `options` - 选项(颜色、尺寸等)
  210 +
  211 +## 测试
  212 +
  213 +### 运行测试脚本
  214 +
  215 +```bash
  216 +python scripts/test_base.py \
  217 + --api-url http://localhost:8000 \
  218 + --tenant-id "1" \
  219 + --test-tenant-2 "2"
  220 +```
  221 +
  222 +### 测试内容
  223 +
  224 +1. **基本搜索**:测试搜索API基本功能
  225 +2. **响应格式验证**:验证返回格式是否符合要求
  226 +3. **Facets聚合**:测试分面搜索功能
  227 +4. **租户隔离**:验证不同租户的数据隔离
  228 +
  229 +## 常见问题
  230 +
  231 +### Q: 为什么配置中没有MySQL相关配置?
  232 +
  233 +A: 数据源配置和数据导入流程是写死的脚本,不在搜索配置中。搜索配置只关注ES搜索相关的内容。
  234 +
  235 +### Q: 如何为新的租户导入数据?
  236 +
  237 +A: 使用`ingest_shoplazza.py`脚本,指定不同的`--tenant-id`参数即可。
  238 +
  239 +### Q: 如何验证租户隔离是否生效?
  240 +
  241 +A: 使用`test_base.py`脚本,指定两个不同的`--tenant-id`,检查搜索结果是否隔离。
  242 +
  243 +### Q: API返回格式中为什么没有`_id`和`_score`?
  244 +
  245 +A: 为了提供外部友好的API格式,我们移除了ES内部字段,使用`product_id`和`relevance_score`替代。
  246 +
  247 +### Q: 如何添加新的搜索字段?
  248 +
  249 +A: 在`config/schema/base/config.yaml`中添加字段定义,然后重新生成索引映射并重新导入数据。
  250 +
  251 +## 注意事项
  252 +
  253 +1. **tenant_id必需**:所有API请求必须提供`tenant_id`(通过请求头`X-Tenant-ID`或查询参数`tenant_id`)
  254 +2. **索引共享**:所有客户共享`search_products`索引,确保`tenant_id`字段正确设置
  255 +3. **数据导入**:数据导入脚本是写死的,不依赖配置中的MySQL设置
  256 +4. **配置分离**:搜索配置和数据源配置完全分离,提高可维护性
  257 +
... ...
indexer/spu_transformer.py 0 → 100644
... ... @@ -0,0 +1,288 @@
  1 +"""
  2 +SPU data transformer for Shoplazza products.
  3 +
  4 +Transforms SPU and SKU data from MySQL into SPU-level ES documents with nested variants.
  5 +"""
  6 +
  7 +import pandas as pd
  8 +import numpy as np
  9 +from typing import Dict, Any, List, Optional
  10 +from sqlalchemy import create_engine, text
  11 +from utils.db_connector import create_db_connection
  12 +
  13 +
class SPUTransformer:
    """Transform Shoplazza SPU and SKU rows into SPU-level ES documents.

    One document is emitted per SPU; its SKUs become a nested ``variants``
    array, and the SPU-level price fields (``min_price``, ``max_price``,
    ``compare_at_price``) are derived from the variant prices.
    """

    def __init__(
        self,
        db_engine: Any,
        tenant_id: str
    ):
        """
        Initialize SPU transformer.

        Args:
            db_engine: SQLAlchemy database engine
            tenant_id: Tenant ID for filtering data
        """
        self.db_engine = db_engine
        self.tenant_id = tenant_id

    def load_spu_data(self) -> pd.DataFrame:
        """
        Load SPU data from MySQL (non-deleted rows for this tenant only).

        Returns:
            DataFrame with SPU data
        """
        query = text("""
            SELECT
                id, shop_id, shoplazza_id, handle, title, brief, description,
                spu, vendor, vendor_url, seo_title, seo_description, seo_keywords,
                image_src, image_width, image_height, image_path, image_alt,
                tags, note, category,
                shoplazza_created_at, shoplazza_updated_at, tenant_id,
                creator, create_time, updater, update_time, deleted
            FROM shoplazza_product_spu
            WHERE tenant_id = :tenant_id AND deleted = 0
        """)

        with self.db_engine.connect() as conn:
            df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})

        return df

    def load_sku_data(self) -> pd.DataFrame:
        """
        Load SKU data from MySQL (non-deleted rows for this tenant only).

        Returns:
            DataFrame with SKU data
        """
        query = text("""
            SELECT
                id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
                shoplazza_image_id, title, sku, barcode, position,
                price, compare_at_price, cost_price,
                option1, option2, option3,
                inventory_quantity, weight, weight_unit, image_src,
                wholesale_price, note, extend,
                shoplazza_created_at, shoplazza_updated_at, tenant_id,
                creator, create_time, updater, update_time, deleted
            FROM shoplazza_product_sku
            WHERE tenant_id = :tenant_id AND deleted = 0
        """)

        with self.db_engine.connect() as conn:
            df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})

        return df

    def transform_batch(self) -> List[Dict[str, Any]]:
        """
        Transform SPU and SKU data into ES documents.

        Returns:
            List of SPU-level ES documents (empty when the tenant has no SPUs)
        """
        spu_df = self.load_spu_data()
        sku_df = self.load_sku_data()

        if spu_df.empty:
            return []

        # Group SKUs once so each SPU's variants can be looked up directly.
        sku_groups = sku_df.groupby('spu_id')

        documents = []
        for _, spu_row in spu_df.iterrows():
            spu_id = spu_row['id']

            # SPUs with no SKUs get an empty variants frame.
            skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame()

            doc = self._transform_spu_to_doc(spu_row, skus)
            if doc:
                documents.append(doc)

        return documents

    def _transform_spu_to_doc(
        self,
        spu_row: pd.Series,
        skus: pd.DataFrame
    ) -> Optional[Dict[str, Any]]:
        """
        Transform a single SPU row and its SKUs into an ES document.

        Args:
            spu_row: SPU row from database
            skus: DataFrame with SKUs for this SPU (may be empty)

        Returns:
            ES document or None if transformation fails
        """
        doc: Dict[str, Any] = {
            # Tenant ID is required: it drives index-level tenant isolation.
            'tenant_id': str(self.tenant_id),
            'product_id': str(spu_row['id']),
        }

        # Plain text fields copied through only when present.
        for field in ('handle', 'title', 'brief', 'description',
                      'seo_title', 'seo_description', 'seo_keywords'):
            if pd.notna(spu_row.get(field)):
                doc[field] = str(spu_row[field])

        # Vendor gets an extra keyword copy for exact-match filters/facets.
        if pd.notna(spu_row.get('vendor')):
            doc['vendor'] = str(spu_row['vendor'])
            doc['vendor_keyword'] = str(spu_row['vendor'])

        # Category doubles as product_type in this schema; both variants get
        # keyword copies (was two separate, duplicated branches).
        if pd.notna(spu_row.get('category')):
            category = str(spu_row['category'])
            doc['product_type'] = category
            doc['product_type_keyword'] = category
            doc['category'] = category
            doc['category_keyword'] = category

        # Tags (raw string plus keyword copy).
        if pd.notna(spu_row.get('tags')):
            tags_str = str(spu_row['tags'])
            doc['tags'] = tags_str
            doc['tags_keyword'] = tags_str

        # Image URL: normalize bare hosts to protocol-relative form.
        # NOTE: the previous check was inverted — it prepended '//' only to
        # URLs that ALREADY started with '//' (doubling the prefix) and left
        # bare hosts untouched.
        if pd.notna(spu_row.get('image_src')):
            image_src = str(spu_row['image_src'])
            if not image_src.startswith(('http', '//')):
                image_src = f"//{image_src}"
            doc['image_url'] = image_src

        # Nested variants; _transform_sku_to_variant guarantees the
        # price / compare_at_price keys exist (value may be None).
        variants = []
        for _, sku_row in skus.iterrows():
            variant = self._transform_sku_to_variant(sku_row)
            if variant:
                variants.append(variant)
        doc['variants'] = variants

        prices = [v['price'] for v in variants if v['price'] is not None]
        compare_prices = [v['compare_at_price'] for v in variants
                          if v['compare_at_price'] is not None]

        # SPU-level price range; 0.0 when no variant has a usable price.
        doc['min_price'] = min(prices) if prices else 0.0
        doc['max_price'] = max(prices) if prices else 0.0
        doc['compare_at_price'] = max(compare_prices) if compare_prices else None

        return doc

    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Best-effort float conversion; None for missing/NaN/invalid values."""
        if pd.isna(value):
            return None
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _transform_sku_to_variant(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]:
        """
        Transform a SKU row into a variant object.

        Args:
            sku_row: SKU row from database

        Returns:
            Variant dictionary or None
        """
        variant: Dict[str, Any] = {
            'variant_id': str(sku_row['id']),
            'price': self._to_float(sku_row.get('price')),
            'compare_at_price': self._to_float(sku_row.get('compare_at_price')),
        }

        if pd.notna(sku_row.get('title')):
            variant['title'] = str(sku_row['title'])

        if pd.notna(sku_row.get('sku')):
            variant['sku'] = str(sku_row['sku'])

        # Stock defaults to 0 when missing or unparseable.
        stock = sku_row.get('inventory_quantity')
        if pd.notna(stock):
            try:
                variant['stock'] = int(stock)
            except (ValueError, TypeError):
                variant['stock'] = 0
        else:
            variant['stock'] = 0

        # Collect option1..option3 (color, size, ...) only when set.
        options = {
            key: str(sku_row[key])
            for key in ('option1', 'option2', 'option3')
            if pd.notna(sku_row.get(key))
        }
        if options:
            variant['options'] = options

        return variant
  288 +
... ...
scripts/generate_test_data.py 0 → 100644
... ... @@ -0,0 +1,325 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Generate test data for Shoplazza SPU and SKU tables.
  4 +
  5 +Generates 100 SPU records with 1-5 SKU variants each.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import random
  11 +import argparse
  12 +from pathlib import Path
  13 +from datetime import datetime, timedelta
  14 +
  15 +# Add parent directory to path
  16 +sys.path.insert(0, str(Path(__file__).parent.parent))
  17 +
  18 +
def generate_spu_data(num_spus: int = 100, tenant_id: str = "1", start_id: int = 1):
    """
    Generate SPU test data.

    Args:
        num_spus: Number of SPUs to generate
        tenant_id: Tenant ID
        start_id: Starting ID for SPUs

    Returns:
        List of SPU data dictionaries
    """
    categories = ["电子产品", "服装", "家居用品", "美妆", "食品", "运动用品", "图书", "玩具"]
    vendors = ["Sony", "Nike", "Apple", "Samsung", "华为", "小米", "美的", "海尔"]

    products = [
        ("蓝牙耳机", "Bluetooth Headphone", "高品质无线蓝牙耳机", "High-quality wireless Bluetooth headphone"),
        ("运动鞋", "Running Shoes", "舒适透气的运动鞋", "Comfortable and breathable running shoes"),
        ("智能手机", "Smartphone", "高性能智能手机", "High-performance smartphone"),
        ("笔记本电脑", "Laptop", "轻薄便携笔记本电脑", "Lightweight and portable laptop"),
        ("智能手表", "Smart Watch", "多功能智能手表", "Multi-function smart watch"),
        ("平板电脑", "Tablet", "高清平板电脑", "High-definition tablet"),
        ("无线鼠标", "Wireless Mouse", "人体工学无线鼠标", "Ergonomic wireless mouse"),
        ("机械键盘", "Mechanical Keyboard", "RGB背光机械键盘", "RGB backlit mechanical keyboard"),
        ("显示器", "Monitor", "4K高清显示器", "4K high-definition monitor"),
        ("音响", "Speaker", "蓝牙无线音响", "Bluetooth wireless speaker"),
    ]

    records = []
    for offset in range(num_spus):
        record_id = start_id + offset
        name_zh, _name_en, brief_zh, brief_en = random.choice(products)
        category = random.choice(categories)
        brand = random.choice(vendors)

        display_title = f"{name_zh} {brand}"

        # Timestamps: created sometime in the past year, updated shortly after.
        created_at = datetime.now() - timedelta(days=random.randint(1, 365))
        updated_at = created_at + timedelta(days=random.randint(0, 30))
        created_str = created_at.strftime('%Y-%m-%d %H:%M:%S')
        updated_str = updated_at.strftime('%Y-%m-%d %H:%M:%S')

        records.append({
            'id': record_id,
            'shop_id': 1,
            'shoplazza_id': f"spu-{record_id}",
            'handle': f"product-{record_id}",
            'title': display_title,
            'brief': brief_zh,
            'description': f"<p>{brief_zh},来自{brand}品牌。{brief_en}</p>",
            'spu': '',
            'vendor': brand,
            'vendor_url': f"https://{brand.lower()}.com",
            'seo_title': f"{display_title} - {category}",
            'seo_description': f"购买{brand}{name_zh},{brief_zh}",
            'seo_keywords': f"{name_zh},{brand},{category}",
            'image_src': f"//cdn.example.com/products/{record_id}.jpg",
            'image_width': 800,
            'image_height': 600,
            'image_path': f"products/{record_id}.jpg",
            'image_alt': display_title,
            'inventory_policy': '',
            'inventory_quantity': 0,
            'inventory_tracking': '0',
            'published': 1,
            'published_at': created_str,
            'requires_shipping': 1,
            'taxable': 0,
            'fake_sales': 0,
            'display_fake_sales': 0,
            'mixed_wholesale': 0,
            'need_variant_image': 0,
            'has_only_default_variant': 0,
            'tags': f"{category},{brand},{name_zh}",
            'note': '',
            'category': category,
            'shoplazza_created_at': created_str,
            'shoplazza_updated_at': updated_str,
            'tenant_id': tenant_id,
            'creator': '1',
            'create_time': created_str,
            'updater': '1',
            'update_time': updated_str,
            'deleted': 0
        })

    return records
  127 +
  128 +
def generate_sku_data(spus: list, start_sku_id: int = 1):
    """
    Generate SKU test data for SPUs.

    Args:
        spus: List of SPU data
        start_sku_id: Starting ID for SKUs

    Returns:
        List of SKU data dictionaries
    """
    colors = ["黑色", "白色", "红色", "蓝色", "绿色", "灰色"]
    sizes = ["S", "M", "L", "XL", "XXL"]

    records = []
    next_id = start_sku_id

    for spu in spus:
        parent_id = spu['id']
        variant_count = random.randint(1, 5)
        base_price = random.uniform(50, 500)

        for pos in range(1, variant_count + 1):
            # Options only appear once a product has multiple variants.
            color_opt = random.choice(colors) if variant_count > 1 else None
            size_opt = random.choice(sizes) if variant_count > 2 else None

            variant_title = " / ".join(p for p in (color_opt, size_opt) if p)

            unit_price = base_price + random.uniform(-20, 50)
            strike_price = unit_price * random.uniform(1.2, 1.5)
            on_hand = random.randint(0, 100)

            created_at = datetime.now() - timedelta(days=random.randint(1, 365))
            updated_at = created_at + timedelta(days=random.randint(0, 30))
            created_str = created_at.strftime('%Y-%m-%d %H:%M:%S')
            updated_str = updated_at.strftime('%Y-%m-%d %H:%M:%S')

            records.append({
                'id': next_id,
                'spu_id': parent_id,
                'shop_id': 1,
                'shoplazza_id': f"sku-{next_id}",
                'shoplazza_product_id': spu['shoplazza_id'],
                'shoplazza_image_id': '',
                'title': variant_title,
                'sku': f"SKU-{parent_id}-{pos}",
                'barcode': f"BAR{next_id:08d}",
                'position': pos,
                'price': round(unit_price, 2),
                'compare_at_price': round(strike_price, 2),
                'cost_price': round(unit_price * 0.6, 2),
                'option1': color_opt or '',
                'option2': size_opt or '',
                'option3': '',
                'inventory_quantity': on_hand,
                'weight': round(random.uniform(0.1, 5.0), 2),
                'weight_unit': 'kg',
                'image_src': '',
                'wholesale_price': '[{"price": ' + str(round(unit_price * 0.8, 2)) + ', "minQuantity": 10}]',
                'note': '',
                'extend': '',
                'shoplazza_created_at': created_str,
                'shoplazza_updated_at': updated_str,
                'tenant_id': spu['tenant_id'],
                'creator': '1',
                'create_time': created_str,
                'updater': '1',
                'update_time': updated_str,
                'deleted': 0
            })
            next_id += 1

    return records
  217 +
  218 +
def _sql_literal(value) -> str:
    """Render *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so generated text (titles,
    descriptions, ...) cannot break out of the literal.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Ordered (column, is_quoted) specs shared by header and row rendering.
_SPU_COLUMNS = [
    ('id', False), ('shop_id', False), ('shoplazza_id', True), ('handle', True),
    ('title', True), ('brief', True), ('description', True), ('spu', True),
    ('vendor', True), ('vendor_url', True), ('seo_title', True),
    ('seo_description', True), ('seo_keywords', True), ('image_src', True),
    ('image_width', False), ('image_height', False), ('image_path', True),
    ('image_alt', True), ('inventory_policy', True), ('inventory_quantity', False),
    ('inventory_tracking', True), ('published', False), ('published_at', True),
    ('requires_shipping', False), ('taxable', False), ('fake_sales', False),
    ('display_fake_sales', False), ('mixed_wholesale', False),
    ('need_variant_image', False), ('has_only_default_variant', False),
    ('tags', True), ('note', True), ('category', True),
    ('shoplazza_created_at', True), ('shoplazza_updated_at', True),
    ('tenant_id', True), ('creator', True), ('create_time', True),
    ('updater', True), ('update_time', True), ('deleted', False),
]

_SKU_COLUMNS = [
    ('id', False), ('spu_id', False), ('shop_id', False), ('shoplazza_id', True),
    ('shoplazza_product_id', True), ('shoplazza_image_id', True), ('title', True),
    ('sku', True), ('barcode', True), ('position', False), ('price', False),
    ('compare_at_price', False), ('cost_price', False), ('option1', True),
    ('option2', True), ('option3', True), ('inventory_quantity', False),
    ('weight', False), ('weight_unit', True), ('image_src', True),
    ('wholesale_price', True), ('note', True), ('extend', True),
    ('shoplazza_created_at', True), ('shoplazza_updated_at', True),
    ('tenant_id', True), ('creator', True), ('create_time', True),
    ('updater', True), ('update_time', True), ('deleted', False),
]


def _write_insert(f, table: str, columns, rows) -> None:
    """Write one multi-row INSERT for *rows*; no-op when rows is empty."""
    if not rows:
        return
    f.write(f"INSERT INTO {table} (\n")
    f.write("    " + ", ".join(name for name, _ in columns) + "\n")
    f.write(") VALUES\n")
    rendered = [
        "(" + ", ".join(
            _sql_literal(row[name]) if quoted else str(row[name])
            for name, quoted in columns
        ) + ")"
        for row in rows
    ]
    f.write(",\n".join(rendered) + ";\n")


def generate_sql_inserts(spus: list, skus: list, output_file: str):
    """
    Generate SQL INSERT statements.

    String values are escaped (single quotes doubled), and an INSERT is only
    emitted for a table that actually has rows — the previous version wrote
    unescaped values and produced a dangling ``VALUES`` clause for empty
    input lists.

    Args:
        spus: List of SPU data
        skus: List of SKU data
        output_file: Output file path
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("-- SPU Test Data\n")
        _write_insert(f, "shoplazza_product_spu", _SPU_COLUMNS, spus)
        f.write("\n-- SKU Test Data\n")
        _write_insert(f, "shoplazza_product_sku", _SKU_COLUMNS, skus)
  295 +
  296 +
def main():
    """CLI entry point: generate SPU/SKU test rows and emit them as a SQL file."""
    parser = argparse.ArgumentParser(description='Generate test data for Shoplazza tables')
    for flag, options in (
        ('--num-spus', {'type': int, 'default': 100, 'help': 'Number of SPUs to generate'}),
        ('--tenant-id', {'default': '1', 'help': 'Tenant ID'}),
        ('--start-spu-id', {'type': int, 'default': 1, 'help': 'Starting SPU ID'}),
        ('--start-sku-id', {'type': int, 'default': 1, 'help': 'Starting SKU ID'}),
        ('--output', {'default': 'test_data.sql', 'help': 'Output SQL file'}),
    ):
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    print(f"Generating {opts.num_spus} SPUs with variants...")

    spus = generate_spu_data(opts.num_spus, opts.tenant_id, opts.start_spu_id)
    print(f"Generated {len(spus)} SPUs")

    skus = generate_sku_data(spus, opts.start_sku_id)
    print(f"Generated {len(skus)} SKUs")

    generate_sql_inserts(spus, skus, opts.output)
    print(f"SQL file generated: {opts.output}")
  321 +
# Script entry point (a stray, unused `import json` was removed here).
if __name__ == '__main__':
    main()
  325 +
... ...
scripts/import_test_data.py 0 → 100644
... ... @@ -0,0 +1,132 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Import test data into MySQL Shoplazza tables.
  4 +
  5 +Reads SQL file generated by generate_test_data.py and imports into MySQL.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +from pathlib import Path
  12 +
  13 +# Add parent directory to path
  14 +sys.path.insert(0, str(Path(__file__).parent.parent))
  15 +
  16 +from utils.db_connector import create_db_connection, test_connection
  17 +
  18 +
def import_sql_file(db_engine, sql_file: str):
    """
    Import SQL file into database.

    Full-line ``--`` comments are stripped *before* the content is split on
    ``;``. The previous filter dropped any chunk that merely *started* with a
    comment header, which silently skipped the first INSERT of files produced
    by generate_test_data.py (they begin with "-- SPU Test Data").

    Args:
        db_engine: SQLAlchemy database engine
        sql_file: Path to SQL file

    Raises:
        Exception: re-raises the first statement that fails to execute.
    """
    # Local import: this module has no top-level sqlalchemy import.
    from sqlalchemy import text

    with open(sql_file, 'r', encoding='utf-8') as f:
        sql_content = f.read()

    # Drop comment-only lines so "-- header\nINSERT ..." chunks survive.
    cleaned = "\n".join(
        line for line in sql_content.splitlines()
        if not line.lstrip().startswith('--')
    )

    # Split by semicolons to get individual statements
    statements = [s.strip() for s in cleaned.split(';') if s.strip()]

    print(f"Executing {len(statements)} SQL statements...")

    with db_engine.connect() as conn:
        for i, statement in enumerate(statements, 1):
            try:
                # text() is required by SQLAlchemy 2.x (raw strings are no
                # longer accepted by Connection.execute) and matches the
                # style already used in verify_import().
                conn.execute(text(statement))
                conn.commit()
                print(f"  [{i}/{len(statements)}] Executed successfully")
            except Exception as e:
                print(f"  [{i}/{len(statements)}] ERROR: {e}")
                print(f"  Statement: {statement[:100]}...")
                raise
  46 +
  47 +
def verify_import(db_engine, tenant_id: str):
    """
    Verify imported data by counting the rows each table holds for a tenant.

    Args:
        db_engine: SQLAlchemy database engine
        tenant_id: Tenant ID to verify

    Returns:
        Tuple of (spu_count, sku_count)
    """
    from sqlalchemy import text

    count_sql = "SELECT COUNT(*) FROM {table} WHERE tenant_id = :tenant_id"
    params = {"tenant_id": tenant_id}

    with db_engine.connect() as conn:
        spu_count = conn.execute(
            text(count_sql.format(table="shoplazza_product_spu")), params
        ).scalar()
        sku_count = conn.execute(
            text(count_sql.format(table="shoplazza_product_sku")), params
        ).scalar()

        print(f"\nVerification:")
        print(f"  SPUs: {spu_count}")
        print(f"  SKUs: {sku_count}")

        return spu_count, sku_count
  72 +
  73 +
def main():
    """CLI entry point: connect to MySQL, replay a SQL file, optionally verify."""
    parser = argparse.ArgumentParser(description='Import test data into MySQL')

    # Database connection
    parser.add_argument('--db-host', required=True, help='MySQL host')
    parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
    parser.add_argument('--db-database', required=True, help='MySQL database name')
    parser.add_argument('--db-username', required=True, help='MySQL username')
    parser.add_argument('--db-password', required=True, help='MySQL password')

    # Import options
    parser.add_argument('--sql-file', required=True, help='SQL file to import')
    parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)')

    opts = parser.parse_args()

    print(f"Connecting to MySQL: {opts.db_host}:{opts.db_port}/{opts.db_database}")

    try:
        engine = create_db_connection(
            host=opts.db_host,
            port=opts.db_port,
            database=opts.db_database,
            username=opts.db_username,
            password=opts.db_password
        )
    except Exception as exc:
        print(f"ERROR: Failed to connect to MySQL: {exc}")
        return 1

    if not test_connection(engine):
        print("ERROR: Database connection test failed")
        return 1

    print("Database connection successful")

    print(f"\nImporting SQL file: {opts.sql_file}")
    try:
        import_sql_file(engine, opts.sql_file)
    except Exception as exc:
        print(f"ERROR: Failed to import SQL file: {exc}")
        import traceback
        traceback.print_exc()
        return 1
    print("Import completed successfully")

    # Optional row-count verification for a single tenant.
    if opts.tenant_id:
        verify_import(engine, opts.tenant_id)

    return 0
  128 +
  129 +
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == '__main__':
    sys.exit(main())
  132 +
... ...
scripts/ingest_shoplazza.py 0 → 100644
... ... @@ -0,0 +1,148 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Shoplazza data ingestion script.
  4 +
  5 +Loads SPU and SKU data from MySQL and indexes into Elasticsearch using SPU transformer.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +from pathlib import Path
  12 +
  13 +# Add parent directory to path
  14 +sys.path.insert(0, str(Path(__file__).parent.parent))
  15 +
  16 +from utils.db_connector import create_db_connection
  17 +from utils.es_client import ESClient
  18 +from indexer.spu_transformer import SPUTransformer
  19 +from indexer.mapping_generator import MappingGenerator
  20 +from indexer.bulk_indexer import BulkIndexer
  21 +from config import ConfigLoader
  22 +
  23 +
def main():
    """CLI entry point: load Shoplazza SPU/SKU data and bulk-index it into ES."""
    parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch')

    # Database connection
    parser.add_argument('--db-host', required=True, help='MySQL host')
    parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
    parser.add_argument('--db-database', required=True, help='MySQL database name')
    parser.add_argument('--db-username', required=True, help='MySQL username')
    parser.add_argument('--db-password', required=True, help='MySQL password')

    # Tenant and index
    parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)')
    parser.add_argument('--config', default='base', help='Configuration ID (default: base)')
    parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')

    # Options
    parser.add_argument('--recreate', action='store_true', help='Recreate index if exists')
    parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)')

    opts = parser.parse_args()

    print(f"Starting Shoplazza data ingestion for tenant: {opts.tenant_id}")

    # Search configuration drives the ES mapping and the target index name.
    loader = ConfigLoader("config/schema")
    try:
        config = loader.load_customer_config(opts.config)
        print(f"Loaded configuration: {config.customer_name}")
    except Exception as exc:
        print(f"ERROR: Failed to load configuration: {exc}")
        return 1

    # Tenant isolation relies on this field; refuse to run without it.
    if not any(field.name == "tenant_id" for field in config.fields):
        print("ERROR: Configuration must include 'tenant_id' field")
        return 1

    print(f"Connecting to MySQL: {opts.db_host}:{opts.db_port}/{opts.db_database}")
    try:
        engine = create_db_connection(
            host=opts.db_host,
            port=opts.db_port,
            database=opts.db_database,
            username=opts.db_username,
            password=opts.db_password
        )
    except Exception as exc:
        print(f"ERROR: Failed to connect to MySQL: {exc}")
        return 1

    print(f"Connecting to Elasticsearch: {opts.es_host}")
    es_client = ESClient(hosts=[opts.es_host])
    if not es_client.ping():
        print(f"ERROR: Cannot connect to Elasticsearch at {opts.es_host}")
        return 1

    # Ensure the shared index exists (optionally recreating it first).
    index_name = config.es_index_name
    mapping = MappingGenerator(config).generate_mapping()

    if opts.recreate and es_client.index_exists(index_name):
        print(f"Deleting existing index: {index_name}")
        es_client.delete_index(index_name)

    if es_client.index_exists(index_name):
        print(f"Using existing index: {index_name}")
    else:
        print(f"Creating index: {index_name}")
        es_client.create_index(index_name, mapping)

    print(f"Initializing SPU transformer for tenant: {opts.tenant_id}")
    transformer = SPUTransformer(engine, opts.tenant_id)

    print("Transforming SPU and SKU data...")
    try:
        documents = transformer.transform_batch()
        print(f"Transformed {len(documents)} SPU documents")
    except Exception as exc:
        print(f"ERROR: Failed to transform data: {exc}")
        import traceback
        traceback.print_exc()
        return 1

    if not documents:
        print("WARNING: No documents to index")
        return 0

    print(f"Indexing {len(documents)} documents (batch size: {opts.batch_size})...")
    bulk = BulkIndexer(es_client, index_name, batch_size=opts.batch_size)

    try:
        stats = bulk.index_documents(documents, id_field="product_id", show_progress=True)
    except Exception as exc:
        print(f"ERROR: Failed to index documents: {exc}")
        import traceback
        traceback.print_exc()
        return 1

    print(f"\nIngestion complete:")
    print(f"  Success: {stats['success']}")
    print(f"  Failed: {stats['failed']}")
    print(f"  Time: {stats.get('elapsed_time', 0):.2f}s")

    if stats['failed'] > 0:
        print(f"\nWARNING: {stats['failed']} documents failed to index")
        return 1

    return 0
  144 +
  145 +
  146 +if __name__ == '__main__':
  147 + sys.exit(main())
  148 +
... ...
scripts/test_base.py 0 → 100644
... ... @@ -0,0 +1,242 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Test script for base configuration.
  4 +
  5 +Tests data ingestion, search API, response format, and tenant isolation.
  6 +"""
  7 +
  8 +import sys
  9 +import os
  10 +import argparse
  11 +import requests
  12 +import json
  13 +from pathlib import Path
  14 +
  15 +# Add parent directory to path
  16 +sys.path.insert(0, str(Path(__file__).parent.parent))
  17 +
  18 +
  19 +def test_search_api(base_url: str, tenant_id: str, query: str = "耳机"):
  20 + """
  21 + Test search API.
  22 +
  23 + Args:
  24 + base_url: API base URL
  25 + tenant_id: Tenant ID
  26 + query: Search query
  27 +
  28 + Returns:
  29 + Response JSON or None if failed
  30 + """
  31 + url = f"{base_url}/search/"
  32 + headers = {
  33 + "X-Tenant-ID": tenant_id,
  34 + "Content-Type": "application/json"
  35 + }
  36 + payload = {
  37 + "query": query,
  38 + "size": 10,
  39 + "from": 0
  40 + }
  41 +
  42 + print(f"\nTesting search API:")
  43 + print(f" URL: {url}")
  44 + print(f" Query: {query}")
  45 + print(f" Tenant ID: {tenant_id}")
  46 +
  47 + try:
  48 + response = requests.post(url, json=payload, headers=headers, timeout=30)
  49 + response.raise_for_status()
  50 + data = response.json()
  51 +
  52 + print(f" Status: {response.status_code}")
  53 + print(f" Total: {data.get('total', 0)}")
  54 + print(f" Results: {len(data.get('results', []))}")
  55 +
  56 + return data
  57 + except Exception as e:
  58 + print(f" ERROR: {e}")
  59 + return None
  60 +
  61 +
  62 +def validate_response_format(data: dict):
  63 + """
  64 + Validate response format.
  65 +
  66 + Args:
  67 + data: Response data
  68 +
  69 + Returns:
  70 + List of validation errors (empty if valid)
  71 + """
  72 + errors = []
  73 +
  74 + # Check for results field (not hits)
  75 + if 'hits' in data:
  76 + errors.append("Response contains 'hits' field (should be 'results')")
  77 +
  78 + if 'results' not in data:
  79 + errors.append("Response missing 'results' field")
  80 + else:
  81 + results = data['results']
  82 + if not isinstance(results, list):
  83 + errors.append("'results' should be a list")
  84 + else:
  85 + # Validate first result structure
  86 + if results:
  87 + result = results[0]
  88 + required_fields = ['product_id', 'title', 'variants', 'relevance_score']
  89 + for field in required_fields:
  90 + if field not in result:
  91 + errors.append(f"Result missing required field: {field}")
  92 +
  93 + # Check for ES internal fields
  94 + es_internal_fields = ['_id', '_score', '_source']
  95 + for field in es_internal_fields:
  96 + if field in result:
  97 + errors.append(f"Result contains ES internal field: {field}")
  98 +
  99 + # Validate variants
  100 + if 'variants' in result:
  101 + variants = result['variants']
  102 + if not isinstance(variants, list):
  103 + errors.append("'variants' should be a list")
  104 + elif variants:
  105 + variant = variants[0]
  106 + variant_required = ['variant_id', 'price', 'sku', 'stock']
  107 + for field in variant_required:
  108 + if field not in variant:
  109 + errors.append(f"Variant missing required field: {field}")
  110 +
  111 + # Check for suggestions and related_searches
  112 + if 'suggestions' not in data:
  113 + errors.append("Response missing 'suggestions' field")
  114 + if 'related_searches' not in data:
  115 + errors.append("Response missing 'related_searches' field")
  116 +
  117 + return errors
  118 +
  119 +
  120 +def test_facets(base_url: str, tenant_id: str):
  121 + """
  122 + Test facets aggregation.
  123 +
  124 + Args:
  125 + base_url: API base URL
  126 + tenant_id: Tenant ID
  127 +
  128 + Returns:
  129 + Response JSON or None if failed
  130 + """
  131 + url = f"{base_url}/search/"
  132 + headers = {
  133 + "X-Tenant-ID": tenant_id,
  134 + "Content-Type": "application/json"
  135 + }
  136 + payload = {
  137 + "query": "商品",
  138 + "size": 10,
  139 + "facets": ["category_keyword", "vendor_keyword"]
  140 + }
  141 +
  142 + print(f"\nTesting facets:")
  143 + print(f" Facets: {payload['facets']}")
  144 +
  145 + try:
  146 + response = requests.post(url, json=payload, headers=headers, timeout=30)
  147 + response.raise_for_status()
  148 + data = response.json()
  149 +
  150 + if 'facets' in data and data['facets']:
  151 + print(f" Facets returned: {len(data['facets'])}")
  152 + for facet in data['facets']:
  153 + print(f" - {facet.get('field')}: {len(facet.get('values', []))} values")
  154 + else:
  155 + print(" WARNING: No facets returned")
  156 +
  157 + return data
  158 + except Exception as e:
  159 + print(f" ERROR: {e}")
  160 + return None
  161 +
  162 +
  163 +def test_tenant_isolation(base_url: str, tenant_id_1: str, tenant_id_2: str):
  164 + """
  165 + Test tenant isolation.
  166 +
  167 + Args:
  168 + base_url: API base URL
  169 + tenant_id_1: First tenant ID
  170 + tenant_id_2: Second tenant ID
  171 + """
  172 + print(f"\nTesting tenant isolation:")
  173 + print(f" Tenant 1: {tenant_id_1}")
  174 + print(f" Tenant 2: {tenant_id_2}")
  175 +
  176 + # Search for tenant 1
  177 + data1 = test_search_api(base_url, tenant_id_1, "商品")
  178 + # Search for tenant 2
  179 + data2 = test_search_api(base_url, tenant_id_2, "商品")
  180 +
  181 + if data1 and data2:
  182 + results1 = set(r.get('product_id') for r in data1.get('results', []))
  183 + results2 = set(r.get('product_id') for r in data2.get('results', []))
  184 +
  185 + overlap = results1 & results2
  186 + if overlap:
  187 + print(f" WARNING: Found {len(overlap)} overlapping results between tenants")
  188 + else:
  189 + print(f" OK: No overlapping results (tenant isolation working)")
  190 +
  191 +
  192 +def main():
  193 + parser = argparse.ArgumentParser(description='Test base configuration')
  194 + parser.add_argument('--api-url', default='http://localhost:8000', help='API base URL')
  195 + parser.add_argument('--tenant-id', default='1', help='Tenant ID for testing')
  196 + parser.add_argument('--test-tenant-2', help='Second tenant ID for isolation test')
  197 +
  198 + args = parser.parse_args()
  199 +
  200 + print("=" * 60)
  201 + print("Base Configuration Test Suite")
  202 + print("=" * 60)
  203 +
  204 + # Test 1: Basic search
  205 + print("\n[Test 1] Basic Search")
  206 + data = test_search_api(args.api_url, args.tenant_id)
  207 + if not data:
  208 + print("FAILED: Basic search test")
  209 + return 1
  210 +
  211 + # Test 2: Response format validation
  212 + print("\n[Test 2] Response Format Validation")
  213 + errors = validate_response_format(data)
  214 + if errors:
  215 + print("FAILED: Response format validation")
  216 + for error in errors:
  217 + print(f" - {error}")
  218 + return 1
  219 + else:
  220 + print("PASSED: Response format is correct")
  221 +
  222 + # Test 3: Facets
  223 + print("\n[Test 3] Facets Aggregation")
  224 + facet_data = test_facets(args.api_url, args.tenant_id)
  225 + if not facet_data:
  226 + print("WARNING: Facets test failed (may be expected if no data)")
  227 +
  228 + # Test 4: Tenant isolation (if second tenant provided)
  229 + if args.test_tenant_2:
  230 + print("\n[Test 4] Tenant Isolation")
  231 + test_tenant_isolation(args.api_url, args.tenant_id, args.test_tenant_2)
  232 +
  233 + print("\n" + "=" * 60)
  234 + print("All tests completed")
  235 + print("=" * 60)
  236 +
  237 + return 0
  238 +
  239 +
  240 +if __name__ == '__main__':
  241 + sys.exit(main())
  242 +
... ...
search/searcher.py
... ... @@ -17,38 +17,45 @@ from .multilang_query_builder import MultiLanguageQueryBuilder
17 17 from .rerank_engine import RerankEngine
18 18 from context.request_context import RequestContext, RequestContextStage, create_request_context
19 19 from api.models import FacetResult, FacetValue
  20 +from api.result_formatter import ResultFormatter
20 21  
21 22  
22 23 class SearchResult:
23   - """Container for search results (重构版)."""
  24 + """Container for search results (外部友好格式)."""
24 25  
25 26 def __init__(
26 27 self,
27   - hits: List[Dict[str, Any]],
  28 + results: List[Any], # List[ProductResult]
28 29 total: int,
29 30 max_score: float,
30 31 took_ms: int,
31 32 facets: Optional[List[FacetResult]] = None,
32 33 query_info: Optional[Dict[str, Any]] = None,
  34 + suggestions: Optional[List[str]] = None,
  35 + related_searches: Optional[List[str]] = None,
33 36 debug_info: Optional[Dict[str, Any]] = None
34 37 ):
35   - self.hits = hits
  38 + self.results = results
36 39 self.total = total
37 40 self.max_score = max_score
38 41 self.took_ms = took_ms
39 42 self.facets = facets
40 43 self.query_info = query_info or {}
  44 + self.suggestions = suggestions or []
  45 + self.related_searches = related_searches or []
41 46 self.debug_info = debug_info
42 47  
43 48 def to_dict(self) -> Dict[str, Any]:
44 49 """Convert to dictionary representation."""
45 50 result = {
46   - "hits": self.hits,
  51 + "results": [r.model_dump() if hasattr(r, 'model_dump') else r for r in self.results],
47 52 "total": self.total,
48 53 "max_score": self.max_score,
49 54 "took_ms": self.took_ms,
50 55 "facets": [f.model_dump() for f in self.facets] if self.facets else None,
51   - "query_info": self.query_info
  56 + "query_info": self.query_info,
  57 + "suggestions": self.suggestions,
  58 + "related_searches": self.related_searches
52 59 }
53 60 if self.debug_info is not None:
54 61 result["debug_info"] = self.debug_info
... ... @@ -106,6 +113,7 @@ class Searcher:
106 113 def search(
107 114 self,
108 115 query: str,
  116 + tenant_id: str,
109 117 size: int = 10,
110 118 from_: int = 0,
111 119 filters: Optional[Dict[str, Any]] = None,
... ... @@ -118,10 +126,11 @@ class Searcher:
118 126 debug: bool = False
119 127 ) -> SearchResult:
120 128 """
121   - Execute search query (重构版).
  129 + Execute search query (外部友好格式).
122 130  
123 131 Args:
124 132 query: Search query string
  133 + tenant_id: Tenant ID (required for filtering)
125 134 size: Number of results to return
126 135 from_: Offset for pagination
127 136 filters: Exact match filters
... ... @@ -134,7 +143,7 @@ class Searcher:
134 143 debug: Enable debug information output
135 144  
136 145 Returns:
137   - SearchResult object
  146 + SearchResult object with formatted results
138 147 """
139 148 # Create context if not provided (backward compatibility)
140 149 if context is None:
... ... @@ -248,6 +257,11 @@ class Searcher:
248 257 # Step 3: Query building
249 258 context.start_stage(RequestContextStage.QUERY_BUILDING)
250 259 try:
  260 + # Add tenant_id to filters (required)
  261 + if filters is None:
  262 + filters = {}
  263 + filters['tenant_id'] = tenant_id
  264 +
251 265 es_query = self.query_builder.build_multilang_query(
252 266 parsed_query=parsed_query,
253 267 query_vector=parsed_query.query_vector if enable_embedding else None,
... ... @@ -341,56 +355,10 @@ class Searcher:
341 355 # Step 5: Result processing
342 356 context.start_stage(RequestContextStage.RESULT_PROCESSING)
343 357 try:
344   - hits = []
345   - raw_hits = []
346   -
  358 + # Extract ES hits
  359 + es_hits = []
347 360 if 'hits' in es_response and 'hits' in es_response['hits']:
348   - for hit in es_response['hits']['hits']:
349   - raw_hits.append(hit)
350   -
351   - result_doc = {
352   - '_id': hit['_id'],
353   - '_score': hit.get('_score') or 0.0,
354   - '_source': hit['_source']
355   - }
356   -
357   - # 应用本地重排(仅当启用时)
358   - if enable_rerank and self.rerank_engine.enabled:
359   - base_score = hit.get('_score') or 0.0
360   - knn_score = None
361   -
362   - # 检查是否使用了KNN(新结构:在function_score内部)
363   - query_section = es_query.get('query', {})
364   - if 'function_score' in query_section:
365   - fs_query = query_section['function_score'].get('query', {})
366   - outer_bool = fs_query.get('bool', {})
367   - inner_bool_list = outer_bool.get('must', [])
368   - if inner_bool_list and 'bool' in inner_bool_list[0]:
369   - inner_should = inner_bool_list[0]['bool'].get('should', [])
370   - if any('knn' in clause for clause in inner_should):
371   - knn_score = base_score * 0.2
372   -
373   - custom_score = self.rerank_engine.calculate_score(
374   - hit,
375   - base_score,
376   - knn_score
377   - )
378   - result_doc['_custom_score'] = custom_score
379   - result_doc['_original_score'] = base_score
380   -
381   - hits.append(result_doc)
382   -
383   - # 重排序(仅当启用时)
384   - if enable_rerank and self.rerank_engine.enabled:
385   - hits.sort(key=lambda x: x.get('_custom_score', x['_score']), reverse=True)
386   - context.logger.info(
387   - f"本地重排完成 | 使用RerankEngine",
388   - extra={'reqid': context.reqid, 'uid': context.uid}
389   - )
390   -
391   - # Store intermediate results in context
392   - context.store_intermediate_result('raw_hits', raw_hits)
393   - context.store_intermediate_result('processed_hits', hits)
  361 + es_hits = es_response['hits']['hits']
394 362  
395 363 # Extract total and max_score
396 364 total = es_response.get('hits', {}).get('total', {})
... ... @@ -401,16 +369,24 @@ class Searcher:
401 369  
402 370 max_score = es_response.get('hits', {}).get('max_score') or 0.0
403 371  
404   - # Standardize facets
405   - standardized_facets = self._standardize_facets(
406   - es_response.get('aggregations', {}),
407   - facets,
408   - filters
409   - )
  372 + # Format results using ResultFormatter
  373 + formatted_results = ResultFormatter.format_search_results(es_hits, max_score)
  374 +
  375 + # Format facets
  376 + standardized_facets = None
  377 + if facets:
  378 + standardized_facets = ResultFormatter.format_facets(
  379 + es_response.get('aggregations', {}),
  380 + facets
  381 + )
  382 +
  383 + # Generate suggestions and related searches
  384 + query_text = parsed_query.original_query if parsed_query else query
  385 + suggestions = ResultFormatter.generate_suggestions(query_text, formatted_results)
  386 + related_searches = ResultFormatter.generate_related_searches(query_text, formatted_results)
410 387  
411 388 context.logger.info(
412   - f"结果处理完成 | 返回: {len(hits)}条 | 总计: {total_value}条 | "
413   - f"重排序: {'是' if enable_rerank else '否'}",
  389 + f"结果处理完成 | 返回: {len(formatted_results)}条 | 总计: {total_value}条",
414 390 extra={'reqid': context.reqid, 'uid': context.uid}
415 391 )
416 392  
... ... @@ -459,12 +435,14 @@ class Searcher:
459 435  
460 436 # Build result
461 437 result = SearchResult(
462   - hits=hits,
  438 + results=formatted_results,
463 439 total=total_value,
464 440 max_score=max_score,
465 441 took_ms=int(total_duration),
466 442 facets=standardized_facets,
467 443 query_info=parsed_query.to_dict(),
  444 + suggestions=suggestions,
  445 + related_searches=related_searches,
468 446 debug_info=debug_info
469 447 )
470 448  
... ... @@ -476,21 +454,23 @@ class Searcher:
476 454 def search_by_image(
477 455 self,
478 456 image_url: str,
  457 + tenant_id: str,
479 458 size: int = 10,
480 459 filters: Optional[Dict[str, Any]] = None,
481 460 range_filters: Optional[Dict[str, Any]] = None
482 461 ) -> SearchResult:
483 462 """
484   - Search by image similarity (重构版).
  463 + Search by image similarity (外部友好格式).
485 464  
486 465 Args:
487 466 image_url: URL of query image
  467 + tenant_id: Tenant ID (required for filtering)
488 468 size: Number of results
489 469 filters: Exact match filters
490 470 range_filters: Range filters for numeric fields
491 471  
492 472 Returns:
493   - SearchResult object
  473 + SearchResult object with formatted results
494 474 """
495 475 if not self.image_embedding_field:
496 476 raise ValueError("Image embedding field not configured")
... ... @@ -503,6 +483,11 @@ class Searcher:
503 483 if image_vector is None:
504 484 raise ValueError(f"Failed to encode image: {image_url}")
505 485  
  486 + # Add tenant_id to filters (required)
  487 + if filters is None:
  488 + filters = {}
  489 + filters['tenant_id'] = tenant_id
  490 +
506 491 # Build KNN query
507 492 es_query = {
508 493 "size": size,
... ... @@ -536,28 +521,32 @@ class Searcher:
536 521 size=size
537 522 )
538 523  
539   - # Process results (similar to text search)
540   - hits = []
  524 + # Extract ES hits
  525 + es_hits = []
541 526 if 'hits' in es_response and 'hits' in es_response['hits']:
542   - for hit in es_response['hits']['hits']:
543   - hits.append({
544   - '_id': hit['_id'],
545   - '_score': hit['_score'],
546   - '_source': hit['_source']
547   - })
  527 + es_hits = es_response['hits']['hits']
548 528  
  529 + # Extract total and max_score
549 530 total = es_response.get('hits', {}).get('total', {})
550 531 if isinstance(total, dict):
551 532 total_value = total.get('value', 0)
552 533 else:
553 534 total_value = total
554 535  
  536 + max_score = es_response.get('hits', {}).get('max_score') or 0.0
  537 +
  538 + # Format results using ResultFormatter
  539 + formatted_results = ResultFormatter.format_search_results(es_hits, max_score)
  540 +
555 541 return SearchResult(
556   - hits=hits,
  542 + results=formatted_results,
557 543 total=total_value,
558   - max_score=es_response.get('hits', {}).get('max_score') or 0.0,
  544 + max_score=max_score,
559 545 took_ms=es_response.get('took', 0),
560   - query_info={'image_url': image_url, 'search_type': 'image_similarity'}
  546 + facets=None,
  547 + query_info={'image_url': image_url, 'search_type': 'image_similarity'},
  548 + suggestions=[],
  549 + related_searches=[]
561 550 )
562 551  
563 552 def get_domain_summary(self) -> Dict[str, Any]:
... ...
设计文档.md
... ... @@ -12,33 +12,57 @@
12 12  
13 13 ## 1. 原始数据层的约定
14 14  
15   -所有租户共用主表、独立配置和扩展表,有自己独立的ES索引。
16   -
17 15 ### 1.1 店匠主表
18 16  
19 17 所有租户共用以下主表:
20 18 - `shoplazza_product_sku` - SKU级别商品数据
21 19 - `shoplazza_product_spu` - SPU级别商品数据
22 20  
23   -### 1.2 每个租户的扩展表
24   -
25   -各个租户有自己的扩展表,不同的租户根据不同的业务需要、以及不同的数据源,来定制自己的扩展表:
26   -- 自定义属性体系
27   -- 多语言商品标题(中文、英文、俄文等)
28   -- 品牌名、不同的类目和标签体系
29   -- 业务过滤和聚合字段
30   -- 权重(提权)字段
31   -
32   -**数据关联方式**:
33   -- 入索引时,商品主表 `shoplazza_product_sku` 的 `id` + `shopid` 与租户扩展表关联
34   -- 例如:`customer1_extension` 表存储 customer1 的自定义字段
  21 +### 1.2 索引结构(SPU维度)
  22 +
  23 +**统一索引架构**:
  24 +- 所有客户共享同一个Elasticsearch索引:`search_products`
  25 +- 索引粒度:SPU级别(每个文档代表一个SPU)
  26 +- 数据隔离:通过`tenant_id`字段实现租户隔离
  27 +- 嵌套结构:每个SPU文档包含嵌套的`variants`数组(SKU变体)
  28 +
  29 +**索引文档结构**:
  30 +```json
  31 +{
  32 + "tenant_id": "1",
  33 + "product_id": "123",
  34 + "title": "蓝牙耳机",
  35 + "variants": [
  36 + {
  37 + "variant_id": "456",
  38 + "title": "黑色",
  39 + "price": 199.99,
  40 + "sku": "SKU-123-1",
  41 + "stock": 50
  42 + }
  43 + ],
  44 + "min_price": 199.99,
  45 + "max_price": 299.99
  46 +}
  47 +```
35 48  
36 49 ### 1.3 配置化方案
37 50  
  51 +**配置分离原则**:
  52 +- **搜索配置**:只包含ES字段定义、查询域、排序规则等搜索相关配置
  53 +- **数据源配置**:不在搜索配置中,由Pipeline层(脚本)决定
  54 +- **数据导入流程**:写死的脚本,不依赖配置
  55 +
38 56 统一通过配置文件定义:
39   -1. ES 字段定义(字段类型、分析器、来源表/列)
  57 +1. ES 字段定义(字段类型、分析器、boost等)
40 58 2. ES mapping 结构生成
41   -3. 数据入库映射关系
  59 +3. 查询域配置(indexes)
  60 +4. 排序和打分配置(function_score)
  61 +
  62 +**注意**:配置中**不包含**以下内容:
  63 +- `mysql_config` - MySQL数据库配置
  64 +- `main_table` / `extension_table` - 数据表配置
  65 +- `source_table` / `source_column` - 字段数据源映射
42 66  
43 67 ---
44 68  
... ... @@ -72,62 +96,54 @@
72 96 - **standard**:标准分析器
73 97 - **keyword**:关键词分析器
74 98  
75   -#### 字段配置示例
  99 +#### 字段配置示例(Base配置)
76 100  
77 101 ```yaml
78 102 fields:
79   - # 主键字段
80   - - name: "skuId"
81   - type: "LONG"
82   - source_table: "main" # 主表
83   - source_column: "id"
  103 + # 租户隔离字段(必需)
  104 + - name: "tenant_id"
  105 + type: "KEYWORD"
84 106 required: true
85 107 index: true
86 108 store: true
87 109  
88   - # 多语言文本字段
89   - - name: "name"
90   - type: "TEXT"
91   - source_table: "extension" # 扩展表
92   - source_column: "name"
93   - analyzer: "chinese_ecommerce"
94   - boost: 2.0
  110 + # 商品标识字段
  111 + - name: "product_id"
  112 + type: "KEYWORD"
  113 + required: true
95 114 index: true
96 115 store: true
97 116  
98   - - name: "enSpuName"
  117 + # 文本搜索字段
  118 + - name: "title"
99 119 type: "TEXT"
100   - source_table: "extension"
101   - source_column: "enSpuName"
102   - analyzer: "english"
103   - boost: 2.0
  120 + analyzer: "chinese_ecommerce"
  121 + boost: 3.0
  122 + index: true
  123 + store: true
104 124  
105   - - name: "ruSkuName"
  125 + - name: "seo_keywords"
106 126 type: "TEXT"
107   - source_table: "extension"
108   - source_column: "ruSkuName"
109   - analyzer: "russian"
  127 + analyzer: "chinese_ecommerce"
110 128 boost: 2.0
111   -
112   - # 文本向量字段
113   - - name: "name_embedding"
114   - type: "TEXT_EMBEDDING"
115   - source_table: "extension"
116   - source_column: "name"
117   - embedding_dims: 1024
118   - embedding_similarity: "dot_product"
119 129 index: true
  130 + store: true
120 131  
121   - # 图片向量字段
122   - - name: "image_embedding"
123   - type: "IMAGE_EMBEDDING"
124   - source_table: "extension"
125   - source_column: "imageUrl"
126   - embedding_dims: 1024
127   - embedding_similarity: "dot_product"
128   - nested: false
  132 + # 嵌套variants字段
  133 + - name: "variants"
  134 + type: "JSON"
  135 + nested: true
  136 + nested_properties:
  137 + variant_id:
  138 + type: "keyword"
  139 + price:
  140 + type: "float"
  141 + sku:
  142 + type: "keyword"
129 143 ```
130 144  
  145 +**注意**:配置中**不包含**`source_table`和`source_column`,数据源映射由Pipeline层决定。
  146 +
131 147 **实现模块**:
132 148 - `config/config_loader.py` - 配置加载器
133 149 - `config/field_types.py` - 字段类型定义
... ... @@ -204,77 +220,69 @@ indexes:
204 220  
205 221 ---
206 222  
207   -## 3. 测试数据灌入
  223 +## 3. 数据导入流程
208 224  
209 225 ### 3.1 数据源
210 226  
211   -**主表**:`shoplazza_product_sku`
212   -- 所有租户共用
213   -- 包含基础商品信息(id, shopid 等)
  227 +**店匠标准表**(Base配置使用):
  228 +- `shoplazza_product_spu` - SPU级别商品数据
  229 +- `shoplazza_product_sku` - SKU级别商品数据
214 230  
215   -**扩展表**:`customer1_extension`
216   -- 每个租户独立
217   -- 包含自定义字段和多语言字段
  231 +**其他客户表**(customer1等):
  232 +- 使用各自的数据源表和扩展表
218 233  
219   -### 3.2 数据灌入方式
  234 +### 3.2 数据导入方式
220 235  
221   -**实现情况**:
  236 +**Pipeline层决定数据源**:
  237 +- 数据导入流程是写死的脚本,不依赖配置
  238 +- 配置只关注ES搜索相关的内容
  239 +- 数据源映射逻辑写死在转换器代码中
222 240  
223   -#### 命令行工具
224   -```bash
225   -python main.py ingest \
226   - --customer customer1 \
227   - --csv-file data/customer1_data.csv \
228   - --es-host http://localhost:9200 \
229   - --recreate \
230   - --batch-size 100
231   -```
  241 +#### Base配置数据导入(店匠通用)
  242 +
  243 +**脚本**:`scripts/ingest_shoplazza.py`
232 244  
233   -#### 数据流程
234   -1. **数据加载**:从 CSV 文件或 MySQL 数据库加载数据
235   -2. **数据转换**:
236   - - 字段映射(根据配置将源字段映射到 ES 字段)
237   - - 类型转换(字符串、数字、日期等)
238   - - 向量生成(文本向量、图片向量)
239   - - 向量缓存(避免重复计算)
  245 +**数据流程**:
  246 +1. **数据加载**:从MySQL读取`shoplazza_product_spu`和`shoplazza_product_sku`表
  247 +2. **数据转换**(`indexer/spu_transformer.py`):
  248 + - 按`spu_id`和`tenant_id`关联SPU和SKU数据
  249 + - 将SKU数据聚合为嵌套的`variants`数组
  250 + - 计算扁平化价格字段(`min_price`, `max_price`, `compare_at_price`)
  251 + - 字段映射(写死在代码中,不依赖配置)
  252 + - 注入`tenant_id`字段
240 253 3. **索引创建**:
241   - - 根据配置生成 ES mapping
242   - - 创建或更新索引
  254 + - 根据配置生成ES mapping
  255 + - 创建或更新`search_products`索引
243 256 4. **批量入库**:
244   - - 批量写入 ES(默认每批 500 条)
  257 + - 批量写入ES(默认每批500条)
245 258 - 错误处理和重试机制
246 259  
247   -#### 配置映射示例
248   -
249   -**customer1_config.yaml** 配置:
250   -```yaml
251   -main_table: "shoplazza_product_sku"
252   -extension_table: "customer1_extension"
253   -es_index_name: "search_customer1"
254   -
255   -fields:
256   - - name: "skuId"
257   - source_table: "main"
258   - source_column: "id"
259   - - name: "name"
260   - source_table: "extension"
261   - source_column: "name"
262   - - name: "enSpuName"
263   - source_table: "extension"
264   - source_column: "enSpuName"
  260 +**命令行工具**:
  261 +```bash
  262 +python scripts/ingest_shoplazza.py \
  263 + --db-host localhost \
  264 + --db-port 3306 \
  265 + --db-database saas \
  266 + --db-username root \
  267 + --db-password password \
  268 + --tenant-id "1" \
  269 + --config base \
  270 + --es-host http://localhost:9200 \
  271 + --recreate \
  272 + --batch-size 500
265 273 ```
266 274  
267   -**数据转换**:
268   -- 主表字段:直接从 `shoplazza_product_sku` 表的 `id` 字段读取
269   -- 扩展表字段:从 `customer1_extension` 表的对应列读取
270   -- 向量字段:对源文本/图片生成向量并缓存
  275 +#### 其他客户数据导入
  276 +
  277 +- 使用各自的数据转换器(如`indexer/data_transformer.py`)
  278 +- 数据源映射逻辑写死在各自的转换器中
  279 +- 共享`search_products`索引,通过`tenant_id`隔离
271 280  
272 281 **实现模块**:
273   -- `indexer/data_transformer.py` - 数据转换器
  282 +- `indexer/spu_transformer.py` - SPU数据转换器(Base配置)
  283 +- `indexer/data_transformer.py` - 通用数据转换器(其他客户)
274 284 - `indexer/bulk_indexer.py` - 批量索引器
275   -- `indexer/indexing_pipeline.py` - 索引流水线
276   -- `embeddings/bge_encoder.py` - 文本向量编码器
277   -- `embeddings/clip_image_encoder.py` - 图片向量编码器
  285 +- `scripts/ingest_shoplazza.py` - 店匠数据导入脚本
278 286  
279 287 ---
280 288  
... ... @@ -506,6 +514,14 @@ ranking:
506 514 - ✅ 搜索接口(文本搜索、图片搜索)
507 515 - ✅ 文档查询接口
508 516 - ✅ 前端界面(HTML + JavaScript)
  517 +- ✅ 租户隔离(tenant_id过滤)
  518 +
  519 +### 6.6 Base配置(店匠通用)
  520 +- ✅ SPU级别索引结构
  521 +- ✅ 嵌套variants字段
  522 +- ✅ 统一索引(search_products)
  523 +- ✅ 租户隔离(tenant_id)
  524 +- ✅ 配置简化(移除MySQL相关配置)
509 525  
510 526 ---
511 527  
... ... @@ -521,9 +537,55 @@ ranking:
521 537  
522 538 ---
523 539  
524   -## 8. 配置文件示例
  540 +## 8. API响应格式
  541 +
  542 +### 8.1 外部友好格式
  543 +
  544 +API返回格式不包含ES内部字段(`_id`, `_score`, `_source`),使用外部友好的格式:
  545 +
  546 +**响应结构**:
  547 +```json
  548 +{
  549 + "results": [
  550 + {
  551 + "product_id": "123",
  552 + "title": "蓝牙耳机",
  553 + "variants": [
  554 + {
  555 + "variant_id": "456",
  556 + "price": 199.99,
  557 + "sku": "SKU-123-1",
  558 + "stock": 50
  559 + }
  560 + ],
  561 + "relevance_score": 0.95
  562 + }
  563 + ],
  564 + "total": 10,
  565 + "facets": [...],
  566 + "suggestions": [],
  567 + "related_searches": []
  568 +}
  569 +```
  570 +
  571 +**主要变化**:
  572 +- 结构化结果(`ProductResult`和`VariantResult`)
  573 +- 嵌套variants数组
  574 +- 无ES内部字段
  575 +
  576 +### 8.2 租户隔离
  577 +
  578 +所有API请求必须提供`tenant_id`:
  579 +- 请求头:`X-Tenant-ID: 1`
  580 +- 或查询参数:`?tenant_id=1`
  581 +
  582 +搜索时自动添加`tenant_id`过滤,确保数据隔离。
  583 +
  584 +## 9. 配置文件示例
  585 +
  586 +**Base配置**(店匠通用):`config/schema/base/config.yaml`
525 587  
526   -完整配置示例请参考:`config/schema/customer1_config.yaml`
  588 +**其他客户配置**:`config/schema/customer1/config.yaml`
527 589  
528 590 ---
529 591  
... ...