Commit 1f6d15faa32291662f64d4e0e3f50269b8e104e5
1 parent 06e79082
Refactor: SPU-level index, unified index architecture, and API response format optimization
Main changes:

1. Removed the configurable data-source/application structure. We design the index only for Shoplazza's SPU and SKU tables, and the data ingestion flow is hard-coded (this only serves testing needs; later, the outer application will be responsible for full and incremental ingestion). The search system focuses on adapting to external search requirements. There are currently two ingestion scripts: the previous one, and a new one that reads from Shoplazza's SKU and SPU tables and organizes documents at the SPU level.
   - Configuration now covers only ES search settings, improving maintainability
   - Added a base configuration (shared Shoplazza settings)
2. Index structure refactor (SPU level; see the sketch after this message)
   - All customers share the search_products index, isolated by tenant_id
   - Supports a nested variants field (array of SKU variants)
   - Added SPUTransformer for SPU data transformation
3. API response format optimization
   - Agreed on a dedicated search-result format instead of exposing the raw ES doc structure (_id, _score, and the fields inside _source)
   - Added ProductResult and VariantResult models
   - Added suggestions and related_searches fields (reserved interface; logic not yet implemented)
4. Data ingestion flow
   - Added a Shoplazza data ingestion script (ingest_shoplazza.py)
   - The pipeline layer decides the data source; configuration carries no data-source information
   - Added test-data generation and import scripts
5. Documentation updates
   - Updated the design document to reflect the new architecture
   - Added the BASE_CONFIG_GUIDE.md usage guide
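To make the tenant-isolation point concrete, here is a minimal sketch of the kind of filtered query every search must issue against the shared index; the actual query builder lives in the searcher and is not part of this diff, so the clause shape and field boosts below are assumptions:

```python
# Hedged sketch: tenant isolation on the shared "search_products" index.
# Every query carries a non-scoring term filter on tenant_id so tenants
# never see each other's documents. Field boosts mirror the base config.
def build_tenant_query(tenant_id: str, user_query: str) -> dict:
    return {
        "bool": {
            "filter": [{"term": {"tenant_id": tenant_id}}],  # hard isolation
            "must": [{
                "multi_match": {
                    "query": user_query,
                    "fields": ["title^3", "seo_title^2", "description"],
                }
            }],
        }
    }
```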
Showing 14 changed files with 2,216 additions and 233 deletions
api/models.py
@@ -172,11 +172,40 @@ class FacetResult(BaseModel):
     total_count: Optional[int] = Field(None, description="Total number of documents for this field")
 
 
+class VariantResult(BaseModel):
+    """Product variant result."""
+    variant_id: str = Field(..., description="Variant ID")
+    title: Optional[str] = Field(None, description="Variant title")
+    price: Optional[float] = Field(None, description="Price")
+    compare_at_price: Optional[float] = Field(None, description="Original price")
+    sku: Optional[str] = Field(None, description="SKU code")
+    stock: int = Field(0, description="Stock quantity")
+    options: Optional[Dict[str, Any]] = Field(None, description="Options (color, size, etc.)")
+
+
+class ProductResult(BaseModel):
+    """Product search result."""
+    product_id: str = Field(..., description="Product ID")
+    title: Optional[str] = Field(None, description="Product title")
+    handle: Optional[str] = Field(None, description="Product handle")
+    description: Optional[str] = Field(None, description="Product description")
+    vendor: Optional[str] = Field(None, description="Vendor/brand")
+    product_type: Optional[str] = Field(None, description="Product type")
+    tags: Optional[str] = Field(None, description="Tags")
+    price: Optional[float] = Field(None, description="Price (min_price)")
+    compare_at_price: Optional[float] = Field(None, description="Original price")
+    currency: str = Field("USD", description="Currency")
+    image_url: Optional[str] = Field(None, description="Main image URL")
+    in_stock: bool = Field(True, description="Whether any variant is in stock")
+    variants: List[VariantResult] = Field(default_factory=list, description="Variant list")
+    relevance_score: float = Field(..., ge=0.0, le=1.0, description="Relevance score (0-1)")
+
+
 class SearchResponse(BaseModel):
-    """Search response model (refactored)."""
+    """Search response model (external-friendly format)."""
 
     # Core results
-    hits: List[Dict[str, Any]] = Field(..., description="List of search hits")
+    results: List[ProductResult] = Field(..., description="List of search results")
     total: int = Field(..., description="Total number of matched documents")
     max_score: float = Field(..., description="Highest relevance score")
 
@@ -192,8 +221,9 @@ class SearchResponse(BaseModel):
         description="Query processing info (original query, rewrite, language detection, translation, etc.)"
     )
 
-    # Recommendations and suggestions (reserved)
-    related_queries: Optional[List[str]] = Field(None, description="Related search queries")
+    # Recommendations and suggestions
+    suggestions: List[str] = Field(default_factory=list, description="Search suggestions")
+    related_searches: List[str] = Field(default_factory=list, description="Related searches")
 
     # Performance metrics
     took_ms: int = Field(..., description="Total search time in milliseconds")
@@ -0,0 +1,177 @@
+"""
+Result formatter for converting ES internal format to external-friendly format.
+"""
+
+from typing import List, Dict, Any, Optional
+from .models import ProductResult, VariantResult, FacetResult, FacetValue
+
+
+class ResultFormatter:
+    """Formats ES search results to external-friendly format."""
+
+    @staticmethod
+    def format_search_results(
+        es_hits: List[Dict[str, Any]],
+        max_score: float = 1.0
+    ) -> List[ProductResult]:
+        """
+        Convert ES hits to ProductResult list.
+
+        Args:
+            es_hits: List of ES hit dictionaries (with _id, _score, _source)
+            max_score: Maximum score for normalization
+
+        Returns:
+            List of ProductResult objects
+        """
+        results = []
+
+        for hit in es_hits:
+            source = hit.get('_source', {})
+            score = hit.get('_score', 0.0)
+
+            # Normalize relevance score to 0-1
+            if max_score > 0:
+                relevance_score = min(score / max_score, 1.0)
+            else:
+                relevance_score = 0.0
+
+            # Extract variants
+            variants = []
+            variants_data = source.get('variants', [])
+            if isinstance(variants_data, list):
+                for variant_data in variants_data:
+                    variant = VariantResult(
+                        variant_id=str(variant_data.get('variant_id', '')),
+                        title=variant_data.get('title'),
+                        price=variant_data.get('price'),
+                        compare_at_price=variant_data.get('compare_at_price'),
+                        sku=variant_data.get('sku'),
+                        stock=variant_data.get('stock', 0),
+                        options=variant_data.get('options')
+                    )
+                    variants.append(variant)
+
+            # Determine in_stock (any variant has stock > 0)
+            in_stock = any(v.stock > 0 for v in variants) if variants else True
+
+            # Build ProductResult
+            product = ProductResult(
+                product_id=str(source.get('product_id', '')),
+                title=source.get('title'),
+                handle=source.get('handle'),
+                description=source.get('description'),
+                vendor=source.get('vendor'),
+                product_type=source.get('product_type'),
+                tags=source.get('tags'),
+                price=source.get('min_price'),
+                compare_at_price=source.get('compare_at_price'),
+                currency="USD",  # Default currency
+                image_url=source.get('image_url'),
+                in_stock=in_stock,
+                variants=variants,
+                relevance_score=relevance_score
+            )
+
+            results.append(product)
+
+        return results
+
+    @staticmethod
+    def format_facets(
+        es_aggregations: Dict[str, Any],
+        facet_configs: Optional[List[Any]] = None
+    ) -> List[FacetResult]:
+        """
+        Format ES aggregations to FacetResult list.
+
+        Args:
+            es_aggregations: ES aggregations response
+            facet_configs: Facet configurations (optional)
+
+        Returns:
+            List of FacetResult objects
+        """
+        facets = []
+
+        for field_name, agg_data in es_aggregations.items():
+            if 'buckets' not in agg_data:
+                continue
+
+            # Handle range aggregation (checked first: range buckets also live
+            # under 'buckets', so a generic terms branch would shadow this one)
+            if any('from' in b or 'to' in b for b in agg_data['buckets']):
+                values = []
+                for bucket in agg_data['buckets']:
+                    range_key = bucket.get('key', '')
+                    value = FacetValue(
+                        value=range_key,
+                        label=range_key,
+                        count=bucket['doc_count'],
+                        selected=False
+                    )
+                    values.append(value)
+
+                facet = FacetResult(
+                    field=field_name,
+                    label=field_name,
+                    type="range",
+                    values=values
+                )
+                facets.append(facet)
+
+            # Handle terms aggregation
+            else:
+                values = []
+                for bucket in agg_data['buckets']:
+                    value = FacetValue(
+                        value=bucket['key'],
+                        label=bucket.get('key_as_string', str(bucket['key'])),
+                        count=bucket['doc_count'],
+                        selected=False
+                    )
+                    values.append(value)
+
+                facet = FacetResult(
+                    field=field_name,
+                    label=field_name,  # Can be enhanced with field labels
+                    type="terms",
+                    values=values,
+                    total_count=agg_data.get('sum_other_doc_count', 0) + sum(v.count for v in values)
+                )
+                facets.append(facet)
+
+        return facets
+
+    @staticmethod
+    def generate_suggestions(
+        query: str,
+        results: List[ProductResult]
+    ) -> List[str]:
+        """
+        Generate search suggestions.
+
+        Args:
+            query: Original search query
+            results: Search results
+
+        Returns:
+            List of suggestion strings (currently returns empty list)
+        """
+        # TODO: Implement suggestion generation logic
+        return []
+
+    @staticmethod
+    def generate_related_searches(
+        query: str,
+        results: List[ProductResult]
+    ) -> List[str]:
+        """
+        Generate related searches.
+
+        Args:
+            query: Original search query
+            results: Search results
+
+        Returns:
+            List of related search strings (currently returns empty list)
+        """
+        # TODO: Implement related search generation logic
+        return []
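For orientation, a minimal sketch of how the formatter sits between a raw ES response and the new response model; `es_response` is a stand-in for whatever the searcher receives from Elasticsearch and is not an API defined in this commit:

```python
# Hedged usage sketch for ResultFormatter (es_response is assumed).
hits = es_response["hits"]["hits"]
max_score = es_response["hits"].get("max_score") or 0.0

results = ResultFormatter.format_search_results(hits, max_score=max_score)
facets = ResultFormatter.format_facets(es_response.get("aggregations", {}))

# Callers now see product_id / relevance_score instead of _id / _score;
# relevance_score is score / max_score clamped into [0, 1].
for product in results[:3]:
    print(product.product_id, round(product.relevance_score, 2), len(product.variants))
```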
api/routes/search.py
@@ -33,7 +33,7 @@ def extract_request_info(request: Request) -> tuple[str, str]:
 @router.post("/", response_model=SearchResponse)
 async def search(request: SearchRequest, http_request: Request):
     """
-    Execute text search query (refactored).
+    Execute text search query (external-friendly format).
 
     Supports:
     - Multi-language query processing
@@ -42,9 +42,27 @@ async def search(request: SearchRequest, http_request: Request):
     - Custom ranking functions
     - Exact match filters and range filters
     - Faceted search
+
+    Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
     """
     reqid, uid = extract_request_info(http_request)
 
+    # Extract tenant_id (required)
+    tenant_id = http_request.headers.get('X-Tenant-ID')
+    if not tenant_id:
+        # Try to get from query string
+        from urllib.parse import parse_qs
+        query_string = http_request.url.query
+        if query_string:
+            params = parse_qs(query_string)
+            tenant_id = params.get('tenant_id', [None])[0]
+
+    if not tenant_id:
+        raise HTTPException(
+            status_code=400,
+            detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
+        )
+
     # Create request context
     context = create_request_context(reqid=reqid, uid=uid)
 
@@ -54,7 +72,7 @@ async def search(request: SearchRequest, http_request: Request):
     try:
         # Log request start
         context.logger.info(
-            f"Search request received | IP: {http_request.client.host if http_request.client else 'unknown'} | "
+            f"Search request received | Tenant: {tenant_id} | IP: {http_request.client.host if http_request.client else 'unknown'} | "
             f"User-Agent: {http_request.headers.get('User-Agent', 'unknown')[:100]}",
             extra={'reqid': context.reqid, 'uid': context.uid}
         )
@@ -66,6 +84,7 @@ async def search(request: SearchRequest, http_request: Request):
         # Execute search with context (using backend defaults from config)
         result = searcher.search(
             query=request.query,
+            tenant_id=tenant_id,
             size=request.size,
             from_=request.from_,
             filters=request.filters,
@@ -83,12 +102,14 @@ async def search(request: SearchRequest, http_request: Request):
 
         # Convert to response model
         return SearchResponse(
-            hits=result.hits,
+            results=result.results,
            total=result.total,
            max_score=result.max_score,
            took_ms=result.took_ms,
            facets=result.facets,
            query_info=result.query_info,
+            suggestions=result.suggestions,
+            related_searches=result.related_searches,
            performance_info=performance_summary,
            debug_info=result.debug_info
        )
@@ -110,13 +131,30 @@ async def search(request: SearchRequest, http_request: Request):
 @router.post("/image", response_model=SearchResponse)
 async def search_by_image(request: ImageSearchRequest, http_request: Request):
     """
-    Search by image similarity (refactored).
+    Search by image similarity (external-friendly format).
 
     Uses image embeddings to find visually similar products.
     Supports exact match filters and range filters.
+
+    Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
     """
     reqid, uid = extract_request_info(http_request)
 
+    # Extract tenant_id (required)
+    tenant_id = http_request.headers.get('X-Tenant-ID')
+    if not tenant_id:
+        from urllib.parse import parse_qs
+        query_string = http_request.url.query
+        if query_string:
+            params = parse_qs(query_string)
+            tenant_id = params.get('tenant_id', [None])[0]
+
+    if not tenant_id:
+        raise HTTPException(
+            status_code=400,
+            detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
+        )
+
     # Create request context
     context = create_request_context(reqid=reqid, uid=uid)
 
@@ -126,7 +164,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
     try:
         # Log request start
         context.logger.info(
-            f"Image search request received | Image URL: {request.image_url} | "
+            f"Image search request received | Tenant: {tenant_id} | Image URL: {request.image_url} | "
             f"IP: {http_request.client.host if http_request.client else 'unknown'}",
             extra={'reqid': context.reqid, 'uid': context.uid}
         )
@@ -137,6 +175,7 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
         # Execute image search
         result = searcher.search_by_image(
             image_url=request.image_url,
+            tenant_id=tenant_id,
             size=request.size,
             filters=request.filters,
             range_filters=request.range_filters
@@ -146,12 +185,14 @@ async def search_by_image(request: ImageSearchRequest, http_request: Request):
         performance_summary = context.get_summary() if context else None
 
         return SearchResponse(
-            hits=result.hits,
+            results=result.results,
             total=result.total,
             max_score=result.max_score,
             took_ms=result.took_ms,
             facets=result.facets,
             query_info=result.query_info,
+            suggestions=result.suggestions,
+            related_searches=result.related_searches,
             performance_info=performance_summary
         )
 
@@ -226,7 +267,8 @@ async def search_suggestions(
 @router.get("/instant", response_model=SearchResponse)
 async def instant_search(
     q: str = Query(..., min_length=2, description="Search query"),
-    size: int = Query(5, ge=1, le=20, description="Number of results")
+    size: int = Query(5, ge=1, le=20, description="Number of results"),
+    tenant_id: str = Query(..., description="Tenant ID")
 ):
     """
     Instant search.
@@ -246,17 +288,20 @@ async def instant_search(
 
     result = searcher.search(
         query=q,
+        tenant_id=tenant_id,
         size=size,
         from_=0
     )
 
     return SearchResponse(
-        hits=result.hits,
+        results=result.results,
         total=result.total,
         max_score=result.max_score,
         took_ms=result.took_ms,
         facets=result.facets,
-        query_info=result.query_info
+        query_info=result.query_info,
+        suggestions=result.suggestions,
+        related_searches=result.related_searches
    )
 
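The header-or-query tenant extraction block is now duplicated across the text and image routes; a hedged sketch of how it could be folded into a single FastAPI dependency (the helper name and wiring are suggestions, not part of this commit):

```python
from typing import Optional
from fastapi import Depends, HTTPException, Query, Request

# Hypothetical helper, not in this commit: one place for tenant extraction.
async def require_tenant_id(
    request: Request,
    tenant_id: Optional[str] = Query(None, description="Tenant ID"),
) -> str:
    tenant = request.headers.get("X-Tenant-ID") or tenant_id
    if not tenant:
        raise HTTPException(
            status_code=400,
            detail="tenant_id is required. Provide it via header 'X-Tenant-ID' "
                   "or query parameter 'tenant_id'",
        )
    return tenant

# Usage sketch: routes would declare the dependency instead of inlining parsing.
# async def search(request: SearchRequest, tenant_id: str = Depends(require_tenant_id)): ...
```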
config/config_loader.py
@@ -90,9 +90,6 @@ class CustomerConfig:
     customer_id: str
     customer_name: str
 
-    # Database settings
-    mysql_config: Dict[str, Any]
-
     # Field definitions
     fields: List[FieldConfig]
 
@@ -116,10 +113,6 @@ class CustomerConfig:
 
     # ES index settings
     es_index_name: str
-
-    # Optional fields with defaults
-    main_table: str = "shoplazza_product_sku"
-    extension_table: Optional[str] = None
     es_settings: Dict[str, Any] = field(default_factory=dict)
 
@@ -272,9 +265,6 @@ class ConfigLoader:
         return CustomerConfig(
             customer_id=customer_id,
             customer_name=config_data.get("customer_name", customer_id),
-            mysql_config=config_data.get("mysql_config", {}),
-            main_table=config_data.get("main_table", "shoplazza_product_sku"),
-            extension_table=config_data.get("extension_table"),
             fields=fields,
             indexes=indexes,
             query_config=query_config,
@@ -310,8 +300,6 @@ class ConfigLoader:
         return FieldConfig(
             name=name,
             field_type=field_type,
-            source_table=field_data.get("source_table"),
-            source_column=field_data.get("source_column", name),
             analyzer=analyzer,
             search_analyzer=search_analyzer,
             required=field_data.get("required", False),
@@ -426,15 +414,17 @@ class ConfigLoader:
             if field.embedding_similarity not in ["dot_product", "cosine", "l2_norm"]:
                 errors.append(f"Field '{field.name}': invalid embedding_similarity")
 
-        # Validate MySQL config
-        if "host" not in config.mysql_config:
-            errors.append("MySQL configuration missing 'host'")
-        if "username" not in config.mysql_config:
-            errors.append("MySQL configuration missing 'username'")
-        if "password" not in config.mysql_config:
-            errors.append("MySQL configuration missing 'password'")
-        if "database" not in config.mysql_config:
-            errors.append("MySQL configuration missing 'database'")
+        # Validate tenant_id field (required)
+        tenant_id_field = None
+        for field in config.fields:
+            if field.name == "tenant_id":
+                tenant_id_field = field
+                break
+
+        if not tenant_id_field:
+            errors.append("Required field 'tenant_id' not found in fields")
+        elif not tenant_id_field.required:
+            errors.append("Field 'tenant_id' must be marked as required")
 
         return errors
 
@@ -457,9 +447,6 @@ class ConfigLoader:
         # Convert config back to dictionary format
         config_dict = {
             "customer_name": config.customer_name,
-            "mysql_config": config.mysql_config,
-            "main_table": config.main_table,
-            "extension_table": config.extension_table,
             "es_index_name": config.es_index_name,
             "es_settings": config.es_settings,
             "fields": [self._field_to_dict(field) for field in config.fields],
@@ -478,6 +465,16 @@ class ConfigLoader:
                 "expression": config.ranking.expression,
                 "description": config.ranking.description
             },
+            "function_score": {
+                "score_mode": config.function_score.score_mode,
+                "boost_mode": config.function_score.boost_mode,
+                "functions": config.function_score.functions
+            },
+            "rerank": {
+                "enabled": config.rerank.enabled,
+                "expression": config.rerank.expression,
+                "description": config.rerank.description
+            },
             "spu_config": {
                 "enabled": config.spu_config.enabled,
                 "spu_field": config.spu_config.spu_field,
@@ -512,8 +509,6 @@ class ConfigLoader:
         result = {
             "name": field.name,
             "type": field.field_type.value,
-            "source_table": field.source_table,
-            "source_column": field.source_column,
             "required": field.required,
             "boost": field.boost,
             "store": field.store,
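The new validation rule is easiest to see in isolation; a standalone mirror of the check (illustrative only, assuming FieldConfig objects with .name and .required as defined in config/field_types.py, not the loader's real entry point):

```python
# Illustrative stand-in for the loader's tenant_id validation.
def check_tenant_field(fields) -> list:
    tenant = next((f for f in fields if f.name == "tenant_id"), None)
    if tenant is None:
        return ["Required field 'tenant_id' not found in fields"]
    if not tenant.required:
        return ["Field 'tenant_id' must be marked as required"]
    return []
```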
config/field_types.py
@@ -54,8 +54,6 @@ class FieldConfig:
     """Configuration for a single field."""
     name: str
     field_type: FieldType
-    source_table: Optional[str] = None  # 'main' or 'extension' or specific table name
-    source_column: Optional[str] = None
     analyzer: Optional[AnalyzerType] = None
     search_analyzer: Optional[AnalyzerType] = None
     required: bool = False
@@ -172,10 +170,34 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
         }
 
     elif field_config.field_type == FieldType.JSON:
-        mapping = {
-            "type": "object",
-            "enabled": True
-        }
+        if field_config.nested and field_config.nested_properties:
+            # Nested type with properties (e.g., variants)
+            mapping = {
+                "type": "nested",
+                "properties": {}
+            }
+            # Generate mappings for nested properties
+            for prop_name, prop_config in field_config.nested_properties.items():
+                prop_type = prop_config.get("type", "keyword")
+                prop_mapping = {"type": prop_type}
+
+                # Add analyzer for text fields
+                if prop_type == "text" and "analyzer" in prop_config:
+                    prop_mapping["analyzer"] = prop_config["analyzer"]
+
+                # Add other properties
+                if "index" in prop_config:
+                    prop_mapping["index"] = prop_config["index"]
+                if "store" in prop_config:
+                    prop_mapping["store"] = prop_config["store"]
+
+                mapping["properties"][prop_name] = prop_mapping
+        else:
+            # Simple object type
+            mapping = {
+                "type": "object",
+                "enabled": True
+            }
 
     return mapping
 
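Fed the variants definition from the base config (next file), the new nested branch produces a mapping along these lines; note that keys other than type/analyzer/index/store, such as the enabled flag on options, are currently dropped by the generator:

```python
# Expected output of get_es_mapping_for_field for the "variants" field,
# trimmed to a few sub-fields for brevity.
variants_mapping = {
    "type": "nested",
    "properties": {
        "variant_id": {"type": "keyword", "index": True, "store": True},
        "title": {"type": "text", "analyzer": "chinese_ecommerce",
                  "index": True, "store": True},
        "price": {"type": "float", "index": True, "store": True},
        "options": {"type": "object"},  # 'enabled: true' is not carried over
    },
}
```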
config/schema/base/config.yaml
@@ -0,0 +1,271 @@
+# Base Configuration for Shoplazza
+# Shared Shoplazza configuration, used by every customer on the Shoplazza tables
+# Note: this config contains no MySQL settings, only ES search settings
+
+customer_name: "Shoplazza Base Configuration"
+
+# Elasticsearch Index
+es_index_name: "search_products"
+
+# ES Index Settings
+es_settings:
+  number_of_shards: 1
+  number_of_replicas: 0
+  refresh_interval: "30s"
+
+# Field Definitions (SPU level; only fields useful for search)
+fields:
+  # Tenant isolation field (required)
+  - name: "tenant_id"
+    type: "KEYWORD"
+    required: true
+    index: true
+    store: true
+
+  # Product identity fields
+  - name: "product_id"
+    type: "KEYWORD"
+    required: true
+    index: true
+    store: true
+
+  - name: "handle"
+    type: "KEYWORD"
+    index: true
+    store: true
+
+  # Full-text search fields
+  - name: "title"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 3.0
+    index: true
+    store: true
+
+  - name: "brief"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+    index: true
+    store: true
+
+  - name: "description"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.0
+    index: true
+    store: true
+
+  # SEO fields (improve relevance)
+  - name: "seo_title"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 2.0
+    index: true
+    store: true
+
+  - name: "seo_description"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+    index: true
+    store: true
+
+  - name: "seo_keywords"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 2.0
+    index: true
+    store: true
+
+  # Classification and tag fields (dual TEXT + KEYWORD indexing)
+  - name: "vendor"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+    index: true
+    store: true
+
+  - name: "vendor_keyword"
+    type: "KEYWORD"
+    index: true
+    store: false
+
+  - name: "product_type"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+    index: true
+    store: true
+
+  - name: "product_type_keyword"
+    type: "KEYWORD"
+    index: true
+    store: false
+
+  - name: "tags"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.0
+    index: true
+    store: true
+
+  - name: "tags_keyword"
+    type: "KEYWORD"
+    index: true
+    store: false
+
+  - name: "category"
+    type: "TEXT"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+    index: true
+    store: true
+
+  - name: "category_keyword"
+    type: "KEYWORD"
+    index: true
+    store: false
+
+  # Price fields (flattened)
+  - name: "min_price"
+    type: "FLOAT"
+    index: true
+    store: true
+
+  - name: "max_price"
+    type: "FLOAT"
+    index: true
+    store: true
+
+  - name: "compare_at_price"
+    type: "FLOAT"
+    index: true
+    store: true
+
+  # Image field (display only, not searched)
+  - name: "image_url"
+    type: "KEYWORD"
+    index: false
+    store: true
+
+  # Nested variants field
+  - name: "variants"
+    type: "JSON"
+    nested: true
+    nested_properties:
+      variant_id:
+        type: "keyword"
+        index: true
+        store: true
+      title:
+        type: "text"
+        analyzer: "chinese_ecommerce"
+        index: true
+        store: true
+      price:
+        type: "float"
+        index: true
+        store: true
+      compare_at_price:
+        type: "float"
+        index: true
+        store: true
+      sku:
+        type: "keyword"
+        index: true
+        store: true
+      stock:
+        type: "long"
+        index: true
+        store: true
+      options:
+        type: "object"
+        enabled: true
+
+# Index Structure (Query Domains)
+indexes:
+  - name: "default"
+    label: "Default index"
+    fields:
+      - "title"
+      - "brief"
+      - "description"
+      - "seo_title"
+      - "seo_description"
+      - "seo_keywords"
+      - "vendor"
+      - "product_type"
+      - "tags"
+      - "category"
+    analyzer: "chinese_ecommerce"
+    boost: 1.0
+
+  - name: "title"
+    label: "Title index"
+    fields:
+      - "title"
+      - "seo_title"
+    analyzer: "chinese_ecommerce"
+    boost: 2.0
+
+  - name: "vendor"
+    label: "Brand index"
+    fields:
+      - "vendor"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+
+  - name: "category"
+    label: "Category index"
+    fields:
+      - "category"
+    analyzer: "chinese_ecommerce"
+    boost: 1.5
+
+  - name: "tags"
+    label: "Tag index"
+    fields:
+      - "tags"
+      - "seo_keywords"
+    analyzer: "chinese_ecommerce"
+    boost: 1.0
+
+# Query Configuration
+query_config:
+  supported_languages:
+    - "zh"
+    - "en"
+  default_language: "zh"
+  enable_translation: true
+  enable_text_embedding: true
+  enable_query_rewrite: true
+
+  # Translation API (DeepL)
+  translation_service: "deepl"
+  translation_api_key: null  # Set via environment variable
+
+# Ranking Configuration
+ranking:
+  expression: "bm25() + 0.2*text_embedding_relevance()"
+  description: "BM25 text relevance combined with semantic embedding similarity"
+
+# Function Score configuration (ES-level scoring rules)
+function_score:
+  score_mode: "sum"
+  boost_mode: "multiply"
+
+  functions: []
+
+# Rerank configuration (local reranking, currently disabled)
+rerank:
+  enabled: false
+  expression: ""
+  description: "Local reranking (disabled, use ES function_score instead)"
+
+# SPU configuration (enabled; uses nested variants)
+spu_config:
+  enabled: true
+  spu_field: "product_id"
+  inner_hits_size: 10
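To make spu_config concrete: with nested variants and inner_hits, a variant-level match can surface the matching SKUs inside each SPU hit. A hedged sketch of such a query (the searcher's real query builder is not shown in this commit, so the exact clause shape is an assumption):

```python
# Hedged sketch: nested query whose inner_hits size mirrors
# spu_config.inner_hits_size from the base config above.
nested_query = {
    "bool": {
        "filter": [{"term": {"tenant_id": "1"}}],
        "must": [{
            "nested": {
                "path": "variants",
                "query": {"match": {"variants.title": "黑色"}},
                "inner_hits": {"size": 10},  # from spu_config.inner_hits_size
            }
        }],
    }
}
```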
BASE_CONFIG_GUIDE.md
@@ -0,0 +1,257 @@
+# Base Configuration Guide
+
+Usage guide for the shared Shoplazza base configuration
+
+## Overview
+
+The base configuration is the shared Shoplazza configuration for all customers using the standard Shoplazza tables. It adopts an SPU-level index structure: all customers share a single Elasticsearch index (`search_products`), with data isolation provided by the `tenant_id` field.
+
+## Core Features
+
+- **SPU-level index**: each ES document represents one SPU and contains a nested variants array
+- **Unified index**: all customers share the `search_products` index
+- **Tenant isolation**: data is isolated via the `tenant_id` field
+- **Simplified configuration**: the config covers only ES search settings, with no MySQL data-source settings
+- **External-friendly format**: API responses contain no ES internal fields (`_id`, `_score`, `_source`)
+
+## Configuration
+
+### File location
+
+`config/schema/base/config.yaml`
+
+### Contents
+
+The base config does **not** contain:
+- `mysql_config` - MySQL database settings
+- `main_table` - main table setting
+- `extension_table` - extension table setting
+- `source_table` / `source_column` - per-field data-source mappings
+
+The base config contains **only**:
+- ES field definitions (field types, analyzers, boosts, etc.)
+- Query domain (indexes) configuration
+- Query processing configuration (query_config)
+- Ranking and scoring configuration (function_score)
+- SPU configuration (spu_config)
+
+### Required fields
+
+- `tenant_id` (KEYWORD, required) - tenant isolation field
+
+### Main fields
+
+- `product_id` - product ID
+- `title`, `brief`, `description` - full-text search fields
+- `seo_title`, `seo_description`, `seo_keywords` - SEO fields
+- `vendor`, `product_type`, `tags`, `category` - classification and tag fields
+- `min_price`, `max_price`, `compare_at_price` - price fields
+- `variants` (nested) - nested variant array
+
+## Data Ingestion Flow
+
+### 1. Generate test data
+
+```bash
+python scripts/generate_test_data.py \
+    --num-spus 100 \
+    --tenant-id "1" \
+    --start-spu-id 1 \
+    --start-sku-id 1 \
+    --output test_data.sql
+```
+
+### 2. Import test data into MySQL
+
+```bash
+python scripts/import_test_data.py \
+    --db-host localhost \
+    --db-port 3306 \
+    --db-database saas \
+    --db-username root \
+    --db-password password \
+    --sql-file test_data.sql \
+    --tenant-id "1"
+```
+
+### 3. Ingest data into Elasticsearch
+
+```bash
+python scripts/ingest_shoplazza.py \
+    --db-host localhost \
+    --db-port 3306 \
+    --db-database saas \
+    --db-username root \
+    --db-password password \
+    --tenant-id "1" \
+    --config base \
+    --es-host http://localhost:9200 \
+    --recreate \
+    --batch-size 500
+```
+
+## API Usage
+
+### Search endpoint
+
+**Endpoint**: `POST /search/`
+
+**Request headers**:
+```
+X-Tenant-ID: 1
+Content-Type: application/json
+```
+
+**Request body**:
+```json
+{
+  "query": "耳机",
+  "size": 10,
+  "from": 0,
+  "filters": {
+    "category_keyword": "电子产品"
+  },
+  "facets": ["category_keyword", "vendor_keyword"]
+}
+```
+
+**Response format**:
+```json
+{
+  "results": [
+    {
+      "product_id": "1",
+      "title": "蓝牙耳机 Sony",
+      "handle": "product-1",
+      "description": "高品质无线蓝牙耳机",
+      "vendor": "Sony",
+      "product_type": "电子产品",
+      "price": 199.99,
+      "compare_at_price": 299.99,
+      "currency": "USD",
+      "image_url": "//cdn.example.com/products/1.jpg",
+      "in_stock": true,
+      "variants": [
+        {
+          "variant_id": "1",
+          "title": "黑色",
+          "price": 199.99,
+          "compare_at_price": 299.99,
+          "sku": "SKU-1-1",
+          "stock": 50,
+          "options": {
+            "option1": "黑色"
+          }
+        }
+      ],
+      "relevance_score": 0.95
+    }
+  ],
+  "total": 10,
+  "max_score": 1.0,
+  "facets": [
+    {
+      "field": "category_keyword",
+      "label": "category_keyword",
+      "type": "terms",
+      "values": [
+        {
+          "value": "电子产品",
+          "label": "电子产品",
+          "count": 5,
+          "selected": false
+        }
+      ]
+    }
+  ],
+  "suggestions": [],
+  "related_searches": [],
+  "took_ms": 15,
+  "query_info": {}
+}
+```
+
+### Response format notes
+
+#### Main changes
+
+1. **`results` replaces `hits`**: the response field is renamed from `hits` to `results`
+2. **Structured results**: each result carries fields such as `product_id`, `title`, `variants`, `relevance_score`
+3. **No ES internal fields**: `_id`, `_score`, `_source`, etc. are not exposed
+4. **Nested variants**: each product carries a variants array with full variant information
+5. **Relevance score**: `relevance_score` is a normalized score between 0 and 1
+
+#### ProductResult fields
+
+- `product_id` - product ID
+- `title` - product title
+- `handle` - product handle
+- `description` - product description
+- `vendor` - vendor/brand
+- `product_type` - product type
+- `tags` - tags
+- `price` - lowest price (min_price)
+- `compare_at_price` - original price
+- `currency` - currency (defaults to USD)
+- `image_url` - main image URL
+- `in_stock` - whether in stock
+- `variants` - variant list
+- `relevance_score` - relevance score (0-1)
+
+#### VariantResult fields
+
+- `variant_id` - variant ID
+- `title` - variant title
+- `price` - price
+- `compare_at_price` - original price
+- `sku` - SKU code
+- `stock` - stock quantity
+- `options` - options (color, size, etc.)
+
+## Testing
+
+### Run the test script
+
+```bash
+python scripts/test_base.py \
+    --api-url http://localhost:8000 \
+    --tenant-id "1" \
+    --test-tenant-2 "2"
+```
+
+### Test coverage
+
+1. **Basic search**: exercises the core search API
+2. **Response format validation**: verifies the response matches the agreed format
+3. **Facet aggregation**: exercises faceted search
+4. **Tenant isolation**: verifies data isolation between tenants
+
+## FAQ
+
+### Q: Why is there no MySQL configuration in the config?
+
+A: Data-source settings and the ingestion flow live in hard-coded scripts, not in the search config. The search config covers only ES search concerns.
+
+### Q: How do I ingest data for a new tenant?
+
+A: Run the `ingest_shoplazza.py` script with a different `--tenant-id` argument.
+
+### Q: How do I verify that tenant isolation works?
+
+A: Run the `test_base.py` script with two different `--tenant-id` values and check that the search results are isolated.
+
+### Q: Why are `_id` and `_score` missing from the API response?
+
+A: To keep the API format external-friendly, ES internal fields were removed in favor of `product_id` and `relevance_score`.
+
+### Q: How do I add a new search field?
+
+A: Add the field definition in `config/schema/base/config.yaml`, then regenerate the index mapping and re-ingest the data.
+
+## Notes
+
+1. **tenant_id is required**: every API request must carry a `tenant_id` (via the `X-Tenant-ID` header or the `tenant_id` query parameter)
+2. **Shared index**: all customers share the `search_products` index; make sure the `tenant_id` field is set correctly
+3. **Data ingestion**: the ingestion scripts are hard-coded and do not depend on MySQL settings in the config
+4. **Config separation**: search configuration and data-source configuration are fully separated for maintainability
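As a client-side illustration of the endpoint and headers documented above (the host and port mirror the test script's --api-url default, and the availability of the requests package is an assumption):

```python
# Hypothetical client call against the documented POST /search/ endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/search/",
    headers={"X-Tenant-ID": "1"},
    json={"query": "耳机", "size": 10, "from": 0,
          "facets": ["category_keyword", "vendor_keyword"]},
)
body = resp.json()
print(body["total"], [r["product_id"] for r in body["results"]])
```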
| @@ -0,0 +1,288 @@ | @@ -0,0 +1,288 @@ | ||
| 1 | +""" | ||
| 2 | +SPU data transformer for Shoplazza products. | ||
| 3 | + | ||
| 4 | +Transforms SPU and SKU data from MySQL into SPU-level ES documents with nested variants. | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +import pandas as pd | ||
| 8 | +import numpy as np | ||
| 9 | +from typing import Dict, Any, List, Optional | ||
| 10 | +from sqlalchemy import create_engine, text | ||
| 11 | +from utils.db_connector import create_db_connection | ||
| 12 | + | ||
| 13 | + | ||
| 14 | +class SPUTransformer: | ||
| 15 | + """Transform SPU and SKU data into SPU-level ES documents.""" | ||
| 16 | + | ||
| 17 | + def __init__( | ||
| 18 | + self, | ||
| 19 | + db_engine: Any, | ||
| 20 | + tenant_id: str | ||
| 21 | + ): | ||
| 22 | + """ | ||
| 23 | + Initialize SPU transformer. | ||
| 24 | + | ||
| 25 | + Args: | ||
| 26 | + db_engine: SQLAlchemy database engine | ||
| 27 | + tenant_id: Tenant ID for filtering data | ||
| 28 | + """ | ||
| 29 | + self.db_engine = db_engine | ||
| 30 | + self.tenant_id = tenant_id | ||
| 31 | + | ||
| 32 | + def load_spu_data(self) -> pd.DataFrame: | ||
| 33 | + """ | ||
| 34 | + Load SPU data from MySQL. | ||
| 35 | + | ||
| 36 | + Returns: | ||
| 37 | + DataFrame with SPU data | ||
| 38 | + """ | ||
| 39 | + query = text(""" | ||
| 40 | + SELECT | ||
| 41 | + id, shop_id, shoplazza_id, handle, title, brief, description, | ||
| 42 | + spu, vendor, vendor_url, seo_title, seo_description, seo_keywords, | ||
| 43 | + image_src, image_width, image_height, image_path, image_alt, | ||
| 44 | + tags, note, category, | ||
| 45 | + shoplazza_created_at, shoplazza_updated_at, tenant_id, | ||
| 46 | + creator, create_time, updater, update_time, deleted | ||
| 47 | + FROM shoplazza_product_spu | ||
| 48 | + WHERE tenant_id = :tenant_id AND deleted = 0 | ||
| 49 | + """) | ||
| 50 | + | ||
| 51 | + with self.db_engine.connect() as conn: | ||
| 52 | + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id}) | ||
| 53 | + | ||
| 54 | + return df | ||
| 55 | + | ||
| 56 | + def load_sku_data(self) -> pd.DataFrame: | ||
| 57 | + """ | ||
| 58 | + Load SKU data from MySQL. | ||
| 59 | + | ||
| 60 | + Returns: | ||
| 61 | + DataFrame with SKU data | ||
| 62 | + """ | ||
| 63 | + query = text(""" | ||
| 64 | + SELECT | ||
| 65 | + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, | ||
| 66 | + shoplazza_image_id, title, sku, barcode, position, | ||
| 67 | + price, compare_at_price, cost_price, | ||
| 68 | + option1, option2, option3, | ||
| 69 | + inventory_quantity, weight, weight_unit, image_src, | ||
| 70 | + wholesale_price, note, extend, | ||
| 71 | + shoplazza_created_at, shoplazza_updated_at, tenant_id, | ||
| 72 | + creator, create_time, updater, update_time, deleted | ||
| 73 | + FROM shoplazza_product_sku | ||
| 74 | + WHERE tenant_id = :tenant_id AND deleted = 0 | ||
| 75 | + """) | ||
| 76 | + | ||
| 77 | + with self.db_engine.connect() as conn: | ||
| 78 | + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id}) | ||
| 79 | + | ||
| 80 | + return df | ||
| 81 | + | ||
| 82 | + def transform_batch(self) -> List[Dict[str, Any]]: | ||
| 83 | + """ | ||
| 84 | + Transform SPU and SKU data into ES documents. | ||
| 85 | + | ||
| 86 | + Returns: | ||
| 87 | + List of SPU-level ES documents | ||
| 88 | + """ | ||
| 89 | + # Load data | ||
| 90 | + spu_df = self.load_spu_data() | ||
| 91 | + sku_df = self.load_sku_data() | ||
| 92 | + | ||
| 93 | + if spu_df.empty: | ||
| 94 | + return [] | ||
| 95 | + | ||
| 96 | + # Group SKUs by SPU | ||
| 97 | + sku_groups = sku_df.groupby('spu_id') | ||
| 98 | + | ||
| 99 | + documents = [] | ||
| 100 | + for _, spu_row in spu_df.iterrows(): | ||
| 101 | + spu_id = spu_row['id'] | ||
| 102 | + | ||
| 103 | + # Get SKUs for this SPU | ||
| 104 | + skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame() | ||
| 105 | + | ||
| 106 | + # Transform to ES document | ||
| 107 | + doc = self._transform_spu_to_doc(spu_row, skus) | ||
| 108 | + if doc: | ||
| 109 | + documents.append(doc) | ||
| 110 | + | ||
| 111 | + return documents | ||
| 112 | + | ||
| 113 | + def _transform_spu_to_doc( | ||
| 114 | + self, | ||
| 115 | + spu_row: pd.Series, | ||
| 116 | + skus: pd.DataFrame | ||
| 117 | + ) -> Optional[Dict[str, Any]]: | ||
| 118 | + """ | ||
| 119 | + Transform a single SPU row and its SKUs into an ES document. | ||
| 120 | + | ||
| 121 | + Args: | ||
| 122 | + spu_row: SPU row from database | ||
| 123 | + skus: DataFrame with SKUs for this SPU | ||
| 124 | + | ||
| 125 | + Returns: | ||
| 126 | + ES document or None if transformation fails | ||
| 127 | + """ | ||
| 128 | + doc = {} | ||
| 129 | + | ||
| 130 | + # Tenant ID (required) | ||
| 131 | + doc['tenant_id'] = str(self.tenant_id) | ||
| 132 | + | ||
| 133 | + # Product ID | ||
| 134 | + doc['product_id'] = str(spu_row['id']) | ||
| 135 | + | ||
| 136 | + # Handle | ||
| 137 | + if pd.notna(spu_row.get('handle')): | ||
| 138 | + doc['handle'] = str(spu_row['handle']) | ||
| 139 | + | ||
| 140 | + # Title | ||
| 141 | + if pd.notna(spu_row.get('title')): | ||
| 142 | + doc['title'] = str(spu_row['title']) | ||
| 143 | + | ||
| 144 | + # Brief | ||
| 145 | + if pd.notna(spu_row.get('brief')): | ||
| 146 | + doc['brief'] = str(spu_row['brief']) | ||
| 147 | + | ||
| 148 | + # Description | ||
| 149 | + if pd.notna(spu_row.get('description')): | ||
| 150 | + doc['description'] = str(spu_row['description']) | ||
| 151 | + | ||
| 152 | + # SEO fields | ||
| 153 | + if pd.notna(spu_row.get('seo_title')): | ||
| 154 | + doc['seo_title'] = str(spu_row['seo_title']) | ||
| 155 | + if pd.notna(spu_row.get('seo_description')): | ||
| 156 | + doc['seo_description'] = str(spu_row['seo_description']) | ||
| 157 | + if pd.notna(spu_row.get('seo_keywords')): | ||
| 158 | + doc['seo_keywords'] = str(spu_row['seo_keywords']) | ||
| 159 | + | ||
| 160 | + # Vendor | ||
| 161 | + if pd.notna(spu_row.get('vendor')): | ||
| 162 | + doc['vendor'] = str(spu_row['vendor']) | ||
| 163 | + doc['vendor_keyword'] = str(spu_row['vendor']) | ||
| 164 | + | ||
| 165 | + # Product type (from category or tags) | ||
| 166 | + if pd.notna(spu_row.get('category')): | ||
| 167 | + doc['product_type'] = str(spu_row['category']) | ||
| 168 | + doc['product_type_keyword'] = str(spu_row['category']) | ||
| 169 | + | ||
| 170 | + # Tags | ||
| 171 | + if pd.notna(spu_row.get('tags')): | ||
| 172 | + tags_str = str(spu_row['tags']) | ||
| 173 | + doc['tags'] = tags_str | ||
| 174 | + doc['tags_keyword'] = tags_str | ||
| 175 | + | ||
| 176 | + # Category | ||
| 177 | + if pd.notna(spu_row.get('category')): | ||
| 178 | + doc['category'] = str(spu_row['category']) | ||
| 179 | + doc['category_keyword'] = str(spu_row['category']) | ||
| 180 | + | ||
| 181 | + # Image URL | ||
| 182 | + if pd.notna(spu_row.get('image_src')): | ||
| 183 | + image_src = str(spu_row['image_src']) | ||
| 184 | + if not image_src.startswith('http'): | ||
| 185 | + image_src = f"//{image_src}" if image_src.startswith('//') else image_src | ||
| 186 | + doc['image_url'] = image_src | ||
| 187 | + | ||
| 188 | + # Process variants | ||
| 189 | + variants = [] | ||
| 190 | + prices = [] | ||
| 191 | + compare_prices = [] | ||
| 192 | + | ||
| 193 | + for _, sku_row in skus.iterrows(): | ||
| 194 | + variant = self._transform_sku_to_variant(sku_row) | ||
| 195 | + if variant: | ||
| 196 | + variants.append(variant) | ||
| 197 | + if 'price' in variant and variant['price'] is not None: | ||
| 198 | + try: | ||
| 199 | + prices.append(float(variant['price'])) | ||
| 200 | + except (ValueError, TypeError): | ||
| 201 | + pass | ||
| 202 | + if 'compare_at_price' in variant and variant['compare_at_price'] is not None: | ||
| 203 | + try: | ||
| 204 | + compare_prices.append(float(variant['compare_at_price'])) | ||
| 205 | + except (ValueError, TypeError): | ||
| 206 | + pass | ||
| 207 | + | ||
| 208 | + doc['variants'] = variants | ||
| 209 | + | ||
| 210 | + # Calculate price ranges | ||
| 211 | + if prices: | ||
| 212 | + doc['min_price'] = float(min(prices)) | ||
| 213 | + doc['max_price'] = float(max(prices)) | ||
| 214 | + else: | ||
| 215 | + doc['min_price'] = 0.0 | ||
| 216 | + doc['max_price'] = 0.0 | ||
| 217 | + | ||
| 218 | + if compare_prices: | ||
| 219 | + doc['compare_at_price'] = float(max(compare_prices)) | ||
| 220 | + else: | ||
| 221 | + doc['compare_at_price'] = None | ||
| 222 | + | ||
| 223 | + return doc | ||
| 224 | + | ||
| 225 | + def _transform_sku_to_variant(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]: | ||
| 226 | + """ | ||
| 227 | + Transform a SKU row into a variant object. | ||
| 228 | + | ||
| 229 | + Args: | ||
| 230 | + sku_row: SKU row from database | ||
| 231 | + | ||
| 232 | + Returns: | ||
| 233 | + Variant dictionary or None | ||
| 234 | + """ | ||
| 235 | + variant = {} | ||
| 236 | + | ||
| 237 | + # Variant ID | ||
| 238 | + variant['variant_id'] = str(sku_row['id']) | ||
| 239 | + | ||
| 240 | + # Title | ||
| 241 | + if pd.notna(sku_row.get('title')): | ||
| 242 | + variant['title'] = str(sku_row['title']) | ||
| 243 | + | ||
| 244 | + # Price | ||
| 245 | + if pd.notna(sku_row.get('price')): | ||
| 246 | + try: | ||
| 247 | + variant['price'] = float(sku_row['price']) | ||
| 248 | + except (ValueError, TypeError): | ||
| 249 | + variant['price'] = None | ||
| 250 | + else: | ||
| 251 | + variant['price'] = None | ||
| 252 | + | ||
| 253 | + # Compare at price | ||
| 254 | + if pd.notna(sku_row.get('compare_at_price')): | ||
| 255 | + try: | ||
| 256 | + variant['compare_at_price'] = float(sku_row['compare_at_price']) | ||
| 257 | + except (ValueError, TypeError): | ||
| 258 | + variant['compare_at_price'] = None | ||
| 259 | + else: | ||
| 260 | + variant['compare_at_price'] = None | ||
| 261 | + | ||
| 262 | + # SKU | ||
| 263 | + if pd.notna(sku_row.get('sku')): | ||
| 264 | + variant['sku'] = str(sku_row['sku']) | ||
| 265 | + | ||
| 266 | + # Stock | ||
| 267 | + if pd.notna(sku_row.get('inventory_quantity')): | ||
| 268 | + try: | ||
| 269 | + variant['stock'] = int(sku_row['inventory_quantity']) | ||
| 270 | + except (ValueError, TypeError): | ||
| 271 | + variant['stock'] = 0 | ||
| 272 | + else: | ||
| 273 | + variant['stock'] = 0 | ||
| 274 | + | ||
| 275 | + # Options (from option1, option2, option3) | ||
| 276 | + options = {} | ||
| 277 | + if pd.notna(sku_row.get('option1')): | ||
| 278 | + options['option1'] = str(sku_row['option1']) | ||
| 279 | + if pd.notna(sku_row.get('option2')): | ||
| 280 | + options['option2'] = str(sku_row['option2']) | ||
| 281 | + if pd.notna(sku_row.get('option3')): | ||
| 282 | + options['option3'] = str(sku_row['option3']) | ||
| 283 | + | ||
| 284 | + if options: | ||
| 285 | + variant['options'] = options | ||
| 286 | + | ||
| 287 | + return variant | ||
| 288 | + |
| @@ -0,0 +1,325 @@ | @@ -0,0 +1,325 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +Generate test data for Shoplazza SPU and SKU tables. | ||
| 4 | + | ||
| 5 | +Generates 100 SPU records with 1-5 SKU variants each. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import sys | ||
| 9 | +import os | ||
| 10 | +import random | ||
| 11 | +import argparse | ||
| 12 | +from pathlib import Path | ||
| 13 | +from datetime import datetime, timedelta | ||
| 14 | + | ||
| 15 | +# Add parent directory to path | ||
| 16 | +sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 17 | + | ||
| 18 | + | ||
| 19 | +def generate_spu_data(num_spus: int = 100, tenant_id: str = "1", start_id: int = 1): | ||
| 20 | + """ | ||
| 21 | + Generate SPU test data. | ||
| 22 | + | ||
| 23 | + Args: | ||
| 24 | + num_spus: Number of SPUs to generate | ||
| 25 | + tenant_id: Tenant ID | ||
| 26 | + start_id: Starting ID for SPUs | ||
| 27 | + | ||
| 28 | + Returns: | ||
| 29 | + List of SPU data dictionaries | ||
| 30 | + """ | ||
| 31 | + categories = ["电子产品", "服装", "家居用品", "美妆", "食品", "运动用品", "图书", "玩具"] | ||
| 32 | + vendors = ["Sony", "Nike", "Apple", "Samsung", "华为", "小米", "美的", "海尔"] | ||
| 33 | + | ||
| 34 | + products = [ | ||
| 35 | + ("蓝牙耳机", "Bluetooth Headphone", "高品质无线蓝牙耳机", "High-quality wireless Bluetooth headphone"), | ||
| 36 | + ("运动鞋", "Running Shoes", "舒适透气的运动鞋", "Comfortable and breathable running shoes"), | ||
| 37 | + ("智能手机", "Smartphone", "高性能智能手机", "High-performance smartphone"), | ||
| 38 | + ("笔记本电脑", "Laptop", "轻薄便携笔记本电脑", "Lightweight and portable laptop"), | ||
| 39 | + ("智能手表", "Smart Watch", "多功能智能手表", "Multi-function smart watch"), | ||
| 40 | + ("平板电脑", "Tablet", "高清平板电脑", "High-definition tablet"), | ||
| 41 | + ("无线鼠标", "Wireless Mouse", "人体工学无线鼠标", "Ergonomic wireless mouse"), | ||
| 42 | + ("机械键盘", "Mechanical Keyboard", "RGB背光机械键盘", "RGB backlit mechanical keyboard"), | ||
| 43 | + ("显示器", "Monitor", "4K高清显示器", "4K high-definition monitor"), | ||
| 44 | + ("音响", "Speaker", "蓝牙无线音响", "Bluetooth wireless speaker"), | ||
| 45 | + ] | ||
| 46 | + | ||
| 47 | + spus = [] | ||
| 48 | + for i in range(num_spus): | ||
| 49 | + spu_id = start_id + i | ||
| 50 | + product = random.choice(products) | ||
| 51 | + category = random.choice(categories) | ||
| 52 | + vendor = random.choice(vendors) | ||
| 53 | + | ||
| 54 | + # Generate handle | ||
| 55 | + handle = f"product-{spu_id}" | ||
| 56 | + | ||
| 57 | + # Generate title (Chinese) | ||
| 58 | + title_zh = f"{product[0]} {vendor}" | ||
| 59 | + | ||
| 60 | + # Generate brief | ||
| 61 | + brief_zh = product[2] | ||
| 62 | + | ||
| 63 | + # Generate description | ||
| 64 | + description_zh = f"<p>{product[2]},来自{vendor}品牌。{product[3]}</p>" | ||
| 65 | + | ||
| 66 | + # Generate SEO fields | ||
| 67 | + seo_title = f"{title_zh} - {category}" | ||
| 68 | + seo_description = f"购买{vendor}{product[0]},{product[2]}" | ||
| 69 | + seo_keywords = f"{product[0]},{vendor},{category}" | ||
| 70 | + | ||
| 71 | + # Generate tags | ||
| 72 | + tags = f"{category},{vendor},{product[0]}" | ||
| 73 | + | ||
| 74 | + # Generate image | ||
| 75 | + image_src = f"//cdn.example.com/products/{spu_id}.jpg" | ||
| 76 | + | ||
| 77 | + # Generate dates | ||
| 78 | + created_at = datetime.now() - timedelta(days=random.randint(1, 365)) | ||
| 79 | + updated_at = created_at + timedelta(days=random.randint(0, 30)) | ||
| 80 | + | ||
| 81 | + spu = { | ||
| 82 | + 'id': spu_id, | ||
| 83 | + 'shop_id': 1, | ||
| 84 | + 'shoplazza_id': f"spu-{spu_id}", | ||
| 85 | + 'handle': handle, | ||
| 86 | + 'title': title_zh, | ||
| 87 | + 'brief': brief_zh, | ||
| 88 | + 'description': description_zh, | ||
| 89 | + 'spu': '', | ||
| 90 | + 'vendor': vendor, | ||
| 91 | + 'vendor_url': f"https://{vendor.lower()}.com", | ||
| 92 | + 'seo_title': seo_title, | ||
| 93 | + 'seo_description': seo_description, | ||
| 94 | + 'seo_keywords': seo_keywords, | ||
| 95 | + 'image_src': image_src, | ||
| 96 | + 'image_width': 800, | ||
| 97 | + 'image_height': 600, | ||
| 98 | + 'image_path': f"products/{spu_id}.jpg", | ||
| 99 | + 'image_alt': title_zh, | ||
| 100 | + 'inventory_policy': '', | ||
| 101 | + 'inventory_quantity': 0, | ||
| 102 | + 'inventory_tracking': '0', | ||
| 103 | + 'published': 1, | ||
| 104 | + 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 105 | + 'requires_shipping': 1, | ||
| 106 | + 'taxable': 0, | ||
| 107 | + 'fake_sales': 0, | ||
| 108 | + 'display_fake_sales': 0, | ||
| 109 | + 'mixed_wholesale': 0, | ||
| 110 | + 'need_variant_image': 0, | ||
| 111 | + 'has_only_default_variant': 0, | ||
| 112 | + 'tags': tags, | ||
| 113 | + 'note': '', | ||
| 114 | + 'category': category, | ||
| 115 | + 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 116 | + 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 117 | + 'tenant_id': tenant_id, | ||
| 118 | + 'creator': '1', | ||
| 119 | + 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 120 | + 'updater': '1', | ||
| 121 | + 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 122 | + 'deleted': 0 | ||
| 123 | + } | ||
| 124 | + spus.append(spu) | ||
| 125 | + | ||
| 126 | + return spus | ||
| 127 | + | ||
| 128 | + | ||
| 129 | +def generate_sku_data(spus: list, start_sku_id: int = 1): | ||
| 130 | + """ | ||
| 131 | + Generate SKU test data for SPUs. | ||
| 132 | + | ||
| 133 | + Args: | ||
| 134 | + spus: List of SPU data | ||
| 135 | + start_sku_id: Starting ID for SKUs | ||
| 136 | + | ||
| 137 | + Returns: | ||
| 138 | + List of SKU data dictionaries | ||
| 139 | + """ | ||
| 140 | + colors = ["黑色", "白色", "红色", "蓝色", "绿色", "灰色"] | ||
| 141 | + sizes = ["S", "M", "L", "XL", "XXL"] | ||
| 142 | + | ||
| 143 | + skus = [] | ||
| 144 | + sku_id = start_sku_id | ||
| 145 | + | ||
| 146 | + for spu in spus: | ||
| 147 | + spu_id = spu['id'] | ||
| 148 | + num_variants = random.randint(1, 5) | ||
| 149 | + | ||
| 150 | + # Base price | ||
| 151 | + base_price = random.uniform(50, 500) | ||
| 152 | + | ||
| 153 | + for i in range(num_variants): | ||
| 154 | + # Generate variant options | ||
| 155 | + color = random.choice(colors) if num_variants > 1 else None | ||
| 156 | + size = random.choice(sizes) if num_variants > 2 else None | ||
| 157 | + | ||
| 158 | + # Generate title | ||
| 159 | + title_parts = [] | ||
| 160 | + if color: | ||
| 161 | + title_parts.append(color) | ||
| 162 | + if size: | ||
| 163 | + title_parts.append(size) | ||
| 164 | + title = " / ".join(title_parts) if title_parts else "" | ||
| 165 | + | ||
| 166 | + # Generate SKU | ||
| 167 | + sku_code = f"SKU-{spu_id}-{i+1}" | ||
| 168 | + | ||
| 169 | + # Generate price (variation from base) | ||
| 170 | + price = base_price + random.uniform(-20, 50) | ||
| 171 | + compare_at_price = price * random.uniform(1.2, 1.5) | ||
| 172 | + | ||
| 173 | + # Generate stock | ||
| 174 | + stock = random.randint(0, 100) | ||
| 175 | + | ||
| 176 | + # Generate dates | ||
| 177 | + created_at = datetime.now() - timedelta(days=random.randint(1, 365)) | ||
| 178 | + updated_at = created_at + timedelta(days=random.randint(0, 30)) | ||
| 179 | + | ||
| 180 | + sku = { | ||
| 181 | + 'id': sku_id, | ||
| 182 | + 'spu_id': spu_id, | ||
| 183 | + 'shop_id': 1, | ||
| 184 | + 'shoplazza_id': f"sku-{sku_id}", | ||
| 185 | + 'shoplazza_product_id': spu['shoplazza_id'], | ||
| 186 | + 'shoplazza_image_id': '', | ||
| 187 | + 'title': title, | ||
| 188 | + 'sku': sku_code, | ||
| 189 | + 'barcode': f"BAR{sku_id:08d}", | ||
| 190 | + 'position': i + 1, | ||
| 191 | + 'price': round(price, 2), | ||
| 192 | + 'compare_at_price': round(compare_at_price, 2), | ||
| 193 | + 'cost_price': round(price * 0.6, 2), | ||
| 194 | + 'option1': color if color else '', | ||
| 195 | + 'option2': size if size else '', | ||
| 196 | + 'option3': '', | ||
| 197 | + 'inventory_quantity': stock, | ||
| 198 | + 'weight': round(random.uniform(0.1, 5.0), 2), | ||
| 199 | + 'weight_unit': 'kg', | ||
| 200 | + 'image_src': '', | ||
| 201 | + 'wholesale_price': '[{"price": ' + str(round(price * 0.8, 2)) + ', "minQuantity": 10}]', | ||
| 202 | + 'note': '', | ||
| 203 | + 'extend': '', | ||
| 204 | + 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 205 | + 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 206 | + 'tenant_id': spu['tenant_id'], | ||
| 207 | + 'creator': '1', | ||
| 208 | + 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 209 | + 'updater': '1', | ||
| 210 | + 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'), | ||
| 211 | + 'deleted': 0 | ||
| 212 | + } | ||
| 213 | + skus.append(sku) | ||
| 214 | + sku_id += 1 | ||
| 215 | + | ||
| 216 | + return skus | ||
| 217 | + | ||
| 218 | + | ||
| 219 | +def generate_sql_inserts(spus: list, skus: list, output_file: str): | ||
| 220 | + """ | ||
| 221 | + Generate SQL INSERT statements. | ||
| 222 | + | ||
| 223 | + Args: | ||
| 224 | + spus: List of SPU data | ||
| 225 | + skus: List of SKU data | ||
| 226 | + output_file: Output file path | ||
| 227 | + """ | ||
| 228 | + with open(output_file, 'w', encoding='utf-8') as f: | ||
| 229 | + f.write("-- SPU Test Data\n") | ||
| 230 | + f.write("INSERT INTO shoplazza_product_spu (\n") | ||
| 231 | + f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n") | ||
| 232 | + f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n") | ||
| 233 | + f.write(" image_src, image_width, image_height, image_path, image_alt,\n") | ||
| 234 | + f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n") | ||
| 235 | + f.write(" published, published_at, requires_shipping, taxable,\n") | ||
| 236 | + f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n") | ||
| 237 | + f.write(" has_only_default_variant, tags, note, category,\n") | ||
| 238 | + f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n") | ||
| 239 | + f.write(" creator, create_time, updater, update_time, deleted\n") | ||
| 240 | + f.write(") VALUES\n") | ||
| 241 | + | ||
| 242 | + for i, spu in enumerate(spus): | ||
| 243 | + values = ( | ||
| 244 | + f"({spu['id']}, {spu['shop_id']}, '{spu['shoplazza_id']}', " | ||
| 245 | + f"'{spu['handle']}', '{spu['title']}', '{spu['brief']}', " | ||
| 246 | + f"'{spu['description']}', '{spu['spu']}', '{spu['vendor']}', " | ||
| 247 | + f"'{spu['vendor_url']}', '{spu['seo_title']}', '{spu['seo_description']}', " | ||
| 248 | + f"'{spu['seo_keywords']}', '{spu['image_src']}', {spu['image_width']}, " | ||
| 249 | + f"{spu['image_height']}, '{spu['image_path']}', '{spu['image_alt']}', " | ||
| 250 | + f"'{spu['inventory_policy']}', {spu['inventory_quantity']}, " | ||
| 251 | + f"'{spu['inventory_tracking']}', {spu['published']}, " | ||
| 252 | + f"'{spu['published_at']}', {spu['requires_shipping']}, {spu['taxable']}, " | ||
| 253 | + f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, " | ||
| 254 | + f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, " | ||
| 255 | + f"'{spu['tags']}', '{spu['note']}', '{spu['category']}', " | ||
| 256 | + f"'{spu['shoplazza_created_at']}', '{spu['shoplazza_updated_at']}', " | ||
| 257 | + f"'{spu['tenant_id']}', '{spu['creator']}', '{spu['create_time']}', " | ||
| 258 | + f"'{spu['updater']}', '{spu['update_time']}', {spu['deleted']})" | ||
| 259 | + ) | ||
| 260 | + f.write(values) | ||
| 261 | + if i < len(spus) - 1: | ||
| 262 | + f.write(",\n") | ||
| 263 | + else: | ||
| 264 | + f.write(";\n\n") | ||
| 265 | + | ||
| 266 | + f.write("-- SKU Test Data\n") | ||
| 267 | + f.write("INSERT INTO shoplazza_product_sku (\n") | ||
| 268 | + f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n") | ||
| 269 | + f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n") | ||
| 270 | + f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n") | ||
| 271 | + f.write(" image_src, wholesale_price, note, extend,\n") | ||
| 272 | + f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n") | ||
| 273 | + f.write(" creator, create_time, updater, update_time, deleted\n") | ||
| 274 | + f.write(") VALUES\n") | ||
| 275 | + | ||
| 276 | + for i, sku in enumerate(skus): | ||
| 277 | + values = ( | ||
| 278 | + f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{sku['shoplazza_id']}', " | ||
| 279 | + f"'{sku['shoplazza_product_id']}', '{sku['shoplazza_image_id']}', " | ||
| 280 | + f"'{sku['title']}', '{sku['sku']}', '{sku['barcode']}', {sku['position']}, " | ||
| 281 | + f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, " | ||
| 282 | + f"'{sku['option1']}', '{sku['option2']}', '{sku['option3']}', " | ||
| 283 | + f"{sku['inventory_quantity']}, {sku['weight']}, '{sku['weight_unit']}', " | ||
| 284 | + f"'{sku['image_src']}', '{sku['wholesale_price']}', '{sku['note']}', " | ||
| 285 | + f"'{sku['extend']}', '{sku['shoplazza_created_at']}', " | ||
| 286 | + f"'{sku['shoplazza_updated_at']}', '{sku['tenant_id']}', " | ||
| 287 | + f"'{sku['creator']}', '{sku['create_time']}', '{sku['updater']}', " | ||
| 288 | + f"'{sku['update_time']}', {sku['deleted']})" | ||
| 289 | + ) | ||
| 290 | + f.write(values) | ||
| 291 | + if i < len(skus) - 1: | ||
| 292 | + f.write(",\n") | ||
| 293 | + else: | ||
| 294 | + f.write(";\n") | ||
| 295 | + | ||
| 296 | + | ||
| 297 | +def main(): | ||
| 298 | + parser = argparse.ArgumentParser(description='Generate test data for Shoplazza tables') | ||
| 299 | + parser.add_argument('--num-spus', type=int, default=100, help='Number of SPUs to generate') | ||
| 300 | + parser.add_argument('--tenant-id', default='1', help='Tenant ID') | ||
| 301 | + parser.add_argument('--start-spu-id', type=int, default=1, help='Starting SPU ID') | ||
| 302 | + parser.add_argument('--start-sku-id', type=int, default=1, help='Starting SKU ID') | ||
| 303 | + parser.add_argument('--output', default='test_data.sql', help='Output SQL file') | ||
| 304 | + | ||
| 305 | + args = parser.parse_args() | ||
| 306 | + | ||
| 307 | + print(f"Generating {args.num_spus} SPUs with variants...") | ||
| 308 | + | ||
| 309 | + # Generate SPU data | ||
| 310 | + spus = generate_spu_data(args.num_spus, args.tenant_id, args.start_spu_id) | ||
| 311 | + print(f"Generated {len(spus)} SPUs") | ||
| 312 | + | ||
| 313 | + # Generate SKU data | ||
| 314 | + skus = generate_sku_data(spus, args.start_sku_id) | ||
| 315 | + print(f"Generated {len(skus)} SKUs") | ||
| 316 | + | ||
| 317 | + # Generate SQL file | ||
| 318 | + generate_sql_inserts(spus, skus, args.output) | ||
| 319 | + print(f"SQL file generated: {args.output}") | ||
| 320 | + | ||
| 321 | + | ||
| 322 | +if __name__ == '__main__': | ||
| 323 | + main() | ||
| 324 | + | ||
| 325 | + |
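One caveat in `generate_sql_inserts`: values are interpolated into the INSERT statements without any escaping. That holds up for the generated test values (none contain single quotes), but real product text would break the SQL. A hedged sketch of the quoting helper such data would need (hypothetical, not part of the script):

```python
def sql_quote(value) -> str:
    """Render a Python value as a SQL literal: NULL for None, numbers as-is,
    strings single-quoted with embedded quotes doubled."""
    if value is None:
        return "NULL"
    if isinstance(value, (int, float)):
        return str(value)
    return "'" + str(value).replace("'", "''") + "'"

# sql_quote("Men's Shoes") -> "'Men''s Shoes'"
```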
| @@ -0,0 +1,132 @@ | @@ -0,0 +1,132 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +Import test data into MySQL Shoplazza tables. | ||
| 4 | + | ||
| 5 | +Reads SQL file generated by generate_test_data.py and imports into MySQL. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import sys | ||
| 9 | +import os | ||
| 10 | +import argparse | ||
| 11 | +from pathlib import Path | ||
| 12 | + | ||
| 13 | +# Add parent directory to path | ||
| 14 | +sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 15 | + | ||
| 16 | +from sqlalchemy import text  # SQLAlchemy 2.x rejects bare SQL strings | ||
| 17 | +from utils.db_connector import create_db_connection, test_connection | ||
| 18 | + | ||
| 19 | +def import_sql_file(db_engine, sql_file: str): | ||
| 20 | + """ | ||
| 21 | + Import SQL file into database. | ||
| 22 | + | ||
| 23 | + Args: | ||
| 24 | + db_engine: SQLAlchemy database engine | ||
| 25 | + sql_file: Path to SQL file | ||
| 26 | + """ | ||
| 27 | + with open(sql_file, 'r', encoding='utf-8') as f: | ||
| 28 | + sql_content = f.read() | ||
| 29 | + | ||
| 30 | + # Drop "--" comment lines, then split on semicolons; splitting first left each INSERT glued to its leading comment chunk and silently skipped | ||
| 31 | + statements = [s.strip() for s in '\n'.join(l for l in sql_content.split('\n') if not l.strip().startswith('--')).split(';') if s.strip()] | ||
| 32 | + | ||
| 33 | + print(f"Executing {len(statements)} SQL statements...") | ||
| 34 | + | ||
| 35 | + with db_engine.connect() as conn: | ||
| 36 | + for i, statement in enumerate(statements, 1): | ||
| 37 | + if statement: | ||
| 38 | + try: | ||
| 39 | + conn.execute(text(statement)) | ||
| 40 | + conn.commit() | ||
| 41 | + print(f" [{i}/{len(statements)}] Executed successfully") | ||
| 42 | + except Exception as e: | ||
| 43 | + print(f" [{i}/{len(statements)}] ERROR: {e}") | ||
| 44 | + print(f" Statement: {statement[:100]}...") | ||
| 45 | + raise | ||
| 46 | + | ||
| 47 | + | ||
| 48 | +def verify_import(db_engine, tenant_id: str): | ||
| 49 | + """ | ||
| 50 | + Verify imported data. | ||
| 51 | + | ||
| 52 | + Args: | ||
| 53 | + db_engine: SQLAlchemy database engine | ||
| 54 | + tenant_id: Tenant ID to verify | ||
| 55 | + """ | ||
| 56 | + from sqlalchemy import text | ||
| 57 | + | ||
| 58 | + with db_engine.connect() as conn: | ||
| 59 | + # Count SPUs | ||
| 60 | + result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_spu WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id}) | ||
| 61 | + spu_count = result.scalar() | ||
| 62 | + | ||
| 63 | + # Count SKUs | ||
| 64 | + result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_sku WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id}) | ||
| 65 | + sku_count = result.scalar() | ||
| 66 | + | ||
| 67 | + print(f"\nVerification:") | ||
| 68 | + print(f" SPUs: {spu_count}") | ||
| 69 | + print(f" SKUs: {sku_count}") | ||
| 70 | + | ||
| 71 | + return spu_count, sku_count | ||
| 72 | + | ||
| 73 | + | ||
| 74 | +def main(): | ||
| 75 | + parser = argparse.ArgumentParser(description='Import test data into MySQL') | ||
| 76 | + | ||
| 77 | + # Database connection | ||
| 78 | + parser.add_argument('--db-host', required=True, help='MySQL host') | ||
| 79 | + parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)') | ||
| 80 | + parser.add_argument('--db-database', required=True, help='MySQL database name') | ||
| 81 | + parser.add_argument('--db-username', required=True, help='MySQL username') | ||
| 82 | + parser.add_argument('--db-password', required=True, help='MySQL password') | ||
| 83 | + | ||
| 84 | + # Import options | ||
| 85 | + parser.add_argument('--sql-file', required=True, help='SQL file to import') | ||
| 86 | + parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)') | ||
| 87 | + | ||
| 88 | + args = parser.parse_args() | ||
| 89 | + | ||
| 90 | + print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}") | ||
| 91 | + | ||
| 92 | + # Connect to database | ||
| 93 | + try: | ||
| 94 | + db_engine = create_db_connection( | ||
| 95 | + host=args.db_host, | ||
| 96 | + port=args.db_port, | ||
| 97 | + database=args.db_database, | ||
| 98 | + username=args.db_username, | ||
| 99 | + password=args.db_password | ||
| 100 | + ) | ||
| 101 | + except Exception as e: | ||
| 102 | + print(f"ERROR: Failed to connect to MySQL: {e}") | ||
| 103 | + return 1 | ||
| 104 | + | ||
| 105 | + # Test connection | ||
| 106 | + if not test_connection(db_engine): | ||
| 107 | + print("ERROR: Database connection test failed") | ||
| 108 | + return 1 | ||
| 109 | + | ||
| 110 | + print("Database connection successful") | ||
| 111 | + | ||
| 112 | + # Import SQL file | ||
| 113 | + print(f"\nImporting SQL file: {args.sql_file}") | ||
| 114 | + try: | ||
| 115 | + import_sql_file(db_engine, args.sql_file) | ||
| 116 | + print("Import completed successfully") | ||
| 117 | + except Exception as e: | ||
| 118 | + print(f"ERROR: Failed to import SQL file: {e}") | ||
| 119 | + import traceback | ||
| 120 | + traceback.print_exc() | ||
| 121 | + return 1 | ||
| 122 | + | ||
| 123 | + # Verify import if tenant_id provided | ||
| 124 | + if args.tenant_id: | ||
| 125 | + verify_import(db_engine, args.tenant_id) | ||
| 126 | + | ||
| 127 | + return 0 | ||
| 128 | + | ||
| 129 | + | ||
| 130 | +if __name__ == '__main__': | ||
| 131 | + sys.exit(main()) | ||
| 132 | + |
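Taken together, the two scripts form a small pipeline: generate the SQL file, then import and verify it. A sketch of driving them at function level instead of via the CLIs (assuming the two modules sit in an importable `scripts/` package):

```python
from scripts.generate_test_data import generate_spu_data, generate_sku_data, generate_sql_inserts

# 10 SPUs with 1-5 SKU variants each, all under tenant "1"
spus = generate_spu_data(num_spus=10, tenant_id="1")
skus = generate_sku_data(spus)
generate_sql_inserts(spus, skus, "test_data.sql")
# then: python scripts/import_test_data.py --db-host ... --sql-file test_data.sql --tenant-id 1
```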
| @@ -0,0 +1,148 @@ | @@ -0,0 +1,148 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +Shoplazza data ingestion script. | ||
| 4 | + | ||
| 5 | +Loads SPU and SKU data from MySQL and indexes into Elasticsearch using SPU transformer. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import sys | ||
| 9 | +import os | ||
| 10 | +import argparse | ||
| 11 | +from pathlib import Path | ||
| 12 | + | ||
| 13 | +# Add parent directory to path | ||
| 14 | +sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 15 | + | ||
| 16 | +from utils.db_connector import create_db_connection | ||
| 17 | +from utils.es_client import ESClient | ||
| 18 | +from indexer.spu_transformer import SPUTransformer | ||
| 19 | +from indexer.mapping_generator import MappingGenerator | ||
| 20 | +from indexer.bulk_indexer import BulkIndexer | ||
| 21 | +from config import ConfigLoader | ||
| 22 | + | ||
| 23 | + | ||
| 24 | +def main(): | ||
| 25 | + parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch') | ||
| 26 | + | ||
| 27 | + # Database connection | ||
| 28 | + parser.add_argument('--db-host', required=True, help='MySQL host') | ||
| 29 | + parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)') | ||
| 30 | + parser.add_argument('--db-database', required=True, help='MySQL database name') | ||
| 31 | + parser.add_argument('--db-username', required=True, help='MySQL username') | ||
| 32 | + parser.add_argument('--db-password', required=True, help='MySQL password') | ||
| 33 | + | ||
| 34 | + # Tenant and index | ||
| 35 | + parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)') | ||
| 36 | + parser.add_argument('--config', default='base', help='Configuration ID (default: base)') | ||
| 37 | + parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host') | ||
| 38 | + | ||
| 39 | + # Options | ||
| 40 | + parser.add_argument('--recreate', action='store_true', help='Recreate index if exists') | ||
| 41 | + parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)') | ||
| 42 | + | ||
| 43 | + args = parser.parse_args() | ||
| 44 | + | ||
| 45 | + print(f"Starting Shoplazza data ingestion for tenant: {args.tenant_id}") | ||
| 46 | + | ||
| 47 | + # Load configuration | ||
| 48 | + config_loader = ConfigLoader("config/schema") | ||
| 49 | + try: | ||
| 50 | + config = config_loader.load_customer_config(args.config) | ||
| 51 | + print(f"Loaded configuration: {config.customer_name}") | ||
| 52 | + except Exception as e: | ||
| 53 | + print(f"ERROR: Failed to load configuration: {e}") | ||
| 54 | + return 1 | ||
| 55 | + | ||
| 56 | + # Validate tenant_id field exists | ||
| 57 | + tenant_id_field = None | ||
| 58 | + for field in config.fields: | ||
| 59 | + if field.name == "tenant_id": | ||
| 60 | + tenant_id_field = field | ||
| 61 | + break | ||
| 62 | + | ||
| 63 | + if not tenant_id_field: | ||
| 64 | + print("ERROR: Configuration must include 'tenant_id' field") | ||
| 65 | + return 1 | ||
| 66 | + | ||
| 67 | + # Connect to MySQL | ||
| 68 | + print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}") | ||
| 69 | + try: | ||
| 70 | + db_engine = create_db_connection( | ||
| 71 | + host=args.db_host, | ||
| 72 | + port=args.db_port, | ||
| 73 | + database=args.db_database, | ||
| 74 | + username=args.db_username, | ||
| 75 | + password=args.db_password | ||
| 76 | + ) | ||
| 77 | + except Exception as e: | ||
| 78 | + print(f"ERROR: Failed to connect to MySQL: {e}") | ||
| 79 | + return 1 | ||
| 80 | + | ||
| 81 | + # Connect to Elasticsearch | ||
| 82 | + print(f"Connecting to Elasticsearch: {args.es_host}") | ||
| 83 | + es_client = ESClient(hosts=[args.es_host]) | ||
| 84 | + if not es_client.ping(): | ||
| 85 | + print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}") | ||
| 86 | + return 1 | ||
| 87 | + | ||
| 88 | + # Generate and create index | ||
| 89 | + mapping_gen = MappingGenerator(config) | ||
| 90 | + mapping = mapping_gen.generate_mapping() | ||
| 91 | + index_name = config.es_index_name | ||
| 92 | + | ||
| 93 | + if args.recreate: | ||
| 94 | + if es_client.index_exists(index_name): | ||
| 95 | + print(f"Deleting existing index: {index_name}") | ||
| 96 | + es_client.delete_index(index_name) | ||
| 97 | + | ||
| 98 | + if not es_client.index_exists(index_name): | ||
| 99 | + print(f"Creating index: {index_name}") | ||
| 100 | + es_client.create_index(index_name, mapping) | ||
| 101 | + else: | ||
| 102 | + print(f"Using existing index: {index_name}") | ||
| 103 | + | ||
| 104 | + # Initialize SPU transformer | ||
| 105 | + print(f"Initializing SPU transformer for tenant: {args.tenant_id}") | ||
| 106 | + transformer = SPUTransformer(db_engine, args.tenant_id) | ||
| 107 | + | ||
| 108 | + # Transform data | ||
| 109 | + print("Transforming SPU and SKU data...") | ||
| 110 | + try: | ||
| 111 | + documents = transformer.transform_batch() | ||
| 112 | + print(f"Transformed {len(documents)} SPU documents") | ||
| 113 | + except Exception as e: | ||
| 114 | + print(f"ERROR: Failed to transform data: {e}") | ||
| 115 | + import traceback | ||
| 116 | + traceback.print_exc() | ||
| 117 | + return 1 | ||
| 118 | + | ||
| 119 | + if not documents: | ||
| 120 | + print("WARNING: No documents to index") | ||
| 121 | + return 0 | ||
| 122 | + | ||
| 123 | + # Bulk index | ||
| 124 | + print(f"Indexing {len(documents)} documents (batch size: {args.batch_size})...") | ||
| 125 | + indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size) | ||
| 126 | + | ||
| 127 | + try: | ||
| 128 | + results = indexer.index_documents(documents, id_field="product_id", show_progress=True) | ||
| 129 | + print(f"\nIngestion complete:") | ||
| 130 | + print(f" Success: {results['success']}") | ||
| 131 | + print(f" Failed: {results['failed']}") | ||
| 132 | + print(f" Time: {results.get('elapsed_time', 0):.2f}s") | ||
| 133 | + | ||
| 134 | + if results['failed'] > 0: | ||
| 135 | + print(f"\nWARNING: {results['failed']} documents failed to index") | ||
| 136 | + return 1 | ||
| 137 | + | ||
| 138 | + return 0 | ||
| 139 | + except Exception as e: | ||
| 140 | + print(f"ERROR: Failed to index documents: {e}") | ||
| 141 | + import traceback | ||
| 142 | + traceback.print_exc() | ||
| 143 | + return 1 | ||
| 144 | + | ||
| 145 | + | ||
| 146 | +if __name__ == '__main__': | ||
| 147 | + sys.exit(main()) | ||
| 148 | + |
| @@ -0,0 +1,242 @@ | @@ -0,0 +1,242 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +Test script for base configuration. | ||
| 4 | + | ||
| 5 | +Tests data ingestion, search API, response format, and tenant isolation. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import sys | ||
| 9 | +import os | ||
| 10 | +import argparse | ||
| 11 | +import requests | ||
| 12 | +import json | ||
| 13 | +from pathlib import Path | ||
| 14 | + | ||
| 15 | +# Add parent directory to path | ||
| 16 | +sys.path.insert(0, str(Path(__file__).parent.parent)) | ||
| 17 | + | ||
| 18 | + | ||
| 19 | +def test_search_api(base_url: str, tenant_id: str, query: str = "耳机"): | ||
| 20 | + """ | ||
| 21 | + Test search API. | ||
| 22 | + | ||
| 23 | + Args: | ||
| 24 | + base_url: API base URL | ||
| 25 | + tenant_id: Tenant ID | ||
| 26 | + query: Search query | ||
| 27 | + | ||
| 28 | + Returns: | ||
| 29 | + Response JSON or None if failed | ||
| 30 | + """ | ||
| 31 | + url = f"{base_url}/search/" | ||
| 32 | + headers = { | ||
| 33 | + "X-Tenant-ID": tenant_id, | ||
| 34 | + "Content-Type": "application/json" | ||
| 35 | + } | ||
| 36 | + payload = { | ||
| 37 | + "query": query, | ||
| 38 | + "size": 10, | ||
| 39 | + "from": 0 | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + print(f"\nTesting search API:") | ||
| 43 | + print(f" URL: {url}") | ||
| 44 | + print(f" Query: {query}") | ||
| 45 | + print(f" Tenant ID: {tenant_id}") | ||
| 46 | + | ||
| 47 | + try: | ||
| 48 | + response = requests.post(url, json=payload, headers=headers, timeout=30) | ||
| 49 | + response.raise_for_status() | ||
| 50 | + data = response.json() | ||
| 51 | + | ||
| 52 | + print(f" Status: {response.status_code}") | ||
| 53 | + print(f" Total: {data.get('total', 0)}") | ||
| 54 | + print(f" Results: {len(data.get('results', []))}") | ||
| 55 | + | ||
| 56 | + return data | ||
| 57 | + except Exception as e: | ||
| 58 | + print(f" ERROR: {e}") | ||
| 59 | + return None | ||
| 60 | + | ||
| 61 | + | ||
| 62 | +def validate_response_format(data: dict): | ||
| 63 | + """ | ||
| 64 | + Validate response format. | ||
| 65 | + | ||
| 66 | + Args: | ||
| 67 | + data: Response data | ||
| 68 | + | ||
| 69 | + Returns: | ||
| 70 | + List of validation errors (empty if valid) | ||
| 71 | + """ | ||
| 72 | + errors = [] | ||
| 73 | + | ||
| 74 | + # Check for results field (not hits) | ||
| 75 | + if 'hits' in data: | ||
| 76 | + errors.append("Response contains 'hits' field (should be 'results')") | ||
| 77 | + | ||
| 78 | + if 'results' not in data: | ||
| 79 | + errors.append("Response missing 'results' field") | ||
| 80 | + else: | ||
| 81 | + results = data['results'] | ||
| 82 | + if not isinstance(results, list): | ||
| 83 | + errors.append("'results' should be a list") | ||
| 84 | + else: | ||
| 85 | + # Validate first result structure | ||
| 86 | + if results: | ||
| 87 | + result = results[0] | ||
| 88 | + required_fields = ['product_id', 'title', 'variants', 'relevance_score'] | ||
| 89 | + for field in required_fields: | ||
| 90 | + if field not in result: | ||
| 91 | + errors.append(f"Result missing required field: {field}") | ||
| 92 | + | ||
| 93 | + # Check for ES internal fields | ||
| 94 | + es_internal_fields = ['_id', '_score', '_source'] | ||
| 95 | + for field in es_internal_fields: | ||
| 96 | + if field in result: | ||
| 97 | + errors.append(f"Result contains ES internal field: {field}") | ||
| 98 | + | ||
| 99 | + # Validate variants | ||
| 100 | + if 'variants' in result: | ||
| 101 | + variants = result['variants'] | ||
| 102 | + if not isinstance(variants, list): | ||
| 103 | + errors.append("'variants' should be a list") | ||
| 104 | + elif variants: | ||
| 105 | + variant = variants[0] | ||
| 106 | + variant_required = ['variant_id', 'price', 'sku', 'stock'] | ||
| 107 | + for field in variant_required: | ||
| 108 | + if field not in variant: | ||
| 109 | + errors.append(f"Variant missing required field: {field}") | ||
| 110 | + | ||
| 111 | + # Check for suggestions and related_searches | ||
| 112 | + if 'suggestions' not in data: | ||
| 113 | + errors.append("Response missing 'suggestions' field") | ||
| 114 | + if 'related_searches' not in data: | ||
| 115 | + errors.append("Response missing 'related_searches' field") | ||
| 116 | + | ||
| 117 | + return errors | ||
| 118 | + | ||
| 119 | + | ||
| 120 | +def test_facets(base_url: str, tenant_id: str): | ||
| 121 | + """ | ||
| 122 | + Test facets aggregation. | ||
| 123 | + | ||
| 124 | + Args: | ||
| 125 | + base_url: API base URL | ||
| 126 | + tenant_id: Tenant ID | ||
| 127 | + | ||
| 128 | + Returns: | ||
| 129 | + Response JSON or None if failed | ||
| 130 | + """ | ||
| 131 | + url = f"{base_url}/search/" | ||
| 132 | + headers = { | ||
| 133 | + "X-Tenant-ID": tenant_id, | ||
| 134 | + "Content-Type": "application/json" | ||
| 135 | + } | ||
| 136 | + payload = { | ||
| 137 | + "query": "商品", | ||
| 138 | + "size": 10, | ||
| 139 | + "facets": ["category_keyword", "vendor_keyword"] | ||
| 140 | + } | ||
| 141 | + | ||
| 142 | + print(f"\nTesting facets:") | ||
| 143 | + print(f" Facets: {payload['facets']}") | ||
| 144 | + | ||
| 145 | + try: | ||
| 146 | + response = requests.post(url, json=payload, headers=headers, timeout=30) | ||
| 147 | + response.raise_for_status() | ||
| 148 | + data = response.json() | ||
| 149 | + | ||
| 150 | + if 'facets' in data and data['facets']: | ||
| 151 | + print(f" Facets returned: {len(data['facets'])}") | ||
| 152 | + for facet in data['facets']: | ||
| 153 | + print(f" - {facet.get('field')}: {len(facet.get('values', []))} values") | ||
| 154 | + else: | ||
| 155 | + print(" WARNING: No facets returned") | ||
| 156 | + | ||
| 157 | + return data | ||
| 158 | + except Exception as e: | ||
| 159 | + print(f" ERROR: {e}") | ||
| 160 | + return None | ||
| 161 | + | ||
| 162 | + | ||
| 163 | +def test_tenant_isolation(base_url: str, tenant_id_1: str, tenant_id_2: str): | ||
| 164 | + """ | ||
| 165 | + Test tenant isolation. | ||
| 166 | + | ||
| 167 | + Args: | ||
| 168 | + base_url: API base URL | ||
| 169 | + tenant_id_1: First tenant ID | ||
| 170 | + tenant_id_2: Second tenant ID | ||
| 171 | + """ | ||
| 172 | + print(f"\nTesting tenant isolation:") | ||
| 173 | + print(f" Tenant 1: {tenant_id_1}") | ||
| 174 | + print(f" Tenant 2: {tenant_id_2}") | ||
| 175 | + | ||
| 176 | + # Search for tenant 1 | ||
| 177 | + data1 = test_search_api(base_url, tenant_id_1, "商品") | ||
| 178 | + # Search for tenant 2 | ||
| 179 | + data2 = test_search_api(base_url, tenant_id_2, "商品") | ||
| 180 | + | ||
| 181 | + if data1 and data2: | ||
| 182 | + results1 = set(r.get('product_id') for r in data1.get('results', [])) | ||
| 183 | + results2 = set(r.get('product_id') for r in data2.get('results', [])) | ||
| 184 | + | ||
| 185 | + overlap = results1 & results2 | ||
| 186 | + if overlap: | ||
| 187 | + print(f" WARNING: Found {len(overlap)} overlapping results between tenants") | ||
| 188 | + else: | ||
| 189 | + print(f" OK: No overlapping results (tenant isolation working)") | ||
| 190 | + | ||
| 191 | + | ||
| 192 | +def main(): | ||
| 193 | + parser = argparse.ArgumentParser(description='Test base configuration') | ||
| 194 | + parser.add_argument('--api-url', default='http://localhost:8000', help='API base URL') | ||
| 195 | + parser.add_argument('--tenant-id', default='1', help='Tenant ID for testing') | ||
| 196 | + parser.add_argument('--test-tenant-2', help='Second tenant ID for isolation test') | ||
| 197 | + | ||
| 198 | + args = parser.parse_args() | ||
| 199 | + | ||
| 200 | + print("=" * 60) | ||
| 201 | + print("Base Configuration Test Suite") | ||
| 202 | + print("=" * 60) | ||
| 203 | + | ||
| 204 | + # Test 1: Basic search | ||
| 205 | + print("\n[Test 1] Basic Search") | ||
| 206 | + data = test_search_api(args.api_url, args.tenant_id) | ||
| 207 | + if not data: | ||
| 208 | + print("FAILED: Basic search test") | ||
| 209 | + return 1 | ||
| 210 | + | ||
| 211 | + # Test 2: Response format validation | ||
| 212 | + print("\n[Test 2] Response Format Validation") | ||
| 213 | + errors = validate_response_format(data) | ||
| 214 | + if errors: | ||
| 215 | + print("FAILED: Response format validation") | ||
| 216 | + for error in errors: | ||
| 217 | + print(f" - {error}") | ||
| 218 | + return 1 | ||
| 219 | + else: | ||
| 220 | + print("PASSED: Response format is correct") | ||
| 221 | + | ||
| 222 | + # Test 3: Facets | ||
| 223 | + print("\n[Test 3] Facets Aggregation") | ||
| 224 | + facet_data = test_facets(args.api_url, args.tenant_id) | ||
| 225 | + if not facet_data: | ||
| 226 | + print("WARNING: Facets test failed (may be expected if no data)") | ||
| 227 | + | ||
| 228 | + # Test 4: Tenant isolation (if second tenant provided) | ||
| 229 | + if args.test_tenant_2: | ||
| 230 | + print("\n[Test 4] Tenant Isolation") | ||
| 231 | + test_tenant_isolation(args.api_url, args.tenant_id, args.test_tenant_2) | ||
| 232 | + | ||
| 233 | + print("\n" + "=" * 60) | ||
| 234 | + print("All tests completed") | ||
| 235 | + print("=" * 60) | ||
| 236 | + | ||
| 237 | + return 0 | ||
| 238 | + | ||
| 239 | + | ||
| 240 | +if __name__ == '__main__': | ||
| 241 | + sys.exit(main()) | ||
| 242 | + |
search/searcher.py
| @@ -17,38 +17,45 @@ from .multilang_query_builder import MultiLanguageQueryBuilder | @@ -17,38 +17,45 @@ from .multilang_query_builder import MultiLanguageQueryBuilder | ||
| 17 | from .rerank_engine import RerankEngine | 17 | from .rerank_engine import RerankEngine |
| 18 | from context.request_context import RequestContext, RequestContextStage, create_request_context | 18 | from context.request_context import RequestContext, RequestContextStage, create_request_context |
| 19 | from api.models import FacetResult, FacetValue | 19 | from api.models import FacetResult, FacetValue |
| 20 | +from api.result_formatter import ResultFormatter | ||
| 20 | 21 | ||
| 21 | 22 | ||
| 22 | class SearchResult: | 23 | class SearchResult: |
| 23 | - """Container for search results (重构版).""" | 24 | + """Container for search results (外部友好格式).""" |
| 24 | 25 | ||
| 25 | def __init__( | 26 | def __init__( |
| 26 | self, | 27 | self, |
| 27 | - hits: List[Dict[str, Any]], | 28 | + results: List[Any], # List[ProductResult] |
| 28 | total: int, | 29 | total: int, |
| 29 | max_score: float, | 30 | max_score: float, |
| 30 | took_ms: int, | 31 | took_ms: int, |
| 31 | facets: Optional[List[FacetResult]] = None, | 32 | facets: Optional[List[FacetResult]] = None, |
| 32 | query_info: Optional[Dict[str, Any]] = None, | 33 | query_info: Optional[Dict[str, Any]] = None, |
| 34 | + suggestions: Optional[List[str]] = None, | ||
| 35 | + related_searches: Optional[List[str]] = None, | ||
| 33 | debug_info: Optional[Dict[str, Any]] = None | 36 | debug_info: Optional[Dict[str, Any]] = None |
| 34 | ): | 37 | ): |
| 35 | - self.hits = hits | 38 | + self.results = results |
| 36 | self.total = total | 39 | self.total = total |
| 37 | self.max_score = max_score | 40 | self.max_score = max_score |
| 38 | self.took_ms = took_ms | 41 | self.took_ms = took_ms |
| 39 | self.facets = facets | 42 | self.facets = facets |
| 40 | self.query_info = query_info or {} | 43 | self.query_info = query_info or {} |
| 44 | + self.suggestions = suggestions or [] | ||
| 45 | + self.related_searches = related_searches or [] | ||
| 41 | self.debug_info = debug_info | 46 | self.debug_info = debug_info |
| 42 | 47 | ||
| 43 | def to_dict(self) -> Dict[str, Any]: | 48 | def to_dict(self) -> Dict[str, Any]: |
| 44 | """Convert to dictionary representation.""" | 49 | """Convert to dictionary representation.""" |
| 45 | result = { | 50 | result = { |
| 46 | - "hits": self.hits, | 51 | + "results": [r.model_dump() if hasattr(r, 'model_dump') else r for r in self.results], |
| 47 | "total": self.total, | 52 | "total": self.total, |
| 48 | "max_score": self.max_score, | 53 | "max_score": self.max_score, |
| 49 | "took_ms": self.took_ms, | 54 | "took_ms": self.took_ms, |
| 50 | "facets": [f.model_dump() for f in self.facets] if self.facets else None, | 55 | "facets": [f.model_dump() for f in self.facets] if self.facets else None, |
| 51 | - "query_info": self.query_info | 56 | + "query_info": self.query_info, |
| 57 | + "suggestions": self.suggestions, | ||
| 58 | + "related_searches": self.related_searches | ||
| 52 | } | 59 | } |
| 53 | if self.debug_info is not None: | 60 | if self.debug_info is not None: |
| 54 | result["debug_info"] = self.debug_info | 61 | result["debug_info"] = self.debug_info |
| @@ -106,6 +113,7 @@ class Searcher: | @@ -106,6 +113,7 @@ class Searcher: | ||
| 106 | def search( | 113 | def search( |
| 107 | self, | 114 | self, |
| 108 | query: str, | 115 | query: str, |
| 116 | + tenant_id: str, | ||
| 109 | size: int = 10, | 117 | size: int = 10, |
| 110 | from_: int = 0, | 118 | from_: int = 0, |
| 111 | filters: Optional[Dict[str, Any]] = None, | 119 | filters: Optional[Dict[str, Any]] = None, |
| @@ -118,10 +126,11 @@ class Searcher: | @@ -118,10 +126,11 @@ class Searcher: | ||
| 118 | debug: bool = False | 126 | debug: bool = False |
| 119 | ) -> SearchResult: | 127 | ) -> SearchResult: |
| 120 | """ | 128 | """ |
| 121 | - Execute search query (refactored version). | 129 | + Execute search query (external-friendly format). |
| 122 | 130 | ||
| 123 | Args: | 131 | Args: |
| 124 | query: Search query string | 132 | query: Search query string |
| 133 | + tenant_id: Tenant ID (required for filtering) | ||
| 125 | size: Number of results to return | 134 | size: Number of results to return |
| 126 | from_: Offset for pagination | 135 | from_: Offset for pagination |
| 127 | filters: Exact match filters | 136 | filters: Exact match filters |
| @@ -134,7 +143,7 @@ class Searcher: | @@ -134,7 +143,7 @@ class Searcher: | ||
| 134 | debug: Enable debug information output | 143 | debug: Enable debug information output |
| 135 | 144 | ||
| 136 | Returns: | 145 | Returns: |
| 137 | - SearchResult object | 146 | + SearchResult object with formatted results |
| 138 | """ | 147 | """ |
| 139 | # Create context if not provided (backward compatibility) | 148 | # Create context if not provided (backward compatibility) |
| 140 | if context is None: | 149 | if context is None: |
| @@ -248,6 +257,11 @@ class Searcher: | @@ -248,6 +257,11 @@ class Searcher: | ||
| 248 | # Step 3: Query building | 257 | # Step 3: Query building |
| 249 | context.start_stage(RequestContextStage.QUERY_BUILDING) | 258 | context.start_stage(RequestContextStage.QUERY_BUILDING) |
| 250 | try: | 259 | try: |
| 260 | + # Add tenant_id to filters (required) | ||
| 261 | + if filters is None: | ||
| 262 | + filters = {} | ||
| 263 | + filters['tenant_id'] = tenant_id | ||
| 264 | + | ||
| 251 | es_query = self.query_builder.build_multilang_query( | 265 | es_query = self.query_builder.build_multilang_query( |
| 252 | parsed_query=parsed_query, | 266 | parsed_query=parsed_query, |
| 253 | query_vector=parsed_query.query_vector if enable_embedding else None, | 267 | query_vector=parsed_query.query_vector if enable_embedding else None, |
| @@ -341,56 +355,10 @@ class Searcher: | @@ -341,56 +355,10 @@ class Searcher: | ||
| 341 | # Step 5: Result processing | 355 | # Step 5: Result processing |
| 342 | context.start_stage(RequestContextStage.RESULT_PROCESSING) | 356 | context.start_stage(RequestContextStage.RESULT_PROCESSING) |
| 343 | try: | 357 | try: |
| 344 | - hits = [] | ||
| 345 | - raw_hits = [] | ||
| 346 | - | 358 | + # Extract ES hits |
| 359 | + es_hits = [] | ||
| 347 | if 'hits' in es_response and 'hits' in es_response['hits']: | 360 | if 'hits' in es_response and 'hits' in es_response['hits']: |
| 348 | - for hit in es_response['hits']['hits']: | ||
| 349 | - raw_hits.append(hit) | ||
| 350 | - | ||
| 351 | - result_doc = { | ||
| 352 | - '_id': hit['_id'], | ||
| 353 | - '_score': hit.get('_score') or 0.0, | ||
| 354 | - '_source': hit['_source'] | ||
| 355 | - } | ||
| 356 | - | ||
| 357 | - # Apply local rerank (only when enabled) | ||
| 358 | - if enable_rerank and self.rerank_engine.enabled: | ||
| 359 | - base_score = hit.get('_score') or 0.0 | ||
| 360 | - knn_score = None | ||
| 361 | - | ||
| 362 | - # Check whether KNN was used (new structure: inside function_score) | ||
| 363 | - query_section = es_query.get('query', {}) | ||
| 364 | - if 'function_score' in query_section: | ||
| 365 | - fs_query = query_section['function_score'].get('query', {}) | ||
| 366 | - outer_bool = fs_query.get('bool', {}) | ||
| 367 | - inner_bool_list = outer_bool.get('must', []) | ||
| 368 | - if inner_bool_list and 'bool' in inner_bool_list[0]: | ||
| 369 | - inner_should = inner_bool_list[0]['bool'].get('should', []) | ||
| 370 | - if any('knn' in clause for clause in inner_should): | ||
| 371 | - knn_score = base_score * 0.2 | ||
| 372 | - | ||
| 373 | - custom_score = self.rerank_engine.calculate_score( | ||
| 374 | - hit, | ||
| 375 | - base_score, | ||
| 376 | - knn_score | ||
| 377 | - ) | ||
| 378 | - result_doc['_custom_score'] = custom_score | ||
| 379 | - result_doc['_original_score'] = base_score | ||
| 380 | - | ||
| 381 | - hits.append(result_doc) | ||
| 382 | - | ||
| 383 | - # Re-sort by custom score (only when enabled) | ||
| 384 | - if enable_rerank and self.rerank_engine.enabled: | ||
| 385 | - hits.sort(key=lambda x: x.get('_custom_score', x['_score']), reverse=True) | ||
| 386 | - context.logger.info( | ||
| 387 | - f"本地重排完成 | 使用RerankEngine", | ||
| 388 | - extra={'reqid': context.reqid, 'uid': context.uid} | ||
| 389 | - ) | ||
| 390 | - | ||
| 391 | - # Store intermediate results in context | ||
| 392 | - context.store_intermediate_result('raw_hits', raw_hits) | ||
| 393 | - context.store_intermediate_result('processed_hits', hits) | 361 | + es_hits = es_response['hits']['hits'] |
| 394 | 362 | ||
| 395 | # Extract total and max_score | 363 | # Extract total and max_score |
| 396 | total = es_response.get('hits', {}).get('total', {}) | 364 | total = es_response.get('hits', {}).get('total', {}) |
| @@ -401,16 +369,24 @@ class Searcher: | @@ -401,16 +369,24 @@ class Searcher: | ||
| 401 | 369 | ||
| 402 | max_score = es_response.get('hits', {}).get('max_score') or 0.0 | 370 | max_score = es_response.get('hits', {}).get('max_score') or 0.0 |
| 403 | 371 | ||
| 404 | - # Standardize facets | ||
| 405 | - standardized_facets = self._standardize_facets( | ||
| 406 | - es_response.get('aggregations', {}), | ||
| 407 | - facets, | ||
| 408 | - filters | ||
| 409 | - ) | 372 | + # Format results using ResultFormatter |
| 373 | + formatted_results = ResultFormatter.format_search_results(es_hits, max_score) | ||
| 374 | + | ||
| 375 | + # Format facets | ||
| 376 | + standardized_facets = None | ||
| 377 | + if facets: | ||
| 378 | + standardized_facets = ResultFormatter.format_facets( | ||
| 379 | + es_response.get('aggregations', {}), | ||
| 380 | + facets | ||
| 381 | + ) | ||
| 382 | + | ||
| 383 | + # Generate suggestions and related searches | ||
| 384 | + query_text = parsed_query.original_query if parsed_query else query | ||
| 385 | + suggestions = ResultFormatter.generate_suggestions(query_text, formatted_results) | ||
| 386 | + related_searches = ResultFormatter.generate_related_searches(query_text, formatted_results) | ||
| 410 | 387 | ||
| 411 | context.logger.info( | 388 | context.logger.info( |
| 412 | - f"结果处理完成 | 返回: {len(hits)}条 | 总计: {total_value}条 | " | ||
| 413 | - f"重排序: {'是' if enable_rerank else '否'}", | 389 | + f"结果处理完成 | 返回: {len(formatted_results)}条 | 总计: {total_value}条", |
| 414 | extra={'reqid': context.reqid, 'uid': context.uid} | 390 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 415 | ) | 391 | ) |
| 416 | 392 | ||
| @@ -459,12 +435,14 @@ class Searcher: | @@ -459,12 +435,14 @@ class Searcher: | ||
| 459 | 435 | ||
| 460 | # Build result | 436 | # Build result |
| 461 | result = SearchResult( | 437 | result = SearchResult( |
| 462 | - hits=hits, | 438 | + results=formatted_results, |
| 463 | total=total_value, | 439 | total=total_value, |
| 464 | max_score=max_score, | 440 | max_score=max_score, |
| 465 | took_ms=int(total_duration), | 441 | took_ms=int(total_duration), |
| 466 | facets=standardized_facets, | 442 | facets=standardized_facets, |
| 467 | query_info=parsed_query.to_dict(), | 443 | query_info=parsed_query.to_dict(), |
| 444 | + suggestions=suggestions, | ||
| 445 | + related_searches=related_searches, | ||
| 468 | debug_info=debug_info | 446 | debug_info=debug_info |
| 469 | ) | 447 | ) |
| 470 | 448 | ||
| @@ -476,21 +454,23 @@ class Searcher: | @@ -476,21 +454,23 @@ class Searcher: | ||
| 476 | def search_by_image( | 454 | def search_by_image( |
| 477 | self, | 455 | self, |
| 478 | image_url: str, | 456 | image_url: str, |
| 457 | + tenant_id: str, | ||
| 479 | size: int = 10, | 458 | size: int = 10, |
| 480 | filters: Optional[Dict[str, Any]] = None, | 459 | filters: Optional[Dict[str, Any]] = None, |
| 481 | range_filters: Optional[Dict[str, Any]] = None | 460 | range_filters: Optional[Dict[str, Any]] = None |
| 482 | ) -> SearchResult: | 461 | ) -> SearchResult: |
| 483 | """ | 462 | """ |
| 484 | - Search by image similarity (refactored version). | 463 | + Search by image similarity (external-friendly format). |
| 485 | 464 | ||
| 486 | Args: | 465 | Args: |
| 487 | image_url: URL of query image | 466 | image_url: URL of query image |
| 467 | + tenant_id: Tenant ID (required for filtering) | ||
| 488 | size: Number of results | 468 | size: Number of results |
| 489 | filters: Exact match filters | 469 | filters: Exact match filters |
| 490 | range_filters: Range filters for numeric fields | 470 | range_filters: Range filters for numeric fields |
| 491 | 471 | ||
| 492 | Returns: | 472 | Returns: |
| 493 | - SearchResult object | 473 | + SearchResult object with formatted results |
| 494 | """ | 474 | """ |
| 495 | if not self.image_embedding_field: | 475 | if not self.image_embedding_field: |
| 496 | raise ValueError("Image embedding field not configured") | 476 | raise ValueError("Image embedding field not configured") |
| @@ -503,6 +483,11 @@ class Searcher: | @@ -503,6 +483,11 @@ class Searcher: | ||
| 503 | if image_vector is None: | 483 | if image_vector is None: |
| 504 | raise ValueError(f"Failed to encode image: {image_url}") | 484 | raise ValueError(f"Failed to encode image: {image_url}") |
| 505 | 485 | ||
| 486 | + # Add tenant_id to filters (required) | ||
| 487 | + if filters is None: | ||
| 488 | + filters = {} | ||
| 489 | + filters['tenant_id'] = tenant_id | ||
| 490 | + | ||
| 506 | # Build KNN query | 491 | # Build KNN query |
| 507 | es_query = { | 492 | es_query = { |
| 508 | "size": size, | 493 | "size": size, |
| @@ -536,28 +521,32 @@ class Searcher: | @@ -536,28 +521,32 @@ class Searcher: | ||
| 536 | size=size | 521 | size=size |
| 537 | ) | 522 | ) |
| 538 | 523 | ||
| 539 | - # Process results (similar to text search) | ||
| 540 | - hits = [] | 524 | + # Extract ES hits |
| 525 | + es_hits = [] | ||
| 541 | if 'hits' in es_response and 'hits' in es_response['hits']: | 526 | if 'hits' in es_response and 'hits' in es_response['hits']: |
| 542 | - for hit in es_response['hits']['hits']: | ||
| 543 | - hits.append({ | ||
| 544 | - '_id': hit['_id'], | ||
| 545 | - '_score': hit['_score'], | ||
| 546 | - '_source': hit['_source'] | ||
| 547 | - }) | 527 | + es_hits = es_response['hits']['hits'] |
| 548 | 528 | ||
| 529 | + # Extract total and max_score | ||
| 549 | total = es_response.get('hits', {}).get('total', {}) | 530 | total = es_response.get('hits', {}).get('total', {}) |
| 550 | if isinstance(total, dict): | 531 | if isinstance(total, dict): |
| 551 | total_value = total.get('value', 0) | 532 | total_value = total.get('value', 0) |
| 552 | else: | 533 | else: |
| 553 | total_value = total | 534 | total_value = total |
| 554 | 535 | ||
| 536 | + max_score = es_response.get('hits', {}).get('max_score') or 0.0 | ||
| 537 | + | ||
| 538 | + # Format results using ResultFormatter | ||
| 539 | + formatted_results = ResultFormatter.format_search_results(es_hits, max_score) | ||
| 540 | + | ||
| 555 | return SearchResult( | 541 | return SearchResult( |
| 556 | - hits=hits, | 542 | + results=formatted_results, |
| 557 | total=total_value, | 543 | total=total_value, |
| 558 | - max_score=es_response.get('hits', {}).get('max_score') or 0.0, | 544 | + max_score=max_score, |
| 559 | took_ms=es_response.get('took', 0), | 545 | took_ms=es_response.get('took', 0), |
| 560 | - query_info={'image_url': image_url, 'search_type': 'image_similarity'} | 546 | + facets=None, |
| 547 | + query_info={'image_url': image_url, 'search_type': 'image_similarity'}, | ||
| 548 | + suggestions=[], | ||
| 549 | + related_searches=[] | ||
| 561 | ) | 550 | ) |
| 562 | 551 | ||
| 563 | def get_domain_summary(self) -> Dict[str, Any]: | 552 | def get_domain_summary(self) -> Dict[str, Any]: |
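`ResultFormatter` is the new boundary between raw ES hits and the public result shape; its implementation is not part of this diff. A hedged sketch of the per-hit conversion it is expected to perform, based on the fields the test suite asserts (`product_id`, `title`, `variants`, `relevance_score`) and the rule that `_id`/`_score`/`_source` must not leak:

```python
def format_hit(hit: dict, max_score: float) -> dict:
    """Map one ES hit to the external result shape (sketch, not the shipped code)."""
    src = hit.get("_source", {})
    score = hit.get("_score") or 0.0
    return {
        "product_id": src.get("product_id"),
        "title": src.get("title"),
        "price": src.get("min_price"),  # flattened price field from the SPU doc
        "variants": src.get("variants", []),
        # assumption: relevance_score (0-1) is the ES score normalized by max_score
        "relevance_score": (score / max_score) if max_score > 0 else 0.0,
    }
```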
设计文档.md
| @@ -12,33 +12,57 @@ | @@ -12,33 +12,57 @@ | ||
| 12 | 12 | ||
| 13 | ## 1. Raw Data Layer Conventions | 13 | ## 1. Raw Data Layer Conventions |
| 14 | 14 | ||
| 15 | -All tenants share the main tables and have their own configuration, extension tables, and dedicated ES index. | ||
| 16 | - | ||
| 17 | ### 1.1 Shoplazza Main Tables | 15 | ### 1.1 Shoplazza Main Tables |
| 18 | 16 | ||
| 19 | All tenants share the following main tables: | 17 | All tenants share the following main tables: |
| 20 | - `shoplazza_product_sku` - SKU-level product data | 18 | - `shoplazza_product_sku` - SKU-level product data |
| 21 | - `shoplazza_product_spu` - SPU-level product data | 19 | - `shoplazza_product_spu` - SPU-level product data |
| 22 | 20 | ||
| 23 | -### 1.2 Per-Tenant Extension Tables | ||
| 24 | - | ||
| 25 | -Each tenant has its own extension table, customized to its business needs and data sources: | ||
| 26 | -- Custom attribute systems | ||
| 27 | -- Multilingual product titles (Chinese, English, Russian, etc.) | ||
| 28 | -- Brand names, and differing category and tag systems | ||
| 29 | -- Business filtering and aggregation fields | ||
| 30 | -- Weighting (boost) fields | ||
| 31 | - | ||
| 32 | -**Data association**: | ||
| 33 | -- At indexing time, the main table `shoplazza_product_sku` joins to the tenant extension table via `id` + `shopid` | ||
| 34 | -- For example, the `customer1_extension` table stores customer1's custom fields | 21 | +### 1.2 Index Structure (SPU Level) |
| 22 | + | ||
| 23 | +**Unified index architecture**: | ||
| 24 | +- All customers share a single Elasticsearch index: `search_products` | ||
| 25 | +- Index granularity: SPU level (each document represents one SPU) | ||
| 26 | +- Data isolation: tenants are separated via the `tenant_id` field (see the query sketch below) | ||
| 27 | +- Nested structure: each SPU document carries a nested `variants` array (its SKU variants) | ||
| 28 | + | ||
| 29 | +**Index document structure**: | ||
| 30 | +```json | ||
| 31 | +{ | ||
| 32 | + "tenant_id": "1", | ||
| 33 | + "product_id": "123", | ||
| 34 | + "title": "蓝牙耳机", | ||
| 35 | + "variants": [ | ||
| 36 | + { | ||
| 37 | + "variant_id": "456", | ||
| 38 | + "title": "黑色", | ||
| 39 | + "price": 199.99, | ||
| 40 | + "sku": "SKU-123-1", | ||
| 41 | + "stock": 50 | ||
| 42 | + } | ||
| 43 | + ], | ||
| 44 | + "min_price": 199.99, | ||
| 45 | + "max_price": 299.99 | ||
| 46 | +} | ||
| 47 | +``` | ||
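Every search against this shared index must pin the tenant, and SKU-level conditions go through a nested query on `variants`. A minimal sketch of the resulting query shape (field values are illustrative):

```python
# Tenant isolation is a hard filter; the nested clause scopes conditions to a single variant.
es_query = {
    "query": {
        "bool": {
            "must": [{"match": {"title": "蓝牙耳机"}}],
            "filter": [
                {"term": {"tenant_id": "1"}},
                {"nested": {
                    "path": "variants",
                    "query": {"range": {"variants.price": {"lte": 300}}},
                }},
            ],
        }
    }
}
```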
| 35 | 48 | ||
| 36 | ### 1.3 Configuration Scheme | 49 | ### 1.3 Configuration Scheme |
| 37 | 50 | ||
| 51 | +**Configuration separation principles**: | ||
| 52 | +- **Search config**: contains only search-related settings (ES field definitions, query domains, ranking rules) | ||
| 53 | +- **Data source config**: not part of the search config; decided by the Pipeline layer (scripts) | ||
| 54 | +- **Data import flow**: hard-coded scripts that do not depend on the config | ||
| 55 | + | ||
| 38 | Uniformly defined via configuration files: | 56 | Uniformly defined via configuration files: |
| 39 | -1. ES field definitions (field type, analyzer, source table/column) | 57 | +1. ES field definitions (field type, analyzer, boost, etc.) |
| 40 | 2. ES mapping generation | 58 | 2. ES mapping generation |
| 41 | -3. Data ingestion mappings | 59 | +3. Query domain configuration (indexes) |
| 60 | +4. Ranking and scoring configuration (function_score) | ||
| 61 | + | ||
| 62 | +**Note**: the config does **not** include the following: | ||
| 63 | +- `mysql_config` - MySQL database settings | ||
| 64 | +- `main_table` / `extension_table` - data table settings | ||
| 65 | +- `source_table` / `source_column` - field-to-source mappings | ||
| 42 | 66 | ||
| 43 | --- | 67 | --- |
| 44 | 68 | ||
| @@ -72,62 +96,54 @@ | @@ -72,62 +96,54 @@ | ||
| 72 | - **standard**: standard analyzer | 96 | - **standard**: standard analyzer |
| 73 | - **keyword**: keyword analyzer | 97 | - **keyword**: keyword analyzer |
| 74 | 98 | ||
| 75 | -#### Field configuration example | 99 | +#### Field configuration example (Base config) |
| 76 | 100 | ||
| 77 | ```yaml | 101 | ```yaml |
| 78 | fields: | 102 | fields: |
| 79 | - # Primary key field | ||
| 80 | - - name: "skuId" | ||
| 81 | - type: "LONG" | ||
| 82 | - source_table: "main" # 主表 | ||
| 83 | - source_column: "id" | 103 | + # 租户隔离字段(必需) |
| 104 | + - name: "tenant_id" | ||
| 105 | + type: "KEYWORD" | ||
| 84 | required: true | 106 | required: true |
| 85 | index: true | 107 | index: true |
| 86 | store: true | 108 | store: true |
| 87 | 109 | ||
| 88 | - # 多语言文本字段 | ||
| 89 | - - name: "name" | ||
| 90 | - type: "TEXT" | ||
| 91 | - source_table: "extension" # 扩展表 | ||
| 92 | - source_column: "name" | ||
| 93 | - analyzer: "chinese_ecommerce" | ||
| 94 | - boost: 2.0 | 110 | + # 商品标识字段 |
| 111 | + - name: "product_id" | ||
| 112 | + type: "KEYWORD" | ||
| 113 | + required: true | ||
| 95 | index: true | 114 | index: true |
| 96 | store: true | 115 | store: true |
| 97 | 116 | ||
| 98 | - - name: "enSpuName" | 117 | + # 文本搜索字段 |
| 118 | + - name: "title" | ||
| 99 | type: "TEXT" | 119 | type: "TEXT" |
| 100 | - source_table: "extension" | ||
| 101 | - source_column: "enSpuName" | ||
| 102 | - analyzer: "english" | ||
| 103 | - boost: 2.0 | 120 | + analyzer: "chinese_ecommerce" |
| 121 | + boost: 3.0 | ||
| 122 | + index: true | ||
| 123 | + store: true | ||
| 104 | 124 | ||
| 105 | - - name: "ruSkuName" | 125 | + - name: "seo_keywords" |
| 106 | type: "TEXT" | 126 | type: "TEXT" |
| 107 | - source_table: "extension" | ||
| 108 | - source_column: "ruSkuName" | ||
| 109 | - analyzer: "russian" | 127 | + analyzer: "chinese_ecommerce" |
| 110 | boost: 2.0 | 128 | boost: 2.0 |
| 111 | - | ||
| 112 | - # 文本向量字段 | ||
| 113 | - - name: "name_embedding" | ||
| 114 | - type: "TEXT_EMBEDDING" | ||
| 115 | - source_table: "extension" | ||
| 116 | - source_column: "name" | ||
| 117 | - embedding_dims: 1024 | ||
| 118 | - embedding_similarity: "dot_product" | ||
| 119 | index: true | 129 | index: true |
| 130 | + store: true | ||
| 120 | 131 | ||
| 121 | - # 图片向量字段 | ||
| 122 | - - name: "image_embedding" | ||
| 123 | - type: "IMAGE_EMBEDDING" | ||
| 124 | - source_table: "extension" | ||
| 125 | - source_column: "imageUrl" | ||
| 126 | - embedding_dims: 1024 | ||
| 127 | - embedding_similarity: "dot_product" | ||
| 128 | - nested: false | 132 | + # nested variants field |
| 133 | + - name: "variants" | ||
| 134 | + type: "JSON" | ||
| 135 | + nested: true | ||
| 136 | + nested_properties: | ||
| 137 | + variant_id: | ||
| 138 | + type: "keyword" | ||
| 139 | + price: | ||
| 140 | + type: "float" | ||
| 141 | + sku: | ||
| 142 | + type: "keyword" | ||
| 129 | ``` | 143 | ``` |
| 130 | 144 | ||
| 145 | +**Note**: the config does **not** include `source_table` or `source_column`; data-source mapping is decided by the pipeline layer. | ||
| 146 | + | ||
| 131 | **Implementation modules**: | 147 | **Implementation modules**: |
| 132 | - `config/config_loader.py` - config loader | 148 | - `config/config_loader.py` - config loader |
| 133 | - `config/field_types.py` - field type definitions | 149 | - `config/field_types.py` - field type definitions |
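As an illustration of how `config/config_loader.py` might turn the `fields:` section above into an ES mapping (a sketch only; the loader's real internals are not shown in this document). Since index-time boost is deprecated in recent ES versions, the sketch collects boosts for query building instead:

```python
import yaml

# Assumed mapping from config field types to ES types (illustrative, not the
# project's actual field_types.py table).
ES_TYPE_MAP = {"KEYWORD": "keyword", "TEXT": "text", "LONG": "long", "FLOAT": "float"}

def build_mappings(fields: list[dict]) -> tuple[dict, dict]:
    """Build ES mapping properties plus a query-time boost table."""
    properties, boosts = {}, {}
    for field in fields:
        if field.get("nested"):
            # Nested JSON fields carry their own ES-typed sub-properties.
            properties[field["name"]] = {
                "type": "nested",
                "properties": field.get("nested_properties", {}),
            }
            continue
        prop = {"type": ES_TYPE_MAP[field["type"]]}
        if "analyzer" in field:
            prop["analyzer"] = field["analyzer"]
        if "boost" in field:
            boosts[field["name"]] = field["boost"]  # applied at query time
        properties[field["name"]] = prop
    return {"properties": properties}, boosts

with open("config/schema/base/config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)
mappings, boosts = build_mappings(config["fields"])
```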
| @@ -204,77 +220,69 @@ indexes: | @@ -204,77 +220,69 @@ indexes: | ||
| 204 | 220 | ||
| 205 | --- | 221 | --- |
| 206 | 222 | ||
| 207 | -## 3. Test Data Ingestion | 223 | +## 3. Data Import Flow |
| 208 | 224 | ||
| 209 | ### 3.1 Data Sources | 225 | ### 3.1 Data Sources |
| 210 | 226 | ||
| 211 | -**Main table**: `shoplazza_product_sku` | ||
| 212 | -- shared by all tenants | ||
| 213 | -- contains basic product info (id, shopid, etc.) | 227 | +**Shoplazza standard tables** (used by the base config): |
| 228 | +- `shoplazza_product_spu` - SPU-level product data | ||
| 229 | +- `shoplazza_product_sku` - SKU-level product data | ||
| 214 | 230 | ||
| 215 | -**Extension table**: `customer1_extension` | 231 | +**Other customer tables** (customer1, etc.): |
| 216 | -- one per tenant | 232 | +- use their own source and extension tables |
| 217 | -- contains custom and multilingual fields | 233 |
| 219 | -### 3.2 Data Loading Method | 234 | +### 3.2 Data Import Method |
| 218 | 233 | ||
| 219 | -### 3.2 数据灌入方式 | 234 | +### 3.2 数据导入方式 |
| 220 | 235 | ||
| 221 | -**实现情况**: | 236 | +**Pipeline层决定数据源**: |
| 237 | +- 数据导入流程是写死的脚本,不依赖配置 | ||
| 238 | +- 配置只关注ES搜索相关的内容 | ||
| 239 | +- 数据源映射逻辑写死在转换器代码中 | ||
| 222 | 240 | ||
| 223 | -#### 命令行工具 | ||
| 224 | -```bash | ||
| 225 | -python main.py ingest \ | ||
| 226 | - --customer customer1 \ | ||
| 227 | - --csv-file data/customer1_data.csv \ | ||
| 228 | - --es-host http://localhost:9200 \ | ||
| 229 | - --recreate \ | ||
| 230 | - --batch-size 100 | ||
| 231 | -``` | 241 | +#### Base-config data import (Shoplazza common) |
| 242 | + | ||
| 243 | +**Script**: `scripts/ingest_shoplazza.py` | ||
| 232 | 244 | ||
| 233 | -#### Data flow | ||
| 234 | -1. **Data loading**: load data from CSV files or a MySQL database | ||
| 235 | -2. **Data transformation**: | ||
| 236 | - - field mapping (map source fields to ES fields per the config) | ||
| 237 | - - type conversion (strings, numbers, dates, etc.) | ||
| 238 | - - embedding generation (text and image embeddings) | ||
| 239 | - - embedding caching (to avoid recomputation) | 245 | +**Data flow**: |
| 246 | +1. **Data loading**: read the `shoplazza_product_spu` and `shoplazza_product_sku` tables from MySQL | ||
| 247 | +2. **Data transformation** (`indexer/spu_transformer.py`; see the sketch after this list): | ||
| 248 | + - join SPU and SKU rows on `spu_id` and `tenant_id` | ||
| 249 | + - aggregate the SKU rows into a nested `variants` array | ||
| 250 | + - compute flattened price fields (`min_price`, `max_price`, `compare_at_price`) | ||
| 251 | + - field mapping (hard-coded, not config-driven) | ||
| 252 | + - inject the `tenant_id` field | ||
| 240 | 3. **Index creation**: | 253 | 3. **Index creation**: |
| 241 | - - generate the ES mapping from the config | ||
| 242 | - - create or update the index | 254 | + - generate the ES mapping from the config |
| 255 | + - create or update the `search_products` index | ||
| 243 | 4. **Bulk indexing**: | 256 | 4. **Bulk indexing**: |
| 244 | - - bulk-write to ES (default batch size: 500) | 257 | + - bulk-write to ES (default batch size: 500) |
| 245 | - error handling and retry mechanism | 258 | - error handling and retry mechanism |
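A minimal sketch of the SPU/SKU aggregation in step 2, assuming rows have already been loaded for a single tenant; column names follow the examples in this document, and the real `indexer/spu_transformer.py` may differ:

```python
from collections import defaultdict

def build_spu_docs(spu_rows: list[dict], sku_rows: list[dict], tenant_id: str) -> list[dict]:
    """Group SKU rows under their SPU and emit one ES document per SPU."""
    skus_by_spu = defaultdict(list)
    for sku in sku_rows:
        skus_by_spu[sku["spu_id"]].append(sku)

    docs = []
    for spu in spu_rows:
        variants = [
            {
                "variant_id": str(sku["id"]),
                "title": sku.get("title"),
                "price": float(sku.get("price") or 0),
                "sku": sku.get("sku"),
                "stock": int(sku.get("stock") or 0),
            }
            for sku in skus_by_spu.get(spu["id"], [])
        ]
        prices = [v["price"] for v in variants]
        docs.append({
            "_index": "search_products",
            "_id": f"{tenant_id}:{spu['id']}",  # avoid cross-tenant id collisions
            "tenant_id": tenant_id,             # injected isolation field
            "product_id": str(spu["id"]),
            "title": spu.get("title"),
            "variants": variants,               # nested SKU variants
            "min_price": min(prices) if prices else None,
            "max_price": max(prices) if prices else None,
        })
    return docs
```

The resulting docs can be handed directly to `elasticsearch.helpers.bulk(es, docs, chunk_size=500)`, which matches the default batch size in step 4.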
| 246 | 259 | ||
| 247 | -#### Config mapping example | ||
| 248 | - | ||
| 249 | -**customer1_config.yaml** config: | ||
| 250 | -```yaml | ||
| 251 | -main_table: "shoplazza_product_sku" | ||
| 252 | -extension_table: "customer1_extension" | ||
| 253 | -es_index_name: "search_customer1" | ||
| 254 | - | ||
| 255 | -fields: | ||
| 256 | - - name: "skuId" | ||
| 257 | - source_table: "main" | ||
| 258 | - source_column: "id" | ||
| 259 | - - name: "name" | ||
| 260 | - source_table: "extension" | ||
| 261 | - source_column: "name" | ||
| 262 | - - name: "enSpuName" | ||
| 263 | - source_table: "extension" | ||
| 264 | - source_column: "enSpuName" | 260 | +**Command-line tool**: |
| 261 | +```bash | ||
| 262 | +python scripts/ingest_shoplazza.py \ | ||
| 263 | + --db-host localhost \ | ||
| 264 | + --db-port 3306 \ | ||
| 265 | + --db-database saas \ | ||
| 266 | + --db-username root \ | ||
| 267 | + --db-password password \ | ||
| 268 | + --tenant-id "1" \ | ||
| 269 | + --config base \ | ||
| 270 | + --es-host http://localhost:9200 \ | ||
| 271 | + --recreate \ | ||
| 272 | + --batch-size 500 | ||
| 265 | ``` | 273 | ``` |
| 266 | 274 | ||
| 267 | -**Data transformation**: | ||
| 268 | -- main-table fields: read directly from the `id` column of `shoplazza_product_sku` | ||
| 269 | -- extension-table fields: read from the corresponding columns of `customer1_extension` | ||
| 270 | -- embedding fields: generate and cache embeddings from the source text/images | 275 | +#### Data import for other customers |
| 276 | + | ||
| 277 | +- use their own data transformers (e.g. `indexer/data_transformer.py`) | ||
| 278 | +- data-source mapping logic is hard-coded in each transformer | ||
| 279 | +- share the `search_products` index, isolated by `tenant_id` | ||
| 271 | 280 | ||
| 272 | **Implementation modules**: | 281 | **Implementation modules**: |
| 273 | -- `indexer/data_transformer.py` - data transformer | 282 | +- `indexer/spu_transformer.py` - SPU data transformer (base config) |
| 283 | +- `indexer/data_transformer.py` - generic data transformer (other customers) | ||
| 274 | - `indexer/bulk_indexer.py` - bulk indexer | 284 | - `indexer/bulk_indexer.py` - bulk indexer |
| 275 | -- `indexer/indexing_pipeline.py` - indexing pipeline | ||
| 276 | -- `embeddings/bge_encoder.py` - text embedding encoder | ||
| 277 | -- `embeddings/clip_image_encoder.py` - image embedding encoder | 285 | +- `scripts/ingest_shoplazza.py` - Shoplazza data import script |
| 278 | 286 | ||
| 279 | --- | 287 | --- |
| 280 | 288 | ||
| @@ -506,6 +514,14 @@ ranking: | @@ -506,6 +514,14 @@ ranking: | ||
| 506 | - ✅ Search APIs (text search, image search) | 514 | - ✅ Search APIs (text search, image search) |
| 507 | - ✅ Document lookup API | 515 | - ✅ Document lookup API |
| 508 | - ✅ Front-end UI (HTML + JavaScript) | 516 | - ✅ Front-end UI (HTML + JavaScript) |
| 517 | +- ✅ Tenant isolation (tenant_id filtering) | ||
| 518 | + | ||
| 519 | +### 6.6 Base Config (Shoplazza Common) | ||
| 520 | +- ✅ SPU-level index structure | ||
| 521 | +- ✅ Nested variants field | ||
| 522 | +- ✅ Unified index (search_products) | ||
| 523 | +- ✅ Tenant isolation (tenant_id) | ||
| 524 | +- ✅ Simplified config (MySQL-related settings removed) | ||
| 509 | 525 | ||
| 510 | --- | 526 | --- |
| 511 | 527 | ||
| @@ -521,9 +537,55 @@ ranking: | @@ -521,9 +537,55 @@ ranking: | ||
| 521 | 537 | ||
| 522 | --- | 538 | --- |
| 523 | 539 | ||
| 524 | -## 8. Configuration File Examples | 540 | +## 8. API Response Format |
| 541 | + | ||
| 542 | +### 8.1 External-Friendly Format | ||
| 543 | + | ||
| 544 | +The API response does not expose ES-internal fields (`_id`, `_score`, `_source`); instead it uses an external-friendly format: | ||
| 545 | + | ||
| 546 | +**Response structure**: | ||
| 547 | +```json | ||
| 548 | +{ | ||
| 549 | + "results": [ | ||
| 550 | + { | ||
| 551 | + "product_id": "123", | ||
| 552 | + "title": "蓝牙耳机", | ||
| 553 | + "variants": [ | ||
| 554 | + { | ||
| 555 | + "variant_id": "456", | ||
| 556 | + "price": 199.99, | ||
| 557 | + "sku": "SKU-123-1", | ||
| 558 | + "stock": 50 | ||
| 559 | + } | ||
| 560 | + ], | ||
| 561 | + "relevance_score": 0.95 | ||
| 562 | + } | ||
| 563 | + ], | ||
| 564 | + "total": 10, | ||
| 565 | + "facets": [...], | ||
| 566 | + "suggestions": [], | ||
| 567 | + "related_searches": [] | ||
| 568 | +} | ||
| 569 | +``` | ||
| 570 | + | ||
| 571 | +**Key changes** (a conversion sketch follows this list): | ||
| 572 | +- structured results (`ProductResult` and `VariantResult`) | ||
| 573 | +- nested variants array | ||
| 574 | +- no ES-internal fields | ||
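A hedged sketch of the hit-to-result conversion; normalizing the raw `_score` by `max_score` is one way to satisfy the documented 0-1 `relevance_score` range, though not necessarily what the API layer actually does:

```python
from api.models import ProductResult, VariantResult

def to_product_result(hit: dict, max_score: float) -> ProductResult:
    """Strip ES internals (_id/_score/_source) and emit the public result shape."""
    src = hit["_source"]
    return ProductResult(
        product_id=src["product_id"],
        title=src.get("title"),
        price=src.get("min_price"),
        # Assumes the stored variant dicts match the VariantResult fields.
        variants=[VariantResult(**v) for v in src.get("variants", [])],
        # Normalize the raw ES score into the documented 0-1 range.
        relevance_score=(hit["_score"] / max_score) if max_score else 0.0,
    )
```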
| 575 | + | ||
| 576 | +### 8.2 Tenant Isolation | ||
| 577 | + | ||
| 578 | +Every API request must provide a `tenant_id`: | ||
| 579 | +- request header: `X-Tenant-ID: 1` | ||
| 580 | +- or query parameter: `?tenant_id=1` | ||
| 581 | + | ||
| 582 | +A `tenant_id` filter is added to every search automatically, guaranteeing data isolation (see the sketch below). | ||
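A minimal sketch of the resolution and filter injection, assuming a FastAPI-style API layer; `get_tenant_id` and `with_tenant_filter` are illustrative names, not the project's:

```python
from fastapi import Header, HTTPException, Query

def get_tenant_id(
    x_tenant_id: str | None = Header(None, alias="X-Tenant-ID"),
    tenant_id: str | None = Query(None),
) -> str:
    """Resolve the tenant from the header or query parameter; reject if absent."""
    resolved = x_tenant_id or tenant_id
    if not resolved:
        raise HTTPException(status_code=400, detail="tenant_id is required")
    return resolved

def with_tenant_filter(query: dict, tenant_id: str) -> dict:
    """Wrap any user query so results can never cross tenant boundaries."""
    return {
        "bool": {
            "must": [query],
            "filter": [{"term": {"tenant_id": tenant_id}}],
        }
    }
```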
| 583 | + | ||
| 584 | +## 9. Configuration File Examples | ||
| 585 | + | ||
| 586 | +**Base config** (Shoplazza common): `config/schema/base/config.yaml` | ||
| 525 | 587 | ||
| 526 | -For a complete example, see: `config/schema/customer1_config.yaml` | 588 | +**Other customer configs**: `config/schema/customer1/config.yaml` |
| 527 | 589 | ||
| 528 | --- | 590 | --- |
| 529 | 591 |