Commit 5dcddc06eabb865f4f59af2f73e384ea34148c15

Authored by tangwang
1 parent 39e63ad1

索引重构

主要是对分类、属性、子 SKU 等重要字段的处理。
参考文档《 @docs/索引字段说明v2-mapping结构.md 》《 @docs/索引字段说明v2.md 》

feat:
1. 更新 field_types.py
添加 hanlp_index/hanlp_standard 分析器映射(映射到 CHINESE_ECOMMERCE/CHINESE_ECOMMERCE_QUERY)
支持 keyword_normalizer 配置(用于 vendor.keyword 的 lowercase normalizer)
更新 get_default_analyzers() 添加 hanlp 分析器和 lowercase normalizer
修复 image_embedding 的 url 字段类型为 text

2. 更新 config.yaml(32-207行)
移除无用字段:handle, seo_title, seo_description, seo_keywords, shoplazza_created_at, shoplazza_updated_at
添加中英文字段:title_zh, title_en, brief_zh, brief_en, description_zh, description_en, vendor_zh, vendor_en
添加 category 多层级字段:category_path_zh, category_path_en, category_name_zh, category_name_en, category_id, category_name, category_level, category1_name, category2_name, category3_name
添加 specifications 嵌套字段
添加 option 名称字段:option1_name, option2_name, option3_name
添加 SKU 扁平化字段:sku_prices, sku_weights, sku_weight_units, total_inventory
更新 skus 嵌套结构以匹配目标 mapping
添加 image_embedding 嵌套字段
更新 indexes 配置以使用新字段名

3. 更新 config_loader.py
添加 keyword_normalizer 字段支持

4. 重构 spu_transformer.py
添加 load_option_data() 方法从 option 表加载数据
更新 transform_batch() 加载 option 数据
重构 _transform_spu_to_doc():
实现中英文字段映射(暂时只填充中文)
实现 category 多层级字段映射和 category_path 解析
实现 specifications 构建(从 option 表获取 name,从 SKU 获取 value)
实现 option 名称字段映射
实现 SKU 扁平化字段计算
更新 skus 嵌套结构
重构 _transform_sku_row() 以匹配新的 SKU 结构
移除 SEO 和 handle 字段的处理
config/config.yaml
@@ -29,86 +29,88 @@ fields: @@ -29,86 +29,88 @@ fields:
29 store: true 29 store: true
30 return_in_source: true 30 return_in_source: true
31 31
32 - - name: "handle"  
33 - type: "KEYWORD"  
34 - index: true  
35 - store: true  
36 - return_in_source: true  
37 -  
38 - # 文本搜索字段  
39 - - name: "title" 32 + # 文本相关性相关字段(中英文双语)
  33 + - name: "title_zh"
40 type: "TEXT" 34 type: "TEXT"
41 - analyzer: "chinese_ecommerce" 35 + analyzer: "hanlp_index"
  36 + search_analyzer: "hanlp_standard"
42 boost: 3.0 37 boost: 3.0
43 index: true 38 index: true
44 store: true 39 store: true
45 return_in_source: true 40 return_in_source: true
46 41
47 - - name: "brief" 42 + - name: "brief_zh"
48 type: "TEXT" 43 type: "TEXT"
49 - analyzer: "chinese_ecommerce" 44 + analyzer: "hanlp_index"
  45 + search_analyzer: "hanlp_standard"
50 boost: 1.5 46 boost: 1.5
51 index: true 47 index: true
52 store: true 48 store: true
53 return_in_source: true 49 return_in_source: true
54 50
55 - - name: "description" 51 + - name: "description_zh"
56 type: "TEXT" 52 type: "TEXT"
57 - analyzer: "chinese_ecommerce" 53 + analyzer: "hanlp_index"
  54 + search_analyzer: "hanlp_standard"
58 boost: 1.0 55 boost: 1.0
59 index: true 56 index: true
60 store: true 57 store: true
61 return_in_source: true 58 return_in_source: true
62 59
63 - # SEO字段(提升相关性)  
64 - - name: "seo_title" 60 + - name: "vendor_zh"
65 type: "TEXT" 61 type: "TEXT"
66 - analyzer: "chinese_ecommerce"  
67 - boost: 2.0  
68 - index: true  
69 - store: true  
70 - return_in_source: false # SEO字段通常不需要在结果中返回  
71 -  
72 - - name: "seo_description"  
73 - type: "TEXT"  
74 - analyzer: "chinese_ecommerce" 62 + analyzer: "hanlp_index"
  63 + search_analyzer: "hanlp_standard"
75 boost: 1.5 64 boost: 1.5
76 index: true 65 index: true
77 store: true 66 store: true
78 - return_in_source: false 67 + return_in_source: true
  68 + keyword_subfield: true
  69 + keyword_normalizer: "lowercase"
79 70
80 - - name: "seo_keywords" 71 + - name: "title_en"
81 type: "TEXT" 72 type: "TEXT"
82 - analyzer: "chinese_ecommerce"  
83 - boost: 2.0 73 + analyzer: "english"
  74 + search_analyzer: "english"
  75 + boost: 3.0
84 index: true 76 index: true
85 store: true 77 store: true
86 - return_in_source: false 78 + return_in_source: true
87 79
88 - # 分类和标签字段(TEXT + KEYWORD双重索引)  
89 - - name: "vendor"  
90 - type: "HKText"  
91 - analyzer: "chinese_ecommerce" 80 + - name: "brief_en"
  81 + type: "TEXT"
  82 + analyzer: "english"
  83 + search_analyzer: "english"
92 boost: 1.5 84 boost: 1.5
93 index: true 85 index: true
94 store: true 86 store: true
95 return_in_source: true 87 return_in_source: true
96 88
97 - - name: "tags"  
98 - type: "HKText"  
99 - analyzer: "chinese_ecommerce" 89 + - name: "description_en"
  90 + type: "TEXT"
  91 + analyzer: "english"
  92 + search_analyzer: "english"
100 boost: 1.0 93 boost: 1.0
101 index: true 94 index: true
102 store: true 95 store: true
103 return_in_source: true 96 return_in_source: true
104 97
105 - - name: "category"  
106 - type: "HKText"  
107 - analyzer: "chinese_ecommerce" 98 + - name: "vendor_en"
  99 + type: "TEXT"
  100 + analyzer: "english"
  101 + search_analyzer: "english"
108 boost: 1.5 102 boost: 1.5
109 index: true 103 index: true
110 store: true 104 store: true
111 return_in_source: true 105 return_in_source: true
  106 + keyword_subfield: true
  107 + keyword_normalizer: "lowercase"
  108 +
  109 + - name: "tags"
  110 + type: "KEYWORD"
  111 + index: true
  112 + store: true
  113 + return_in_source: true
112 114
113 # 价格字段(扁平化) 115 # 价格字段(扁平化)
114 - name: "min_price" 116 - name: "min_price"
@@ -129,6 +131,30 @@ fields: @@ -129,6 +131,30 @@ fields:
129 store: true 131 store: true
130 return_in_source: true 132 return_in_source: true
131 133
  134 + - name: "sku_prices"
  135 + type: "FLOAT"
  136 + index: true
  137 + store: true
  138 + return_in_source: true
  139 +
  140 + - name: "sku_weights"
  141 + type: "LONG"
  142 + index: true
  143 + store: true
  144 + return_in_source: true
  145 +
  146 + - name: "sku_weight_units"
  147 + type: "KEYWORD"
  148 + index: true
  149 + store: true
  150 + return_in_source: true
  151 +
  152 + - name: "total_inventory"
  153 + type: "LONG"
  154 + index: true
  155 + store: true
  156 + return_in_source: true
  157 +
132 # 图片字段(用于显示,不参与搜索) 158 # 图片字段(用于显示,不参与搜索)
133 - name: "image_url" 159 - name: "image_url"
134 type: "KEYWORD" 160 type: "KEYWORD"
@@ -136,7 +162,7 @@ fields: @@ -136,7 +162,7 @@ fields:
136 store: true 162 store: true
137 return_in_source: true 163 return_in_source: true
138 164
139 - # 文本嵌入字段(用于语义搜索) 165 + # 语义向量
140 - name: "title_embedding" 166 - name: "title_embedding"
141 type: "TEXT_EMBEDDING" 167 type: "TEXT_EMBEDDING"
142 embedding_dims: 1024 168 embedding_dims: 1024
@@ -145,30 +171,137 @@ fields: @@ -145,30 +171,137 @@ fields:
145 store: false 171 store: false
146 return_in_source: false # 嵌入向量通常不需要在结果中返回 172 return_in_source: false # 嵌入向量通常不需要在结果中返回
147 173
148 - # 时间字段  
149 - - name: "create_time"  
150 - type: "DATE" 174 + - name: "image_embedding"
  175 + type: "IMAGE_EMBEDDING"
  176 + embedding_dims: 1024
  177 + embedding_similarity: "dot_product"
  178 + nested: true
  179 + index: true
  180 + store: false
  181 + return_in_source: false
  182 +
  183 + # 分类相关字段
  184 + - name: "category_path_zh"
  185 + type: "TEXT"
  186 + analyzer: "hanlp_index"
  187 + search_analyzer: "hanlp_standard"
  188 + boost: 1.5
151 index: true 189 index: true
152 store: true 190 store: true
153 return_in_source: true 191 return_in_source: true
154 192
155 - - name: "update_time"  
156 - type: "DATE" 193 + - name: "category_path_en"
  194 + type: "TEXT"
  195 + analyzer: "english"
  196 + search_analyzer: "english"
  197 + boost: 1.5
  198 + index: true
  199 + store: true
  200 + return_in_source: true
  201 +
  202 + - name: "category_name_zh"
  203 + type: "TEXT"
  204 + analyzer: "hanlp_index"
  205 + search_analyzer: "hanlp_standard"
  206 + boost: 1.5
  207 + index: true
  208 + store: true
  209 + return_in_source: true
  210 +
  211 + - name: "category_name_en"
  212 + type: "TEXT"
  213 + analyzer: "english"
  214 + search_analyzer: "english"
  215 + boost: 1.5
157 index: true 216 index: true
158 store: true 217 store: true
159 return_in_source: true 218 return_in_source: true
160 219
161 - - name: "shoplazza_created_at" 220 + - name: "category_id"
  221 + type: "KEYWORD"
  222 + index: true
  223 + store: true
  224 + return_in_source: true
  225 +
  226 + - name: "category_name"
  227 + type: "KEYWORD"
  228 + index: true
  229 + store: true
  230 + return_in_source: true
  231 +
  232 + - name: "category_level"
  233 + type: "INT"
  234 + index: true
  235 + store: true
  236 + return_in_source: true
  237 +
  238 + - name: "category1_name"
  239 + type: "KEYWORD"
  240 + index: true
  241 + store: true
  242 + return_in_source: true
  243 +
  244 + - name: "category2_name"
  245 + type: "KEYWORD"
  246 + index: true
  247 + store: true
  248 + return_in_source: true
  249 +
  250 + - name: "category3_name"
  251 + type: "KEYWORD"
  252 + index: true
  253 + store: true
  254 + return_in_source: true
  255 +
  256 + # SKU款式、子sku属性
  257 + - name: "specifications"
  258 + type: "JSON"
  259 + nested: true
  260 + return_in_source: true
  261 + nested_properties:
  262 + sku_id:
  263 + type: "keyword"
  264 + index: true
  265 + store: true
  266 + name:
  267 + type: "keyword"
  268 + index: true
  269 + store: true
  270 + value:
  271 + type: "keyword"
  272 + index: true
  273 + store: true
  274 +
  275 + - name: "option1_name"
  276 + type: "KEYWORD"
  277 + index: true
  278 + store: true
  279 + return_in_source: true
  280 +
  281 + - name: "option2_name"
  282 + type: "KEYWORD"
  283 + index: true
  284 + store: true
  285 + return_in_source: true
  286 +
  287 + - name: "option3_name"
  288 + type: "KEYWORD"
  289 + index: true
  290 + store: true
  291 + return_in_source: true
  292 +
  293 + # 时间字段
  294 + - name: "create_time"
162 type: "DATE" 295 type: "DATE"
163 index: true 296 index: true
164 store: true 297 store: true
165 - return_in_source: false # 通常不需要返回 298 + return_in_source: true
166 299
167 - - name: "shoplazza_updated_at" 300 + - name: "update_time"
168 type: "DATE" 301 type: "DATE"
169 index: true 302 index: true
170 store: true 303 store: true
171 - return_in_source: false # 通常不需要返回 304 + return_in_source: true
172 305
173 # 嵌套skus字段 306 # 嵌套skus字段
174 - name: "skus" 307 - name: "skus"
@@ -180,11 +313,6 @@ fields: @@ -180,11 +313,6 @@ fields:
180 type: "keyword" 313 type: "keyword"
181 index: true 314 index: true
182 store: true 315 store: true
183 - title:  
184 - type: "text"  
185 - analyzer: "chinese_ecommerce"  
186 - index: true  
187 - store: true  
188 price: 316 price:
189 type: "float" 317 type: "float"
190 index: true 318 index: true
@@ -193,7 +321,7 @@ fields: @@ -193,7 +321,7 @@ fields:
193 type: "float" 321 type: "float"
194 index: true 322 index: true
195 store: true 323 store: true
196 - sku: 324 + sku_code:
197 type: "keyword" 325 type: "keyword"
198 index: true 326 index: true
199 store: true 327 store: true
@@ -201,46 +329,65 @@ fields: @@ -201,46 +329,65 @@ fields:
201 type: "long" 329 type: "long"
202 index: true 330 index: true
203 store: true 331 store: true
204 - options:  
205 - type: "object"  
206 - enabled: true 332 + weight:
  333 + type: "float"
  334 + index: true
  335 + store: true
  336 + weight_unit:
  337 + type: "keyword"
  338 + index: true
  339 + store: true
  340 + option1_value:
  341 + type: "keyword"
  342 + index: true
  343 + store: true
  344 + option2_value:
  345 + type: "keyword"
  346 + index: true
  347 + store: true
  348 + option3_value:
  349 + type: "keyword"
  350 + index: true
  351 + store: true
  352 + image_src:
  353 + type: "keyword"
  354 + index: false
  355 + store: true
207 356
208 # Index Structure (Query Domains) 357 # Index Structure (Query Domains)
209 indexes: 358 indexes:
210 - name: "default" 359 - name: "default"
211 label: "默认索引" 360 label: "默认索引"
212 fields: 361 fields:
213 - - "title"  
214 - - "brief"  
215 - - "description"  
216 - - "seo_title"  
217 - - "seo_description"  
218 - - "seo_keywords"  
219 - - "vendor" 362 + - "title_zh"
  363 + - "brief_zh"
  364 + - "description_zh"
  365 + - "vendor_zh"
220 - "tags" 366 - "tags"
221 - - "category" 367 + - "category_path_zh"
  368 + - "category_name_zh"
222 analyzer: "chinese_ecommerce" 369 analyzer: "chinese_ecommerce"
223 boost: 1.0 370 boost: 1.0
224 371
225 - name: "title" 372 - name: "title"
226 label: "标题索引" 373 label: "标题索引"
227 fields: 374 fields:
228 - - "title"  
229 - - "seo_title" 375 + - "title_zh"
230 analyzer: "chinese_ecommerce" 376 analyzer: "chinese_ecommerce"
231 boost: 2.0 377 boost: 2.0
232 378
233 - name: "vendor" 379 - name: "vendor"
234 label: "品牌索引" 380 label: "品牌索引"
235 fields: 381 fields:
236 - - "vendor" 382 + - "vendor_zh"
237 analyzer: "chinese_ecommerce" 383 analyzer: "chinese_ecommerce"
238 boost: 1.5 384 boost: 1.5
239 385
240 - name: "category" 386 - name: "category"
241 label: "类目索引" 387 label: "类目索引"
242 fields: 388 fields:
243 - - "category" 389 + - "category_path_zh"
  390 + - "category_name_zh"
244 analyzer: "chinese_ecommerce" 391 analyzer: "chinese_ecommerce"
245 boost: 1.5 392 boost: 1.5
246 393
@@ -248,7 +395,6 @@ indexes: @@ -248,7 +395,6 @@ indexes:
248 label: "标签索引" 395 label: "标签索引"
249 fields: 396 fields:
250 - "tags" 397 - "tags"
251 - - "seo_keywords"  
252 analyzer: "chinese_ecommerce" 398 analyzer: "chinese_ecommerce"
253 boost: 1.0 399 boost: 1.0
254 400
config/config_loader.py
@@ -313,7 +313,8 @@ class ConfigLoader: @@ -313,7 +313,8 @@ class ConfigLoader:
313 nested=field_data.get("nested", False), 313 nested=field_data.get("nested", False),
314 nested_properties=field_data.get("nested_properties"), 314 nested_properties=field_data.get("nested_properties"),
315 keyword_subfield=field_data.get("keyword_subfield", is_hktext), 315 keyword_subfield=field_data.get("keyword_subfield", is_hktext),
316 - keyword_ignore_above=field_data.get("keyword_ignore_above", 256) 316 + keyword_ignore_above=field_data.get("keyword_ignore_above", 256),
  317 + keyword_normalizer=field_data.get("keyword_normalizer")
317 ) 318 )
318 319
319 def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig: 320 def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
config/field_types.py
@@ -75,6 +75,7 @@ class FieldConfig: @@ -75,6 +75,7 @@ class FieldConfig:
75 # Hybrid Keyword Text (HKText) support 75 # Hybrid Keyword Text (HKText) support
76 keyword_subfield: bool = False 76 keyword_subfield: bool = False
77 keyword_ignore_above: int = 256 77 keyword_ignore_above: int = 256
  78 + keyword_normalizer: Optional[str] = None # For keyword subfield normalizer (e.g., "lowercase")
78 79
79 80
80 def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: 81 def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
@@ -100,18 +101,28 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: @@ -100,18 +101,28 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
100 if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE: 101 if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE:
101 mapping["analyzer"] = "index_ansj" 102 mapping["analyzer"] = "index_ansj"
102 mapping["search_analyzer"] = "query_ansj" 103 mapping["search_analyzer"] = "query_ansj"
  104 + elif field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY:
  105 + # analyzer itself is the query-side alias (CHINESE_ECOMMERCE_QUERY, e.g. "hanlp_standard"/"query_ansj"): still use the ansj index/query pair
  106 + mapping["analyzer"] = "index_ansj"
  107 + mapping["search_analyzer"] = "query_ansj"
103 else: 108 else:
104 mapping["analyzer"] = field_config.analyzer.value 109 mapping["analyzer"] = field_config.analyzer.value
105 110
106 if field_config.search_analyzer: 111 if field_config.search_analyzer:
107 - mapping["search_analyzer"] = field_config.search_analyzer.value 112 + if field_config.search_analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY:
  113 + mapping["search_analyzer"] = "query_ansj"
  114 + else:
  115 + mapping["search_analyzer"] = field_config.search_analyzer.value
108 116
109 if field_config.keyword_subfield: 117 if field_config.keyword_subfield:
110 mapping.setdefault("fields", {}) 118 mapping.setdefault("fields", {})
111 - mapping["fields"]["keyword"] = { 119 + keyword_field = {
112 "type": "keyword", 120 "type": "keyword",
113 "ignore_above": field_config.keyword_ignore_above 121 "ignore_above": field_config.keyword_ignore_above
114 } 122 }
  123 + if field_config.keyword_normalizer:
  124 + keyword_field["normalizer"] = field_config.keyword_normalizer
  125 + mapping["fields"]["keyword"] = keyword_field
115 126
116 elif field_config.field_type == FieldType.KEYWORD: 127 elif field_config.field_type == FieldType.KEYWORD:
117 mapping = { 128 mapping = {
@@ -140,7 +151,7 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: @@ -140,7 +151,7 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
140 "similarity": field_config.embedding_similarity 151 "similarity": field_config.embedding_similarity
141 }, 152 },
142 "url": { 153 "url": {
143 - "type": "keyword" 154 + "type": "text"
144 } 155 }
145 } 156 }
146 } 157 }
@@ -239,6 +250,22 @@ def get_default_analyzers() -> Dict[str, Any]: @@ -239,6 +250,22 @@ def get_default_analyzers() -> Dict[str, Any]:
239 "type": "custom", 250 "type": "custom",
240 "tokenizer": "standard", 251 "tokenizer": "standard",
241 "filter": ["lowercase", "asciifolding"] 252 "filter": ["lowercase", "asciifolding"]
  253 + },
  254 + "hanlp_index": {
  255 + "type": "custom",
  256 + "tokenizer": "standard",
  257 + "filter": ["lowercase", "asciifolding"]
  258 + },
  259 + "hanlp_standard": {
  260 + "type": "custom",
  261 + "tokenizer": "standard",
  262 + "filter": ["lowercase", "asciifolding"]
  263 + }
  264 + },
  265 + "normalizer": {
  266 + "lowercase": {
  267 + "type": "custom",
  268 + "filter": ["lowercase"]
242 } 269 }
243 } 270 }
244 } 271 }
@@ -300,6 +327,9 @@ ANALYZER_MAP = { @@ -300,6 +327,9 @@ ANALYZER_MAP = {
300 "chinese": AnalyzerType.CHINESE_ECOMMERCE, 327 "chinese": AnalyzerType.CHINESE_ECOMMERCE,
301 "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE, 328 "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE,
302 "index_ansj": AnalyzerType.CHINESE_ECOMMERCE, 329 "index_ansj": AnalyzerType.CHINESE_ECOMMERCE,
  330 + "hanlp_index": AnalyzerType.CHINESE_ECOMMERCE, # Alias for index_ansj
  331 + "hanlp_standard": AnalyzerType.CHINESE_ECOMMERCE_QUERY, # Alias for query_ansj
  332 + "query_ansj": AnalyzerType.CHINESE_ECOMMERCE_QUERY,
303 "english": AnalyzerType.ENGLISH, 333 "english": AnalyzerType.ENGLISH,
304 "arabic": AnalyzerType.ARABIC, 334 "arabic": AnalyzerType.ARABIC,
305 "spanish": AnalyzerType.SPANISH, 335 "spanish": AnalyzerType.SPANISH,
docs/索引字段说明v2-mapping结构.md 0 → 100644
@@ -0,0 +1,231 @@ @@ -0,0 +1,231 @@
  1 +{
  2 + "mappings": {
  3 + "properties": {
  4 + "tenant_id": {
  5 + "type": "keyword"
  6 + },
  7 + "spu_id": {
  8 + "type": "keyword"
  9 + },
  10 +
  11 + "create_time": {
  12 + "type": "date"
  13 + },
  14 + "update_time": {
  15 + "type": "date"
  16 + },
  17 +
  18 + // 文本相关性相关字段
  19 + "title_zh": {
  20 + "type": "text",
  21 + "analyzer": "hanlp_index",
  22 + "search_analyzer": "hanlp_standard"
  23 + },
  24 + "brief_zh": {
  25 + "type": "text",
  26 + "analyzer": "hanlp_index",
  27 + "search_analyzer": "hanlp_standard"
  28 + },
  29 + "description_zh": {
  30 + "type": "text",
  31 + "analyzer": "hanlp_index",
  32 + "search_analyzer": "hanlp_standard"
  33 + },
  34 + "vendor_zh": {
  35 + "type": "text",
  36 + "analyzer": "hanlp_index",
  37 + "search_analyzer": "hanlp_standard",
  38 + "fields": {
  39 + "keyword": {
  40 + "type": "keyword",
  41 + "normalizer": "lowercase"
  42 + }
  43 + }
  44 + },
  45 +
  46 + "title_en": {
  47 + "type": "text",
  48 + "analyzer": "english",
  49 + "search_analyzer": "english"
  50 + },
  51 + "brief_en": {
  52 + "type": "text",
  53 + "analyzer": "english",
  54 + "search_analyzer": "english"
  55 +
  56 + },
  57 + "description_en": {
  58 + "type": "text",
  59 + "analyzer": "english",
  60 + "search_analyzer": "english"
  61 + },
  62 + "vendor_en": {
  63 + "type": "text",
  64 + "analyzer": "english",
  65 + "search_analyzer": "english",
  66 + "fields": {
  67 + "keyword": {
  68 + "type": "keyword",
  69 + "normalizer": "lowercase"
  70 + }
  71 + }
  72 + },
  73 +
  74 + "tags": {
  75 + "type": "keyword"
  76 + },
  77 +
  78 + "image_url": {
  79 + "type": "keyword",
  80 + "index": false
  81 + },
  82 +
  83 + // 语义向量
  84 + "title_embedding": {
  85 + "type": "dense_vector",
  86 + "dims": 1024,
  87 + "index": true,
  88 + "similarity": "dot_product"
  89 + },
  90 + "image_embedding": {
  91 + "type": "nested",
  92 + "properties": {
  93 + "vector": {
  94 + "type": "dense_vector",
  95 + "dims": 1024,
  96 + "index": true,
  97 + "similarity": "dot_product"
  98 + },
  99 + "url": {
  100 + "type": "text"
  101 + }
  102 + }
  103 + },
  104 +
  105 + // 分类相关
  106 + "category_path_zh": { // 提供模糊查询功能,辅助相关性计算
  107 + "type": "text",
  108 + "analyzer": "hanlp_index",
  109 + "search_analyzer": "hanlp_standard"
  110 + },
  111 + "category_path_en": { // 提供模糊查询功能,辅助相关性计算
  112 + "type": "text",
  113 + "analyzer": "english",
  114 + "search_analyzer": "english"
  115 + },
  116 + "category_name_zh": { // 提供模糊查询功能,辅助相关性计算
  117 + "type": "text",
  118 + "analyzer": "hanlp_index",
  119 + "search_analyzer": "hanlp_standard"
  120 + },
  121 + "category_name_en": { // 提供模糊查询功能,辅助相关性计算
  122 + "type": "text",
  123 + "analyzer": "english",
  124 + "search_analyzer": "english"
  125 + },
  126 +
  127 + "category_id": {
  128 + "type": "keyword"
  129 + },
  130 + "category_name": {
  131 + "type": "keyword"
  132 + },
  133 + "category_level": {
  134 + "type": "integer"
  135 + },
  136 + "category1_name": { // 不同层级下 可能有同名的情况,因此提供一二三级分开的查询方式
  137 + "type": "keyword"
  138 + },
  139 + "category2_name": {
  140 + "type": "keyword"
  141 + },
  142 + "category3_name": {
  143 + "type": "keyword"
  144 + },
  145 +
  146 + // sku款式、子sku属性
  147 + "specifications": {
  148 + "type": "nested",
  149 + "properties": {
  150 + "sku_id": { "type": "keyword" },
  151 + "name": { "type": "keyword" }, // "颜色", "容量"
  152 + "value": { "type": "keyword" } // "白色", "256GB"
  153 + }
  154 + },
  155 +
  156 + "option1_name": {
  157 + "type": "keyword"
  158 + },
  159 + "option2_name": {
  160 + "type": "keyword"
  161 + },
  162 + "option3_name": {
  163 + "type": "keyword"
  164 + },
  165 +
  166 + "min_price": {
  167 + "type": "float"
  168 + },
  169 + "max_price": {
  170 + "type": "float"
  171 + },
  172 + "compare_at_price": {
  173 + "type": "float"
  174 + },
  175 + "sku_prices": {
  176 + "type": "float"
  177 + },
  178 + "sku_weights": {
  179 + "type": "long"
  180 + },
  181 + "sku_weight_units": {
  182 + "type": "keyword"
  183 + },
  184 + "total_inventory": {
  185 + "type": "long"
  186 + },
  187 +
  188 + "skus": {
  189 + "type": "nested",
  190 + "properties": {
  191 + "sku_id": {
  192 + "type": "keyword"
  193 + },
  194 + "price": {
  195 + "type": "float"
  196 + },
  197 + "compare_at_price": {
  198 + "type": "float"
  199 + },
  200 + "sku_code": {
  201 + "type": "keyword"
  202 + },
  203 + "stock": {
  204 + "type": "long"
  205 + },
  206 + "weight": {
  207 + "type": "float"
  208 + },
  209 + "weight_unit": {
  210 + "type": "keyword"
  211 + },
  212 + "option1_value": {
  213 + "type": "keyword"
  214 + },
  215 + "option2_value": {
  216 + "type": "keyword"
  217 + },
  218 + "option3_value": {
  219 + "type": "keyword"
  220 + },
  221 + "image_src": {
  222 + "type": "keyword",
  223 + "index": false
  224 + }
  225 + }
  226 + }
  227 + }
  228 + }
  229 +}
  230 +
  231 +
docs/索引字段说明v2.md
@@ -4,247 +4,34 @@ SPU-SKU索引方案选型 @@ -4,247 +4,34 @@ SPU-SKU索引方案选型
4 除了title, brief description seo相关 cate tags vendor所有影响相关性的字段都在spu。 sku只有款式、价格、重量、库存等相关属性。所以,可以以spu为单位建立索引。 4 除了title, brief description seo相关 cate tags vendor所有影响相关性的字段都在spu。 sku只有款式、价格、重量、库存等相关属性。所以,可以以spu为单位建立索引。
5 sku中需要参与搜索的属性(比如价格、库存)展开到spu。 5 sku中需要参与搜索的属性(比如价格、库存)展开到spu。
6 sku的所有需要返回的字段作为nested字段,仅用于返回。 6 sku的所有需要返回的字段作为nested字段,仅用于返回。
7 -灌入数据准备  
8 -def build_product_document(product, skus):  
9 - # 提取价格列表(转换为float,保留两位小数)  
10 - price_list = [float(sku.price) for sku in skus if sku.price is not None]  
11 -  
12 - # 提取重量信息(重量转为int,单位统一为克;重量+单位拼接为字符串)  
13 - weight_list = [int(float(sku.weight) * 1000) for sku in skus if sku.weight is not None] # 转为整数克  
14 - weight_with_unit_list = [f"{sku.weight}{sku.weight_unit}" for sku in skus if sku.weight and sku.weight_unit]  
15 -  
16 - # 计算库存总和  
17 - total_stock = sum([sku.inventory_quantity for sku in skus if sku.inventory_quantity is not None])  
18 -  
19 - # 计算价格区间  
20 - min_price = min(price_list) if price_list else 0.0  
21 - max_price = max(price_list) if price_list else 0.0  
22 - 7 +# 写入 spu 级别索引
  8 +def build_product_document(product, variants):
23 return { 9 return {
24 "spu_id": str(product.id), 10 "spu_id": str(product.id),
25 "title": product.title, 11 "title": product.title,
26 12
27 - # SPU级别的选项名称定义(如:颜色、尺码、材质)  
28 - "option1_name": getattr(product, 'option1', None),  
29 - "option2_name": getattr(product, 'option2', None),  
30 - "option3_name": getattr(product, 'option3', None),  
31 -  
32 - # SKU搜索字段(展开) 13 + # Variant搜索字段(展开)
33 # 价格(float)、重量(int)、重量单位拼接重量(keyword),都以list形式灌入 14 # 价格(float)、重量(int)、重量单位拼接重量(keyword),都以list形式灌入
34 - "sku_prices": price_list, # 所有SKU价格列表,用于范围聚合  
35 - "sku_weights": weight_list, # 重量数值列表(转换为整数克)  
36 - "sku_weight_units": weight_with_unit_list, # 重量+单位字符串列表  
37 -  
38 - # 库存总和 将SKU的库存加起来作为一个值灌入  
39 - "total_inventory": total_stock, # SKU库存总和  
40 -  
41 - # 售价,灌入3个字段:SKU价格列表、最高价、最低价  
42 - "min_price": min_price, # 最低售价  
43 - "max_price": max_price, # 最高售价  
44 - "price_range": { # 价格区间对象,便于范围查询  
45 - "gte": min_price,  
46 - "lte": max_price  
47 - },  
48 -  
49 - # SKU详细信息(nested结构,仅用于返回)  
50 - "skus": [  
51 - {  
52 - "sku_id": str(sku.id),  
53 - "price": float(sku.price) if sku.price else 0.0,  
54 - "compare_at_price": float(sku.compare_at_price) if sku.compare_at_price else None,  
55 - "sku_code": sku.sku,  
56 - "stock": sku.inventory_quantity,  
57 - "weight": float(sku.weight) if sku.weight else None,  
58 - "weight_unit": sku.weight_unit,  
59 -  
60 - # SKU级别的选项值(对应SPU的选项名称)  
61 - "option1_value": sku.option1,  
62 - "option2_value": sku.option2,  
63 - "option3_value": sku.option3, 15 + # TODO 按要求补充
  16 +
  17 + # 库存总和 将sku的库存加起来作为一个值灌入
  18 + # 售价,灌入3个字段,一个 sku价格 以list形式灌入,一个最高价一个最低价
  19 + # TODO 按要求补充
64 20
65 - "image_src": sku.image_src 21 + # Variant详细信息(用于返回)
  22 + "variants": [
  23 + {
  24 + "sku_id": str(v.id),
  25 + "price": float(v.price),
  26 + "options": v.options
66 } 27 }
67 - for sku in skus 28 + for v in variants
68 ], 29 ],
69 -  
70 - # 其他SPU级别字段(根据索引文档补充)  
71 - "tenant_id": str(product.tenant_id),  
72 - "brief": product.brief,  
73 - "description": product.description,  
74 - "vendor": product.vendor,  
75 - "category": product.category,  
76 - "tags": product.tags.split(',') if product.tags else [],  
77 - "seo_title": product.seo_title,  
78 - "seo_description": product.seo_description,  
79 - "seo_keywords": product.seo_keywords.split(',') if product.seo_keywords else [],  
80 - "image_url": product.image_src,  
81 - "create_time": product.create_time.isoformat() if product.create_time else None,  
82 - "update_time": product.update_time.isoformat() if product.update_time else None  
83 - }  
84 - 索引定义  
85 -{  
86 - "mappings": {  
87 - "properties": {  
88 - "tenant_id": {  
89 - "type": "keyword"  
90 - },  
91 - "spu_id": {  
92 - "type": "keyword"  
93 - },  
94 - // 文本相关性相关字段  
95 - "title_zh": {  
96 - "type": "text",  
97 - "analyzer": "hanlp_index",  
98 - "search_analyzer": "hanlp_standard"  
99 - },  
100 - "brief_zh": {  
101 - "type": "text",  
102 - "analyzer": "hanlp_index",  
103 - "search_analyzer": "hanlp_standard"  
104 - },  
105 - "description_zh": {  
106 - "type": "text",  
107 - "analyzer": "hanlp_index",  
108 - "search_analyzer": "hanlp_standard"  
109 - },  
110 - "vendor_zh": {  
111 - "type": "text",  
112 - "analyzer": "hanlp_index",  
113 - "search_analyzer": "hanlp_standard",  
114 - "fields": {  
115 - "keyword": {  
116 - "type": "keyword",  
117 - "normalizer": "lowercase"  
118 - }  
119 - }  
120 - },  
121 -  
122 - "title_en": {  
123 - "type": "text",  
124 - "analyzer": "english",  
125 - "search_analyzer": "english",  
126 - },  
127 - "brief_en": {  
128 - "type": "text",  
129 - "analyzer": "english",  
130 - "search_analyzer": "english",  
131 30
132 - },  
133 - "description_en": {  
134 - "type": "text",  
135 - "analyzer": "english",  
136 - "search_analyzer": "english",  
137 - },  
138 - "vendor_en": {  
139 - "type": "text",  
140 - "analyzer": "english",  
141 - "search_analyzer": "english",  
142 - "fields": {  
143 - "keyword": {  
144 - "type": "keyword",  
145 - "normalizer": "lowercase"  
146 - }  
147 - }  
148 - },  
149 -  
150 - "tags": {  
151 - "type": "keyword",  
152 - },  
153 -  
154 -  
155 - "min_price": {  
156 - "type": "float"  
157 - },  
158 - "max_price": {  
159 - "type": "float"  
160 - },  
161 - "compare_at_price": {  
162 - "type": "float"  
163 - },  
164 - "sku_prices": {  
165 - "type": "float"  
166 - },  
167 - "sku_weights": {  
168 - "type": "long"  
169 - },  
170 - "sku_weight_units": {  
171 - "type": "keyword"  
172 - },  
173 - "total_inventory": {  
174 - "type": "long"  
175 - },  
176 -  
177 - "image_url": {  
178 - "type": "keyword",  
179 - "index": false  
180 - },  
181 -  
182 - "title_embedding": {  
183 - "type": "dense_vector",  
184 - "dims": 1024,  
185 - "index": true,  
186 - "similarity": "dot_product"  
187 - },  
188 -  
189 - "create_time": {  
190 - "type": "date"  
191 - },  
192 - "update_time": {  
193 - "type": "date"  
194 - },  
195 -  
196 - "option1_name": {  
197 - "type": "keyword"  
198 - },  
199 - "option2_name": {  
200 - "type": "keyword"  
201 - },  
202 - "option3_name": {  
203 - "type": "keyword"  
204 - },  
205 -  
206 - "skus": {  
207 - "type": "nested",  
208 - "properties": {  
209 - "sku_id": {  
210 - "type": "keyword"  
211 - },  
212 - "price": {  
213 - "type": "float"  
214 - },  
215 - "compare_at_price": {  
216 - "type": "float"  
217 - },  
218 - "sku_code": {  
219 - "type": "keyword"  
220 - },  
221 - "stock": {  
222 - "type": "long"  
223 - },  
224 - "weight": {  
225 - "type": "float"  
226 - },  
227 - "weight_unit": {  
228 - "type": "keyword"  
229 - },  
230 - "option1_value": {  
231 - "type": "keyword"  
232 - },  
233 - "option2_value": {  
234 - "type": "keyword"  
235 - },  
236 - "option3_value": {  
237 - "type": "keyword"  
238 - },  
239 - "image_src": {  
240 - "type": "keyword",  
241 - "index": false  
242 - }  
243 - }  
244 - } 31 +
  32 + "min_price": min(v.price for v in variants),
  33 + "max_price": max(v.price for v in variants)
245 } 34 }
246 - }  
247 -}  
248 1.2 查询方案 35 1.2 查询方案
249 对数组字段使用 dis_max,只取最高分,避免累加。 36 对数组字段使用 dis_max,只取最高分,避免累加。
250 其他重点字段 37 其他重点字段
@@ -333,26 +120,56 @@ S red @@ -333,26 +120,56 @@ S red
333 1. API 在 SPU 的维度直接返回3个属性定义,存储在 shoplazza_product_option 中: 120 1. API 在 SPU 的维度直接返回3个属性定义,存储在 shoplazza_product_option 中:
334 1. API在 SKU的维度直接返回3个属性值,存储在 shoplazza_product_sku 表的 option 相关的字段中: 121 1. API在 SKU的维度直接返回3个属性值,存储在 shoplazza_product_sku 表的 option 相关的字段中:
335 5.3 ES索引 122 5.3 ES索引
336 -5.3.1  
337 - 3nested,支持超过3个属性(动态)。只用作返回,不能查询。节省索引空间 123 +
338 "specifications": { 124 "specifications": {
339 "type": "nested", 125 "type": "nested",
340 "properties": { 126 "properties": {
341 - "name": { "type": "keyword","index": false },  
342 - "value": { "type": "keyword","index": false } 127 + "name": { "type": "keyword" }, // "颜色", "容量"
  128 + "value": { "type": "keyword" } // "白色", "256GB"
343 } 129 }
344 }, 130 },
345 131
346 -6. SEO相关字段  
347 -6.1 数据源  
348 -SEO标题 SEO描述 SEO URL Handle SEO URL 重定向 SEO关键词  
349 -最多5000字符 最多5000字符 "最多支持输入255字符  
350 - (SEO URL handle只对SEO URL的「URL参数」部分进行更改,即“products/”后的内容,如:products/「URL参数」  
351 - )" "创建URL重定向,访问修改前链接可跳转到修改后的新链接页面  
352 -「Y」:TRUE  
353 -「N」:FALSE " 多个关键词请用「英文逗号」隔开  
354 -  
355 -6.2 Mysql  
356 -6.3 ES索引  
357 -6.3.1 输入数据  
358 -6.3.2 索引方法  
359 \ No newline at end of file 132 \ No newline at end of file
  133 + 另外还需要包含一个单独的字段,main_option (即店铺主题装修里面配置的 颜色切换 - 变体名称,也就是列表页商品的子sku显示维度)
  134 + "main_option": { "type": "keyword" }
  135 +查询指定款式
  136 +{
  137 + "query": {
  138 + "nested": {
  139 + "path": "specifications",
  140 + "query": {
  141 + "bool": {
  142 + "must": [
  143 + { "term": { "specifications.name": "颜色" } },
  144 + { "term": { "specifications.value": "绿色" } }
  145 + ]
  146 + }
  147 + }
  148 + }
  149 + }
  150 +}
  151 +按 name 做分面搜索(聚合)
  152 +
  153 +{
  154 + "aggs": {
  155 + "specs": {
  156 + "nested": { "path": "specifications" },
  157 + "aggs": {
  158 + "by_name": {
  159 + "terms": {
  160 + "field": "specifications.name",
  161 + "size": 20
  162 + },
  163 + "aggs": {
  164 + "value_counts": {
  165 + "terms": {
  166 + "field": "specifications.value",
  167 + "size": 10
  168 + }
  169 + }
  170 + }
  171 + }
  172 + }
  173 + }
  174 + }
  175 +}
  176 +
360 \ No newline at end of file 177 \ No newline at end of file
indexer/spu_transformer.py
@@ -38,12 +38,12 @@ class SPUTransformer: @@ -38,12 +38,12 @@ class SPUTransformer:
38 """ 38 """
39 query = text(""" 39 query = text("""
40 SELECT 40 SELECT
41 - id, shop_id, shoplazza_id, handle, title, brief, description,  
42 - spu, vendor, vendor_url, seo_title, seo_description, seo_keywords, 41 + id, shop_id, shoplazza_id, title, brief, description,
  42 + spu, vendor, vendor_url,
43 image_src, image_width, image_height, image_path, image_alt, 43 image_src, image_width, image_height, image_path, image_alt,
44 - tags, note, category,  
45 - shoplazza_created_at, shoplazza_updated_at, tenant_id,  
46 - creator, create_time, updater, update_time, deleted 44 + tags, note, category, category_id, category_google_id,
  45 + category_level, category_path,
  46 + tenant_id, creator, create_time, updater, update_time, deleted
47 FROM shoplazza_product_spu 47 FROM shoplazza_product_spu
48 WHERE tenant_id = :tenant_id AND deleted = 0 48 WHERE tenant_id = :tenant_id AND deleted = 0
49 """) 49 """)
@@ -114,6 +114,30 @@ class SPUTransformer: @@ -114,6 +114,30 @@ class SPUTransformer:
114 114
115 return df 115 return df
116 116
  117 + def load_option_data(self) -> pd.DataFrame:
  118 + """
  119 + Load option data from MySQL.
  120 +
  121 + Returns:
  122 + DataFrame with option data (name, position for each SPU)
  123 + """
  124 + query = text("""
  125 + SELECT
  126 + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
  127 + position, name, values, tenant_id,
  128 + creator, create_time, updater, update_time, deleted
  129 + FROM shoplazza_product_option
  130 + WHERE tenant_id = :tenant_id AND deleted = 0
  131 + ORDER BY spu_id, position
  132 + """)
  133 +
  134 + with self.db_engine.connect() as conn:
  135 + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})
  136 +
  137 + print(f"DEBUG: Loaded {len(df)} option records for tenant_id={self.tenant_id}")
  138 +
  139 + return df
  140 +
117 def transform_batch(self) -> List[Dict[str, Any]]: 141 def transform_batch(self) -> List[Dict[str, Any]]:
118 """ 142 """
119 Transform SPU and SKU data into ES documents. 143 Transform SPU and SKU data into ES documents.
@@ -124,12 +148,16 @@ class SPUTransformer: @@ -124,12 +148,16 @@ class SPUTransformer:
124 # Load data 148 # Load data
125 spu_df = self.load_spu_data() 149 spu_df = self.load_spu_data()
126 sku_df = self.load_sku_data() 150 sku_df = self.load_sku_data()
  151 + option_df = self.load_option_data()
127 152
128 if spu_df.empty: 153 if spu_df.empty:
129 return [] 154 return []
130 155
131 # Group SKUs by SPU 156 # Group SKUs by SPU
132 sku_groups = sku_df.groupby('spu_id') 157 sku_groups = sku_df.groupby('spu_id')
  158 +
  159 + # Group options by SPU
  160 + option_groups = option_df.groupby('spu_id') if not option_df.empty else None
133 161
134 documents = [] 162 documents = []
135 for _, spu_row in spu_df.iterrows(): 163 for _, spu_row in spu_df.iterrows():
@@ -138,8 +166,11 @@ class SPUTransformer: @@ -138,8 +166,11 @@ class SPUTransformer:
138 # Get SKUs for this SPU 166 # Get SKUs for this SPU
139 skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame() 167 skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame()
140 168
  169 + # Get options for this SPU
  170 + options = option_groups.get_group(spu_id) if option_groups and spu_id in option_groups.groups else pd.DataFrame()
  171 +
141 # Transform to ES document 172 # Transform to ES document
142 - doc = self._transform_spu_to_doc(spu_row, skus) 173 + doc = self._transform_spu_to_doc(spu_row, skus, options)
143 if doc: 174 if doc:
144 documents.append(doc) 175 documents.append(doc)
145 176
@@ -148,7 +179,8 @@ class SPUTransformer: @@ -148,7 +179,8 @@ class SPUTransformer:
148 def _transform_spu_to_doc( 179 def _transform_spu_to_doc(
149 self, 180 self,
150 spu_row: pd.Series, 181 spu_row: pd.Series,
151 - skus: pd.DataFrame 182 + skus: pd.DataFrame,
  183 + options: pd.DataFrame
152 ) -> Optional[Dict[str, Any]]: 184 ) -> Optional[Dict[str, Any]]:
153 """ 185 """
154 Transform a single SPU row and its SKUs into an ES document. 186 Transform a single SPU row and its SKUs into an ES document.
@@ -156,6 +188,7 @@ class SPUTransformer: @@ -156,6 +188,7 @@ class SPUTransformer:
156 Args: 188 Args:
157 spu_row: SPU row from database 189 spu_row: SPU row from database
158 skus: DataFrame with SKUs for this SPU 190 skus: DataFrame with SKUs for this SPU
  191 + options: DataFrame with options for this SPU
159 192
160 Returns: 193 Returns:
161 ES document or None if transformation fails 194 ES document or None if transformation fails
@@ -168,41 +201,66 @@ class SPUTransformer: @@ -168,41 +201,66 @@ class SPUTransformer:
168 # SPU ID 201 # SPU ID
169 doc['spu_id'] = str(spu_row['id']) 202 doc['spu_id'] = str(spu_row['id'])
170 203
171 - # Handle  
172 - if pd.notna(spu_row.get('handle')):  
173 - doc['handle'] = str(spu_row['handle'])  
174 -  
175 - # Title 204 + # 文本相关性相关字段(中英文双语,暂时只填充中文)
176 if pd.notna(spu_row.get('title')): 205 if pd.notna(spu_row.get('title')):
177 - doc['title'] = str(spu_row['title']) 206 + doc['title_zh'] = str(spu_row['title'])
  207 + doc['title_en'] = None # 暂时设为空
178 208
179 - # Brief  
180 if pd.notna(spu_row.get('brief')): 209 if pd.notna(spu_row.get('brief')):
181 - doc['brief'] = str(spu_row['brief']) 210 + doc['brief_zh'] = str(spu_row['brief'])
  211 + doc['brief_en'] = None
182 212
183 - # Description  
184 if pd.notna(spu_row.get('description')): 213 if pd.notna(spu_row.get('description')):
185 - doc['description'] = str(spu_row['description']) 214 + doc['description_zh'] = str(spu_row['description'])
  215 + doc['description_en'] = None
186 216
187 - # SEO fields  
188 - if pd.notna(spu_row.get('seo_title')):  
189 - doc['seo_title'] = str(spu_row['seo_title'])  
190 - if pd.notna(spu_row.get('seo_description')):  
191 - doc['seo_description'] = str(spu_row['seo_description'])  
192 - if pd.notna(spu_row.get('seo_keywords')):  
193 - doc['seo_keywords'] = str(spu_row['seo_keywords'])  
194 -  
195 - # Vendor  
196 if pd.notna(spu_row.get('vendor')): 217 if pd.notna(spu_row.get('vendor')):
197 - doc['vendor'] = str(spu_row['vendor']) 218 + doc['vendor_zh'] = str(spu_row['vendor'])
  219 + doc['vendor_en'] = None
198 220
199 # Tags 221 # Tags
200 if pd.notna(spu_row.get('tags')): 222 if pd.notna(spu_row.get('tags')):
201 - doc['tags'] = str(spu_row['tags']) 223 + # Tags是逗号分隔的字符串,需要转换为数组
  224 + tags_str = str(spu_row['tags'])
  225 + doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()]
  226 +
  227 + # Category相关字段
  228 + if pd.notna(spu_row.get('category_path')):
  229 + category_path = str(spu_row['category_path'])
  230 + doc['category_path_zh'] = category_path
  231 + doc['category_path_en'] = None # 暂时设为空
  232 +
  233 + # 解析category_path获取多层级分类名称
  234 + path_parts = category_path.split('/')
  235 + if len(path_parts) > 0:
  236 + doc['category1_name'] = path_parts[0].strip()
  237 + if len(path_parts) > 1:
  238 + doc['category2_name'] = path_parts[1].strip()
  239 + if len(path_parts) > 2:
  240 + doc['category3_name'] = path_parts[2].strip()
202 241
203 - # Category  
204 if pd.notna(spu_row.get('category')): 242 if pd.notna(spu_row.get('category')):
205 - doc['category'] = str(spu_row['category']) 243 + category_name = str(spu_row['category'])
  244 + doc['category_name_zh'] = category_name
  245 + doc['category_name_en'] = None
  246 + doc['category_name'] = category_name
  247 +
  248 + if pd.notna(spu_row.get('category_id')):
  249 + doc['category_id'] = str(int(spu_row['category_id']))
  250 +
  251 + if pd.notna(spu_row.get('category_level')):
  252 + doc['category_level'] = int(spu_row['category_level'])
  253 +
  254 + # Option名称(从option表获取)
  255 + if not options.empty:
  256 + # 按position排序获取option名称
  257 + sorted_options = options.sort_values('position')
  258 + if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')):
  259 + doc['option1_name'] = str(sorted_options.iloc[0]['name'])
  260 + if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')):
  261 + doc['option2_name'] = str(sorted_options.iloc[1]['name'])
  262 + if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')):
  263 + doc['option3_name'] = str(sorted_options.iloc[2]['name'])
206 264
207 # Image URL 265 # Image URL
208 if pd.notna(spu_row.get('image_src')): 266 if pd.notna(spu_row.get('image_src')):
@@ -211,27 +269,85 @@ class SPUTransformer: @@ -211,27 +269,85 @@ class SPUTransformer:
211 image_src = f"//{image_src}" if image_src.startswith('//') else image_src 269 image_src = f"//{image_src}" if image_src.startswith('//') else image_src
212 doc['image_url'] = image_src 270 doc['image_url'] = image_src
213 271
214 - # Process SKUs 272 + # Process SKUs and build specifications
215 skus_list = [] 273 skus_list = []
216 prices = [] 274 prices = []
217 compare_prices = [] 275 compare_prices = []
  276 + sku_prices = []
  277 + sku_weights = []
  278 + sku_weight_units = []
  279 + total_inventory = 0
  280 + specifications = []
  281 +
  282 + # 构建option名称映射(position -> name)
  283 + option_name_map = {}
  284 + if not options.empty:
  285 + for _, opt_row in options.iterrows():
  286 + position = opt_row.get('position')
  287 + name = opt_row.get('name')
  288 + if pd.notna(position) and pd.notna(name):
  289 + option_name_map[int(position)] = str(name)
218 290
219 for _, sku_row in skus.iterrows(): 291 for _, sku_row in skus.iterrows():
220 - sku_data = self._transform_sku_row(sku_row) 292 + sku_data = self._transform_sku_row(sku_row, option_name_map)
221 if sku_data: 293 if sku_data:
222 skus_list.append(sku_data) 294 skus_list.append(sku_data)
  295 +
  296 + # 收集价格信息
223 if 'price' in sku_data and sku_data['price'] is not None: 297 if 'price' in sku_data and sku_data['price'] is not None:
224 try: 298 try:
225 - prices.append(float(sku_data['price'])) 299 + price_val = float(sku_data['price'])
  300 + prices.append(price_val)
  301 + sku_prices.append(price_val)
226 except (ValueError, TypeError): 302 except (ValueError, TypeError):
227 pass 303 pass
  304 +
228 if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None: 305 if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None:
229 try: 306 try:
230 compare_prices.append(float(sku_data['compare_at_price'])) 307 compare_prices.append(float(sku_data['compare_at_price']))
231 except (ValueError, TypeError): 308 except (ValueError, TypeError):
232 pass 309 pass
  310 +
  311 + # 收集重量信息
  312 + if 'weight' in sku_data and sku_data['weight'] is not None:
  313 + try:
  314 + sku_weights.append(int(float(sku_data['weight'])))
  315 + except (ValueError, TypeError):
  316 + pass
  317 +
  318 + if 'weight_unit' in sku_data and sku_data['weight_unit']:
  319 + sku_weight_units.append(str(sku_data['weight_unit']))
  320 +
  321 + # 收集库存信息
  322 + if 'stock' in sku_data and sku_data['stock'] is not None:
  323 + try:
  324 + total_inventory += int(sku_data['stock'])
  325 + except (ValueError, TypeError):
  326 + pass
  327 +
  328 + # 构建specifications(从SKU的option值和option表的name)
  329 + sku_id = str(sku_row['id'])
  330 + if pd.notna(sku_row.get('option1')) and 1 in option_name_map:
  331 + specifications.append({
  332 + 'sku_id': sku_id,
  333 + 'name': option_name_map[1],
  334 + 'value': str(sku_row['option1'])
  335 + })
  336 + if pd.notna(sku_row.get('option2')) and 2 in option_name_map:
  337 + specifications.append({
  338 + 'sku_id': sku_id,
  339 + 'name': option_name_map[2],
  340 + 'value': str(sku_row['option2'])
  341 + })
  342 + if pd.notna(sku_row.get('option3')) and 3 in option_name_map:
  343 + specifications.append({
  344 + 'sku_id': sku_id,
  345 + 'name': option_name_map[3],
  346 + 'value': str(sku_row['option3'])
  347 + })
233 348
234 doc['skus'] = skus_list 349 doc['skus'] = skus_list
  350 + doc['specifications'] = specifications
235 351
236 # Calculate price ranges 352 # Calculate price ranges
237 if prices: 353 if prices:
@@ -246,6 +362,19 @@ class SPUTransformer: @@ -246,6 +362,19 @@ class SPUTransformer:
246 else: 362 else:
247 doc['compare_at_price'] = None 363 doc['compare_at_price'] = None
248 364
  365 + # SKU扁平化字段
  366 + doc['sku_prices'] = sku_prices
  367 + doc['sku_weights'] = sku_weights
  368 + doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重
  369 + doc['total_inventory'] = total_inventory
  370 +
  371 + # Image URL
  372 + if pd.notna(spu_row.get('image_src')):
  373 + image_src = str(spu_row['image_src'])
  374 + if not image_src.startswith('http'):
  375 + image_src = f"//{image_src}" if image_src.startswith('//') else image_src
  376 + doc['image_url'] = image_src
  377 +
249 # Time fields - convert datetime to ISO format string for ES DATE type 378 # Time fields - convert datetime to ISO format string for ES DATE type
250 if pd.notna(spu_row.get('create_time')): 379 if pd.notna(spu_row.get('create_time')):
251 create_time = spu_row['create_time'] 380 create_time = spu_row['create_time']
@@ -260,29 +389,16 @@ class SPUTransformer: @@ -260,29 +389,16 @@ class SPUTransformer:
260 doc['update_time'] = update_time.isoformat() 389 doc['update_time'] = update_time.isoformat()
261 else: 390 else:
262 doc['update_time'] = str(update_time) 391 doc['update_time'] = str(update_time)
263 -  
264 - if pd.notna(spu_row.get('shoplazza_created_at')):  
265 - shoplazza_created_at = spu_row['shoplazza_created_at']  
266 - if hasattr(shoplazza_created_at, 'isoformat'):  
267 - doc['shoplazza_created_at'] = shoplazza_created_at.isoformat()  
268 - else:  
269 - doc['shoplazza_created_at'] = str(shoplazza_created_at)  
270 -  
271 - if pd.notna(spu_row.get('shoplazza_updated_at')):  
272 - shoplazza_updated_at = spu_row['shoplazza_updated_at']  
273 - if hasattr(shoplazza_updated_at, 'isoformat'):  
274 - doc['shoplazza_updated_at'] = shoplazza_updated_at.isoformat()  
275 - else:  
276 - doc['shoplazza_updated_at'] = str(shoplazza_updated_at)  
277 392
278 return doc 393 return doc
279 394
280 - def _transform_sku_row(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]: 395 + def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:
281 """ 396 """
282 Transform a SKU row into a SKU object. 397 Transform a SKU row into a SKU object.
283 398
284 Args: 399 Args:
285 sku_row: SKU row from database 400 sku_row: SKU row from database
  401 + option_name_map: Mapping from position to option name
286 402
287 Returns: 403 Returns:
288 SKU dictionary or None 404 SKU dictionary or None
@@ -292,10 +408,6 @@ class SPUTransformer: @@ -292,10 +408,6 @@ class SPUTransformer:
292 # SKU ID 408 # SKU ID
293 sku_data['sku_id'] = str(sku_row['id']) 409 sku_data['sku_id'] = str(sku_row['id'])
294 410
295 - # Title  
296 - if pd.notna(sku_row.get('title')):  
297 - sku_data['title'] = str(sku_row['title'])  
298 -  
299 # Price 411 # Price
300 if pd.notna(sku_row.get('price')): 412 if pd.notna(sku_row.get('price')):
301 try: 413 try:
@@ -314,9 +426,9 @@ class SPUTransformer: @@ -314,9 +426,9 @@ class SPUTransformer:
314 else: 426 else:
315 sku_data['compare_at_price'] = None 427 sku_data['compare_at_price'] = None
316 428
317 - # SKU 429 + # SKU Code
318 if pd.notna(sku_row.get('sku')): 430 if pd.notna(sku_row.get('sku')):
319 - sku_data['sku'] = str(sku_row['sku']) 431 + sku_data['sku_code'] = str(sku_row['sku'])
320 432
321 # Stock 433 # Stock
322 if pd.notna(sku_row.get('inventory_quantity')): 434 if pd.notna(sku_row.get('inventory_quantity')):
@@ -327,17 +439,30 @@ class SPUTransformer: @@ -327,17 +439,30 @@ class SPUTransformer:
327 else: 439 else:
328 sku_data['stock'] = 0 440 sku_data['stock'] = 0
329 441
330 - # Options (from option1, option2, option3)  
331 - options = {} 442 + # Weight
  443 + if pd.notna(sku_row.get('weight')):
  444 + try:
  445 + sku_data['weight'] = float(sku_row['weight'])
  446 + except (ValueError, TypeError):
  447 + sku_data['weight'] = None
  448 + else:
  449 + sku_data['weight'] = None
  450 +
  451 + # Weight unit
  452 + if pd.notna(sku_row.get('weight_unit')):
  453 + sku_data['weight_unit'] = str(sku_row['weight_unit'])
  454 +
  455 + # Option values
332 if pd.notna(sku_row.get('option1')): 456 if pd.notna(sku_row.get('option1')):
333 - options['option1'] = str(sku_row['option1']) 457 + sku_data['option1_value'] = str(sku_row['option1'])
334 if pd.notna(sku_row.get('option2')): 458 if pd.notna(sku_row.get('option2')):
335 - options['option2'] = str(sku_row['option2']) 459 + sku_data['option2_value'] = str(sku_row['option2'])
336 if pd.notna(sku_row.get('option3')): 460 if pd.notna(sku_row.get('option3')):
337 - options['option3'] = str(sku_row['option3'])  
338 -  
339 - if options:  
340 - sku_data['options'] = options 461 + sku_data['option3_value'] = str(sku_row['option3'])
  462 +
  463 + # Image src
  464 + if pd.notna(sku_row.get('image_src')):
  465 + sku_data['image_src'] = str(sku_row['image_src'])
341 466
342 return sku_data 467 return sku_data
343 468