Commit 5dcddc06eabb865f4f59af2f73e384ea34148c15

Authored by tangwang
1 parent 39e63ad1

索引重构

主要是对 分类 属性 子sku 等重要字段的处理。
参考文档《 @docs/索引字段说明v2-mapping结构.md 》《 @docs/索引字段说明v2.md 》

feat:
1. 更新 field_types.py
添加 hanlp_index/hanlp_standard 分析器映射(映射到 CHINESE_ECOMMERCE/CHINESE_ECOMMERCE_QUERY)
支持 keyword_normalizer 配置(用于 vendor.keyword 的 lowercase normalizer)
更新 get_default_analyzers() 添加 hanlp 分析器和 lowercase normalizer
修复 image_embedding 的 url 字段类型为 text

2. 更新 config.yaml(32-207行)
移除无用字段:handle, seo_title, seo_description, seo_keywords, shoplazza_created_at, shoplazza_updated_at
添加中英文字段:title_zh, title_en, brief_zh, brief_en, description_zh, description_en, vendor_zh, vendor_en
添加 category 多层级字段:category_path_zh, category_path_en, category_name_zh, category_name_en, category_id, category_name, category_level, category1_name, category2_name, category3_name
添加 specifications 嵌套字段
添加 option 名称字段:option1_name, option2_name, option3_name
添加 SKU 扁平化字段:sku_prices, sku_weights, sku_weight_units, total_inventory
更新 skus 嵌套结构以匹配目标 mapping
添加 image_embedding 嵌套字段
更新 indexes 配置以使用新字段名

3. 更新 config_loader.py
添加 keyword_normalizer 字段支持

4. 重构 spu_transformer.py
添加 load_option_data() 方法从 option 表加载数据
更新 transform_batch() 加载 option 数据
重构 _transform_spu_to_doc():
实现中英文字段映射(暂时只填充中文)
实现 category 多层级字段映射和 category_path 解析
实现 specifications 构建(从 option 表获取 name,从 SKU 获取 value)
实现 option 名称字段映射
实现 SKU 扁平化字段计算
更新 skus 嵌套结构
重构 _transform_sku_row() 以匹配新的 SKU 结构
移除 SEO 和 handle 字段的处理
config/config.yaml
... ... @@ -29,86 +29,88 @@ fields:
29 29 store: true
30 30 return_in_source: true
31 31  
32   - - name: "handle"
33   - type: "KEYWORD"
34   - index: true
35   - store: true
36   - return_in_source: true
37   -
38   - # 文本搜索字段
39   - - name: "title"
  32 + # 文本相关性相关字段(中英文双语)
  33 + - name: "title_zh"
40 34 type: "TEXT"
41   - analyzer: "chinese_ecommerce"
  35 + analyzer: "hanlp_index"
  36 + search_analyzer: "hanlp_standard"
42 37 boost: 3.0
43 38 index: true
44 39 store: true
45 40 return_in_source: true
46 41  
47   - - name: "brief"
  42 + - name: "brief_zh"
48 43 type: "TEXT"
49   - analyzer: "chinese_ecommerce"
  44 + analyzer: "hanlp_index"
  45 + search_analyzer: "hanlp_standard"
50 46 boost: 1.5
51 47 index: true
52 48 store: true
53 49 return_in_source: true
54 50  
55   - - name: "description"
  51 + - name: "description_zh"
56 52 type: "TEXT"
57   - analyzer: "chinese_ecommerce"
  53 + analyzer: "hanlp_index"
  54 + search_analyzer: "hanlp_standard"
58 55 boost: 1.0
59 56 index: true
60 57 store: true
61 58 return_in_source: true
62 59  
63   - # SEO字段(提升相关性)
64   - - name: "seo_title"
  60 + - name: "vendor_zh"
65 61 type: "TEXT"
66   - analyzer: "chinese_ecommerce"
67   - boost: 2.0
68   - index: true
69   - store: true
70   - return_in_source: false # SEO字段通常不需要在结果中返回
71   -
72   - - name: "seo_description"
73   - type: "TEXT"
74   - analyzer: "chinese_ecommerce"
  62 + analyzer: "hanlp_index"
  63 + search_analyzer: "hanlp_standard"
75 64 boost: 1.5
76 65 index: true
77 66 store: true
78   - return_in_source: false
  67 + return_in_source: true
  68 + keyword_subfield: true
  69 + keyword_normalizer: "lowercase"
79 70  
80   - - name: "seo_keywords"
  71 + - name: "title_en"
81 72 type: "TEXT"
82   - analyzer: "chinese_ecommerce"
83   - boost: 2.0
  73 + analyzer: "english"
  74 + search_analyzer: "english"
  75 + boost: 3.0
84 76 index: true
85 77 store: true
86   - return_in_source: false
  78 + return_in_source: true
87 79  
88   - # 分类和标签字段(TEXT + KEYWORD双重索引)
89   - - name: "vendor"
90   - type: "HKText"
91   - analyzer: "chinese_ecommerce"
  80 + - name: "brief_en"
  81 + type: "TEXT"
  82 + analyzer: "english"
  83 + search_analyzer: "english"
92 84 boost: 1.5
93 85 index: true
94 86 store: true
95 87 return_in_source: true
96 88  
97   - - name: "tags"
98   - type: "HKText"
99   - analyzer: "chinese_ecommerce"
  89 + - name: "description_en"
  90 + type: "TEXT"
  91 + analyzer: "english"
  92 + search_analyzer: "english"
100 93 boost: 1.0
101 94 index: true
102 95 store: true
103 96 return_in_source: true
104 97  
105   - - name: "category"
106   - type: "HKText"
107   - analyzer: "chinese_ecommerce"
  98 + - name: "vendor_en"
  99 + type: "TEXT"
  100 + analyzer: "english"
  101 + search_analyzer: "english"
108 102 boost: 1.5
109 103 index: true
110 104 store: true
111 105 return_in_source: true
  106 + keyword_subfield: true
  107 + keyword_normalizer: "lowercase"
  108 +
  109 + - name: "tags"
  110 + type: "KEYWORD"
  111 + index: true
  112 + store: true
  113 + return_in_source: true
112 114  
113 115 # 价格字段(扁平化)
114 116 - name: "min_price"
... ... @@ -129,6 +131,30 @@ fields:
129 131 store: true
130 132 return_in_source: true
131 133  
  134 + - name: "sku_prices"
  135 + type: "FLOAT"
  136 + index: true
  137 + store: true
  138 + return_in_source: true
  139 +
  140 + - name: "sku_weights"
  141 + type: "LONG"
  142 + index: true
  143 + store: true
  144 + return_in_source: true
  145 +
  146 + - name: "sku_weight_units"
  147 + type: "KEYWORD"
  148 + index: true
  149 + store: true
  150 + return_in_source: true
  151 +
  152 + - name: "total_inventory"
  153 + type: "LONG"
  154 + index: true
  155 + store: true
  156 + return_in_source: true
  157 +
132 158 # 图片字段(用于显示,不参与搜索)
133 159 - name: "image_url"
134 160 type: "KEYWORD"
... ... @@ -136,7 +162,7 @@ fields:
136 162 store: true
137 163 return_in_source: true
138 164  
139   - # 文本嵌入字段(用于语义搜索)
  165 + # 语义向量
140 166 - name: "title_embedding"
141 167 type: "TEXT_EMBEDDING"
142 168 embedding_dims: 1024
... ... @@ -145,30 +171,137 @@ fields:
145 171 store: false
146 172 return_in_source: false # 嵌入向量通常不需要在结果中返回
147 173  
148   - # 时间字段
149   - - name: "create_time"
150   - type: "DATE"
  174 + - name: "image_embedding"
  175 + type: "IMAGE_EMBEDDING"
  176 + embedding_dims: 1024
  177 + embedding_similarity: "dot_product"
  178 + nested: true
  179 + index: true
  180 + store: false
  181 + return_in_source: false
  182 +
  183 + # 分类相关字段
  184 + - name: "category_path_zh"
  185 + type: "TEXT"
  186 + analyzer: "hanlp_index"
  187 + search_analyzer: "hanlp_standard"
  188 + boost: 1.5
151 189 index: true
152 190 store: true
153 191 return_in_source: true
154 192  
155   - - name: "update_time"
156   - type: "DATE"
  193 + - name: "category_path_en"
  194 + type: "TEXT"
  195 + analyzer: "english"
  196 + search_analyzer: "english"
  197 + boost: 1.5
  198 + index: true
  199 + store: true
  200 + return_in_source: true
  201 +
  202 + - name: "category_name_zh"
  203 + type: "TEXT"
  204 + analyzer: "hanlp_index"
  205 + search_analyzer: "hanlp_standard"
  206 + boost: 1.5
  207 + index: true
  208 + store: true
  209 + return_in_source: true
  210 +
  211 + - name: "category_name_en"
  212 + type: "TEXT"
  213 + analyzer: "english"
  214 + search_analyzer: "english"
  215 + boost: 1.5
157 216 index: true
158 217 store: true
159 218 return_in_source: true
160 219  
161   - - name: "shoplazza_created_at"
  220 + - name: "category_id"
  221 + type: "KEYWORD"
  222 + index: true
  223 + store: true
  224 + return_in_source: true
  225 +
  226 + - name: "category_name"
  227 + type: "KEYWORD"
  228 + index: true
  229 + store: true
  230 + return_in_source: true
  231 +
  232 + - name: "category_level"
  233 + type: "INT"
  234 + index: true
  235 + store: true
  236 + return_in_source: true
  237 +
  238 + - name: "category1_name"
  239 + type: "KEYWORD"
  240 + index: true
  241 + store: true
  242 + return_in_source: true
  243 +
  244 + - name: "category2_name"
  245 + type: "KEYWORD"
  246 + index: true
  247 + store: true
  248 + return_in_source: true
  249 +
  250 + - name: "category3_name"
  251 + type: "KEYWORD"
  252 + index: true
  253 + store: true
  254 + return_in_source: true
  255 +
  256 + # SKU款式、子sku属性
  257 + - name: "specifications"
  258 + type: "JSON"
  259 + nested: true
  260 + return_in_source: true
  261 + nested_properties:
  262 + sku_id:
  263 + type: "keyword"
  264 + index: true
  265 + store: true
  266 + name:
  267 + type: "keyword"
  268 + index: true
  269 + store: true
  270 + value:
  271 + type: "keyword"
  272 + index: true
  273 + store: true
  274 +
  275 + - name: "option1_name"
  276 + type: "KEYWORD"
  277 + index: true
  278 + store: true
  279 + return_in_source: true
  280 +
  281 + - name: "option2_name"
  282 + type: "KEYWORD"
  283 + index: true
  284 + store: true
  285 + return_in_source: true
  286 +
  287 + - name: "option3_name"
  288 + type: "KEYWORD"
  289 + index: true
  290 + store: true
  291 + return_in_source: true
  292 +
  293 + # 时间字段
  294 + - name: "create_time"
162 295 type: "DATE"
163 296 index: true
164 297 store: true
165   - return_in_source: false # 通常不需要返回
  298 + return_in_source: true
166 299  
167   - - name: "shoplazza_updated_at"
  300 + - name: "update_time"
168 301 type: "DATE"
169 302 index: true
170 303 store: true
171   - return_in_source: false # 通常不需要返回
  304 + return_in_source: true
172 305  
173 306 # 嵌套skus字段
174 307 - name: "skus"
... ... @@ -180,11 +313,6 @@ fields:
180 313 type: "keyword"
181 314 index: true
182 315 store: true
183   - title:
184   - type: "text"
185   - analyzer: "chinese_ecommerce"
186   - index: true
187   - store: true
188 316 price:
189 317 type: "float"
190 318 index: true
... ... @@ -193,7 +321,7 @@ fields:
193 321 type: "float"
194 322 index: true
195 323 store: true
196   - sku:
  324 + sku_code:
197 325 type: "keyword"
198 326 index: true
199 327 store: true
... ... @@ -201,46 +329,65 @@ fields:
201 329 type: "long"
202 330 index: true
203 331 store: true
204   - options:
205   - type: "object"
206   - enabled: true
  332 + weight:
  333 + type: "float"
  334 + index: true
  335 + store: true
  336 + weight_unit:
  337 + type: "keyword"
  338 + index: true
  339 + store: true
  340 + option1_value:
  341 + type: "keyword"
  342 + index: true
  343 + store: true
  344 + option2_value:
  345 + type: "keyword"
  346 + index: true
  347 + store: true
  348 + option3_value:
  349 + type: "keyword"
  350 + index: true
  351 + store: true
  352 + image_src:
  353 + type: "keyword"
  354 + index: false
  355 + store: true
207 356  
208 357 # Index Structure (Query Domains)
209 358 indexes:
210 359 - name: "default"
211 360 label: "默认索引"
212 361 fields:
213   - - "title"
214   - - "brief"
215   - - "description"
216   - - "seo_title"
217   - - "seo_description"
218   - - "seo_keywords"
219   - - "vendor"
  362 + - "title_zh"
  363 + - "brief_zh"
  364 + - "description_zh"
  365 + - "vendor_zh"
220 366 - "tags"
221   - - "category"
  367 + - "category_path_zh"
  368 + - "category_name_zh"
222 369 analyzer: "chinese_ecommerce"
223 370 boost: 1.0
224 371  
225 372 - name: "title"
226 373 label: "标题索引"
227 374 fields:
228   - - "title"
229   - - "seo_title"
  375 + - "title_zh"
230 376 analyzer: "chinese_ecommerce"
231 377 boost: 2.0
232 378  
233 379 - name: "vendor"
234 380 label: "品牌索引"
235 381 fields:
236   - - "vendor"
  382 + - "vendor_zh"
237 383 analyzer: "chinese_ecommerce"
238 384 boost: 1.5
239 385  
240 386 - name: "category"
241 387 label: "类目索引"
242 388 fields:
243   - - "category"
  389 + - "category_path_zh"
  390 + - "category_name_zh"
244 391 analyzer: "chinese_ecommerce"
245 392 boost: 1.5
246 393  
... ... @@ -248,7 +395,6 @@ indexes:
248 395 label: "标签索引"
249 396 fields:
250 397 - "tags"
251   - - "seo_keywords"
252 398 analyzer: "chinese_ecommerce"
253 399 boost: 1.0
254 400  
... ...
config/config_loader.py
... ... @@ -313,7 +313,8 @@ class ConfigLoader:
313 313 nested=field_data.get("nested", False),
314 314 nested_properties=field_data.get("nested_properties"),
315 315 keyword_subfield=field_data.get("keyword_subfield", is_hktext),
316   - keyword_ignore_above=field_data.get("keyword_ignore_above", 256)
  316 + keyword_ignore_above=field_data.get("keyword_ignore_above", 256),
  317 + keyword_normalizer=field_data.get("keyword_normalizer")
317 318 )
318 319  
319 320 def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig:
... ...
config/field_types.py
... ... @@ -75,6 +75,7 @@ class FieldConfig:
75 75 # Hybrid Keyword Text (HKText) support
76 76 keyword_subfield: bool = False
77 77 keyword_ignore_above: int = 256
  78 + keyword_normalizer: Optional[str] = None # For keyword subfield normalizer (e.g., "lowercase")
78 79  
79 80  
80 81 def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
... ... @@ -100,18 +101,28 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
100 101 if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE:
101 102 mapping["analyzer"] = "index_ansj"
102 103 mapping["search_analyzer"] = "query_ansj"
  104 + elif field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY:
  105 + # analyzer itself was configured with the query-side alias (hanlp_standard / query_ansj); still index with index_ansj and search with query_ansj
  106 + mapping["analyzer"] = "index_ansj"
  107 + mapping["search_analyzer"] = "query_ansj"
103 108 else:
104 109 mapping["analyzer"] = field_config.analyzer.value
105 110  
106 111 if field_config.search_analyzer:
107   - mapping["search_analyzer"] = field_config.search_analyzer.value
  112 + if field_config.search_analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY:
  113 + mapping["search_analyzer"] = "query_ansj"
  114 + else:
  115 + mapping["search_analyzer"] = field_config.search_analyzer.value
108 116  
109 117 if field_config.keyword_subfield:
110 118 mapping.setdefault("fields", {})
111   - mapping["fields"]["keyword"] = {
  119 + keyword_field = {
112 120 "type": "keyword",
113 121 "ignore_above": field_config.keyword_ignore_above
114 122 }
  123 + if field_config.keyword_normalizer:
  124 + keyword_field["normalizer"] = field_config.keyword_normalizer
  125 + mapping["fields"]["keyword"] = keyword_field
115 126  
116 127 elif field_config.field_type == FieldType.KEYWORD:
117 128 mapping = {
... ... @@ -140,7 +151,7 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]:
140 151 "similarity": field_config.embedding_similarity
141 152 },
142 153 "url": {
143   - "type": "keyword"
  154 + "type": "text"
144 155 }
145 156 }
146 157 }
... ... @@ -239,6 +250,22 @@ def get_default_analyzers() -> Dict[str, Any]:
239 250 "type": "custom",
240 251 "tokenizer": "standard",
241 252 "filter": ["lowercase", "asciifolding"]
  253 + },
  254 + "hanlp_index": {
  255 + "type": "custom",
  256 + "tokenizer": "standard",
  257 + "filter": ["lowercase", "asciifolding"]
  258 + },
  259 + "hanlp_standard": {
  260 + "type": "custom",
  261 + "tokenizer": "standard",
  262 + "filter": ["lowercase", "asciifolding"]
  263 + }
  264 + },
  265 + "normalizer": {
  266 + "lowercase": {
  267 + "type": "custom",
  268 + "filter": ["lowercase"]
242 269 }
243 270 }
244 271 }
... ... @@ -300,6 +327,9 @@ ANALYZER_MAP = {
300 327 "chinese": AnalyzerType.CHINESE_ECOMMERCE,
301 328 "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE,
302 329 "index_ansj": AnalyzerType.CHINESE_ECOMMERCE,
  330 + "hanlp_index": AnalyzerType.CHINESE_ECOMMERCE, # Alias for index_ansj
  331 + "hanlp_standard": AnalyzerType.CHINESE_ECOMMERCE_QUERY, # Alias for query_ansj
  332 + "query_ansj": AnalyzerType.CHINESE_ECOMMERCE_QUERY,
303 333 "english": AnalyzerType.ENGLISH,
304 334 "arabic": AnalyzerType.ARABIC,
305 335 "spanish": AnalyzerType.SPANISH,
... ...
docs/索引字段说明v2-mapping结构.md 0 → 100644
... ... @@ -0,0 +1,231 @@
  1 +{
  2 + "mappings": {
  3 + "properties": {
  4 + "tenant_id": {
  5 + "type": "keyword"
  6 + },
  7 + "spu_id": {
  8 + "type": "keyword"
  9 + },
  10 +
  11 + "create_time": {
  12 + "type": "date"
  13 + },
  14 + "update_time": {
  15 + "type": "date"
  16 + },
  17 +
  18 + // 文本相关性相关字段
  19 + "title_zh": {
  20 + "type": "text",
  21 + "analyzer": "hanlp_index",
  22 + "search_analyzer": "hanlp_standard"
  23 + },
  24 + "brief_zh": {
  25 + "type": "text",
  26 + "analyzer": "hanlp_index",
  27 + "search_analyzer": "hanlp_standard"
  28 + },
  29 + "description_zh": {
  30 + "type": "text",
  31 + "analyzer": "hanlp_index",
  32 + "search_analyzer": "hanlp_standard"
  33 + },
  34 + "vendor_zh": {
  35 + "type": "text",
  36 + "analyzer": "hanlp_index",
  37 + "search_analyzer": "hanlp_standard",
  38 + "fields": {
  39 + "keyword": {
  40 + "type": "keyword",
  41 + "normalizer": "lowercase"
  42 + }
  43 + }
  44 + },
  45 +
  46 + "title_en": {
  47 + "type": "text",
  48 + "analyzer": "english",
  49 + "search_analyzer": "english"
  50 + },
  51 + "brief_en": {
  52 + "type": "text",
  53 + "analyzer": "english",
  54 + "search_analyzer": "english",
  55 +
  56 + },
  57 + "description_en": {
  58 + "type": "text",
  59 + "analyzer": "english",
  60 + "search_analyzer": "english",
  61 + },
  62 + "vendor_en": {
  63 + "type": "text",
  64 + "analyzer": "english",
  65 + "search_analyzer": "english",
  66 + "fields": {
  67 + "keyword": {
  68 + "type": "keyword",
  69 + "normalizer": "lowercase"
  70 + }
  71 + }
  72 + },
  73 +
  74 + "tags": {
  75 + "type": "keyword"
  76 + },
  77 +
  78 + "image_url": {
  79 + "type": "keyword",
  80 + "index": false
  81 + },
  82 +
  83 + // 语义向量
  84 + "title_embedding": {
  85 + "type": "dense_vector",
  86 + "dims": 1024,
  87 + "index": true,
  88 + "similarity": "dot_product"
  89 + },
  90 + "image_embedding": {
  91 + "type": "nested",
  92 + "properties": {
  93 + "vector": {
  94 + "type": "dense_vector",
  95 + "dims": 1024,
  96 + "index": true,
  97 + "similarity": "dot_product"
  98 + },
  99 + "url": {
  100 + "type": "text"
  101 + }
  102 + }
  103 + },
  104 +
  105 + // 分类相关
  106 + "category_path_zh": { // 提供模糊查询功能,辅助相关性计算
  107 + "type": "text",
  108 + "analyzer": "hanlp_index",
  109 + "search_analyzer": "hanlp_standard"
  110 + },
  111 + "category_path_en": { // 提供模糊查询功能,辅助相关性计算
  112 + "type": "text",
  113 + "analyzer": "english",
  114 + "search_analyzer": "english"
  115 + },
  116 + "category_name_zh": { // 提供模糊查询功能,辅助相关性计算
  117 + "type": "text",
  118 + "analyzer": "hanlp_index",
  119 + "search_analyzer": "hanlp_standard"
  120 + },
  121 + "category_name_en": { // 提供模糊查询功能,辅助相关性计算
  122 + "type": "text",
  123 + "analyzer": "english",
  124 + "search_analyzer": "english"
  125 + },
  126 +
  127 + "category_id": {
  128 + "type": "keyword"
  129 + },
  130 + "category_name": {
  131 + "type": "keyword"
  132 + },
  133 + "category_level": {
  134 + "type": "integer"
  135 + },
  136 + "category1_name": { // 不同层级下 可能有同名的情况,因此提供一二三级分开的查询方式
  137 + "type": "keyword"
  138 + },
  139 + "category2_name": {
  140 + "type": "keyword"
  141 + },
  142 + "category3_name": {
  143 + "type": "keyword"
  144 + },
  145 +
  146 + // sku款式、子sku属性
  147 + "specifications": {
  148 + "type": "nested",
  149 + "properties": {
  150 + "sku_id": { "type": "keyword" },
  151 + "name": { "type": "keyword" }, // "颜色", "容量"
  152 + "value": { "type": "keyword" } // "白色", "256GB"
  153 + }
  154 + },
  155 +
  156 + "option1_name": {
  157 + "type": "keyword"
  158 + },
  159 + "option2_name": {
  160 + "type": "keyword"
  161 + },
  162 + "option3_name": {
  163 + "type": "keyword"
  164 + },
  165 +
  166 + "min_price": {
  167 + "type": "float"
  168 + },
  169 + "max_price": {
  170 + "type": "float"
  171 + },
  172 + "compare_at_price": {
  173 + "type": "float"
  174 + },
  175 + "sku_prices": {
  176 + "type": "float"
  177 + },
  178 + "sku_weights": {
  179 + "type": "long"
  180 + },
  181 + "sku_weight_units": {
  182 + "type": "keyword"
  183 + },
  184 + "total_inventory": {
  185 + "type": "long"
  186 + },
  187 +
  188 + "skus": {
  189 + "type": "nested",
  190 + "properties": {
  191 + "sku_id": {
  192 + "type": "keyword"
  193 + },
  194 + "price": {
  195 + "type": "float"
  196 + },
  197 + "compare_at_price": {
  198 + "type": "float"
  199 + },
  200 + "sku_code": {
  201 + "type": "keyword"
  202 + },
  203 + "stock": {
  204 + "type": "long"
  205 + },
  206 + "weight": {
  207 + "type": "float"
  208 + },
  209 + "weight_unit": {
  210 + "type": "keyword"
  211 + },
  212 + "option1_value": {
  213 + "type": "keyword"
  214 + },
  215 + "option2_value": {
  216 + "type": "keyword"
  217 + },
  218 + "option3_value": {
  219 + "type": "keyword"
  220 + },
  221 + "image_src": {
  222 + "type": "keyword",
  223 + "index": false
  224 + }
  225 + }
  226 + }
  227 + }
  228 + }
  229 +}
  230 +
  231 +
... ...
docs/索引字段说明v2.md
... ... @@ -4,247 +4,34 @@ SPU-SKU索引方案选型
4 4 除了title, brief description seo相关 cate tags vendor所有影响相关性的字段都在spu。 sku只有款式、价格、重量、库存等相关属性。所以,可以以spu为单位建立索引。
5 5 sku中需要参与搜索的属性(比如价格、库存)展开到spu。
6 6 sku的所有需要返回的字段作为nested字段,仅用于返回。
7   -灌入数据准备
8   -def build_product_document(product, skus):
9   - # 提取价格列表(转换为float,保留两位小数)
10   - price_list = [float(sku.price) for sku in skus if sku.price is not None]
11   -
12   - # 提取重量信息(重量转为int,单位统一为克;重量+单位拼接为字符串)
13   - weight_list = [int(float(sku.weight) * 1000) for sku in skus if sku.weight is not None] # 转为整数克
14   - weight_with_unit_list = [f"{sku.weight}{sku.weight_unit}" for sku in skus if sku.weight and sku.weight_unit]
15   -
16   - # 计算库存总和
17   - total_stock = sum([sku.inventory_quantity for sku in skus if sku.inventory_quantity is not None])
18   -
19   - # 计算价格区间
20   - min_price = min(price_list) if price_list else 0.0
21   - max_price = max(price_list) if price_list else 0.0
22   -
  7 +# 写入 spu 级别索引
  8 +def build_product_document(product, variants):
23 9 return {
24 10 "spu_id": str(product.id),
25 11 "title": product.title,
26 12  
27   - # SPU级别的选项名称定义(如:颜色、尺码、材质)
28   - "option1_name": getattr(product, 'option1', None),
29   - "option2_name": getattr(product, 'option2', None),
30   - "option3_name": getattr(product, 'option3', None),
31   -
32   - # SKU搜索字段(展开)
  13 + # Variant搜索字段(展开)
33 14 # 价格(int)、重量(int)、重量单位拼接重量(keyword),都以list形式灌入
34   - "sku_prices": price_list, # 所有SKU价格列表,用于范围聚合
35   - "sku_weights": weight_list, # 重量数值列表(转换为整数克)
36   - "sku_weight_units": weight_with_unit_list, # 重量+单位字符串列表
37   -
38   - # 库存总和 将SKU的库存加起来作为一个值灌入
39   - "total_inventory": total_stock, # SKU库存总和
40   -
41   - # 售价,灌入3个字段:SKU价格列表、最高价、最低价
42   - "min_price": min_price, # 最低售价
43   - "max_price": max_price, # 最高售价
44   - "price_range": { # 价格区间对象,便于范围查询
45   - "gte": min_price,
46   - "lte": max_price
47   - },
48   -
49   - # SKU详细信息(nested结构,仅用于返回)
50   - "skus": [
51   - {
52   - "sku_id": str(sku.id),
53   - "price": float(sku.price) if sku.price else 0.0,
54   - "compare_at_price": float(sku.compare_at_price) if sku.compare_at_price else None,
55   - "sku_code": sku.sku,
56   - "stock": sku.inventory_quantity,
57   - "weight": float(sku.weight) if sku.weight else None,
58   - "weight_unit": sku.weight_unit,
59   -
60   - # SKU级别的选项值(对应SPU的选项名称)
61   - "option1_value": sku.option1,
62   - "option2_value": sku.option2,
63   - "option3_value": sku.option3,
  15 + # TODO 按要求补充
  16 +
  17 + # 库存总和 将sku的库存加起来作为一个值灌入
  18 + # 售价,灌入3个字段,一个 sku价格 以list形式灌入,一个最高价一个最低价
  19 + # TODO 按要求补充
64 20  
65   - "image_src": sku.image_src
  21 + # Variant详细信息(用于返回)
  22 + "variants": [
  23 + {
  24 + "sku_id": str(v.id),
  25 + "price": float(v.price),
  26 + "options": v.options
66 27 }
67   - for sku in skus
  28 + for v in variants
68 29 ],
69   -
70   - # 其他SPU级别字段(根据索引文档补充)
71   - "tenant_id": str(product.tenant_id),
72   - "brief": product.brief,
73   - "description": product.description,
74   - "vendor": product.vendor,
75   - "category": product.category,
76   - "tags": product.tags.split(',') if product.tags else [],
77   - "seo_title": product.seo_title,
78   - "seo_description": product.seo_description,
79   - "seo_keywords": product.seo_keywords.split(',') if product.seo_keywords else [],
80   - "image_url": product.image_src,
81   - "create_time": product.create_time.isoformat() if product.create_time else None,
82   - "update_time": product.update_time.isoformat() if product.update_time else None
83   - }
84   - 索引定义
85   -{
86   - "mappings": {
87   - "properties": {
88   - "tenant_id": {
89   - "type": "keyword"
90   - },
91   - "spu_id": {
92   - "type": "keyword"
93   - },
94   - // 文本相关性相关字段
95   - "title_zh": {
96   - "type": "text",
97   - "analyzer": "hanlp_index",
98   - "search_analyzer": "hanlp_standard"
99   - },
100   - "brief_zh": {
101   - "type": "text",
102   - "analyzer": "hanlp_index",
103   - "search_analyzer": "hanlp_standard"
104   - },
105   - "description_zh": {
106   - "type": "text",
107   - "analyzer": "hanlp_index",
108   - "search_analyzer": "hanlp_standard"
109   - },
110   - "vendor_zh": {
111   - "type": "text",
112   - "analyzer": "hanlp_index",
113   - "search_analyzer": "hanlp_standard",
114   - "fields": {
115   - "keyword": {
116   - "type": "keyword",
117   - "normalizer": "lowercase"
118   - }
119   - }
120   - },
121   -
122   - "title_en": {
123   - "type": "text",
124   - "analyzer": "english",
125   - "search_analyzer": "english",
126   - },
127   - "brief_en": {
128   - "type": "text",
129   - "analyzer": "english",
130   - "search_analyzer": "english",
131 30  
132   - },
133   - "description_en": {
134   - "type": "text",
135   - "analyzer": "english",
136   - "search_analyzer": "english",
137   - },
138   - "vendor_en": {
139   - "type": "text",
140   - "analyzer": "english",
141   - "search_analyzer": "english",
142   - "fields": {
143   - "keyword": {
144   - "type": "keyword",
145   - "normalizer": "lowercase"
146   - }
147   - }
148   - },
149   -
150   - "tags": {
151   - "type": "keyword",
152   - },
153   -
154   -
155   - "min_price": {
156   - "type": "float"
157   - },
158   - "max_price": {
159   - "type": "float"
160   - },
161   - "compare_at_price": {
162   - "type": "float"
163   - },
164   - "sku_prices": {
165   - "type": "float"
166   - },
167   - "sku_weights": {
168   - "type": "long"
169   - },
170   - "sku_weight_units": {
171   - "type": "keyword"
172   - },
173   - "total_inventory": {
174   - "type": "long"
175   - },
176   -
177   - "image_url": {
178   - "type": "keyword",
179   - "index": false
180   - },
181   -
182   - "title_embedding": {
183   - "type": "dense_vector",
184   - "dims": 1024,
185   - "index": true,
186   - "similarity": "dot_product"
187   - },
188   -
189   - "create_time": {
190   - "type": "date"
191   - },
192   - "update_time": {
193   - "type": "date"
194   - },
195   -
196   - "option1_name": {
197   - "type": "keyword"
198   - },
199   - "option2_name": {
200   - "type": "keyword"
201   - },
202   - "option3_name": {
203   - "type": "keyword"
204   - },
205   -
206   - "skus": {
207   - "type": "nested",
208   - "properties": {
209   - "sku_id": {
210   - "type": "keyword"
211   - },
212   - "price": {
213   - "type": "float"
214   - },
215   - "compare_at_price": {
216   - "type": "float"
217   - },
218   - "sku_code": {
219   - "type": "keyword"
220   - },
221   - "stock": {
222   - "type": "long"
223   - },
224   - "weight": {
225   - "type": "float"
226   - },
227   - "weight_unit": {
228   - "type": "keyword"
229   - },
230   - "option1_value": {
231   - "type": "keyword"
232   - },
233   - "option2_value": {
234   - "type": "keyword"
235   - },
236   - "option3_value": {
237   - "type": "keyword"
238   - },
239   - "image_src": {
240   - "type": "keyword",
241   - "index": false
242   - }
243   - }
244   - }
  31 +
  32 + "min_price": min(v.price for v in variants),
  33 + "max_price": max(v.price for v in variants)
245 34 }
246   - }
247   -}
248 35 1.2 查询方案
249 36 对数组字段使用 dis_max,只取最高分,避免累加。
250 37 其他重点字段
... ... @@ -333,26 +120,56 @@ S red
333 120 1. API 在 SPU 的维度直接返回3个属性定义,存储在 shoplazza_product_option 中:
334 121 1. API在 SKU的维度直接返回3个属性值,存储在 shoplazza_product_sku 表的 option 相关的字段中:
335 122 5.3 ES索引
336   -5.3.1
337   - 3nested,支持超过3个属性(动态)。只用作返回,不能查询。节省索引空间
  123 +
338 124 "specifications": {
339 125 "type": "nested",
340 126 "properties": {
341   - "name": { "type": "keyword","index": false },
342   - "value": { "type": "keyword","index": false }
  127 + "name": { "type": "keyword" }, // "颜色", "容量"
  128 + "value": { "type": "keyword" } // "白色", "256GB"
343 129 }
344 130 },
345 131  
346   -6. SEO相关字段
347   -6.1 数据源
348   -SEO标题 SEO描述 SEO URL Handle SEO URL 重定向 SEO关键词
349   -最多5000字符 最多5000字符 "最多支持输入255字符
350   - (SEO URL handle只对SEO URL的「URL参数」部分进行更改,即“products/”后的内容,如:products/「URL参数」
351   - )" "创建URL重定向,访问修改前链接可跳转到修改后的新链接页面
352   -「Y」:TRUE
353   -「N」:FALSE " 多个关键词请用「英文逗号」隔开
354   -
355   -6.2 Mysql
356   -6.3 ES索引
357   -6.3.1 输入数据
358   -6.3.2 索引方法
359 132 \ No newline at end of file
  133 + 另外还需要包含一个单独的字段,main_option (即店铺主题装修里面配置的 颜色切换 - 变体名称,也就是列表页商品的子sku显示维度)
  134 + "main_option": { "type": "keyword" }
  135 +查询指定款式
  136 +{
  137 + "query": {
  138 + "nested": {
  139 + "path": "specifications",
  140 + "query": {
  141 + "bool": {
  142 + "must": [
  143 + { "term": { "specifications.name": "颜色" } },
  144 + { "term": { "specifications.value": "绿色" } }
  145 + ]
  146 + }
  147 + }
  148 + }
  149 + }
  150 +}
  151 +按 name 做分面搜索(聚合)
  152 +
  153 +{
  154 + "aggs": {
  155 + "specs": {
  156 + "nested": { "path": "specifications" },
  157 + "aggs": {
  158 + "by_name": {
  159 + "terms": {
  160 + "field": "specifications.name",
  161 + "size": 20
  162 + },
  163 + "aggs": {
  164 + "value_counts": {
  165 + "terms": {
  166 + "field": "specifications.value",
  167 + "size": 10
  168 + }
  169 + }
  170 + }
  171 + }
  172 + }
  173 + }
  174 + }
  175 +}
  176 +
360 177 \ No newline at end of file
... ...
indexer/spu_transformer.py
... ... @@ -38,12 +38,12 @@ class SPUTransformer:
38 38 """
39 39 query = text("""
40 40 SELECT
41   - id, shop_id, shoplazza_id, handle, title, brief, description,
42   - spu, vendor, vendor_url, seo_title, seo_description, seo_keywords,
  41 + id, shop_id, shoplazza_id, title, brief, description,
  42 + spu, vendor, vendor_url,
43 43 image_src, image_width, image_height, image_path, image_alt,
44   - tags, note, category,
45   - shoplazza_created_at, shoplazza_updated_at, tenant_id,
46   - creator, create_time, updater, update_time, deleted
  44 + tags, note, category, category_id, category_google_id,
  45 + category_level, category_path,
  46 + tenant_id, creator, create_time, updater, update_time, deleted
47 47 FROM shoplazza_product_spu
48 48 WHERE tenant_id = :tenant_id AND deleted = 0
49 49 """)
... ... @@ -114,6 +114,30 @@ class SPUTransformer:
114 114  
115 115 return df
116 116  
  117 + def load_option_data(self) -> pd.DataFrame:
  118 + """
  119 + Load option data from MySQL.
  120 +
  121 + Returns:
  122 + DataFrame with option data (name, position for each SPU)
  123 + """
  124 + query = text("""
  125 + SELECT
  126 + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
  127 + position, name, values, tenant_id,
  128 + creator, create_time, updater, update_time, deleted
  129 + FROM shoplazza_product_option
  130 + WHERE tenant_id = :tenant_id AND deleted = 0
  131 + ORDER BY spu_id, position
  132 + """)
  133 +
  134 + with self.db_engine.connect() as conn:
  135 + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})
  136 +
  137 + print(f"DEBUG: Loaded {len(df)} option records for tenant_id={self.tenant_id}")
  138 +
  139 + return df
  140 +
117 141 def transform_batch(self) -> List[Dict[str, Any]]:
118 142 """
119 143 Transform SPU and SKU data into ES documents.
... ... @@ -124,12 +148,16 @@ class SPUTransformer:
124 148 # Load data
125 149 spu_df = self.load_spu_data()
126 150 sku_df = self.load_sku_data()
  151 + option_df = self.load_option_data()
127 152  
128 153 if spu_df.empty:
129 154 return []
130 155  
131 156 # Group SKUs by SPU
132 157 sku_groups = sku_df.groupby('spu_id')
  158 +
  159 + # Group options by SPU
  160 + option_groups = option_df.groupby('spu_id') if not option_df.empty else None
133 161  
134 162 documents = []
135 163 for _, spu_row in spu_df.iterrows():
... ... @@ -138,8 +166,11 @@ class SPUTransformer:
138 166 # Get SKUs for this SPU
139 167 skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame()
140 168  
  169 + # Get options for this SPU
  170 + options = option_groups.get_group(spu_id) if option_groups and spu_id in option_groups.groups else pd.DataFrame()
  171 +
141 172 # Transform to ES document
142   - doc = self._transform_spu_to_doc(spu_row, skus)
  173 + doc = self._transform_spu_to_doc(spu_row, skus, options)
143 174 if doc:
144 175 documents.append(doc)
145 176  
... ... @@ -148,7 +179,8 @@ class SPUTransformer:
148 179 def _transform_spu_to_doc(
149 180 self,
150 181 spu_row: pd.Series,
151   - skus: pd.DataFrame
  182 + skus: pd.DataFrame,
  183 + options: pd.DataFrame
152 184 ) -> Optional[Dict[str, Any]]:
153 185 """
154 186 Transform a single SPU row and its SKUs into an ES document.
... ... @@ -156,6 +188,7 @@ class SPUTransformer:
156 188 Args:
157 189 spu_row: SPU row from database
158 190 skus: DataFrame with SKUs for this SPU
  191 + options: DataFrame with options for this SPU
159 192  
160 193 Returns:
161 194 ES document or None if transformation fails
... ... @@ -168,41 +201,66 @@ class SPUTransformer:
168 201 # SPU ID
169 202 doc['spu_id'] = str(spu_row['id'])
170 203  
171   - # Handle
172   - if pd.notna(spu_row.get('handle')):
173   - doc['handle'] = str(spu_row['handle'])
174   -
175   - # Title
  204 + # 文本相关性相关字段(中英文双语,暂时只填充中文)
176 205 if pd.notna(spu_row.get('title')):
177   - doc['title'] = str(spu_row['title'])
  206 + doc['title_zh'] = str(spu_row['title'])
  207 + doc['title_en'] = None # 暂时设为空
178 208  
179   - # Brief
180 209 if pd.notna(spu_row.get('brief')):
181   - doc['brief'] = str(spu_row['brief'])
  210 + doc['brief_zh'] = str(spu_row['brief'])
  211 + doc['brief_en'] = None
182 212  
183   - # Description
184 213 if pd.notna(spu_row.get('description')):
185   - doc['description'] = str(spu_row['description'])
  214 + doc['description_zh'] = str(spu_row['description'])
  215 + doc['description_en'] = None
186 216  
187   - # SEO fields
188   - if pd.notna(spu_row.get('seo_title')):
189   - doc['seo_title'] = str(spu_row['seo_title'])
190   - if pd.notna(spu_row.get('seo_description')):
191   - doc['seo_description'] = str(spu_row['seo_description'])
192   - if pd.notna(spu_row.get('seo_keywords')):
193   - doc['seo_keywords'] = str(spu_row['seo_keywords'])
194   -
195   - # Vendor
196 217 if pd.notna(spu_row.get('vendor')):
197   - doc['vendor'] = str(spu_row['vendor'])
  218 + doc['vendor_zh'] = str(spu_row['vendor'])
  219 + doc['vendor_en'] = None
198 220  
199 221 # Tags
200 222 if pd.notna(spu_row.get('tags')):
201   - doc['tags'] = str(spu_row['tags'])
  223 + # Tags是逗号分隔的字符串,需要转换为数组
  224 + tags_str = str(spu_row['tags'])
  225 + doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()]
  226 +
  227 + # Category相关字段
  228 + if pd.notna(spu_row.get('category_path')):
  229 + category_path = str(spu_row['category_path'])
  230 + doc['category_path_zh'] = category_path
  231 + doc['category_path_en'] = None # 暂时设为空
  232 +
  233 + # 解析category_path获取多层级分类名称
  234 + path_parts = category_path.split('/')
  235 + if len(path_parts) > 0:
  236 + doc['category1_name'] = path_parts[0].strip()
  237 + if len(path_parts) > 1:
  238 + doc['category2_name'] = path_parts[1].strip()
  239 + if len(path_parts) > 2:
  240 + doc['category3_name'] = path_parts[2].strip()
202 241  
203   - # Category
204 242 if pd.notna(spu_row.get('category')):
205   - doc['category'] = str(spu_row['category'])
  243 + category_name = str(spu_row['category'])
  244 + doc['category_name_zh'] = category_name
  245 + doc['category_name_en'] = None
  246 + doc['category_name'] = category_name
  247 +
  248 + if pd.notna(spu_row.get('category_id')):
  249 + doc['category_id'] = str(int(spu_row['category_id']))
  250 +
  251 + if pd.notna(spu_row.get('category_level')):
  252 + doc['category_level'] = int(spu_row['category_level'])
  253 +
  254 + # Option名称(从option表获取)
  255 + if not options.empty:
  256 + # 按position排序获取option名称
  257 + sorted_options = options.sort_values('position')
  258 + if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')):
  259 + doc['option1_name'] = str(sorted_options.iloc[0]['name'])
  260 + if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')):
  261 + doc['option2_name'] = str(sorted_options.iloc[1]['name'])
  262 + if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')):
  263 + doc['option3_name'] = str(sorted_options.iloc[2]['name'])
206 264  
207 265 # Image URL
208 266 if pd.notna(spu_row.get('image_src')):
... ... @@ -211,27 +269,85 @@ class SPUTransformer:
211 269 image_src = f"//{image_src}" if image_src.startswith('//') else image_src
212 270 doc['image_url'] = image_src
213 271  
214   - # Process SKUs
  272 + # Process SKUs and build specifications
215 273 skus_list = []
216 274 prices = []
217 275 compare_prices = []
  276 + sku_prices = []
  277 + sku_weights = []
  278 + sku_weight_units = []
  279 + total_inventory = 0
  280 + specifications = []
  281 +
  282 + # 构建option名称映射(position -> name)
  283 + option_name_map = {}
  284 + if not options.empty:
  285 + for _, opt_row in options.iterrows():
  286 + position = opt_row.get('position')
  287 + name = opt_row.get('name')
  288 + if pd.notna(position) and pd.notna(name):
  289 + option_name_map[int(position)] = str(name)
218 290  
219 291 for _, sku_row in skus.iterrows():
220   - sku_data = self._transform_sku_row(sku_row)
  292 + sku_data = self._transform_sku_row(sku_row, option_name_map)
221 293 if sku_data:
222 294 skus_list.append(sku_data)
  295 +
  296 + # 收集价格信息
223 297 if 'price' in sku_data and sku_data['price'] is not None:
224 298 try:
225   - prices.append(float(sku_data['price']))
  299 + price_val = float(sku_data['price'])
  300 + prices.append(price_val)
  301 + sku_prices.append(price_val)
226 302 except (ValueError, TypeError):
227 303 pass
  304 +
228 305 if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None:
229 306 try:
230 307 compare_prices.append(float(sku_data['compare_at_price']))
231 308 except (ValueError, TypeError):
232 309 pass
  310 +
  311 + # 收集重量信息
  312 + if 'weight' in sku_data and sku_data['weight'] is not None:
  313 + try:
  314 + sku_weights.append(int(float(sku_data['weight'])))
  315 + except (ValueError, TypeError):
  316 + pass
  317 +
  318 + if 'weight_unit' in sku_data and sku_data['weight_unit']:
  319 + sku_weight_units.append(str(sku_data['weight_unit']))
  320 +
  321 + # 收集库存信息
  322 + if 'stock' in sku_data and sku_data['stock'] is not None:
  323 + try:
  324 + total_inventory += int(sku_data['stock'])
  325 + except (ValueError, TypeError):
  326 + pass
  327 +
  328 + # 构建specifications(从SKU的option值和option表的name)
  329 + sku_id = str(sku_row['id'])
  330 + if pd.notna(sku_row.get('option1')) and 1 in option_name_map:
  331 + specifications.append({
  332 + 'sku_id': sku_id,
  333 + 'name': option_name_map[1],
  334 + 'value': str(sku_row['option1'])
  335 + })
  336 + if pd.notna(sku_row.get('option2')) and 2 in option_name_map:
  337 + specifications.append({
  338 + 'sku_id': sku_id,
  339 + 'name': option_name_map[2],
  340 + 'value': str(sku_row['option2'])
  341 + })
  342 + if pd.notna(sku_row.get('option3')) and 3 in option_name_map:
  343 + specifications.append({
  344 + 'sku_id': sku_id,
  345 + 'name': option_name_map[3],
  346 + 'value': str(sku_row['option3'])
  347 + })
233 348  
234 349 doc['skus'] = skus_list
  350 + doc['specifications'] = specifications
235 351  
236 352 # Calculate price ranges
237 353 if prices:
... ... @@ -246,6 +362,19 @@ class SPUTransformer:
246 362 else:
247 363 doc['compare_at_price'] = None
248 364  
  365 + # SKU扁平化字段
  366 + doc['sku_prices'] = sku_prices
  367 + doc['sku_weights'] = sku_weights
  368 + doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重
  369 + doc['total_inventory'] = total_inventory
  370 +
  371 + # Image URL
  372 + if pd.notna(spu_row.get('image_src')):
  373 + image_src = str(spu_row['image_src'])
  374 + if not image_src.startswith('http'):
  375 + image_src = f"//{image_src}" if image_src.startswith('//') else image_src
  376 + doc['image_url'] = image_src
  377 +
249 378 # Time fields - convert datetime to ISO format string for ES DATE type
250 379 if pd.notna(spu_row.get('create_time')):
251 380 create_time = spu_row['create_time']
... ... @@ -260,29 +389,16 @@ class SPUTransformer:
260 389 doc['update_time'] = update_time.isoformat()
261 390 else:
262 391 doc['update_time'] = str(update_time)
263   -
264   - if pd.notna(spu_row.get('shoplazza_created_at')):
265   - shoplazza_created_at = spu_row['shoplazza_created_at']
266   - if hasattr(shoplazza_created_at, 'isoformat'):
267   - doc['shoplazza_created_at'] = shoplazza_created_at.isoformat()
268   - else:
269   - doc['shoplazza_created_at'] = str(shoplazza_created_at)
270   -
271   - if pd.notna(spu_row.get('shoplazza_updated_at')):
272   - shoplazza_updated_at = spu_row['shoplazza_updated_at']
273   - if hasattr(shoplazza_updated_at, 'isoformat'):
274   - doc['shoplazza_updated_at'] = shoplazza_updated_at.isoformat()
275   - else:
276   - doc['shoplazza_updated_at'] = str(shoplazza_updated_at)
277 392  
278 393 return doc
279 394  
280   - def _transform_sku_row(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]:
  395 + def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:
281 396 """
282 397 Transform a SKU row into a SKU object.
283 398  
284 399 Args:
285 400 sku_row: SKU row from database
  401 + option_name_map: Mapping from position to option name
286 402  
287 403 Returns:
288 404 SKU dictionary or None
... ... @@ -292,10 +408,6 @@ class SPUTransformer:
292 408 # SKU ID
293 409 sku_data['sku_id'] = str(sku_row['id'])
294 410  
295   - # Title
296   - if pd.notna(sku_row.get('title')):
297   - sku_data['title'] = str(sku_row['title'])
298   -
299 411 # Price
300 412 if pd.notna(sku_row.get('price')):
301 413 try:
... ... @@ -314,9 +426,9 @@ class SPUTransformer:
314 426 else:
315 427 sku_data['compare_at_price'] = None
316 428  
317   - # SKU
  429 + # SKU Code
318 430 if pd.notna(sku_row.get('sku')):
319   - sku_data['sku'] = str(sku_row['sku'])
  431 + sku_data['sku_code'] = str(sku_row['sku'])
320 432  
321 433 # Stock
322 434 if pd.notna(sku_row.get('inventory_quantity')):
... ... @@ -327,17 +439,30 @@ class SPUTransformer:
327 439 else:
328 440 sku_data['stock'] = 0
329 441  
330   - # Options (from option1, option2, option3)
331   - options = {}
  442 + # Weight
  443 + if pd.notna(sku_row.get('weight')):
  444 + try:
  445 + sku_data['weight'] = float(sku_row['weight'])
  446 + except (ValueError, TypeError):
  447 + sku_data['weight'] = None
  448 + else:
  449 + sku_data['weight'] = None
  450 +
  451 + # Weight unit
  452 + if pd.notna(sku_row.get('weight_unit')):
  453 + sku_data['weight_unit'] = str(sku_row['weight_unit'])
  454 +
  455 + # Option values
332 456 if pd.notna(sku_row.get('option1')):
333   - options['option1'] = str(sku_row['option1'])
  457 + sku_data['option1_value'] = str(sku_row['option1'])
334 458 if pd.notna(sku_row.get('option2')):
335   - options['option2'] = str(sku_row['option2'])
  459 + sku_data['option2_value'] = str(sku_row['option2'])
336 460 if pd.notna(sku_row.get('option3')):
337   - options['option3'] = str(sku_row['option3'])
338   -
339   - if options:
340   - sku_data['options'] = options
  461 + sku_data['option3_value'] = str(sku_row['option3'])
  462 +
  463 + # Image src
  464 + if pd.notna(sku_row.get('image_src')):
  465 + sku_data['image_src'] = str(sku_row['image_src'])
341 466  
342 467 return sku_data
343 468  
... ...