Commit 5dcddc06eabb865f4f59af2f73e384ea34148c15
1 parent
39e63ad1
索引重构
主要是对 分类 属性 子sku 等重要字段的处理。 参考文档《 @docs/索引字段说明v2-mapping结构.md 》《 @docs/索引字段说明v2.md 》 feat: 1. 更新 field_types.py 添加 hanlp_index/hanlp_standard 分析器映射(映射到 CHINESE_ECOMMERCE/CHINESE_ECOMMERCE_QUERY) 支持 keyword_normalizer 配置(用于 vendor.keyword 的 lowercase normalizer) 更新 get_default_analyzers() 添加 hanlp 分析器和 lowercase normalizer 修复 image_embedding 的 url 字段类型为 text 2. 更新 config.yaml(32-207行) 移除无用字段:handle, seo_title, seo_description, seo_keywords, shoplazza_created_at, shoplazza_updated_at 添加中英文字段:title_zh, title_en, brief_zh, brief_en, description_zh, description_en, vendor_zh, vendor_en 添加 category 多层级字段:category_path_zh, category_path_en, category_name_zh, category_name_en, category_id, category_name, category_level, category1_name, category2_name, category3_name 添加 specifications 嵌套字段 添加 option 名称字段:option1_name, option2_name, option3_name 添加 SKU 扁平化字段:sku_prices, sku_weights, sku_weight_units, total_inventory 更新 skus 嵌套结构以匹配目标 mapping 添加 image_embedding 嵌套字段 更新 indexes 配置以使用新字段名 3. 更新 config_loader.py 添加 keyword_normalizer 字段支持 4. 重构 spu_transformer.py 添加 load_option_data() 方法从 option 表加载数据 更新 transform_batch() 加载 option 数据 重构 _transform_spu_to_doc(): 实现中英文字段映射(暂时只填充中文) 实现 category 多层级字段映射和 category_path 解析 实现 specifications 构建(从 option 表获取 name,从 SKU 获取 value) 实现 option 名称字段映射 实现 SKU 扁平化字段计算 更新 skus 嵌套结构 重构 _transform_sku_row() 以匹配新的 SKU 结构 移除 SEO 和 handle 字段的处理
Showing
6 changed files
with
735 additions
and
385 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -29,86 +29,88 @@ fields: |
| 29 | 29 | store: true |
| 30 | 30 | return_in_source: true |
| 31 | 31 | |
| 32 | - - name: "handle" | |
| 33 | - type: "KEYWORD" | |
| 34 | - index: true | |
| 35 | - store: true | |
| 36 | - return_in_source: true | |
| 37 | - | |
| 38 | - # 文本搜索字段 | |
| 39 | - - name: "title" | |
| 32 | + # 文本相关性相关字段(中英文双语) | |
| 33 | + - name: "title_zh" | |
| 40 | 34 | type: "TEXT" |
| 41 | - analyzer: "chinese_ecommerce" | |
| 35 | + analyzer: "hanlp_index" | |
| 36 | + search_analyzer: "hanlp_standard" | |
| 42 | 37 | boost: 3.0 |
| 43 | 38 | index: true |
| 44 | 39 | store: true |
| 45 | 40 | return_in_source: true |
| 46 | 41 | |
| 47 | - - name: "brief" | |
| 42 | + - name: "brief_zh" | |
| 48 | 43 | type: "TEXT" |
| 49 | - analyzer: "chinese_ecommerce" | |
| 44 | + analyzer: "hanlp_index" | |
| 45 | + search_analyzer: "hanlp_standard" | |
| 50 | 46 | boost: 1.5 |
| 51 | 47 | index: true |
| 52 | 48 | store: true |
| 53 | 49 | return_in_source: true |
| 54 | 50 | |
| 55 | - - name: "description" | |
| 51 | + - name: "description_zh" | |
| 56 | 52 | type: "TEXT" |
| 57 | - analyzer: "chinese_ecommerce" | |
| 53 | + analyzer: "hanlp_index" | |
| 54 | + search_analyzer: "hanlp_standard" | |
| 58 | 55 | boost: 1.0 |
| 59 | 56 | index: true |
| 60 | 57 | store: true |
| 61 | 58 | return_in_source: true |
| 62 | 59 | |
| 63 | - # SEO字段(提升相关性) | |
| 64 | - - name: "seo_title" | |
| 60 | + - name: "vendor_zh" | |
| 65 | 61 | type: "TEXT" |
| 66 | - analyzer: "chinese_ecommerce" | |
| 67 | - boost: 2.0 | |
| 68 | - index: true | |
| 69 | - store: true | |
| 70 | - return_in_source: false # SEO字段通常不需要在结果中返回 | |
| 71 | - | |
| 72 | - - name: "seo_description" | |
| 73 | - type: "TEXT" | |
| 74 | - analyzer: "chinese_ecommerce" | |
| 62 | + analyzer: "hanlp_index" | |
| 63 | + search_analyzer: "hanlp_standard" | |
| 75 | 64 | boost: 1.5 |
| 76 | 65 | index: true |
| 77 | 66 | store: true |
| 78 | - return_in_source: false | |
| 67 | + return_in_source: true | |
| 68 | + keyword_subfield: true | |
| 69 | + keyword_normalizer: "lowercase" | |
| 79 | 70 | |
| 80 | - - name: "seo_keywords" | |
| 71 | + - name: "title_en" | |
| 81 | 72 | type: "TEXT" |
| 82 | - analyzer: "chinese_ecommerce" | |
| 83 | - boost: 2.0 | |
| 73 | + analyzer: "english" | |
| 74 | + search_analyzer: "english" | |
| 75 | + boost: 3.0 | |
| 84 | 76 | index: true |
| 85 | 77 | store: true |
| 86 | - return_in_source: false | |
| 78 | + return_in_source: true | |
| 87 | 79 | |
| 88 | - # 分类和标签字段(TEXT + KEYWORD双重索引) | |
| 89 | - - name: "vendor" | |
| 90 | - type: "HKText" | |
| 91 | - analyzer: "chinese_ecommerce" | |
| 80 | + - name: "brief_en" | |
| 81 | + type: "TEXT" | |
| 82 | + analyzer: "english" | |
| 83 | + search_analyzer: "english" | |
| 92 | 84 | boost: 1.5 |
| 93 | 85 | index: true |
| 94 | 86 | store: true |
| 95 | 87 | return_in_source: true |
| 96 | 88 | |
| 97 | - - name: "tags" | |
| 98 | - type: "HKText" | |
| 99 | - analyzer: "chinese_ecommerce" | |
| 89 | + - name: "description_en" | |
| 90 | + type: "TEXT" | |
| 91 | + analyzer: "english" | |
| 92 | + search_analyzer: "english" | |
| 100 | 93 | boost: 1.0 |
| 101 | 94 | index: true |
| 102 | 95 | store: true |
| 103 | 96 | return_in_source: true |
| 104 | 97 | |
| 105 | - - name: "category" | |
| 106 | - type: "HKText" | |
| 107 | - analyzer: "chinese_ecommerce" | |
| 98 | + - name: "vendor_en" | |
| 99 | + type: "TEXT" | |
| 100 | + analyzer: "english" | |
| 101 | + search_analyzer: "english" | |
| 108 | 102 | boost: 1.5 |
| 109 | 103 | index: true |
| 110 | 104 | store: true |
| 111 | 105 | return_in_source: true |
| 106 | + keyword_subfield: true | |
| 107 | + keyword_normalizer: "lowercase" | |
| 108 | + | |
| 109 | + - name: "tags" | |
| 110 | + type: "KEYWORD" | |
| 111 | + index: true | |
| 112 | + store: true | |
| 113 | + return_in_source: true | |
| 112 | 114 | |
| 113 | 115 | # 价格字段(扁平化) |
| 114 | 116 | - name: "min_price" |
| ... | ... | @@ -129,6 +131,30 @@ fields: |
| 129 | 131 | store: true |
| 130 | 132 | return_in_source: true |
| 131 | 133 | |
| 134 | + - name: "sku_prices" | |
| 135 | + type: "FLOAT" | |
| 136 | + index: true | |
| 137 | + store: true | |
| 138 | + return_in_source: true | |
| 139 | + | |
| 140 | + - name: "sku_weights" | |
| 141 | + type: "LONG" | |
| 142 | + index: true | |
| 143 | + store: true | |
| 144 | + return_in_source: true | |
| 145 | + | |
| 146 | + - name: "sku_weight_units" | |
| 147 | + type: "KEYWORD" | |
| 148 | + index: true | |
| 149 | + store: true | |
| 150 | + return_in_source: true | |
| 151 | + | |
| 152 | + - name: "total_inventory" | |
| 153 | + type: "LONG" | |
| 154 | + index: true | |
| 155 | + store: true | |
| 156 | + return_in_source: true | |
| 157 | + | |
| 132 | 158 | # 图片字段(用于显示,不参与搜索) |
| 133 | 159 | - name: "image_url" |
| 134 | 160 | type: "KEYWORD" |
| ... | ... | @@ -136,7 +162,7 @@ fields: |
| 136 | 162 | store: true |
| 137 | 163 | return_in_source: true |
| 138 | 164 | |
| 139 | - # 文本嵌入字段(用于语义搜索) | |
| 165 | + # 语义向量 | |
| 140 | 166 | - name: "title_embedding" |
| 141 | 167 | type: "TEXT_EMBEDDING" |
| 142 | 168 | embedding_dims: 1024 |
| ... | ... | @@ -145,30 +171,137 @@ fields: |
| 145 | 171 | store: false |
| 146 | 172 | return_in_source: false # 嵌入向量通常不需要在结果中返回 |
| 147 | 173 | |
| 148 | - # 时间字段 | |
| 149 | - - name: "create_time" | |
| 150 | - type: "DATE" | |
| 174 | + - name: "image_embedding" | |
| 175 | + type: "IMAGE_EMBEDDING" | |
| 176 | + embedding_dims: 1024 | |
| 177 | + embedding_similarity: "dot_product" | |
| 178 | + nested: true | |
| 179 | + index: true | |
| 180 | + store: false | |
| 181 | + return_in_source: false | |
| 182 | + | |
| 183 | + # 分类相关字段 | |
| 184 | + - name: "category_path_zh" | |
| 185 | + type: "TEXT" | |
| 186 | + analyzer: "hanlp_index" | |
| 187 | + search_analyzer: "hanlp_standard" | |
| 188 | + boost: 1.5 | |
| 151 | 189 | index: true |
| 152 | 190 | store: true |
| 153 | 191 | return_in_source: true |
| 154 | 192 | |
| 155 | - - name: "update_time" | |
| 156 | - type: "DATE" | |
| 193 | + - name: "category_path_en" | |
| 194 | + type: "TEXT" | |
| 195 | + analyzer: "english" | |
| 196 | + search_analyzer: "english" | |
| 197 | + boost: 1.5 | |
| 198 | + index: true | |
| 199 | + store: true | |
| 200 | + return_in_source: true | |
| 201 | + | |
| 202 | + - name: "category_name_zh" | |
| 203 | + type: "TEXT" | |
| 204 | + analyzer: "hanlp_index" | |
| 205 | + search_analyzer: "hanlp_standard" | |
| 206 | + boost: 1.5 | |
| 207 | + index: true | |
| 208 | + store: true | |
| 209 | + return_in_source: true | |
| 210 | + | |
| 211 | + - name: "category_name_en" | |
| 212 | + type: "TEXT" | |
| 213 | + analyzer: "english" | |
| 214 | + search_analyzer: "english" | |
| 215 | + boost: 1.5 | |
| 157 | 216 | index: true |
| 158 | 217 | store: true |
| 159 | 218 | return_in_source: true |
| 160 | 219 | |
| 161 | - - name: "shoplazza_created_at" | |
| 220 | + - name: "category_id" | |
| 221 | + type: "KEYWORD" | |
| 222 | + index: true | |
| 223 | + store: true | |
| 224 | + return_in_source: true | |
| 225 | + | |
| 226 | + - name: "category_name" | |
| 227 | + type: "KEYWORD" | |
| 228 | + index: true | |
| 229 | + store: true | |
| 230 | + return_in_source: true | |
| 231 | + | |
| 232 | + - name: "category_level" | |
| 233 | + type: "INT" | |
| 234 | + index: true | |
| 235 | + store: true | |
| 236 | + return_in_source: true | |
| 237 | + | |
| 238 | + - name: "category1_name" | |
| 239 | + type: "KEYWORD" | |
| 240 | + index: true | |
| 241 | + store: true | |
| 242 | + return_in_source: true | |
| 243 | + | |
| 244 | + - name: "category2_name" | |
| 245 | + type: "KEYWORD" | |
| 246 | + index: true | |
| 247 | + store: true | |
| 248 | + return_in_source: true | |
| 249 | + | |
| 250 | + - name: "category3_name" | |
| 251 | + type: "KEYWORD" | |
| 252 | + index: true | |
| 253 | + store: true | |
| 254 | + return_in_source: true | |
| 255 | + | |
| 256 | + # SKU款式、子sku属性 | |
| 257 | + - name: "specifications" | |
| 258 | + type: "JSON" | |
| 259 | + nested: true | |
| 260 | + return_in_source: true | |
| 261 | + nested_properties: | |
| 262 | + sku_id: | |
| 263 | + type: "keyword" | |
| 264 | + index: true | |
| 265 | + store: true | |
| 266 | + name: | |
| 267 | + type: "keyword" | |
| 268 | + index: true | |
| 269 | + store: true | |
| 270 | + value: | |
| 271 | + type: "keyword" | |
| 272 | + index: true | |
| 273 | + store: true | |
| 274 | + | |
| 275 | + - name: "option1_name" | |
| 276 | + type: "KEYWORD" | |
| 277 | + index: true | |
| 278 | + store: true | |
| 279 | + return_in_source: true | |
| 280 | + | |
| 281 | + - name: "option2_name" | |
| 282 | + type: "KEYWORD" | |
| 283 | + index: true | |
| 284 | + store: true | |
| 285 | + return_in_source: true | |
| 286 | + | |
| 287 | + - name: "option3_name" | |
| 288 | + type: "KEYWORD" | |
| 289 | + index: true | |
| 290 | + store: true | |
| 291 | + return_in_source: true | |
| 292 | + | |
| 293 | + # 时间字段 | |
| 294 | + - name: "create_time" | |
| 162 | 295 | type: "DATE" |
| 163 | 296 | index: true |
| 164 | 297 | store: true |
| 165 | - return_in_source: false # 通常不需要返回 | |
| 298 | + return_in_source: true | |
| 166 | 299 | |
| 167 | - - name: "shoplazza_updated_at" | |
| 300 | + - name: "update_time" | |
| 168 | 301 | type: "DATE" |
| 169 | 302 | index: true |
| 170 | 303 | store: true |
| 171 | - return_in_source: false # 通常不需要返回 | |
| 304 | + return_in_source: true | |
| 172 | 305 | |
| 173 | 306 | # 嵌套skus字段 |
| 174 | 307 | - name: "skus" |
| ... | ... | @@ -180,11 +313,6 @@ fields: |
| 180 | 313 | type: "keyword" |
| 181 | 314 | index: true |
| 182 | 315 | store: true |
| 183 | - title: | |
| 184 | - type: "text" | |
| 185 | - analyzer: "chinese_ecommerce" | |
| 186 | - index: true | |
| 187 | - store: true | |
| 188 | 316 | price: |
| 189 | 317 | type: "float" |
| 190 | 318 | index: true |
| ... | ... | @@ -193,7 +321,7 @@ fields: |
| 193 | 321 | type: "float" |
| 194 | 322 | index: true |
| 195 | 323 | store: true |
| 196 | - sku: | |
| 324 | + sku_code: | |
| 197 | 325 | type: "keyword" |
| 198 | 326 | index: true |
| 199 | 327 | store: true |
| ... | ... | @@ -201,46 +329,65 @@ fields: |
| 201 | 329 | type: "long" |
| 202 | 330 | index: true |
| 203 | 331 | store: true |
| 204 | - options: | |
| 205 | - type: "object" | |
| 206 | - enabled: true | |
| 332 | + weight: | |
| 333 | + type: "float" | |
| 334 | + index: true | |
| 335 | + store: true | |
| 336 | + weight_unit: | |
| 337 | + type: "keyword" | |
| 338 | + index: true | |
| 339 | + store: true | |
| 340 | + option1_value: | |
| 341 | + type: "keyword" | |
| 342 | + index: true | |
| 343 | + store: true | |
| 344 | + option2_value: | |
| 345 | + type: "keyword" | |
| 346 | + index: true | |
| 347 | + store: true | |
| 348 | + option3_value: | |
| 349 | + type: "keyword" | |
| 350 | + index: true | |
| 351 | + store: true | |
| 352 | + image_src: | |
| 353 | + type: "keyword" | |
| 354 | + index: false | |
| 355 | + store: true | |
| 207 | 356 | |
| 208 | 357 | # Index Structure (Query Domains) |
| 209 | 358 | indexes: |
| 210 | 359 | - name: "default" |
| 211 | 360 | label: "默认索引" |
| 212 | 361 | fields: |
| 213 | - - "title" | |
| 214 | - - "brief" | |
| 215 | - - "description" | |
| 216 | - - "seo_title" | |
| 217 | - - "seo_description" | |
| 218 | - - "seo_keywords" | |
| 219 | - - "vendor" | |
| 362 | + - "title_zh" | |
| 363 | + - "brief_zh" | |
| 364 | + - "description_zh" | |
| 365 | + - "vendor_zh" | |
| 220 | 366 | - "tags" |
| 221 | - - "category" | |
| 367 | + - "category_path_zh" | |
| 368 | + - "category_name_zh" | |
| 222 | 369 | analyzer: "chinese_ecommerce" |
| 223 | 370 | boost: 1.0 |
| 224 | 371 | |
| 225 | 372 | - name: "title" |
| 226 | 373 | label: "标题索引" |
| 227 | 374 | fields: |
| 228 | - - "title" | |
| 229 | - - "seo_title" | |
| 375 | + - "title_zh" | |
| 230 | 376 | analyzer: "chinese_ecommerce" |
| 231 | 377 | boost: 2.0 |
| 232 | 378 | |
| 233 | 379 | - name: "vendor" |
| 234 | 380 | label: "品牌索引" |
| 235 | 381 | fields: |
| 236 | - - "vendor" | |
| 382 | + - "vendor_zh" | |
| 237 | 383 | analyzer: "chinese_ecommerce" |
| 238 | 384 | boost: 1.5 |
| 239 | 385 | |
| 240 | 386 | - name: "category" |
| 241 | 387 | label: "类目索引" |
| 242 | 388 | fields: |
| 243 | - - "category" | |
| 389 | + - "category_path_zh" | |
| 390 | + - "category_name_zh" | |
| 244 | 391 | analyzer: "chinese_ecommerce" |
| 245 | 392 | boost: 1.5 |
| 246 | 393 | |
| ... | ... | @@ -248,7 +395,6 @@ indexes: |
| 248 | 395 | label: "标签索引" |
| 249 | 396 | fields: |
| 250 | 397 | - "tags" |
| 251 | - - "seo_keywords" | |
| 252 | 398 | analyzer: "chinese_ecommerce" |
| 253 | 399 | boost: 1.0 |
| 254 | 400 | ... | ... |
config/config_loader.py
| ... | ... | @@ -313,7 +313,8 @@ class ConfigLoader: |
| 313 | 313 | nested=field_data.get("nested", False), |
| 314 | 314 | nested_properties=field_data.get("nested_properties"), |
| 315 | 315 | keyword_subfield=field_data.get("keyword_subfield", is_hktext), |
| 316 | - keyword_ignore_above=field_data.get("keyword_ignore_above", 256) | |
| 316 | + keyword_ignore_above=field_data.get("keyword_ignore_above", 256), | |
| 317 | + keyword_normalizer=field_data.get("keyword_normalizer") | |
| 317 | 318 | ) |
| 318 | 319 | |
| 319 | 320 | def _parse_index_config(self, index_data: Dict[str, Any]) -> IndexConfig: | ... | ... |
config/field_types.py
| ... | ... | @@ -75,6 +75,7 @@ class FieldConfig: |
| 75 | 75 | # Hybrid Keyword Text (HKText) support |
| 76 | 76 | keyword_subfield: bool = False |
| 77 | 77 | keyword_ignore_above: int = 256 |
| 78 | + keyword_normalizer: Optional[str] = None # For keyword subfield normalizer (e.g., "lowercase") | |
| 78 | 79 | |
| 79 | 80 | |
| 80 | 81 | def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: |
| ... | ... | @@ -100,18 +101,28 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: |
| 100 | 101 | if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE: |
| 101 | 102 | mapping["analyzer"] = "index_ansj" |
| 102 | 103 | mapping["search_analyzer"] = "query_ansj" |
| 104 | + elif field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: | |
| 105 | + # If search_analyzer is explicitly set to CHINESE_ECOMMERCE_QUERY | |
| 106 | + mapping["analyzer"] = "index_ansj" | |
| 107 | + mapping["search_analyzer"] = "query_ansj" | |
| 103 | 108 | else: |
| 104 | 109 | mapping["analyzer"] = field_config.analyzer.value |
| 105 | 110 | |
| 106 | 111 | if field_config.search_analyzer: |
| 107 | - mapping["search_analyzer"] = field_config.search_analyzer.value | |
| 112 | + if field_config.search_analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: | |
| 113 | + mapping["search_analyzer"] = "query_ansj" | |
| 114 | + else: | |
| 115 | + mapping["search_analyzer"] = field_config.search_analyzer.value | |
| 108 | 116 | |
| 109 | 117 | if field_config.keyword_subfield: |
| 110 | 118 | mapping.setdefault("fields", {}) |
| 111 | - mapping["fields"]["keyword"] = { | |
| 119 | + keyword_field = { | |
| 112 | 120 | "type": "keyword", |
| 113 | 121 | "ignore_above": field_config.keyword_ignore_above |
| 114 | 122 | } |
| 123 | + if field_config.keyword_normalizer: | |
| 124 | + keyword_field["normalizer"] = field_config.keyword_normalizer | |
| 125 | + mapping["fields"]["keyword"] = keyword_field | |
| 115 | 126 | |
| 116 | 127 | elif field_config.field_type == FieldType.KEYWORD: |
| 117 | 128 | mapping = { |
| ... | ... | @@ -140,7 +151,7 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: |
| 140 | 151 | "similarity": field_config.embedding_similarity |
| 141 | 152 | }, |
| 142 | 153 | "url": { |
| 143 | - "type": "keyword" | |
| 154 | + "type": "text" | |
| 144 | 155 | } |
| 145 | 156 | } |
| 146 | 157 | } |
| ... | ... | @@ -239,6 +250,22 @@ def get_default_analyzers() -> Dict[str, Any]: |
| 239 | 250 | "type": "custom", |
| 240 | 251 | "tokenizer": "standard", |
| 241 | 252 | "filter": ["lowercase", "asciifolding"] |
| 253 | + }, | |
| 254 | + "hanlp_index": { | |
| 255 | + "type": "custom", | |
| 256 | + "tokenizer": "standard", | |
| 257 | + "filter": ["lowercase", "asciifolding"] | |
| 258 | + }, | |
| 259 | + "hanlp_standard": { | |
| 260 | + "type": "custom", | |
| 261 | + "tokenizer": "standard", | |
| 262 | + "filter": ["lowercase", "asciifolding"] | |
| 263 | + } | |
| 264 | + }, | |
| 265 | + "normalizer": { | |
| 266 | + "lowercase": { | |
| 267 | + "type": "custom", | |
| 268 | + "filter": ["lowercase"] | |
| 242 | 269 | } |
| 243 | 270 | } |
| 244 | 271 | } |
| ... | ... | @@ -300,6 +327,9 @@ ANALYZER_MAP = { |
| 300 | 327 | "chinese": AnalyzerType.CHINESE_ECOMMERCE, |
| 301 | 328 | "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE, |
| 302 | 329 | "index_ansj": AnalyzerType.CHINESE_ECOMMERCE, |
| 330 | + "hanlp_index": AnalyzerType.CHINESE_ECOMMERCE, # Alias for index_ansj | |
| 331 | + "hanlp_standard": AnalyzerType.CHINESE_ECOMMERCE_QUERY, # Alias for query_ansj | |
| 332 | + "query_ansj": AnalyzerType.CHINESE_ECOMMERCE_QUERY, | |
| 303 | 333 | "english": AnalyzerType.ENGLISH, |
| 304 | 334 | "arabic": AnalyzerType.ARABIC, |
| 305 | 335 | "spanish": AnalyzerType.SPANISH, | ... | ... |
| ... | ... | @@ -0,0 +1,231 @@ |
| 1 | +{ | |
| 2 | + "mappings": { | |
| 3 | + "properties": { | |
| 4 | + "tenant_id": { | |
| 5 | + "type": "keyword" | |
| 6 | + }, | |
| 7 | + "spu_id": { | |
| 8 | + "type": "keyword" | |
| 9 | + }, | |
| 10 | + | |
| 11 | + "create_time": { | |
| 12 | + "type": "date" | |
| 13 | + }, | |
| 14 | + "update_time": { | |
| 15 | + "type": "date" | |
| 16 | + }, | |
| 17 | + | |
| 18 | + // 文本相关性相关字段 | |
| 19 | + "title_zh": { | |
| 20 | + "type": "text", | |
| 21 | + "analyzer": "hanlp_index", | |
| 22 | + "search_analyzer": "hanlp_standard" | |
| 23 | + }, | |
| 24 | + "brief_zh": { | |
| 25 | + "type": "text", | |
| 26 | + "analyzer": "hanlp_index", | |
| 27 | + "search_analyzer": "hanlp_standard" | |
| 28 | + }, | |
| 29 | + "description_zh": { | |
| 30 | + "type": "text", | |
| 31 | + "analyzer": "hanlp_index", | |
| 32 | + "search_analyzer": "hanlp_standard" | |
| 33 | + }, | |
| 34 | + "vendor_zh": { | |
| 35 | + "type": "text", | |
| 36 | + "analyzer": "hanlp_index", | |
| 37 | + "search_analyzer": "hanlp_standard", | |
| 38 | + "fields": { | |
| 39 | + "keyword": { | |
| 40 | + "type": "keyword", | |
| 41 | + "normalizer": "lowercase" | |
| 42 | + } | |
| 43 | + } | |
| 44 | + }, | |
| 45 | + | |
| 46 | + "title_en": { | |
| 47 | + "type": "text", | |
| 48 | + "analyzer": "english", | |
| 49 | + "search_analyzer": "english", | |
| 50 | + }, | |
| 51 | + "brief_en": { | |
| 52 | + "type": "text", | |
| 53 | + "analyzer": "english", | |
| 54 | + "search_analyzer": "english", | |
| 55 | + | |
| 56 | + }, | |
| 57 | + "description_en": { | |
| 58 | + "type": "text", | |
| 59 | + "analyzer": "english", | |
| 60 | + "search_analyzer": "english", | |
| 61 | + }, | |
| 62 | + "vendor_en": { | |
| 63 | + "type": "text", | |
| 64 | + "analyzer": "english", | |
| 65 | + "search_analyzer": "english", | |
| 66 | + "fields": { | |
| 67 | + "keyword": { | |
| 68 | + "type": "keyword", | |
| 69 | + "normalizer": "lowercase" | |
| 70 | + } | |
| 71 | + } | |
| 72 | + }, | |
| 73 | + | |
| 74 | + "tags": { | |
| 75 | + "type": "keyword", | |
| 76 | + }, | |
| 77 | + | |
| 78 | + "image_url": { | |
| 79 | + "type": "keyword", | |
| 80 | + "index": false | |
| 81 | + }, | |
| 82 | + | |
| 83 | + // 语义向量 | |
| 84 | + "title_embedding": { | |
| 85 | + "type": "dense_vector", | |
| 86 | + "dims": 1024, | |
| 87 | + "index": true, | |
| 88 | + "similarity": "dot_product" | |
| 89 | + }, | |
| 90 | + "image_embedding": { | |
| 91 | + "type": "nested", | |
| 92 | + "properties": { | |
| 93 | + "vector": { | |
| 94 | + "type": "dense_vector", | |
| 95 | + "dims": 1024, | |
| 96 | + "index": true, | |
| 97 | + "similarity": "dot_product" | |
| 98 | + }, | |
| 99 | + "url": { | |
| 100 | + "type": "text" | |
| 101 | + } | |
| 102 | + } | |
| 103 | + }, | |
| 104 | + | |
| 105 | + // 分类相关 | |
| 106 | + "category_path_zh": { // 提供模糊查询功能,辅助相关性计算 | |
| 107 | + "type": "text", | |
| 108 | + "analyzer": "hanlp_index", | |
| 109 | + "search_analyzer": "hanlp_standard" | |
| 110 | + }, | |
| 111 | + "category_path_en": { // 提供模糊查询功能,辅助相关性计算 | |
| 112 | + "type": "text", | |
| 113 | + "analyzer": "english", | |
| 114 | + "search_analyzer": "english" | |
| 115 | + }, | |
| 116 | + "category_name_zh": { // 提供模糊查询功能,辅助相关性计算 | |
| 117 | + "type": "text", | |
| 118 | + "analyzer": "hanlp_index", | |
| 119 | + "search_analyzer": "hanlp_standard" | |
| 120 | + }, | |
| 121 | + "category_name_en": { // 提供模糊查询功能,辅助相关性计算 | |
| 122 | + "type": "text", | |
| 123 | + "analyzer": "english", | |
| 124 | + "search_analyzer": "english" | |
| 125 | + }, | |
| 126 | + | |
| 127 | + "category_id": { | |
| 128 | + "type": "keyword" | |
| 129 | + }, | |
| 130 | + "category_name": { | |
| 131 | + "type": "keyword" | |
| 132 | + }, | |
| 133 | + "category_level": { | |
| 134 | + "type": "integer" | |
| 135 | + }, | |
| 136 | + "category1_name": { // 不同层级下 可能有同名的情况,因此提供一二三级分开的查询方式 | |
| 137 | + "type": "keyword" | |
| 138 | + }, | |
| 139 | + "category2_name": { | |
| 140 | + "type": "keyword" | |
| 141 | + }, | |
| 142 | + "category3_name": { | |
| 143 | + "type": "keyword" | |
| 144 | + }, | |
| 145 | + | |
| 146 | + // sku款式、子sku属性 | |
| 147 | + "specifications": { | |
| 148 | + "type": "nested", | |
| 149 | + "properties": { | |
| 150 | + "sku_id": { "type": "keyword" }, | |
| 151 | + "name": { "type": "keyword" }, // "颜色", "容量" | |
| 152 | + "value": { "type": "keyword" } // "白色", "256GB" | |
| 153 | + } | |
| 154 | + }, | |
| 155 | + | |
| 156 | + "option1_name": { | |
| 157 | + "type": "keyword" | |
| 158 | + }, | |
| 159 | + "option2_name": { | |
| 160 | + "type": "keyword" | |
| 161 | + }, | |
| 162 | + "option3_name": { | |
| 163 | + "type": "keyword" | |
| 164 | + }, | |
| 165 | + | |
| 166 | + "min_price": { | |
| 167 | + "type": "float" | |
| 168 | + }, | |
| 169 | + "max_price": { | |
| 170 | + "type": "float" | |
| 171 | + }, | |
| 172 | + "compare_at_price": { | |
| 173 | + "type": "float" | |
| 174 | + }, | |
| 175 | + "sku_prices": { | |
| 176 | + "type": "float" | |
| 177 | + }, | |
| 178 | + "sku_weights": { | |
| 179 | + "type": "long" | |
| 180 | + }, | |
| 181 | + "sku_weight_units": { | |
| 182 | + "type": "keyword" | |
| 183 | + }, | |
| 184 | + "total_inventory": { | |
| 185 | + "type": "long" | |
| 186 | + }, | |
| 187 | + | |
| 188 | + "skus": { | |
| 189 | + "type": "nested", | |
| 190 | + "properties": { | |
| 191 | + "sku_id": { | |
| 192 | + "type": "keyword" | |
| 193 | + }, | |
| 194 | + "price": { | |
| 195 | + "type": "float" | |
| 196 | + }, | |
| 197 | + "compare_at_price": { | |
| 198 | + "type": "float" | |
| 199 | + }, | |
| 200 | + "sku_code": { | |
| 201 | + "type": "keyword" | |
| 202 | + }, | |
| 203 | + "stock": { | |
| 204 | + "type": "long" | |
| 205 | + }, | |
| 206 | + "weight": { | |
| 207 | + "type": "float" | |
| 208 | + }, | |
| 209 | + "weight_unit": { | |
| 210 | + "type": "keyword" | |
| 211 | + }, | |
| 212 | + "option1_value": { | |
| 213 | + "type": "keyword" | |
| 214 | + }, | |
| 215 | + "option2_value": { | |
| 216 | + "type": "keyword" | |
| 217 | + }, | |
| 218 | + "option3_value": { | |
| 219 | + "type": "keyword" | |
| 220 | + }, | |
| 221 | + "image_src": { | |
| 222 | + "type": "keyword", | |
| 223 | + "index": false | |
| 224 | + } | |
| 225 | + } | |
| 226 | + } | |
| 227 | + } | |
| 228 | + } | |
| 229 | +} | |
| 230 | + | |
| 231 | + | ... | ... |
docs/索引字段说明v2.md
| ... | ... | @@ -4,247 +4,34 @@ SPU-SKU索引方案选型 |
| 4 | 4 | 除了title, brief, description seo相关 cate tags vendor所有影响相关性的字段都在spu。 sku只有款式、价格、重量、库存等相关属性。所以,可以以spu为单位建立索引。 |
| 5 | 5 | sku中需要参与搜索的属性(比如价格、库存)展开到spu。 |
| 6 | 6 | sku的所有需要返回的字段作为nested字段,仅用于返回。 |
| 7 | -灌入数据准备 | |
| 8 | -def build_product_document(product, skus): | |
| 9 | - # 提取价格列表(转换为float,保留两位小数) | |
| 10 | - price_list = [float(sku.price) for sku in skus if sku.price is not None] | |
| 11 | - | |
| 12 | - # 提取重量信息(重量转为int,单位统一为克;重量+单位拼接为字符串) | |
| 13 | - weight_list = [int(float(sku.weight) * 1000) for sku in skus if sku.weight is not None] # 转为整数克 | |
| 14 | - weight_with_unit_list = [f"{sku.weight}{sku.weight_unit}" for sku in skus if sku.weight and sku.weight_unit] | |
| 15 | - | |
| 16 | - # 计算库存总和 | |
| 17 | - total_stock = sum([sku.inventory_quantity for sku in skus if sku.inventory_quantity is not None]) | |
| 18 | - | |
| 19 | - # 计算价格区间 | |
| 20 | - min_price = min(price_list) if price_list else 0.0 | |
| 21 | - max_price = max(price_list) if price_list else 0.0 | |
| 22 | - | |
| 7 | +# 写入 spu 级别索引 | |
| 8 | +def build_product_document(product, variants): | |
| 23 | 9 | return { |
| 24 | 10 | "spu_id": str(product.id), |
| 25 | 11 | "title": product.title, |
| 26 | 12 | |
| 27 | - # SPU级别的选项名称定义(如:颜色、尺码、材质) | |
| 28 | - "option1_name": getattr(product, 'option1', None), | |
| 29 | - "option2_name": getattr(product, 'option2', None), | |
| 30 | - "option3_name": getattr(product, 'option3', None), | |
| 31 | - | |
| 32 | - # SKU搜索字段(展开) | |
| 13 | + # Variant搜索字段(展开) | |
| 33 | 14 | # 价格(float)、重量(int)、重量单位拼接重量(keyword),都以list形式灌入
| 34 | - "sku_prices": price_list, # 所有SKU价格列表,用于范围聚合 | |
| 35 | - "sku_weights": weight_list, # 重量数值列表(转换为整数克) | |
| 36 | - "sku_weight_units": weight_with_unit_list, # 重量+单位字符串列表 | |
| 37 | - | |
| 38 | - # 库存总和 将SKU的库存加起来作为一个值灌入 | |
| 39 | - "total_inventory": total_stock, # SKU库存总和 | |
| 40 | - | |
| 41 | - # 售价,灌入3个字段:SKU价格列表、最高价、最低价 | |
| 42 | - "min_price": min_price, # 最低售价 | |
| 43 | - "max_price": max_price, # 最高售价 | |
| 44 | - "price_range": { # 价格区间对象,便于范围查询 | |
| 45 | - "gte": min_price, | |
| 46 | - "lte": max_price | |
| 47 | - }, | |
| 48 | - | |
| 49 | - # SKU详细信息(nested结构,仅用于返回) | |
| 50 | - "skus": [ | |
| 51 | - { | |
| 52 | - "sku_id": str(sku.id), | |
| 53 | - "price": float(sku.price) if sku.price else 0.0, | |
| 54 | - "compare_at_price": float(sku.compare_at_price) if sku.compare_at_price else None, | |
| 55 | - "sku_code": sku.sku, | |
| 56 | - "stock": sku.inventory_quantity, | |
| 57 | - "weight": float(sku.weight) if sku.weight else None, | |
| 58 | - "weight_unit": sku.weight_unit, | |
| 59 | - | |
| 60 | - # SKU级别的选项值(对应SPU的选项名称) | |
| 61 | - "option1_value": sku.option1, | |
| 62 | - "option2_value": sku.option2, | |
| 63 | - "option3_value": sku.option3, | |
| 15 | + # TODO 按要求补充 | |
| 16 | + | |
| 17 | + # 库存总和 将sku的库存加起来作为一个值灌入 | |
| 18 | + # 售价,灌入3个字段,一个 sku价格 以list形式灌入,一个最高价一个最低价 | |
| 19 | + # TODO 按要求补充 | |
| 64 | 20 | |
| 65 | - "image_src": sku.image_src | |
| 21 | + # Variant详细信息(用于返回) | |
| 22 | + "variants": [ | |
| 23 | + { | |
| 24 | + "sku_id": str(v.id), | |
| 25 | + "price": float(v.price), | |
| 26 | + "options": v.options | |
| 66 | 27 | } |
| 67 | - for sku in skus | |
| 28 | + for v in variants | |
| 68 | 29 | ], |
| 69 | - | |
| 70 | - # 其他SPU级别字段(根据索引文档补充) | |
| 71 | - "tenant_id": str(product.tenant_id), | |
| 72 | - "brief": product.brief, | |
| 73 | - "description": product.description, | |
| 74 | - "vendor": product.vendor, | |
| 75 | - "category": product.category, | |
| 76 | - "tags": product.tags.split(',') if product.tags else [], | |
| 77 | - "seo_title": product.seo_title, | |
| 78 | - "seo_description": product.seo_description, | |
| 79 | - "seo_keywords": product.seo_keywords.split(',') if product.seo_keywords else [], | |
| 80 | - "image_url": product.image_src, | |
| 81 | - "create_time": product.create_time.isoformat() if product.create_time else None, | |
| 82 | - "update_time": product.update_time.isoformat() if product.update_time else None | |
| 83 | - } | |
| 84 | - 索引定义 | |
| 85 | -{ | |
| 86 | - "mappings": { | |
| 87 | - "properties": { | |
| 88 | - "tenant_id": { | |
| 89 | - "type": "keyword" | |
| 90 | - }, | |
| 91 | - "spu_id": { | |
| 92 | - "type": "keyword" | |
| 93 | - }, | |
| 94 | - // 文本相关性相关字段 | |
| 95 | - "title_zh": { | |
| 96 | - "type": "text", | |
| 97 | - "analyzer": "hanlp_index", | |
| 98 | - "search_analyzer": "hanlp_standard" | |
| 99 | - }, | |
| 100 | - "brief_zh": { | |
| 101 | - "type": "text", | |
| 102 | - "analyzer": "hanlp_index", | |
| 103 | - "search_analyzer": "hanlp_standard" | |
| 104 | - }, | |
| 105 | - "description_zh": { | |
| 106 | - "type": "text", | |
| 107 | - "analyzer": "hanlp_index", | |
| 108 | - "search_analyzer": "hanlp_standard" | |
| 109 | - }, | |
| 110 | - "vendor_zh": { | |
| 111 | - "type": "text", | |
| 112 | - "analyzer": "hanlp_index", | |
| 113 | - "search_analyzer": "hanlp_standard", | |
| 114 | - "fields": { | |
| 115 | - "keyword": { | |
| 116 | - "type": "keyword", | |
| 117 | - "normalizer": "lowercase" | |
| 118 | - } | |
| 119 | - } | |
| 120 | - }, | |
| 121 | - | |
| 122 | - "title_en": { | |
| 123 | - "type": "text", | |
| 124 | - "analyzer": "english", | |
| 125 | - "search_analyzer": "english", | |
| 126 | - }, | |
| 127 | - "brief_en": { | |
| 128 | - "type": "text", | |
| 129 | - "analyzer": "english", | |
| 130 | - "search_analyzer": "english", | |
| 131 | 30 | |
| 132 | - }, | |
| 133 | - "description_en": { | |
| 134 | - "type": "text", | |
| 135 | - "analyzer": "english", | |
| 136 | - "search_analyzer": "english", | |
| 137 | - }, | |
| 138 | - "vendor_en": { | |
| 139 | - "type": "text", | |
| 140 | - "analyzer": "english", | |
| 141 | - "search_analyzer": "english", | |
| 142 | - "fields": { | |
| 143 | - "keyword": { | |
| 144 | - "type": "keyword", | |
| 145 | - "normalizer": "lowercase" | |
| 146 | - } | |
| 147 | - } | |
| 148 | - }, | |
| 149 | - | |
| 150 | - "tags": { | |
| 151 | - "type": "keyword", | |
| 152 | - }, | |
| 153 | - | |
| 154 | - | |
| 155 | - "min_price": { | |
| 156 | - "type": "float" | |
| 157 | - }, | |
| 158 | - "max_price": { | |
| 159 | - "type": "float" | |
| 160 | - }, | |
| 161 | - "compare_at_price": { | |
| 162 | - "type": "float" | |
| 163 | - }, | |
| 164 | - "sku_prices": { | |
| 165 | - "type": "float" | |
| 166 | - }, | |
| 167 | - "sku_weights": { | |
| 168 | - "type": "long" | |
| 169 | - }, | |
| 170 | - "sku_weight_units": { | |
| 171 | - "type": "keyword" | |
| 172 | - }, | |
| 173 | - "total_inventory": { | |
| 174 | - "type": "long" | |
| 175 | - }, | |
| 176 | - | |
| 177 | - "image_url": { | |
| 178 | - "type": "keyword", | |
| 179 | - "index": false | |
| 180 | - }, | |
| 181 | - | |
| 182 | - "title_embedding": { | |
| 183 | - "type": "dense_vector", | |
| 184 | - "dims": 1024, | |
| 185 | - "index": true, | |
| 186 | - "similarity": "dot_product" | |
| 187 | - }, | |
| 188 | - | |
| 189 | - "create_time": { | |
| 190 | - "type": "date" | |
| 191 | - }, | |
| 192 | - "update_time": { | |
| 193 | - "type": "date" | |
| 194 | - }, | |
| 195 | - | |
| 196 | - "option1_name": { | |
| 197 | - "type": "keyword" | |
| 198 | - }, | |
| 199 | - "option2_name": { | |
| 200 | - "type": "keyword" | |
| 201 | - }, | |
| 202 | - "option3_name": { | |
| 203 | - "type": "keyword" | |
| 204 | - }, | |
| 205 | - | |
| 206 | - "skus": { | |
| 207 | - "type": "nested", | |
| 208 | - "properties": { | |
| 209 | - "sku_id": { | |
| 210 | - "type": "keyword" | |
| 211 | - }, | |
| 212 | - "price": { | |
| 213 | - "type": "float" | |
| 214 | - }, | |
| 215 | - "compare_at_price": { | |
| 216 | - "type": "float" | |
| 217 | - }, | |
| 218 | - "sku_code": { | |
| 219 | - "type": "keyword" | |
| 220 | - }, | |
| 221 | - "stock": { | |
| 222 | - "type": "long" | |
| 223 | - }, | |
| 224 | - "weight": { | |
| 225 | - "type": "float" | |
| 226 | - }, | |
| 227 | - "weight_unit": { | |
| 228 | - "type": "keyword" | |
| 229 | - }, | |
| 230 | - "option1_value": { | |
| 231 | - "type": "keyword" | |
| 232 | - }, | |
| 233 | - "option2_value": { | |
| 234 | - "type": "keyword" | |
| 235 | - }, | |
| 236 | - "option3_value": { | |
| 237 | - "type": "keyword" | |
| 238 | - }, | |
| 239 | - "image_src": { | |
| 240 | - "type": "keyword", | |
| 241 | - "index": false | |
| 242 | - } | |
| 243 | - } | |
| 244 | - } | |
| 31 | + | |
| 32 | + "min_price": min(v.price for v in variants), | |
| 33 | + "max_price": max(v.price for v in variants) | |
| 245 | 34 | } |
| 246 | - } | |
| 247 | -} | |
| 248 | 35 | 1.2 查询方案 |
| 249 | 36 | 对数组字段使用 dis_max,只取最高分,避免累加。 |
| 250 | 37 | 其他重点字段 |
| ... | ... | @@ -333,26 +120,56 @@ S red |
| 333 | 120 | 1. API 在 SPU 的维度直接返回3个属性定义,存储在 shoplazza_product_option 中: |
| 334 | 121 | 1. API在 SKU的维度直接返回3个属性值,存储在 shoplazza_product_sku 表的 option 相关的字段中: |
| 335 | 122 | 5.3 ES索引 |
| 336 | -5.3.1 | |
| 337 | - 3nested,支持超过3个属性(动态)。只用作返回,不能查询。节省索引空间 | |
| 123 | + | |
| 338 | 124 | "specifications": { |
| 339 | 125 | "type": "nested", |
| 340 | 126 | "properties": { |
| 341 | - "name": { "type": "keyword","index": false }, | |
| 342 | - "value": { "type": "keyword","index": false } | |
| 127 | + "name": { "type": "keyword" }, // "颜色", "容量" | |
| 128 | + "value": { "type": "keyword" } // "白色", "256GB" | |
| 343 | 129 | } |
| 344 | 130 | }, |
| 345 | 131 | |
| 346 | -6. SEO相关字段 | |
| 347 | -6.1 数据源 | |
| 348 | -SEO标题 SEO描述 SEO URL Handle SEO URL 重定向 SEO关键词 | |
| 349 | -最多5000字符 最多5000字符 "最多支持输入255字符 | |
| 350 | - (SEO URL handle只对SEO URL的「URL参数」部分进行更改,即“products/”后的内容,如:products/「URL参数」 | |
| 351 | - )" "创建URL重定向,访问修改前链接可跳转到修改后的新链接页面 | |
| 352 | -「Y」:TRUE | |
| 353 | -「N」:FALSE " 多个关键词请用「英文逗号」隔开 | |
| 354 | - | |
| 355 | -6.2 Mysql | |
| 356 | -6.3 ES索引 | |
| 357 | -6.3.1 输入数据 | |
| 358 | -6.3.2 索引方法 | |
| 359 | 132 | \ No newline at end of file |
| 133 | + 另外还需要包含一个单独的字段,main_option (即店铺主题装修里面配置的 颜色切换 - 变体名称,也就是列表页商品的子sku显示维度) | |
| 134 | + "main_option": { "type": "keyword" } | |
| 135 | +查询指定款式 | |
| 136 | +{ | |
| 137 | + "query": { | |
| 138 | + "nested": { | |
| 139 | + "path": "specifications", | |
| 140 | + "query": { | |
| 141 | + "bool": { | |
| 142 | + "must": [ | |
| 143 | + { "term": { "specifications.name": "颜色" } }, | |
| 144 | + { "term": { "specifications.value": "绿色" } } | |
| 145 | + ] | |
| 146 | + } | |
| 147 | + } | |
| 148 | + } | |
| 149 | + } | |
| 150 | +} | |
| 151 | +按 name 做分面搜索(聚合) | |
| 152 | + | |
| 153 | +{ | |
| 154 | + "aggs": { | |
| 155 | + "specs": { | |
| 156 | + "nested": { "path": "specifications" }, | |
| 157 | + "aggs": { | |
| 158 | + "by_name": { | |
| 159 | + "terms": { | |
| 160 | + "field": "specifications.name", | |
| 161 | + "size": 20 | |
| 162 | + }, | |
| 163 | + "aggs": { | |
| 164 | + "value_counts": { | |
| 165 | + "terms": { | |
| 166 | + "field": "specifications.value", | |
| 167 | + "size": 10 | |
| 168 | + } | |
| 169 | + } | |
| 170 | + } | |
| 171 | + } | |
| 172 | + } | |
| 173 | + } | |
| 174 | + } | |
| 175 | +} | |
| 176 | + | |
| 360 | 177 | \ No newline at end of file | ... | ... |
indexer/spu_transformer.py
| ... | ... | @@ -38,12 +38,12 @@ class SPUTransformer: |
| 38 | 38 | """ |
| 39 | 39 | query = text(""" |
| 40 | 40 | SELECT |
| 41 | - id, shop_id, shoplazza_id, handle, title, brief, description, | |
| 42 | - spu, vendor, vendor_url, seo_title, seo_description, seo_keywords, | |
| 41 | + id, shop_id, shoplazza_id, title, brief, description, | |
| 42 | + spu, vendor, vendor_url, | |
| 43 | 43 | image_src, image_width, image_height, image_path, image_alt, |
| 44 | - tags, note, category, | |
| 45 | - shoplazza_created_at, shoplazza_updated_at, tenant_id, | |
| 46 | - creator, create_time, updater, update_time, deleted | |
| 44 | + tags, note, category, category_id, category_google_id, | |
| 45 | + category_level, category_path, | |
| 46 | + tenant_id, creator, create_time, updater, update_time, deleted | |
| 47 | 47 | FROM shoplazza_product_spu |
| 48 | 48 | WHERE tenant_id = :tenant_id AND deleted = 0 |
| 49 | 49 | """) |
| ... | ... | @@ -114,6 +114,30 @@ class SPUTransformer: |
| 114 | 114 | |
| 115 | 115 | return df |
| 116 | 116 | |
| 117 | + def load_option_data(self) -> pd.DataFrame: | |
| 118 | + """ | |
| 119 | + Load option data from MySQL. | |
| 120 | + | |
| 121 | + Returns: | |
| 122 | + DataFrame with option data (name, position for each SPU) | |
| 123 | + """ | |
| 124 | + query = text(""" | |
| 125 | + SELECT | |
| 126 | + id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, | |
| 127 | + position, name, values, tenant_id, | |
| 128 | + creator, create_time, updater, update_time, deleted | |
| 129 | + FROM shoplazza_product_option | |
| 130 | + WHERE tenant_id = :tenant_id AND deleted = 0 | |
| 131 | + ORDER BY spu_id, position | |
| 132 | + """) | |
| 133 | + | |
| 134 | + with self.db_engine.connect() as conn: | |
| 135 | + df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id}) | |
| 136 | + | |
| 137 | + print(f"DEBUG: Loaded {len(df)} option records for tenant_id={self.tenant_id}") | |
| 138 | + | |
| 139 | + return df | |
| 140 | + | |
| 117 | 141 | def transform_batch(self) -> List[Dict[str, Any]]: |
| 118 | 142 | """ |
| 119 | 143 | Transform SPU and SKU data into ES documents. |
| ... | ... | @@ -124,12 +148,16 @@ class SPUTransformer: |
| 124 | 148 | # Load data |
| 125 | 149 | spu_df = self.load_spu_data() |
| 126 | 150 | sku_df = self.load_sku_data() |
| 151 | + option_df = self.load_option_data() | |
| 127 | 152 | |
| 128 | 153 | if spu_df.empty: |
| 129 | 154 | return [] |
| 130 | 155 | |
| 131 | 156 | # Group SKUs by SPU |
| 132 | 157 | sku_groups = sku_df.groupby('spu_id') |
| 158 | + | |
| 159 | + # Group options by SPU | |
| 160 | + option_groups = option_df.groupby('spu_id') if not option_df.empty else None | |
| 133 | 161 | |
| 134 | 162 | documents = [] |
| 135 | 163 | for _, spu_row in spu_df.iterrows(): |
| ... | ... | @@ -138,8 +166,11 @@ class SPUTransformer: |
| 138 | 166 | # Get SKUs for this SPU |
| 139 | 167 | skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame() |
| 140 | 168 | |
| 169 | + # Get options for this SPU | |
| 170 | + options = option_groups.get_group(spu_id) if option_groups and spu_id in option_groups.groups else pd.DataFrame() | |
| 171 | + | |
| 141 | 172 | # Transform to ES document |
| 142 | - doc = self._transform_spu_to_doc(spu_row, skus) | |
| 173 | + doc = self._transform_spu_to_doc(spu_row, skus, options) | |
| 143 | 174 | if doc: |
| 144 | 175 | documents.append(doc) |
| 145 | 176 | |
| ... | ... | @@ -148,7 +179,8 @@ class SPUTransformer: |
| 148 | 179 | def _transform_spu_to_doc( |
| 149 | 180 | self, |
| 150 | 181 | spu_row: pd.Series, |
| 151 | - skus: pd.DataFrame | |
| 182 | + skus: pd.DataFrame, | |
| 183 | + options: pd.DataFrame | |
| 152 | 184 | ) -> Optional[Dict[str, Any]]: |
| 153 | 185 | """ |
| 154 | 186 | Transform a single SPU row and its SKUs into an ES document. |
| ... | ... | @@ -156,6 +188,7 @@ class SPUTransformer: |
| 156 | 188 | Args: |
| 157 | 189 | spu_row: SPU row from database |
| 158 | 190 | skus: DataFrame with SKUs for this SPU |
| 191 | + options: DataFrame with options for this SPU | |
| 159 | 192 | |
| 160 | 193 | Returns: |
| 161 | 194 | ES document or None if transformation fails |
| ... | ... | @@ -168,41 +201,66 @@ class SPUTransformer: |
| 168 | 201 | # SPU ID |
| 169 | 202 | doc['spu_id'] = str(spu_row['id']) |
| 170 | 203 | |
| 171 | - # Handle | |
| 172 | - if pd.notna(spu_row.get('handle')): | |
| 173 | - doc['handle'] = str(spu_row['handle']) | |
| 174 | - | |
| 175 | - # Title | |
| 204 | + # 文本相关性相关字段(中英文双语,暂时只填充中文) | |
| 176 | 205 | if pd.notna(spu_row.get('title')): |
| 177 | - doc['title'] = str(spu_row['title']) | |
| 206 | + doc['title_zh'] = str(spu_row['title']) | |
| 207 | + doc['title_en'] = None # 暂时设为空 | |
| 178 | 208 | |
| 179 | - # Brief | |
| 180 | 209 | if pd.notna(spu_row.get('brief')): |
| 181 | - doc['brief'] = str(spu_row['brief']) | |
| 210 | + doc['brief_zh'] = str(spu_row['brief']) | |
| 211 | + doc['brief_en'] = None | |
| 182 | 212 | |
| 183 | - # Description | |
| 184 | 213 | if pd.notna(spu_row.get('description')): |
| 185 | - doc['description'] = str(spu_row['description']) | |
| 214 | + doc['description_zh'] = str(spu_row['description']) | |
| 215 | + doc['description_en'] = None | |
| 186 | 216 | |
| 187 | - # SEO fields | |
| 188 | - if pd.notna(spu_row.get('seo_title')): | |
| 189 | - doc['seo_title'] = str(spu_row['seo_title']) | |
| 190 | - if pd.notna(spu_row.get('seo_description')): | |
| 191 | - doc['seo_description'] = str(spu_row['seo_description']) | |
| 192 | - if pd.notna(spu_row.get('seo_keywords')): | |
| 193 | - doc['seo_keywords'] = str(spu_row['seo_keywords']) | |
| 194 | - | |
| 195 | - # Vendor | |
| 196 | 217 | if pd.notna(spu_row.get('vendor')): |
| 197 | - doc['vendor'] = str(spu_row['vendor']) | |
| 218 | + doc['vendor_zh'] = str(spu_row['vendor']) | |
| 219 | + doc['vendor_en'] = None | |
| 198 | 220 | |
| 199 | 221 | # Tags |
| 200 | 222 | if pd.notna(spu_row.get('tags')): |
| 201 | - doc['tags'] = str(spu_row['tags']) | |
| 223 | + # Tags是逗号分隔的字符串,需要转换为数组 | |
| 224 | + tags_str = str(spu_row['tags']) | |
| 225 | + doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()] | |
| 226 | + | |
| 227 | + # Category相关字段 | |
| 228 | + if pd.notna(spu_row.get('category_path')): | |
| 229 | + category_path = str(spu_row['category_path']) | |
| 230 | + doc['category_path_zh'] = category_path | |
| 231 | + doc['category_path_en'] = None # 暂时设为空 | |
| 232 | + | |
| 233 | + # 解析category_path获取多层级分类名称 | |
| 234 | + path_parts = category_path.split('/') | |
| 235 | + if len(path_parts) > 0: | |
| 236 | + doc['category1_name'] = path_parts[0].strip() | |
| 237 | + if len(path_parts) > 1: | |
| 238 | + doc['category2_name'] = path_parts[1].strip() | |
| 239 | + if len(path_parts) > 2: | |
| 240 | + doc['category3_name'] = path_parts[2].strip() | |
| 202 | 241 | |
| 203 | - # Category | |
| 204 | 242 | if pd.notna(spu_row.get('category')): |
| 205 | - doc['category'] = str(spu_row['category']) | |
| 243 | + category_name = str(spu_row['category']) | |
| 244 | + doc['category_name_zh'] = category_name | |
| 245 | + doc['category_name_en'] = None | |
| 246 | + doc['category_name'] = category_name | |
| 247 | + | |
| 248 | + if pd.notna(spu_row.get('category_id')): | |
| 249 | + doc['category_id'] = str(int(spu_row['category_id'])) | |
| 250 | + | |
| 251 | + if pd.notna(spu_row.get('category_level')): | |
| 252 | + doc['category_level'] = int(spu_row['category_level']) | |
| 253 | + | |
| 254 | + # Option名称(从option表获取) | |
| 255 | + if not options.empty: | |
| 256 | + # 按position排序获取option名称 | |
| 257 | + sorted_options = options.sort_values('position') | |
| 258 | + if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')): | |
| 259 | + doc['option1_name'] = str(sorted_options.iloc[0]['name']) | |
| 260 | + if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')): | |
| 261 | + doc['option2_name'] = str(sorted_options.iloc[1]['name']) | |
| 262 | + if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')): | |
| 263 | + doc['option3_name'] = str(sorted_options.iloc[2]['name']) | |
| 206 | 264 | |
| 207 | 265 | # Image URL |
| 208 | 266 | if pd.notna(spu_row.get('image_src')): |
| ... | ... | @@ -211,27 +269,85 @@ class SPUTransformer: |
| 211 | 269 | image_src = f"//{image_src}" if image_src.startswith('//') else image_src |
| 212 | 270 | doc['image_url'] = image_src |
| 213 | 271 | |
| 214 | - # Process SKUs | |
| 272 | + # Process SKUs and build specifications | |
| 215 | 273 | skus_list = [] |
| 216 | 274 | prices = [] |
| 217 | 275 | compare_prices = [] |
| 276 | + sku_prices = [] | |
| 277 | + sku_weights = [] | |
| 278 | + sku_weight_units = [] | |
| 279 | + total_inventory = 0 | |
| 280 | + specifications = [] | |
| 281 | + | |
| 282 | + # 构建option名称映射(position -> name) | |
| 283 | + option_name_map = {} | |
| 284 | + if not options.empty: | |
| 285 | + for _, opt_row in options.iterrows(): | |
| 286 | + position = opt_row.get('position') | |
| 287 | + name = opt_row.get('name') | |
| 288 | + if pd.notna(position) and pd.notna(name): | |
| 289 | + option_name_map[int(position)] = str(name) | |
| 218 | 290 | |
| 219 | 291 | for _, sku_row in skus.iterrows(): |
| 220 | - sku_data = self._transform_sku_row(sku_row) | |
| 292 | + sku_data = self._transform_sku_row(sku_row, option_name_map) | |
| 221 | 293 | if sku_data: |
| 222 | 294 | skus_list.append(sku_data) |
| 295 | + | |
| 296 | + # 收集价格信息 | |
| 223 | 297 | if 'price' in sku_data and sku_data['price'] is not None: |
| 224 | 298 | try: |
| 225 | - prices.append(float(sku_data['price'])) | |
| 299 | + price_val = float(sku_data['price']) | |
| 300 | + prices.append(price_val) | |
| 301 | + sku_prices.append(price_val) | |
| 226 | 302 | except (ValueError, TypeError): |
| 227 | 303 | pass |
| 304 | + | |
| 228 | 305 | if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None: |
| 229 | 306 | try: |
| 230 | 307 | compare_prices.append(float(sku_data['compare_at_price'])) |
| 231 | 308 | except (ValueError, TypeError): |
| 232 | 309 | pass |
| 310 | + | |
| 311 | + # 收集重量信息 | |
| 312 | + if 'weight' in sku_data and sku_data['weight'] is not None: | |
| 313 | + try: | |
| 314 | + sku_weights.append(int(float(sku_data['weight']))) | |
| 315 | + except (ValueError, TypeError): | |
| 316 | + pass | |
| 317 | + | |
| 318 | + if 'weight_unit' in sku_data and sku_data['weight_unit']: | |
| 319 | + sku_weight_units.append(str(sku_data['weight_unit'])) | |
| 320 | + | |
| 321 | + # 收集库存信息 | |
| 322 | + if 'stock' in sku_data and sku_data['stock'] is not None: | |
| 323 | + try: | |
| 324 | + total_inventory += int(sku_data['stock']) | |
| 325 | + except (ValueError, TypeError): | |
| 326 | + pass | |
| 327 | + | |
| 328 | + # 构建specifications(从SKU的option值和option表的name) | |
| 329 | + sku_id = str(sku_row['id']) | |
| 330 | + if pd.notna(sku_row.get('option1')) and 1 in option_name_map: | |
| 331 | + specifications.append({ | |
| 332 | + 'sku_id': sku_id, | |
| 333 | + 'name': option_name_map[1], | |
| 334 | + 'value': str(sku_row['option1']) | |
| 335 | + }) | |
| 336 | + if pd.notna(sku_row.get('option2')) and 2 in option_name_map: | |
| 337 | + specifications.append({ | |
| 338 | + 'sku_id': sku_id, | |
| 339 | + 'name': option_name_map[2], | |
| 340 | + 'value': str(sku_row['option2']) | |
| 341 | + }) | |
| 342 | + if pd.notna(sku_row.get('option3')) and 3 in option_name_map: | |
| 343 | + specifications.append({ | |
| 344 | + 'sku_id': sku_id, | |
| 345 | + 'name': option_name_map[3], | |
| 346 | + 'value': str(sku_row['option3']) | |
| 347 | + }) | |
| 233 | 348 | |
| 234 | 349 | doc['skus'] = skus_list |
| 350 | + doc['specifications'] = specifications | |
| 235 | 351 | |
| 236 | 352 | # Calculate price ranges |
| 237 | 353 | if prices: |
| ... | ... | @@ -246,6 +362,19 @@ class SPUTransformer: |
| 246 | 362 | else: |
| 247 | 363 | doc['compare_at_price'] = None |
| 248 | 364 | |
| 365 | + # SKU扁平化字段 | |
| 366 | + doc['sku_prices'] = sku_prices | |
| 367 | + doc['sku_weights'] = sku_weights | |
| 368 | + doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重 | |
| 369 | + doc['total_inventory'] = total_inventory | |
| 370 | + | |
| 371 | + # Image URL | |
| 372 | + if pd.notna(spu_row.get('image_src')): | |
| 373 | + image_src = str(spu_row['image_src']) | |
| 374 | + if not image_src.startswith('http'): | |
| 375 | + image_src = f"//{image_src}" if image_src.startswith('//') else image_src | |
| 376 | + doc['image_url'] = image_src | |
| 377 | + | |
| 249 | 378 | # Time fields - convert datetime to ISO format string for ES DATE type |
| 250 | 379 | if pd.notna(spu_row.get('create_time')): |
| 251 | 380 | create_time = spu_row['create_time'] |
| ... | ... | @@ -260,29 +389,16 @@ class SPUTransformer: |
| 260 | 389 | doc['update_time'] = update_time.isoformat() |
| 261 | 390 | else: |
| 262 | 391 | doc['update_time'] = str(update_time) |
| 263 | - | |
| 264 | - if pd.notna(spu_row.get('shoplazza_created_at')): | |
| 265 | - shoplazza_created_at = spu_row['shoplazza_created_at'] | |
| 266 | - if hasattr(shoplazza_created_at, 'isoformat'): | |
| 267 | - doc['shoplazza_created_at'] = shoplazza_created_at.isoformat() | |
| 268 | - else: | |
| 269 | - doc['shoplazza_created_at'] = str(shoplazza_created_at) | |
| 270 | - | |
| 271 | - if pd.notna(spu_row.get('shoplazza_updated_at')): | |
| 272 | - shoplazza_updated_at = spu_row['shoplazza_updated_at'] | |
| 273 | - if hasattr(shoplazza_updated_at, 'isoformat'): | |
| 274 | - doc['shoplazza_updated_at'] = shoplazza_updated_at.isoformat() | |
| 275 | - else: | |
| 276 | - doc['shoplazza_updated_at'] = str(shoplazza_updated_at) | |
| 277 | 392 | |
| 278 | 393 | return doc |
| 279 | 394 | |
| 280 | - def _transform_sku_row(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]: | |
| 395 | + def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: | |
| 281 | 396 | """ |
| 282 | 397 | Transform a SKU row into a SKU object. |
| 283 | 398 | |
| 284 | 399 | Args: |
| 285 | 400 | sku_row: SKU row from database |
| 401 | + option_name_map: Mapping from position to option name | |
| 286 | 402 | |
| 287 | 403 | Returns: |
| 288 | 404 | SKU dictionary or None |
| ... | ... | @@ -292,10 +408,6 @@ class SPUTransformer: |
| 292 | 408 | # SKU ID |
| 293 | 409 | sku_data['sku_id'] = str(sku_row['id']) |
| 294 | 410 | |
| 295 | - # Title | |
| 296 | - if pd.notna(sku_row.get('title')): | |
| 297 | - sku_data['title'] = str(sku_row['title']) | |
| 298 | - | |
| 299 | 411 | # Price |
| 300 | 412 | if pd.notna(sku_row.get('price')): |
| 301 | 413 | try: |
| ... | ... | @@ -314,9 +426,9 @@ class SPUTransformer: |
| 314 | 426 | else: |
| 315 | 427 | sku_data['compare_at_price'] = None |
| 316 | 428 | |
| 317 | - # SKU | |
| 429 | + # SKU Code | |
| 318 | 430 | if pd.notna(sku_row.get('sku')): |
| 319 | - sku_data['sku'] = str(sku_row['sku']) | |
| 431 | + sku_data['sku_code'] = str(sku_row['sku']) | |
| 320 | 432 | |
| 321 | 433 | # Stock |
| 322 | 434 | if pd.notna(sku_row.get('inventory_quantity')): |
| ... | ... | @@ -327,17 +439,30 @@ class SPUTransformer: |
| 327 | 439 | else: |
| 328 | 440 | sku_data['stock'] = 0 |
| 329 | 441 | |
| 330 | - # Options (from option1, option2, option3) | |
| 331 | - options = {} | |
| 442 | + # Weight | |
| 443 | + if pd.notna(sku_row.get('weight')): | |
| 444 | + try: | |
| 445 | + sku_data['weight'] = float(sku_row['weight']) | |
| 446 | + except (ValueError, TypeError): | |
| 447 | + sku_data['weight'] = None | |
| 448 | + else: | |
| 449 | + sku_data['weight'] = None | |
| 450 | + | |
| 451 | + # Weight unit | |
| 452 | + if pd.notna(sku_row.get('weight_unit')): | |
| 453 | + sku_data['weight_unit'] = str(sku_row['weight_unit']) | |
| 454 | + | |
| 455 | + # Option values | |
| 332 | 456 | if pd.notna(sku_row.get('option1')): |
| 333 | - options['option1'] = str(sku_row['option1']) | |
| 457 | + sku_data['option1_value'] = str(sku_row['option1']) | |
| 334 | 458 | if pd.notna(sku_row.get('option2')): |
| 335 | - options['option2'] = str(sku_row['option2']) | |
| 459 | + sku_data['option2_value'] = str(sku_row['option2']) | |
| 336 | 460 | if pd.notna(sku_row.get('option3')): |
| 337 | - options['option3'] = str(sku_row['option3']) | |
| 338 | - | |
| 339 | - if options: | |
| 340 | - sku_data['options'] = options | |
| 461 | + sku_data['option3_value'] = str(sku_row['option3']) | |
| 462 | + | |
| 463 | + # Image src | |
| 464 | + if pd.notna(sku_row.get('image_src')): | |
| 465 | + sku_data['image_src'] = str(sku_row['image_src']) | |
| 341 | 466 | |
| 342 | 467 | return sku_data |
| 343 | 468 | ... | ... |