diff --git a/config/config.yaml b/config/config.yaml index 6065e8d..c5c95ab 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -29,86 +29,88 @@ fields: store: true return_in_source: true - - name: "handle" - type: "KEYWORD" - index: true - store: true - return_in_source: true - - # 文本搜索字段 - - name: "title" + # 文本相关性相关字段(中英文双语) + - name: "title_zh" type: "TEXT" - analyzer: "chinese_ecommerce" + analyzer: "hanlp_index" + search_analyzer: "hanlp_standard" boost: 3.0 index: true store: true return_in_source: true - - name: "brief" + - name: "brief_zh" type: "TEXT" - analyzer: "chinese_ecommerce" + analyzer: "hanlp_index" + search_analyzer: "hanlp_standard" boost: 1.5 index: true store: true return_in_source: true - - name: "description" + - name: "description_zh" type: "TEXT" - analyzer: "chinese_ecommerce" + analyzer: "hanlp_index" + search_analyzer: "hanlp_standard" boost: 1.0 index: true store: true return_in_source: true - # SEO字段(提升相关性) - - name: "seo_title" + - name: "vendor_zh" type: "TEXT" - analyzer: "chinese_ecommerce" - boost: 2.0 - index: true - store: true - return_in_source: false # SEO字段通常不需要在结果中返回 - - - name: "seo_description" - type: "TEXT" - analyzer: "chinese_ecommerce" + analyzer: "hanlp_index" + search_analyzer: "hanlp_standard" boost: 1.5 index: true store: true - return_in_source: false + return_in_source: true + keyword_subfield: true + keyword_normalizer: "lowercase" - - name: "seo_keywords" + - name: "title_en" type: "TEXT" - analyzer: "chinese_ecommerce" - boost: 2.0 + analyzer: "english" + search_analyzer: "english" + boost: 3.0 index: true store: true - return_in_source: false + return_in_source: true - # 分类和标签字段(TEXT + KEYWORD双重索引) - - name: "vendor" - type: "HKText" - analyzer: "chinese_ecommerce" + - name: "brief_en" + type: "TEXT" + analyzer: "english" + search_analyzer: "english" boost: 1.5 index: true store: true return_in_source: true - - name: "tags" - type: "HKText" - analyzer: "chinese_ecommerce" + - name: 
"description_en" + type: "TEXT" + analyzer: "english" + search_analyzer: "english" boost: 1.0 index: true store: true return_in_source: true - - name: "category" - type: "HKText" - analyzer: "chinese_ecommerce" + - name: "vendor_en" + type: "TEXT" + analyzer: "english" + search_analyzer: "english" boost: 1.5 index: true store: true return_in_source: true + keyword_subfield: true + keyword_normalizer: "lowercase" + + - name: "tags" + type: "KEYWORD" + index: true + store: true + return_in_source: true # 价格字段(扁平化) - name: "min_price" @@ -129,6 +131,30 @@ fields: store: true return_in_source: true + - name: "sku_prices" + type: "FLOAT" + index: true + store: true + return_in_source: true + + - name: "sku_weights" + type: "LONG" + index: true + store: true + return_in_source: true + + - name: "sku_weight_units" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "total_inventory" + type: "LONG" + index: true + store: true + return_in_source: true + # 图片字段(用于显示,不参与搜索) - name: "image_url" type: "KEYWORD" @@ -136,7 +162,7 @@ fields: store: true return_in_source: true - # 文本嵌入字段(用于语义搜索) + # 语义向量 - name: "title_embedding" type: "TEXT_EMBEDDING" embedding_dims: 1024 @@ -145,30 +171,137 @@ fields: store: false return_in_source: false # 嵌入向量通常不需要在结果中返回 - # 时间字段 - - name: "create_time" - type: "DATE" + - name: "image_embedding" + type: "IMAGE_EMBEDDING" + embedding_dims: 1024 + embedding_similarity: "dot_product" + nested: true + index: true + store: false + return_in_source: false + + # 分类相关字段 + - name: "category_path_zh" + type: "TEXT" + analyzer: "hanlp_index" + search_analyzer: "hanlp_standard" + boost: 1.5 index: true store: true return_in_source: true - - name: "update_time" - type: "DATE" + - name: "category_path_en" + type: "TEXT" + analyzer: "english" + search_analyzer: "english" + boost: 1.5 + index: true + store: true + return_in_source: true + + - name: "category_name_zh" + type: "TEXT" + analyzer: "hanlp_index" + search_analyzer: 
"hanlp_standard" + boost: 1.5 + index: true + store: true + return_in_source: true + + - name: "category_name_en" + type: "TEXT" + analyzer: "english" + search_analyzer: "english" + boost: 1.5 index: true store: true return_in_source: true - - name: "shoplazza_created_at" + - name: "category_id" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "category_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "category_level" + type: "INT" + index: true + store: true + return_in_source: true + + - name: "category1_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "category2_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "category3_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + # SKU款式、子sku属性 + - name: "specifications" + type: "JSON" + nested: true + return_in_source: true + nested_properties: + sku_id: + type: "keyword" + index: true + store: true + name: + type: "keyword" + index: true + store: true + value: + type: "keyword" + index: true + store: true + + - name: "option1_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "option2_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + - name: "option3_name" + type: "KEYWORD" + index: true + store: true + return_in_source: true + + # 时间字段 + - name: "create_time" type: "DATE" index: true store: true - return_in_source: false # 通常不需要返回 + return_in_source: true - - name: "shoplazza_updated_at" + - name: "update_time" type: "DATE" index: true store: true - return_in_source: false # 通常不需要返回 + return_in_source: true # 嵌套skus字段 - name: "skus" @@ -180,11 +313,6 @@ fields: type: "keyword" index: true store: true - title: - type: "text" - analyzer: "chinese_ecommerce" - index: true - store: true price: type: "float" index: true @@ -193,7 +321,7 @@ fields: type: "float" index: true 
store: true - sku: + sku_code: type: "keyword" index: true store: true @@ -201,46 +329,65 @@ fields: type: "long" index: true store: true - options: - type: "object" - enabled: true + weight: + type: "float" + index: true + store: true + weight_unit: + type: "keyword" + index: true + store: true + option1_value: + type: "keyword" + index: true + store: true + option2_value: + type: "keyword" + index: true + store: true + option3_value: + type: "keyword" + index: true + store: true + image_src: + type: "keyword" + index: false + store: true # Index Structure (Query Domains) indexes: - name: "default" label: "默认索引" fields: - - "title" - - "brief" - - "description" - - "seo_title" - - "seo_description" - - "seo_keywords" - - "vendor" + - "title_zh" + - "brief_zh" + - "description_zh" + - "vendor_zh" - "tags" - - "category" + - "category_path_zh" + - "category_name_zh" analyzer: "chinese_ecommerce" boost: 1.0 - name: "title" label: "标题索引" fields: - - "title" - - "seo_title" + - "title_zh" analyzer: "chinese_ecommerce" boost: 2.0 - name: "vendor" label: "品牌索引" fields: - - "vendor" + - "vendor_zh" analyzer: "chinese_ecommerce" boost: 1.5 - name: "category" label: "类目索引" fields: - - "category" + - "category_path_zh" + - "category_name_zh" analyzer: "chinese_ecommerce" boost: 1.5 @@ -248,7 +395,6 @@ indexes: label: "标签索引" fields: - "tags" - - "seo_keywords" analyzer: "chinese_ecommerce" boost: 1.0 diff --git a/config/config_loader.py b/config/config_loader.py index f6c9892..e3f4c24 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -313,7 +313,8 @@ class ConfigLoader: nested=field_data.get("nested", False), nested_properties=field_data.get("nested_properties"), keyword_subfield=field_data.get("keyword_subfield", is_hktext), - keyword_ignore_above=field_data.get("keyword_ignore_above", 256) + keyword_ignore_above=field_data.get("keyword_ignore_above", 256), + keyword_normalizer=field_data.get("keyword_normalizer") ) def _parse_index_config(self, 
index_data: Dict[str, Any]) -> IndexConfig: diff --git a/config/field_types.py b/config/field_types.py index c1d3703..a02a875 100644 --- a/config/field_types.py +++ b/config/field_types.py @@ -75,6 +75,7 @@ class FieldConfig: # Hybrid Keyword Text (HKText) support keyword_subfield: bool = False keyword_ignore_above: int = 256 + keyword_normalizer: Optional[str] = None # For keyword subfield normalizer (e.g., "lowercase") def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: @@ -100,18 +101,28 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: if field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE: mapping["analyzer"] = "index_ansj" mapping["search_analyzer"] = "query_ansj" + elif field_config.analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: + # If search_analyzer is explicitly set to CHINESE_ECOMMERCE_QUERY + mapping["analyzer"] = "index_ansj" + mapping["search_analyzer"] = "query_ansj" else: mapping["analyzer"] = field_config.analyzer.value if field_config.search_analyzer: - mapping["search_analyzer"] = field_config.search_analyzer.value + if field_config.search_analyzer == AnalyzerType.CHINESE_ECOMMERCE_QUERY: + mapping["search_analyzer"] = "query_ansj" + else: + mapping["search_analyzer"] = field_config.search_analyzer.value if field_config.keyword_subfield: mapping.setdefault("fields", {}) - mapping["fields"]["keyword"] = { + keyword_field = { "type": "keyword", "ignore_above": field_config.keyword_ignore_above } + if field_config.keyword_normalizer: + keyword_field["normalizer"] = field_config.keyword_normalizer + mapping["fields"]["keyword"] = keyword_field elif field_config.field_type == FieldType.KEYWORD: mapping = { @@ -140,7 +151,7 @@ def get_es_mapping_for_field(field_config: FieldConfig) -> Dict[str, Any]: "similarity": field_config.embedding_similarity }, "url": { - "type": "keyword" + "type": "text" } } } @@ -239,6 +250,22 @@ def get_default_analyzers() -> Dict[str, Any]: "type": "custom", 
"tokenizer": "standard", "filter": ["lowercase", "asciifolding"] + }, + "hanlp_index": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding"] + }, + "hanlp_standard": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding"] + } + }, + "normalizer": { + "lowercase": { + "type": "custom", + "filter": ["lowercase"] } } } @@ -300,6 +327,9 @@ ANALYZER_MAP = { "chinese": AnalyzerType.CHINESE_ECOMMERCE, "chinese_ecommerce": AnalyzerType.CHINESE_ECOMMERCE, "index_ansj": AnalyzerType.CHINESE_ECOMMERCE, + "hanlp_index": AnalyzerType.CHINESE_ECOMMERCE, # Alias for index_ansj + "hanlp_standard": AnalyzerType.CHINESE_ECOMMERCE_QUERY, # Alias for query_ansj + "query_ansj": AnalyzerType.CHINESE_ECOMMERCE_QUERY, "english": AnalyzerType.ENGLISH, "arabic": AnalyzerType.ARABIC, "spanish": AnalyzerType.SPANISH, diff --git a/docs/索引字段说明v2-mapping结构.md b/docs/索引字段说明v2-mapping结构.md new file mode 100644 index 0000000..5698bbe --- /dev/null +++ b/docs/索引字段说明v2-mapping结构.md @@ -0,0 +1,231 @@ +{ + "mappings": { + "properties": { + "tenant_id": { + "type": "keyword" + }, + "spu_id": { + "type": "keyword" + }, + + "create_time": { + "type": "date" + }, + "update_time": { + "type": "date" + }, + + // 文本相关性相关字段 + "title_zh": { + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard" + }, + "brief_zh": { + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard" + }, + "description_zh": { + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard" + }, + "vendor_zh": { + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + + "title_en": { + "type": "text", + "analyzer": "english", + "search_analyzer": "english", + }, + "brief_en": { + "type": "text", + "analyzer": "english", + "search_analyzer": "english", + + }, + 
"description_en": { + "type": "text", + "analyzer": "english", + "search_analyzer": "english", + }, + "vendor_en": { + "type": "text", + "analyzer": "english", + "search_analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + + "tags": { + "type": "keyword", + }, + + "image_url": { + "type": "keyword", + "index": false + }, + + // 语义向量 + "title_embedding": { + "type": "dense_vector", + "dims": 1024, + "index": true, + "similarity": "dot_product" + }, + "image_embedding": { + "type": "nested", + "properties": { + "vector": { + "type": "dense_vector", + "dims": 1024, + "index": true, + "similarity": "dot_product" + }, + "url": { + "type": "text" + } + } + }, + + // 分类相关 + "category_path_zh": { // 提供模糊查询功能,辅助相关性计算 + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard" + }, + "category_path_en": { // 提供模糊查询功能,辅助相关性计算 + "type": "text", + "analyzer": "english", + "search_analyzer": "english" + }, + "category_name_zh": { // 提供模糊查询功能,辅助相关性计算 + "type": "text", + "analyzer": "hanlp_index", + "search_analyzer": "hanlp_standard" + }, + "category_name_en": { // 提供模糊查询功能,辅助相关性计算 + "type": "text", + "analyzer": "english", + "search_analyzer": "english" + }, + + "category_id": { + "type": "keyword" + }, + "category_name": { + "type": "keyword" + }, + "category_level": { + "type": "integer" + }, + "category1_name": { // 不同层级下 可能有同名的情况,因此提供一二三级分开的查询方式 + "type": "keyword" + }, + "category2_name": { + "type": "keyword" + }, + "category3_name": { + "type": "keyword" + }, + + // sku款式、子sku属性 + "specifications": { + "type": "nested", + "properties": { + "sku_id": { "type": "keyword" }, + "name": { "type": "keyword" }, // "颜色", "容量" + "value": { "type": "keyword" } // "白色", "256GB" + } + }, + + "option1_name": { + "type": "keyword" + }, + "option2_name": { + "type": "keyword" + }, + "option3_name": { + "type": "keyword" + }, + + "min_price": { + "type": "float" + }, + "max_price": { + "type": "float" 
+ }, + "compare_at_price": { + "type": "float" + }, + "sku_prices": { + "type": "float" + }, + "sku_weights": { + "type": "long" + }, + "sku_weight_units": { + "type": "keyword" + }, + "total_inventory": { + "type": "long" + }, + + "skus": { + "type": "nested", + "properties": { + "sku_id": { + "type": "keyword" + }, + "price": { + "type": "float" + }, + "compare_at_price": { + "type": "float" + }, + "sku_code": { + "type": "keyword" + }, + "stock": { + "type": "long" + }, + "weight": { + "type": "float" + }, + "weight_unit": { + "type": "keyword" + }, + "option1_value": { + "type": "keyword" + }, + "option2_value": { + "type": "keyword" + }, + "option3_value": { + "type": "keyword" + }, + "image_src": { + "type": "keyword", + "index": false + } + } + } + } + } +} + + diff --git a/docs/索引字段说明v2.md b/docs/索引字段说明v2.md index fe9c00c..8593956 100644 --- a/docs/索引字段说明v2.md +++ b/docs/索引字段说明v2.md @@ -4,247 +4,34 @@ SPU-SKU索引方案选型 除了title, brielf description seo相关 cate tags vendor所有影响相关性的字段都在spu。 sku只有款式、价格、重量、库存等相关属性。所以,可以以spu为单位建立索引。 sku中需要参与搜索的属性(比如价格、库存)展开到spu。 sku的所有需要返回的字段作为nested字段,仅用于返回。 -灌入数据准备 -def build_product_document(product, skus): - # 提取价格列表(转换为float,保留两位小数) - price_list = [float(sku.price) for sku in skus if sku.price is not None] - - # 提取重量信息(重量转为int,单位统一为克;重量+单位拼接为字符串) - weight_list = [int(float(sku.weight) * 1000) for sku in skus if sku.weight is not None] # 转为整数克 - weight_with_unit_list = [f"{sku.weight}{sku.weight_unit}" for sku in skus if sku.weight and sku.weight_unit] - - # 计算库存总和 - total_stock = sum([sku.inventory_quantity for sku in skus if sku.inventory_quantity is not None]) - - # 计算价格区间 - min_price = min(price_list) if price_list else 0.0 - max_price = max(price_list) if price_list else 0.0 - +# 写入 spu 级别索引 +def build_product_document(product, variants): return { "spu_id": str(product.id), "title": product.title, - # SPU级别的选项名称定义(如:颜色、尺码、材质) - "option1_name": getattr(product, 'option1', None), - "option2_name": getattr(product, 'option2', 
None), - "option3_name": getattr(product, 'option3', None), - - # SKU搜索字段(展开) + # Variant搜索字段(展开) # 价格(int)、重量(int)、重量单位拼接重量(keyword),都以list形式灌入 - "sku_prices": price_list, # 所有SKU价格列表,用于范围聚合 - "sku_weights": weight_list, # 重量数值列表(转换为整数克) - "sku_weight_units": weight_with_unit_list, # 重量+单位字符串列表 - - # 库存总和 将SKU的库存加起来作为一个值灌入 - "total_inventory": total_stock, # SKU库存总和 - - # 售价,灌入3个字段:SKU价格列表、最高价、最低价 - "min_price": min_price, # 最低售价 - "max_price": max_price, # 最高售价 - "price_range": { # 价格区间对象,便于范围查询 - "gte": min_price, - "lte": max_price - }, - - # SKU详细信息(nested结构,仅用于返回) - "skus": [ - { - "sku_id": str(sku.id), - "price": float(sku.price) if sku.price else 0.0, - "compare_at_price": float(sku.compare_at_price) if sku.compare_at_price else None, - "sku_code": sku.sku, - "stock": sku.inventory_quantity, - "weight": float(sku.weight) if sku.weight else None, - "weight_unit": sku.weight_unit, - - # SKU级别的选项值(对应SPU的选项名称) - "option1_value": sku.option1, - "option2_value": sku.option2, - "option3_value": sku.option3, + # TODO 按要求补充 + + # 库存总和 将sku的库存加起来作为一个值灌入 + # 售价,灌入3个字段,一个 sku价格 以list形式灌入,一个最高价一个最低价 + # TODO 按要求补充 - "image_src": sku.image_src + # Variant详细信息(用于返回) + "variants": [ + { + "sku_id": str(v.id), + "price": float(v.price), + "options": v.options } - for sku in skus + for v in variants ], - - # 其他SPU级别字段(根据索引文档补充) - "tenant_id": str(product.tenant_id), - "brief": product.brief, - "description": product.description, - "vendor": product.vendor, - "category": product.category, - "tags": product.tags.split(',') if product.tags else [], - "seo_title": product.seo_title, - "seo_description": product.seo_description, - "seo_keywords": product.seo_keywords.split(',') if product.seo_keywords else [], - "image_url": product.image_src, - "create_time": product.create_time.isoformat() if product.create_time else None, - "update_time": product.update_time.isoformat() if product.update_time else None - } - 索引定义 -{ - "mappings": { - "properties": { - "tenant_id": { - "type": 
"keyword" - }, - "spu_id": { - "type": "keyword" - }, - // 文本相关性相关字段 - "title_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "brief_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "description_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "vendor_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase" - } - } - }, - - "title_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english", - }, - "brief_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english", - }, - "description_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english", - }, - "vendor_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase" - } - } - }, - - "tags": { - "type": "keyword", - }, - - - "min_price": { - "type": "float" - }, - "max_price": { - "type": "float" - }, - "compare_at_price": { - "type": "float" - }, - "sku_prices": { - "type": "float" - }, - "sku_weights": { - "type": "long" - }, - "sku_weight_units": { - "type": "keyword" - }, - "total_inventory": { - "type": "long" - }, - - "image_url": { - "type": "keyword", - "index": false - }, - - "title_embedding": { - "type": "dense_vector", - "dims": 1024, - "index": true, - "similarity": "dot_product" - }, - - "create_time": { - "type": "date" - }, - "update_time": { - "type": "date" - }, - - "option1_name": { - "type": "keyword" - }, - "option2_name": { - "type": "keyword" - }, - "option3_name": { - "type": "keyword" - }, - - "skus": { - "type": "nested", - "properties": { - "sku_id": { - "type": "keyword" - }, - "price": { - "type": "float" - }, - "compare_at_price": { - "type": "float" - }, - 
"sku_code": { - "type": "keyword" - }, - "stock": { - "type": "long" - }, - "weight": { - "type": "float" - }, - "weight_unit": { - "type": "keyword" - }, - "option1_value": { - "type": "keyword" - }, - "option2_value": { - "type": "keyword" - }, - "option3_value": { - "type": "keyword" - }, - "image_src": { - "type": "keyword", - "index": false - } - } - } + + "min_price": min(v.price for v in variants), + "max_price": max(v.price for v in variants) } - } -} 1.2 查询方案 对数组字段使用 dis_max,只取最高分,避免累加。 其他重点字段 @@ -333,26 +120,56 @@ S red 1. API 在 SPU 的维度直接返回3个属性定义,存储在 shoplazza_product_option 中: 1. API在 SKU的维度直接返回3个属性值,存储在 shoplazza_product_sku 表的 option 相关的字段中: 5.3 ES索引 -5.3.1 - 3nested,支持超过3个属性(动态)。只用作返回,不能查询。节省索引空间 + "specifications": { "type": "nested", "properties": { - "name": { "type": "keyword","index": false }, - "value": { "type": "keyword","index": false } + "name": { "type": "keyword" }, // "颜色", "容量" + "value": { "type": "keyword" } // "白色", "256GB" } }, -6. SEO相关字段 -6.1 数据源 -SEO标题 SEO描述 SEO URL Handle SEO URL 重定向 SEO关键词 -最多5000字符 最多5000字符 "最多支持输入255字符 - (SEO URL handle只对SEO URL的「URL参数」部分进行更改,即“products/”后的内容,如:products/「URL参数」 - )" "创建URL重定向,访问修改前链接可跳转到修改后的新链接页面 -「Y」:TRUE -「N」:FALSE " 多个关键词请用「英文逗号」隔开 - -6.2 Mysql -6.3 ES索引 -6.3.1 输入数据 -6.3.2 索引方法 \ No newline at end of file + 另外还需要包含一个单独的字段,main_option (即店铺主题装修里面配置的 颜色切换 - 变体名称,也就是列表页商品的子sku显示维度) + "main_option": { "type": "keyword" } +查询指定款式 +{ + "query": { + "nested": { + "path": "specifications", + "query": { + "bool": { + "must": [ + { "term": { "specifications.name": "颜色" } }, + { "term": { "specifications.value": "绿色" } } + ] + } + } + } + } +} +按 name 做分面搜索(聚合) + +{ + "aggs": { + "specs": { + "nested": { "path": "specifications" }, + "aggs": { + "by_name": { + "terms": { + "field": "specifications.name", + "size": 20 + }, + "aggs": { + "value_counts": { + "terms": { + "field": "specifications.value", + "size": 10 + } + } + } + } + } + } + } +} + \ No newline at end of file diff --git 
a/indexer/spu_transformer.py b/indexer/spu_transformer.py
index aa4fcba..6dc64ec 100644
--- a/indexer/spu_transformer.py
+++ b/indexer/spu_transformer.py
@@ -38,12 +38,12 @@ class SPUTransformer:
         """
         query = text("""
             SELECT
-                id, shop_id, shoplazza_id, handle, title, brief, description,
-                spu, vendor, vendor_url, seo_title, seo_description, seo_keywords,
+                id, shop_id, shoplazza_id, title, brief, description,
+                spu, vendor, vendor_url,
                 image_src, image_width, image_height, image_path, image_alt,
-                tags, note, category,
-                shoplazza_created_at, shoplazza_updated_at, tenant_id,
-                creator, create_time, updater, update_time, deleted
+                tags, note, category, category_id, category_google_id,
+                category_level, category_path,
+                tenant_id, creator, create_time, updater, update_time, deleted
             FROM shoplazza_product_spu
             WHERE tenant_id = :tenant_id AND deleted = 0
         """)
@@ -114,6 +114,30 @@ class SPUTransformer:
 
         return df
 
+    def load_option_data(self) -> pd.DataFrame:
+        """
+        Load this tenant's non-deleted option rows from MySQL.
+
+        Returns:
+            DataFrame of shoplazza_product_option rows, ordered by spu_id, position
+        """
+        # `values` is a reserved word in MySQL, so the column must be backtick-quoted.
+        query = text("""
+            SELECT
+                id, spu_id, shop_id, shoplazza_id, shoplazza_product_id,
+                position, name, `values`, tenant_id,
+                creator, create_time, updater, update_time, deleted
+            FROM shoplazza_product_option
+            WHERE tenant_id = :tenant_id AND deleted = 0
+            ORDER BY spu_id, position
+        """)
+
+        with self.db_engine.connect() as conn:
+            df = pd.read_sql(query, conn, params={"tenant_id": self.tenant_id})
+
+        print(f"DEBUG: Loaded {len(df)} option records for tenant_id={self.tenant_id}")
+        return df
+
     def transform_batch(self) -> List[Dict[str, Any]]:
         """
         Transform SPU and SKU data into ES documents.
@@ -124,12 +148,16 @@ class SPUTransformer: # Load data spu_df = self.load_spu_data() sku_df = self.load_sku_data() + option_df = self.load_option_data() if spu_df.empty: return [] # Group SKUs by SPU sku_groups = sku_df.groupby('spu_id') + + # Group options by SPU + option_groups = option_df.groupby('spu_id') if not option_df.empty else None documents = [] for _, spu_row in spu_df.iterrows(): @@ -138,8 +166,11 @@ class SPUTransformer: # Get SKUs for this SPU skus = sku_groups.get_group(spu_id) if spu_id in sku_groups.groups else pd.DataFrame() + # Get options for this SPU + options = option_groups.get_group(spu_id) if option_groups and spu_id in option_groups.groups else pd.DataFrame() + # Transform to ES document - doc = self._transform_spu_to_doc(spu_row, skus) + doc = self._transform_spu_to_doc(spu_row, skus, options) if doc: documents.append(doc) @@ -148,7 +179,8 @@ class SPUTransformer: def _transform_spu_to_doc( self, spu_row: pd.Series, - skus: pd.DataFrame + skus: pd.DataFrame, + options: pd.DataFrame ) -> Optional[Dict[str, Any]]: """ Transform a single SPU row and its SKUs into an ES document. 
@@ -156,6 +188,7 @@ class SPUTransformer: Args: spu_row: SPU row from database skus: DataFrame with SKUs for this SPU + options: DataFrame with options for this SPU Returns: ES document or None if transformation fails @@ -168,41 +201,66 @@ class SPUTransformer: # SPU ID doc['spu_id'] = str(spu_row['id']) - # Handle - if pd.notna(spu_row.get('handle')): - doc['handle'] = str(spu_row['handle']) - - # Title + # 文本相关性相关字段(中英文双语,暂时只填充中文) if pd.notna(spu_row.get('title')): - doc['title'] = str(spu_row['title']) + doc['title_zh'] = str(spu_row['title']) + doc['title_en'] = None # 暂时设为空 - # Brief if pd.notna(spu_row.get('brief')): - doc['brief'] = str(spu_row['brief']) + doc['brief_zh'] = str(spu_row['brief']) + doc['brief_en'] = None - # Description if pd.notna(spu_row.get('description')): - doc['description'] = str(spu_row['description']) + doc['description_zh'] = str(spu_row['description']) + doc['description_en'] = None - # SEO fields - if pd.notna(spu_row.get('seo_title')): - doc['seo_title'] = str(spu_row['seo_title']) - if pd.notna(spu_row.get('seo_description')): - doc['seo_description'] = str(spu_row['seo_description']) - if pd.notna(spu_row.get('seo_keywords')): - doc['seo_keywords'] = str(spu_row['seo_keywords']) - - # Vendor if pd.notna(spu_row.get('vendor')): - doc['vendor'] = str(spu_row['vendor']) + doc['vendor_zh'] = str(spu_row['vendor']) + doc['vendor_en'] = None # Tags if pd.notna(spu_row.get('tags')): - doc['tags'] = str(spu_row['tags']) + # Tags是逗号分隔的字符串,需要转换为数组 + tags_str = str(spu_row['tags']) + doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()] + + # Category相关字段 + if pd.notna(spu_row.get('category_path')): + category_path = str(spu_row['category_path']) + doc['category_path_zh'] = category_path + doc['category_path_en'] = None # 暂时设为空 + + # 解析category_path获取多层级分类名称 + path_parts = category_path.split('/') + if len(path_parts) > 0: + doc['category1_name'] = path_parts[0].strip() + if len(path_parts) > 1: + doc['category2_name'] 
= path_parts[1].strip() + if len(path_parts) > 2: + doc['category3_name'] = path_parts[2].strip() - # Category if pd.notna(spu_row.get('category')): - doc['category'] = str(spu_row['category']) + category_name = str(spu_row['category']) + doc['category_name_zh'] = category_name + doc['category_name_en'] = None + doc['category_name'] = category_name + + if pd.notna(spu_row.get('category_id')): + doc['category_id'] = str(int(spu_row['category_id'])) + + if pd.notna(spu_row.get('category_level')): + doc['category_level'] = int(spu_row['category_level']) + + # Option名称(从option表获取) + if not options.empty: + # 按position排序获取option名称 + sorted_options = options.sort_values('position') + if len(sorted_options) > 0 and pd.notna(sorted_options.iloc[0].get('name')): + doc['option1_name'] = str(sorted_options.iloc[0]['name']) + if len(sorted_options) > 1 and pd.notna(sorted_options.iloc[1].get('name')): + doc['option2_name'] = str(sorted_options.iloc[1]['name']) + if len(sorted_options) > 2 and pd.notna(sorted_options.iloc[2].get('name')): + doc['option3_name'] = str(sorted_options.iloc[2]['name']) # Image URL if pd.notna(spu_row.get('image_src')): @@ -211,27 +269,85 @@ class SPUTransformer: image_src = f"//{image_src}" if image_src.startswith('//') else image_src doc['image_url'] = image_src - # Process SKUs + # Process SKUs and build specifications skus_list = [] prices = [] compare_prices = [] + sku_prices = [] + sku_weights = [] + sku_weight_units = [] + total_inventory = 0 + specifications = [] + + # 构建option名称映射(position -> name) + option_name_map = {} + if not options.empty: + for _, opt_row in options.iterrows(): + position = opt_row.get('position') + name = opt_row.get('name') + if pd.notna(position) and pd.notna(name): + option_name_map[int(position)] = str(name) for _, sku_row in skus.iterrows(): - sku_data = self._transform_sku_row(sku_row) + sku_data = self._transform_sku_row(sku_row, option_name_map) if sku_data: skus_list.append(sku_data) + + # 收集价格信息 if 'price' 
in sku_data and sku_data['price'] is not None: try: - prices.append(float(sku_data['price'])) + price_val = float(sku_data['price']) + prices.append(price_val) + sku_prices.append(price_val) except (ValueError, TypeError): pass + if 'compare_at_price' in sku_data and sku_data['compare_at_price'] is not None: try: compare_prices.append(float(sku_data['compare_at_price'])) except (ValueError, TypeError): pass + + # 收集重量信息 + if 'weight' in sku_data and sku_data['weight'] is not None: + try: + sku_weights.append(int(float(sku_data['weight']))) + except (ValueError, TypeError): + pass + + if 'weight_unit' in sku_data and sku_data['weight_unit']: + sku_weight_units.append(str(sku_data['weight_unit'])) + + # 收集库存信息 + if 'stock' in sku_data and sku_data['stock'] is not None: + try: + total_inventory += int(sku_data['stock']) + except (ValueError, TypeError): + pass + + # 构建specifications(从SKU的option值和option表的name) + sku_id = str(sku_row['id']) + if pd.notna(sku_row.get('option1')) and 1 in option_name_map: + specifications.append({ + 'sku_id': sku_id, + 'name': option_name_map[1], + 'value': str(sku_row['option1']) + }) + if pd.notna(sku_row.get('option2')) and 2 in option_name_map: + specifications.append({ + 'sku_id': sku_id, + 'name': option_name_map[2], + 'value': str(sku_row['option2']) + }) + if pd.notna(sku_row.get('option3')) and 3 in option_name_map: + specifications.append({ + 'sku_id': sku_id, + 'name': option_name_map[3], + 'value': str(sku_row['option3']) + }) doc['skus'] = skus_list + doc['specifications'] = specifications # Calculate price ranges if prices: @@ -246,6 +362,19 @@ class SPUTransformer: else: doc['compare_at_price'] = None + # SKU扁平化字段 + doc['sku_prices'] = sku_prices + doc['sku_weights'] = sku_weights + doc['sku_weight_units'] = list(set(sku_weight_units)) # 去重 + doc['total_inventory'] = total_inventory + + # Image URL + if pd.notna(spu_row.get('image_src')): + image_src = str(spu_row['image_src']) + if not image_src.startswith('http'): + 
image_src = f"//{image_src}" if image_src.startswith('//') else image_src + doc['image_url'] = image_src + # Time fields - convert datetime to ISO format string for ES DATE type if pd.notna(spu_row.get('create_time')): create_time = spu_row['create_time'] @@ -260,29 +389,16 @@ class SPUTransformer: doc['update_time'] = update_time.isoformat() else: doc['update_time'] = str(update_time) - - if pd.notna(spu_row.get('shoplazza_created_at')): - shoplazza_created_at = spu_row['shoplazza_created_at'] - if hasattr(shoplazza_created_at, 'isoformat'): - doc['shoplazza_created_at'] = shoplazza_created_at.isoformat() - else: - doc['shoplazza_created_at'] = str(shoplazza_created_at) - - if pd.notna(spu_row.get('shoplazza_updated_at')): - shoplazza_updated_at = spu_row['shoplazza_updated_at'] - if hasattr(shoplazza_updated_at, 'isoformat'): - doc['shoplazza_updated_at'] = shoplazza_updated_at.isoformat() - else: - doc['shoplazza_updated_at'] = str(shoplazza_updated_at) return doc - def _transform_sku_row(self, sku_row: pd.Series) -> Optional[Dict[str, Any]]: + def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: """ Transform a SKU row into a SKU object. 
Args: sku_row: SKU row from database + option_name_map: Mapping from position to option name Returns: SKU dictionary or None @@ -292,10 +408,6 @@ class SPUTransformer: # SKU ID sku_data['sku_id'] = str(sku_row['id']) - # Title - if pd.notna(sku_row.get('title')): - sku_data['title'] = str(sku_row['title']) - # Price if pd.notna(sku_row.get('price')): try: @@ -314,9 +426,9 @@ class SPUTransformer: else: sku_data['compare_at_price'] = None - # SKU + # SKU Code if pd.notna(sku_row.get('sku')): - sku_data['sku'] = str(sku_row['sku']) + sku_data['sku_code'] = str(sku_row['sku']) # Stock if pd.notna(sku_row.get('inventory_quantity')): @@ -327,17 +439,30 @@ class SPUTransformer: else: sku_data['stock'] = 0 - # Options (from option1, option2, option3) - options = {} + # Weight + if pd.notna(sku_row.get('weight')): + try: + sku_data['weight'] = float(sku_row['weight']) + except (ValueError, TypeError): + sku_data['weight'] = None + else: + sku_data['weight'] = None + + # Weight unit + if pd.notna(sku_row.get('weight_unit')): + sku_data['weight_unit'] = str(sku_row['weight_unit']) + + # Option values if pd.notna(sku_row.get('option1')): - options['option1'] = str(sku_row['option1']) + sku_data['option1_value'] = str(sku_row['option1']) if pd.notna(sku_row.get('option2')): - options['option2'] = str(sku_row['option2']) + sku_data['option2_value'] = str(sku_row['option2']) if pd.notna(sku_row.get('option3')): - options['option3'] = str(sku_row['option3']) - - if options: - sku_data['options'] = options + sku_data['option3_value'] = str(sku_row['option3']) + + # Image src + if pd.notna(sku_row.get('image_src')): + sku_data['image_src'] = str(sku_row['image_src']) return sku_data -- libgit2 0.21.2