diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index 1c9b6c5..69a4a92 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -299,20 +299,32 @@ class SPUDocumentTransformer: def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): """填充类目相关字段。""" + # 数据质量兜底: + # - 当商品的类目ID在映射中不存在时,视为“不合法类目”,整条类目相关字段都不写入(当成没有类目) + # - 仅记录错误日志,不阻塞索引流程 + if pd.notna(spu_row.get('category_path')): category_path = str(spu_row['category_path']) # 解析category_path - 这是逗号分隔的类目ID列表 category_ids = [cid.strip() for cid in category_path.split(',') if cid.strip()] - - # 将ID映射为名称 + # 将ID映射为名称,如果找不到映射则记录错误并跳过 category_names = [] + missing_ids = [] for cid in category_ids: if cid in self.category_id_to_name: category_names.append(self.category_id_to_name[cid]) else: - logger.error(f"Category ID {cid} not found in mapping for SPU {spu_row['id']} (title: {spu_row.get('title', 'N/A')}), category_path={category_path}") - category_names.append(cid) # 使用ID作为备选 + missing_ids.append(cid) + + # 如果有缺失的类目ID,记录错误日志,不写入类目字段(当成没有类目) + if missing_ids: + logger.error( + f"Category ID(s) not found in mapping for SPU {spu_row.get('id')} " + f"(title: {spu_row.get('title', 'N/A')}), missing_ids={missing_ids}, " + f"category_path={category_path}. Treating as no-category." + ) + return # 构建类目路径字符串(用于搜索) if category_names: -- libgit2 0.21.2