From 50170c5a0f915bd2cd0d6e012348417ee380ddcb Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 17 Dec 2025 16:33:18 +0800 Subject: [PATCH] 导入成功。有部分失败 (1/4) 原因有: 1. 跟主商品标题不一致 2. 商品图片信息缺失 3. Options是无效的 --- docs/亚马逊格式数据转店匠商品导入模板.md | 23 ++++++++++++++++++++--- scripts/amazon_xlsx_to_shoplazza_xlsx.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ 2 files changed, 92 insertions(+), 27 deletions(-) diff --git a/docs/亚马逊格式数据转店匠商品导入模板.md b/docs/亚马逊格式数据转店匠商品导入模板.md index 7c60870..d637a89 100644 --- a/docs/亚马逊格式数据转店匠商品导入模板.md +++ b/docs/亚马逊格式数据转店匠商品导入模板.md @@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ --input-dir data/mai_jia_jing_ling/products_data \ --template docs/商品导入模板.xlsx \ --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ - --max-files 1 --max-rows-per-file 2000 --max-products 50 + --max-files 1 --max-products 50 ``` ### 性能提示(很重要) - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 -- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。 -- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write` +- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写(只有这一种实现方式,保持简单)。 ### 2)生成全量 @@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx ``` +### 3)按 Excel 行数自动拆分文件 + +当单个导出超过一定行数时,可以通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数(包含模板头部行): + +```bash +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ + --input-dir data/mai_jia_jing_ling/products_data \ + --template docs/商品导入模板.xlsx \ + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \ + --max-rows-per-output 40000 +``` + +- 若结果只需要 1 个文件,仍输出为 `amazon_shoplazza_import_SPLIT.xlsx` +- 若需要拆分为多个文件,则追加 `_part1/_part2/...` 后缀: + - `amazon_shoplazza_import_SPLIT_part1.xlsx` + - `amazon_shoplazza_import_SPLIT_part2.xlsx` +- **同一个 SPU(同一父ASIN 的 M+P 或 S 行)保证不会被拆到不同文件中** + --- ## 六、可扩展点(后续常见需求) diff --git a/scripts/amazon_xlsx_to_shoplazza_xlsx.py b/scripts/amazon_xlsx_to_shoplazza_xlsx.py index 276a45b..cf87549 100644 --- a/scripts/amazon_xlsx_to_shoplazza_xlsx.py +++ b/scripts/amazon_xlsx_to_shoplazza_xlsx.py @@ -36,7 +36,7 @@ from openpyxl import load_workbook # Allow running as `python scripts/xxx.py` without installing as a package sys.path.insert(0, str(Path(__file__).resolve().parent)) -from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast +from shoplazza_excel_template import create_excel_from_template_fast PREFERRED_OPTION_KEYS = [ @@ -182,14 +182,6 @@ def build_description_html(title, details, product_url): return "".join(parts) -def amazon_sheet(ws): - headers = [] - for c in range(1, ws.max_column + 1): - v = ws.cell(1, c).value - headers.append(clean_str(v)) - return {h: i + 1 for i, h in enumerate(headers) if h} - - def read_amazon_rows_from_file(xlsx_path, max_rows=None): wb = load_workbook(xlsx_path, read_only=True, data_only=True) sheet_name = None @@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): if sheet_name is None: return [] ws = wb[sheet_name] - idx = amazon_sheet(ws) + + # Build header index from first row + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)} required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", @@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. # openpyxl cell access is relatively expensive; values_only is much faster. - pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple + pos = {k: idx[k] for k in required} # 0-based positions in row tuple rows = [] end_row = ws.max_row @@ -420,10 +415,9 @@ def main(): parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") - parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)") + parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)") parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") + parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)") parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") args = parser.parse_args() @@ -445,7 +439,7 @@ def main(): for fp in files: print(" - loading: {}".format(fp), flush=True) try: - rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file) + rows = read_amazon_rows_from_file(fp) except Exception as e: print("WARN: failed to read {}: {}".format(fp, e)) continue @@ -461,9 +455,9 @@ def main(): print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) - excel_rows = [] + # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组” + group_rows_list = [] # List[List[dict]] spu_count = 0 - for spu_id, variants in groups.items(): if not variants: continue @@ -471,15 +465,69 @@ def main(): if args.max_products is not None and spu_count > int(args.max_products): break if len(variants) == 1: - excel_rows.append(build_s_row(variants[0])) + group_rows_list.append([build_s_row(variants[0])]) else: - excel_rows.extend(build_m_p_rows(variants)) - - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) - if args.no_fast_write: - create_excel_from_template(args.template, args.output, excel_rows) + group_rows_list.append(build_m_p_rows(variants)) + + # 按最大行数拆成多个文件(注意:同一 SPU 不拆分) + data_start_row = 4 # 与模板/写入工具保持一致 + header_rows = data_start_row - 1 # 包含标题行+说明行 + max_total_rows = args.max_rows_per_output or 0 + if max_total_rows and max_total_rows > header_rows: + max_data_rows = max_total_rows - header_rows + else: + max_data_rows = None # 不限制 + + chunks = [] + current_chunk = [] + current_count = 0 + + if max_data_rows is None: + # 不做分片,直接一个 chunk + for gr in group_rows_list: + current_chunk.extend(gr) + if current_chunk: + chunks.append(current_chunk) else: - create_excel_from_template_fast(args.template, args.output, excel_rows) + for gr in group_rows_list: + gsize = len(gr) + # 如果单个 SPU 本身就超过阈值,只能独占一个文件 + if gsize > max_data_rows: + if current_chunk: + chunks.append(current_chunk) + current_chunk = [] + current_count = 0 + chunks.append(gr) + continue + # 如果放不下当前 chunk,则先封一个,再开新 chunk + if current_count + gsize > max_data_rows: + if current_chunk: + chunks.append(current_chunk) + current_chunk = list(gr) + current_count = gsize + else: + current_chunk.extend(gr) + current_count += gsize + if current_chunk: + chunks.append(current_chunk) + + total_rows = sum(len(c) for c in chunks) + print( + "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format( + total_rows, len(group_rows_list), len(chunks) + ), + flush=True, + ) + + # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀 + base = Path(args.output) + stem = base.stem + suffix = base.suffix or ".xlsx" + + for idx, chunk in enumerate(chunks, start=1): + out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}")) + print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True) + create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row) if __name__ == "__main__": -- libgit2 0.21.2