From 50170c5a0f915bd2cd0d6e012348417ee380ddcb Mon Sep 17 00:00:00 2001
From: tangwang <tangwang@essa.top>
Date: Wed, 17 Dec 2025 16:33:18 +0800
Subject: [PATCH] 导入成功。有部分失败 （1/4） 原因有： 1. 跟主商品标题不一致 2. 商品图片信息缺失 3. Options是无效的

---
 docs/亚马逊格式数据转店匠商品导入模板.md | 23 ++++++++++++++++++++---
 scripts/amazon_xlsx_to_shoplazza_xlsx.py                 | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
 2 files changed, 92 insertions(+), 27 deletions(-)

diff --git a/docs/亚马逊格式数据转店匠商品导入模板.md b/docs/亚马逊格式数据转店匠商品导入模板.md
index 7c60870..d637a89 100644
--- a/docs/亚马逊格式数据转店匠商品导入模板.md
+++ b/docs/亚马逊格式数据转店匠商品导入模板.md
@@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
   --input-dir data/mai_jia_jing_ling/products_data \
   --template docs/商品导入模板.xlsx \
   --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \
-  --max-files 1 --max-rows-per-file 2000 --max-products 50
+  --max-files 1 --max-products 50
 ```
 
 ### 性能提示（很重要）
 
 - 旧实现如果用 `ws.cell()` 逐格读取/写入，处理 1 个 xlsx 就可能非常慢（分钟级甚至更久）。
-- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取，并默认启用 **fast writer**（写出时不逐格写模板）。
-- 如需使用慢速的“按模板逐格写入”（不推荐），可加：`--no-fast-write`
+- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写（只有这一种实现方式，保持简单）。
 
 ### 2）生成全量
 
@@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
   --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx
 ```
 
+### 3）按 Excel 行数自动拆分文件
+
+通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数（包含模板头部行，默认 40000）；导出结果超过该行数时会自动拆分为多个文件：
+
+```bash
+python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
+  --input-dir data/mai_jia_jing_ling/products_data \
+  --template docs/商品导入模板.xlsx \
+  --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \
+  --max-rows-per-output 40000
+```
+
+- 若结果只需要 1 个文件，仍输出为 `amazon_shoplazza_import_SPLIT.xlsx`
+- 若需要拆分为多个文件，则追加 `_part1/_part2/...` 后缀：
+  - `amazon_shoplazza_import_SPLIT_part1.xlsx`
+  - `amazon_shoplazza_import_SPLIT_part2.xlsx`
+- **同一个 SPU（同一父ASIN 的 M+P 或 S 行）保证不会被拆到不同文件中**；若单个 SPU 的行数本身已超过上限，则该 SPU 独占一个文件（该文件的行数会超过上限）
+
 ---
 
 ## 六、可扩展点（后续常见需求）
diff --git a/scripts/amazon_xlsx_to_shoplazza_xlsx.py b/scripts/amazon_xlsx_to_shoplazza_xlsx.py
index 276a45b..cf87549 100644
--- a/scripts/amazon_xlsx_to_shoplazza_xlsx.py
+++ b/scripts/amazon_xlsx_to_shoplazza_xlsx.py
@@ -36,7 +36,7 @@ from openpyxl import load_workbook
 
 # Allow running as `python scripts/xxx.py` without installing as a package
 sys.path.insert(0, str(Path(__file__).resolve().parent))
-from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast
+from shoplazza_excel_template import create_excel_from_template_fast
 
 
 PREFERRED_OPTION_KEYS = [
@@ -182,14 +182,6 @@ def build_description_html(title, details, product_url):
     return "".join(parts)
 
 
-def amazon_sheet(ws):
-    headers = []
-    for c in range(1, ws.max_column + 1):
-        v = ws.cell(1, c).value
-        headers.append(clean_str(v))
-    return {h: i + 1 for i, h in enumerate(headers) if h}
-
-
 def read_amazon_rows_from_file(xlsx_path, max_rows=None):
     wb = load_workbook(xlsx_path, read_only=True, data_only=True)
     sheet_name = None
@@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
     if sheet_name is None:
         return []
     ws = wb[sheet_name]
-    idx = amazon_sheet(ws)
+
+    # Build header index from first row
+    header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True))
+    idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)}
 
     required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)",
                 "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接",
@@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
 
     # OPT: use iter_rows(values_only=True) instead of ws.cell() per field.
     # openpyxl cell access is relatively expensive; values_only is much faster.
-    pos = {k: idx[k] - 1 for k in required}  # 0-based positions in row tuple
+    pos = {k: idx[k] for k in required}  # 0-based positions in row tuple
 
     rows = []
     end_row = ws.max_row
@@ -420,10 +415,9 @@ def main():
     parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx")
     parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
     parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
-    parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")
-    parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)")
+    parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)")
     parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
-    parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")
+    parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行，默认40000)")
     parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
     args = parser.parse_args()
 
@@ -445,7 +439,7 @@ def main():
     for fp in files:
         print("  - loading: {}".format(fp), flush=True)
         try:
-            rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file)
+            rows = read_amazon_rows_from_file(fp)
         except Exception as e:
             print("WARN: failed to read {}: {}".format(fp, e))
             continue
@@ -461,9 +455,9 @@ def main():
 
     print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)
 
-    excel_rows = []
+    # 先按 SPU 构造每个组的行，方便做“按最大行数拆分但不拆组”
+    group_rows_list = []  # List[List[dict]]
     spu_count = 0
-
     for spu_id, variants in groups.items():
         if not variants:
             continue
@@ -471,15 +465,69 @@ def main():
         if args.max_products is not None and spu_count > int(args.max_products):
             break
         if len(variants) == 1:
-            excel_rows.append(build_s_row(variants[0]))
+            group_rows_list.append([build_s_row(variants[0])])
         else:
-            excel_rows.extend(build_m_p_rows(variants))
-
-    print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)
-    if args.no_fast_write:
-        create_excel_from_template(args.template, args.output, excel_rows)
+            group_rows_list.append(build_m_p_rows(variants))
+
+    # 按最大行数拆成多个文件（注意：同一 SPU 不拆分）
+    data_start_row = 4  # 与模板/写入工具保持一致
+    header_rows = data_start_row - 1  # 包含标题行+说明行
+    max_total_rows = args.max_rows_per_output or 0
+    if max_total_rows and max_total_rows > header_rows:
+        max_data_rows = max_total_rows - header_rows
+    else:
+        max_data_rows = None  # 不限制
+
+    chunks = []
+    current_chunk = []
+    current_count = 0
+
+    if max_data_rows is None:
+        # 不做分片，直接一个 chunk
+        for gr in group_rows_list:
+            current_chunk.extend(gr)
+        if current_chunk:
+            chunks.append(current_chunk)
     else:
-        create_excel_from_template_fast(args.template, args.output, excel_rows)
+        for gr in group_rows_list:
+            gsize = len(gr)
+            # 如果单个 SPU 本身就超过阈值，只能独占一个文件
+            if gsize > max_data_rows:
+                if current_chunk:
+                    chunks.append(current_chunk)
+                    current_chunk = []
+                    current_count = 0
+                chunks.append(gr)
+                continue
+            # 如果放不下当前 chunk，则先封一个，再开新 chunk
+            if current_count + gsize > max_data_rows:
+                if current_chunk:
+                    chunks.append(current_chunk)
+                current_chunk = list(gr)
+                current_count = gsize
+            else:
+                current_chunk.extend(gr)
+                current_count += gsize
+        if current_chunk:
+            chunks.append(current_chunk)
+
+    total_rows = sum(len(c) for c in chunks)
+    print(
+        "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format(
+            total_rows, len(group_rows_list), len(chunks)
+        ),
+        flush=True,
+    )
+
+    # 输出多个文件：如果只一个 chunk，直接用指定 output；多个则加 _partN 后缀
+    base = Path(args.output)
+    stem = base.stem
+    suffix = base.suffix or ".xlsx"
+
+    for idx, chunk in enumerate(chunks, start=1):
+        out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}"))
+        print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True)
+        create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row)
 
 
 if __name__ == "__main__":
--
libgit2 0.21.2