Commit 50170c5a0f915bd2cd0d6e012348417ee380ddcb
1 parent
80519ec6
导入成功。有部分失败(1/4),原因有:
1. 跟主商品标题不一致 2. 商品图片信息缺失 3. Options 是无效的
Showing
2 changed files
with
92 additions
and
27 deletions
Show diff stats
docs/亚马逊格式数据转店匠商品导入模板.md
| @@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | @@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | ||
| 113 | --input-dir data/mai_jia_jing_ling/products_data \ | 113 | --input-dir data/mai_jia_jing_ling/products_data \ |
| 114 | --template docs/商品导入模板.xlsx \ | 114 | --template docs/商品导入模板.xlsx \ |
| 115 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ | 115 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ |
| 116 | - --max-files 1 --max-rows-per-file 2000 --max-products 50 | 116 | + --max-files 1 --max-products 50 |
| 117 | ``` | 117 | ``` |
| 118 | 118 | ||
| 119 | ### 性能提示(很重要) | 119 | ### 性能提示(很重要) |
| 120 | 120 | ||
| 121 | - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 | 121 | - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 |
| 122 | -- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。 | ||
| 123 | -- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write` | 122 | +- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写(只有这一种实现方式,保持简单)。 |
| 124 | 123 | ||
| 125 | ### 2)生成全量 | 124 | ### 2)生成全量 |
| 126 | 125 | ||
| @@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | @@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | ||
| 131 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx | 130 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx |
| 132 | ``` | 131 | ``` |
| 133 | 132 | ||
| 133 | +### 3)按 Excel 行数自动拆分文件 | ||
| 134 | + | ||
| 135 | +当单个导出超过一定行数时,可以通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数(包含模板头部行): | ||
| 136 | + | ||
| 137 | +```bash | ||
| 138 | +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | ||
| 139 | + --input-dir data/mai_jia_jing_ling/products_data \ | ||
| 140 | + --template docs/商品导入模板.xlsx \ | ||
| 141 | + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \ | ||
| 142 | + --max-rows-per-output 40000 | ||
| 143 | +``` | ||
| 144 | + | ||
| 145 | +- 若结果只需要 1 个文件,仍输出为 `amazon_shoplazza_import_SPLIT.xlsx` | ||
| 146 | +- 若需要拆分为多个文件,则追加 `_part1/_part2/...` 后缀: | ||
| 147 | + - `amazon_shoplazza_import_SPLIT_part1.xlsx` | ||
| 148 | + - `amazon_shoplazza_import_SPLIT_part2.xlsx` | ||
| 149 | +- **同一个 SPU(同一父ASIN 的 M+P 或 S 行)保证不会被拆到不同文件中** | ||
| 150 | + | ||
| 134 | --- | 151 | --- |
| 135 | 152 | ||
| 136 | ## 六、可扩展点(后续常见需求) | 153 | ## 六、可扩展点(后续常见需求) |
scripts/amazon_xlsx_to_shoplazza_xlsx.py
| @@ -36,7 +36,7 @@ from openpyxl import load_workbook | @@ -36,7 +36,7 @@ from openpyxl import load_workbook | ||
| 36 | 36 | ||
| 37 | # Allow running as `python scripts/xxx.py` without installing as a package | 37 | # Allow running as `python scripts/xxx.py` without installing as a package |
| 38 | sys.path.insert(0, str(Path(__file__).resolve().parent)) | 38 | sys.path.insert(0, str(Path(__file__).resolve().parent)) |
| 39 | -from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast | 39 | +from shoplazza_excel_template import create_excel_from_template_fast |
| 40 | 40 | ||
| 41 | 41 | ||
| 42 | PREFERRED_OPTION_KEYS = [ | 42 | PREFERRED_OPTION_KEYS = [ |
| @@ -182,14 +182,6 @@ def build_description_html(title, details, product_url): | @@ -182,14 +182,6 @@ def build_description_html(title, details, product_url): | ||
| 182 | return "".join(parts) | 182 | return "".join(parts) |
| 183 | 183 | ||
| 184 | 184 | ||
| 185 | -def amazon_sheet(ws): | ||
| 186 | - headers = [] | ||
| 187 | - for c in range(1, ws.max_column + 1): | ||
| 188 | - v = ws.cell(1, c).value | ||
| 189 | - headers.append(clean_str(v)) | ||
| 190 | - return {h: i + 1 for i, h in enumerate(headers) if h} | ||
| 191 | - | ||
| 192 | - | ||
| 193 | def read_amazon_rows_from_file(xlsx_path, max_rows=None): | 185 | def read_amazon_rows_from_file(xlsx_path, max_rows=None): |
| 194 | wb = load_workbook(xlsx_path, read_only=True, data_only=True) | 186 | wb = load_workbook(xlsx_path, read_only=True, data_only=True) |
| 195 | sheet_name = None | 187 | sheet_name = None |
| @@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | @@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | ||
| 201 | if sheet_name is None: | 193 | if sheet_name is None: |
| 202 | return [] | 194 | return [] |
| 203 | ws = wb[sheet_name] | 195 | ws = wb[sheet_name] |
| 204 | - idx = amazon_sheet(ws) | 196 | + |
| 197 | + # Build header index from first row | ||
| 198 | + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) | ||
| 199 | + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)} | ||
| 205 | 200 | ||
| 206 | required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", | 201 | required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", |
| 207 | "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", | 202 | "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", |
| @@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | @@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | ||
| 212 | 207 | ||
| 213 | # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. | 208 | # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. |
| 214 | # openpyxl cell access is relatively expensive; values_only is much faster. | 209 | # openpyxl cell access is relatively expensive; values_only is much faster. |
| 215 | - pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple | 210 | + pos = {k: idx[k] for k in required} # 0-based positions in row tuple |
| 216 | 211 | ||
| 217 | rows = [] | 212 | rows = [] |
| 218 | end_row = ws.max_row | 213 | end_row = ws.max_row |
| @@ -420,10 +415,9 @@ def main(): | @@ -420,10 +415,9 @@ def main(): | ||
| 420 | parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") | 415 | parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") |
| 421 | parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") | 416 | parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") |
| 422 | parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") | 417 | parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") |
| 423 | - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") | ||
| 424 | - parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)") | 418 | + parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)") |
| 425 | parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") | 419 | parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") |
| 426 | - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") | 420 | + parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)") |
| 427 | parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") | 421 | parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") |
| 428 | args = parser.parse_args() | 422 | args = parser.parse_args() |
| 429 | 423 | ||
| @@ -445,7 +439,7 @@ def main(): | @@ -445,7 +439,7 @@ def main(): | ||
| 445 | for fp in files: | 439 | for fp in files: |
| 446 | print(" - loading: {}".format(fp), flush=True) | 440 | print(" - loading: {}".format(fp), flush=True) |
| 447 | try: | 441 | try: |
| 448 | - rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file) | 442 | + rows = read_amazon_rows_from_file(fp) |
| 449 | except Exception as e: | 443 | except Exception as e: |
| 450 | print("WARN: failed to read {}: {}".format(fp, e)) | 444 | print("WARN: failed to read {}: {}".format(fp, e)) |
| 451 | continue | 445 | continue |
| @@ -461,9 +455,9 @@ def main(): | @@ -461,9 +455,9 @@ def main(): | ||
| 461 | 455 | ||
| 462 | print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) | 456 | print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) |
| 463 | 457 | ||
| 464 | - excel_rows = [] | 458 | + # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组” |
| 459 | + group_rows_list = [] # List[List[dict]] | ||
| 465 | spu_count = 0 | 460 | spu_count = 0 |
| 466 | - | ||
| 467 | for spu_id, variants in groups.items(): | 461 | for spu_id, variants in groups.items(): |
| 468 | if not variants: | 462 | if not variants: |
| 469 | continue | 463 | continue |
| @@ -471,15 +465,69 @@ def main(): | @@ -471,15 +465,69 @@ def main(): | ||
| 471 | if args.max_products is not None and spu_count > int(args.max_products): | 465 | if args.max_products is not None and spu_count > int(args.max_products): |
| 472 | break | 466 | break |
| 473 | if len(variants) == 1: | 467 | if len(variants) == 1: |
| 474 | - excel_rows.append(build_s_row(variants[0])) | 468 | + group_rows_list.append([build_s_row(variants[0])]) |
| 475 | else: | 469 | else: |
| 476 | - excel_rows.extend(build_m_p_rows(variants)) | ||
| 477 | - | ||
| 478 | - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) | ||
| 479 | - if args.no_fast_write: | ||
| 480 | - create_excel_from_template(args.template, args.output, excel_rows) | 470 | + group_rows_list.append(build_m_p_rows(variants)) |
| 471 | + | ||
| 472 | + # 按最大行数拆成多个文件(注意:同一 SPU 不拆分) | ||
| 473 | + data_start_row = 4 # 与模板/写入工具保持一致 | ||
| 474 | + header_rows = data_start_row - 1 # 包含标题行+说明行 | ||
| 475 | + max_total_rows = args.max_rows_per_output or 0 | ||
| 476 | + if max_total_rows and max_total_rows > header_rows: | ||
| 477 | + max_data_rows = max_total_rows - header_rows | ||
| 478 | + else: | ||
| 479 | + max_data_rows = None # 不限制 | ||
| 480 | + | ||
| 481 | + chunks = [] | ||
| 482 | + current_chunk = [] | ||
| 483 | + current_count = 0 | ||
| 484 | + | ||
| 485 | + if max_data_rows is None: | ||
| 486 | + # 不做分片,直接一个 chunk | ||
| 487 | + for gr in group_rows_list: | ||
| 488 | + current_chunk.extend(gr) | ||
| 489 | + if current_chunk: | ||
| 490 | + chunks.append(current_chunk) | ||
| 481 | else: | 491 | else: |
| 482 | - create_excel_from_template_fast(args.template, args.output, excel_rows) | 492 | + for gr in group_rows_list: |
| 493 | + gsize = len(gr) | ||
| 494 | + # 如果单个 SPU 本身就超过阈值,只能独占一个文件 | ||
| 495 | + if gsize > max_data_rows: | ||
| 496 | + if current_chunk: | ||
| 497 | + chunks.append(current_chunk) | ||
| 498 | + current_chunk = [] | ||
| 499 | + current_count = 0 | ||
| 500 | + chunks.append(gr) | ||
| 501 | + continue | ||
| 502 | + # 如果放不下当前 chunk,则先封一个,再开新 chunk | ||
| 503 | + if current_count + gsize > max_data_rows: | ||
| 504 | + if current_chunk: | ||
| 505 | + chunks.append(current_chunk) | ||
| 506 | + current_chunk = list(gr) | ||
| 507 | + current_count = gsize | ||
| 508 | + else: | ||
| 509 | + current_chunk.extend(gr) | ||
| 510 | + current_count += gsize | ||
| 511 | + if current_chunk: | ||
| 512 | + chunks.append(current_chunk) | ||
| 513 | + | ||
| 514 | + total_rows = sum(len(c) for c in chunks) | ||
| 515 | + print( | ||
| 516 | + "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format( | ||
| 517 | + total_rows, len(group_rows_list), len(chunks) | ||
| 518 | + ), | ||
| 519 | + flush=True, | ||
| 520 | + ) | ||
| 521 | + | ||
| 522 | + # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀 | ||
| 523 | + base = Path(args.output) | ||
| 524 | + stem = base.stem | ||
| 525 | + suffix = base.suffix or ".xlsx" | ||
| 526 | + | ||
| 527 | + for idx, chunk in enumerate(chunks, start=1): | ||
| 528 | + out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}")) | ||
| 529 | + print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True) | ||
| 530 | + create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row) | ||
| 483 | 531 | ||
| 484 | 532 | ||
| 485 | if __name__ == "__main__": | 533 | if __name__ == "__main__": |