Commit 50170c5a0f915bd2cd0d6e012348417ee380ddcb
1 parent
80519ec6
导入成功。有部分失败 (1/4),原因有:
1. 跟主商品标题不一致 2. 商品图片信息缺失 3. Options 无效
Showing
2 changed files
with
92 additions
and
27 deletions
Show diff stats
docs/亚马逊格式数据转店匠商品导入模板.md
| ... | ... | @@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ |
| 113 | 113 | --input-dir data/mai_jia_jing_ling/products_data \ |
| 114 | 114 | --template docs/商品导入模板.xlsx \ |
| 115 | 115 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ |
| 116 | - --max-files 1 --max-rows-per-file 2000 --max-products 50 | |
| 116 | + --max-files 1 --max-products 50 | |
| 117 | 117 | ``` |
| 118 | 118 | |
| 119 | 119 | ### 性能提示(很重要) |
| 120 | 120 | |
| 121 | 121 | - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 |
| 122 | -- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。 | |
| 123 | -- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write` | |
| 122 | +- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写(只有这一种实现方式,保持简单)。 | |
| 124 | 123 | |
| 125 | 124 | ### 2)生成全量 |
| 126 | 125 | |
| ... | ... | @@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ |
| 131 | 130 | --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx |
| 132 | 131 | ``` |
| 133 | 132 | |
| 133 | +### 3)按 Excel 行数自动拆分文件 | |
| 134 | + | |
| 135 | +当单个导出超过一定行数时,可以通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数(包含模板头部行): | |
| 136 | + | |
| 137 | +```bash | |
| 138 | +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | |
| 139 | + --input-dir data/mai_jia_jing_ling/products_data \ | |
| 140 | + --template docs/商品导入模板.xlsx \ | |
| 141 | + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \ | |
| 142 | + --max-rows-per-output 40000 | |
| 143 | +``` | |
| 144 | + | |
| 145 | +- 若拆分结果只有 1 个文件,仍输出为 `amazon_shoplazza_import_SPLIT.xlsx` | |
| 146 | +- 若需要拆分为多个文件,则追加 `_part1/_part2/...` 后缀: | |
| 147 | + - `amazon_shoplazza_import_SPLIT_part1.xlsx` | |
| 148 | + - `amazon_shoplazza_import_SPLIT_part2.xlsx` | |
| 149 | +- **同一个 SPU(同一父ASIN 的 M+P 或 S 行)保证不会被拆到不同文件中** | |
| 150 | + | |
| 134 | 151 | --- |
| 135 | 152 | |
| 136 | 153 | ## 六、可扩展点(后续常见需求) | ... | ... |
scripts/amazon_xlsx_to_shoplazza_xlsx.py
| ... | ... | @@ -36,7 +36,7 @@ from openpyxl import load_workbook |
| 36 | 36 | |
| 37 | 37 | # Allow running as `python scripts/xxx.py` without installing as a package |
| 38 | 38 | sys.path.insert(0, str(Path(__file__).resolve().parent)) |
| 39 | -from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast | |
| 39 | +from shoplazza_excel_template import create_excel_from_template_fast | |
| 40 | 40 | |
| 41 | 41 | |
| 42 | 42 | PREFERRED_OPTION_KEYS = [ |
| ... | ... | @@ -182,14 +182,6 @@ def build_description_html(title, details, product_url): |
| 182 | 182 | return "".join(parts) |
| 183 | 183 | |
| 184 | 184 | |
| 185 | -def amazon_sheet(ws): | |
| 186 | - headers = [] | |
| 187 | - for c in range(1, ws.max_column + 1): | |
| 188 | - v = ws.cell(1, c).value | |
| 189 | - headers.append(clean_str(v)) | |
| 190 | - return {h: i + 1 for i, h in enumerate(headers) if h} | |
| 191 | - | |
| 192 | - | |
| 193 | 185 | def read_amazon_rows_from_file(xlsx_path, max_rows=None): |
| 194 | 186 | wb = load_workbook(xlsx_path, read_only=True, data_only=True) |
| 195 | 187 | sheet_name = None |
| ... | ... | @@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): |
| 201 | 193 | if sheet_name is None: |
| 202 | 194 | return [] |
| 203 | 195 | ws = wb[sheet_name] |
| 204 | - idx = amazon_sheet(ws) | |
| 196 | + | |
| 197 | + # Build header index from first row | |
| 198 | + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) | |
| 199 | + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)} | |
| 205 | 200 | |
| 206 | 201 | required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", |
| 207 | 202 | "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", |
| ... | ... | @@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): |
| 212 | 207 | |
| 213 | 208 | # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. |
| 214 | 209 | # openpyxl cell access is relatively expensive; values_only is much faster. |
| 215 | - pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple | |
| 210 | + pos = {k: idx[k] for k in required} # 0-based positions in row tuple | |
| 216 | 211 | |
| 217 | 212 | rows = [] |
| 218 | 213 | end_row = ws.max_row |
| ... | ... | @@ -420,10 +415,9 @@ def main(): |
| 420 | 415 | parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") |
| 421 | 416 | parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") |
| 422 | 417 | parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") |
| 423 | - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") | |
| 424 | - parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)") | |
| 418 | + parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)") | |
| 425 | 419 | parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") |
| 426 | - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") | |
| 420 | + parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)") | |
| 427 | 421 | parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") |
| 428 | 422 | args = parser.parse_args() |
| 429 | 423 | |
| ... | ... | @@ -445,7 +439,7 @@ def main(): |
| 445 | 439 | for fp in files: |
| 446 | 440 | print(" - loading: {}".format(fp), flush=True) |
| 447 | 441 | try: |
| 448 | - rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file) | |
| 442 | + rows = read_amazon_rows_from_file(fp) | |
| 449 | 443 | except Exception as e: |
| 450 | 444 | print("WARN: failed to read {}: {}".format(fp, e)) |
| 451 | 445 | continue |
| ... | ... | @@ -461,9 +455,9 @@ def main(): |
| 461 | 455 | |
| 462 | 456 | print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) |
| 463 | 457 | |
| 464 | - excel_rows = [] | |
| 458 | + # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组” | |
| 459 | + group_rows_list = [] # List[List[dict]] | |
| 465 | 460 | spu_count = 0 |
| 466 | - | |
| 467 | 461 | for spu_id, variants in groups.items(): |
| 468 | 462 | if not variants: |
| 469 | 463 | continue |
| ... | ... | @@ -471,15 +465,69 @@ def main(): |
| 471 | 465 | if args.max_products is not None and spu_count > int(args.max_products): |
| 472 | 466 | break |
| 473 | 467 | if len(variants) == 1: |
| 474 | - excel_rows.append(build_s_row(variants[0])) | |
| 468 | + group_rows_list.append([build_s_row(variants[0])]) | |
| 475 | 469 | else: |
| 476 | - excel_rows.extend(build_m_p_rows(variants)) | |
| 477 | - | |
| 478 | - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) | |
| 479 | - if args.no_fast_write: | |
| 480 | - create_excel_from_template(args.template, args.output, excel_rows) | |
| 470 | + group_rows_list.append(build_m_p_rows(variants)) | |
| 471 | + | |
| 472 | + # 按最大行数拆成多个文件(注意:同一 SPU 不拆分) | |
| 473 | + data_start_row = 4 # 与模板/写入工具保持一致 | |
| 474 | + header_rows = data_start_row - 1 # 包含标题行+说明行 | |
| 475 | + max_total_rows = args.max_rows_per_output or 0 | |
| 476 | + if max_total_rows and max_total_rows > header_rows: | |
| 477 | + max_data_rows = max_total_rows - header_rows | |
| 478 | + else: | |
| 479 | + max_data_rows = None # 不限制 | |
| 480 | + | |
| 481 | + chunks = [] | |
| 482 | + current_chunk = [] | |
| 483 | + current_count = 0 | |
| 484 | + | |
| 485 | + if max_data_rows is None: | |
| 486 | + # 不做分片,直接一个 chunk | |
| 487 | + for gr in group_rows_list: | |
| 488 | + current_chunk.extend(gr) | |
| 489 | + if current_chunk: | |
| 490 | + chunks.append(current_chunk) | |
| 481 | 491 | else: |
| 482 | - create_excel_from_template_fast(args.template, args.output, excel_rows) | |
| 492 | + for gr in group_rows_list: | |
| 493 | + gsize = len(gr) | |
| 494 | + # 如果单个 SPU 本身就超过阈值,只能独占一个文件 | |
| 495 | + if gsize > max_data_rows: | |
| 496 | + if current_chunk: | |
| 497 | + chunks.append(current_chunk) | |
| 498 | + current_chunk = [] | |
| 499 | + current_count = 0 | |
| 500 | + chunks.append(gr) | |
| 501 | + continue | |
| 502 | + # 如果放不下当前 chunk,则先封一个,再开新 chunk | |
| 503 | + if current_count + gsize > max_data_rows: | |
| 504 | + if current_chunk: | |
| 505 | + chunks.append(current_chunk) | |
| 506 | + current_chunk = list(gr) | |
| 507 | + current_count = gsize | |
| 508 | + else: | |
| 509 | + current_chunk.extend(gr) | |
| 510 | + current_count += gsize | |
| 511 | + if current_chunk: | |
| 512 | + chunks.append(current_chunk) | |
| 513 | + | |
| 514 | + total_rows = sum(len(c) for c in chunks) | |
| 515 | + print( | |
| 516 | + "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format( | |
| 517 | + total_rows, len(group_rows_list), len(chunks) | |
| 518 | + ), | |
| 519 | + flush=True, | |
| 520 | + ) | |
| 521 | + | |
| 522 | + # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀 | |
| 523 | + base = Path(args.output) | |
| 524 | + stem = base.stem | |
| 525 | + suffix = base.suffix or ".xlsx" | |
| 526 | + | |
| 527 | + for idx, chunk in enumerate(chunks, start=1): | |
| 528 | + out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}")) | |
| 529 | + print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True) | |
| 530 | + create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row) | |
| 483 | 531 | |
| 484 | 532 | |
| 485 | 533 | if __name__ == "__main__": | ... | ... |