Commit 50170c5a0f915bd2cd0d6e012348417ee380ddcb

Authored by tangwang
1 parent 80519ec6

导入成功,有部分失败(1/4),原因如下:

1. 跟主商品标题不一致
2. 商品图片信息缺失
3. Options是无效的
docs/亚马逊格式数据转店匠商品导入模板.md
... ... @@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
113 113 --input-dir data/mai_jia_jing_ling/products_data \
114 114 --template docs/商品导入模板.xlsx \
115 115 --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \
116   - --max-files 1 --max-rows-per-file 2000 --max-products 50
  116 + --max-files 1 --max-products 50
117 117 ```
118 118  
119 119 ### 性能提示(很重要)
120 120  
121 121 - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。
122   -- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。
123   -- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write`
  122 +- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写(只有这一种实现方式,保持简单)。
124 123  
125 124 ### 2)生成全量
126 125  
... ... @@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
131 130 --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx
132 131 ```
133 132  
  133 +### 3)按 Excel 行数自动拆分文件
  134 +
  135 +当单个导出超过一定行数时,可以通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数(包含模板头部行):
  136 +
  137 +```bash
  138 +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
  139 + --input-dir data/mai_jia_jing_ling/products_data \
  140 + --template docs/商品导入模板.xlsx \
  141 + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \
  142 + --max-rows-per-output 40000
  143 +```
  144 +
  145 +- 若结果只需要 1 个文件,仍输出为 `amazon_shoplazza_import_SPLIT.xlsx`
  146 +- 若需要拆分为多个文件,则追加 `_part1/_part2/...` 后缀:
  147 + - `amazon_shoplazza_import_SPLIT_part1.xlsx`
  148 + - `amazon_shoplazza_import_SPLIT_part2.xlsx`
  149 +- **同一个 SPU(同一父ASIN 的 M+P 或 S 行)保证不会被拆到不同文件中**
  150 +
134 151 ---
135 152  
136 153 ## 六、可扩展点(后续常见需求)
... ...
scripts/amazon_xlsx_to_shoplazza_xlsx.py
... ... @@ -36,7 +36,7 @@ from openpyxl import load_workbook
36 36  
37 37 # Allow running as `python scripts/xxx.py` without installing as a package
38 38 sys.path.insert(0, str(Path(__file__).resolve().parent))
39   -from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast
  39 +from shoplazza_excel_template import create_excel_from_template_fast
40 40  
41 41  
42 42 PREFERRED_OPTION_KEYS = [
... ... @@ -182,14 +182,6 @@ def build_description_html(title, details, product_url):
182 182 return "".join(parts)
183 183  
184 184  
185   -def amazon_sheet(ws):
186   - headers = []
187   - for c in range(1, ws.max_column + 1):
188   - v = ws.cell(1, c).value
189   - headers.append(clean_str(v))
190   - return {h: i + 1 for i, h in enumerate(headers) if h}
191   -
192   -
193 185 def read_amazon_rows_from_file(xlsx_path, max_rows=None):
194 186 wb = load_workbook(xlsx_path, read_only=True, data_only=True)
195 187 sheet_name = None
... ... @@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
201 193 if sheet_name is None:
202 194 return []
203 195 ws = wb[sheet_name]
204   - idx = amazon_sheet(ws)
  196 +
  197 + # Build header index from first row
  198 + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True))
  199 + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)}
205 200  
206 201 required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)",
207 202 "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接",
... ... @@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
212 207  
213 208 # OPT: use iter_rows(values_only=True) instead of ws.cell() per field.
214 209 # openpyxl cell access is relatively expensive; values_only is much faster.
215   - pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple
  210 + pos = {k: idx[k] for k in required} # 0-based positions in row tuple
216 211  
217 212 rows = []
218 213 end_row = ws.max_row
... ... @@ -420,10 +415,9 @@ def main():
420 415 parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx")
421 416 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
422 417 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
423   - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")
424   - parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)")
  418 + parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)")
425 419 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
426   - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")
  420 + parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)")
427 421 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
428 422 args = parser.parse_args()
429 423  
... ... @@ -445,7 +439,7 @@ def main():
445 439 for fp in files:
446 440 print(" - loading: {}".format(fp), flush=True)
447 441 try:
448   - rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file)
  442 + rows = read_amazon_rows_from_file(fp)
449 443 except Exception as e:
450 444 print("WARN: failed to read {}: {}".format(fp, e))
451 445 continue
... ... @@ -461,9 +455,9 @@ def main():
461 455  
462 456 print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)
463 457  
464   - excel_rows = []
  458 + # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组”
  459 + group_rows_list = [] # List[List[dict]]
465 460 spu_count = 0
466   -
467 461 for spu_id, variants in groups.items():
468 462 if not variants:
469 463 continue
... ... @@ -471,15 +465,69 @@ def main():
471 465 if args.max_products is not None and spu_count > int(args.max_products):
472 466 break
473 467 if len(variants) == 1:
474   - excel_rows.append(build_s_row(variants[0]))
  468 + group_rows_list.append([build_s_row(variants[0])])
475 469 else:
476   - excel_rows.extend(build_m_p_rows(variants))
477   -
478   - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)
479   - if args.no_fast_write:
480   - create_excel_from_template(args.template, args.output, excel_rows)
  470 + group_rows_list.append(build_m_p_rows(variants))
  471 +
  472 + # 按最大行数拆成多个文件(注意:同一 SPU 不拆分)
  473 + data_start_row = 4 # 与模板/写入工具保持一致
  474 + header_rows = data_start_row - 1 # 包含标题行+说明行
  475 + max_total_rows = args.max_rows_per_output or 0
  476 + if max_total_rows and max_total_rows > header_rows:
  477 + max_data_rows = max_total_rows - header_rows
  478 + else:
  479 + max_data_rows = None # 不限制
  480 +
  481 + chunks = []
  482 + current_chunk = []
  483 + current_count = 0
  484 +
  485 + if max_data_rows is None:
  486 + # 不做分片,直接一个 chunk
  487 + for gr in group_rows_list:
  488 + current_chunk.extend(gr)
  489 + if current_chunk:
  490 + chunks.append(current_chunk)
481 491 else:
482   - create_excel_from_template_fast(args.template, args.output, excel_rows)
  492 + for gr in group_rows_list:
  493 + gsize = len(gr)
  494 + # 如果单个 SPU 本身就超过阈值,只能独占一个文件
  495 + if gsize > max_data_rows:
  496 + if current_chunk:
  497 + chunks.append(current_chunk)
  498 + current_chunk = []
  499 + current_count = 0
  500 + chunks.append(gr)
  501 + continue
  502 + # 如果放不下当前 chunk,则先封一个,再开新 chunk
  503 + if current_count + gsize > max_data_rows:
  504 + if current_chunk:
  505 + chunks.append(current_chunk)
  506 + current_chunk = list(gr)
  507 + current_count = gsize
  508 + else:
  509 + current_chunk.extend(gr)
  510 + current_count += gsize
  511 + if current_chunk:
  512 + chunks.append(current_chunk)
  513 +
  514 + total_rows = sum(len(c) for c in chunks)
  515 + print(
  516 + "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format(
  517 + total_rows, len(group_rows_list), len(chunks)
  518 + ),
  519 + flush=True,
  520 + )
  521 +
  522 + # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀
  523 + base = Path(args.output)
  524 + stem = base.stem
  525 + suffix = base.suffix or ".xlsx"
  526 +
  527 + for idx, chunk in enumerate(chunks, start=1):
  528 + out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}"))
  529 + print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True)
  530 + create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row)
483 531  
484 532  
485 533 if __name__ == "__main__":
... ...