Commit 50170c5a0f915bd2cd0d6e012348417ee380ddcb

Authored by tangwang
1 parent 80519ec6

导入成功。有部分失败 (1/4) 原因有:

1. 跟主商品标题不一致
2. 商品图片信息缺失
3. Options是无效的
docs/亚马逊格式数据转店匠商品导入模板.md
@@ -113,14 +113,13 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
113 --input-dir data/mai_jia_jing_ling/products_data \ 113 --input-dir data/mai_jia_jing_ling/products_data \
114 --template docs/商品导入模板.xlsx \ 114 --template docs/商品导入模板.xlsx \
115 --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ 115 --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \
116 - --max-files 1 --max-rows-per-file 2000 --max-products 50 116 + --max-files 1 --max-products 50
117 ``` 117 ```
118 118
119 ### 性能提示(很重要) 119 ### 性能提示(很重要)
120 120
121 - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 121 - 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。
122 -- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。  
123 -- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write` 122 +- 当前脚本已经使用 **`iter_rows(values_only=True)`** + write_only 模式做快速读写(只有这一种实现方式,保持简单)。
124 123
125 ### 2)生成全量 124 ### 2)生成全量
126 125
@@ -131,6 +130,24 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
131 --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx 130 --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx
132 ``` 131 ```
133 132
  133 +### 3)按 Excel 行数自动拆分文件
  134 +
  135 +当单个导出超过一定行数时,可以通过 `--max-rows-per-output` 控制单个 Excel 的最大总行数(包含模板头部行):
  136 +
  137 +```bash
  138 +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
  139 + --input-dir data/mai_jia_jing_ling/products_data \
  140 + --template docs/商品导入模板.xlsx \
  141 + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \
  142 + --max-rows-per-output 40000
  143 +```
  144 +
  145 +- 若结果只需要 1 个文件,仍输出为 `amazon_shoplazza_import_SPLIT.xlsx`
  146 +- 若需要拆分为多个文件,则追加 `_part1/_part2/...` 后缀:
  147 + - `amazon_shoplazza_import_SPLIT_part1.xlsx`
  148 + - `amazon_shoplazza_import_SPLIT_part2.xlsx`
  149 +- **同一个 SPU(同一父ASIN 的 M+P 或 S 行)保证不会被拆到不同文件中**
  150 +
134 --- 151 ---
135 152
136 ## 六、可扩展点(后续常见需求) 153 ## 六、可扩展点(后续常见需求)
scripts/amazon_xlsx_to_shoplazza_xlsx.py
@@ -36,7 +36,7 @@ from openpyxl import load_workbook
36 36
37 # Allow running as `python scripts/xxx.py` without installing as a package 37 # Allow running as `python scripts/xxx.py` without installing as a package
38 sys.path.insert(0, str(Path(__file__).resolve().parent)) 38 sys.path.insert(0, str(Path(__file__).resolve().parent))
39 -from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast 39 +from shoplazza_excel_template import create_excel_from_template_fast
40 40
41 41
42 PREFERRED_OPTION_KEYS = [ 42 PREFERRED_OPTION_KEYS = [
@@ -182,14 +182,6 @@ def build_description_html(title, details, product_url):
182 return "".join(parts) 182 return "".join(parts)
183 183
184 184
185 -def amazon_sheet(ws):  
186 - headers = []  
187 - for c in range(1, ws.max_column + 1):  
188 - v = ws.cell(1, c).value  
189 - headers.append(clean_str(v))  
190 - return {h: i + 1 for i, h in enumerate(headers) if h}  
191 -  
192 -  
193 def read_amazon_rows_from_file(xlsx_path, max_rows=None): 185 def read_amazon_rows_from_file(xlsx_path, max_rows=None):
194 wb = load_workbook(xlsx_path, read_only=True, data_only=True) 186 wb = load_workbook(xlsx_path, read_only=True, data_only=True)
195 sheet_name = None 187 sheet_name = None
@@ -201,7 +193,10 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
201 if sheet_name is None: 193 if sheet_name is None:
202 return [] 194 return []
203 ws = wb[sheet_name] 195 ws = wb[sheet_name]
204 - idx = amazon_sheet(ws) 196 +
  197 + # Build header index from first row
  198 + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True))
  199 + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)}
205 200
206 required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", 201 required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)",
207 "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", 202 "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接",
@@ -212,7 +207,7 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
212 207
213 # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. 208 # OPT: use iter_rows(values_only=True) instead of ws.cell() per field.
214 # openpyxl cell access is relatively expensive; values_only is much faster. 209 # openpyxl cell access is relatively expensive; values_only is much faster.
215 - pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple 210 + pos = {k: idx[k] for k in required} # 0-based positions in row tuple
216 211
217 rows = [] 212 rows = []
218 end_row = ws.max_row 213 end_row = ws.max_row
@@ -420,10 +415,9 @@ def main():
420 parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") 415 parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx")
421 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") 416 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
422 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") 417 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
423 - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")  
424 - parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)") 418 + parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)")
425 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") 419 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
426 - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") 420 + parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)")
427 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") 421 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
428 args = parser.parse_args() 422 args = parser.parse_args()
429 423
@@ -445,7 +439,7 @@ def main():
445 for fp in files: 439 for fp in files:
446 print(" - loading: {}".format(fp), flush=True) 440 print(" - loading: {}".format(fp), flush=True)
447 try: 441 try:
448 - rows = read_amazon_rows_from_file(fp, max_rows=args.max_rows_per_file) 442 + rows = read_amazon_rows_from_file(fp)
449 except Exception as e: 443 except Exception as e:
450 print("WARN: failed to read {}: {}".format(fp, e)) 444 print("WARN: failed to read {}: {}".format(fp, e))
451 continue 445 continue
@@ -461,9 +455,9 @@ def main():
461 455
462 print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) 456 print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)
463 457
464 - excel_rows = [] 458 + # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组”
  459 + group_rows_list = [] # List[List[dict]]
465 spu_count = 0 460 spu_count = 0
466 -  
467 for spu_id, variants in groups.items(): 461 for spu_id, variants in groups.items():
468 if not variants: 462 if not variants:
469 continue 463 continue
@@ -471,15 +465,69 @@ def main():
471 if args.max_products is not None and spu_count > int(args.max_products): 465 if args.max_products is not None and spu_count > int(args.max_products):
472 break 466 break
473 if len(variants) == 1: 467 if len(variants) == 1:
474 - excel_rows.append(build_s_row(variants[0])) 468 + group_rows_list.append([build_s_row(variants[0])])
475 else: 469 else:
476 - excel_rows.extend(build_m_p_rows(variants))  
477 -  
478 - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)  
479 - if args.no_fast_write:  
480 - create_excel_from_template(args.template, args.output, excel_rows) 470 + group_rows_list.append(build_m_p_rows(variants))
  471 +
  472 + # 按最大行数拆成多个文件(注意:同一 SPU 不拆分)
  473 + data_start_row = 4 # 与模板/写入工具保持一致
  474 + header_rows = data_start_row - 1 # 包含标题行+说明行
  475 + max_total_rows = args.max_rows_per_output or 0
  476 + if max_total_rows and max_total_rows > header_rows:
  477 + max_data_rows = max_total_rows - header_rows
  478 + else:
  479 + max_data_rows = None # 不限制
  480 +
  481 + chunks = []
  482 + current_chunk = []
  483 + current_count = 0
  484 +
  485 + if max_data_rows is None:
  486 + # 不做分片,直接一个 chunk
  487 + for gr in group_rows_list:
  488 + current_chunk.extend(gr)
  489 + if current_chunk:
  490 + chunks.append(current_chunk)
481 else: 491 else:
482 - create_excel_from_template_fast(args.template, args.output, excel_rows) 492 + for gr in group_rows_list:
  493 + gsize = len(gr)
  494 + # 如果单个 SPU 本身就超过阈值,只能独占一个文件
  495 + if gsize > max_data_rows:
  496 + if current_chunk:
  497 + chunks.append(current_chunk)
  498 + current_chunk = []
  499 + current_count = 0
  500 + chunks.append(gr)
  501 + continue
  502 + # 如果放不下当前 chunk,则先封一个,再开新 chunk
  503 + if current_count + gsize > max_data_rows:
  504 + if current_chunk:
  505 + chunks.append(current_chunk)
  506 + current_chunk = list(gr)
  507 + current_count = gsize
  508 + else:
  509 + current_chunk.extend(gr)
  510 + current_count += gsize
  511 + if current_chunk:
  512 + chunks.append(current_chunk)
  513 +
  514 + total_rows = sum(len(c) for c in chunks)
  515 + print(
  516 + "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format(
  517 + total_rows, len(group_rows_list), len(chunks)
  518 + ),
  519 + flush=True,
  520 + )
  521 +
  522 + # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀
  523 + base = Path(args.output)
  524 + stem = base.stem
  525 + suffix = base.suffix or ".xlsx"
  526 +
  527 + for idx, chunk in enumerate(chunks, start=1):
  528 + out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}"))
  529 + print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True)
  530 + create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row)
483 531
484 532
485 if __name__ == "__main__": 533 if __name__ == "__main__":