Commit 80519ec6d17d0be1d524596395f9f911cf0a0923

Authored by tangwang
1 parent cd29428b

amazon -> shoplazza

docs/亚马逊格式数据转店匠商品导入模板.md
... ... @@ -116,6 +116,12 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
116 116 --max-files 1 --max-rows-per-file 2000 --max-products 50
117 117 ```
118 118  
  119 +### 性能提示(很重要)
  120 +
  121 +- 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。
  122 +- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。注意:fast writer 只复制模板前几行的单元格值,不保留模板的样式、列宽等格式。
  123 +- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write`
  124 +
119 125 ### 2)生成全量
120 126  
121 127 ```bash
... ...
scripts/amazon_xlsx_to_shoplazza_xlsx.py
... ... @@ -36,7 +36,7 @@ from openpyxl import load_workbook
36 36  
37 37 # Allow running as `python scripts/xxx.py` without installing as a package
38 38 sys.path.insert(0, str(Path(__file__).resolve().parent))
39   -from shoplazza_excel_template import create_excel_from_template
  39 +from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast
40 40  
41 41  
42 42 PREFERRED_OPTION_KEYS = [
... ... @@ -210,35 +210,39 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
210 210 if k not in idx:
211 211 raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name))
212 212  
  213 + # OPT: use iter_rows(values_only=True) instead of ws.cell() per field.
  214 + # openpyxl cell access is relatively expensive; values_only is much faster.
  215 + pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple
  216 +
213 217 rows = []
214 218 end_row = ws.max_row
215 219 if max_rows is not None:
216 220 end_row = min(end_row, 1 + int(max_rows))
217 221  
218   - for r in range(2, end_row + 1):
219   - asin = clean_str(ws.cell(r, idx["ASIN"]).value)
  222 + for tup in ws.iter_rows(min_row=2, max_row=end_row, values_only=True):
  223 + asin = clean_str(tup[pos["ASIN"]])
220 224 if not asin:
221 225 continue
222   - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin
  226 + parent = clean_str(tup[pos["父ASIN"]]) or asin
223 227 rows.append({
224 228 "ASIN": asin,
225 229 "父ASIN": parent,
226   - "SKU": clean_str(ws.cell(r, idx["SKU"]).value),
227   - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value),
228   - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value),
229   - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value),
230   - "价格($)": ws.cell(r, idx["价格($)"]).value,
231   - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value,
232   - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value),
233   - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value),
234   - "大类目": clean_str(ws.cell(r, idx["大类目"]).value),
235   - "小类目": clean_str(ws.cell(r, idx["小类目"]).value),
236   - "品牌": clean_str(ws.cell(r, idx["品牌"]).value),
237   - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value),
238   - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value),
239   - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value),
240   - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value),
241   - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value),
  230 + "SKU": clean_str(tup[pos["SKU"]]),
  231 + "详细参数": clean_str(tup[pos["详细参数"]]),
  232 + "商品标题": clean_str(tup[pos["商品标题"]]),
  233 + "商品主图": clean_str(tup[pos["商品主图"]]),
  234 + "价格($)": tup[pos["价格($)"]],
  235 + "prime价格($)": tup[pos["prime价格($)"]],
  236 + "上架时间": clean_str(tup[pos["上架时间"]]),
  237 + "类目路径": clean_str(tup[pos["类目路径"]]),
  238 + "大类目": clean_str(tup[pos["大类目"]]),
  239 + "小类目": clean_str(tup[pos["小类目"]]),
  240 + "品牌": clean_str(tup[pos["品牌"]]),
  241 + "品牌链接": clean_str(tup[pos["品牌链接"]]),
  242 + "商品详情页链接": clean_str(tup[pos["商品详情页链接"]]),
  243 + "商品重量(单位换算)": clean_str(tup[pos["商品重量(单位换算)"]]),
  244 + "商品重量": clean_str(tup[pos["商品重量"]]),
  245 + "商品尺寸": clean_str(tup[pos["商品尺寸"]]),
242 246 })
243 247 return rows
244 248  
... ... @@ -417,6 +421,7 @@ def main():
417 421 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
418 422 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
419 423 parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")
  424 + parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)")
420 425 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
421 426 parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")
422 427 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
... ... @@ -471,7 +476,10 @@ def main():
471 476 excel_rows.extend(build_m_p_rows(variants))
472 477  
473 478 print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)
474   - create_excel_from_template(args.template, args.output, excel_rows)
  479 + if args.no_fast_write:
  480 + create_excel_from_template(args.template, args.output, excel_rows)
  481 + else:
  482 + create_excel_from_template_fast(args.template, args.output, excel_rows)
475 483  
476 484  
477 485 if __name__ == "__main__":
... ...
scripts/competitor_xlsx_to_shoplazza_xlsx.py
1 1 #!/usr/bin/env python3
2 2 """
3   -DEPRECATED NAME (kept for backward compatibility).
  3 +DEPRECATED SCRIPT NAME (kept for backward compatibility).
4 4  
5   -The input `products_data/*.xlsx` files are **Amazon-format exports** (with Parent/Child ASIN),
6   -not “competitor data”. Please use:
  5 +The input `data/mai_jia_jing_ling/products_data/*.xlsx` files are Amazon-format exports
  6 +(Parent/Child ASIN), not “competitor data”.
7 7  
  8 +Please use:
8 9 - `scripts/amazon_xlsx_to_shoplazza_xlsx.py`
9 10  
10   -This script keeps the same logic but updates user-facing naming gradually.
  11 +This wrapper simply forwards all CLI args to the correctly named script, so you
  12 +automatically get the latest performance improvements (fast read/write).
11 13 """
12 14  
13   -import os
14   -import re
15 15 import sys
16   -import argparse
17   -from datetime import datetime
18   -from collections import defaultdict, Counter
19 16 from pathlib import Path
20 17  
21   -from openpyxl import load_workbook
22   -
23 18 # Allow running as `python scripts/xxx.py` without installing as a package
24 19 sys.path.insert(0, str(Path(__file__).resolve().parent))
25   -from shoplazza_excel_template import create_excel_from_template
26   -
27   -
28   -PREFERRED_OPTION_KEYS = [
29   - "Size", "Color", "Style", "Pattern", "Material", "Flavor", "Scent",
30   - "Pack", "Pack of", "Number of Items", "Count", "Capacity", "Length",
31   - "Width", "Height", "Model", "Configuration",
32   -]
33   -
34   -
35   -def clean_str(v):
36   - if v is None:
37   - return ""
38   - return str(v).strip()
39   -
40   -
41   -def html_escape(s):
42   - s = clean_str(s)
43   - return (s.replace("&", "&")
44   - .replace("<", "&lt;")
45   - .replace(">", "&gt;"))
46   -
47   -
48   -def generate_handle(title):
49   - """
50   - Generate URL-friendly handle from title (ASCII only).
51   - Keep consistent with existing scripts.
52   - """
53   - handle = clean_str(title).lower()
54   - handle = re.sub(r"[^a-z0-9\\s-]", "", handle)
55   - handle = re.sub(r"[-\\s]+", "-", handle).strip("-")
56   - if len(handle) > 255:
57   - handle = handle[:255]
58   - return handle or "product"
59   -
60   -
61   -def parse_date_to_template(dt_value):
62   - """
63   - Template expects: YYYY-MM-DD HH:MM:SS
64   - Input could be "2018-05-09" or datetime/date.
65   - """
66   - if dt_value is None or dt_value == "":
67   - return ""
68   - if isinstance(dt_value, datetime):
69   - return dt_value.strftime("%Y-%m-%d %H:%M:%S")
70   - s = clean_str(dt_value)
71   - # common formats
72   - for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S"):
73   - try:
74   - d = datetime.strptime(s, fmt)
75   - return d.strftime("%Y-%m-%d %H:%M:%S")
76   - except Exception:
77   - pass
78   - return ""
79   -
80   -
81   -def parse_weight(weight_conv, weight_raw):
82   - """
83   - Return (weight_value, unit) where unit in {kg, lb, g, oz}.
84   - Prefer '商品重量(单位换算)' like '68.04 g'.
85   - Fallback to '商品重量' like '0.15 pounds'.
86   - """
87   - s = clean_str(weight_conv) or clean_str(weight_raw)
88   - if not s:
89   - return ("", "")
90   - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)\\s*([a-zA-Z]+)", s)
91   - if not m:
92   - return ("", "")
93   - val = float(m.group(1))
94   - unit = m.group(2).lower()
95   - if unit in ("g", "gram", "grams"):
96   - return (val, "g")
97   - if unit in ("kg", "kilogram", "kilograms"):
98   - return (val, "kg")
99   - if unit in ("lb", "lbs", "pound", "pounds"):
100   - return (val, "lb")
101   - if unit in ("oz", "ounce", "ounces"):
102   - return (val, "oz")
103   - return ("", "")
104   -
105   -
106   -def parse_dimensions_inches(dim_raw):
107   - """
108   - Template '尺寸信息': 'L,W,H' in inches.
109   - Input example: '7.9 x 7.9 x 2 inches'
110   - """
111   - s = clean_str(dim_raw)
112   - if not s:
113   - return ""
114   - # extract first 3 numbers in order
115   - nums = re.findall(r"([0-9]+(?:\\.[0-9]+)?)", s)
116   - if len(nums) < 3:
117   - return ""
118   - return "{},{},{}".format(nums[0], nums[1], nums[2])
119   -
120   -
121   -def parse_sku_options(sku_text):
122   - """
123   - Parse 'SKU' column into {key: value}.
124   - Example:
125   - 'Size: One Size | Color: Black' -> {'Size':'One Size','Color':'Black'}
126   - """
127   - s = clean_str(sku_text)
128   - if not s:
129   - return {}
130   - parts = [p.strip() for p in s.split("|") if p.strip()]
131   - out = {}
132   - for p in parts:
133   - if ":" not in p:
134   - continue
135   - k, v = p.split(":", 1)
136   - k = clean_str(k)
137   - v = clean_str(v)
138   - if k and v:
139   - out[k] = v
140   - return out
141   -
142   -
143   -def choose_option_keys(variant_dicts, max_keys=3):
144   - """
145   - Choose up to 3 option keys for a product group.
146   - Order by preference list first, then by frequency.
147   - """
148   - freq = Counter()
149   - for d in variant_dicts:
150   - for k, v in d.items():
151   - if v:
152   - freq[k] += 1
153   - if not freq:
154   - return []
155   -
156   - preferred_rank = {k: i for i, k in enumerate(PREFERRED_OPTION_KEYS)}
157   -
158   - def key_sort(k):
159   - return (preferred_rank.get(k, 10 ** 6), -freq[k], k.lower())
160   -
161   - keys = sorted(freq.keys(), key=key_sort)
162   - return keys[:max_keys]
163   -
164   -
165   -def build_description_html(title, details, product_url):
166   - parts = []
167   - if title:
168   - parts.append("<p>{}</p>".format(html_escape(title)))
169   - detail_items = [x.strip() for x in clean_str(details).split("|") if x.strip()]
170   - if detail_items:
171   - li = "".join(["<li>{}</li>".format(html_escape(x)) for x in detail_items[:30]])
172   - parts.append("<ul>{}</ul>".format(li))
173   - if product_url:
174   - parts.append('<p>Source: <a href="{0}">{0}</a></p>'.format(html_escape(product_url)))
175   - return "".join(parts)
176   -
177   -
178   -def competitor_sheet(ws):
179   - """
180   - Build (header->col_index) for competitor sheet.
181   - Assumes header is row 1.
182   - """
183   - headers = []
184   - for c in range(1, ws.max_column + 1):
185   - v = ws.cell(1, c).value
186   - headers.append(clean_str(v))
187   - idx = {h: i + 1 for i, h in enumerate(headers) if h}
188   - return idx
189   -
190   -
191   -def read_competitor_rows_from_file(xlsx_path, max_rows=None):
192   - wb = load_workbook(xlsx_path, read_only=True, data_only=True)
193   - # pick first non-Notes sheet
194   - sheet_name = None
195   - for name in wb.sheetnames:
196   - if str(name).lower() == "notes":
197   - continue
198   - sheet_name = name
199   - break
200   - if sheet_name is None:
201   - return []
202   - ws = wb[sheet_name]
203   - idx = competitor_sheet(ws)
204   -
205   - required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)",
206   - "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接",
207   - "商品重量(单位换算)", "商品重量", "商品尺寸"]
208   - for k in required:
209   - if k not in idx:
210   - raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name))
211   -
212   - rows = []
213   - end_row = ws.max_row
214   - if max_rows is not None:
215   - end_row = min(end_row, 1 + int(max_rows))
216   -
217   - for r in range(2, end_row + 1):
218   - asin = clean_str(ws.cell(r, idx["ASIN"]).value)
219   - if not asin:
220   - continue
221   - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin
222   - row = {
223   - "ASIN": asin,
224   - "父ASIN": parent,
225   - "SKU": clean_str(ws.cell(r, idx["SKU"]).value),
226   - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value),
227   - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value),
228   - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value),
229   - "价格($)": ws.cell(r, idx["价格($)"]).value,
230   - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value,
231   - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value),
232   - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value),
233   - "大类目": clean_str(ws.cell(r, idx["大类目"]).value),
234   - "小类目": clean_str(ws.cell(r, idx["小类目"]).value),
235   - "品牌": clean_str(ws.cell(r, idx["品牌"]).value),
236   - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value),
237   - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value),
238   - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value),
239   - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value),
240   - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value),
241   - }
242   - rows.append(row)
243   - return rows
244   -
245   -
246   -def to_price(v):
247   - if v is None or v == "":
248   - return None
249   - try:
250   - return float(v)
251   - except Exception:
252   - s = clean_str(v)
253   - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", s)
254   - if not m:
255   - return None
256   - return float(m.group(1))
257   -
258   -
259   -def build_common_fields(base_row, spu_id):
260   - title = base_row.get("商品标题") or "Product"
261   - brand = base_row.get("品牌") or ""
262   - big_cat = base_row.get("大类目") or ""
263   - small_cat = base_row.get("小类目") or ""
264   - cat_path = base_row.get("类目路径") or ""
265   -
266   - handle = generate_handle(title)
267   - if handle and not handle.startswith("products/"):
268   - handle = "products/{}".format(handle)
269   -
270   - seo_title = title
271   - seo_desc_parts = []
272   - if brand:
273   - seo_desc_parts.append(brand)
274   - seo_desc_parts.append(title)
275   - if big_cat:
276   - seo_desc_parts.append(big_cat)
277   - seo_description = " ".join([x for x in seo_desc_parts if x])[:5000]
278   -
279   - seo_keywords = ",".join([x for x in [title, brand, big_cat, small_cat] if x])
280   - tags = ",".join([x for x in [brand, big_cat, small_cat] if x])
281   -
282   - created_at = parse_date_to_template(base_row.get("上架时间"))
283   -
284   - description = build_description_html(
285   - title=title,
286   - details=base_row.get("详细参数"),
287   - product_url=base_row.get("商品详情页链接"),
288   - )
289   -
290   - # default inventory settings (data source has no stock)
291   - inventory_qty = 100
292   -
293   - weight_val, weight_unit = parse_weight(base_row.get("商品重量(单位换算)"), base_row.get("商品重量"))
294   - size_info = parse_dimensions_inches(base_row.get("商品尺寸"))
295   -
296   - album = big_cat or ""
297   - if not album and cat_path:
298   - album = cat_path.split(":")[0]
299   -
300   - common = {
301   - "商品ID": "",
302   - "创建时间": created_at,
303   - "商品标题*": title[:255],
304   - "商品副标题": "{} {}".format(brand, big_cat).strip()[:600],
305   - "商品描述": description,
306   - "SEO标题": seo_title[:5000],
307   - "SEO描述": seo_description,
308   - "SEO URL Handle": handle,
309   - "SEO URL 重定向": "N",
310   - "SEO关键词": seo_keywords[:5000],
311   - "商品上架": "Y",
312   - "需要物流": "Y",
313   - "商品收税": "N",
314   - "商品spu": spu_id[:100],
315   - "启用虚拟销量": "N",
316   - "虚拟销量值": "",
317   - "跟踪库存": "Y",
318   - "库存规则*": "1",
319   - "专辑名称": album,
320   - "标签": tags,
321   - "供应商名称": "Amazon",
322   - "供应商URL": base_row.get("商品详情页链接") or base_row.get("品牌链接") or "",
323   - "商品重量": weight_val if weight_val != "" else "",
324   - "重量单位": weight_unit,
325   - "商品库存": inventory_qty,
326   - "尺寸信息": size_info,
327   - "原产地国别": "",
328   - "HS(协调制度)代码": "",
329   - "商品备注": "ASIN:{}; ParentASIN:{}; CategoryPath:{}".format(
330   - base_row.get("ASIN", ""), spu_id, (cat_path[:200] if cat_path else "")
331   - )[:500],
332   - "款式备注": "",
333   - }
334   - return common
335   -
336   -
337   -def build_s_row(base_row):
338   - spu_id = base_row.get("父ASIN") or base_row.get("ASIN")
339   - common = build_common_fields(base_row, spu_id=spu_id)
340   - price = to_price(base_row.get("prime价格($)")) or to_price(base_row.get("价格($)")) or 9.99
341   - image = base_row.get("商品主图") or ""
342   -
343   - row = {}
344   - row.update(common)
345   - row.update({
346   - "商品属性*": "S",
347   - "款式1": "",
348   - "款式2": "",
349   - "款式3": "",
350   - "商品售价*": price,
351   - "商品原价": price,
352   - "成本价": "",
353   - "商品SKU": base_row.get("ASIN") or "",
354   - "商品条形码": "",
355   - "商品图片*": image,
356   - "商品主图": image,
357   - })
358   - return row
359   -
360   -
361   -def build_m_p_rows(variant_rows):
362   - """
363   - variant_rows: List[dict] with same 父ASIN.
364   - """
365   - base = variant_rows[0]
366   - spu_id = base.get("父ASIN") or base.get("ASIN")
367   - common = build_common_fields(base, spu_id=spu_id)
368   -
369   - option_dicts = [parse_sku_options(v.get("SKU")) for v in variant_rows]
370   - option_keys = choose_option_keys(option_dicts, max_keys=3)
371   - if not option_keys:
372   - option_keys = ["Variant"]
373   -
374   - # M row
375   - m = {}
376   - m.update(common)
377   - m.update({
378   - "商品属性*": "M",
379   - "款式1": option_keys[0] if len(option_keys) > 0 else "",
380   - "款式2": option_keys[1] if len(option_keys) > 1 else "",
381   - "款式3": option_keys[2] if len(option_keys) > 2 else "",
382   - "商品售价*": "",
383   - "商品原价": "",
384   - "成本价": "",
385   - "商品SKU": "",
386   - "商品条形码": "",
387   - "商品图片*": base.get("商品主图") or "",
388   - "商品主图": base.get("商品主图") or "",
389   - })
390   -
391   - # For M row, these SKU-level fields should be empty per template guidance
392   - m["商品重量"] = ""
393   - m["重量单位"] = ""
394   - m["商品库存"] = ""
395   - m["尺寸信息"] = ""
396   -
397   - rows = [m]
398   -
399   - # P rows
400   - for v in variant_rows:
401   - v_common = build_common_fields(v, spu_id=spu_id)
402   - # wipe SPU-only fields for P row
403   - v_common.update({
404   - "商品副标题": "",
405   - "商品描述": "",
406   - "SEO标题": "",
407   - "SEO描述": "",
408   - "SEO URL Handle": "",
409   - "SEO URL 重定向": "",
410   - "SEO关键词": "",
411   - "专辑名称": "",
412   - "标签": "",
413   - "供应商名称": "",
414   - "供应商URL": "",
415   - "商品备注": "",
416   - })
417   -
418   - opt = parse_sku_options(v.get("SKU"))
419   - if option_keys == ["Variant"]:
420   - opt_vals = [v.get("ASIN")]
421   - else:
422   - opt_vals = [opt.get(k, "") for k in option_keys]
423   -
424   - price = to_price(v.get("prime价格($)")) or to_price(v.get("价格($)")) or 9.99
425   - image = v.get("商品主图") or ""
426   -
427   - p = {}
428   - p.update(v_common)
429   - p.update({
430   - "商品属性*": "P",
431   - "款式1": opt_vals[0] if len(opt_vals) > 0 else "",
432   - "款式2": opt_vals[1] if len(opt_vals) > 1 else "",
433   - "款式3": opt_vals[2] if len(opt_vals) > 2 else "",
434   - "商品售价*": price,
435   - "商品原价": price,
436   - "成本价": "",
437   - "商品SKU": v.get("ASIN") or "",
438   - "商品条形码": "",
439   - # P row supports one variant image; we use variant's main image
440   - "商品图片*": image,
441   - "商品主图": "",
442   - })
443   - rows.append(p)
444   -
445   - return rows
446   -
447   -
448   -def main():
449   - parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx (deprecated script name)")
450   - parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
451   - parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
452   - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")
453   - parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
454   - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")
455   - parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
456   - args = parser.parse_args()
457   -
458   - input_dir = args.input_dir
459   - if not os.path.isdir(input_dir):
460   - raise RuntimeError("input-dir not found: {}".format(input_dir))
461   - if not os.path.exists(args.template):
462   - raise RuntimeError("template not found: {}".format(args.template))
463   -
464   - files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.lower().endswith(".xlsx")]
465   - files.sort()
466   - if args.max_files is not None:
467   - files = files[: int(args.max_files)]
468   -
469   - print("Reading Amazon-format files: {} (from {})".format(len(files), input_dir), flush=True)
470   -
471   - groups = defaultdict(list) # spu_id -> [variant rows]
472   - seen_asin = set()
473   -
474   - for fp in files:
475   - print(" - loading: {}".format(fp), flush=True)
476   - try:
477   - rows = read_competitor_rows_from_file(fp, max_rows=args.max_rows_per_file)
478   - except Exception as e:
479   - print("WARN: failed to read {}: {}".format(fp, e))
480   - continue
481   - print(" loaded rows: {}".format(len(rows)), flush=True)
482   -
483   - for r in rows:
484   - asin = r.get("ASIN")
485   - if asin in seen_asin:
486   - continue
487   - seen_asin.add(asin)
488   - spu_id = r.get("父ASIN") or asin
489   - groups[spu_id].append(r)
490   -
491   - print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)
492   -
493   - excel_rows = []
494   - spu_count = 0
495   -
496   - for spu_id, variants in groups.items():
497   - if not variants:
498   - continue
499   - spu_count += 1
500   - if args.max_products is not None and spu_count > int(args.max_products):
501   - break
502   - if len(variants) == 1:
503   - excel_rows.append(build_s_row(variants[0]))
504   - else:
505   - excel_rows.extend(build_m_p_rows(variants))
506 20  
507   - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)
508   - create_excel_from_template(args.template, args.output, excel_rows)
  21 +from amazon_xlsx_to_shoplazza_xlsx import main as amazon_main
509 22  
510 23  
511 24 if __name__ == "__main__":
512   - main()
  25 + amazon_main()
513 26  
514 27  
... ...
scripts/shoplazza_excel_template.py
... ... @@ -6,6 +6,7 @@ based on the provided template `docs/商品导入模板.xlsx`.
6 6 We keep this in `scripts/` to maximize reuse by existing ad-hoc pipeline scripts.
7 7 """
8 8  
  9 +from openpyxl import Workbook
9 10 from openpyxl import load_workbook
10 11 from openpyxl.styles import Alignment
11 12  
... ... @@ -46,14 +47,15 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro
46 47 for col in range(1, ws.max_column + 1):
47 48 ws.cell(row=row, column=col).value = None
48 49  
49   - # Write data rows
  50 + # Write data rows (OPT: only write fields that actually exist in excel_row)
  51 + # This avoids looping over all 42 template columns for every output row.
50 52 for row_idx, excel_row in enumerate(excel_rows):
51 53 excel_row_num = data_start_row + row_idx
52   - for field_name, col_idx in column_mapping.items():
53   - if field_name not in excel_row:
  54 + for field_name, value in excel_row.items():
  55 + col_idx = column_mapping.get(field_name)
  56 + if not col_idx:
54 57 continue
55 58 cell = ws.cell(row=excel_row_num, column=col_idx)
56   - value = excel_row[field_name]
57 59 cell.value = value
58 60 if isinstance(value, str):
59 61 cell.alignment = Alignment(vertical='top', wrap_text=True)
... ... @@ -65,3 +67,67 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro
65 67 print(" - Total rows: {}".format(len(excel_rows)))
66 68  
67 69  
def create_excel_from_template_fast(template_file, output_file, excel_rows, header_row_idx=2, data_start_row=4):
    """
    Faster writer for large datasets.

    Instead of opening the template workbook in write mode and assigning cells one by one,
    we:
    - read the template's first (data_start_row-1) rows as values
    - build a header->index mapping from header_row_idx
    - create a new write_only workbook and append rows

    Only cell values are carried over from the template: a brand-new workbook
    is written, so nothing else from the template file is copied.

    Args:
        template_file: Path of the template xlsx to read (its active sheet is used).
        output_file: Path of the xlsx file to write.
        excel_rows: Sequence of dicts mapping header text -> cell value. Keys
            that do not match a template header are ignored.
        header_row_idx: 1-based row number of the header row in the template.
        data_start_row: 1-based row number where data rows begin; the rows
            before it are copied (values only) to the top of the output.
    """
    tpl_wb = load_workbook(template_file, read_only=True, data_only=True)
    try:
        tpl_ws = tpl_wb.active
        sheet_title = tpl_ws.title
        max_col = tpl_ws.max_column

        # Copy template "instruction" rows (typically rows 1-3) into output
        prefix_rows = [
            list(r)
            for r in tpl_ws.iter_rows(min_row=1, max_row=data_start_row - 1, values_only=True)
        ]

        if 1 <= header_row_idx <= len(prefix_rows):
            header_values = list(prefix_rows[header_row_idx - 1])
        else:
            # Fallback: the header row lies outside the copied prefix, read it directly
            header_values = list(next(tpl_ws.iter_rows(
                min_row=header_row_idx, max_row=header_row_idx, values_only=True)))
    finally:
        # A read-only workbook keeps the source file open until closed explicitly.
        tpl_wb.close()

    if max_col is None:
        # Read-only worksheets without dimension metadata report no max_column;
        # fall back to the widest row actually read from the template.
        max_col = max([len(header_values)] + [len(r) for r in prefix_rows])

    header_values = header_values[:max_col]
    col_map = {}
    for i, v in enumerate(header_values):
        if v is None:
            continue
        col_map[str(v).strip()] = i  # 0-based

    # A write-only workbook starts with no sheets, so there is no default
    # sheet to remove after creating ours.
    wb = Workbook(write_only=True)
    ws = wb.create_sheet(title=sheet_title)

    # Write prefix rows, normalized to max_col
    for r in prefix_rows:
        r = r[:max_col]
        if len(r) < max_col:
            r = r + [None] * (max_col - len(r))
        ws.append(r)

    # Write data rows
    for excel_row in excel_rows:
        row_vals = [None] * max_col
        for field_name, value in excel_row.items():
            col = col_map.get(field_name)
            if col is None:
                continue
            row_vals[col] = value
        ws.append(row_vals)

    wb.save(output_file)
    print("Excel file created (fast): {}".format(output_file))
    print(" - Total rows: {}".format(len(excel_rows)))
  132 +
  133 +
... ...