Commit 80519ec6d17d0be1d524596395f9f911cf0a0923

Authored by tangwang
1 parent cd29428b

amazon -> shoplazza

docs/亚马逊格式数据转店匠商品导入模板.md
@@ -116,6 +116,12 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ @@ -116,6 +116,12 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
116 --max-files 1 --max-rows-per-file 2000 --max-products 50 116 --max-files 1 --max-rows-per-file 2000 --max-products 50
117 ``` 117 ```
118 118
  119 +### 性能提示(很重要)
  120 +
  121 +- 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。
  122 +- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。
  123 +- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write`
  124 +
119 ### 2)生成全量 125 ### 2)生成全量
120 126
121 ```bash 127 ```bash
scripts/amazon_xlsx_to_shoplazza_xlsx.py
@@ -36,7 +36,7 @@ from openpyxl import load_workbook @@ -36,7 +36,7 @@ from openpyxl import load_workbook
36 36
37 # Allow running as `python scripts/xxx.py` without installing as a package 37 # Allow running as `python scripts/xxx.py` without installing as a package
38 sys.path.insert(0, str(Path(__file__).resolve().parent)) 38 sys.path.insert(0, str(Path(__file__).resolve().parent))
39 -from shoplazza_excel_template import create_excel_from_template 39 +from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast
40 40
41 41
42 PREFERRED_OPTION_KEYS = [ 42 PREFERRED_OPTION_KEYS = [
@@ -210,35 +210,39 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): @@ -210,35 +210,39 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None):
210 if k not in idx: 210 if k not in idx:
211 raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) 211 raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name))
212 212
  213 + # OPT: use iter_rows(values_only=True) instead of ws.cell() per field.
  214 + # openpyxl cell access is relatively expensive; values_only is much faster.
  215 + pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple
  216 +
213 rows = [] 217 rows = []
214 end_row = ws.max_row 218 end_row = ws.max_row
215 if max_rows is not None: 219 if max_rows is not None:
216 end_row = min(end_row, 1 + int(max_rows)) 220 end_row = min(end_row, 1 + int(max_rows))
217 221
218 - for r in range(2, end_row + 1):  
219 - asin = clean_str(ws.cell(r, idx["ASIN"]).value) 222 + for tup in ws.iter_rows(min_row=2, max_row=end_row, values_only=True):
  223 + asin = clean_str(tup[pos["ASIN"]])
220 if not asin: 224 if not asin:
221 continue 225 continue
222 - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin 226 + parent = clean_str(tup[pos["父ASIN"]]) or asin
223 rows.append({ 227 rows.append({
224 "ASIN": asin, 228 "ASIN": asin,
225 "父ASIN": parent, 229 "父ASIN": parent,
226 - "SKU": clean_str(ws.cell(r, idx["SKU"]).value),  
227 - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value),  
228 - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value),  
229 - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value),  
230 - "价格($)": ws.cell(r, idx["价格($)"]).value,  
231 - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value,  
232 - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value),  
233 - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value),  
234 - "大类目": clean_str(ws.cell(r, idx["大类目"]).value),  
235 - "小类目": clean_str(ws.cell(r, idx["小类目"]).value),  
236 - "品牌": clean_str(ws.cell(r, idx["品牌"]).value),  
237 - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value),  
238 - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value),  
239 - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value),  
240 - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value),  
241 - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value), 230 + "SKU": clean_str(tup[pos["SKU"]]),
  231 + "详细参数": clean_str(tup[pos["详细参数"]]),
  232 + "商品标题": clean_str(tup[pos["商品标题"]]),
  233 + "商品主图": clean_str(tup[pos["商品主图"]]),
  234 + "价格($)": tup[pos["价格($)"]],
  235 + "prime价格($)": tup[pos["prime价格($)"]],
  236 + "上架时间": clean_str(tup[pos["上架时间"]]),
  237 + "类目路径": clean_str(tup[pos["类目路径"]]),
  238 + "大类目": clean_str(tup[pos["大类目"]]),
  239 + "小类目": clean_str(tup[pos["小类目"]]),
  240 + "品牌": clean_str(tup[pos["品牌"]]),
  241 + "品牌链接": clean_str(tup[pos["品牌链接"]]),
  242 + "商品详情页链接": clean_str(tup[pos["商品详情页链接"]]),
  243 + "商品重量(单位换算)": clean_str(tup[pos["商品重量(单位换算)"]]),
  244 + "商品重量": clean_str(tup[pos["商品重量"]]),
  245 + "商品尺寸": clean_str(tup[pos["商品尺寸"]]),
242 }) 246 })
243 return rows 247 return rows
244 248
@@ -417,6 +421,7 @@ def main(): @@ -417,6 +421,7 @@ def main():
417 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") 421 parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
418 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") 422 parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
419 parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") 423 parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")
  424 + parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)")
420 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") 425 parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")
421 parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") 426 parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")
422 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") 427 parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
@@ -471,7 +476,10 @@ def main(): @@ -471,7 +476,10 @@ def main():
471 excel_rows.extend(build_m_p_rows(variants)) 476 excel_rows.extend(build_m_p_rows(variants))
472 477
473 print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) 478 print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)
474 - create_excel_from_template(args.template, args.output, excel_rows) 479 + if args.no_fast_write:
  480 + create_excel_from_template(args.template, args.output, excel_rows)
  481 + else:
  482 + create_excel_from_template_fast(args.template, args.output, excel_rows)
475 483
476 484
477 if __name__ == "__main__": 485 if __name__ == "__main__":
scripts/competitor_xlsx_to_shoplazza_xlsx.py
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 """ 2 """
3 -DEPRECATED NAME (kept for backward compatibility). 3 +DEPRECATED SCRIPT NAME (kept for backward compatibility).
4 4
5 -The input `products_data/*.xlsx` files are **Amazon-format exports** (with Parent/Child ASIN),  
6 -not “competitor data”. Please use: 5 +The input `data/mai_jia_jing_ling/products_data/*.xlsx` files are Amazon-format exports
  6 +(Parent/Child ASIN), not “competitor data”.
7 7
  8 +Please use:
8 - `scripts/amazon_xlsx_to_shoplazza_xlsx.py` 9 - `scripts/amazon_xlsx_to_shoplazza_xlsx.py`
9 10
10 -This script keeps the same logic but updates user-facing naming gradually. 11 +This wrapper simply forwards all CLI args to the correctly named script, so you
  12 +automatically get the latest performance improvements (fast read/write).
11 """ 13 """
12 14
13 -import os  
14 -import re  
15 import sys 15 import sys
16 -import argparse  
17 -from datetime import datetime  
18 -from collections import defaultdict, Counter  
19 from pathlib import Path 16 from pathlib import Path
20 17
21 -from openpyxl import load_workbook  
22 -  
23 # Allow running as `python scripts/xxx.py` without installing as a package 18 # Allow running as `python scripts/xxx.py` without installing as a package
24 sys.path.insert(0, str(Path(__file__).resolve().parent)) 19 sys.path.insert(0, str(Path(__file__).resolve().parent))
25 -from shoplazza_excel_template import create_excel_from_template  
26 -  
27 -  
28 -PREFERRED_OPTION_KEYS = [  
29 - "Size", "Color", "Style", "Pattern", "Material", "Flavor", "Scent",  
30 - "Pack", "Pack of", "Number of Items", "Count", "Capacity", "Length",  
31 - "Width", "Height", "Model", "Configuration",  
32 -]  
33 -  
34 -  
35 -def clean_str(v):  
36 - if v is None:  
37 - return ""  
38 - return str(v).strip()  
39 -  
40 -  
41 -def html_escape(s):  
42 - s = clean_str(s)  
43 - return (s.replace("&", "&amp;")  
44 - .replace("<", "&lt;")  
45 - .replace(">", "&gt;"))  
46 -  
47 -  
48 -def generate_handle(title):  
49 - """  
50 - Generate URL-friendly handle from title (ASCII only).  
51 - Keep consistent with existing scripts.  
52 - """  
53 - handle = clean_str(title).lower()  
54 - handle = re.sub(r"[^a-z0-9\\s-]", "", handle)  
55 - handle = re.sub(r"[-\\s]+", "-", handle).strip("-")  
56 - if len(handle) > 255:  
57 - handle = handle[:255]  
58 - return handle or "product"  
59 -  
60 -  
61 -def parse_date_to_template(dt_value):  
62 - """  
63 - Template expects: YYYY-MM-DD HH:MM:SS  
64 - Input could be "2018-05-09" or datetime/date.  
65 - """  
66 - if dt_value is None or dt_value == "":  
67 - return ""  
68 - if isinstance(dt_value, datetime):  
69 - return dt_value.strftime("%Y-%m-%d %H:%M:%S")  
70 - s = clean_str(dt_value)  
71 - # common formats  
72 - for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S"):  
73 - try:  
74 - d = datetime.strptime(s, fmt)  
75 - return d.strftime("%Y-%m-%d %H:%M:%S")  
76 - except Exception:  
77 - pass  
78 - return ""  
79 -  
80 -  
81 -def parse_weight(weight_conv, weight_raw):  
82 - """  
83 - Return (weight_value, unit) where unit in {kg, lb, g, oz}.  
84 - Prefer '商品重量(单位换算)' like '68.04 g'.  
85 - Fallback to '商品重量' like '0.15 pounds'.  
86 - """  
87 - s = clean_str(weight_conv) or clean_str(weight_raw)  
88 - if not s:  
89 - return ("", "")  
90 - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)\\s*([a-zA-Z]+)", s)  
91 - if not m:  
92 - return ("", "")  
93 - val = float(m.group(1))  
94 - unit = m.group(2).lower()  
95 - if unit in ("g", "gram", "grams"):  
96 - return (val, "g")  
97 - if unit in ("kg", "kilogram", "kilograms"):  
98 - return (val, "kg")  
99 - if unit in ("lb", "lbs", "pound", "pounds"):  
100 - return (val, "lb")  
101 - if unit in ("oz", "ounce", "ounces"):  
102 - return (val, "oz")  
103 - return ("", "")  
104 -  
105 -  
106 -def parse_dimensions_inches(dim_raw):  
107 - """  
108 - Template '尺寸信息': 'L,W,H' in inches.  
109 - Input example: '7.9 x 7.9 x 2 inches'  
110 - """  
111 - s = clean_str(dim_raw)  
112 - if not s:  
113 - return ""  
114 - # extract first 3 numbers in order  
115 - nums = re.findall(r"([0-9]+(?:\\.[0-9]+)?)", s)  
116 - if len(nums) < 3:  
117 - return ""  
118 - return "{},{},{}".format(nums[0], nums[1], nums[2])  
119 -  
120 -  
121 -def parse_sku_options(sku_text):  
122 - """  
123 - Parse 'SKU' column into {key: value}.  
124 - Example:  
125 - 'Size: One Size | Color: Black' -> {'Size':'One Size','Color':'Black'}  
126 - """  
127 - s = clean_str(sku_text)  
128 - if not s:  
129 - return {}  
130 - parts = [p.strip() for p in s.split("|") if p.strip()]  
131 - out = {}  
132 - for p in parts:  
133 - if ":" not in p:  
134 - continue  
135 - k, v = p.split(":", 1)  
136 - k = clean_str(k)  
137 - v = clean_str(v)  
138 - if k and v:  
139 - out[k] = v  
140 - return out  
141 -  
142 -  
143 -def choose_option_keys(variant_dicts, max_keys=3):  
144 - """  
145 - Choose up to 3 option keys for a product group.  
146 - Order by preference list first, then by frequency.  
147 - """  
148 - freq = Counter()  
149 - for d in variant_dicts:  
150 - for k, v in d.items():  
151 - if v:  
152 - freq[k] += 1  
153 - if not freq:  
154 - return []  
155 -  
156 - preferred_rank = {k: i for i, k in enumerate(PREFERRED_OPTION_KEYS)}  
157 -  
158 - def key_sort(k):  
159 - return (preferred_rank.get(k, 10 ** 6), -freq[k], k.lower())  
160 -  
161 - keys = sorted(freq.keys(), key=key_sort)  
162 - return keys[:max_keys]  
163 -  
164 -  
165 -def build_description_html(title, details, product_url):  
166 - parts = []  
167 - if title:  
168 - parts.append("<p>{}</p>".format(html_escape(title)))  
169 - detail_items = [x.strip() for x in clean_str(details).split("|") if x.strip()]  
170 - if detail_items:  
171 - li = "".join(["<li>{}</li>".format(html_escape(x)) for x in detail_items[:30]])  
172 - parts.append("<ul>{}</ul>".format(li))  
173 - if product_url:  
174 - parts.append('<p>Source: <a href="{0}">{0}</a></p>'.format(html_escape(product_url)))  
175 - return "".join(parts)  
176 -  
177 -  
178 -def competitor_sheet(ws):  
179 - """  
180 - Build (header->col_index) for competitor sheet.  
181 - Assumes header is row 1.  
182 - """  
183 - headers = []  
184 - for c in range(1, ws.max_column + 1):  
185 - v = ws.cell(1, c).value  
186 - headers.append(clean_str(v))  
187 - idx = {h: i + 1 for i, h in enumerate(headers) if h}  
188 - return idx  
189 -  
190 -  
191 -def read_competitor_rows_from_file(xlsx_path, max_rows=None):  
192 - wb = load_workbook(xlsx_path, read_only=True, data_only=True)  
193 - # pick first non-Notes sheet  
194 - sheet_name = None  
195 - for name in wb.sheetnames:  
196 - if str(name).lower() == "notes":  
197 - continue  
198 - sheet_name = name  
199 - break  
200 - if sheet_name is None:  
201 - return []  
202 - ws = wb[sheet_name]  
203 - idx = competitor_sheet(ws)  
204 -  
205 - required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)",  
206 - "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接",  
207 - "商品重量(单位换算)", "商品重量", "商品尺寸"]  
208 - for k in required:  
209 - if k not in idx:  
210 - raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name))  
211 -  
212 - rows = []  
213 - end_row = ws.max_row  
214 - if max_rows is not None:  
215 - end_row = min(end_row, 1 + int(max_rows))  
216 -  
217 - for r in range(2, end_row + 1):  
218 - asin = clean_str(ws.cell(r, idx["ASIN"]).value)  
219 - if not asin:  
220 - continue  
221 - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin  
222 - row = {  
223 - "ASIN": asin,  
224 - "父ASIN": parent,  
225 - "SKU": clean_str(ws.cell(r, idx["SKU"]).value),  
226 - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value),  
227 - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value),  
228 - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value),  
229 - "价格($)": ws.cell(r, idx["价格($)"]).value,  
230 - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value,  
231 - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value),  
232 - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value),  
233 - "大类目": clean_str(ws.cell(r, idx["大类目"]).value),  
234 - "小类目": clean_str(ws.cell(r, idx["小类目"]).value),  
235 - "品牌": clean_str(ws.cell(r, idx["品牌"]).value),  
236 - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value),  
237 - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value),  
238 - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value),  
239 - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value),  
240 - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value),  
241 - }  
242 - rows.append(row)  
243 - return rows  
244 -  
245 -  
246 -def to_price(v):  
247 - if v is None or v == "":  
248 - return None  
249 - try:  
250 - return float(v)  
251 - except Exception:  
252 - s = clean_str(v)  
253 - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", s)  
254 - if not m:  
255 - return None  
256 - return float(m.group(1))  
257 -  
258 -  
259 -def build_common_fields(base_row, spu_id):  
260 - title = base_row.get("商品标题") or "Product"  
261 - brand = base_row.get("品牌") or ""  
262 - big_cat = base_row.get("大类目") or ""  
263 - small_cat = base_row.get("小类目") or ""  
264 - cat_path = base_row.get("类目路径") or ""  
265 -  
266 - handle = generate_handle(title)  
267 - if handle and not handle.startswith("products/"):  
268 - handle = "products/{}".format(handle)  
269 -  
270 - seo_title = title  
271 - seo_desc_parts = []  
272 - if brand:  
273 - seo_desc_parts.append(brand)  
274 - seo_desc_parts.append(title)  
275 - if big_cat:  
276 - seo_desc_parts.append(big_cat)  
277 - seo_description = " ".join([x for x in seo_desc_parts if x])[:5000]  
278 -  
279 - seo_keywords = ",".join([x for x in [title, brand, big_cat, small_cat] if x])  
280 - tags = ",".join([x for x in [brand, big_cat, small_cat] if x])  
281 -  
282 - created_at = parse_date_to_template(base_row.get("上架时间"))  
283 -  
284 - description = build_description_html(  
285 - title=title,  
286 - details=base_row.get("详细参数"),  
287 - product_url=base_row.get("商品详情页链接"),  
288 - )  
289 -  
290 - # default inventory settings (data source has no stock)  
291 - inventory_qty = 100  
292 -  
293 - weight_val, weight_unit = parse_weight(base_row.get("商品重量(单位换算)"), base_row.get("商品重量"))  
294 - size_info = parse_dimensions_inches(base_row.get("商品尺寸"))  
295 -  
296 - album = big_cat or ""  
297 - if not album and cat_path:  
298 - album = cat_path.split(":")[0]  
299 -  
300 - common = {  
301 - "商品ID": "",  
302 - "创建时间": created_at,  
303 - "商品标题*": title[:255],  
304 - "商品副标题": "{} {}".format(brand, big_cat).strip()[:600],  
305 - "商品描述": description,  
306 - "SEO标题": seo_title[:5000],  
307 - "SEO描述": seo_description,  
308 - "SEO URL Handle": handle,  
309 - "SEO URL 重定向": "N",  
310 - "SEO关键词": seo_keywords[:5000],  
311 - "商品上架": "Y",  
312 - "需要物流": "Y",  
313 - "商品收税": "N",  
314 - "商品spu": spu_id[:100],  
315 - "启用虚拟销量": "N",  
316 - "虚拟销量值": "",  
317 - "跟踪库存": "Y",  
318 - "库存规则*": "1",  
319 - "专辑名称": album,  
320 - "标签": tags,  
321 - "供应商名称": "Amazon",  
322 - "供应商URL": base_row.get("商品详情页链接") or base_row.get("品牌链接") or "",  
323 - "商品重量": weight_val if weight_val != "" else "",  
324 - "重量单位": weight_unit,  
325 - "商品库存": inventory_qty,  
326 - "尺寸信息": size_info,  
327 - "原产地国别": "",  
328 - "HS(协调制度)代码": "",  
329 - "商品备注": "ASIN:{}; ParentASIN:{}; CategoryPath:{}".format(  
330 - base_row.get("ASIN", ""), spu_id, (cat_path[:200] if cat_path else "")  
331 - )[:500],  
332 - "款式备注": "",  
333 - }  
334 - return common  
335 -  
336 -  
337 -def build_s_row(base_row):  
338 - spu_id = base_row.get("父ASIN") or base_row.get("ASIN")  
339 - common = build_common_fields(base_row, spu_id=spu_id)  
340 - price = to_price(base_row.get("prime价格($)")) or to_price(base_row.get("价格($)")) or 9.99  
341 - image = base_row.get("商品主图") or ""  
342 -  
343 - row = {}  
344 - row.update(common)  
345 - row.update({  
346 - "商品属性*": "S",  
347 - "款式1": "",  
348 - "款式2": "",  
349 - "款式3": "",  
350 - "商品售价*": price,  
351 - "商品原价": price,  
352 - "成本价": "",  
353 - "商品SKU": base_row.get("ASIN") or "",  
354 - "商品条形码": "",  
355 - "商品图片*": image,  
356 - "商品主图": image,  
357 - })  
358 - return row  
359 -  
360 -  
361 -def build_m_p_rows(variant_rows):  
362 - """  
363 - variant_rows: List[dict] with same 父ASIN.  
364 - """  
365 - base = variant_rows[0]  
366 - spu_id = base.get("父ASIN") or base.get("ASIN")  
367 - common = build_common_fields(base, spu_id=spu_id)  
368 -  
369 - option_dicts = [parse_sku_options(v.get("SKU")) for v in variant_rows]  
370 - option_keys = choose_option_keys(option_dicts, max_keys=3)  
371 - if not option_keys:  
372 - option_keys = ["Variant"]  
373 -  
374 - # M row  
375 - m = {}  
376 - m.update(common)  
377 - m.update({  
378 - "商品属性*": "M",  
379 - "款式1": option_keys[0] if len(option_keys) > 0 else "",  
380 - "款式2": option_keys[1] if len(option_keys) > 1 else "",  
381 - "款式3": option_keys[2] if len(option_keys) > 2 else "",  
382 - "商品售价*": "",  
383 - "商品原价": "",  
384 - "成本价": "",  
385 - "商品SKU": "",  
386 - "商品条形码": "",  
387 - "商品图片*": base.get("商品主图") or "",  
388 - "商品主图": base.get("商品主图") or "",  
389 - })  
390 -  
391 - # For M row, these SKU-level fields should be empty per template guidance  
392 - m["商品重量"] = ""  
393 - m["重量单位"] = ""  
394 - m["商品库存"] = ""  
395 - m["尺寸信息"] = ""  
396 -  
397 - rows = [m]  
398 -  
399 - # P rows  
400 - for v in variant_rows:  
401 - v_common = build_common_fields(v, spu_id=spu_id)  
402 - # wipe SPU-only fields for P row  
403 - v_common.update({  
404 - "商品副标题": "",  
405 - "商品描述": "",  
406 - "SEO标题": "",  
407 - "SEO描述": "",  
408 - "SEO URL Handle": "",  
409 - "SEO URL 重定向": "",  
410 - "SEO关键词": "",  
411 - "专辑名称": "",  
412 - "标签": "",  
413 - "供应商名称": "",  
414 - "供应商URL": "",  
415 - "商品备注": "",  
416 - })  
417 -  
418 - opt = parse_sku_options(v.get("SKU"))  
419 - if option_keys == ["Variant"]:  
420 - opt_vals = [v.get("ASIN")]  
421 - else:  
422 - opt_vals = [opt.get(k, "") for k in option_keys]  
423 -  
424 - price = to_price(v.get("prime价格($)")) or to_price(v.get("价格($)")) or 9.99  
425 - image = v.get("商品主图") or ""  
426 -  
427 - p = {}  
428 - p.update(v_common)  
429 - p.update({  
430 - "商品属性*": "P",  
431 - "款式1": opt_vals[0] if len(opt_vals) > 0 else "",  
432 - "款式2": opt_vals[1] if len(opt_vals) > 1 else "",  
433 - "款式3": opt_vals[2] if len(opt_vals) > 2 else "",  
434 - "商品售价*": price,  
435 - "商品原价": price,  
436 - "成本价": "",  
437 - "商品SKU": v.get("ASIN") or "",  
438 - "商品条形码": "",  
439 - # P row supports one variant image; we use variant's main image  
440 - "商品图片*": image,  
441 - "商品主图": "",  
442 - })  
443 - rows.append(p)  
444 -  
445 - return rows  
446 -  
447 -  
448 -def main():  
449 - parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx (deprecated script name)")  
450 - parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")  
451 - parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")  
452 - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path")  
453 - parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)")  
454 - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)")  
455 - parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")  
456 - args = parser.parse_args()  
457 -  
458 - input_dir = args.input_dir  
459 - if not os.path.isdir(input_dir):  
460 - raise RuntimeError("input-dir not found: {}".format(input_dir))  
461 - if not os.path.exists(args.template):  
462 - raise RuntimeError("template not found: {}".format(args.template))  
463 -  
464 - files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.lower().endswith(".xlsx")]  
465 - files.sort()  
466 - if args.max_files is not None:  
467 - files = files[: int(args.max_files)]  
468 -  
469 - print("Reading Amazon-format files: {} (from {})".format(len(files), input_dir), flush=True)  
470 -  
471 - groups = defaultdict(list) # spu_id -> [variant rows]  
472 - seen_asin = set()  
473 -  
474 - for fp in files:  
475 - print(" - loading: {}".format(fp), flush=True)  
476 - try:  
477 - rows = read_competitor_rows_from_file(fp, max_rows=args.max_rows_per_file)  
478 - except Exception as e:  
479 - print("WARN: failed to read {}: {}".format(fp, e))  
480 - continue  
481 - print(" loaded rows: {}".format(len(rows)), flush=True)  
482 -  
483 - for r in rows:  
484 - asin = r.get("ASIN")  
485 - if asin in seen_asin:  
486 - continue  
487 - seen_asin.add(asin)  
488 - spu_id = r.get("父ASIN") or asin  
489 - groups[spu_id].append(r)  
490 -  
491 - print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)  
492 -  
493 - excel_rows = []  
494 - spu_count = 0  
495 -  
496 - for spu_id, variants in groups.items():  
497 - if not variants:  
498 - continue  
499 - spu_count += 1  
500 - if args.max_products is not None and spu_count > int(args.max_products):  
501 - break  
502 - if len(variants) == 1:  
503 - excel_rows.append(build_s_row(variants[0]))  
504 - else:  
505 - excel_rows.extend(build_m_p_rows(variants))  
506 20
507 - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True)  
508 - create_excel_from_template(args.template, args.output, excel_rows) 21 +from amazon_xlsx_to_shoplazza_xlsx import main as amazon_main
509 22
510 23
511 if __name__ == "__main__": 24 if __name__ == "__main__":
512 - main() 25 + amazon_main()
513 26
514 27
scripts/shoplazza_excel_template.py
@@ -6,6 +6,7 @@ based on the provided template `docs/商品导入模板.xlsx`. @@ -6,6 +6,7 @@ based on the provided template `docs/商品导入模板.xlsx`.
6 We keep this in `scripts/` to maximize reuse by existing ad-hoc pipeline scripts. 6 We keep this in `scripts/` to maximize reuse by existing ad-hoc pipeline scripts.
7 """ 7 """
8 8
  9 +from openpyxl import Workbook
9 from openpyxl import load_workbook 10 from openpyxl import load_workbook
10 from openpyxl.styles import Alignment 11 from openpyxl.styles import Alignment
11 12
@@ -46,14 +47,15 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro @@ -46,14 +47,15 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro
46 for col in range(1, ws.max_column + 1): 47 for col in range(1, ws.max_column + 1):
47 ws.cell(row=row, column=col).value = None 48 ws.cell(row=row, column=col).value = None
48 49
49 - # Write data rows 50 + # Write data rows (OPT: only write fields that actually exist in excel_row)
  51 + # This avoids looping over all 42 template columns for every output row.
50 for row_idx, excel_row in enumerate(excel_rows): 52 for row_idx, excel_row in enumerate(excel_rows):
51 excel_row_num = data_start_row + row_idx 53 excel_row_num = data_start_row + row_idx
52 - for field_name, col_idx in column_mapping.items():  
53 - if field_name not in excel_row: 54 + for field_name, value in excel_row.items():
  55 + col_idx = column_mapping.get(field_name)
  56 + if not col_idx:
54 continue 57 continue
55 cell = ws.cell(row=excel_row_num, column=col_idx) 58 cell = ws.cell(row=excel_row_num, column=col_idx)
56 - value = excel_row[field_name]  
57 cell.value = value 59 cell.value = value
58 if isinstance(value, str): 60 if isinstance(value, str):
59 cell.alignment = Alignment(vertical='top', wrap_text=True) 61 cell.alignment = Alignment(vertical='top', wrap_text=True)
@@ -65,3 +67,67 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro @@ -65,3 +67,67 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro
65 print(" - Total rows: {}".format(len(excel_rows))) 67 print(" - Total rows: {}".format(len(excel_rows)))
66 68
67 69
  70 +def create_excel_from_template_fast(template_file, output_file, excel_rows, header_row_idx=2, data_start_row=4):
  71 + """
  72 + Faster writer for large datasets.
  73 +
  74 + Instead of opening the template workbook in write mode and assigning cells one by one,
  75 + we:
  76 + - read the template's first (data_start_row-1) rows as values
  77 + - build a header->index mapping from header_row_idx
  78 + - create a new write_only workbook and append rows
  79 +
  80 + This is much faster for tens/hundreds of thousands of cells.
  81 + """
  82 + tpl_wb = load_workbook(template_file, read_only=True, data_only=True)
  83 + tpl_ws = tpl_wb.active
  84 +
  85 + max_col = tpl_ws.max_column
  86 +
  87 + # Copy template "instruction" rows (typically rows 1-3) into output
  88 + prefix_rows = list(tpl_ws.iter_rows(min_row=1, max_row=data_start_row - 1, values_only=True))
  89 +
  90 + header_values = None
  91 + if 1 <= header_row_idx <= len(prefix_rows):
  92 + header_values = prefix_rows[header_row_idx - 1]
  93 + else:
  94 + # Fallback: read header row directly
  95 + header_values = next(tpl_ws.iter_rows(min_row=header_row_idx, max_row=header_row_idx, values_only=True))
  96 +
  97 + header_values = list(header_values)[:max_col]
  98 + col_map = {}
  99 + for i, v in enumerate(header_values):
  100 + if v is None:
  101 + continue
  102 + col_map[str(v).strip()] = i # 0-based
  103 +
  104 + wb = Workbook(write_only=True)
  105 + ws = wb.create_sheet(title=tpl_ws.title)
  106 + # remove default sheet if present (openpyxl may create one)
  107 + if "Sheet" in wb.sheetnames and wb["Sheet"] is not ws:
  108 + try:
  109 + wb.remove(wb["Sheet"])
  110 + except Exception:
  111 + pass
  112 +
  113 + # Write prefix rows, normalized to max_col
  114 + for r in prefix_rows:
  115 + r = list(r)[:max_col]
  116 + if len(r) < max_col:
  117 + r = r + [None] * (max_col - len(r))
  118 + ws.append(r)
  119 +
  120 + # Write data rows
  121 + for excel_row in excel_rows:
  122 + row_vals = [None] * max_col
  123 + for field_name, value in excel_row.items():
  124 + if field_name not in col_map:
  125 + continue
  126 + row_vals[col_map[field_name]] = value
  127 + ws.append(row_vals)
  128 +
  129 + wb.save(output_file)
  130 + print("Excel file created (fast): {}".format(output_file))
  131 + print(" - Total rows: {}".format(len(excel_rows)))
  132 +
  133 +