Commit 80519ec6d17d0be1d524596395f9f911cf0a0923
1 parent
cd29428b
amazon -> shoplazza
Showing
4 changed files
with
113 additions
and
520 deletions
Show diff stats
docs/亚马逊格式数据转店匠商品导入模板.md
| @@ -116,6 +116,12 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | @@ -116,6 +116,12 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | ||
| 116 | --max-files 1 --max-rows-per-file 2000 --max-products 50 | 116 | --max-files 1 --max-rows-per-file 2000 --max-products 50 |
| 117 | ``` | 117 | ``` |
| 118 | 118 | ||
| 119 | +### 性能提示(很重要) | ||
| 120 | + | ||
| 121 | +- 旧实现如果用 `ws.cell()` 逐格读取/写入,处理 1 个 xlsx 就可能非常慢(分钟级甚至更久)。 | ||
| 122 | +- 当前脚本已经使用 **`iter_rows(values_only=True)`** 做快速读取,并默认启用 **fast writer**(写出时不逐格写模板)。 | ||
| 123 | +- 如需使用慢速的“按模板逐格写入”(不推荐),可加:`--no-fast-write` | ||
| 124 | + | ||
| 119 | ### 2)生成全量 | 125 | ### 2)生成全量 |
| 120 | 126 | ||
| 121 | ```bash | 127 | ```bash |
scripts/amazon_xlsx_to_shoplazza_xlsx.py
| @@ -36,7 +36,7 @@ from openpyxl import load_workbook | @@ -36,7 +36,7 @@ from openpyxl import load_workbook | ||
| 36 | 36 | ||
| 37 | # Allow running as `python scripts/xxx.py` without installing as a package | 37 | # Allow running as `python scripts/xxx.py` without installing as a package |
| 38 | sys.path.insert(0, str(Path(__file__).resolve().parent)) | 38 | sys.path.insert(0, str(Path(__file__).resolve().parent)) |
| 39 | -from shoplazza_excel_template import create_excel_from_template | 39 | +from shoplazza_excel_template import create_excel_from_template, create_excel_from_template_fast |
| 40 | 40 | ||
| 41 | 41 | ||
| 42 | PREFERRED_OPTION_KEYS = [ | 42 | PREFERRED_OPTION_KEYS = [ |
| @@ -210,35 +210,39 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | @@ -210,35 +210,39 @@ def read_amazon_rows_from_file(xlsx_path, max_rows=None): | ||
| 210 | if k not in idx: | 210 | if k not in idx: |
| 211 | raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) | 211 | raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) |
| 212 | 212 | ||
| 213 | + # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. | ||
| 214 | + # openpyxl cell access is relatively expensive; values_only is much faster. | ||
| 215 | + pos = {k: idx[k] - 1 for k in required} # 0-based positions in row tuple | ||
| 216 | + | ||
| 213 | rows = [] | 217 | rows = [] |
| 214 | end_row = ws.max_row | 218 | end_row = ws.max_row |
| 215 | if max_rows is not None: | 219 | if max_rows is not None: |
| 216 | end_row = min(end_row, 1 + int(max_rows)) | 220 | end_row = min(end_row, 1 + int(max_rows)) |
| 217 | 221 | ||
| 218 | - for r in range(2, end_row + 1): | ||
| 219 | - asin = clean_str(ws.cell(r, idx["ASIN"]).value) | 222 | + for tup in ws.iter_rows(min_row=2, max_row=end_row, values_only=True): |
| 223 | + asin = clean_str(tup[pos["ASIN"]]) | ||
| 220 | if not asin: | 224 | if not asin: |
| 221 | continue | 225 | continue |
| 222 | - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin | 226 | + parent = clean_str(tup[pos["父ASIN"]]) or asin |
| 223 | rows.append({ | 227 | rows.append({ |
| 224 | "ASIN": asin, | 228 | "ASIN": asin, |
| 225 | "父ASIN": parent, | 229 | "父ASIN": parent, |
| 226 | - "SKU": clean_str(ws.cell(r, idx["SKU"]).value), | ||
| 227 | - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value), | ||
| 228 | - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value), | ||
| 229 | - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value), | ||
| 230 | - "价格($)": ws.cell(r, idx["价格($)"]).value, | ||
| 231 | - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value, | ||
| 232 | - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value), | ||
| 233 | - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value), | ||
| 234 | - "大类目": clean_str(ws.cell(r, idx["大类目"]).value), | ||
| 235 | - "小类目": clean_str(ws.cell(r, idx["小类目"]).value), | ||
| 236 | - "品牌": clean_str(ws.cell(r, idx["品牌"]).value), | ||
| 237 | - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value), | ||
| 238 | - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value), | ||
| 239 | - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value), | ||
| 240 | - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value), | ||
| 241 | - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value), | 230 | + "SKU": clean_str(tup[pos["SKU"]]), |
| 231 | + "详细参数": clean_str(tup[pos["详细参数"]]), | ||
| 232 | + "商品标题": clean_str(tup[pos["商品标题"]]), | ||
| 233 | + "商品主图": clean_str(tup[pos["商品主图"]]), | ||
| 234 | + "价格($)": tup[pos["价格($)"]], | ||
| 235 | + "prime价格($)": tup[pos["prime价格($)"]], | ||
| 236 | + "上架时间": clean_str(tup[pos["上架时间"]]), | ||
| 237 | + "类目路径": clean_str(tup[pos["类目路径"]]), | ||
| 238 | + "大类目": clean_str(tup[pos["大类目"]]), | ||
| 239 | + "小类目": clean_str(tup[pos["小类目"]]), | ||
| 240 | + "品牌": clean_str(tup[pos["品牌"]]), | ||
| 241 | + "品牌链接": clean_str(tup[pos["品牌链接"]]), | ||
| 242 | + "商品详情页链接": clean_str(tup[pos["商品详情页链接"]]), | ||
| 243 | + "商品重量(单位换算)": clean_str(tup[pos["商品重量(单位换算)"]]), | ||
| 244 | + "商品重量": clean_str(tup[pos["商品重量"]]), | ||
| 245 | + "商品尺寸": clean_str(tup[pos["商品尺寸"]]), | ||
| 242 | }) | 246 | }) |
| 243 | return rows | 247 | return rows |
| 244 | 248 | ||
| @@ -417,6 +421,7 @@ def main(): | @@ -417,6 +421,7 @@ def main(): | ||
| 417 | parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") | 421 | parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") |
| 418 | parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") | 422 | parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") |
| 419 | parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") | 423 | parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") |
| 424 | + parser.add_argument("--no-fast-write", action="store_true", help="Disable fast writer (use template cell-by-cell write; slower)") | ||
| 420 | parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") | 425 | parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") |
| 421 | parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") | 426 | parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") |
| 422 | parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") | 427 | parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") |
| @@ -471,7 +476,10 @@ def main(): | @@ -471,7 +476,10 @@ def main(): | ||
| 471 | excel_rows.extend(build_m_p_rows(variants)) | 476 | excel_rows.extend(build_m_p_rows(variants)) |
| 472 | 477 | ||
| 473 | print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) | 478 | print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) |
| 474 | - create_excel_from_template(args.template, args.output, excel_rows) | 479 | + if args.no_fast_write: |
| 480 | + create_excel_from_template(args.template, args.output, excel_rows) | ||
| 481 | + else: | ||
| 482 | + create_excel_from_template_fast(args.template, args.output, excel_rows) | ||
| 475 | 483 | ||
| 476 | 484 | ||
| 477 | if __name__ == "__main__": | 485 | if __name__ == "__main__": |
scripts/competitor_xlsx_to_shoplazza_xlsx.py
| 1 | #!/usr/bin/env python3 | 1 | #!/usr/bin/env python3 |
| 2 | """ | 2 | """ |
| 3 | -DEPRECATED NAME (kept for backward compatibility). | 3 | +DEPRECATED SCRIPT NAME (kept for backward compatibility). |
| 4 | 4 | ||
| 5 | -The input `products_data/*.xlsx` files are **Amazon-format exports** (with Parent/Child ASIN), | ||
| 6 | -not “competitor data”. Please use: | 5 | +The input `data/mai_jia_jing_ling/products_data/*.xlsx` files are Amazon-format exports |
| 6 | +(Parent/Child ASIN), not “competitor data”. | ||
| 7 | 7 | ||
| 8 | +Please use: | ||
| 8 | - `scripts/amazon_xlsx_to_shoplazza_xlsx.py` | 9 | - `scripts/amazon_xlsx_to_shoplazza_xlsx.py` |
| 9 | 10 | ||
| 10 | -This script keeps the same logic but updates user-facing naming gradually. | 11 | +This wrapper simply forwards all CLI args to the correctly named script, so you |
| 12 | +automatically get the latest performance improvements (fast read/write). | ||
| 11 | """ | 13 | """ |
| 12 | 14 | ||
| 13 | -import os | ||
| 14 | -import re | ||
| 15 | import sys | 15 | import sys |
| 16 | -import argparse | ||
| 17 | -from datetime import datetime | ||
| 18 | -from collections import defaultdict, Counter | ||
| 19 | from pathlib import Path | 16 | from pathlib import Path |
| 20 | 17 | ||
| 21 | -from openpyxl import load_workbook | ||
| 22 | - | ||
| 23 | # Allow running as `python scripts/xxx.py` without installing as a package | 18 | # Allow running as `python scripts/xxx.py` without installing as a package |
| 24 | sys.path.insert(0, str(Path(__file__).resolve().parent)) | 19 | sys.path.insert(0, str(Path(__file__).resolve().parent)) |
| 25 | -from shoplazza_excel_template import create_excel_from_template | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -PREFERRED_OPTION_KEYS = [ | ||
| 29 | - "Size", "Color", "Style", "Pattern", "Material", "Flavor", "Scent", | ||
| 30 | - "Pack", "Pack of", "Number of Items", "Count", "Capacity", "Length", | ||
| 31 | - "Width", "Height", "Model", "Configuration", | ||
| 32 | -] | ||
| 33 | - | ||
| 34 | - | ||
| 35 | -def clean_str(v): | ||
| 36 | - if v is None: | ||
| 37 | - return "" | ||
| 38 | - return str(v).strip() | ||
| 39 | - | ||
| 40 | - | ||
| 41 | -def html_escape(s): | ||
| 42 | - s = clean_str(s) | ||
| 43 | - return (s.replace("&", "&amp;") | ||
| 44 | - .replace("<", "&lt;") | ||
| 45 | - .replace(">", "&gt;")) | ||
| 46 | - | ||
| 47 | - | ||
| 48 | -def generate_handle(title): | ||
| 49 | - """ | ||
| 50 | - Generate URL-friendly handle from title (ASCII only). | ||
| 51 | - Keep consistent with existing scripts. | ||
| 52 | - """ | ||
| 53 | - handle = clean_str(title).lower() | ||
| 54 | - handle = re.sub(r"[^a-z0-9\s-]", "", handle) | ||
| 55 | - handle = re.sub(r"[-\s]+", "-", handle).strip("-") | ||
| 56 | - if len(handle) > 255: | ||
| 57 | - handle = handle[:255] | ||
| 58 | - return handle or "product" | ||
| 59 | - | ||
| 60 | - | ||
| 61 | -def parse_date_to_template(dt_value): | ||
| 62 | - """ | ||
| 63 | - Template expects: YYYY-MM-DD HH:MM:SS | ||
| 64 | - Input could be "2018-05-09" or datetime/date. | ||
| 65 | - """ | ||
| 66 | - if dt_value is None or dt_value == "": | ||
| 67 | - return "" | ||
| 68 | - if isinstance(dt_value, datetime): | ||
| 69 | - return dt_value.strftime("%Y-%m-%d %H:%M:%S") | ||
| 70 | - s = clean_str(dt_value) | ||
| 71 | - # common formats | ||
| 72 | - for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S"): | ||
| 73 | - try: | ||
| 74 | - d = datetime.strptime(s, fmt) | ||
| 75 | - return d.strftime("%Y-%m-%d %H:%M:%S") | ||
| 76 | - except Exception: | ||
| 77 | - pass | ||
| 78 | - return "" | ||
| 79 | - | ||
| 80 | - | ||
| 81 | -def parse_weight(weight_conv, weight_raw): | ||
| 82 | - """ | ||
| 83 | - Return (weight_value, unit) where unit in {kg, lb, g, oz}. | ||
| 84 | - Prefer '商品重量(单位换算)' like '68.04 g'. | ||
| 85 | - Fallback to '商品重量' like '0.15 pounds'. | ||
| 86 | - """ | ||
| 87 | - s = clean_str(weight_conv) or clean_str(weight_raw) | ||
| 88 | - if not s: | ||
| 89 | - return ("", "") | ||
| 90 | - m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([a-zA-Z]+)", s) | ||
| 91 | - if not m: | ||
| 92 | - return ("", "") | ||
| 93 | - val = float(m.group(1)) | ||
| 94 | - unit = m.group(2).lower() | ||
| 95 | - if unit in ("g", "gram", "grams"): | ||
| 96 | - return (val, "g") | ||
| 97 | - if unit in ("kg", "kilogram", "kilograms"): | ||
| 98 | - return (val, "kg") | ||
| 99 | - if unit in ("lb", "lbs", "pound", "pounds"): | ||
| 100 | - return (val, "lb") | ||
| 101 | - if unit in ("oz", "ounce", "ounces"): | ||
| 102 | - return (val, "oz") | ||
| 103 | - return ("", "") | ||
| 104 | - | ||
| 105 | - | ||
| 106 | -def parse_dimensions_inches(dim_raw): | ||
| 107 | - """ | ||
| 108 | - Template '尺寸信息': 'L,W,H' in inches. | ||
| 109 | - Input example: '7.9 x 7.9 x 2 inches' | ||
| 110 | - """ | ||
| 111 | - s = clean_str(dim_raw) | ||
| 112 | - if not s: | ||
| 113 | - return "" | ||
| 114 | - # extract first 3 numbers in order | ||
| 115 | - nums = re.findall(r"([0-9]+(?:\.[0-9]+)?)", s) | ||
| 116 | - if len(nums) < 3: | ||
| 117 | - return "" | ||
| 118 | - return "{},{},{}".format(nums[0], nums[1], nums[2]) | ||
| 119 | - | ||
| 120 | - | ||
| 121 | -def parse_sku_options(sku_text): | ||
| 122 | - """ | ||
| 123 | - Parse 'SKU' column into {key: value}. | ||
| 124 | - Example: | ||
| 125 | - 'Size: One Size | Color: Black' -> {'Size':'One Size','Color':'Black'} | ||
| 126 | - """ | ||
| 127 | - s = clean_str(sku_text) | ||
| 128 | - if not s: | ||
| 129 | - return {} | ||
| 130 | - parts = [p.strip() for p in s.split("|") if p.strip()] | ||
| 131 | - out = {} | ||
| 132 | - for p in parts: | ||
| 133 | - if ":" not in p: | ||
| 134 | - continue | ||
| 135 | - k, v = p.split(":", 1) | ||
| 136 | - k = clean_str(k) | ||
| 137 | - v = clean_str(v) | ||
| 138 | - if k and v: | ||
| 139 | - out[k] = v | ||
| 140 | - return out | ||
| 141 | - | ||
| 142 | - | ||
| 143 | -def choose_option_keys(variant_dicts, max_keys=3): | ||
| 144 | - """ | ||
| 145 | - Choose up to 3 option keys for a product group. | ||
| 146 | - Order by preference list first, then by frequency. | ||
| 147 | - """ | ||
| 148 | - freq = Counter() | ||
| 149 | - for d in variant_dicts: | ||
| 150 | - for k, v in d.items(): | ||
| 151 | - if v: | ||
| 152 | - freq[k] += 1 | ||
| 153 | - if not freq: | ||
| 154 | - return [] | ||
| 155 | - | ||
| 156 | - preferred_rank = {k: i for i, k in enumerate(PREFERRED_OPTION_KEYS)} | ||
| 157 | - | ||
| 158 | - def key_sort(k): | ||
| 159 | - return (preferred_rank.get(k, 10 ** 6), -freq[k], k.lower()) | ||
| 160 | - | ||
| 161 | - keys = sorted(freq.keys(), key=key_sort) | ||
| 162 | - return keys[:max_keys] | ||
| 163 | - | ||
| 164 | - | ||
| 165 | -def build_description_html(title, details, product_url): | ||
| 166 | - parts = [] | ||
| 167 | - if title: | ||
| 168 | - parts.append("<p>{}</p>".format(html_escape(title))) | ||
| 169 | - detail_items = [x.strip() for x in clean_str(details).split("|") if x.strip()] | ||
| 170 | - if detail_items: | ||
| 171 | - li = "".join(["<li>{}</li>".format(html_escape(x)) for x in detail_items[:30]]) | ||
| 172 | - parts.append("<ul>{}</ul>".format(li)) | ||
| 173 | - if product_url: | ||
| 174 | - parts.append('<p>Source: <a href="{0}">{0}</a></p>'.format(html_escape(product_url))) | ||
| 175 | - return "".join(parts) | ||
| 176 | - | ||
| 177 | - | ||
| 178 | -def competitor_sheet(ws): | ||
| 179 | - """ | ||
| 180 | - Build (header->col_index) for competitor sheet. | ||
| 181 | - Assumes header is row 1. | ||
| 182 | - """ | ||
| 183 | - headers = [] | ||
| 184 | - for c in range(1, ws.max_column + 1): | ||
| 185 | - v = ws.cell(1, c).value | ||
| 186 | - headers.append(clean_str(v)) | ||
| 187 | - idx = {h: i + 1 for i, h in enumerate(headers) if h} | ||
| 188 | - return idx | ||
| 189 | - | ||
| 190 | - | ||
| 191 | -def read_competitor_rows_from_file(xlsx_path, max_rows=None): | ||
| 192 | - wb = load_workbook(xlsx_path, read_only=True, data_only=True) | ||
| 193 | - # pick first non-Notes sheet | ||
| 194 | - sheet_name = None | ||
| 195 | - for name in wb.sheetnames: | ||
| 196 | - if str(name).lower() == "notes": | ||
| 197 | - continue | ||
| 198 | - sheet_name = name | ||
| 199 | - break | ||
| 200 | - if sheet_name is None: | ||
| 201 | - return [] | ||
| 202 | - ws = wb[sheet_name] | ||
| 203 | - idx = competitor_sheet(ws) | ||
| 204 | - | ||
| 205 | - required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", | ||
| 206 | - "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", | ||
| 207 | - "商品重量(单位换算)", "商品重量", "商品尺寸"] | ||
| 208 | - for k in required: | ||
| 209 | - if k not in idx: | ||
| 210 | - raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) | ||
| 211 | - | ||
| 212 | - rows = [] | ||
| 213 | - end_row = ws.max_row | ||
| 214 | - if max_rows is not None: | ||
| 215 | - end_row = min(end_row, 1 + int(max_rows)) | ||
| 216 | - | ||
| 217 | - for r in range(2, end_row + 1): | ||
| 218 | - asin = clean_str(ws.cell(r, idx["ASIN"]).value) | ||
| 219 | - if not asin: | ||
| 220 | - continue | ||
| 221 | - parent = clean_str(ws.cell(r, idx["父ASIN"]).value) or asin | ||
| 222 | - row = { | ||
| 223 | - "ASIN": asin, | ||
| 224 | - "父ASIN": parent, | ||
| 225 | - "SKU": clean_str(ws.cell(r, idx["SKU"]).value), | ||
| 226 | - "详细参数": clean_str(ws.cell(r, idx["详细参数"]).value), | ||
| 227 | - "商品标题": clean_str(ws.cell(r, idx["商品标题"]).value), | ||
| 228 | - "商品主图": clean_str(ws.cell(r, idx["商品主图"]).value), | ||
| 229 | - "价格($)": ws.cell(r, idx["价格($)"]).value, | ||
| 230 | - "prime价格($)": ws.cell(r, idx["prime价格($)"]).value, | ||
| 231 | - "上架时间": clean_str(ws.cell(r, idx["上架时间"]).value), | ||
| 232 | - "类目路径": clean_str(ws.cell(r, idx["类目路径"]).value), | ||
| 233 | - "大类目": clean_str(ws.cell(r, idx["大类目"]).value), | ||
| 234 | - "小类目": clean_str(ws.cell(r, idx["小类目"]).value), | ||
| 235 | - "品牌": clean_str(ws.cell(r, idx["品牌"]).value), | ||
| 236 | - "品牌链接": clean_str(ws.cell(r, idx["品牌链接"]).value), | ||
| 237 | - "商品详情页链接": clean_str(ws.cell(r, idx["商品详情页链接"]).value), | ||
| 238 | - "商品重量(单位换算)": clean_str(ws.cell(r, idx["商品重量(单位换算)"]).value), | ||
| 239 | - "商品重量": clean_str(ws.cell(r, idx["商品重量"]).value), | ||
| 240 | - "商品尺寸": clean_str(ws.cell(r, idx["商品尺寸"]).value), | ||
| 241 | - } | ||
| 242 | - rows.append(row) | ||
| 243 | - return rows | ||
| 244 | - | ||
| 245 | - | ||
| 246 | -def to_price(v): | ||
| 247 | - if v is None or v == "": | ||
| 248 | - return None | ||
| 249 | - try: | ||
| 250 | - return float(v) | ||
| 251 | - except Exception: | ||
| 252 | - s = clean_str(v) | ||
| 253 | - m = re.search(r"([0-9]+(?:\.[0-9]+)?)", s) | ||
| 254 | - if not m: | ||
| 255 | - return None | ||
| 256 | - return float(m.group(1)) | ||
| 257 | - | ||
| 258 | - | ||
| 259 | -def build_common_fields(base_row, spu_id): | ||
| 260 | - title = base_row.get("商品标题") or "Product" | ||
| 261 | - brand = base_row.get("品牌") or "" | ||
| 262 | - big_cat = base_row.get("大类目") or "" | ||
| 263 | - small_cat = base_row.get("小类目") or "" | ||
| 264 | - cat_path = base_row.get("类目路径") or "" | ||
| 265 | - | ||
| 266 | - handle = generate_handle(title) | ||
| 267 | - if handle and not handle.startswith("products/"): | ||
| 268 | - handle = "products/{}".format(handle) | ||
| 269 | - | ||
| 270 | - seo_title = title | ||
| 271 | - seo_desc_parts = [] | ||
| 272 | - if brand: | ||
| 273 | - seo_desc_parts.append(brand) | ||
| 274 | - seo_desc_parts.append(title) | ||
| 275 | - if big_cat: | ||
| 276 | - seo_desc_parts.append(big_cat) | ||
| 277 | - seo_description = " ".join([x for x in seo_desc_parts if x])[:5000] | ||
| 278 | - | ||
| 279 | - seo_keywords = ",".join([x for x in [title, brand, big_cat, small_cat] if x]) | ||
| 280 | - tags = ",".join([x for x in [brand, big_cat, small_cat] if x]) | ||
| 281 | - | ||
| 282 | - created_at = parse_date_to_template(base_row.get("上架时间")) | ||
| 283 | - | ||
| 284 | - description = build_description_html( | ||
| 285 | - title=title, | ||
| 286 | - details=base_row.get("详细参数"), | ||
| 287 | - product_url=base_row.get("商品详情页链接"), | ||
| 288 | - ) | ||
| 289 | - | ||
| 290 | - # default inventory settings (data source has no stock) | ||
| 291 | - inventory_qty = 100 | ||
| 292 | - | ||
| 293 | - weight_val, weight_unit = parse_weight(base_row.get("商品重量(单位换算)"), base_row.get("商品重量")) | ||
| 294 | - size_info = parse_dimensions_inches(base_row.get("商品尺寸")) | ||
| 295 | - | ||
| 296 | - album = big_cat or "" | ||
| 297 | - if not album and cat_path: | ||
| 298 | - album = cat_path.split(":")[0] | ||
| 299 | - | ||
| 300 | - common = { | ||
| 301 | - "商品ID": "", | ||
| 302 | - "创建时间": created_at, | ||
| 303 | - "商品标题*": title[:255], | ||
| 304 | - "商品副标题": "{} {}".format(brand, big_cat).strip()[:600], | ||
| 305 | - "商品描述": description, | ||
| 306 | - "SEO标题": seo_title[:5000], | ||
| 307 | - "SEO描述": seo_description, | ||
| 308 | - "SEO URL Handle": handle, | ||
| 309 | - "SEO URL 重定向": "N", | ||
| 310 | - "SEO关键词": seo_keywords[:5000], | ||
| 311 | - "商品上架": "Y", | ||
| 312 | - "需要物流": "Y", | ||
| 313 | - "商品收税": "N", | ||
| 314 | - "商品spu": spu_id[:100], | ||
| 315 | - "启用虚拟销量": "N", | ||
| 316 | - "虚拟销量值": "", | ||
| 317 | - "跟踪库存": "Y", | ||
| 318 | - "库存规则*": "1", | ||
| 319 | - "专辑名称": album, | ||
| 320 | - "标签": tags, | ||
| 321 | - "供应商名称": "Amazon", | ||
| 322 | - "供应商URL": base_row.get("商品详情页链接") or base_row.get("品牌链接") or "", | ||
| 323 | - "商品重量": weight_val if weight_val != "" else "", | ||
| 324 | - "重量单位": weight_unit, | ||
| 325 | - "商品库存": inventory_qty, | ||
| 326 | - "尺寸信息": size_info, | ||
| 327 | - "原产地国别": "", | ||
| 328 | - "HS(协调制度)代码": "", | ||
| 329 | - "商品备注": "ASIN:{}; ParentASIN:{}; CategoryPath:{}".format( | ||
| 330 | - base_row.get("ASIN", ""), spu_id, (cat_path[:200] if cat_path else "") | ||
| 331 | - )[:500], | ||
| 332 | - "款式备注": "", | ||
| 333 | - } | ||
| 334 | - return common | ||
| 335 | - | ||
| 336 | - | ||
| 337 | -def build_s_row(base_row): | ||
| 338 | - spu_id = base_row.get("父ASIN") or base_row.get("ASIN") | ||
| 339 | - common = build_common_fields(base_row, spu_id=spu_id) | ||
| 340 | - price = to_price(base_row.get("prime价格($)")) or to_price(base_row.get("价格($)")) or 9.99 | ||
| 341 | - image = base_row.get("商品主图") or "" | ||
| 342 | - | ||
| 343 | - row = {} | ||
| 344 | - row.update(common) | ||
| 345 | - row.update({ | ||
| 346 | - "商品属性*": "S", | ||
| 347 | - "款式1": "", | ||
| 348 | - "款式2": "", | ||
| 349 | - "款式3": "", | ||
| 350 | - "商品售价*": price, | ||
| 351 | - "商品原价": price, | ||
| 352 | - "成本价": "", | ||
| 353 | - "商品SKU": base_row.get("ASIN") or "", | ||
| 354 | - "商品条形码": "", | ||
| 355 | - "商品图片*": image, | ||
| 356 | - "商品主图": image, | ||
| 357 | - }) | ||
| 358 | - return row | ||
| 359 | - | ||
| 360 | - | ||
| 361 | -def build_m_p_rows(variant_rows): | ||
| 362 | - """ | ||
| 363 | - variant_rows: List[dict] with same 父ASIN. | ||
| 364 | - """ | ||
| 365 | - base = variant_rows[0] | ||
| 366 | - spu_id = base.get("父ASIN") or base.get("ASIN") | ||
| 367 | - common = build_common_fields(base, spu_id=spu_id) | ||
| 368 | - | ||
| 369 | - option_dicts = [parse_sku_options(v.get("SKU")) for v in variant_rows] | ||
| 370 | - option_keys = choose_option_keys(option_dicts, max_keys=3) | ||
| 371 | - if not option_keys: | ||
| 372 | - option_keys = ["Variant"] | ||
| 373 | - | ||
| 374 | - # M row | ||
| 375 | - m = {} | ||
| 376 | - m.update(common) | ||
| 377 | - m.update({ | ||
| 378 | - "商品属性*": "M", | ||
| 379 | - "款式1": option_keys[0] if len(option_keys) > 0 else "", | ||
| 380 | - "款式2": option_keys[1] if len(option_keys) > 1 else "", | ||
| 381 | - "款式3": option_keys[2] if len(option_keys) > 2 else "", | ||
| 382 | - "商品售价*": "", | ||
| 383 | - "商品原价": "", | ||
| 384 | - "成本价": "", | ||
| 385 | - "商品SKU": "", | ||
| 386 | - "商品条形码": "", | ||
| 387 | - "商品图片*": base.get("商品主图") or "", | ||
| 388 | - "商品主图": base.get("商品主图") or "", | ||
| 389 | - }) | ||
| 390 | - | ||
| 391 | - # For M row, these SKU-level fields should be empty per template guidance | ||
| 392 | - m["商品重量"] = "" | ||
| 393 | - m["重量单位"] = "" | ||
| 394 | - m["商品库存"] = "" | ||
| 395 | - m["尺寸信息"] = "" | ||
| 396 | - | ||
| 397 | - rows = [m] | ||
| 398 | - | ||
| 399 | - # P rows | ||
| 400 | - for v in variant_rows: | ||
| 401 | - v_common = build_common_fields(v, spu_id=spu_id) | ||
| 402 | - # wipe SPU-only fields for P row | ||
| 403 | - v_common.update({ | ||
| 404 | - "商品副标题": "", | ||
| 405 | - "商品描述": "", | ||
| 406 | - "SEO标题": "", | ||
| 407 | - "SEO描述": "", | ||
| 408 | - "SEO URL Handle": "", | ||
| 409 | - "SEO URL 重定向": "", | ||
| 410 | - "SEO关键词": "", | ||
| 411 | - "专辑名称": "", | ||
| 412 | - "标签": "", | ||
| 413 | - "供应商名称": "", | ||
| 414 | - "供应商URL": "", | ||
| 415 | - "商品备注": "", | ||
| 416 | - }) | ||
| 417 | - | ||
| 418 | - opt = parse_sku_options(v.get("SKU")) | ||
| 419 | - if option_keys == ["Variant"]: | ||
| 420 | - opt_vals = [v.get("ASIN")] | ||
| 421 | - else: | ||
| 422 | - opt_vals = [opt.get(k, "") for k in option_keys] | ||
| 423 | - | ||
| 424 | - price = to_price(v.get("prime价格($)")) or to_price(v.get("价格($)")) or 9.99 | ||
| 425 | - image = v.get("商品主图") or "" | ||
| 426 | - | ||
| 427 | - p = {} | ||
| 428 | - p.update(v_common) | ||
| 429 | - p.update({ | ||
| 430 | - "商品属性*": "P", | ||
| 431 | - "款式1": opt_vals[0] if len(opt_vals) > 0 else "", | ||
| 432 | - "款式2": opt_vals[1] if len(opt_vals) > 1 else "", | ||
| 433 | - "款式3": opt_vals[2] if len(opt_vals) > 2 else "", | ||
| 434 | - "商品售价*": price, | ||
| 435 | - "商品原价": price, | ||
| 436 | - "成本价": "", | ||
| 437 | - "商品SKU": v.get("ASIN") or "", | ||
| 438 | - "商品条形码": "", | ||
| 439 | - # P row supports one variant image; we use variant's main image | ||
| 440 | - "商品图片*": image, | ||
| 441 | - "商品主图": "", | ||
| 442 | - }) | ||
| 443 | - rows.append(p) | ||
| 444 | - | ||
| 445 | - return rows | ||
| 446 | - | ||
| 447 | - | ||
| 448 | -def main(): | ||
| 449 | - parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx (deprecated script name)") | ||
| 450 | - parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") | ||
| 451 | - parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") | ||
| 452 | - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path") | ||
| 453 | - parser.add_argument("--max-files", type=int, default=None, help="Limit number of xlsx files to read (for testing)") | ||
| 454 | - parser.add_argument("--max-rows-per-file", type=int, default=None, help="Limit rows per xlsx file (for testing)") | ||
| 455 | - parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") | ||
| 456 | - args = parser.parse_args() | ||
| 457 | - | ||
| 458 | - input_dir = args.input_dir | ||
| 459 | - if not os.path.isdir(input_dir): | ||
| 460 | - raise RuntimeError("input-dir not found: {}".format(input_dir)) | ||
| 461 | - if not os.path.exists(args.template): | ||
| 462 | - raise RuntimeError("template not found: {}".format(args.template)) | ||
| 463 | - | ||
| 464 | - files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.lower().endswith(".xlsx")] | ||
| 465 | - files.sort() | ||
| 466 | - if args.max_files is not None: | ||
| 467 | - files = files[: int(args.max_files)] | ||
| 468 | - | ||
| 469 | - print("Reading Amazon-format files: {} (from {})".format(len(files), input_dir), flush=True) | ||
| 470 | - | ||
| 471 | - groups = defaultdict(list) # spu_id -> [variant rows] | ||
| 472 | - seen_asin = set() | ||
| 473 | - | ||
| 474 | - for fp in files: | ||
| 475 | - print(" - loading: {}".format(fp), flush=True) | ||
| 476 | - try: | ||
| 477 | - rows = read_competitor_rows_from_file(fp, max_rows=args.max_rows_per_file) | ||
| 478 | - except Exception as e: | ||
| 479 | - print("WARN: failed to read {}: {}".format(fp, e)) | ||
| 480 | - continue | ||
| 481 | - print(" loaded rows: {}".format(len(rows)), flush=True) | ||
| 482 | - | ||
| 483 | - for r in rows: | ||
| 484 | - asin = r.get("ASIN") | ||
| 485 | - if asin in seen_asin: | ||
| 486 | - continue | ||
| 487 | - seen_asin.add(asin) | ||
| 488 | - spu_id = r.get("父ASIN") or asin | ||
| 489 | - groups[spu_id].append(r) | ||
| 490 | - | ||
| 491 | - print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) | ||
| 492 | - | ||
| 493 | - excel_rows = [] | ||
| 494 | - spu_count = 0 | ||
| 495 | - | ||
| 496 | - for spu_id, variants in groups.items(): | ||
| 497 | - if not variants: | ||
| 498 | - continue | ||
| 499 | - spu_count += 1 | ||
| 500 | - if args.max_products is not None and spu_count > int(args.max_products): | ||
| 501 | - break | ||
| 502 | - if len(variants) == 1: | ||
| 503 | - excel_rows.append(build_s_row(variants[0])) | ||
| 504 | - else: | ||
| 505 | - excel_rows.extend(build_m_p_rows(variants)) | ||
| 506 | 20 | ||
| 507 | - print("Generated Excel rows: {} (SPU groups output: {})".format(len(excel_rows), min(spu_count, len(groups))), flush=True) | ||
| 508 | - create_excel_from_template(args.template, args.output, excel_rows) | 21 | +from amazon_xlsx_to_shoplazza_xlsx import main as amazon_main |
| 509 | 22 | ||
| 510 | 23 | ||
| 511 | if __name__ == "__main__": | 24 | if __name__ == "__main__": |
| 512 | - main() | 25 | + amazon_main() |
| 513 | 26 | ||
| 514 | 27 |
scripts/shoplazza_excel_template.py
| @@ -6,6 +6,7 @@ based on the provided template `docs/商品导入模板.xlsx`. | @@ -6,6 +6,7 @@ based on the provided template `docs/商品导入模板.xlsx`. | ||
| 6 | We keep this in `scripts/` to maximize reuse by existing ad-hoc pipeline scripts. | 6 | We keep this in `scripts/` to maximize reuse by existing ad-hoc pipeline scripts. |
| 7 | """ | 7 | """ |
| 8 | 8 | ||
| 9 | +from openpyxl import Workbook | ||
| 9 | from openpyxl import load_workbook | 10 | from openpyxl import load_workbook |
| 10 | from openpyxl.styles import Alignment | 11 | from openpyxl.styles import Alignment |
| 11 | 12 | ||
| @@ -46,14 +47,15 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro | @@ -46,14 +47,15 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro | ||
| 46 | for col in range(1, ws.max_column + 1): | 47 | for col in range(1, ws.max_column + 1): |
| 47 | ws.cell(row=row, column=col).value = None | 48 | ws.cell(row=row, column=col).value = None |
| 48 | 49 | ||
| 49 | - # Write data rows | 50 | + # Write data rows (OPT: only write fields that actually exist in excel_row) |
| 51 | + # This avoids looping over all 42 template columns for every output row. | ||
| 50 | for row_idx, excel_row in enumerate(excel_rows): | 52 | for row_idx, excel_row in enumerate(excel_rows): |
| 51 | excel_row_num = data_start_row + row_idx | 53 | excel_row_num = data_start_row + row_idx |
| 52 | - for field_name, col_idx in column_mapping.items(): | ||
| 53 | - if field_name not in excel_row: | 54 | + for field_name, value in excel_row.items(): |
| 55 | + col_idx = column_mapping.get(field_name) | ||
| 56 | + if not col_idx: | ||
| 54 | continue | 57 | continue |
| 55 | cell = ws.cell(row=excel_row_num, column=col_idx) | 58 | cell = ws.cell(row=excel_row_num, column=col_idx) |
| 56 | - value = excel_row[field_name] | ||
| 57 | cell.value = value | 59 | cell.value = value |
| 58 | if isinstance(value, str): | 60 | if isinstance(value, str): |
| 59 | cell.alignment = Alignment(vertical='top', wrap_text=True) | 61 | cell.alignment = Alignment(vertical='top', wrap_text=True) |
| @@ -65,3 +67,67 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro | @@ -65,3 +67,67 @@ def create_excel_from_template(template_file, output_file, excel_rows, header_ro | ||
| 65 | print(" - Total rows: {}".format(len(excel_rows))) | 67 | print(" - Total rows: {}".format(len(excel_rows))) |
| 66 | 68 | ||
| 67 | 69 | ||
| 70 | +def create_excel_from_template_fast(template_file, output_file, excel_rows, header_row_idx=2, data_start_row=4): | ||
| 71 | + """ | ||
| 72 | + Faster writer for large datasets. | ||
| 73 | + | ||
| 74 | + Instead of opening the template workbook in write mode and assigning cells one by one, | ||
| 75 | + we: | ||
| 76 | + - read the template's first (data_start_row-1) rows as values | ||
| 77 | + - build a header->index mapping from header_row_idx | ||
| 78 | + - create a new write_only workbook and append rows | ||
| 79 | + | ||
| 80 | + This is much faster for tens/hundreds of thousands of cells. | ||
| 81 | + """ | ||
| 82 | + tpl_wb = load_workbook(template_file, read_only=True, data_only=True) | ||
| 83 | + tpl_ws = tpl_wb.active | ||
| 84 | + | ||
| 85 | + max_col = tpl_ws.max_column | ||
| 86 | + | ||
| 87 | + # Copy template "instruction" rows (typically rows 1-3) into output | ||
| 88 | + prefix_rows = list(tpl_ws.iter_rows(min_row=1, max_row=data_start_row - 1, values_only=True)) | ||
| 89 | + | ||
| 90 | + header_values = None | ||
| 91 | + if 1 <= header_row_idx <= len(prefix_rows): | ||
| 92 | + header_values = prefix_rows[header_row_idx - 1] | ||
| 93 | + else: | ||
| 94 | + # Fallback: read header row directly | ||
| 95 | + header_values = next(tpl_ws.iter_rows(min_row=header_row_idx, max_row=header_row_idx, values_only=True)) | ||
| 96 | + | ||
| 97 | + header_values = list(header_values)[:max_col] | ||
| 98 | + col_map = {} | ||
| 99 | + for i, v in enumerate(header_values): | ||
| 100 | + if v is None: | ||
| 101 | + continue | ||
| 102 | + col_map[str(v).strip()] = i # 0-based | ||
| 103 | + | ||
| 104 | + wb = Workbook(write_only=True) | ||
| 105 | + ws = wb.create_sheet(title=tpl_ws.title) | ||
| 106 | + # remove default sheet if present (openpyxl may create one) | ||
| 107 | + if "Sheet" in wb.sheetnames and wb["Sheet"] is not ws: | ||
| 108 | + try: | ||
| 109 | + wb.remove(wb["Sheet"]) | ||
| 110 | + except Exception: | ||
| 111 | + pass | ||
| 112 | + | ||
| 113 | + # Write prefix rows, normalized to max_col | ||
| 114 | + for r in prefix_rows: | ||
| 115 | + r = list(r)[:max_col] | ||
| 116 | + if len(r) < max_col: | ||
| 117 | + r = r + [None] * (max_col - len(r)) | ||
| 118 | + ws.append(r) | ||
| 119 | + | ||
| 120 | + # Write data rows | ||
| 121 | + for excel_row in excel_rows: | ||
| 122 | + row_vals = [None] * max_col | ||
| 123 | + for field_name, value in excel_row.items(): | ||
| 124 | + if field_name not in col_map: | ||
| 125 | + continue | ||
| 126 | + row_vals[col_map[field_name]] = value | ||
| 127 | + ws.append(row_vals) | ||
| 128 | + | ||
| 129 | + wb.save(output_file) | ||
| 130 | + print("Excel file created (fast): {}".format(output_file)) | ||
| 131 | + print(" - Total rows: {}".format(len(excel_rows))) | ||
| 132 | + | ||
| 133 | + |