From 32e9b30c71aba5d8ce1c76793107546c6d6a9712 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 9 Apr 2026 23:48:39 +0800 Subject: [PATCH] scripts/ 根目录主要保留启动/编排入口,其他脚本归到了几个固定子目录: --- docs/QUICKSTART.md | 2 +- docs/工作总结-微服务性能优化与架构.md | 4 ++-- scripts/README.md | 8 +++++++- scripts/amazon_xlsx_to_shoplazza_xlsx.py | 615 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/check_data_source.py | 301 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/check_es_data.py | 268 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/check_index_mapping.py | 168 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ scripts/compare_index_mappings.py | 189 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/competitor_xlsx_to_shoplazza_xlsx.py | 27 --------------------------- scripts/csv_to_excel.py | 302 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/csv_to_excel_multi_variant.py | 565 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/daily_log_router.sh | 56 -------------------------------------------------------- scripts/data_import/README.md | 13 +++++++++++++ scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py | 615 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/data_import/competitor_xlsx_to_shoplazza_xlsx.py | 26 ++++++++++++++++++++++++++ scripts/data_import/csv_to_excel.py | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/data_import/csv_to_excel_multi_variant.py | 564 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/data_import/shoplazza_excel_template.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/data_import/shoplazza_import_template.py | 112 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/data_import/tenant3_csv_to_shoplazza_xlsx.sh | 20 ++++++++++++++++++++ scripts/download_translation_models.py | 125 ----------------------------------------------------------------------------------------------------------------------------- scripts/frontend/frontend_server.py | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/frontend_server.py | 276 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ scripts/inspect/README.md | 10 ++++++++++ scripts/inspect/check_data_source.py | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/inspect/check_es_data.py | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/inspect/check_index_mapping.py | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/inspect/compare_index_mappings.py | 188 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/maintenance/embed_tenant_image_urls.py | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/monitor_eviction.py | 89 ----------------------------------------------------------------------------------------- scripts/ops/README.md | 8 ++++++++ scripts/ops/daily_log_router.sh | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/ops/wechat_alert.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/redis/monitor_eviction.py | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/service_ctl.sh | 2 +- scripts/setup_translator_venv.sh | 2 +- scripts/shoplazza_excel_template.py | 133 ------------------------------------------------------------------------------------------------------------------------------------- scripts/shoplazza_import_template.py | 112 ---------------------------------------------------------------------------------------------------------------- scripts/start_cnclip_service.sh | 2 +- scripts/start_frontend.sh | 2 +- scripts/temp_embed_tenant_image_urls.py | 246 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ scripts/tenant3__csv_to_shoplazza_xlsx.sh | 20 -------------------- scripts/translation/download_translation_models.py | 125 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/wechat_alert.py | 104 -------------------------------------------------------------------------------------------------------- translation/README.md | 4 ++-- 45 files changed, 3636 insertions(+), 3606 deletions(-) delete mode 100644 scripts/amazon_xlsx_to_shoplazza_xlsx.py delete mode 100755 scripts/check_data_source.py delete mode 100755 scripts/check_es_data.py delete mode 100644 scripts/check_index_mapping.py delete mode 100644 scripts/compare_index_mappings.py delete mode 100644 scripts/competitor_xlsx_to_shoplazza_xlsx.py delete mode 100755 scripts/csv_to_excel.py delete mode 100755 scripts/csv_to_excel_multi_variant.py delete mode 100755 scripts/daily_log_router.sh create mode 100644 scripts/data_import/README.md create mode 100644 scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py create mode 100644 scripts/data_import/competitor_xlsx_to_shoplazza_xlsx.py create mode 100755 scripts/data_import/csv_to_excel.py create mode 100755 scripts/data_import/csv_to_excel_multi_variant.py create mode 100644 scripts/data_import/shoplazza_excel_template.py create mode 100644 scripts/data_import/shoplazza_import_template.py create mode 100755 scripts/data_import/tenant3_csv_to_shoplazza_xlsx.sh delete mode 100755 scripts/download_translation_models.py create mode 100755 scripts/frontend/frontend_server.py delete mode 100755 scripts/frontend_server.py create mode 100644 scripts/inspect/README.md create mode 100755 scripts/inspect/check_data_source.py create mode 100755 scripts/inspect/check_es_data.py create mode 100644 scripts/inspect/check_index_mapping.py create mode 100644 scripts/inspect/compare_index_mappings.py create mode 100644 scripts/maintenance/embed_tenant_image_urls.py delete mode 100755 scripts/monitor_eviction.py create mode 100644 scripts/ops/README.md create mode 100755 scripts/ops/daily_log_router.sh create mode 100644 
scripts/ops/wechat_alert.py create mode 100755 scripts/redis/monitor_eviction.py delete mode 100644 scripts/shoplazza_excel_template.py delete mode 100644 scripts/shoplazza_import_template.py delete mode 100644 scripts/temp_embed_tenant_image_urls.py delete mode 100755 scripts/tenant3__csv_to_shoplazza_xlsx.sh create mode 100755 scripts/translation/download_translation_models.py delete mode 100644 scripts/wechat_alert.py diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index db2d057..9e52dbd 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -166,7 +166,7 @@ curl -X POST http://localhost:6008/embed/image \ ```bash ./scripts/setup_translator_venv.sh -./.venv-translator/bin/python scripts/download_translation_models.py --all-local # 如需本地模型 +./.venv-translator/bin/python scripts/translation/download_translation_models.py --all-local # 如需本地模型 ./scripts/start_translator.sh curl -X POST http://localhost:6006/translate \ diff --git a/docs/工作总结-微服务性能优化与架构.md b/docs/工作总结-微服务性能优化与架构.md index 5407569..6a2b5df 100644 --- a/docs/工作总结-微服务性能优化与架构.md +++ b/docs/工作总结-微服务性能优化与架构.md @@ -133,8 +133,8 @@ instruction: "Given a shopping query, rank product titles by relevance" - 启动时:backend/indexer/frontend/embedding/translator/reranker 会写 pid 到 `logs/.pid`,并执行 `wait_for_health`(GET `http://127.0.0.1:/health`);reranker 健康重试 90 次,其余 30 次;TEI 校验 Docker 容器存在且 `/health` 成功;cnclip 无 HTTP 健康则仅校验进程/端口。 - **监控常驻**: - `./scripts/service_ctl.sh monitor-start ` 启动后台监控进程,将 targets 写入 `logs/service-monitor.targets`,pid 写入 `logs/service-monitor.pid`,日志追加到 `logs/service-monitor.log`。 - - 轮询间隔 `MONITOR_INTERVAL_SEC` 默认 **10** 秒;连续 **3** 次(`MONITOR_FAIL_THRESHOLD`)健康失败则触发重启;重启冷却 `MONITOR_RESTART_COOLDOWN_SEC` 默认 **30** 秒;每小时最多重启 `MONITOR_MAX_RESTARTS_PER_HOUR` 默认 **6** 次;超限时调用 `scripts/wechat_alert.py` 告警(若存在)。 -- **日志**:各服务按日滚动到 `logs/-.log`,通过 `scripts/daily_log_router.sh` 与 `LOG_RETENTION_DAYS`(默认 30)控制保留。 + - 轮询间隔 `MONITOR_INTERVAL_SEC` 默认 **10** 秒;连续 **3** 
次(`MONITOR_FAIL_THRESHOLD`)健康失败则触发重启;重启冷却 `MONITOR_RESTART_COOLDOWN_SEC` 默认 **30** 秒;每小时最多重启 `MONITOR_MAX_RESTARTS_PER_HOUR` 默认 **6** 次;超限时调用 `scripts/ops/wechat_alert.py` 告警(若存在)。 +- **日志**:各服务按日滚动到 `logs/-.log`,通过 `scripts/ops/daily_log_router.sh` 与 `LOG_RETENTION_DAYS`(默认 30)控制保留。 详见:`scripts/service_ctl.sh` 内注释及 `docs/Usage-Guide.md`。 diff --git a/scripts/README.md b/scripts/README.md index 8c6a3b6..501e544 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,6 +1,6 @@ # Scripts -`scripts/` 现在只保留当前架构下仍然有效的运行、运维、环境和数据处理脚本。 +`scripts/` 现在只保留当前架构下仍然有效的运行、运维、环境和数据处理脚本,并按职责拆到稳定子目录,避免继续在根目录平铺。 ## 当前分类 @@ -20,6 +20,8 @@ - `stop.sh` - `stop_tei_service.sh` - `stop_cnclip_service.sh` + - `frontend/` + - `ops/` - 环境初始化 - `create_venv.sh` @@ -33,11 +35,15 @@ - `create_tenant_index.sh` - `build_suggestions.sh` - `mock_data.sh` + - `data_import/` + - `inspect/` + - `maintenance/` - 评估与专项工具 - `evaluation/` - `redis/` - `debug/` + - `translation/` ## 已迁移 diff --git a/scripts/amazon_xlsx_to_shoplazza_xlsx.py b/scripts/amazon_xlsx_to_shoplazza_xlsx.py deleted file mode 100644 index 5a5c70a..0000000 --- a/scripts/amazon_xlsx_to_shoplazza_xlsx.py +++ /dev/null @@ -1,615 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert Amazon-format Excel exports (with Parent/Child ASIN structure) into -Shoplazza (店匠) product import Excel format based on `docs/商品导入模板.xlsx`. - -Data source: -- Directory with multiple `*.xlsx` files under `products_data/`. -- Each file contains a main sheet + "Notes" sheet. -- Column meanings (sample): - - ASIN: variant id (sku_id) - - 父ASIN: parent product id (spu_id) - -Output: -- For each 父ASIN group: - - If only 1 ASIN: generate one "S" row - - Else: generate one "M" row + multiple "P" rows - -Multi-variant (M/P) key point: -- Variant dimensions are parsed primarily from the `SKU` column, e.g. - "Size: One Size | Color: Black", and mapped into 款式1/2/3. 
-""" - -# NOTE: This file is intentionally the same implementation as -# `competitor_xlsx_to_shoplazza_xlsx.py`, but renamed to reflect the correct -# data source (Amazon-format exports). Keep the logic in sync. - -import os -import re -import sys -import argparse -import random -from datetime import datetime -from collections import defaultdict, Counter -from pathlib import Path - -from openpyxl import load_workbook - -# Allow running as `python scripts/xxx.py` without installing as a package -sys.path.insert(0, str(Path(__file__).resolve().parent)) -from shoplazza_excel_template import create_excel_from_template_fast - - -PREFERRED_OPTION_KEYS = [ - "Size", "Color", "Style", "Pattern", "Material", "Flavor", "Scent", - "Pack", "Pack of", "Number of Items", "Count", "Capacity", "Length", - "Width", "Height", "Model", "Configuration", -] - - -def clean_str(v): - if v is None: - return "" - return str(v).strip() - - -def html_escape(s): - s = clean_str(s) - return (s.replace("&", "&") - .replace("<", "<") - .replace(">", ">")) - - -def generate_handle(title): - """ - Generate URL-friendly handle from title (ASCII only). - Keep consistent with existing scripts. - """ - handle = clean_str(title).lower() - handle = re.sub(r"[^a-z0-9\\s-]", "", handle) - handle = re.sub(r"[-\\s]+", "-", handle).strip("-") - if len(handle) > 255: - handle = handle[:255] - return handle or "product" - - -def parse_date_to_template(dt_value): - """ - Template expects: YYYY-MM-DD HH:MM:SS - Input could be "2018-05-09" or datetime/date. 
- """ - if dt_value is None or dt_value == "": - return "" - if isinstance(dt_value, datetime): - return dt_value.strftime("%Y-%m-%d %H:%M:%S") - s = clean_str(dt_value) - for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S"): - try: - d = datetime.strptime(s, fmt) - return d.strftime("%Y-%m-%d %H:%M:%S") - except Exception: - pass - return "" - - -def parse_weight(weight_conv, weight_raw): - """ - Return (weight_value, unit) where unit in {kg, lb, g, oz}. - Prefer '商品重量(单位换算)' like '68.04 g'. - Fallback to '商品重量' like '0.15 pounds'. - """ - s = clean_str(weight_conv) or clean_str(weight_raw) - if not s: - return ("", "") - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)\\s*([a-zA-Z]+)", s) - if not m: - return ("", "") - val = float(m.group(1)) - unit = m.group(2).lower() - if unit in ("g", "gram", "grams"): - return (val, "g") - if unit in ("kg", "kilogram", "kilograms"): - return (val, "kg") - if unit in ("lb", "lbs", "pound", "pounds"): - return (val, "lb") - if unit in ("oz", "ounce", "ounces"): - return (val, "oz") - return ("", "") - - -def parse_dimensions_inches(dim_raw): - """ - Template '尺寸信息': 'L,W,H' in inches. - Input example: '7.9 x 7.9 x 2 inches' - """ - s = clean_str(dim_raw) - if not s: - return "" - nums = re.findall(r"([0-9]+(?:\\.[0-9]+)?)", s) - if len(nums) < 3: - return "" - return "{},{},{}".format(nums[0], nums[1], nums[2]) - - -def parse_sku_options(sku_text): - """ - Parse 'SKU' column into {key: value}. 
- Example: - 'Size: One Size | Color: Black' -> {'Size':'One Size','Color':'Black'} - """ - s = clean_str(sku_text) - if not s: - return {} - parts = [p.strip() for p in s.split("|") if p.strip()] - out = {} - for p in parts: - if ":" not in p: - continue - k, v = p.split(":", 1) - k = clean_str(k) - v = clean_str(v) - if k and v: - out[k] = v - return out - - -def choose_option_keys(variant_dicts, max_keys=3): - freq = Counter() - for d in variant_dicts: - for k, v in d.items(): - if v: - freq[k] += 1 - if not freq: - return [] - preferred_rank = {k: i for i, k in enumerate(PREFERRED_OPTION_KEYS)} - - def key_sort(k): - return (preferred_rank.get(k, 10 ** 6), -freq[k], k.lower()) - - keys = sorted(freq.keys(), key=key_sort) - return keys[:max_keys] - - -def build_description_html(title, details, product_url): - parts = [] - if title: - parts.append("

{}

".format(html_escape(title))) - detail_items = [x.strip() for x in clean_str(details).split("|") if x.strip()] - if detail_items: - li = "".join(["
  • {}
  • ".format(html_escape(x)) for x in detail_items[:30]]) - parts.append("
      {}
    ".format(li)) - if product_url: - parts.append('

    Source: {0}

    '.format(html_escape(product_url))) - return "".join(parts) - - -def read_amazon_rows_from_file(xlsx_path, max_rows=None): - wb = load_workbook(xlsx_path, read_only=True, data_only=True) - sheet_name = None - for name in wb.sheetnames: - if str(name).lower() == "notes": - continue - sheet_name = name - break - if sheet_name is None: - return [] - ws = wb[sheet_name] - - # Build header index from first row - header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) - idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)} - - required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", - "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", - "商品重量(单位换算)", "商品重量", "商品尺寸"] - for k in required: - if k not in idx: - raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) - - # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. - # openpyxl cell access is relatively expensive; values_only is much faster. 
- pos = {k: idx[k] for k in required} # 0-based positions in row tuple - - rows = [] - end_row = ws.max_row - if max_rows is not None: - end_row = min(end_row, 1 + int(max_rows)) - - for tup in ws.iter_rows(min_row=2, max_row=end_row, values_only=True): - asin = clean_str(tup[pos["ASIN"]]) - if not asin: - continue - parent = clean_str(tup[pos["父ASIN"]]) or asin - rows.append({ - "ASIN": asin, - "父ASIN": parent, - "SKU": clean_str(tup[pos["SKU"]]), - "详细参数": clean_str(tup[pos["详细参数"]]), - "商品标题": clean_str(tup[pos["商品标题"]]), - "商品主图": clean_str(tup[pos["商品主图"]]), - "价格($)": tup[pos["价格($)"]], - "prime价格($)": tup[pos["prime价格($)"]], - "上架时间": clean_str(tup[pos["上架时间"]]), - "类目路径": clean_str(tup[pos["类目路径"]]), - "大类目": clean_str(tup[pos["大类目"]]), - "小类目": clean_str(tup[pos["小类目"]]), - "品牌": clean_str(tup[pos["品牌"]]), - "品牌链接": clean_str(tup[pos["品牌链接"]]), - "商品详情页链接": clean_str(tup[pos["商品详情页链接"]]), - "商品重量(单位换算)": clean_str(tup[pos["商品重量(单位换算)"]]), - "商品重量": clean_str(tup[pos["商品重量"]]), - "商品尺寸": clean_str(tup[pos["商品尺寸"]]), - }) - return rows - - -def to_price(v): - if v is None or v == "": - return None - try: - return float(v) - except Exception: - s = clean_str(v) - m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", s) - return float(m.group(1)) if m else None - - -def build_common_fields(base_row, spu_id): - title = base_row.get("商品标题") or "Product" - brand = base_row.get("品牌") or "" - big_cat = base_row.get("大类目") or "" - small_cat = base_row.get("小类目") or "" - cat_path = base_row.get("类目路径") or "" - - handle = generate_handle(title) - if handle and not handle.startswith("products/"): - handle = "products/{}".format(handle) - - seo_title = title - seo_desc_parts = [x for x in [brand, title, big_cat] if x] - seo_description = " ".join(seo_desc_parts)[:5000] - seo_keywords = ",".join([x for x in [title, brand, big_cat, small_cat] if x])[:5000] - tags = ",".join([x for x in [brand, big_cat, small_cat] if x]) - - created_at = parse_date_to_template(base_row.get("上架时间")) - 
description = build_description_html(title, base_row.get("详细参数"), base_row.get("商品详情页链接")) - - inventory_qty = 100 - weight_val, weight_unit = parse_weight(base_row.get("商品重量(单位换算)"), base_row.get("商品重量")) - size_info = parse_dimensions_inches(base_row.get("商品尺寸")) - - album = big_cat or (cat_path.split(":")[0] if cat_path else "") - - return { - "商品ID": "", - "创建时间": created_at, - "商品标题*": title[:255], - "商品副标题": "{} {}".format(brand, big_cat).strip()[:600], - "商品描述": description, - "SEO标题": seo_title[:5000], - "SEO描述": seo_description, - "SEO URL Handle": handle, - "SEO URL 重定向": "N", - "SEO关键词": seo_keywords, - "商品上架": "Y", - "需要物流": "Y", - "商品收税": "N", - "商品spu": spu_id[:100], - "启用虚拟销量": "N", - "虚拟销量值": "", - "跟踪库存": "Y", - "库存规则*": "1", - "专辑名称": album, - "标签": tags, - "供应商名称": "Amazon", - "供应商URL": base_row.get("商品详情页链接") or base_row.get("品牌链接") or "", - "商品重量": weight_val if weight_val != "" else "", - "重量单位": weight_unit, - "商品库存": inventory_qty, - "尺寸信息": size_info, - "原产地国别": "", - "HS(协调制度)代码": "", - "商品备注": "ASIN:{}; ParentASIN:{}; CategoryPath:{}".format( - base_row.get("ASIN", ""), spu_id, (cat_path[:200] if cat_path else "") - )[:500], - "款式备注": "", - } - - -def build_s_row(base_row): - spu_id = base_row.get("父ASIN") or base_row.get("ASIN") - common = build_common_fields(base_row, spu_id=spu_id) - price = to_price(base_row.get("prime价格($)")) or to_price(base_row.get("价格($)")) or 9.99 - image = base_row.get("商品主图") or "" - row = {} - row.update(common) - row.update({ - "商品属性*": "S", - "款式1": "", - "款式2": "", - "款式3": "", - "商品售价*": price, - "商品原价": price, - "成本价": "", - "商品SKU": base_row.get("ASIN") or "", - "商品条形码": "", - "商品图片*": image, - "商品主图": image, - }) - return row - - -def build_m_p_rows(variant_rows): - base = variant_rows[0] - spu_id = base.get("父ASIN") or base.get("ASIN") - common = build_common_fields(base, spu_id=spu_id) - - option_dicts = [parse_sku_options(v.get("SKU")) for v in variant_rows] - option_keys = 
choose_option_keys(option_dicts, max_keys=3) or ["Variant"] - - m = {} - m.update(common) - m.update({ - "商品属性*": "M", - "款式1": option_keys[0] if len(option_keys) > 0 else "", - "款式2": option_keys[1] if len(option_keys) > 1 else "", - "款式3": option_keys[2] if len(option_keys) > 2 else "", - "商品售价*": "", - "商品原价": "", - "成本价": "", - "商品SKU": "", - "商品条形码": "", - "商品图片*": base.get("商品主图") or "", - "商品主图": base.get("商品主图") or "", - }) - m["商品重量"] = "" - m["重量单位"] = "" - m["商品库存"] = "" - m["尺寸信息"] = "" - - rows = [m] - - for v in variant_rows: - v_common = build_common_fields(v, spu_id=spu_id) - v_common.update({ - "商品副标题": "", - "商品描述": "", - "SEO标题": "", - "SEO描述": "", - "SEO URL Handle": "", - "SEO URL 重定向": "", - "SEO关键词": "", - "专辑名称": "", - "标签": "", - "供应商名称": "", - "供应商URL": "", - "商品备注": "", - }) - - opt = parse_sku_options(v.get("SKU")) - opt_vals = [v.get("ASIN")] if option_keys == ["Variant"] else [opt.get(k, "") for k in option_keys] - - price = to_price(v.get("prime价格($)")) or to_price(v.get("价格($)")) or 9.99 - image = v.get("商品主图") or "" - - p = {} - p.update(v_common) - p.update({ - "商品属性*": "P", - "款式1": opt_vals[0] if len(opt_vals) > 0 else "", - "款式2": opt_vals[1] if len(opt_vals) > 1 else "", - "款式3": opt_vals[2] if len(opt_vals) > 2 else "", - "商品售价*": price, - "商品原价": price, - "成本价": "", - "商品SKU": v.get("ASIN") or "", - "商品条形码": "", - "商品图片*": image, - "商品主图": "", - }) - rows.append(p) - - return rows - - -def main(): - parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx") - parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files") - parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx") - parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)") - parser.add_argument("--max-files", type=int, default=None, 
help="Limit number of xlsx files to read (for testing)") - parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)") - parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)") - # 默认行为:丢弃不符合要求的数据 - parser.add_argument("--keep-spu-if-parent-missing", action="store_false", dest="skip_spu_if_parent_missing", default=False, help="Keep SPU even if parent ASIN not found in variants (default: skip entire SPU)") - parser.add_argument("--fix-sku-if-title-mismatch", action="store_false", dest="skip_sku_if_title_mismatch", default=False, help="Fix SKU title to match parent instead of skipping (default: skip SKU with mismatched title)") - args = parser.parse_args() - - if not os.path.isdir(args.input_dir): - raise RuntimeError("input-dir not found: {}".format(args.input_dir)) - if not os.path.exists(args.template): - raise RuntimeError("template not found: {}".format(args.template)) - - files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if f.lower().endswith(".xlsx")] - files.sort() - if args.max_files is not None: - files = files[: int(args.max_files)] - - print("Reading Amazon-format files: {} (from {})".format(len(files), args.input_dir), flush=True) - - groups = defaultdict(list) - seen_asin = set() - - for fp in files: - print(" - loading: {}".format(fp), flush=True) - try: - rows = read_amazon_rows_from_file(fp) - except Exception as e: - print("WARN: failed to read {}: {}".format(fp, e)) - continue - print(" loaded rows: {}".format(len(rows)), flush=True) - - for r in rows: - asin = r.get("ASIN") - if asin in seen_asin: - continue - seen_asin.add(asin) - spu_id = r.get("父ASIN") or asin - groups[spu_id].append(r) - - print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True) - - # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组” - group_rows_list = [] # List[List[dict]] - spu_count 
= 0 - next_product_id = 1 # 用于填充商品ID,全局自增 - # 将SPU顺序打乱,避免过于依赖输入文件的顺序 - spu_items = list(groups.items()) - random.shuffle(spu_items) - - for spu_id, variants in spu_items: - if not variants: - continue - - # 确保父ASIN对应的变体在列表最前面 - parent_variant = None - other_variants = [] - for v in variants: - if v.get("ASIN") == spu_id: - parent_variant = v - else: - other_variants.append(v) - - # 重新排序:父ASIN在前,其他在后 - if parent_variant: - variants = [parent_variant] + other_variants - else: - # 如果找不到父ASIN对应的变体 - print( - f"WARN: Parent ASIN not found in variants: SPU={spu_id}, " - f"variant_count={len(variants)}, first_ASIN={variants[0].get('ASIN') if variants else 'N/A'}", - flush=True, - ) - # 根据开关决定是否丢弃整个SPU - if args.skip_spu_if_parent_missing: - print( - f"SKIP entire SPU due to missing parent ASIN: SPU={spu_id}", - flush=True, - ) - continue - - # 处理变体标题:如果与主商品不一致,根据开关决定修正或丢弃 - main_title = variants[0].get("商品标题") or "" - filtered_variants = [] - for v in variants: - title = v.get("商品标题") or "" - if main_title and title and title != main_title: - if args.skip_sku_if_title_mismatch: - # 丢弃标题不一致的SKU - print( - f"SKIP SKU due to title mismatch: SPU={spu_id}, ASIN={v.get('ASIN')}, " - f"main_title='{main_title}', variant_title='{title}'", - flush=True, - ) - continue - else: - # 修正标题 - print( - f"FIX variant title mismatch: SPU={spu_id}, ASIN={v.get('ASIN')}, " - f"main_title='{main_title}', variant_title='{title}' -> using main_title", - flush=True, - ) - v["商品标题"] = main_title # 统一为主商品标题 - filtered_variants.append(v) - - # 如果所有变体都被过滤掉,跳过整个SPU - if not filtered_variants: - print( - f"SKIP entire SPU: all variants filtered out, SPU={spu_id}", - flush=True, - ) - continue - - variants = filtered_variants - - spu_count += 1 - if args.max_products is not None and spu_count > int(args.max_products): - break - - if len(variants) == 1: - rows = [build_s_row(variants[0])] - else: - rows = build_m_p_rows(variants) - - # 填充商品ID(从1开始全局递增) - for r in rows: - r["商品ID"] = next_product_id - 
next_product_id += 1 - - group_rows_list.append(rows) - - # 按最大行数拆成多个文件(注意:同一 SPU 不拆分) - data_start_row = 4 # 与模板/写入工具保持一致 - header_rows = data_start_row - 1 # 包含标题行+说明行 - max_total_rows = args.max_rows_per_output or 0 - if max_total_rows and max_total_rows > header_rows: - max_data_rows = max_total_rows - header_rows - else: - max_data_rows = None # 不限制 - - chunks = [] - current_chunk = [] - current_count = 0 - - if max_data_rows is None: - # 不做分片,直接一个 chunk - for gr in group_rows_list: - current_chunk.extend(gr) - if current_chunk: - chunks.append(current_chunk) - else: - for gr in group_rows_list: - gsize = len(gr) - # 如果单个 SPU 本身就超过阈值,只能独占一个文件 - if gsize > max_data_rows: - if current_chunk: - chunks.append(current_chunk) - current_chunk = [] - current_count = 0 - chunks.append(gr) - continue - # 如果放不下当前 chunk,则先封一个,再开新 chunk - if current_count + gsize > max_data_rows: - if current_chunk: - chunks.append(current_chunk) - current_chunk = list(gr) - current_count = gsize - else: - current_chunk.extend(gr) - current_count += gsize - if current_chunk: - chunks.append(current_chunk) - - total_rows = sum(len(c) for c in chunks) - print( - "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format( - total_rows, len(group_rows_list), len(chunks) - ), - flush=True, - ) - - # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀 - base = Path(args.output) - stem = base.stem - suffix = base.suffix or ".xlsx" - - for idx, chunk in enumerate(chunks, start=1): - out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}")) - print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True) - create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row) - - -if __name__ == "__main__": - main() - - diff --git a/scripts/check_data_source.py b/scripts/check_data_source.py deleted file mode 100755 index 58fe105..0000000 --- a/scripts/check_data_source.py +++ /dev/null @@ -1,301 +0,0 @@ 
-#!/usr/bin/env python3 -""" -诊断脚本:检查MySQL数据源中分类和规格信息是否正确 - -检查: -1. category_path 字段是否有值 -2. category_path 格式是否正确(应该能被解析为 category1_name) -3. shoplazza_product_option 表的 name 字段是否有值(应该是 "color", "size", "material") -4. shoplazza_product_sku 表的 option1/2/3 字段是否有值 -""" - -import sys -import argparse -from pathlib import Path -from sqlalchemy import create_engine, text - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from utils.db_connector import create_db_connection - - -def check_category_path(db_engine, tenant_id: str): - """检查 category_path 和 category 字段""" - print("\n" + "="*60) - print("1. 检查 category_path 和 category 字段") - print("="*60) - - query = text(""" - SELECT - COUNT(*) as total, - COUNT(category_path) as has_category_path, - COUNT(*) - COUNT(category_path) as null_category_path, - COUNT(category) as has_category, - COUNT(*) - COUNT(category) as null_category - FROM shoplazza_product_spu - WHERE tenant_id = :tenant_id AND deleted = 0 - """) - - with db_engine.connect() as conn: - result = conn.execute(query, {"tenant_id": tenant_id}).fetchone() - total = result[0] - has_category_path = result[1] - null_category_path = result[2] - has_category = result[3] - null_category = result[4] - - print(f"总SPU数: {total}") - print(f"有 category_path 的SPU: {has_category_path}") - print(f"category_path 为空的SPU: {null_category_path}") - print(f"有 category 的SPU: {has_category}") - print(f"category 为空的SPU: {null_category}") - - # 查看category字段的示例 - if has_category > 0: - sample_query = text(""" - SELECT id, title, category_path, category, category_id, category_level - FROM shoplazza_product_spu - WHERE tenant_id = :tenant_id - AND deleted = 0 - AND category IS NOT NULL - LIMIT 5 - """) - samples = conn.execute(sample_query, {"tenant_id": tenant_id}).fetchall() - print(f"\n示例数据(前5条有 category 的记录):") - for row in samples: - print(f" SPU ID: {row[0]}, Title: {row[1][:50] if row[1] else ''}") - print(f" category_path: {row[2]}") - 
print(f" category: '{row[3]}'") - print(f" category_id: {row[4]}, category_level: {row[5]}") - - # 解析 category 字段(用于生成 category1_name) - if row[3]: - category = str(row[3]) - if '/' in category: - path_parts = category.split('/') - print(f" 解析后(按'/'分割): {path_parts}") - if len(path_parts) > 0: - print(f" → category1_name: '{path_parts[0].strip()}'") - else: - print(f" → category1_name: '{category.strip()}'(直接作为category1_name)") - else: - print("\n⚠️ 警告: 没有SPU有 category 值!") - - # 查看category_path的示例(如果有) - if has_category_path > 0: - sample_query = text(""" - SELECT id, title, category_path, category - FROM shoplazza_product_spu - WHERE tenant_id = :tenant_id - AND deleted = 0 - AND category_path IS NOT NULL - LIMIT 3 - """) - samples = conn.execute(sample_query, {"tenant_id": tenant_id}).fetchall() - print(f"\n示例数据(有 category_path 的记录):") - for row in samples: - print(f" SPU ID: {row[0]}, Title: {row[1][:50] if row[1] else ''}") - print(f" category_path: '{row[2]}'") - print(f" category: '{row[3]}'") - - # 检查是否是ID列表格式 - if row[2] and ',' in str(row[2]) and not '/' in str(row[2]): - print(f" ⚠️ 注意: category_path是ID列表格式(逗号分隔),不是路径格式") - - -def check_options(db_engine, tenant_id: str): - """检查 option 表的 name 字段""" - print("\n" + "="*60) - print("2. 
检查 shoplazza_product_option 表的 name 字段") - print("="*60) - - query = text(""" - SELECT - COUNT(*) as total_options, - COUNT(DISTINCT name) as distinct_names, - COUNT(DISTINCT spu_id) as spus_with_options - FROM shoplazza_product_option - WHERE tenant_id = :tenant_id AND deleted = 0 - """) - - with db_engine.connect() as conn: - result = conn.execute(query, {"tenant_id": tenant_id}).fetchone() - total_options = result[0] - distinct_names = result[1] - spus_with_options = result[2] - - print(f"总 option 记录数: {total_options}") - print(f"不同的 name 数量: {distinct_names}") - print(f"有 option 定义的 SPU 数量: {spus_with_options}") - - if total_options > 0: - # 查看不同的 name 值 - name_query = text(""" - SELECT DISTINCT name, position, COUNT(*) as count - FROM shoplazza_product_option - WHERE tenant_id = :tenant_id AND deleted = 0 - GROUP BY name, position - ORDER BY position, name - """) - names = conn.execute(name_query, {"tenant_id": tenant_id}).fetchall() - print(f"\n不同的 name 值:") - for row in names: - print(f" position={row[1]}, name='{row[0]}', count={row[2]}") - - # 查看一些示例 - sample_query = text(""" - SELECT spu_id, position, name, `values` - FROM shoplazza_product_option - WHERE tenant_id = :tenant_id AND deleted = 0 - ORDER BY spu_id, position - LIMIT 10 - """) - samples = conn.execute(sample_query, {"tenant_id": tenant_id}).fetchall() - print(f"\n示例数据(前10条 option 记录):") - for row in samples: - print(f" SPU ID: {row[0]}, position: {row[1]}, name: '{row[2]}', values: {row[3]}") - else: - print("\n⚠️ 警告: 没有 option 记录!") - - -def check_sku_options(db_engine, tenant_id: str): - """检查 SKU 表的 option1/2/3 字段""" - print("\n" + "="*60) - print("3. 
检查 shoplazza_product_sku 表的 option1/2/3 字段") - print("="*60) - - query = text(""" - SELECT - COUNT(*) as total_skus, - COUNT(option1) as has_option1, - COUNT(option2) as has_option2, - COUNT(option3) as has_option3, - COUNT(DISTINCT spu_id) as distinct_spus - FROM shoplazza_product_sku - WHERE tenant_id = :tenant_id AND deleted = 0 - """) - - with db_engine.connect() as conn: - result = conn.execute(query, {"tenant_id": tenant_id}).fetchone() - total_skus = result[0] - has_option1 = result[1] - has_option2 = result[2] - has_option3 = result[3] - distinct_spus = result[4] - - print(f"总 SKU 数: {total_skus}") - print(f"有 option1 的 SKU: {has_option1}") - print(f"有 option2 的 SKU: {has_option2}") - print(f"有 option3 的 SKU: {has_option3}") - print(f"不同的 SPU 数量: {distinct_spus}") - - if total_skus > 0: - # 查看一些示例 - sample_query = text(""" - SELECT spu_id, id, option1, option2, option3 - FROM shoplazza_product_sku - WHERE tenant_id = :tenant_id AND deleted = 0 - ORDER BY spu_id, id - LIMIT 10 - """) - samples = conn.execute(sample_query, {"tenant_id": tenant_id}).fetchall() - print(f"\n示例数据(前10条 SKU 记录):") - for row in samples: - print(f" SPU ID: {row[0]}, SKU ID: {row[1]}") - print(f" option1: '{row[2]}', option2: '{row[3]}', option3: '{row[4]}'") - else: - print("\n⚠️ 警告: 没有 SKU 记录!") - - -def check_spu_summary(db_engine, tenant_id: str): - """检查 SPU 汇总信息""" - print("\n" + "="*60) - print("4. 
SPU 汇总信息") - print("="*60) - - query = text(""" - SELECT - COUNT(DISTINCT spu.id) as total_spus, - COUNT(DISTINCT sku.id) as total_skus, - COUNT(DISTINCT opt.id) as total_options, - COUNT(DISTINCT CASE WHEN spu.category_path IS NOT NULL THEN spu.id END) as spus_with_category_path, - COUNT(DISTINCT opt.spu_id) as spus_with_options - FROM shoplazza_product_spu spu - LEFT JOIN shoplazza_product_sku sku ON spu.id = sku.spu_id AND sku.tenant_id = :tenant_id AND sku.deleted = 0 - LEFT JOIN shoplazza_product_option opt ON spu.id = opt.spu_id AND opt.tenant_id = :tenant_id AND opt.deleted = 0 - WHERE spu.tenant_id = :tenant_id AND spu.deleted = 0 - """) - - with db_engine.connect() as conn: - result = conn.execute(query, {"tenant_id": tenant_id}).fetchone() - total_spus = result[0] - total_skus = result[1] - total_options = result[2] - spus_with_category_path = result[3] - spus_with_options = result[4] - - print(f"总 SPU 数: {total_spus}") - print(f"总 SKU 数: {total_skus}") - print(f"总 option 记录数: {total_options}") - print(f"有 category_path 的 SPU: {spus_with_category_path}") - print(f"有 option 定义的 SPU: {spus_with_options}") - - -def main(): - parser = argparse.ArgumentParser(description='检查MySQL数据源中的分类和规格信息') - parser.add_argument('--tenant-id', required=True, help='Tenant ID') - parser.add_argument('--db-host', help='MySQL host (或使用环境变量 DB_HOST)') - parser.add_argument('--db-port', type=int, help='MySQL port (或使用环境变量 DB_PORT, 默认: 3306)') - parser.add_argument('--db-database', help='MySQL database (或使用环境变量 DB_DATABASE)') - parser.add_argument('--db-username', help='MySQL username (或使用环境变量 DB_USERNAME)') - parser.add_argument('--db-password', help='MySQL password (或使用环境变量 DB_PASSWORD)') - - args = parser.parse_args() - - # 连接数据库 - import os - db_host = args.db_host or os.environ.get('DB_HOST') - db_port = args.db_port or int(os.environ.get('DB_PORT', 3306)) - db_database = args.db_database or os.environ.get('DB_DATABASE') - db_username = args.db_username or 
os.environ.get('DB_USERNAME') - db_password = args.db_password or os.environ.get('DB_PASSWORD') - - if not all([db_host, db_database, db_username, db_password]): - print("错误: MySQL连接参数不完整") - print("请提供 --db-host, --db-database, --db-username, --db-password") - print("或设置环境变量: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD") - return 1 - - print(f"连接MySQL: {db_host}:{db_port}/{db_database}") - print(f"Tenant ID: {args.tenant_id}") - - try: - db_engine = create_db_connection( - host=db_host, - port=db_port, - database=db_database, - username=db_username, - password=db_password - ) - print("✓ MySQL连接成功\n") - except Exception as e: - print(f"✗ 连接MySQL失败: {e}") - return 1 - - # 执行检查 - check_spu_summary(db_engine, args.tenant_id) - check_category_path(db_engine, args.tenant_id) - check_options(db_engine, args.tenant_id) - check_sku_options(db_engine, args.tenant_id) - - print("\n" + "="*60) - print("检查完成") - print("="*60) - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/scripts/check_es_data.py b/scripts/check_es_data.py deleted file mode 100755 index 33da512..0000000 --- a/scripts/check_es_data.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 -""" -Check actual data in ES index to see if facet fields have values -""" - -import sys -import os -import argparse -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from utils.es_client import ESClient - - -def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): - """Check facet-related fields in ES""" - print("\n" + "="*60) - print("Checking facet field data in ES index") - print("="*60) - - query = { - "query": { - "term": { - "tenant_id": tenant_id - } - }, - "size": size, - "_source": [ - "spu_id", - "title", - "category1_name", - "category2_name", - "category3_name", - "category_name", - "category_path", - "specifications", - "option1_name", - "option2_name", - "option3_name" - ] - } - - try: - response = 
es_client.client.search(index="search_products", body=query) - hits = response.get('hits', {}).get('hits', []) - total = response.get('hits', {}).get('total', {}).get('value', 0) - - print(f"\nTotal documents: {total}") - print(f"Checking first {len(hits)} documents:\n") - - for i, hit in enumerate(hits, 1): - source = hit.get('_source', {}) - title_obj = source.get("title") or {} - category_path_obj = source.get("category_path") or {} - print(f"Document {i}:") - print(f" spu_id: {source.get('spu_id')}") - print(f" title.zh: {str(title_obj.get('zh', ''))[:50] if isinstance(title_obj, dict) else ''}") - print(f" category1_name: {source.get('category1_name')}") - print(f" category2_name: {source.get('category2_name')}") - print(f" category3_name: {source.get('category3_name')}") - print(f" category_name: {source.get('category_name')}") - print(f" category_path.zh: {category_path_obj.get('zh') if isinstance(category_path_obj, dict) else None}") - print(f" option1_name: {source.get('option1_name')}") - print(f" option2_name: {source.get('option2_name')}") - print(f" option3_name: {source.get('option3_name')}") - - specs = source.get('specifications', []) - if specs: - print(f" specifications count: {len(specs)}") - # 显示前3个specifications - for spec in specs[:3]: - print(f" - name: {spec.get('name')}, value: {spec.get('value')}") - else: - print(f" specifications: empty") - print() - - except Exception as e: - print(f"Error: {e}") - import traceback - traceback.print_exc() - - -def check_facet_aggregations(es_client, tenant_id: str): - """Check facet aggregation queries""" - print("\n" + "="*60) - print("Checking facet aggregation query results") - print("="*60) - - query = { - "query": { - "term": { - "tenant_id": tenant_id - } - }, - "size": 0, - "aggs": { - "category1_facet": { - "terms": { - "field": "category1_name", - "size": 10 - } - }, - "color_facet": { - "nested": { - "path": "specifications" - }, - "aggs": { - "filter_by_name": { - "filter": { - "term": { - 
"specifications.name": "color" - } - }, - "aggs": { - "value_counts": { - "terms": { - "field": "specifications.value", - "size": 10 - } - } - } - } - } - }, - "size_facet": { - "nested": { - "path": "specifications" - }, - "aggs": { - "filter_by_name": { - "filter": { - "term": { - "specifications.name": "size" - } - }, - "aggs": { - "value_counts": { - "terms": { - "field": "specifications.value", - "size": 10 - } - } - } - } - } - }, - "material_facet": { - "nested": { - "path": "specifications" - }, - "aggs": { - "filter_by_name": { - "filter": { - "term": { - "specifications.name": "material" - } - }, - "aggs": { - "value_counts": { - "terms": { - "field": "specifications.value", - "size": 10 - } - } - } - } - } - } - } - } - - try: - response = es_client.client.search(index="search_products", body=query) - aggs = response.get('aggregations', {}) - - print("\n1. category1_name facet:") - category1 = aggs.get('category1_facet', {}) - buckets = category1.get('buckets', []) - if buckets: - for bucket in buckets: - print(f" {bucket['key']}: {bucket['doc_count']}") - else: - print(" empty (no data)") - - print("\n2. specifications.color facet:") - color_agg = aggs.get('color_facet', {}) - color_filter = color_agg.get('filter_by_name', {}) - color_values = color_filter.get('value_counts', {}) - color_buckets = color_values.get('buckets', []) - if color_buckets: - for bucket in color_buckets: - print(f" {bucket['key']}: {bucket['doc_count']}") - else: - print(" empty (no data)") - - print("\n3. specifications.size facet:") - size_agg = aggs.get('size_facet', {}) - size_filter = size_agg.get('filter_by_name', {}) - size_values = size_filter.get('value_counts', {}) - size_buckets = size_values.get('buckets', []) - if size_buckets: - for bucket in size_buckets: - print(f" {bucket['key']}: {bucket['doc_count']}") - else: - print(" empty (no data)") - - print("\n4. 
specifications.material facet:") - material_agg = aggs.get('material_facet', {}) - material_filter = material_agg.get('filter_by_name', {}) - material_values = material_filter.get('value_counts', {}) - material_buckets = material_values.get('buckets', []) - if material_buckets: - for bucket in material_buckets: - print(f" {bucket['key']}: {bucket['doc_count']}") - else: - print(" empty (no data)") - - except Exception as e: - print(f"Error: {e}") - import traceback - traceback.print_exc() - - -def main(): - parser = argparse.ArgumentParser(description='Check facet field data in ES index') - parser.add_argument('--tenant-id', required=True, help='Tenant ID') - parser.add_argument('--es-host', help='Elasticsearch host (or use env var ES_HOST, default: http://localhost:9200)') - parser.add_argument('--size', type=int, default=5, help='Number of documents to check (default: 5)') - - args = parser.parse_args() - - # 连接ES - es_host = args.es_host or os.environ.get('ES_HOST', 'http://localhost:9200') - es_username = os.environ.get('ES_USERNAME') - es_password = os.environ.get('ES_PASSWORD') - - print(f"Connecting to Elasticsearch: {es_host}") - print(f"Tenant ID: {args.tenant_id}\n") - - try: - if es_username and es_password: - es_client = ESClient(hosts=[es_host], username=es_username, password=es_password) - else: - es_client = ESClient(hosts=[es_host]) - - if not es_client.ping(): - print(f"✗ Cannot connect to Elasticsearch: {es_host}") - return 1 - print("✓ Elasticsearch connected successfully\n") - except Exception as e: - print(f"✗ Failed to connect to Elasticsearch: {e}") - return 1 - - # 检查ES数据 - check_es_facet_fields(es_client, args.tenant_id, args.size) - check_facet_aggregations(es_client, args.tenant_id) - - print("\n" + "="*60) - print("Check completed") - print("="*60) - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/scripts/check_index_mapping.py b/scripts/check_index_mapping.py deleted file mode 100644 index e7569fd..0000000 
--- a/scripts/check_index_mapping.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python3 -""" -检查ES索引的实际映射配置,特别是中文字段的analyzer设置 -""" - -import os -import sys -import json -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from utils.es_client import get_es_client_from_env -from indexer.mapping_generator import get_tenant_index_name - - -def check_field_mapping(mapping_dict, field_path): - """递归查找字段映射""" - parts = field_path.split('.') - current = mapping_dict - - for part in parts: - if not isinstance(current, dict): - return None - - # ES mapping nesting: object fields store subfields under "properties" - if "properties" in current and isinstance(current["properties"], dict): - current = current["properties"] - - # multi-fields store subfields under "fields" (e.g. vendor.zh.keyword) - if part != parts[0] and "fields" in current and isinstance(current["fields"], dict) and part in current["fields"]: - current = current["fields"] - - current = current.get(part) - if current is None: - return None - return current - - -def main(): - import argparse - - parser = argparse.ArgumentParser(description="检查 Elasticsearch 索引实际映射配置") - parser.add_argument("--tenant-id", type=str, required=True, help="租户ID") - args = parser.parse_args() - - print("=" * 80) - print("检查 Elasticsearch 索引实际映射配置") - print("=" * 80) - - # 连接ES - try: - es_client = get_es_client_from_env() - if not es_client.ping(): - print("✗ 无法连接到 Elasticsearch") - return 1 - print("✓ Elasticsearch 连接成功\n") - except Exception as e: - print(f"✗ 连接 Elasticsearch 失败: {e}") - return 1 - - index_name = get_tenant_index_name(args.tenant_id) - - # 检查索引是否存在 - if not es_client.index_exists(index_name): - print(f"✗ 索引 '{index_name}' 不存在") - return 1 - - # 获取实际映射 - print(f"获取索引 '{index_name}' 的映射配置...\n") - mapping = es_client.get_mapping(index_name) - - if not mapping: - print("✗ 无法获取索引映射") - return 1 - - # 提取实际映射结构 - # ES返回格式: {index_name: {mappings: {properties: {...}}}} - 
index_mapping = mapping.get(index_name, {}).get('mappings', {}).get('properties', {}) - - if not index_mapping: - print("✗ 无法解析映射结构") - return 1 - - # 检查关键字段 - fields_to_check = [ - "title.zh", - "brief.zh", - "description.zh", - "vendor.zh", - "vendor.zh.keyword", - "category_path.zh", - "category_name_text.zh" - ] - - print("=" * 80) - print("中文字段实际映射配置") - print("=" * 80) - - for field_name in fields_to_check: - field_mapping = check_field_mapping(index_mapping, field_name) - - if field_mapping is None: - print(f"\n❌ {field_name}: 字段不存在") - continue - - print(f"\n📋 {field_name}:") - print(f" 类型: {field_mapping.get('type', 'N/A')}") - - analyzer = field_mapping.get('analyzer') - search_analyzer = field_mapping.get('search_analyzer') - - if analyzer: - print(f" 索引分析器 (analyzer): {analyzer}") - else: - print(f" 索引分析器 (analyzer): 未设置(使用默认)") - - if search_analyzer: - print(f" 查询分析器 (search_analyzer): {search_analyzer}") - else: - print(f" 查询分析器 (search_analyzer): 未设置(使用analyzer或默认)") - - # 检查是否有子字段 - if 'fields' in field_mapping: - print(f" 子字段:") - for sub_field, sub_mapping in field_mapping['fields'].items(): - print(f" - {sub_field}: {sub_mapping.get('type', 'N/A')}") - if 'normalizer' in sub_mapping: - print(f" normalizer: {sub_mapping['normalizer']}") - - # 获取settings中的analyzer定义 - print("\n" + "=" * 80) - print("索引 Settings 中的 Analyzer 定义") - print("=" * 80) - - try: - settings = es_client.client.indices.get_settings(index=index_name) - index_settings = settings.get(index_name, {}).get('settings', {}).get('index', {}) - analysis = index_settings.get('analysis', {}) - analyzers = analysis.get('analyzer', {}) - - if analyzers: - print("\n定义的 Analyzer:") - for analyzer_name, analyzer_config in analyzers.items(): - print(f"\n {analyzer_name}:") - if isinstance(analyzer_config, dict): - print(f" 类型: {analyzer_config.get('type', 'N/A')}") - if 'tokenizer' in analyzer_config: - print(f" tokenizer: {analyzer_config['tokenizer']}") - if 'filter' in analyzer_config: - 
print(f" filter: {analyzer_config['filter']}") - else: - print(f" 配置: {analyzer_config}") - else: - print("\n⚠ 未找到自定义 analyzer 定义") - - except Exception as e: - print(f"\n⚠ 无法获取 settings: {e}") - - print("\n" + "=" * 80) - print("检查完成") - print("=" * 80) - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/scripts/compare_index_mappings.py b/scripts/compare_index_mappings.py deleted file mode 100644 index 7554e56..0000000 --- a/scripts/compare_index_mappings.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 -""" -对比不同租户索引的 mapping 结构 -""" - -import os -import sys -import json -from pathlib import Path -from typing import Dict, Any - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from utils.es_client import get_es_client_from_env - - -def get_field_type(mapping_dict: Dict, field_path: str) -> Dict[str, Any]: - """递归获取字段的 mapping 信息""" - parts = field_path.split('.') - current = mapping_dict - - for part in parts: - if isinstance(current, dict): - current = current.get(part) - if current is None: - return None - else: - return None - return current - - -def compare_mappings(mapping1: Dict[str, Any], mapping2: Dict[str, Any], index1_name: str, index2_name: str): - """对比两个索引的 mapping""" - props1 = mapping1.get('mappings', {}).get('properties', {}) - props2 = mapping2.get('mappings', {}).get('properties', {}) - - all_fields = set(props1.keys()) | set(props2.keys()) - - print(f"\n{'='*80}") - print(f"对比索引映射结构") - print(f"{'='*80}") - print(f"索引1: {index1_name}") - print(f"索引2: {index2_name}") - print(f"{'='*80}\n") - - differences = [] - same_fields = [] - - for field in sorted(all_fields): - field1 = props1.get(field) - field2 = props2.get(field) - - if field1 is None: - differences.append((field, f"只在 {index2_name} 中存在", field2)) - continue - if field2 is None: - differences.append((field, f"只在 {index1_name} 中存在", field1)) - continue - - type1 = field1.get('type') - type2 = field2.get('type') - - if type1 != type2: - 
differences.append((field, f"类型不同: {index1_name}={type1}, {index2_name}={type2}", (field1, field2))) - else: - same_fields.append((field, type1)) - - # 打印相同的字段 - print(f"✓ 相同字段 ({len(same_fields)} 个):") - for field, field_type in same_fields[:20]: # 只显示前20个 - print(f" - {field}: {field_type}") - if len(same_fields) > 20: - print(f" ... 还有 {len(same_fields) - 20} 个相同字段") - - # 打印不同的字段 - if differences: - print(f"\n✗ 不同字段 ({len(differences)} 个):") - for field, reason, details in differences: - print(f"\n {field}:") - print(f" {reason}") - if isinstance(details, tuple): - print(f" {index1_name}: {json.dumps(details[0], indent=4, ensure_ascii=False)}") - print(f" {index2_name}: {json.dumps(details[1], indent=4, ensure_ascii=False)}") - else: - print(f" 详情: {json.dumps(details, indent=4, ensure_ascii=False)}") - else: - print(f"\n✓ 所有字段类型一致!") - - # 特别检查 tags 字段 - print(f"\n{'='*80}") - print(f"特别检查: tags 字段") - print(f"{'='*80}") - - tags1 = get_field_type(props1, 'tags') - tags2 = get_field_type(props2, 'tags') - - if tags1: - print(f"\n{index1_name}.tags:") - print(f" 类型: {tags1.get('type')}") - print(f" 完整定义: {json.dumps(tags1, indent=2, ensure_ascii=False)}") - else: - print(f"\n{index1_name}.tags: 不存在") - - if tags2: - print(f"\n{index2_name}.tags:") - print(f" 类型: {tags2.get('type')}") - print(f" 完整定义: {json.dumps(tags2, indent=2, ensure_ascii=False)}") - else: - print(f"\n{index2_name}.tags: 不存在") - - -def main(): - import argparse - - parser = argparse.ArgumentParser(description='对比 Elasticsearch 索引的 mapping 结构') - parser.add_argument('index1', help='第一个索引名称 (例如: search_products_tenant_171)') - parser.add_argument('index2', nargs='?', help='第二个索引名称 (例如: search_products_tenant_162)') - parser.add_argument('--list', action='store_true', help='列出所有以 index1 为前缀的索引') - - args = parser.parse_args() - - # 连接 ES - try: - es_client = get_es_client_from_env() - if not es_client.ping(): - print("✗ 无法连接到 Elasticsearch") - return 1 - print("✓ Elasticsearch 连接成功\n") - except 
Exception as e: - print(f"✗ 连接 Elasticsearch 失败: {e}") - return 1 - - # 如果指定了 --list,列出所有匹配的索引 - if args.list or not args.index2: - try: - # 使用 cat API 列出所有索引 - indices = es_client.client.cat.indices(format='json') - matching_indices = [idx['index'] for idx in indices if idx['index'].startswith(args.index1)] - - if matching_indices: - print(f"找到 {len(matching_indices)} 个匹配的索引:") - for idx in sorted(matching_indices): - print(f" - {idx}") - return 0 - else: - print(f"未找到以 '{args.index1}' 开头的索引") - return 1 - except Exception as e: - print(f"✗ 列出索引失败: {e}") - return 1 - - # 获取两个索引的 mapping - index1 = args.index1 - index2 = args.index2 - - print(f"正在获取索引映射...") - print(f" 索引1: {index1}") - print(f" 索引2: {index2}\n") - - # 检查索引是否存在 - if not es_client.index_exists(index1): - print(f"✗ 索引 '{index1}' 不存在") - return 1 - - if not es_client.index_exists(index2): - print(f"✗ 索引 '{index2}' 不存在") - return 1 - - # 获取 mapping - mapping1 = es_client.get_mapping(index1) - mapping2 = es_client.get_mapping(index2) - - if not mapping1 or index1 not in mapping1: - print(f"✗ 无法获取索引 '{index1}' 的映射") - return 1 - - if not mapping2 or index2 not in mapping2: - print(f"✗ 无法获取索引 '{index2}' 的映射") - return 1 - - # 对比 mapping - compare_mappings(mapping1[index1], mapping2[index2], index1, index2) - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/scripts/competitor_xlsx_to_shoplazza_xlsx.py b/scripts/competitor_xlsx_to_shoplazza_xlsx.py deleted file mode 100644 index 5812357..0000000 --- a/scripts/competitor_xlsx_to_shoplazza_xlsx.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python3 -""" -DEPRECATED SCRIPT NAME (kept for backward compatibility). - -The input `data/mai_jia_jing_ling/products_data/*.xlsx` files are Amazon-format exports -(Parent/Child ASIN), not “competitor data”. 
- -Please use: - - `scripts/amazon_xlsx_to_shoplazza_xlsx.py` - -This wrapper simply forwards all CLI args to the correctly named script, so you -automatically get the latest performance improvements (fast read/write). -""" - -import sys -from pathlib import Path - -# Allow running as `python scripts/xxx.py` without installing as a package -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -from amazon_xlsx_to_shoplazza_xlsx import main as amazon_main - - -if __name__ == "__main__": - amazon_main() - - diff --git a/scripts/csv_to_excel.py b/scripts/csv_to_excel.py deleted file mode 100755 index 5149eba..0000000 --- a/scripts/csv_to_excel.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert CSV data to Excel import template. - -Reads CSV file (goods_with_pic.5years_congku.csv.shuf.1w) and generates Excel file -based on the template format (商品导入模板.xlsx). - -Each CSV row corresponds to 1 SPU and 1 SKU, which will be exported as a single -S (Single variant) row in the Excel template. -""" - -import sys -import os -import csv -import random -import argparse -import re -from pathlib import Path -from datetime import datetime, timedelta -import pandas as pd -from openpyxl import load_workbook -from openpyxl.styles import Font, Alignment -from openpyxl.utils import get_column_letter - -# Shared helpers (keeps template writing consistent across scripts) -from scripts.shoplazza_import_template import create_excel_from_template as _create_excel_from_template_shared -from scripts.shoplazza_import_template import generate_handle as _generate_handle_shared - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent)) - - -def clean_value(value): - """ - Clean and normalize value. 
- - Args: - value: Value to clean - - Returns: - Cleaned string value - """ - if value is None: - return '' - value = str(value).strip() - # Remove surrounding quotes - if value.startswith('"') and value.endswith('"'): - value = value[1:-1] - return value - - -def parse_csv_row(row: dict) -> dict: - """ - Parse CSV row and extract fields. - - Args: - row: CSV row dictionary - - Returns: - Parsed data dictionary - """ - return { - 'skuId': clean_value(row.get('skuId', '')), - 'name': clean_value(row.get('name', '')), - 'name_pinyin': clean_value(row.get('name_pinyin', '')), - 'create_time': clean_value(row.get('create_time', '')), - 'ruSkuName': clean_value(row.get('ruSkuName', '')), - 'enSpuName': clean_value(row.get('enSpuName', '')), - 'categoryName': clean_value(row.get('categoryName', '')), - 'supplierName': clean_value(row.get('supplierName', '')), - 'brandName': clean_value(row.get('brandName', '')), - 'file_id': clean_value(row.get('file_id', '')), - 'days_since_last_update': clean_value(row.get('days_since_last_update', '')), - 'id': clean_value(row.get('id', '')), - 'imageUrl': clean_value(row.get('imageUrl', '')) - } - - -def generate_handle(title: str) -> str: - """ - Generate URL-friendly handle from title. - - Args: - title: Product title - - Returns: - URL-friendly handle (ASCII only) - """ - # Keep backward-compatible function name while delegating to shared helper. - return _generate_handle_shared(title) - - -def read_csv_file(csv_file: str) -> list: - """ - Read CSV file and return list of parsed rows. - - Args: - csv_file: Path to CSV file - - Returns: - List of parsed CSV data dictionaries - """ - csv_data_list = [] - - with open(csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - parsed = parse_csv_row(row) - csv_data_list.append(parsed) - - return csv_data_list - - -def csv_to_excel_row(csv_data: dict) -> dict: - """ - Convert CSV data row to Excel template row. 
- - Each CSV row represents a single product with one variant (S type in Excel). - - Args: - csv_data: Parsed CSV row data - - Returns: - Dictionary mapping Excel column names to values - """ - # Parse create_time - try: - created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - except: - created_at = datetime.now() - timedelta(days=random.randint(1, 365)) - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - - # Generate title - use name or enSpuName - title = csv_data['name'] or csv_data['enSpuName'] or 'Product' - - # Generate handle - prefer enSpuName, then name_pinyin, then title - handle_source = csv_data['enSpuName'] or csv_data['name_pinyin'] or title - handle = generate_handle(handle_source) - if handle and not handle.startswith('products/'): - handle = f'products/{handle}' - - # Generate SEO fields - seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title - seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title - seo_keywords_parts = [title] - if csv_data['categoryName']: - seo_keywords_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - seo_keywords_parts.append(csv_data['brandName']) - seo_keywords = ','.join(seo_keywords_parts) - - # Generate tags from category and brand - tags_parts = [] - if csv_data['categoryName']: - tags_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - tags_parts.append(csv_data['brandName']) - tags = ','.join(tags_parts) if tags_parts else '' - - # Generate prices (similar to import_tenant2_csv.py) - price = round(random.uniform(50, 500), 2) - compare_at_price = round(price * random.uniform(1.2, 1.5), 2) - cost_price = round(price * 0.6, 2) - - # Generate random stock - inventory_quantity = random.randint(0, 100) - - # Generate random weight - weight = round(random.uniform(0.1, 5.0), 2) - weight_unit = 'kg' - - # Use ruSkuName as SKU 
title, fallback to name - sku_title = csv_data['ruSkuName'] or csv_data['name'] or 'SKU' - - # Use skuId as SKU code - sku_code = csv_data['skuId'] or '' - - # Generate barcode - try: - sku_id = int(csv_data['skuId']) - barcode = f"BAR{sku_id:08d}" - except: - barcode = '' - - # Build description - description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' - - # Build brief (subtitle) - brief = csv_data['name'] or '' - - # Excel row data (mapping to Excel template columns) - excel_row = { - '商品ID': '', # Empty for new products - '创建时间': create_time_str, - '商品标题*': title, - '商品属性*': 'S', # Single variant product - '商品副标题': brief, - '商品描述': description, - 'SEO标题': seo_title, - 'SEO描述': seo_description, - 'SEO URL Handle': handle, - 'SEO URL 重定向': 'N', # Default to N - 'SEO关键词': seo_keywords, - '商品上架': 'Y', # Published by default - '需要物流': 'Y', # Requires shipping - '商品收税': 'N', # Not taxable by default - '商品spu': '', # Empty - '启用虚拟销量': 'N', # No fake sales - '虚拟销量值': '', # Empty - '跟踪库存': 'Y', # Track inventory - '库存规则*': '1', # Allow purchase when stock is 0 - '专辑名称': csv_data['categoryName'] or '', # Category as album - '标签': tags, - '供应商名称': csv_data['supplierName'] or '', - '供应商URL': '', # Empty - '款式1': '', # Not used for S type - '款式2': '', # Not used for S type - '款式3': '', # Not used for S type - '商品售价*': price, - '商品原价': compare_at_price, - '成本价': cost_price, - '商品SKU': sku_code, - '商品重量': weight, - '重量单位': weight_unit, - '商品条形码': barcode, - '商品库存': inventory_quantity, - '尺寸信息': '', # Empty - '原产地国别': '', # Empty - 'HS(协调制度)代码': '', # Empty - '商品图片*': csv_data['imageUrl'] or '', # Image URL - '商品备注': '', # Empty - '款式备注': '', # Empty - '商品主图': csv_data['imageUrl'] or '', # Main image URL - } - - return excel_row - - -def create_excel_from_template(template_file: str, output_file: str, csv_data_list: list): - """ - Create Excel file from template and fill with CSV data. 
- - Args: - template_file: Path to Excel template file - output_file: Path to output Excel file - csv_data_list: List of parsed CSV data dictionaries - """ - excel_rows = [csv_to_excel_row(d) for d in csv_data_list] - _create_excel_from_template_shared(template_file, output_file, excel_rows) - print(f"Excel file created: {output_file}") - print(f" - Total rows: {len(csv_data_list)}") - - -def main(): - parser = argparse.ArgumentParser(description='Convert CSV data to Excel import template') - parser.add_argument('--csv-file', - default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w', - help='CSV file path (default: data/customer1/goods_with_pic.5years_congku.csv.shuf.1w)') - parser.add_argument('--template', - default='docs/商品导入模板.xlsx', - help='Excel template file path (default: docs/商品导入模板.xlsx)') - parser.add_argument('--output', - default='商品导入数据.xlsx', - help='Output Excel file path (default: 商品导入数据.xlsx)') - parser.add_argument('--limit', - type=int, - default=None, - help='Limit number of rows to process (default: all)') - - args = parser.parse_args() - - # Check if files exist - if not os.path.exists(args.csv_file): - print(f"Error: CSV file not found: {args.csv_file}") - sys.exit(1) - - if not os.path.exists(args.template): - print(f"Error: Template file not found: {args.template}") - sys.exit(1) - - # Read CSV file - print(f"Reading CSV file: {args.csv_file}") - csv_data_list = read_csv_file(args.csv_file) - print(f"Read {len(csv_data_list)} rows from CSV") - - # Limit rows if specified - if args.limit: - csv_data_list = csv_data_list[:args.limit] - print(f"Limited to {len(csv_data_list)} rows") - - # Create Excel file - print(f"Creating Excel file from template: {args.template}") - print(f"Output file: {args.output}") - create_excel_from_template(args.template, args.output, csv_data_list) - - print(f"\nDone! 
Generated {len(csv_data_list)} product rows in Excel file.") - - -if __name__ == '__main__': - main() - diff --git a/scripts/csv_to_excel_multi_variant.py b/scripts/csv_to_excel_multi_variant.py deleted file mode 100755 index 4e91f59..0000000 --- a/scripts/csv_to_excel_multi_variant.py +++ /dev/null @@ -1,565 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert CSV data to Excel import template with multi-variant support. - -Reads CSV file (goods_with_pic.5years_congku.csv.shuf.1w) and generates Excel file -based on the template format (商品导入模板.xlsx). - -Features: -- 30% products as Single variant (S type) -- 70% products as Multi variant (M+P type) with color, size, material options -""" - -import sys -import os -import csv -import random -import argparse -import re -from pathlib import Path -from datetime import datetime, timedelta -import itertools -from openpyxl import load_workbook -from openpyxl.styles import Alignment - -# Shared helpers (keeps template writing consistent across scripts) -from scripts.shoplazza_import_template import create_excel_from_template as _create_excel_from_template_shared -from scripts.shoplazza_import_template import generate_handle as _generate_handle_shared - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -# Color definitions -COLORS = [ - "Red", "Blue", "Green", "Yellow", "Black", "White", "Orange", "Purple", - "Pink", "Brown", "Gray", "Navy", "Beige", "Cream", "Maroon", "Olive", - "Teal", "Cyan", "Magenta", "Lime", "Indigo", "Gold", "Silver", "Bronze", - "Coral", "Turquoise", "Violet", "Khaki", "Charcoal", "Ivory" -] - - -def clean_value(value): - """ - Clean and normalize value. 
- - Args: - value: Value to clean - - Returns: - Cleaned string value - """ - if value is None: - return '' - value = str(value).strip() - # Remove surrounding quotes - if value.startswith('"') and value.endswith('"'): - value = value[1:-1] - return value - - -def parse_csv_row(row: dict) -> dict: - """ - Parse CSV row and extract fields. - - Args: - row: CSV row dictionary - - Returns: - Parsed data dictionary - """ - return { - 'skuId': clean_value(row.get('skuId', '')), - 'name': clean_value(row.get('name', '')), - 'name_pinyin': clean_value(row.get('name_pinyin', '')), - 'create_time': clean_value(row.get('create_time', '')), - 'ruSkuName': clean_value(row.get('ruSkuName', '')), - 'enSpuName': clean_value(row.get('enSpuName', '')), - 'categoryName': clean_value(row.get('categoryName', '')), - 'supplierName': clean_value(row.get('supplierName', '')), - 'brandName': clean_value(row.get('brandName', '')), - 'file_id': clean_value(row.get('file_id', '')), - 'days_since_last_update': clean_value(row.get('days_since_last_update', '')), - 'id': clean_value(row.get('id', '')), - 'imageUrl': clean_value(row.get('imageUrl', '')) - } - - -def generate_handle(title: str) -> str: - """ - Generate URL-friendly handle from title. - - Args: - title: Product title - - Returns: - URL-friendly handle (ASCII only) - """ - # Keep backward-compatible function name while delegating to shared helper. - return _generate_handle_shared(title) - - -def extract_material_from_title(title: str) -> str: - """ - Extract material from title by taking the last word after splitting by space. 
- - 按照商品标题空格分割后的最后一个字符串作为material。 - 例如:"消防套 塑料【英文包装】" -> 最后一个字符串是 "塑料【英文包装】" - - Args: - title: Product title - - Returns: - Material string (single value) - """ - if not title: - return 'default' - - # Split by spaces (只按空格分割,保持原样) - parts = title.strip().split() - if parts: - # Get last part (最后一个字符串) - material = parts[-1] - # Remove brackets but keep content - material = re.sub(r'[【】\[\]()()]', '', material) - material = material.strip() - if material: - return material - - return 'default' - - -def generate_single_variant_row(csv_data: dict, base_sku_id: int = 1) -> dict: - """ - Generate Excel row for Single variant (S type) product. - - Args: - csv_data: Parsed CSV row data - base_sku_id: Base SKU ID for generating SKU code - - Returns: - Dictionary mapping Excel column names to values - """ - # Parse create_time - try: - created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - except: - created_at = datetime.now() - timedelta(days=random.randint(1, 365)) - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - - # Generate title - use name or enSpuName - title = csv_data['name'] or csv_data['enSpuName'] or 'Product' - - # Generate handle - prefer enSpuName, then name_pinyin, then title - handle_source = csv_data['enSpuName'] or csv_data['name_pinyin'] or title - handle = generate_handle(handle_source) - if handle and not handle.startswith('products/'): - handle = f'products/{handle}' - - # Generate SEO fields - seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title - seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title - seo_keywords_parts = [title] - if csv_data['categoryName']: - seo_keywords_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - seo_keywords_parts.append(csv_data['brandName']) - seo_keywords = ','.join(seo_keywords_parts) - - # Generate tags from category and 
brand - tags_parts = [] - if csv_data['categoryName']: - tags_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - tags_parts.append(csv_data['brandName']) - tags = ','.join(tags_parts) if tags_parts else '' - - # Generate prices - price = round(random.uniform(50, 500), 2) - compare_at_price = round(price * random.uniform(1.2, 1.5), 2) - cost_price = round(price * 0.6, 2) - - # Generate random stock - inventory_quantity = random.randint(0, 100) - - # Generate random weight - weight = round(random.uniform(0.1, 5.0), 2) - weight_unit = 'kg' - - # Use skuId as SKU code - sku_code = csv_data['skuId'] or f'SKU-{base_sku_id}' - - # Generate barcode - try: - sku_id = int(csv_data['skuId']) if csv_data['skuId'] else base_sku_id - barcode = f"BAR{sku_id:08d}" - except: - barcode = f"BAR{base_sku_id:08d}" - - # Build description - description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' - - # Build brief (subtitle) - brief = csv_data['name'] or '' - - # Excel row data - excel_row = { - '商品ID': '', # Empty for new products - '创建时间': create_time_str, - '商品标题*': title, - '商品属性*': 'S', # Single variant product - '商品副标题': brief, - '商品描述': description, - 'SEO标题': seo_title, - 'SEO描述': seo_description, - 'SEO URL Handle': handle, - 'SEO URL 重定向': 'N', - 'SEO关键词': seo_keywords, - '商品上架': 'Y', - '需要物流': 'Y', - '商品收税': 'N', - '商品spu': '', - '启用虚拟销量': 'N', - '虚拟销量值': '', - '跟踪库存': 'Y', - '库存规则*': '1', - '专辑名称': csv_data['categoryName'] or '', - '标签': tags, - '供应商名称': csv_data['supplierName'] or '', - '供应商URL': '', - '款式1': '', # Empty for S type - '款式2': '', # Empty for S type - '款式3': '', # Empty for S type - '商品售价*': price, - '商品原价': compare_at_price, - '成本价': cost_price, - '商品SKU': sku_code, - '商品重量': weight, - '重量单位': weight_unit, - '商品条形码': barcode, - '商品库存': inventory_quantity, - '尺寸信息': '', - '原产地国别': '', - 'HS(协调制度)代码': '', - '商品图片*': csv_data['imageUrl'] or '', - '商品备注': '', - '款式备注': '', - '商品主图': csv_data['imageUrl'] or '', - } - - return excel_row - - -def generate_multi_variant_rows(csv_data: dict, base_sku_id: int = 1) -> list: - """ - Generate Excel rows for Multi variant (M+P type) product. 
- - Returns a list of rows: - - First row: M (主商品) with option names - - Following rows: P (子款式) with option values - - Args: - csv_data: Parsed CSV row data - base_sku_id: Base SKU ID for generating SKU codes - - Returns: - List of dictionaries mapping Excel column names to values - """ - rows = [] - - # Parse create_time - try: - created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - except: - created_at = datetime.now() - timedelta(days=random.randint(1, 365)) - create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') - - # Generate title - title = csv_data['name'] or csv_data['enSpuName'] or 'Product' - - # Generate handle - handle_source = csv_data['enSpuName'] or csv_data['name_pinyin'] or title - handle = generate_handle(handle_source) - if handle and not handle.startswith('products/'): - handle = f'products/{handle}' - - # Generate SEO fields - seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title - seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title - seo_keywords_parts = [title] - if csv_data['categoryName']: - seo_keywords_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - seo_keywords_parts.append(csv_data['brandName']) - seo_keywords = ','.join(seo_keywords_parts) - - # Generate tags - tags_parts = [] - if csv_data['categoryName']: - tags_parts.append(csv_data['categoryName']) - if csv_data['brandName']: - tags_parts.append(csv_data['brandName']) - tags = ','.join(tags_parts) if tags_parts else '' - - # Extract material from title (last word after splitting by space) - material = extract_material_from_title(title) - - # Generate color options: randomly select 2-10 colors from COLORS list - num_colors = random.randint(2, 10) - selected_colors = random.sample(COLORS, min(num_colors, len(COLORS))) - - # Generate size options: 1-30, randomly select 4-8 - num_sizes = 
random.randint(4, 8) - all_sizes = [str(i) for i in range(1, 31)] - selected_sizes = random.sample(all_sizes, num_sizes) - - # Material has only one value - materials = [material] - - # Generate all combinations (Cartesian product) - variants = list(itertools.product(selected_colors, selected_sizes, materials)) - - # Generate M row (主商品) - description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' - brief = csv_data['name'] or '' - - m_row = { - '商品ID': '', - '创建时间': create_time_str, - '商品标题*': title, - '商品属性*': 'M', # Main product - '商品副标题': brief, - '商品描述': description, - 'SEO标题': seo_title, - 'SEO描述': seo_description, - 'SEO URL Handle': handle, - 'SEO URL 重定向': 'N', - 'SEO关键词': seo_keywords, - '商品上架': 'Y', - '需要物流': 'Y', - '商品收税': 'N', - '商品spu': '', - '启用虚拟销量': 'N', - '虚拟销量值': '', - '跟踪库存': 'Y', - '库存规则*': '1', - '专辑名称': csv_data['categoryName'] or '', - '标签': tags, - '供应商名称': csv_data['supplierName'] or '', - '供应商URL': '', - '款式1': 'color', # Option name - '款式2': 'size', # Option name - '款式3': 'material', # Option name - '商品售价*': '', # Empty for M row - '商品原价': '', - '成本价': '', - '商品SKU': '', # Empty for M row - '商品重量': '', - '重量单位': '', - '商品条形码': '', - '商品库存': '', # Empty for M row - '尺寸信息': '', - '原产地国别': '', - 'HS(协调制度)代码': '', - '商品图片*': csv_data['imageUrl'] or '', # Main product image - '商品备注': '', - '款式备注': '', - '商品主图': csv_data['imageUrl'] or '', - } - rows.append(m_row) - - # Generate P rows (子款式) for each variant combination - base_price = round(random.uniform(50, 500), 2) - - for variant_idx, (color, size, mat) in enumerate(variants): - # Generate price variation (within ±20% of base) - price = round(base_price * random.uniform(0.8, 1.2), 2) - compare_at_price = round(price * random.uniform(1.2, 1.5), 2) - cost_price = round(price * 0.6, 2) - - # Generate random stock - inventory_quantity = random.randint(0, 100) - - # Generate random weight - weight = round(random.uniform(0.1, 5.0), 2) - weight_unit = 'kg' - - # Generate SKU code - sku_code = f"{csv_data['skuId']}-{color}-{size}-{mat}" if csv_data['skuId'] else f'SKU-{base_sku_id}-{variant_idx+1}' - - # Generate barcode - barcode = f"BAR{base_sku_id:08d}{variant_idx+1:03d}" - - p_row = { - '商品ID': '', - '创建时间': create_time_str, - '商品标题*': title, # Same as M row - '商品属性*': 'P', # Variant - '商品副标题': '', # Empty for P row - '商品描述': '', # Empty for P row - 
'SEO标题': '', # Empty for P row - 'SEO描述': '', # Empty for P row - 'SEO URL Handle': '', # Empty for P row - 'SEO URL 重定向': '', - 'SEO关键词': '', - '商品上架': 'Y', - '需要物流': 'Y', - '商品收税': 'N', - '商品spu': '', - '启用虚拟销量': 'N', - '虚拟销量值': '', - '跟踪库存': 'Y', - '库存规则*': '1', - '专辑名称': '', # Empty for P row - '标签': '', # Empty for P row - '供应商名称': '', # Empty for P row - '供应商URL': '', - '款式1': color, # Option value - '款式2': size, # Option value - '款式3': mat, # Option value - '商品售价*': price, - '商品原价': compare_at_price, - '成本价': cost_price, - '商品SKU': sku_code, - '商品重量': weight, - '重量单位': weight_unit, - '商品条形码': barcode, - '商品库存': inventory_quantity, - '尺寸信息': '', - '原产地国别': '', - 'HS(协调制度)代码': '', - '商品图片*': '', # Empty for P row (uses main product image) - '商品备注': '', - '款式备注': '', - '商品主图': '', - } - rows.append(p_row) - - return rows - - -def read_csv_file(csv_file: str) -> list: - """ - Read CSV file and return list of parsed rows. - - Args: - csv_file: Path to CSV file - - Returns: - List of parsed CSV data dictionaries - """ - csv_data_list = [] - - with open(csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - parsed = parse_csv_row(row) - csv_data_list.append(parsed) - - return csv_data_list - - -def create_excel_from_template(template_file: str, output_file: str, excel_rows: list): - """ - Create Excel file from template and fill with data rows. 
- - Args: - template_file: Path to Excel template file - output_file: Path to output Excel file - excel_rows: List of dictionaries mapping Excel column names to values - """ - _create_excel_from_template_shared(template_file, output_file, excel_rows) - print(f"Excel file created: {output_file}") - print(f" - Total rows: {len(excel_rows)}") - - -def main(): - parser = argparse.ArgumentParser(description='Convert CSV data to Excel import template with multi-variant support') - parser.add_argument('--csv-file', - default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w', - help='CSV file path') - parser.add_argument('--template', - default='docs/商品导入模板.xlsx', - help='Excel template file path') - parser.add_argument('--output', - default='商品导入数据.xlsx', - help='Output Excel file path') - parser.add_argument('--limit', - type=int, - default=None, - help='Limit number of products to process') - parser.add_argument('--single-ratio', - type=float, - default=0.3, - help='Ratio of single variant products (default: 0.3 = 30%%)') - parser.add_argument('--seed', - type=int, - default=None, - help='Random seed for reproducible results') - - args = parser.parse_args() - - # Set random seed if provided - if args.seed is not None: - random.seed(args.seed) - - # Check if files exist - if not os.path.exists(args.csv_file): - print(f"Error: CSV file not found: {args.csv_file}") - sys.exit(1) - - if not os.path.exists(args.template): - print(f"Error: Template file not found: {args.template}") - sys.exit(1) - - # Read CSV file - print(f"Reading CSV file: {args.csv_file}") - csv_data_list = read_csv_file(args.csv_file) - print(f"Read {len(csv_data_list)} rows from CSV") - - # Limit products if specified - if args.limit: - csv_data_list = csv_data_list[:args.limit] - print(f"Limited to {len(csv_data_list)} products") - - # Generate Excel rows - print(f"\nGenerating Excel rows...") - print(f" - Single variant ratio: {args.single_ratio*100:.0f}%") - print(f" - Multi variant ratio: 
{(1-args.single_ratio)*100:.0f}%") - - excel_rows = [] - single_count = 0 - multi_count = 0 - - for idx, csv_data in enumerate(csv_data_list): - # Decide if this product should be single or multi variant - is_single = random.random() < args.single_ratio - - if is_single: - # Generate single variant (S type) - row = generate_single_variant_row(csv_data, base_sku_id=idx+1) - excel_rows.append(row) - single_count += 1 - else: - # Generate multi variant (M+P type) - rows = generate_multi_variant_rows(csv_data, base_sku_id=idx+1) - excel_rows.extend(rows) - multi_count += 1 - - print(f"\nGenerated:") - print(f" - Single variant products: {single_count}") - print(f" - Multi variant products: {multi_count}") - print(f" - Total Excel rows: {len(excel_rows)}") - - # Create Excel file - print(f"\nCreating Excel file from template: {args.template}") - print(f"Output file: {args.output}") - create_excel_from_template(args.template, args.output, excel_rows) - - print(f"\nDone! Generated {len(excel_rows)} rows in Excel file.") - - -if __name__ == '__main__': - main() - diff --git a/scripts/daily_log_router.sh b/scripts/daily_log_router.sh deleted file mode 100755 index 5c9ddda..0000000 --- a/scripts/daily_log_router.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -# -# Route incoming log stream into per-day files. 
-# -# Usage: -# command 2>&1 | ./scripts/daily_log_router.sh [retention_days] -# - -set -euo pipefail - -if [ "$#" -lt 2 ]; then - echo "Usage: $0 [retention_days]" >&2 - exit 1 -fi - -SERVICE_NAME="$1" -LOG_DIR="$2" -RETENTION_DAYS="${3:-30}" - -mkdir -p "${LOG_DIR}" - -awk -v dir="${LOG_DIR}" -v service="${SERVICE_NAME}" -v retention_days="${RETENTION_DAYS}" ' -function rotate_file(day) { - return sprintf("%s/%s-%s.log", dir, service, day) -} - -function update_symlink(day) { - cmd = sprintf("ln -sfn \"%s-%s.log\" \"%s/%s.log\"", service, day, dir, service) - system(cmd) -} - -function cleanup_old_logs() { - cmd = sprintf("find \"%s\" -maxdepth 1 -type f -name \"%s-*.log\" -mtime +%d -delete >/dev/null 2>&1", dir, service, retention_days) - system(cmd) -} - -{ - day = strftime("%Y-%m-%d") - target = rotate_file(day) - - if (target != current_target) { - update_symlink(day) - cleanup_old_logs() - current_target = target - } - - print >> current_target - fflush(current_target) -} - -END { - if (current_target != "") { - close(current_target) - } -} -' diff --git a/scripts/data_import/README.md b/scripts/data_import/README.md new file mode 100644 index 0000000..98b831e --- /dev/null +++ b/scripts/data_import/README.md @@ -0,0 +1,13 @@ +# Data Import Scripts + +这一组脚本用于把外部商品数据或 CSV/XLSX 样本转换为 Shoplazza 导入格式。 + +- `amazon_xlsx_to_shoplazza_xlsx.py` +- `competitor_xlsx_to_shoplazza_xlsx.py` +- `csv_to_excel.py` +- `csv_to_excel_multi_variant.py` +- `shoplazza_excel_template.py` +- `shoplazza_import_template.py` +- `tenant3_csv_to_shoplazza_xlsx.sh` + +这里是离线数据转换工具,不属于线上服务运维入口。 diff --git a/scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py b/scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py new file mode 100644 index 0000000..f9a80c5 --- /dev/null +++ b/scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py @@ -0,0 +1,615 @@ +#!/usr/bin/env python3 +""" +Convert Amazon-format Excel exports (with Parent/Child ASIN structure) into +Shoplazza (店匠) product import Excel 
format based on `docs/商品导入模板.xlsx`. + +Data source: +- Directory with multiple `*.xlsx` files under `products_data/`. +- Each file contains a main sheet + "Notes" sheet. +- Column meanings (sample): + - ASIN: variant id (sku_id) + - 父ASIN: parent product id (spu_id) + +Output: +- For each 父ASIN group: + - If only 1 ASIN: generate one "S" row + - Else: generate one "M" row + multiple "P" rows + +Multi-variant (M/P) key point: +- Variant dimensions are parsed primarily from the `SKU` column, e.g. + "Size: One Size | Color: Black", and mapped into 款式1/2/3. +""" + +# NOTE: This file is intentionally the same implementation as +# `competitor_xlsx_to_shoplazza_xlsx.py`, but renamed to reflect the correct +# data source (Amazon-format exports). Keep the logic in sync. + +import os +import re +import sys +import argparse +import random +from datetime import datetime +from collections import defaultdict, Counter +from pathlib import Path + +from openpyxl import load_workbook + +REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO_ROOT)) + +from scripts.data_import.shoplazza_excel_template import create_excel_from_template_fast + + +PREFERRED_OPTION_KEYS = [ + "Size", "Color", "Style", "Pattern", "Material", "Flavor", "Scent", + "Pack", "Pack of", "Number of Items", "Count", "Capacity", "Length", + "Width", "Height", "Model", "Configuration", +] + + +def clean_str(v): + if v is None: + return "" + return str(v).strip() + + +def html_escape(s): + s = clean_str(s) + return (s.replace("&", "&") + .replace("<", "<") + .replace(">", ">")) + + +def generate_handle(title): + """ + Generate URL-friendly handle from title (ASCII only). + Keep consistent with existing scripts. 
+ """ + handle = clean_str(title).lower() + handle = re.sub(r"[^a-z0-9\\s-]", "", handle) + handle = re.sub(r"[-\\s]+", "-", handle).strip("-") + if len(handle) > 255: + handle = handle[:255] + return handle or "product" + + +def parse_date_to_template(dt_value): + """ + Template expects: YYYY-MM-DD HH:MM:SS + Input could be "2018-05-09" or datetime/date. + """ + if dt_value is None or dt_value == "": + return "" + if isinstance(dt_value, datetime): + return dt_value.strftime("%Y-%m-%d %H:%M:%S") + s = clean_str(dt_value) + for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S"): + try: + d = datetime.strptime(s, fmt) + return d.strftime("%Y-%m-%d %H:%M:%S") + except Exception: + pass + return "" + + +def parse_weight(weight_conv, weight_raw): + """ + Return (weight_value, unit) where unit in {kg, lb, g, oz}. + Prefer '商品重量(单位换算)' like '68.04 g'. + Fallback to '商品重量' like '0.15 pounds'. + """ + s = clean_str(weight_conv) or clean_str(weight_raw) + if not s: + return ("", "") + m = re.search(r"([0-9]+(?:\\.[0-9]+)?)\\s*([a-zA-Z]+)", s) + if not m: + return ("", "") + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in ("g", "gram", "grams"): + return (val, "g") + if unit in ("kg", "kilogram", "kilograms"): + return (val, "kg") + if unit in ("lb", "lbs", "pound", "pounds"): + return (val, "lb") + if unit in ("oz", "ounce", "ounces"): + return (val, "oz") + return ("", "") + + +def parse_dimensions_inches(dim_raw): + """ + Template '尺寸信息': 'L,W,H' in inches. + Input example: '7.9 x 7.9 x 2 inches' + """ + s = clean_str(dim_raw) + if not s: + return "" + nums = re.findall(r"([0-9]+(?:\\.[0-9]+)?)", s) + if len(nums) < 3: + return "" + return "{},{},{}".format(nums[0], nums[1], nums[2]) + + +def parse_sku_options(sku_text): + """ + Parse 'SKU' column into {key: value}. 
+ Example: + 'Size: One Size | Color: Black' -> {'Size':'One Size','Color':'Black'} + """ + s = clean_str(sku_text) + if not s: + return {} + parts = [p.strip() for p in s.split("|") if p.strip()] + out = {} + for p in parts: + if ":" not in p: + continue + k, v = p.split(":", 1) + k = clean_str(k) + v = clean_str(v) + if k and v: + out[k] = v + return out + + +def choose_option_keys(variant_dicts, max_keys=3): + freq = Counter() + for d in variant_dicts: + for k, v in d.items(): + if v: + freq[k] += 1 + if not freq: + return [] + preferred_rank = {k: i for i, k in enumerate(PREFERRED_OPTION_KEYS)} + + def key_sort(k): + return (preferred_rank.get(k, 10 ** 6), -freq[k], k.lower()) + + keys = sorted(freq.keys(), key=key_sort) + return keys[:max_keys] + + +def build_description_html(title, details, product_url): + parts = [] + if title: + parts.append("

    {}

    ".format(html_escape(title))) + detail_items = [x.strip() for x in clean_str(details).split("|") if x.strip()] + if detail_items: + li = "".join(["
  • {}
  • ".format(html_escape(x)) for x in detail_items[:30]]) + parts.append("
      {}
    ".format(li)) + if product_url: + parts.append('

    Source: {0}

    '.format(html_escape(product_url))) + return "".join(parts) + + +def read_amazon_rows_from_file(xlsx_path, max_rows=None): + wb = load_workbook(xlsx_path, read_only=True, data_only=True) + sheet_name = None + for name in wb.sheetnames: + if str(name).lower() == "notes": + continue + sheet_name = name + break + if sheet_name is None: + return [] + ws = wb[sheet_name] + + # Build header index from first row + header = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) + idx = {clean_str(v): i for i, v in enumerate(header) if v is not None and clean_str(v)} + + required = ["ASIN", "父ASIN", "商品标题", "商品主图", "SKU", "详细参数", "价格($)", "prime价格($)", + "上架时间", "类目路径", "大类目", "小类目", "品牌", "品牌链接", "商品详情页链接", + "商品重量(单位换算)", "商品重量", "商品尺寸"] + for k in required: + if k not in idx: + raise RuntimeError("Missing column '{}' in {} sheet {}".format(k, xlsx_path, sheet_name)) + + # OPT: use iter_rows(values_only=True) instead of ws.cell() per field. + # openpyxl cell access is relatively expensive; values_only is much faster. 
+ pos = {k: idx[k] for k in required} # 0-based positions in row tuple + + rows = [] + end_row = ws.max_row + if max_rows is not None: + end_row = min(end_row, 1 + int(max_rows)) + + for tup in ws.iter_rows(min_row=2, max_row=end_row, values_only=True): + asin = clean_str(tup[pos["ASIN"]]) + if not asin: + continue + parent = clean_str(tup[pos["父ASIN"]]) or asin + rows.append({ + "ASIN": asin, + "父ASIN": parent, + "SKU": clean_str(tup[pos["SKU"]]), + "详细参数": clean_str(tup[pos["详细参数"]]), + "商品标题": clean_str(tup[pos["商品标题"]]), + "商品主图": clean_str(tup[pos["商品主图"]]), + "价格($)": tup[pos["价格($)"]], + "prime价格($)": tup[pos["prime价格($)"]], + "上架时间": clean_str(tup[pos["上架时间"]]), + "类目路径": clean_str(tup[pos["类目路径"]]), + "大类目": clean_str(tup[pos["大类目"]]), + "小类目": clean_str(tup[pos["小类目"]]), + "品牌": clean_str(tup[pos["品牌"]]), + "品牌链接": clean_str(tup[pos["品牌链接"]]), + "商品详情页链接": clean_str(tup[pos["商品详情页链接"]]), + "商品重量(单位换算)": clean_str(tup[pos["商品重量(单位换算)"]]), + "商品重量": clean_str(tup[pos["商品重量"]]), + "商品尺寸": clean_str(tup[pos["商品尺寸"]]), + }) + return rows + + +def to_price(v): + if v is None or v == "": + return None + try: + return float(v) + except Exception: + s = clean_str(v) + m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", s) + return float(m.group(1)) if m else None + + +def build_common_fields(base_row, spu_id): + title = base_row.get("商品标题") or "Product" + brand = base_row.get("品牌") or "" + big_cat = base_row.get("大类目") or "" + small_cat = base_row.get("小类目") or "" + cat_path = base_row.get("类目路径") or "" + + handle = generate_handle(title) + if handle and not handle.startswith("products/"): + handle = "products/{}".format(handle) + + seo_title = title + seo_desc_parts = [x for x in [brand, title, big_cat] if x] + seo_description = " ".join(seo_desc_parts)[:5000] + seo_keywords = ",".join([x for x in [title, brand, big_cat, small_cat] if x])[:5000] + tags = ",".join([x for x in [brand, big_cat, small_cat] if x]) + + created_at = parse_date_to_template(base_row.get("上架时间")) + 
description = build_description_html(title, base_row.get("详细参数"), base_row.get("商品详情页链接")) + + inventory_qty = 100 + weight_val, weight_unit = parse_weight(base_row.get("商品重量(单位换算)"), base_row.get("商品重量")) + size_info = parse_dimensions_inches(base_row.get("商品尺寸")) + + album = big_cat or (cat_path.split(":")[0] if cat_path else "") + + return { + "商品ID": "", + "创建时间": created_at, + "商品标题*": title[:255], + "商品副标题": "{} {}".format(brand, big_cat).strip()[:600], + "商品描述": description, + "SEO标题": seo_title[:5000], + "SEO描述": seo_description, + "SEO URL Handle": handle, + "SEO URL 重定向": "N", + "SEO关键词": seo_keywords, + "商品上架": "Y", + "需要物流": "Y", + "商品收税": "N", + "商品spu": spu_id[:100], + "启用虚拟销量": "N", + "虚拟销量值": "", + "跟踪库存": "Y", + "库存规则*": "1", + "专辑名称": album, + "标签": tags, + "供应商名称": "Amazon", + "供应商URL": base_row.get("商品详情页链接") or base_row.get("品牌链接") or "", + "商品重量": weight_val if weight_val != "" else "", + "重量单位": weight_unit, + "商品库存": inventory_qty, + "尺寸信息": size_info, + "原产地国别": "", + "HS(协调制度)代码": "", + "商品备注": "ASIN:{}; ParentASIN:{}; CategoryPath:{}".format( + base_row.get("ASIN", ""), spu_id, (cat_path[:200] if cat_path else "") + )[:500], + "款式备注": "", + } + + +def build_s_row(base_row): + spu_id = base_row.get("父ASIN") or base_row.get("ASIN") + common = build_common_fields(base_row, spu_id=spu_id) + price = to_price(base_row.get("prime价格($)")) or to_price(base_row.get("价格($)")) or 9.99 + image = base_row.get("商品主图") or "" + row = {} + row.update(common) + row.update({ + "商品属性*": "S", + "款式1": "", + "款式2": "", + "款式3": "", + "商品售价*": price, + "商品原价": price, + "成本价": "", + "商品SKU": base_row.get("ASIN") or "", + "商品条形码": "", + "商品图片*": image, + "商品主图": image, + }) + return row + + +def build_m_p_rows(variant_rows): + base = variant_rows[0] + spu_id = base.get("父ASIN") or base.get("ASIN") + common = build_common_fields(base, spu_id=spu_id) + + option_dicts = [parse_sku_options(v.get("SKU")) for v in variant_rows] + option_keys = 
def main():
    """CLI entry point: convert Amazon-format xlsx exports into Shoplazza import files.

    Pipeline:
      1. Read every ``*.xlsx`` in ``--input-dir`` and group variant rows by SPU
         (parent ASIN, falling back to the row's own ASIN); duplicate ASINs are
         dropped on first-seen basis.
      2. Build template rows per SPU (a single ``S`` row, or ``M`` + ``P`` rows)
         and assign a globally increasing 商品ID per SPU.
      3. Split rows into one or more output files without ever splitting a SPU
         group across files, then write them via the fast template writer.

    BUGFIX: the two ``store_false`` switches previously declared
    ``default=False``, which contradicted the documented default behaviour
    ("discard non-conforming data" / "default: skip") and made the flags
    no-ops.  Their defaults are now ``True`` so skipping is the default and
    passing a flag actually flips the behaviour.
    """
    parser = argparse.ArgumentParser(description="Convert Amazon-format xlsx files to Shoplazza import xlsx")
    parser.add_argument("--input-dir", default="data/mai_jia_jing_ling/products_data", help="Directory containing Amazon-format xlsx files")
    parser.add_argument("--template", default="docs/商品导入模板.xlsx", help="Shoplazza import template xlsx")
    parser.add_argument("--output", default="amazon_shoplazza_import.xlsx", help="Output xlsx file path (or prefix when split)")
    parser.add_argument("--max-files", type=int, default=None,
                        help="Limit number of xlsx files to read (for testing)")
    parser.add_argument("--max-rows-per-output", type=int, default=40000, help="Max total Excel rows per output file (including模板头部行,默认40000)")
    parser.add_argument("--max-products", type=int, default=None, help="Limit number of SPU groups to output (for testing)")
    # 默认行为:丢弃不符合要求的数据 (skip_* default to True; the flags opt out).
    parser.add_argument("--keep-spu-if-parent-missing", action="store_false", dest="skip_spu_if_parent_missing", default=True, help="Keep SPU even if parent ASIN not found in variants (default: skip entire SPU)")
    parser.add_argument("--fix-sku-if-title-mismatch", action="store_false", dest="skip_sku_if_title_mismatch", default=True, help="Fix SKU title to match parent instead of skipping (default: skip SKU with mismatched title)")
    args = parser.parse_args()

    if not os.path.isdir(args.input_dir):
        raise RuntimeError("input-dir not found: {}".format(args.input_dir))
    if not os.path.exists(args.template):
        raise RuntimeError("template not found: {}".format(args.template))

    files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if f.lower().endswith(".xlsx")]
    files.sort()
    if args.max_files is not None:
        files = files[: int(args.max_files)]

    print("Reading Amazon-format files: {} (from {})".format(len(files), args.input_dir), flush=True)

    groups = defaultdict(list)
    seen_asin = set()

    for fp in files:
        print(" - loading: {}".format(fp), flush=True)
        try:
            rows = read_amazon_rows_from_file(fp)
        except Exception as e:
            # Best-effort: a broken workbook should not abort the whole run.
            print("WARN: failed to read {}: {}".format(fp, e))
            continue
        print(" loaded rows: {}".format(len(rows)), flush=True)

        for r in rows:
            asin = r.get("ASIN")
            if asin in seen_asin:
                continue
            seen_asin.add(asin)
            spu_id = r.get("父ASIN") or asin
            groups[spu_id].append(r)

    print("Collected variants: {}, SPU groups: {}".format(len(seen_asin), len(groups)), flush=True)

    # 先按 SPU 构造每个组的行,方便做“按最大行数拆分但不拆组”
    group_rows_list = []  # List[List[dict]]
    spu_count = 0
    next_product_id = 1  # 用于填充商品ID,全局自增
    # 将SPU顺序打乱,避免过于依赖输入文件的顺序
    spu_items = list(groups.items())
    random.shuffle(spu_items)

    for spu_id, variants in spu_items:
        if not variants:
            continue

        # 确保父ASIN对应的变体在列表最前面
        parent_variant = None
        other_variants = []
        for v in variants:
            if v.get("ASIN") == spu_id:
                parent_variant = v
            else:
                other_variants.append(v)

        # 重新排序:父ASIN在前,其他在后
        if parent_variant:
            variants = [parent_variant] + other_variants
        else:
            # 如果找不到父ASIN对应的变体
            print(
                f"WARN: Parent ASIN not found in variants: SPU={spu_id}, "
                f"variant_count={len(variants)}, first_ASIN={variants[0].get('ASIN') if variants else 'N/A'}",
                flush=True,
            )
            # 根据开关决定是否丢弃整个SPU
            if args.skip_spu_if_parent_missing:
                print(
                    f"SKIP entire SPU due to missing parent ASIN: SPU={spu_id}",
                    flush=True,
                )
                continue

        # 处理变体标题:如果与主商品不一致,根据开关决定修正或丢弃
        main_title = variants[0].get("商品标题") or ""
        filtered_variants = []
        for v in variants:
            title = v.get("商品标题") or ""
            if main_title and title and title != main_title:
                if args.skip_sku_if_title_mismatch:
                    # 丢弃标题不一致的SKU
                    print(
                        f"SKIP SKU due to title mismatch: SPU={spu_id}, ASIN={v.get('ASIN')}, "
                        f"main_title='{main_title}', variant_title='{title}'",
                        flush=True,
                    )
                    continue
                else:
                    # 修正标题
                    print(
                        f"FIX variant title mismatch: SPU={spu_id}, ASIN={v.get('ASIN')}, "
                        f"main_title='{main_title}', variant_title='{title}' -> using main_title",
                        flush=True,
                    )
                    v["商品标题"] = main_title  # 统一为主商品标题
            filtered_variants.append(v)

        # 如果所有变体都被过滤掉,跳过整个SPU
        if not filtered_variants:
            print(
                f"SKIP entire SPU: all variants filtered out, SPU={spu_id}",
                flush=True,
            )
            continue

        variants = filtered_variants

        spu_count += 1
        if args.max_products is not None and spu_count > int(args.max_products):
            break

        if len(variants) == 1:
            rows = [build_s_row(variants[0])]
        else:
            rows = build_m_p_rows(variants)

        # 填充商品ID(从1开始全局递增)
        # NOTE(review): the mangled original is ambiguous about whether the
        # increment sits inside the row loop; one id per SPU (shared by its
        # M/P rows) matches the visible layout — confirm against the importer.
        for r in rows:
            r["商品ID"] = next_product_id
        next_product_id += 1

        group_rows_list.append(rows)

    # 按最大行数拆成多个文件(注意:同一 SPU 不拆分)
    data_start_row = 4  # 与模板/写入工具保持一致
    header_rows = data_start_row - 1  # 包含标题行+说明行
    max_total_rows = args.max_rows_per_output or 0
    if max_total_rows and max_total_rows > header_rows:
        max_data_rows = max_total_rows - header_rows
    else:
        max_data_rows = None  # 不限制

    chunks = []
    current_chunk = []
    current_count = 0

    if max_data_rows is None:
        # 不做分片,直接一个 chunk
        for gr in group_rows_list:
            current_chunk.extend(gr)
        if current_chunk:
            chunks.append(current_chunk)
    else:
        for gr in group_rows_list:
            gsize = len(gr)
            # 如果单个 SPU 本身就超过阈值,只能独占一个文件
            if gsize > max_data_rows:
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = []
                    current_count = 0
                chunks.append(gr)
                continue
            # 如果放不下当前 chunk,则先封一个,再开新 chunk
            if current_count + gsize > max_data_rows:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = list(gr)
                current_count = gsize
            else:
                current_chunk.extend(gr)
                current_count += gsize
        if current_chunk:
            chunks.append(current_chunk)

    total_rows = sum(len(c) for c in chunks)
    print(
        "Generated Excel data rows: {} (SPU groups output: {}, files: {})".format(
            total_rows, len(group_rows_list), len(chunks)
        ),
        flush=True,
    )

    # 输出多个文件:如果只一个 chunk,直接用指定 output;多个则加 _partN 后缀
    base = Path(args.output)
    stem = base.stem
    suffix = base.suffix or ".xlsx"

    for idx, chunk in enumerate(chunks, start=1):
        out_path = str(base) if len(chunks) == 1 else str(base.with_name(f"{stem}_part{idx}{suffix}"))
        print(f"Writing file {idx}/{len(chunks)}: {out_path} (rows: {len(chunk)})", flush=True)
        create_excel_from_template_fast(args.template, out_path, chunk, data_start_row=data_start_row)
#!/usr/bin/env python3
"""Backward-compatibility shim for a misleadingly named script.

The xlsx files under ``data/mai_jia_jing_ling/products_data`` are
Amazon-format exports (Parent/Child ASIN), not competitor data, so the real
implementation lives in ``scripts/data_import/amazon_xlsx_to_shoplazza_xlsx.py``.
This module keeps the old entry-point name working by delegating directly to
that script's ``main``; ``sys.argv`` is shared, so every CLI flag is forwarded
unchanged and callers automatically pick up the latest fast read/write path.
"""

import sys
from pathlib import Path

# Make the repository root importable so the absolute package import below
# works no matter which directory the script is launched from.
REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO_ROOT))

from scripts.data_import.amazon_xlsx_to_shoplazza_xlsx import main as amazon_main


if __name__ == "__main__":
    amazon_main()
def clean_value(value):
    """Normalize a raw CSV cell value.

    Args:
        value: Raw cell content of any type (possibly ``None``).

    Returns:
        The value as a stripped string, with one pair of surrounding
        double quotes removed when present; ``''`` for ``None``.
    """
    if value is None:
        return ''
    text = str(value).strip()
    # Drop a single layer of surrounding double quotes.
    if text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    return text


def parse_csv_row(row: dict) -> dict:
    """Extract and clean the known CSV columns from one record.

    Args:
        row: One record as produced by ``csv.DictReader``.

    Returns:
        Dict with a fixed key set, every value passed through clean_value
        (missing columns become '').
    """
    fields = (
        'skuId', 'name', 'name_pinyin', 'create_time', 'ruSkuName',
        'enSpuName', 'categoryName', 'supplierName', 'brandName',
        'file_id', 'days_since_last_update', 'id', 'imageUrl',
    )
    return {field: clean_value(row.get(field, '')) for field in fields}


def generate_handle(title: str) -> str:
    """Generate a URL-friendly (ASCII-only) handle from a product title.

    Kept under its historical name for backward compatibility; the actual
    work is done by the shared template helper.
    """
    return _generate_handle_shared(title)


def read_csv_file(csv_file: str) -> list:
    """Load a CSV file and parse every record.

    Args:
        csv_file: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        List of dicts, one per record, each produced by parse_csv_row.
    """
    with open(csv_file, 'r', encoding='utf-8') as handle:
        return [parse_csv_row(record) for record in csv.DictReader(handle)]
seo_keywords_parts.append(csv_data['brandName']) + seo_keywords = ','.join(seo_keywords_parts) + + # Generate tags from category and brand + tags_parts = [] + if csv_data['categoryName']: + tags_parts.append(csv_data['categoryName']) + if csv_data['brandName']: + tags_parts.append(csv_data['brandName']) + tags = ','.join(tags_parts) if tags_parts else '' + + # Generate prices (similar to import_tenant2_csv.py) + price = round(random.uniform(50, 500), 2) + compare_at_price = round(price * random.uniform(1.2, 1.5), 2) + cost_price = round(price * 0.6, 2) + + # Generate random stock + inventory_quantity = random.randint(0, 100) + + # Generate random weight + weight = round(random.uniform(0.1, 5.0), 2) + weight_unit = 'kg' + + # Use ruSkuName as SKU title, fallback to name + sku_title = csv_data['ruSkuName'] or csv_data['name'] or 'SKU' + + # Use skuId as SKU code + sku_code = csv_data['skuId'] or '' + + # Generate barcode + try: + sku_id = int(csv_data['skuId']) + barcode = f"BAR{sku_id:08d}" + except: + barcode = '' + + # Build description + description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' + + # Build brief (subtitle) + brief = csv_data['name'] or '' + + # Excel row data (mapping to Excel template columns) + excel_row = { + '商品ID': '', # Empty for new products + '创建时间': create_time_str, + '商品标题*': title, + '商品属性*': 'S', # Single variant product + '商品副标题': brief, + '商品描述': description, + 'SEO标题': seo_title, + 'SEO描述': seo_description, + 'SEO URL Handle': handle, + 'SEO URL 重定向': 'N', # Default to N + 'SEO关键词': seo_keywords, + '商品上架': 'Y', # Published by default + '需要物流': 'Y', # Requires shipping + '商品收税': 'N', # Not taxable by default + '商品spu': '', # Empty + '启用虚拟销量': 'N', # No fake sales + '虚拟销量值': '', # Empty + '跟踪库存': 'Y', # Track inventory + '库存规则*': '1', # Allow purchase when stock is 0 + '专辑名称': csv_data['categoryName'] or '', # Category as album + '标签': tags, + '供应商名称': csv_data['supplierName'] or '', + '供应商URL': '', # Empty + '款式1': '', # Not used for S type + '款式2': '', # Not used for S type + '款式3': '', # Not used for S type + '商品售价*': price, + '商品原价': compare_at_price, + '成本价': cost_price, + '商品SKU': sku_code, + '商品重量': weight, + '重量单位': weight_unit, + '商品条形码': barcode, + '商品库存': inventory_quantity, + '尺寸信息': '', # Empty + '原产地国别': '', # Empty + 'HS(协调制度)代码': '', # Empty + '商品图片*': csv_data['imageUrl'] or '', # Image URL + '商品备注': '', # Empty + '款式备注': '', # Empty + '商品主图': csv_data['imageUrl'] or '', # Main image URL + } + + return excel_row + + +def create_excel_from_template(template_file: str, output_file: str, csv_data_list: list): + """ + Create Excel file from template and fill with CSV data. 
def main():
    """Command-line entry point: convert a CSV export into a Shoplazza import
    Excel file (one S-type row per CSV record)."""
    parser = argparse.ArgumentParser(description='Convert CSV data to Excel import template')
    parser.add_argument('--csv-file',
                        default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w',
                        help='CSV file path (default: data/customer1/goods_with_pic.5years_congku.csv.shuf.1w)')
    parser.add_argument('--template',
                        default='docs/商品导入模板.xlsx',
                        help='Excel template file path (default: docs/商品导入模板.xlsx)')
    parser.add_argument('--output',
                        default='商品导入数据.xlsx',
                        help='Output Excel file path (default: 商品导入数据.xlsx)')
    parser.add_argument('--limit',
                        type=int,
                        default=None,
                        help='Limit number of rows to process (default: all)')
    args = parser.parse_args()

    # Fail fast when either input file is missing.
    for path, label in ((args.csv_file, 'CSV file'), (args.template, 'Template file')):
        if not os.path.exists(path):
            print(f"Error: {label} not found: {path}")
            sys.exit(1)

    print(f"Reading CSV file: {args.csv_file}")
    records = read_csv_file(args.csv_file)
    print(f"Read {len(records)} rows from CSV")

    # Optionally truncate the dataset for quick test runs.
    if args.limit:
        records = records[:args.limit]
        print(f"Limited to {len(records)} rows")

    print(f"Creating Excel file from template: {args.template}")
    print(f"Output file: {args.output}")
    create_excel_from_template(args.template, args.output, records)

    print(f"\nDone! Generated {len(records)} product rows in Excel file.")
def parse_csv_row(row: dict) -> dict:
    """Extract and clean the known CSV columns from one record.

    Args:
        row: One record as produced by ``csv.DictReader``.

    Returns:
        Dict with a fixed key set, every value passed through clean_value
        (missing columns become '').
    """
    fields = (
        'skuId', 'name', 'name_pinyin', 'create_time', 'ruSkuName',
        'enSpuName', 'categoryName', 'supplierName', 'brandName',
        'file_id', 'days_since_last_update', 'id', 'imageUrl',
    )
    return {field: clean_value(row.get(field, '')) for field in fields}


def generate_handle(title: str) -> str:
    """Generate a URL-friendly (ASCII-only) handle from a product title.

    Kept under its historical name for backward compatibility; the actual
    work is done by the shared template helper.
    """
    return _generate_handle_shared(title)


def extract_material_from_title(title: str) -> str:
    """Derive a material option value from a product title.

    按照商品标题空格分割后的最后一个字符串作为material,
    e.g. "消防套 塑料【英文包装】" -> "塑料英文包装" (brackets stripped,
    inner text kept).

    Args:
        title: Product title (may be empty).

    Returns:
        The cleaned last whitespace-separated token, or 'default' when no
        usable token remains.
    """
    tokens = title.strip().split() if title else []
    if not tokens:
        return 'default'
    # Remove CJK and ASCII brackets but keep their content.
    candidate = re.sub(r'[【】\[\]()()]', '', tokens[-1]).strip()
    return candidate if candidate else 'default'
brand + tags_parts = [] + if csv_data['categoryName']: + tags_parts.append(csv_data['categoryName']) + if csv_data['brandName']: + tags_parts.append(csv_data['brandName']) + tags = ','.join(tags_parts) if tags_parts else '' + + # Generate prices + price = round(random.uniform(50, 500), 2) + compare_at_price = round(price * random.uniform(1.2, 1.5), 2) + cost_price = round(price * 0.6, 2) + + # Generate random stock + inventory_quantity = random.randint(0, 100) + + # Generate random weight + weight = round(random.uniform(0.1, 5.0), 2) + weight_unit = 'kg' + + # Use skuId as SKU code + sku_code = csv_data['skuId'] or f'SKU-{base_sku_id}' + + # Generate barcode + try: + sku_id = int(csv_data['skuId']) if csv_data['skuId'] else base_sku_id + barcode = f"BAR{sku_id:08d}" + except: + barcode = f"BAR{base_sku_id:08d}" + + # Build description + description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' + + # Build brief (subtitle) + brief = csv_data['name'] or '' + + # Excel row data + excel_row = { + '商品ID': '', # Empty for new products + '创建时间': create_time_str, + '商品标题*': title, + '商品属性*': 'S', # Single variant product + '商品副标题': brief, + '商品描述': description, + 'SEO标题': seo_title, + 'SEO描述': seo_description, + 'SEO URL Handle': handle, + 'SEO URL 重定向': 'N', + 'SEO关键词': seo_keywords, + '商品上架': 'Y', + '需要物流': 'Y', + '商品收税': 'N', + '商品spu': '', + '启用虚拟销量': 'N', + '虚拟销量值': '', + '跟踪库存': 'Y', + '库存规则*': '1', + '专辑名称': csv_data['categoryName'] or '', + '标签': tags, + '供应商名称': csv_data['supplierName'] or '', + '供应商URL': '', + '款式1': '', # Empty for S type + '款式2': '', # Empty for S type + '款式3': '', # Empty for S type + '商品售价*': price, + '商品原价': compare_at_price, + '成本价': cost_price, + '商品SKU': sku_code, + '商品重量': weight, + '重量单位': weight_unit, + '商品条形码': barcode, + '商品库存': inventory_quantity, + '尺寸信息': '', + '原产地国别': '', + 'HS(协调制度)代码': '', + '商品图片*': csv_data['imageUrl'] or '', + '商品备注': '', + '款式备注': '', + '商品主图': csv_data['imageUrl'] or '', + } + + return excel_row + + +def generate_multi_variant_rows(csv_data: dict, base_sku_id: int = 1) -> list: + """ + Generate Excel rows for Multi variant (M+P type) product. 
+ + Returns a list of rows: + - First row: M (主商品) with option names + - Following rows: P (子款式) with option values + + Args: + csv_data: Parsed CSV row data + base_sku_id: Base SKU ID for generating SKU codes + + Returns: + List of dictionaries mapping Excel column names to values + """ + rows = [] + + # Parse create_time + try: + created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S') + create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') + except: + created_at = datetime.now() - timedelta(days=random.randint(1, 365)) + create_time_str = created_at.strftime('%Y-%m-%d %H:%M:%S') + + # Generate title + title = csv_data['name'] or csv_data['enSpuName'] or 'Product' + + # Generate handle + handle_source = csv_data['enSpuName'] or csv_data['name_pinyin'] or title + handle = generate_handle(handle_source) + if handle and not handle.startswith('products/'): + handle = f'products/{handle}' + + # Generate SEO fields + seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title + seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title + seo_keywords_parts = [title] + if csv_data['categoryName']: + seo_keywords_parts.append(csv_data['categoryName']) + if csv_data['brandName']: + seo_keywords_parts.append(csv_data['brandName']) + seo_keywords = ','.join(seo_keywords_parts) + + # Generate tags + tags_parts = [] + if csv_data['categoryName']: + tags_parts.append(csv_data['categoryName']) + if csv_data['brandName']: + tags_parts.append(csv_data['brandName']) + tags = ','.join(tags_parts) if tags_parts else '' + + # Extract material from title (last word after splitting by space) + material = extract_material_from_title(title) + + # Generate color options: randomly select 2-10 colors from COLORS list + num_colors = random.randint(2, 10) + selected_colors = random.sample(COLORS, min(num_colors, len(COLORS))) + + # Generate size options: 1-30, randomly select 4-8 + num_sizes = 
random.randint(4, 8) + all_sizes = [str(i) for i in range(1, 31)] + selected_sizes = random.sample(all_sizes, num_sizes) + + # Material has only one value + materials = [material] + + # Generate all combinations (Cartesian product) + variants = list(itertools.product(selected_colors, selected_sizes, materials)) + + # Generate M row (主商品) + description = f"

    {csv_data['name']}

    " if csv_data['name'] else '' + brief = csv_data['name'] or '' + + m_row = { + '商品ID': '', + '创建时间': create_time_str, + '商品标题*': title, + '商品属性*': 'M', # Main product + '商品副标题': brief, + '商品描述': description, + 'SEO标题': seo_title, + 'SEO描述': seo_description, + 'SEO URL Handle': handle, + 'SEO URL 重定向': 'N', + 'SEO关键词': seo_keywords, + '商品上架': 'Y', + '需要物流': 'Y', + '商品收税': 'N', + '商品spu': '', + '启用虚拟销量': 'N', + '虚拟销量值': '', + '跟踪库存': 'Y', + '库存规则*': '1', + '专辑名称': csv_data['categoryName'] or '', + '标签': tags, + '供应商名称': csv_data['supplierName'] or '', + '供应商URL': '', + '款式1': 'color', # Option name + '款式2': 'size', # Option name + '款式3': 'material', # Option name + '商品售价*': '', # Empty for M row + '商品原价': '', + '成本价': '', + '商品SKU': '', # Empty for M row + '商品重量': '', + '重量单位': '', + '商品条形码': '', + '商品库存': '', # Empty for M row + '尺寸信息': '', + '原产地国别': '', + 'HS(协调制度)代码': '', + '商品图片*': csv_data['imageUrl'] or '', # Main product image + '商品备注': '', + '款式备注': '', + '商品主图': csv_data['imageUrl'] or '', + } + rows.append(m_row) + + # Generate P rows (子款式) for each variant combination + base_price = round(random.uniform(50, 500), 2) + + for variant_idx, (color, size, mat) in enumerate(variants): + # Generate price variation (within ±20% of base) + price = round(base_price * random.uniform(0.8, 1.2), 2) + compare_at_price = round(price * random.uniform(1.2, 1.5), 2) + cost_price = round(price * 0.6, 2) + + # Generate random stock + inventory_quantity = random.randint(0, 100) + + # Generate random weight + weight = round(random.uniform(0.1, 5.0), 2) + weight_unit = 'kg' + + # Generate SKU code + sku_code = f"{csv_data['skuId']}-{color}-{size}-{mat}" if csv_data['skuId'] else f'SKU-{base_sku_id}-{variant_idx+1}' + + # Generate barcode + barcode = f"BAR{base_sku_id:08d}{variant_idx+1:03d}" + + p_row = { + '商品ID': '', + '创建时间': create_time_str, + '商品标题*': title, # Same as M row + '商品属性*': 'P', # Variant + '商品副标题': '', # Empty for P row + '商品描述': '', # Empty for P row + 
'SEO标题': '', # Empty for P row + 'SEO描述': '', # Empty for P row + 'SEO URL Handle': '', # Empty for P row + 'SEO URL 重定向': '', + 'SEO关键词': '', + '商品上架': 'Y', + '需要物流': 'Y', + '商品收税': 'N', + '商品spu': '', + '启用虚拟销量': 'N', + '虚拟销量值': '', + '跟踪库存': 'Y', + '库存规则*': '1', + '专辑名称': '', # Empty for P row + '标签': '', # Empty for P row + '供应商名称': '', # Empty for P row + '供应商URL': '', + '款式1': color, # Option value + '款式2': size, # Option value + '款式3': mat, # Option value + '商品售价*': price, + '商品原价': compare_at_price, + '成本价': cost_price, + '商品SKU': sku_code, + '商品重量': weight, + '重量单位': weight_unit, + '商品条形码': barcode, + '商品库存': inventory_quantity, + '尺寸信息': '', + '原产地国别': '', + 'HS(协调制度)代码': '', + '商品图片*': '', # Empty for P row (uses main product image) + '商品备注': '', + '款式备注': '', + '商品主图': '', + } + rows.append(p_row) + + return rows + + +def read_csv_file(csv_file: str) -> list: + """ + Read CSV file and return list of parsed rows. + + Args: + csv_file: Path to CSV file + + Returns: + List of parsed CSV data dictionaries + """ + csv_data_list = [] + + with open(csv_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + parsed = parse_csv_row(row) + csv_data_list.append(parsed) + + return csv_data_list + + +def create_excel_from_template(template_file: str, output_file: str, excel_rows: list): + """ + Create Excel file from template and fill with data rows. 
+ + Args: + template_file: Path to Excel template file + output_file: Path to output Excel file + excel_rows: List of dictionaries mapping Excel column names to values + """ + _create_excel_from_template_shared(template_file, output_file, excel_rows) + print(f"Excel file created: {output_file}") + print(f" - Total rows: {len(excel_rows)}") + + +def main(): + parser = argparse.ArgumentParser(description='Convert CSV data to Excel import template with multi-variant support') + parser.add_argument('--csv-file', + default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w', + help='CSV file path') + parser.add_argument('--template', + default='docs/商品导入模板.xlsx', + help='Excel template file path') + parser.add_argument('--output', + default='商品导入数据.xlsx', + help='Output Excel file path') + parser.add_argument('--limit', + type=int, + default=None, + help='Limit number of products to process') + parser.add_argument('--single-ratio', + type=float, + default=0.3, + help='Ratio of single variant products (default: 0.3 = 30%%)') + parser.add_argument('--seed', + type=int, + default=None, + help='Random seed for reproducible results') + + args = parser.parse_args() + + # Set random seed if provided + if args.seed is not None: + random.seed(args.seed) + + # Check if files exist + if not os.path.exists(args.csv_file): + print(f"Error: CSV file not found: {args.csv_file}") + sys.exit(1) + + if not os.path.exists(args.template): + print(f"Error: Template file not found: {args.template}") + sys.exit(1) + + # Read CSV file + print(f"Reading CSV file: {args.csv_file}") + csv_data_list = read_csv_file(args.csv_file) + print(f"Read {len(csv_data_list)} rows from CSV") + + # Limit products if specified + if args.limit: + csv_data_list = csv_data_list[:args.limit] + print(f"Limited to {len(csv_data_list)} products") + + # Generate Excel rows + print(f"\nGenerating Excel rows...") + print(f" - Single variant ratio: {args.single_ratio*100:.0f}%") + print(f" - Multi variant ratio: 
def load_template_column_mapping(ws, header_row_idx=2):
    """
    Read the header row in the template sheet and build a mapping:
    header_name -> column_index (1-based).

    Args:
        ws: Worksheet to inspect.
        header_row_idx: 1-based index of the row holding header names (default 2).

    Returns:
        Dict[str, int] mapping stripped header text to its 1-based column index;
        empty header cells are skipped.
    """
    column_mapping = {}
    for col_idx in range(1, ws.max_column + 1):
        cell_value = ws.cell(row=header_row_idx, column=col_idx).value
        if cell_value:
            column_mapping[str(cell_value).strip()] = col_idx
    return column_mapping


def create_excel_from_template(template_file, output_file, excel_rows, header_row_idx=2, data_start_row=4):
    """
    Create an Excel file from the Shoplazza template and fill with data rows.

    Opens the template in normal (fully loaded) mode, clears any pre-existing
    data rows, then writes each row dict into the columns named by the header.

    Args:
        template_file: Path to Excel template file
        output_file: Path to output Excel file
        excel_rows: List[Dict[str, Any]] mapping template header -> value
        header_row_idx: Header row index in template (default 2)
        data_start_row: Data start row index in template (default 4)
    """
    wb = load_workbook(template_file)
    ws = wb.active

    column_mapping = load_template_column_mapping(ws, header_row_idx=header_row_idx)

    # Clear existing data rows so stale template content never leaks into output.
    last_template_row = ws.max_row
    if last_template_row >= data_start_row:
        for row in range(data_start_row, last_template_row + 1):
            for col in range(1, ws.max_column + 1):
                ws.cell(row=row, column=col).value = None

    # Write data rows (OPT: only write fields that actually exist in excel_row)
    # This avoids looping over all 42 template columns for every output row.
    for row_idx, excel_row in enumerate(excel_rows):
        excel_row_num = data_start_row + row_idx
        for field_name, value in excel_row.items():
            col_idx = column_mapping.get(field_name)
            if not col_idx:
                continue
            cell = ws.cell(row=excel_row_num, column=col_idx)
            cell.value = value
            if isinstance(value, str):
                cell.alignment = Alignment(vertical='top', wrap_text=True)
            else:
                cell.alignment = Alignment(vertical='top')

    wb.save(output_file)
    print("Excel file created: {}".format(output_file))
    print(" - Total rows: {}".format(len(excel_rows)))


def create_excel_from_template_fast(template_file, output_file, excel_rows, header_row_idx=2, data_start_row=4):
    """
    Faster writer for large datasets.

    Instead of opening the template workbook in write mode and assigning cells
    one by one, we:
    - read the template's first (data_start_row-1) rows as values
    - build a header->index mapping from header_row_idx
    - create a new write_only workbook and append rows

    This is much faster for tens/hundreds of thousands of cells.

    Args:
        template_file: Path to Excel template file
        output_file: Path to output Excel file
        excel_rows: List[Dict[str, Any]] mapping template header -> value
        header_row_idx: Header row index in template (default 2)
        data_start_row: Data start row index in template (default 4)
    """
    tpl_wb = load_workbook(template_file, read_only=True, data_only=True)
    tpl_ws = tpl_wb.active

    max_col = tpl_ws.max_column
    tpl_title = tpl_ws.title

    # Copy template "instruction" rows (typically rows 1-3) into output
    prefix_rows = list(tpl_ws.iter_rows(min_row=1, max_row=data_start_row - 1, values_only=True))

    header_values = None
    if 1 <= header_row_idx <= len(prefix_rows):
        header_values = prefix_rows[header_row_idx - 1]
    else:
        # Fallback: read header row directly
        header_values = next(tpl_ws.iter_rows(min_row=header_row_idx, max_row=header_row_idx, values_only=True))

    # BUGFIX: a read_only workbook keeps the source file handle open until
    # close() is called; everything we need has been materialized above.
    tpl_wb.close()

    header_values = list(header_values)[:max_col]
    col_map = {}
    for i, v in enumerate(header_values):
        if v is None:
            continue
        col_map[str(v).strip()] = i  # 0-based

    wb = Workbook(write_only=True)
    ws = wb.create_sheet(title=tpl_title)
    # remove default sheet if present (openpyxl may create one)
    if "Sheet" in wb.sheetnames and wb["Sheet"] is not ws:
        try:
            wb.remove(wb["Sheet"])
        except Exception:
            pass

    # Write prefix rows, normalized to max_col
    for r in prefix_rows:
        r = list(r)[:max_col]
        if len(r) < max_col:
            r = r + [None] * (max_col - len(r))
        ws.append(r)

    # Write data rows: dense row buffers appended in one shot per row.
    for excel_row in excel_rows:
        row_vals = [None] * max_col
        for field_name, value in excel_row.items():
            if field_name not in col_map:
                continue
            row_vals[col_map[field_name]] = value
        ws.append(row_vals)

    wb.save(output_file)
    print("Excel file created (fast): {}".format(output_file))
    print(" - Total rows: {}".format(len(excel_rows)))
new file mode 100644 index 0000000..a05f291 --- /dev/null +++ b/scripts/data_import/shoplazza_import_template.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Shared helpers for generating Shoplazza product import Excel files from the +official template `docs/商品导入模板.xlsx`. + +We keep this module small and dependency-light (openpyxl only) so other scripts +can reuse the same template-writing behavior (header row mapping, data start +row, alignment). +""" + +import re +from datetime import datetime +from typing import Dict, Iterable, List, Optional + +from openpyxl import load_workbook +from openpyxl.styles import Alignment + + +def generate_handle(title: str) -> str: + """ + Generate URL-friendly handle from title (ASCII only), suitable for Shoplazza + `SEO URL Handle` field. Caller may prepend `products/`. + """ + if not title: + return "product" + + handle = str(title).lower() + handle = re.sub(r"[^a-z0-9\s-]", "", handle) + handle = re.sub(r"[-\s]+", "-", handle).strip("-") + + if len(handle) > 255: + handle = handle[:255] + + return handle or "product" + + +def parse_date_to_datetime_str(value) -> str: + """ + Parse common date strings into Shoplazza template datetime string: + `YYYY-MM-DD HH:MM:SS`. If parsing fails, returns empty string. 
+ """ + if value is None: + return "" + + if isinstance(value, datetime): + return value.strftime("%Y-%m-%d %H:%M:%S") + + s = str(value).strip() + if not s: + return "" + + # Most competitor sheets use YYYY-MM-DD + for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y"): + try: + dt = datetime.strptime(s, fmt) + if fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y"): + dt = dt.replace(hour=0, minute=0, second=0) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except Exception: + pass + + return "" + + +def create_excel_from_template( + template_file: str, + output_file: str, + excel_rows: List[Dict[str, object]], + *, + header_row_idx: int = 2, + data_start_row: int = 4, + sheet_name: Optional[str] = None, +) -> None: + """ + Create an Excel file from Shoplazza import template and fill rows. + + - Header row is expected at row 2 (1-based) in the official template. + - Data starts at row 4 (1-based), after the instruction row(s). + """ + wb = load_workbook(template_file) + ws = wb[sheet_name] if sheet_name else wb.active + + column_mapping: Dict[str, int] = {} + for col_idx in range(1, ws.max_column + 1): + cell_value = ws.cell(row=header_row_idx, column=col_idx).value + if cell_value: + column_mapping[str(cell_value).strip()] = col_idx + + # Clear existing data rows + last_template_row = ws.max_row + if last_template_row >= data_start_row: + for row in range(data_start_row, last_template_row + 1): + for col in range(1, ws.max_column + 1): + ws.cell(row=row, column=col).value = None + + # Write data rows + for row_idx, excel_row in enumerate(excel_rows): + excel_row_num = data_start_row + row_idx + for field_name, col_idx in column_mapping.items(): + if field_name not in excel_row: + continue + value = excel_row[field_name] + cell = ws.cell(row=excel_row_num, column=col_idx) + cell.value = value + if isinstance(value, str): + cell.alignment = Alignment(vertical="top", wrap_text=True) + elif isinstance(value, (int, float)): + cell.alignment = 
Alignment(vertical="top") + + wb.save(output_file) + + diff --git a/scripts/data_import/tenant3_csv_to_shoplazza_xlsx.sh b/scripts/data_import/tenant3_csv_to_shoplazza_xlsx.sh new file mode 100755 index 0000000..d051aba --- /dev/null +++ b/scripts/data_import/tenant3_csv_to_shoplazza_xlsx.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +cd "$(dirname "$0")/.." +source ./activate.sh + +# # 基本使用(生成所有数据) +# python scripts/data_import/csv_to_excel.py + +# # 指定输出文件 +# python scripts/data_import/csv_to_excel.py --output tenant3_imports.xlsx + +# # 限制处理行数(用于测试) +# python scripts/data_import/csv_to_excel.py --limit 100 + +# 指定CSV文件和模板文件 +python scripts/data_import/csv_to_excel.py \ + --csv-file data/customer1/goods_with_pic.5years_congku.csv.shuf.1w \ + --template docs/商品导入模板.xlsx \ + --output tenant3_imports.xlsx diff --git a/scripts/download_translation_models.py b/scripts/download_translation_models.py deleted file mode 100755 index a6fcba4..0000000 --- a/scripts/download_translation_models.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 -"""Download local translation models declared in services.translation.capabilities.""" - -from __future__ import annotations - -import argparse -import os -from pathlib import Path -import shutil -import subprocess -import sys -from typing import Iterable - -from huggingface_hub import snapshot_download - -PROJECT_ROOT = Path(__file__).resolve().parent.parent -if str(PROJECT_ROOT) not in sys.path: - sys.path.insert(0, str(PROJECT_ROOT)) -os.environ.setdefault("HF_HUB_DISABLE_XET", "1") - -from config.services_config import get_translation_config - - -LOCAL_BACKENDS = {"local_nllb", "local_marian"} - - -def iter_local_capabilities(selected: set[str] | None = None) -> Iterable[tuple[str, dict]]: - cfg = get_translation_config() - capabilities = cfg.get("capabilities", {}) if isinstance(cfg, dict) else {} - for name, capability in capabilities.items(): - backend = str(capability.get("backend") or "").strip().lower() - if backend not 
in LOCAL_BACKENDS: - continue - if selected and name not in selected: - continue - yield name, capability - - -def _compute_ct2_output_dir(capability: dict) -> Path: - custom = str(capability.get("ct2_model_dir") or "").strip() - if custom: - return Path(custom).expanduser() - model_dir = Path(str(capability.get("model_dir") or "")).expanduser() - compute_type = str(capability.get("ct2_compute_type") or capability.get("torch_dtype") or "default").strip().lower() - normalized = compute_type.replace("_", "-") - return model_dir / f"ctranslate2-{normalized}" - - -def _resolve_converter_binary() -> str: - candidate = shutil.which("ct2-transformers-converter") - if candidate: - return candidate - venv_candidate = Path(sys.executable).absolute().parent / "ct2-transformers-converter" - if venv_candidate.exists(): - return str(venv_candidate) - raise RuntimeError( - "ct2-transformers-converter was not found. " - "Install ctranslate2 in the active Python environment first." - ) - - -def convert_to_ctranslate2(name: str, capability: dict) -> None: - model_id = str(capability.get("model_id") or "").strip() - model_dir = Path(str(capability.get("model_dir") or "")).expanduser() - model_source = str(model_dir if model_dir.exists() else model_id) - output_dir = _compute_ct2_output_dir(capability) - if (output_dir / "model.bin").exists(): - print(f"[skip-convert] {name} -> {output_dir}") - return - quantization = str( - capability.get("ct2_conversion_quantization") - or capability.get("ct2_compute_type") - or capability.get("torch_dtype") - or "default" - ).strip() - output_dir.parent.mkdir(parents=True, exist_ok=True) - print(f"[convert] {name} -> {output_dir} ({quantization})") - subprocess.run( - [ - _resolve_converter_binary(), - "--model", - model_source, - "--output_dir", - str(output_dir), - "--quantization", - quantization, - ], - check=True, - ) - print(f"[converted] {name}") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Download local 
translation models") - parser.add_argument("--all-local", action="store_true", help="Download all configured local translation models") - parser.add_argument("--models", nargs="*", default=[], help="Specific capability names to download") - parser.add_argument( - "--convert-ctranslate2", - action="store_true", - help="Also convert the downloaded Hugging Face models into CTranslate2 format", - ) - args = parser.parse_args() - - selected = {item.strip().lower() for item in args.models if item.strip()} or None - if not args.all_local and not selected: - parser.error("pass --all-local or --models ...") - - for name, capability in iter_local_capabilities(selected): - model_id = str(capability.get("model_id") or "").strip() - model_dir = Path(str(capability.get("model_dir") or "")).expanduser() - if not model_id or not model_dir: - raise ValueError(f"Capability '{name}' must define model_id and model_dir") - model_dir.parent.mkdir(parents=True, exist_ok=True) - print(f"[download] {name} -> {model_dir} ({model_id})") - snapshot_download( - repo_id=model_id, - local_dir=str(model_dir), - ) - print(f"[done] {name}") - if args.convert_ctranslate2: - convert_to_ctranslate2(name, capability) - - -if __name__ == "__main__": - main() diff --git a/scripts/frontend/frontend_server.py b/scripts/frontend/frontend_server.py new file mode 100755 index 0000000..15231ca --- /dev/null +++ b/scripts/frontend/frontend_server.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +Simple HTTP server for saas-search frontend. 
+""" + +import http.server +import socketserver +import os +import sys +import logging +import time +import urllib.request +import urllib.error +from collections import defaultdict, deque +from pathlib import Path +from dotenv import load_dotenv + +# Load .env file +project_root = Path(__file__).resolve().parents[2] +load_dotenv(project_root / '.env') + +# Get API_BASE_URL from environment(默认不注入,避免被旧 .env 覆盖同源策略) +# 仅当显式设置 FRONTEND_INJECT_API_BASE_URL=1 时才注入 window.API_BASE_URL。 +API_BASE_URL = os.getenv('API_BASE_URL') or None +INJECT_API_BASE_URL = os.getenv('FRONTEND_INJECT_API_BASE_URL', '0') == '1' +# Backend proxy target for same-origin API forwarding +BACKEND_PROXY_URL = os.getenv('BACKEND_PROXY_URL', 'http://127.0.0.1:6002').rstrip('/') + +# Change to frontend directory +frontend_dir = os.path.join(project_root, 'frontend') +os.chdir(frontend_dir) + +# FRONTEND_PORT is the canonical config; keep PORT as a secondary fallback. +PORT = int(os.getenv('FRONTEND_PORT', os.getenv('PORT', 6003))) + +# Configure logging to suppress scanner noise +logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') + +class RateLimitingMixin: + """Mixin for rate limiting requests by IP address.""" + request_counts = defaultdict(deque) + rate_limit = 100 # requests per minute + window = 60 # seconds + + @classmethod + def is_rate_limited(cls, ip): + now = time.time() + + # Clean old requests + while cls.request_counts[ip] and cls.request_counts[ip][0] < now - cls.window: + cls.request_counts[ip].popleft() + + # Check rate limit + if len(cls.request_counts[ip]) > cls.rate_limit: + return True + + cls.request_counts[ip].append(now) + return False + +class MyHTTPRequestHandler(http.server.SimpleHTTPRequestHandler, RateLimitingMixin): + """Custom request handler with CORS support and robust error handling.""" + + def _is_proxy_path(self, path: str) -> bool: + """Return True for API paths that should be forwarded to backend service.""" + return 
path.startswith('/search/') or path.startswith('/admin/') or path.startswith('/indexer/') + + def _proxy_to_backend(self): + """Proxy current request to backend service on the GPU server.""" + target_url = f"{BACKEND_PROXY_URL}{self.path}" + method = self.command.upper() + + try: + content_length = int(self.headers.get('Content-Length', '0')) + except ValueError: + content_length = 0 + body = self.rfile.read(content_length) if content_length > 0 else None + + forward_headers = {} + for key, value in self.headers.items(): + lk = key.lower() + if lk in ('host', 'content-length', 'connection'): + continue + forward_headers[key] = value + + req = urllib.request.Request( + target_url, + data=body, + headers=forward_headers, + method=method, + ) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + resp_body = resp.read() + self.send_response(resp.getcode()) + for header, value in resp.getheaders(): + lh = header.lower() + if lh in ('transfer-encoding', 'connection', 'content-length'): + continue + self.send_header(header, value) + self.end_headers() + self.wfile.write(resp_body) + except urllib.error.HTTPError as e: + err_body = e.read() if hasattr(e, 'read') else b'' + self.send_response(e.code) + if e.headers: + for header, value in e.headers.items(): + lh = header.lower() + if lh in ('transfer-encoding', 'connection', 'content-length'): + continue + self.send_header(header, value) + self.end_headers() + if err_body: + self.wfile.write(err_body) + except Exception as e: + logging.error(f"Backend proxy error for {method} {self.path}: {e}") + self.send_response(502) + self.send_header('Content-Type', 'application/json; charset=utf-8') + self.end_headers() + self.wfile.write(b'{"error":"Bad Gateway: backend proxy failed"}') + + def do_GET(self): + """Handle GET requests with API config injection.""" + path = self.path.split('?')[0] + + # Proxy API paths to backend first + if self._is_proxy_path(path): + self._proxy_to_backend() + return + + # Route / to 
index.html + if path == '/' or path == '': + self.path = '/index.html' + (self.path.split('?', 1)[1] if '?' in self.path else '') + + # Inject API config for HTML files + if self.path.endswith('.html'): + self._serve_html_with_config() + else: + super().do_GET() + + def _serve_html_with_config(self): + """Serve HTML with optional API_BASE_URL injected.""" + try: + file_path = self.path.lstrip('/') + if not os.path.exists(file_path): + self.send_error(404) + return + + with open(file_path, 'r', encoding='utf-8') as f: + html = f.read() + + # 默认不注入 API_BASE_URL,避免历史 .env(如 http://xx:6002)覆盖同源调用。 + # 仅当 FRONTEND_INJECT_API_BASE_URL=1 且 API_BASE_URL 有值时才注入。 + if INJECT_API_BASE_URL and API_BASE_URL: + config_script = f'\n ' + html = html.replace('\n ' - html = html.replace('