diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
deleted file mode 100644
index ecf1067..0000000
--- a/scripts/generate_test_data.py
+++ /dev/null
@@ -1,421 +0,0 @@
-#!/usr/bin/env python3
-"""
-Generate test data for Shoplazza SPU and SKU tables.
-
-Generates 100 SPU records with 1-5 SKUs each.
-"""
-
-import sys
-import os
-import random
-import argparse
-from pathlib import Path
-from datetime import datetime, timedelta
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def generate_spu_data(num_spus: int = 100, tenant_id: str = "1", start_id: int = 1):
- """
- Generate SPU test data.
-
- Args:
- num_spus: Number of SPUs to generate
- tenant_id: Tenant ID
- start_id: Starting ID for SPUs
-
- Returns:
- List of SPU data dictionaries
- """
- categories = ["电子产品", "服装", "家居用品", "美妆", "食品", "运动用品", "图书", "玩具"]
- vendors = ["Sony", "Nike", "Apple", "Samsung", "华为", "小米", "美的", "海尔"]
-
- products = [
- ("蓝牙耳机", "Bluetooth Headphone", "高品质无线蓝牙耳机", "High-quality wireless Bluetooth headphone"),
- ("运动鞋", "Running Shoes", "舒适透气的运动鞋", "Comfortable and breathable running shoes"),
- ("智能手机", "Smartphone", "高性能智能手机", "High-performance smartphone"),
- ("笔记本电脑", "Laptop", "轻薄便携笔记本电脑", "Lightweight and portable laptop"),
- ("智能手表", "Smart Watch", "多功能智能手表", "Multi-function smart watch"),
- ("平板电脑", "Tablet", "高清平板电脑", "High-definition tablet"),
- ("无线鼠标", "Wireless Mouse", "人体工学无线鼠标", "Ergonomic wireless mouse"),
- ("机械键盘", "Mechanical Keyboard", "RGB背光机械键盘", "RGB backlit mechanical keyboard"),
- ("显示器", "Monitor", "4K高清显示器", "4K high-definition monitor"),
- ("音响", "Speaker", "蓝牙无线音响", "Bluetooth wireless speaker"),
- ]
-
- spus = []
- for i in range(num_spus):
- spu_id = start_id + i
- product = random.choice(products)
- category = random.choice(categories)
- vendor = random.choice(vendors)
-
- # Generate handle
- handle = f"product-{spu_id}"
-
- # Generate title (Chinese)
- title_zh = f"{product[0]} {vendor}"
-
- # Generate brief
- brief_zh = product[2]
-
- # Generate description
- description_zh = f"
{product[2]},来自{vendor}品牌。{product[3]}
"
-
- # Generate SEO fields
- seo_title = f"{title_zh} - {category}"
- seo_description = f"购买{vendor}{product[0]},{product[2]}"
- seo_keywords = f"{product[0]},{vendor},{category}"
-
- # Generate tags
- tags = f"{category},{vendor},{product[0]}"
-
- # Generate image
- image_src = f"//cdn.example.com/products/{spu_id}.jpg"
-
- # Generate dates
- created_at = datetime.now() - timedelta(days=random.randint(1, 365))
- updated_at = created_at + timedelta(days=random.randint(0, 30))
-
- spu = {
- 'id': spu_id,
- 'shop_id': 1,
- 'shoplazza_id': f"spu-{spu_id}",
- 'handle': handle,
- 'title': title_zh,
- 'brief': brief_zh,
- 'description': description_zh,
- 'spu': '',
- 'vendor': vendor,
- 'vendor_url': f"https://{vendor.lower()}.com",
- 'seo_title': seo_title,
- 'seo_description': seo_description,
- 'seo_keywords': seo_keywords,
- 'image_src': image_src,
- 'image_width': 800,
- 'image_height': 600,
- 'image_path': f"products/{spu_id}.jpg",
- 'image_alt': title_zh,
- 'inventory_policy': '',
- 'inventory_quantity': 0,
- 'inventory_tracking': '0',
- 'published': 1,
- 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'requires_shipping': 1,
- 'taxable': 0,
- 'fake_sales': 0,
- 'display_fake_sales': 0,
- 'mixed_wholesale': 0,
- 'need_variant_image': 0,
- 'has_only_default_variant': 0,
- 'tags': tags,
- 'note': '',
- 'category': category,
- 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'tenant_id': tenant_id,
- 'creator': '1',
- 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'updater': '1',
- 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'deleted': 0
- }
- spus.append(spu)
-
- return spus
-
-
-def generate_sku_data(spus: list, start_sku_id: int = 1):
- """
- Generate SKU test data for SPUs.
-
- Args:
- spus: List of SPU data
- start_sku_id: Starting ID for SKUs
-
- Returns:
- List of SKU data dictionaries
- """
- colors = ["黑色", "白色", "红色", "蓝色", "绿色", "灰色"]
- sizes = ["S", "M", "L", "XL", "XXL"]
-
- skus = []
- sku_id = start_sku_id
-
- for spu in spus:
- spu_id = spu['id']
- num_skus = random.randint(1, 5)
-
- # Base price
- base_price = random.uniform(50, 500)
-
- for i in range(num_skus):
- # Generate variant options
- color = random.choice(colors) if num_skus > 1 else None
- size = random.choice(sizes) if num_skus > 2 else None
-
- # Generate title
- title_parts = []
- if color:
- title_parts.append(color)
- if size:
- title_parts.append(size)
- title = " / ".join(title_parts) if title_parts else ""
-
- # Generate SKU
- sku_code = f"SKU-{spu_id}-{i+1}"
-
- # Generate price (variation from base)
- price = base_price + random.uniform(-20, 50)
- compare_at_price = price * random.uniform(1.2, 1.5)
-
- # Generate stock
- stock = random.randint(0, 100)
-
- # Generate dates
- created_at = datetime.now() - timedelta(days=random.randint(1, 365))
- updated_at = created_at + timedelta(days=random.randint(0, 30))
-
- sku = {
- 'id': sku_id,
- 'spu_id': spu_id,
- 'shop_id': 1,
- 'shoplazza_id': f"sku-{sku_id}",
- 'shoplazza_product_id': spu['shoplazza_id'],
- 'shoplazza_image_id': '',
- 'title': title,
- 'sku': sku_code,
- 'barcode': f"BAR{sku_id:08d}",
- 'position': i + 1,
- 'price': round(price, 2),
- 'compare_at_price': round(compare_at_price, 2),
- 'cost_price': round(price * 0.6, 2),
- 'option1': color if color else '',
- 'option2': size if size else '',
- 'option3': '',
- 'inventory_quantity': stock,
- 'weight': round(random.uniform(0.1, 5.0), 2),
- 'weight_unit': 'kg',
- 'image_src': '',
- 'wholesale_price': '[{"price": ' + str(round(price * 0.8, 2)) + ', "minQuantity": 10}]',
- 'note': '',
- 'extend': None, # JSON field, use NULL instead of empty string
- 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'tenant_id': spu['tenant_id'],
- 'creator': '1',
- 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'updater': '1',
- 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'deleted': 0
- }
- skus.append(sku)
- sku_id += 1
-
- return skus
-
-
-def escape_sql_string(value: str) -> str:
- """
- Escape SQL string value (replace single quotes with doubled quotes).
-
- Args:
- value: String value to escape
-
- Returns:
- Escaped string
- """
- if value is None:
- return ''
- return str(value).replace("'", "''").replace("\\", "\\\\")
-
-
-def generate_sql_inserts(spus: list, skus: list, output_file: str):
- """
- Generate SQL INSERT statements.
-
- Args:
- spus: List of SPU data
- skus: List of SKU data
- output_file: Output file path
- """
- with open(output_file, 'w', encoding='utf-8') as f:
- f.write("-- SPU Test Data\n")
- f.write("INSERT INTO shoplazza_product_spu (\n")
- f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n")
- f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n")
- f.write(" image_src, image_width, image_height, image_path, image_alt,\n")
- f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n")
- f.write(" published, published_at, requires_shipping, taxable,\n")
- f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n")
- f.write(" has_only_default_variant, tags, note, category,\n")
- f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
- f.write(" creator, create_time, updater, update_time, deleted\n")
- f.write(") VALUES\n")
-
- for i, spu in enumerate(spus):
- values = (
- f"({spu['id']}, {spu['shop_id']}, '{escape_sql_string(spu['shoplazza_id'])}', "
- f"'{escape_sql_string(spu['handle'])}', '{escape_sql_string(spu['title'])}', "
- f"'{escape_sql_string(spu['brief'])}', '{escape_sql_string(spu['description'])}', "
- f"'{escape_sql_string(spu['spu'])}', '{escape_sql_string(spu['vendor'])}', "
- f"'{escape_sql_string(spu['vendor_url'])}', '{escape_sql_string(spu['seo_title'])}', "
- f"'{escape_sql_string(spu['seo_description'])}', '{escape_sql_string(spu['seo_keywords'])}', "
- f"'{escape_sql_string(spu['image_src'])}', {spu['image_width']}, "
- f"{spu['image_height']}, '{escape_sql_string(spu['image_path'])}', "
- f"'{escape_sql_string(spu['image_alt'])}', '{escape_sql_string(spu['inventory_policy'])}', "
- f"{spu['inventory_quantity']}, '{escape_sql_string(spu['inventory_tracking'])}', "
- f"{spu['published']}, '{escape_sql_string(spu['published_at'])}', "
- f"{spu['requires_shipping']}, {spu['taxable']}, "
- f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, "
- f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, "
- f"'{escape_sql_string(spu['tags'])}', '{escape_sql_string(spu['note'])}', "
- f"'{escape_sql_string(spu['category'])}', '{escape_sql_string(spu['shoplazza_created_at'])}', "
- f"'{escape_sql_string(spu['shoplazza_updated_at'])}', '{escape_sql_string(spu['tenant_id'])}', "
- f"'{escape_sql_string(spu['creator'])}', '{escape_sql_string(spu['create_time'])}', "
- f"'{escape_sql_string(spu['updater'])}', '{escape_sql_string(spu['update_time'])}', "
- f"{spu['deleted']})"
- )
- f.write(values)
- if i < len(spus) - 1:
- f.write(",\n")
- else:
- f.write(";\n\n")
-
- f.write("-- SKU Test Data\n")
- f.write("INSERT INTO shoplazza_product_sku (\n")
- f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n")
- f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n")
- f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n")
- f.write(" image_src, wholesale_price, note, extend,\n")
- f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
- f.write(" creator, create_time, updater, update_time, deleted\n")
- f.write(") VALUES\n")
-
- for i, sku in enumerate(skus):
- # Handle extend field (JSON, can be NULL)
- extend_value = 'NULL' if sku['extend'] is None else f"'{escape_sql_string(sku['extend'])}'"
-
- values = (
- f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{escape_sql_string(sku['shoplazza_id'])}', "
- f"'{escape_sql_string(sku['shoplazza_product_id'])}', '{escape_sql_string(sku['shoplazza_image_id'])}', "
- f"'{escape_sql_string(sku['title'])}', '{escape_sql_string(sku['sku'])}', "
- f"'{escape_sql_string(sku['barcode'])}', {sku['position']}, "
- f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, "
- f"'{escape_sql_string(sku['option1'])}', '{escape_sql_string(sku['option2'])}', "
- f"'{escape_sql_string(sku['option3'])}', {sku['inventory_quantity']}, {sku['weight']}, "
- f"'{escape_sql_string(sku['weight_unit'])}', '{escape_sql_string(sku['image_src'])}', "
- f"'{escape_sql_string(sku['wholesale_price'])}', '{escape_sql_string(sku['note'])}', "
- f"{extend_value}, '{escape_sql_string(sku['shoplazza_created_at'])}', "
- f"'{escape_sql_string(sku['shoplazza_updated_at'])}', '{escape_sql_string(sku['tenant_id'])}', "
- f"'{escape_sql_string(sku['creator'])}', '{escape_sql_string(sku['create_time'])}', "
- f"'{escape_sql_string(sku['updater'])}', '{escape_sql_string(sku['update_time'])}', "
- f"{sku['deleted']})"
- )
- f.write(values)
- if i < len(skus) - 1:
- f.write(",\n")
- else:
- f.write(";\n")
-
-
-def get_max_ids_from_db(db_config=None):
- """
- Get maximum IDs from database to avoid primary key conflicts.
-
- Args:
- db_config: Optional database config dict with keys: host, port, database, username, password
-
- Returns:
- tuple: (max_spu_id, max_sku_id) or (0, 0) if cannot connect
- """
- if not db_config:
- return 0, 0
-
- try:
- from utils.db_connector import create_db_connection
- from sqlalchemy import text
-
- db_engine = create_db_connection(
- host=db_config['host'],
- port=db_config['port'],
- database=db_config['database'],
- username=db_config['username'],
- password=db_config['password']
- )
-
- with db_engine.connect() as conn:
- result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_spu'))
- max_spu_id = result.scalar() or 0
-
- result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_sku'))
- max_sku_id = result.scalar() or 0
-
- return max_spu_id, max_sku_id
- except Exception as e:
- print(f"Warning: Could not get max IDs from database: {e}")
- return 0, 0
-
-
-def main():
- parser = argparse.ArgumentParser(description='Generate test data for Shoplazza tables')
- parser.add_argument('--num-spus', type=int, default=100, help='Number of SPUs to generate')
- parser.add_argument('--tenant-id', default='1', help='Tenant ID')
- parser.add_argument('--start-spu-id', type=int, default=None, help='Starting SPU ID (default: auto-calculate from DB)')
- parser.add_argument('--start-sku-id', type=int, default=None, help='Starting SKU ID (default: auto-calculate from DB)')
- parser.add_argument('--output', default='test_data.sql', help='Output SQL file')
- parser.add_argument('--db-host', help='Database host (for auto-calculating start IDs)')
- parser.add_argument('--db-port', type=int, default=3306, help='Database port (default: 3306)')
- parser.add_argument('--db-database', help='Database name (for auto-calculating start IDs)')
- parser.add_argument('--db-username', help='Database username (for auto-calculating start IDs)')
- parser.add_argument('--db-password', help='Database password (for auto-calculating start IDs)')
-
- args = parser.parse_args()
-
- # Auto-calculate start IDs if not provided and DB config available
- start_spu_id = args.start_spu_id
- start_sku_id = args.start_sku_id
-
- if (start_spu_id is None or start_sku_id is None) and args.db_host and args.db_database and args.db_username and args.db_password:
- print("Auto-calculating start IDs from database...")
- db_config = {
- 'host': args.db_host,
- 'port': args.db_port,
- 'database': args.db_database,
- 'username': args.db_username,
- 'password': args.db_password
- }
- max_spu_id, max_sku_id = get_max_ids_from_db(db_config)
- if start_spu_id is None:
- start_spu_id = max_spu_id + 1
- if start_sku_id is None:
- start_sku_id = max_sku_id + 1
- print(f" Max SPU ID in DB: {max_spu_id}, using start SPU ID: {start_spu_id}")
- print(f" Max SKU ID in DB: {max_sku_id}, using start SKU ID: {start_sku_id}")
- else:
- if start_spu_id is None:
- start_spu_id = 1
- if start_sku_id is None:
- start_sku_id = 1
- print(f"Using start SPU ID: {start_spu_id}, start SKU ID: {start_sku_id}")
-
- print(f"Generating {args.num_spus} SPUs with skus...")
-
- # Generate SPU data
- spus = generate_spu_data(args.num_spus, args.tenant_id, start_spu_id)
- print(f"Generated {len(spus)} SPUs")
-
- # Generate SKU data
- skus = generate_sku_data(spus, start_sku_id)
- print(f"Generated {len(skus)} SKUs")
-
- # Generate SQL file
- generate_sql_inserts(spus, skus, args.output)
- print(f"SQL file generated: {args.output}")
-
-
-if __name__ == '__main__':
- import json
- main()
-
diff --git a/scripts/generate_test_summary.py b/scripts/generate_test_summary.py
deleted file mode 100644
index f85bf35..0000000
--- a/scripts/generate_test_summary.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-"""
-生成测试摘要脚本
-
-用于CI/CD流水线中汇总所有测试结果
-"""
-
-import json
-import os
-import sys
-import glob
-from pathlib import Path
-from datetime import datetime
-from typing import Dict, Any, List
-
-
-def collect_test_results() -> Dict[str, Any]:
- """收集所有测试结果"""
- results = {
- 'timestamp': datetime.now().isoformat(),
- 'suites': {},
- 'summary': {
- 'total_tests': 0,
- 'passed': 0,
- 'failed': 0,
- 'skipped': 0,
- 'errors': 0,
- 'total_duration': 0.0
- }
- }
-
- # 查找所有测试结果文件
- test_files = glob.glob('*_test_results.json')
-
- for test_file in test_files:
- try:
- with open(test_file, 'r', encoding='utf-8') as f:
- test_data = json.load(f)
-
- suite_name = test_file.replace('_test_results.json', '')
-
- if 'summary' in test_data:
- summary = test_data['summary']
- results['suites'][suite_name] = {
- 'total': summary.get('total', 0),
- 'passed': summary.get('passed', 0),
- 'failed': summary.get('failed', 0),
- 'skipped': summary.get('skipped', 0),
- 'errors': summary.get('error', 0),
- 'duration': summary.get('duration', 0.0)
- }
-
- # 更新总体统计
- results['summary']['total_tests'] += summary.get('total', 0)
- results['summary']['passed'] += summary.get('passed', 0)
- results['summary']['failed'] += summary.get('failed', 0)
- results['summary']['skipped'] += summary.get('skipped', 0)
- results['summary']['errors'] += summary.get('error', 0)
- results['summary']['total_duration'] += summary.get('duration', 0.0)
-
- except Exception as e:
- print(f"Error reading {test_file}: {e}")
- continue
-
- # 计算成功率
- if results['summary']['total_tests'] > 0:
- results['summary']['success_rate'] = (
- results['summary']['passed'] / results['summary']['total_tests'] * 100
- )
- else:
- results['summary']['success_rate'] = 0.0
-
- return results
-
-
-def generate_text_report(results: Dict[str, Any]) -> str:
- """生成文本格式的测试报告"""
- lines = []
-
- # 标题
- lines.append("=" * 60)
- lines.append("搜索引擎自动化测试报告")
- lines.append("=" * 60)
- lines.append(f"时间: {results['timestamp']}")
- lines.append("")
-
- # 摘要
- summary = results['summary']
- lines.append("📊 测试摘要")
- lines.append("-" * 30)
- lines.append(f"总测试数: {summary['total_tests']}")
- lines.append(f"✅ 通过: {summary['passed']}")
- lines.append(f"❌ 失败: {summary['failed']}")
- lines.append(f"⏭️ 跳过: {summary['skipped']}")
- lines.append(f"🚨 错误: {summary['errors']}")
- lines.append(f"📈 成功率: {summary['success_rate']:.1f}%")
- lines.append(f"⏱️ 总耗时: {summary['total_duration']:.2f}秒")
- lines.append("")
-
- # 状态判断
- if summary['failed'] == 0 and summary['errors'] == 0:
- lines.append("🎉 所有测试都通过了!")
- else:
- lines.append("⚠️ 存在失败的测试,请查看详细日志。")
- lines.append("")
-
- # 各测试套件详情
- if results['suites']:
- lines.append("📋 测试套件详情")
- lines.append("-" * 30)
-
- for suite_name, suite_data in results['suites'].items():
- lines.append(f"\n{suite_name.upper()}:")
- lines.append(f" 总数: {suite_data['total']}")
- lines.append(f" ✅ 通过: {suite_data['passed']}")
- lines.append(f" ❌ 失败: {suite_data['failed']}")
- lines.append(f" ⏭️ 跳过: {suite_data['skipped']}")
- lines.append(f" 🚨 错误: {suite_data['errors']}")
- lines.append(f" ⏱️ 耗时: {suite_data['duration']:.2f}秒")
-
- # 添加状态图标
- if suite_data['failed'] == 0 and suite_data['errors'] == 0:
- lines.append(f" 状态: ✅ 全部通过")
- else:
- lines.append(f" 状态: ❌ 存在问题")
-
- lines.append("")
- lines.append("=" * 60)
-
- return "\n".join(lines)
-
-
-def generate_json_report(results: Dict[str, Any]) -> str:
- """生成JSON格式的测试报告"""
- return json.dumps(results, indent=2, ensure_ascii=False)
-
-
-def main():
- """主函数"""
- # 收集测试结果
- print("收集测试结果...")
- results = collect_test_results()
-
- # 生成报告
- print("生成测试报告...")
- text_report = generate_text_report(results)
- json_report = generate_json_report(results)
-
- # 保存报告
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
- # 文本报告
- text_file = f"final_test_report.txt"
- with open(text_file, 'w', encoding='utf-8') as f:
- f.write(text_report)
-
- # JSON报告
- json_file = f"final_test_report.json"
- with open(json_file, 'w', encoding='utf-8') as f:
- f.write(json_report)
-
- print(f"测试报告已生成:")
- print(f" 文本报告: {text_file}")
- print(f" JSON报告: {json_file}")
-
- # 输出摘要到控制台
- print("\n" + "=" * 60)
- print(text_report)
-
- # 返回退出码
- summary = results['summary']
- if summary['failed'] > 0 or summary['errors'] > 0:
- return 1
- else:
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(main())
\ No newline at end of file
diff --git a/scripts/import_tenant2_csv.py b/scripts/import_tenant2_csv.py
deleted file mode 100755
index 063dd77..0000000
--- a/scripts/import_tenant2_csv.py
+++ /dev/null
@@ -1,495 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import tenant2 CSV data into MySQL Shoplazza tables.
-
-Reads CSV file and generates SQL INSERT statements for SPU and SKU tables.
-Each CSV row corresponds to 1 SPU and 1 SKU.
-This script is for generating test data for tenant_id=2 from CSV files.
-"""
-
-import sys
-import os
-import csv
-import random
-import argparse
-import re
-from pathlib import Path
-from datetime import datetime, timedelta
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def escape_sql_string(value: str) -> str:
- """
- Escape SQL string value (replace single quotes with doubled quotes and handle special characters).
-
- Args:
- value: String value to escape
-
- Returns:
- Escaped string
- """
- if value is None:
- return ''
-
- # Convert to string and handle None
- s = str(value)
-
- # Replace single quotes with doubled quotes (SQL standard)
- s = s.replace("'", "''")
-
- # Replace backslashes (MySQL escape)
- s = s.replace("\\", "\\\\")
-
- # Remove or replace control characters that can break SQL
- # Replace newlines and carriage returns with spaces
- s = s.replace("\n", " ").replace("\r", " ")
-
- # Remove other control characters (except tab)
- s = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', s)
-
- # Remove null bytes
- s = s.replace('\x00', '')
-
- return s
-
-
-def generate_handle(title: str) -> str:
- """
- Generate URL-friendly handle from title.
-
- Args:
- title: Product title
-
- Returns:
- URL-friendly handle
- """
- # Remove special characters, convert to lowercase, replace spaces with hyphens
- handle = re.sub(r'[^\w\s-]', '', title.lower())
- handle = re.sub(r'[-\s]+', '-', handle)
- handle = handle.strip('-')
- # Limit length
- if len(handle) > 255:
- handle = handle[:255]
- return handle or 'product'
-
-
-def parse_csv_row(row: dict) -> dict:
- """
- Parse CSV row and extract fields.
-
- Args:
- row: CSV row dictionary
-
- Returns:
- Parsed data dictionary
- """
- # Remove quotes from values if present
- def clean_value(value):
- if value is None:
- return ''
- value = str(value).strip()
- # Remove surrounding quotes
- if value.startswith('"') and value.endswith('"'):
- value = value[1:-1]
- return value
-
- return {
- 'skuId': clean_value(row.get('skuId', '')),
- 'name': clean_value(row.get('name', '')),
- 'name_pinyin': clean_value(row.get('name_pinyin', '')),
- 'create_time': clean_value(row.get('create_time', '')),
- 'ruSkuName': clean_value(row.get('ruSkuName', '')),
- 'enSpuName': clean_value(row.get('enSpuName', '')),
- 'categoryName': clean_value(row.get('categoryName', '')),
- 'supplierName': clean_value(row.get('supplierName', '')),
- 'brandName': clean_value(row.get('brandName', '')),
- 'file_id': clean_value(row.get('file_id', '')),
- 'days_since_last_update': clean_value(row.get('days_since_last_update', '')),
- 'id': clean_value(row.get('id', '')),
- 'imageUrl': clean_value(row.get('imageUrl', ''))
- }
-
-
-def generate_spu_data(csv_data: dict, spu_id: int, tenant_id: str = "2") -> dict:
- """
- Generate SPU data from CSV row.
-
- Args:
- csv_data: Parsed CSV row data
- spu_id: SPU ID
- tenant_id: Tenant ID (default: "2")
-
- Returns:
- SPU data dictionary
- """
- # Parse create_time
- try:
- created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S')
- except:
- created_at = datetime.now() - timedelta(days=random.randint(1, 365))
-
- updated_at = created_at + timedelta(days=random.randint(0, 30))
-
- # Generate handle from title
- title = csv_data['name'] or csv_data['enSpuName'] or 'Product'
- handle = generate_handle(title)
-
- # Generate tags from category and brand
- tags_parts = []
- if csv_data['categoryName']:
- tags_parts.append(csv_data['categoryName'])
- if csv_data['brandName']:
- tags_parts.append(csv_data['brandName'])
- tags = ','.join(tags_parts) if tags_parts else ''
-
- # Generate SEO fields
- seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title
- seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title
- seo_keywords = f"{title},{csv_data['categoryName']},{csv_data['brandName']}" if csv_data['categoryName'] else title
-
- spu = {
- 'id': spu_id,
- 'shop_id': 1,
- 'shoplazza_id': csv_data['id'] or f"spu-{spu_id}",
- 'handle': handle,
- 'title': title,
- 'brief': csv_data['name'] or '',
- 'description': f"{csv_data['name']}
" if csv_data['name'] else '',
- 'spu': '',
- 'vendor': csv_data['supplierName'] or '',
- 'vendor_url': '',
- 'seo_title': seo_title,
- 'seo_description': seo_description,
- 'seo_keywords': seo_keywords,
- 'image_src': csv_data['imageUrl'] or '',
- 'image_width': 800,
- 'image_height': 600,
- 'image_path': f"products/{spu_id}.jpg",
- 'image_alt': title,
- 'inventory_policy': '',
- 'inventory_quantity': 0,
- 'inventory_tracking': '0',
- 'published': 1,
- 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'requires_shipping': 1,
- 'taxable': 0,
- 'fake_sales': 0,
- 'display_fake_sales': 0,
- 'mixed_wholesale': 0,
- 'need_variant_image': 0,
- 'has_only_default_variant': 0,
- 'tags': tags,
- 'note': '',
- 'category': csv_data['categoryName'] or '',
- 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'tenant_id': tenant_id,
- 'creator': '1',
- 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'updater': '1',
- 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'deleted': 0
- }
-
- return spu
-
-
-def generate_sku_data(csv_data: dict, spu_id: int, sku_id: int, tenant_id: str = "2") -> dict:
- """
- Generate SKU data from CSV row.
-
- Args:
- csv_data: Parsed CSV row data
- spu_id: Associated SPU ID
- sku_id: SKU ID (from CSV skuId)
- tenant_id: Tenant ID (default: "2")
-
- Returns:
- SKU data dictionary
- """
- # Parse create_time
- try:
- created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S')
- except:
- created_at = datetime.now() - timedelta(days=random.randint(1, 365))
-
- updated_at = created_at + timedelta(days=random.randint(0, 30))
-
- # Generate random price
- price = round(random.uniform(50, 500), 2)
- compare_at_price = round(price * random.uniform(1.2, 1.5), 2)
- cost_price = round(price * 0.6, 2)
-
- # Generate random stock
- inventory_quantity = random.randint(0, 100)
-
- # Generate random weight
- weight = round(random.uniform(0.1, 5.0), 2)
-
- # Use ruSkuName as title, fallback to name
- title = csv_data['ruSkuName'] or csv_data['name'] or 'SKU'
-
- # Use skuId as SKU code
- sku_code = csv_data['skuId'] or f"SKU-{sku_id}"
-
- sku = {
- 'id': sku_id,
- 'spu_id': spu_id,
- 'shop_id': 1,
- 'shoplazza_id': f"sku-{sku_id}",
- 'shoplazza_product_id': csv_data['id'] or f"spu-{spu_id}",
- 'shoplazza_image_id': '',
- 'title': title,
- 'sku': sku_code,
- 'barcode': f"BAR{sku_id:08d}",
- 'position': 1,
- 'price': price,
- 'compare_at_price': compare_at_price,
- 'cost_price': cost_price,
- 'option1': '',
- 'option2': '',
- 'option3': '',
- 'inventory_quantity': inventory_quantity,
- 'weight': weight,
- 'weight_unit': 'kg',
- 'image_src': csv_data['imageUrl'] or '',
- 'wholesale_price': f'[{{"price": {round(price * 0.8, 2)}, "minQuantity": 10}}]',
- 'note': '',
- 'extend': None, # JSON field, use NULL
- 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'tenant_id': tenant_id,
- 'creator': '1',
- 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'updater': '1',
- 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
- 'deleted': 0
- }
-
- return sku
-
-
-def read_csv_file(csv_file: str) -> list:
- """
- Read CSV file and return list of parsed rows.
-
- Args:
- csv_file: Path to CSV file
-
- Returns:
- List of parsed CSV data dictionaries
- """
- csv_data_list = []
-
- with open(csv_file, 'r', encoding='utf-8') as f:
- # Use csv.DictReader to handle quoted fields properly
- reader = csv.DictReader(f)
- for row in reader:
- parsed = parse_csv_row(row)
- csv_data_list.append(parsed)
-
- return csv_data_list
-
-
-def generate_sql_inserts(spus: list, skus: list, output_file: str):
- """
- Generate SQL INSERT statements.
-
- Args:
- spus: List of SPU data
- skus: List of SKU data
- output_file: Output file path
- """
- with open(output_file, 'w', encoding='utf-8') as f:
- f.write("-- SPU Data from tenant2 CSV\n")
- f.write("INSERT INTO shoplazza_product_spu (\n")
- f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n")
- f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n")
- f.write(" image_src, image_width, image_height, image_path, image_alt,\n")
- f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n")
- f.write(" published, published_at, requires_shipping, taxable,\n")
- f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n")
- f.write(" has_only_default_variant, tags, note, category,\n")
- f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
- f.write(" creator, create_time, updater, update_time, deleted\n")
- f.write(") VALUES\n")
-
- for i, spu in enumerate(spus):
- values = (
- f"({spu['id']}, {spu['shop_id']}, '{escape_sql_string(spu['shoplazza_id'])}', "
- f"'{escape_sql_string(spu['handle'])}', '{escape_sql_string(spu['title'])}', "
- f"'{escape_sql_string(spu['brief'])}', '{escape_sql_string(spu['description'])}', "
- f"'{escape_sql_string(spu['spu'])}', '{escape_sql_string(spu['vendor'])}', "
- f"'{escape_sql_string(spu['vendor_url'])}', '{escape_sql_string(spu['seo_title'])}', "
- f"'{escape_sql_string(spu['seo_description'])}', '{escape_sql_string(spu['seo_keywords'])}', "
- f"'{escape_sql_string(spu['image_src'])}', {spu['image_width']}, "
- f"{spu['image_height']}, '{escape_sql_string(spu['image_path'])}', "
- f"'{escape_sql_string(spu['image_alt'])}', '{escape_sql_string(spu['inventory_policy'])}', "
- f"{spu['inventory_quantity']}, '{escape_sql_string(spu['inventory_tracking'])}', "
- f"{spu['published']}, '{escape_sql_string(spu['published_at'])}', "
- f"{spu['requires_shipping']}, {spu['taxable']}, "
- f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, "
- f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, "
- f"'{escape_sql_string(spu['tags'])}', '{escape_sql_string(spu['note'])}', "
- f"'{escape_sql_string(spu['category'])}', '{escape_sql_string(spu['shoplazza_created_at'])}', "
- f"'{escape_sql_string(spu['shoplazza_updated_at'])}', '{escape_sql_string(spu['tenant_id'])}', "
- f"'{escape_sql_string(spu['creator'])}', '{escape_sql_string(spu['create_time'])}', "
- f"'{escape_sql_string(spu['updater'])}', '{escape_sql_string(spu['update_time'])}', "
- f"{spu['deleted']})"
- )
- f.write(values)
- if i < len(spus) - 1:
- f.write(",\n")
- else:
- f.write(";\n\n")
-
- f.write("-- SKU Data from tenant2 CSV\n")
- f.write("INSERT INTO shoplazza_product_sku (\n")
- f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n")
- f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n")
- f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n")
- f.write(" image_src, wholesale_price, note, extend,\n")
- f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
- f.write(" creator, create_time, updater, update_time, deleted\n")
- f.write(") VALUES\n")
-
- for i, sku in enumerate(skus):
- # Handle extend field (JSON, can be NULL)
- extend_value = 'NULL' if sku['extend'] is None else f"'{escape_sql_string(sku['extend'])}'"
-
- values = (
- f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{escape_sql_string(sku['shoplazza_id'])}', "
- f"'{escape_sql_string(sku['shoplazza_product_id'])}', '{escape_sql_string(sku['shoplazza_image_id'])}', "
- f"'{escape_sql_string(sku['title'])}', '{escape_sql_string(sku['sku'])}', "
- f"'{escape_sql_string(sku['barcode'])}', {sku['position']}, "
- f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, "
- f"'{escape_sql_string(sku['option1'])}', '{escape_sql_string(sku['option2'])}', "
- f"'{escape_sql_string(sku['option3'])}', {sku['inventory_quantity']}, {sku['weight']}, "
- f"'{escape_sql_string(sku['weight_unit'])}', '{escape_sql_string(sku['image_src'])}', "
- f"'{escape_sql_string(sku['wholesale_price'])}', '{escape_sql_string(sku['note'])}', "
- f"{extend_value}, '{escape_sql_string(sku['shoplazza_created_at'])}', "
- f"'{escape_sql_string(sku['shoplazza_updated_at'])}', '{escape_sql_string(sku['tenant_id'])}', "
- f"'{escape_sql_string(sku['creator'])}', '{escape_sql_string(sku['create_time'])}', "
- f"'{escape_sql_string(sku['updater'])}', '{escape_sql_string(sku['update_time'])}', "
- f"{sku['deleted']})"
- )
- f.write(values)
- if i < len(skus) - 1:
- f.write(",\n")
- else:
- f.write(";\n")
-
-
-def get_max_ids_from_db(db_config=None):
- """
- Get maximum IDs from database to avoid primary key conflicts.
-
- Args:
- db_config: Optional database config dict with keys: host, port, database, username, password
-
- Returns:
- tuple: (max_spu_id, max_sku_id) or (0, 0) if cannot connect
- """
- if not db_config:
- return 0, 0
-
- try:
- from utils.db_connector import create_db_connection
- from sqlalchemy import text
-
- db_engine = create_db_connection(
- host=db_config['host'],
- port=db_config['port'],
- database=db_config['database'],
- username=db_config['username'],
- password=db_config['password']
- )
-
- with db_engine.connect() as conn:
- result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_spu'))
- max_spu_id = result.scalar() or 0
-
- result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_sku'))
- max_sku_id = result.scalar() or 0
-
- return max_spu_id, max_sku_id
- except Exception as e:
- print(f"Warning: Could not get max IDs from database: {e}")
- return 0, 0
-
-
-def main():
- parser = argparse.ArgumentParser(description='Import tenant2 CSV data into MySQL Shoplazza tables')
- parser.add_argument('--csv-file', required=True, help='CSV file path')
- parser.add_argument('--tenant-id', default='2', help='Tenant ID (default: 2)')
- parser.add_argument('--start-spu-id', type=int, default=None, help='Starting SPU ID (default: auto-calculate from DB)')
- parser.add_argument('--output', default='tenant2_data.sql', help='Output SQL file (default: tenant2_data.sql)')
- parser.add_argument('--db-host', help='Database host (for auto-calculating start IDs)')
- parser.add_argument('--db-port', type=int, default=3306, help='Database port (default: 3306)')
- parser.add_argument('--db-database', help='Database name (for auto-calculating start IDs)')
- parser.add_argument('--db-username', help='Database username (for auto-calculating start IDs)')
- parser.add_argument('--db-password', help='Database password (for auto-calculating start IDs)')
-
- args = parser.parse_args()
-
- print(f"Reading CSV file: {args.csv_file}")
- csv_data_list = read_csv_file(args.csv_file)
- print(f"Read {len(csv_data_list)} rows from CSV")
-
- # Auto-calculate start IDs if not provided and DB config available
- start_spu_id = args.start_spu_id
- if start_spu_id is None and args.db_host and args.db_database and args.db_username and args.db_password:
- print("Auto-calculating start IDs from database...")
- db_config = {
- 'host': args.db_host,
- 'port': args.db_port,
- 'database': args.db_database,
- 'username': args.db_username,
- 'password': args.db_password
- }
- max_spu_id, max_sku_id = get_max_ids_from_db(db_config)
- start_spu_id = max_spu_id + 1
- print(f" Max SPU ID in DB: {max_spu_id}")
- print(f" Using start SPU ID: {start_spu_id}")
- elif start_spu_id is None:
- start_spu_id = 1
- print(f"Using default start SPU ID: {start_spu_id}")
-
- # Generate SPU and SKU data
- print(f"Generating SPU and SKU data (tenant_id={args.tenant_id})...")
- spus = []
- skus = []
- spu_id = start_spu_id
-
- for csv_data in csv_data_list:
- # Generate SPU
- spu = generate_spu_data(csv_data, spu_id, args.tenant_id)
- spus.append(spu)
-
- # Generate SKU - use skuId from CSV as SKU ID
- try:
- sku_id = int(csv_data['skuId'])
- except:
- # If skuId is not valid, use a generated ID
- sku_id = 1000000 + spu_id
-
- sku = generate_sku_data(csv_data, spu_id, sku_id, args.tenant_id)
- skus.append(sku)
-
- spu_id += 1
-
- print(f"Generated {len(spus)} SPUs and {len(skus)} SKUs")
-
- # Generate SQL file
- print(f"Generating SQL file: {args.output}")
- generate_sql_inserts(spus, skus, args.output)
- print(f"SQL file generated: {args.output}")
- print(f" - SPUs: {len(spus)}")
- print(f" - SKUs: {len(skus)}")
-
-
-if __name__ == '__main__':
- main()
-
diff --git a/scripts/import_test_data.py b/scripts/import_test_data.py
deleted file mode 100644
index 97ea83d..0000000
--- a/scripts/import_test_data.py
+++ /dev/null
@@ -1,277 +0,0 @@
-#!/usr/bin/env python3
-"""
-Import test data into MySQL Shoplazza tables.
-
-Reads SQL file generated by generate_test_data.py and imports into MySQL.
-"""
-
-import sys
-import os
-import argparse
-from pathlib import Path
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from utils.db_connector import create_db_connection, test_connection
-
-
-def import_sql_file(db_engine, sql_file: str):
- """
- Import SQL file into database using MySQL client (more reliable for large files).
-
- Args:
- db_engine: SQLAlchemy database engine (used to get connection info)
- sql_file: Path to SQL file
- """
- import subprocess
- import os
- from pathlib import Path
-
- # Get connection info from engine URL
- engine_url = str(db_engine.url)
- # Parse: mysql+pymysql://user:pass@host:port/database
- import re
- match = re.match(r'mysql\+pymysql://([^:]+):([^@]+)@([^:]+):(\d+)/(.+)', engine_url)
- if not match:
- raise ValueError(f"Cannot parse database URL: {engine_url}")
-
- username, password, host, port, database = match.groups()
-
- # Use MySQL client to execute SQL file (more reliable)
- sql_file_path = Path(sql_file).absolute()
-
- # Build mysql command
- mysql_cmd = [
- 'mysql',
- f'-h{host}',
- f'-P{port}',
- f'-u{username}',
- f'-p{password}',
- database
- ]
-
- print(f"Executing SQL file using MySQL client...")
- print(f" File: {sql_file_path}")
- print(f" Database: {host}:{port}/{database}")
-
- try:
- with open(sql_file_path, 'r', encoding='utf-8') as f:
- result = subprocess.run(
- mysql_cmd,
- stdin=f,
- capture_output=True,
- text=True,
- timeout=300 # 5 minute timeout
- )
-
- if result.returncode != 0:
- error_msg = result.stderr or result.stdout
- print(f"ERROR: MySQL execution failed")
- print(f"Error output: {error_msg[:500]}")
- raise Exception(f"MySQL execution failed: {error_msg[:200]}")
-
- print("SQL file executed successfully")
- return True
-
- except FileNotFoundError:
- # Fallback to SQLAlchemy if mysql client not available
- print("MySQL client not found, falling back to SQLAlchemy...")
- return import_sql_file_sqlalchemy(db_engine, sql_file)
- except subprocess.TimeoutExpired:
- raise Exception("SQL execution timed out after 5 minutes")
- except Exception as e:
- print(f"Error using MySQL client: {e}")
- print("Falling back to SQLAlchemy...")
- return import_sql_file_sqlalchemy(db_engine, sql_file)
-
-
-def import_sql_file_sqlalchemy(db_engine, sql_file: str):
- """
- Fallback method: Import SQL file using SQLAlchemy (for when mysql client unavailable).
- """
- from sqlalchemy import text
-
- with open(sql_file, 'r', encoding='utf-8') as f:
- sql_content = f.read()
-
- # Remove comment lines
- lines = sql_content.split('\n')
- cleaned_lines = []
- for line in lines:
- stripped = line.lstrip()
- if stripped.startswith('--'):
- continue
- cleaned_lines.append(line)
-
- sql_content = '\n'.join(cleaned_lines)
-
- # Split by semicolon - but we need to handle strings properly
- # Use a state machine to track string boundaries
- statements = []
- current = []
- in_string = False
- i = 0
-
- while i < len(sql_content):
- char = sql_content[i]
-
- if char == "'":
- # Check for escaped quote (two single quotes)
- if i + 1 < len(sql_content) and sql_content[i+1] == "'":
- current.append("''")
- i += 1 # Skip next quote
- elif not in_string:
- in_string = True
- current.append(char)
- else:
- in_string = False
- current.append(char)
- else:
- current.append(char)
-
- # Split on semicolon only if not in string
- if char == ';' and not in_string:
- stmt = ''.join(current).strip()
- if stmt and stmt.upper().startswith('INSERT INTO'):
- statements.append(stmt)
- current = []
-
- i += 1
-
- # Handle last statement
- if current:
- stmt = ''.join(current).strip()
- if stmt and stmt.upper().startswith('INSERT INTO'):
- statements.append(stmt)
-
- print(f"Parsed {len(statements)} SQL statements")
- print(f"Executing {len(statements)} SQL statements...")
-
- # Use raw connection to avoid SQLAlchemy parameter parsing
- raw_conn = db_engine.raw_connection()
- try:
- cursor = raw_conn.cursor()
- try:
- for i, statement in enumerate(statements, 1):
- try:
- # Execute raw SQL directly using pymysql cursor
- cursor.execute(statement)
- raw_conn.commit()
- if i % 1000 == 0 or i == len(statements):
- print(f" [{i}/{len(statements)}] Executed successfully")
- except Exception as e:
- print(f" [{i}/{len(statements)}] ERROR: {e}")
- error_start = max(0, statement.find('VALUES') - 100)
- error_end = min(len(statement), error_start + 500)
- print(f" Statement context: ...{statement[error_start:error_end]}...")
- raise
- finally:
- cursor.close()
- finally:
- raw_conn.close()
-
- return True
-
-
-def verify_import(db_engine, tenant_id: str):
- """
- Verify imported data.
-
- Args:
- db_engine: SQLAlchemy database engine
- tenant_id: Tenant ID to verify
- """
- from sqlalchemy import text
-
- with db_engine.connect() as conn:
- # Count SPUs
- result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_spu WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
- spu_count = result.scalar()
-
- # Count SKUs
- result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_sku WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
- sku_count = result.scalar()
-
- print(f"\nVerification:")
- print(f" SPUs: {spu_count}")
- print(f" SKUs: {sku_count}")
-
- return spu_count, sku_count
-
-
-def main():
- parser = argparse.ArgumentParser(description='Import test data into MySQL')
-
- # Database connection
- parser.add_argument('--db-host', required=True, help='MySQL host')
- parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
- parser.add_argument('--db-database', required=True, help='MySQL database name')
- parser.add_argument('--db-username', required=True, help='MySQL username')
- parser.add_argument('--db-password', required=True, help='MySQL password')
-
- # Import options
- parser.add_argument('--sql-file', required=True, help='SQL file to import')
- parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)')
-
- args = parser.parse_args()
-
- print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
-
- # Connect to database
- try:
- db_engine = create_db_connection(
- host=args.db_host,
- port=args.db_port,
- database=args.db_database,
- username=args.db_username,
- password=args.db_password
- )
- except Exception as e:
- print(f"ERROR: Failed to connect to MySQL: {e}")
- return 1
-
- # Test connection
- if not test_connection(db_engine):
- print("ERROR: Database connection test failed")
- return 1
-
- print("Database connection successful")
-
- # Clean existing data if tenant_id provided
- if args.tenant_id:
- print(f"\nCleaning existing data for tenant_id: {args.tenant_id}")
- from sqlalchemy import text
- try:
- with db_engine.connect() as conn:
- # Delete SKUs first (foreign key constraint)
- conn.execute(text(f"DELETE FROM shoplazza_product_sku WHERE tenant_id = '{args.tenant_id}'"))
- # Delete SPUs
- conn.execute(text(f"DELETE FROM shoplazza_product_spu WHERE tenant_id = '{args.tenant_id}'"))
- conn.commit()
- print("✓ Existing data cleaned")
- except Exception as e:
- print(f"⚠ Warning: Failed to clean existing data: {e}")
- # Continue anyway
-
- # Import SQL file
- print(f"\nImporting SQL file: {args.sql_file}")
- try:
- import_sql_file(db_engine, args.sql_file)
- print("Import completed successfully")
- except Exception as e:
- print(f"ERROR: Failed to import SQL file: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
- # Verify import if tenant_id provided
- if args.tenant_id:
- verify_import(db_engine, args.tenant_id)
-
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
-
diff --git a/scripts/indexer__old_2025_11/import_tenant2_csv.py b/scripts/indexer__old_2025_11/import_tenant2_csv.py
new file mode 100755
index 0000000..063dd77
--- /dev/null
+++ b/scripts/indexer__old_2025_11/import_tenant2_csv.py
@@ -0,0 +1,495 @@
+#!/usr/bin/env python3
+"""
+Import tenant2 CSV data into MySQL Shoplazza tables.
+
+Reads CSV file and generates SQL INSERT statements for SPU and SKU tables.
+Each CSV row corresponds to 1 SPU and 1 SKU.
+This script is for generating test data for tenant_id=2 from CSV files.
+"""
+
+import sys
+import os
+import csv
+import random
+import argparse
+import re
+from pathlib import Path
+from datetime import datetime, timedelta
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+
+def escape_sql_string(value: str) -> str:
+ """
+ Escape SQL string value (replace single quotes with doubled quotes and handle special characters).
+
+ Args:
+ value: String value to escape
+
+ Returns:
+ Escaped string
+ """
+ if value is None:
+ return ''
+
+ # Convert to string and handle None
+ s = str(value)
+
+ # Replace single quotes with doubled quotes (SQL standard)
+ s = s.replace("'", "''")
+
+ # Replace backslashes (MySQL escape)
+ s = s.replace("\\", "\\\\")
+
+ # Remove or replace control characters that can break SQL
+ # Replace newlines and carriage returns with spaces
+ s = s.replace("\n", " ").replace("\r", " ")
+
+ # Remove other control characters (except tab)
+ s = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', s)
+
+ # Remove null bytes
+ s = s.replace('\x00', '')
+
+ return s
+
+
+def generate_handle(title: str) -> str:
+ """
+ Generate URL-friendly handle from title.
+
+ Args:
+ title: Product title
+
+ Returns:
+ URL-friendly handle
+ """
+ # Remove special characters, convert to lowercase, replace spaces with hyphens
+ handle = re.sub(r'[^\w\s-]', '', title.lower())
+ handle = re.sub(r'[-\s]+', '-', handle)
+ handle = handle.strip('-')
+ # Limit length
+ if len(handle) > 255:
+ handle = handle[:255]
+ return handle or 'product'
+
+
+def parse_csv_row(row: dict) -> dict:
+ """
+ Parse CSV row and extract fields.
+
+ Args:
+ row: CSV row dictionary
+
+ Returns:
+ Parsed data dictionary
+ """
+ # Remove quotes from values if present
+ def clean_value(value):
+ if value is None:
+ return ''
+ value = str(value).strip()
+ # Remove surrounding quotes
+ if value.startswith('"') and value.endswith('"'):
+ value = value[1:-1]
+ return value
+
+ return {
+ 'skuId': clean_value(row.get('skuId', '')),
+ 'name': clean_value(row.get('name', '')),
+ 'name_pinyin': clean_value(row.get('name_pinyin', '')),
+ 'create_time': clean_value(row.get('create_time', '')),
+ 'ruSkuName': clean_value(row.get('ruSkuName', '')),
+ 'enSpuName': clean_value(row.get('enSpuName', '')),
+ 'categoryName': clean_value(row.get('categoryName', '')),
+ 'supplierName': clean_value(row.get('supplierName', '')),
+ 'brandName': clean_value(row.get('brandName', '')),
+ 'file_id': clean_value(row.get('file_id', '')),
+ 'days_since_last_update': clean_value(row.get('days_since_last_update', '')),
+ 'id': clean_value(row.get('id', '')),
+ 'imageUrl': clean_value(row.get('imageUrl', ''))
+ }
+
+
+def generate_spu_data(csv_data: dict, spu_id: int, tenant_id: str = "2") -> dict:
+ """
+ Generate SPU data from CSV row.
+
+ Args:
+ csv_data: Parsed CSV row data
+ spu_id: SPU ID
+ tenant_id: Tenant ID (default: "2")
+
+ Returns:
+ SPU data dictionary
+ """
+ # Parse create_time
+ try:
+ created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S')
+ except:
+ created_at = datetime.now() - timedelta(days=random.randint(1, 365))
+
+ updated_at = created_at + timedelta(days=random.randint(0, 30))
+
+ # Generate handle from title
+ title = csv_data['name'] or csv_data['enSpuName'] or 'Product'
+ handle = generate_handle(title)
+
+ # Generate tags from category and brand
+ tags_parts = []
+ if csv_data['categoryName']:
+ tags_parts.append(csv_data['categoryName'])
+ if csv_data['brandName']:
+ tags_parts.append(csv_data['brandName'])
+ tags = ','.join(tags_parts) if tags_parts else ''
+
+ # Generate SEO fields
+ seo_title = f"{title} - {csv_data['categoryName']}" if csv_data['categoryName'] else title
+ seo_description = f"购买{csv_data['brandName']}{title}" if csv_data['brandName'] else title
+ seo_keywords = f"{title},{csv_data['categoryName']},{csv_data['brandName']}" if csv_data['categoryName'] else title
+
+ spu = {
+ 'id': spu_id,
+ 'shop_id': 1,
+ 'shoplazza_id': csv_data['id'] or f"spu-{spu_id}",
+ 'handle': handle,
+ 'title': title,
+ 'brief': csv_data['name'] or '',
+        'description': f"{csv_data['name']}"
+                       if csv_data['name'] else '',
+ 'spu': '',
+ 'vendor': csv_data['supplierName'] or '',
+ 'vendor_url': '',
+ 'seo_title': seo_title,
+ 'seo_description': seo_description,
+ 'seo_keywords': seo_keywords,
+ 'image_src': csv_data['imageUrl'] or '',
+ 'image_width': 800,
+ 'image_height': 600,
+ 'image_path': f"products/{spu_id}.jpg",
+ 'image_alt': title,
+ 'inventory_policy': '',
+ 'inventory_quantity': 0,
+ 'inventory_tracking': '0',
+ 'published': 1,
+ 'published_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'requires_shipping': 1,
+ 'taxable': 0,
+ 'fake_sales': 0,
+ 'display_fake_sales': 0,
+ 'mixed_wholesale': 0,
+ 'need_variant_image': 0,
+ 'has_only_default_variant': 0,
+ 'tags': tags,
+ 'note': '',
+ 'category': csv_data['categoryName'] or '',
+ 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'tenant_id': tenant_id,
+ 'creator': '1',
+ 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'updater': '1',
+ 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'deleted': 0
+ }
+
+ return spu
+
+
+def generate_sku_data(csv_data: dict, spu_id: int, sku_id: int, tenant_id: str = "2") -> dict:
+ """
+ Generate SKU data from CSV row.
+
+ Args:
+ csv_data: Parsed CSV row data
+ spu_id: Associated SPU ID
+ sku_id: SKU ID (from CSV skuId)
+ tenant_id: Tenant ID (default: "2")
+
+ Returns:
+ SKU data dictionary
+ """
+ # Parse create_time
+ try:
+ created_at = datetime.strptime(csv_data['create_time'], '%Y-%m-%d %H:%M:%S')
+ except:
+ created_at = datetime.now() - timedelta(days=random.randint(1, 365))
+
+ updated_at = created_at + timedelta(days=random.randint(0, 30))
+
+ # Generate random price
+ price = round(random.uniform(50, 500), 2)
+ compare_at_price = round(price * random.uniform(1.2, 1.5), 2)
+ cost_price = round(price * 0.6, 2)
+
+ # Generate random stock
+ inventory_quantity = random.randint(0, 100)
+
+ # Generate random weight
+ weight = round(random.uniform(0.1, 5.0), 2)
+
+ # Use ruSkuName as title, fallback to name
+ title = csv_data['ruSkuName'] or csv_data['name'] or 'SKU'
+
+ # Use skuId as SKU code
+ sku_code = csv_data['skuId'] or f"SKU-{sku_id}"
+
+ sku = {
+ 'id': sku_id,
+ 'spu_id': spu_id,
+ 'shop_id': 1,
+ 'shoplazza_id': f"sku-{sku_id}",
+ 'shoplazza_product_id': csv_data['id'] or f"spu-{spu_id}",
+ 'shoplazza_image_id': '',
+ 'title': title,
+ 'sku': sku_code,
+ 'barcode': f"BAR{sku_id:08d}",
+ 'position': 1,
+ 'price': price,
+ 'compare_at_price': compare_at_price,
+ 'cost_price': cost_price,
+ 'option1': '',
+ 'option2': '',
+ 'option3': '',
+ 'inventory_quantity': inventory_quantity,
+ 'weight': weight,
+ 'weight_unit': 'kg',
+ 'image_src': csv_data['imageUrl'] or '',
+ 'wholesale_price': f'[{{"price": {round(price * 0.8, 2)}, "minQuantity": 10}}]',
+ 'note': '',
+ 'extend': None, # JSON field, use NULL
+ 'shoplazza_created_at': created_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'shoplazza_updated_at': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'tenant_id': tenant_id,
+ 'creator': '1',
+ 'create_time': created_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'updater': '1',
+ 'update_time': updated_at.strftime('%Y-%m-%d %H:%M:%S'),
+ 'deleted': 0
+ }
+
+ return sku
+
+
+def read_csv_file(csv_file: str) -> list:
+ """
+ Read CSV file and return list of parsed rows.
+
+ Args:
+ csv_file: Path to CSV file
+
+ Returns:
+ List of parsed CSV data dictionaries
+ """
+ csv_data_list = []
+
+ with open(csv_file, 'r', encoding='utf-8') as f:
+ # Use csv.DictReader to handle quoted fields properly
+ reader = csv.DictReader(f)
+ for row in reader:
+ parsed = parse_csv_row(row)
+ csv_data_list.append(parsed)
+
+ return csv_data_list
+
+
+def generate_sql_inserts(spus: list, skus: list, output_file: str):
+ """
+ Generate SQL INSERT statements.
+
+ Args:
+ spus: List of SPU data
+ skus: List of SKU data
+ output_file: Output file path
+ """
+ with open(output_file, 'w', encoding='utf-8') as f:
+ f.write("-- SPU Data from tenant2 CSV\n")
+ f.write("INSERT INTO shoplazza_product_spu (\n")
+ f.write(" id, shop_id, shoplazza_id, handle, title, brief, description, spu,\n")
+ f.write(" vendor, vendor_url, seo_title, seo_description, seo_keywords,\n")
+ f.write(" image_src, image_width, image_height, image_path, image_alt,\n")
+ f.write(" inventory_policy, inventory_quantity, inventory_tracking,\n")
+ f.write(" published, published_at, requires_shipping, taxable,\n")
+ f.write(" fake_sales, display_fake_sales, mixed_wholesale, need_variant_image,\n")
+ f.write(" has_only_default_variant, tags, note, category,\n")
+ f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
+ f.write(" creator, create_time, updater, update_time, deleted\n")
+ f.write(") VALUES\n")
+
+ for i, spu in enumerate(spus):
+ values = (
+ f"({spu['id']}, {spu['shop_id']}, '{escape_sql_string(spu['shoplazza_id'])}', "
+ f"'{escape_sql_string(spu['handle'])}', '{escape_sql_string(spu['title'])}', "
+ f"'{escape_sql_string(spu['brief'])}', '{escape_sql_string(spu['description'])}', "
+ f"'{escape_sql_string(spu['spu'])}', '{escape_sql_string(spu['vendor'])}', "
+ f"'{escape_sql_string(spu['vendor_url'])}', '{escape_sql_string(spu['seo_title'])}', "
+ f"'{escape_sql_string(spu['seo_description'])}', '{escape_sql_string(spu['seo_keywords'])}', "
+ f"'{escape_sql_string(spu['image_src'])}', {spu['image_width']}, "
+ f"{spu['image_height']}, '{escape_sql_string(spu['image_path'])}', "
+ f"'{escape_sql_string(spu['image_alt'])}', '{escape_sql_string(spu['inventory_policy'])}', "
+ f"{spu['inventory_quantity']}, '{escape_sql_string(spu['inventory_tracking'])}', "
+ f"{spu['published']}, '{escape_sql_string(spu['published_at'])}', "
+ f"{spu['requires_shipping']}, {spu['taxable']}, "
+ f"{spu['fake_sales']}, {spu['display_fake_sales']}, {spu['mixed_wholesale']}, "
+ f"{spu['need_variant_image']}, {spu['has_only_default_variant']}, "
+ f"'{escape_sql_string(spu['tags'])}', '{escape_sql_string(spu['note'])}', "
+ f"'{escape_sql_string(spu['category'])}', '{escape_sql_string(spu['shoplazza_created_at'])}', "
+ f"'{escape_sql_string(spu['shoplazza_updated_at'])}', '{escape_sql_string(spu['tenant_id'])}', "
+ f"'{escape_sql_string(spu['creator'])}', '{escape_sql_string(spu['create_time'])}', "
+ f"'{escape_sql_string(spu['updater'])}', '{escape_sql_string(spu['update_time'])}', "
+ f"{spu['deleted']})"
+ )
+ f.write(values)
+ if i < len(spus) - 1:
+ f.write(",\n")
+ else:
+ f.write(";\n\n")
+
+ f.write("-- SKU Data from tenant2 CSV\n")
+ f.write("INSERT INTO shoplazza_product_sku (\n")
+ f.write(" id, spu_id, shop_id, shoplazza_id, shoplazza_product_id, shoplazza_image_id,\n")
+ f.write(" title, sku, barcode, position, price, compare_at_price, cost_price,\n")
+ f.write(" option1, option2, option3, inventory_quantity, weight, weight_unit,\n")
+ f.write(" image_src, wholesale_price, note, extend,\n")
+ f.write(" shoplazza_created_at, shoplazza_updated_at, tenant_id,\n")
+ f.write(" creator, create_time, updater, update_time, deleted\n")
+ f.write(") VALUES\n")
+
+ for i, sku in enumerate(skus):
+ # Handle extend field (JSON, can be NULL)
+ extend_value = 'NULL' if sku['extend'] is None else f"'{escape_sql_string(sku['extend'])}'"
+
+ values = (
+ f"({sku['id']}, {sku['spu_id']}, {sku['shop_id']}, '{escape_sql_string(sku['shoplazza_id'])}', "
+ f"'{escape_sql_string(sku['shoplazza_product_id'])}', '{escape_sql_string(sku['shoplazza_image_id'])}', "
+ f"'{escape_sql_string(sku['title'])}', '{escape_sql_string(sku['sku'])}', "
+ f"'{escape_sql_string(sku['barcode'])}', {sku['position']}, "
+ f"{sku['price']}, {sku['compare_at_price']}, {sku['cost_price']}, "
+ f"'{escape_sql_string(sku['option1'])}', '{escape_sql_string(sku['option2'])}', "
+ f"'{escape_sql_string(sku['option3'])}', {sku['inventory_quantity']}, {sku['weight']}, "
+ f"'{escape_sql_string(sku['weight_unit'])}', '{escape_sql_string(sku['image_src'])}', "
+ f"'{escape_sql_string(sku['wholesale_price'])}', '{escape_sql_string(sku['note'])}', "
+ f"{extend_value}, '{escape_sql_string(sku['shoplazza_created_at'])}', "
+ f"'{escape_sql_string(sku['shoplazza_updated_at'])}', '{escape_sql_string(sku['tenant_id'])}', "
+ f"'{escape_sql_string(sku['creator'])}', '{escape_sql_string(sku['create_time'])}', "
+ f"'{escape_sql_string(sku['updater'])}', '{escape_sql_string(sku['update_time'])}', "
+ f"{sku['deleted']})"
+ )
+ f.write(values)
+ if i < len(skus) - 1:
+ f.write(",\n")
+ else:
+ f.write(";\n")
+
+
+def get_max_ids_from_db(db_config=None):
+ """
+ Get maximum IDs from database to avoid primary key conflicts.
+
+ Args:
+ db_config: Optional database config dict with keys: host, port, database, username, password
+
+ Returns:
+ tuple: (max_spu_id, max_sku_id) or (0, 0) if cannot connect
+ """
+ if not db_config:
+ return 0, 0
+
+ try:
+ from utils.db_connector import create_db_connection
+ from sqlalchemy import text
+
+ db_engine = create_db_connection(
+ host=db_config['host'],
+ port=db_config['port'],
+ database=db_config['database'],
+ username=db_config['username'],
+ password=db_config['password']
+ )
+
+ with db_engine.connect() as conn:
+ result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_spu'))
+ max_spu_id = result.scalar() or 0
+
+ result = conn.execute(text('SELECT MAX(id) FROM shoplazza_product_sku'))
+ max_sku_id = result.scalar() or 0
+
+ return max_spu_id, max_sku_id
+ except Exception as e:
+ print(f"Warning: Could not get max IDs from database: {e}")
+ return 0, 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Import tenant2 CSV data into MySQL Shoplazza tables')
+ parser.add_argument('--csv-file', required=True, help='CSV file path')
+ parser.add_argument('--tenant-id', default='2', help='Tenant ID (default: 2)')
+ parser.add_argument('--start-spu-id', type=int, default=None, help='Starting SPU ID (default: auto-calculate from DB)')
+ parser.add_argument('--output', default='tenant2_data.sql', help='Output SQL file (default: tenant2_data.sql)')
+ parser.add_argument('--db-host', help='Database host (for auto-calculating start IDs)')
+ parser.add_argument('--db-port', type=int, default=3306, help='Database port (default: 3306)')
+ parser.add_argument('--db-database', help='Database name (for auto-calculating start IDs)')
+ parser.add_argument('--db-username', help='Database username (for auto-calculating start IDs)')
+ parser.add_argument('--db-password', help='Database password (for auto-calculating start IDs)')
+
+ args = parser.parse_args()
+
+ print(f"Reading CSV file: {args.csv_file}")
+ csv_data_list = read_csv_file(args.csv_file)
+ print(f"Read {len(csv_data_list)} rows from CSV")
+
+ # Auto-calculate start IDs if not provided and DB config available
+ start_spu_id = args.start_spu_id
+ if start_spu_id is None and args.db_host and args.db_database and args.db_username and args.db_password:
+ print("Auto-calculating start IDs from database...")
+ db_config = {
+ 'host': args.db_host,
+ 'port': args.db_port,
+ 'database': args.db_database,
+ 'username': args.db_username,
+ 'password': args.db_password
+ }
+ max_spu_id, max_sku_id = get_max_ids_from_db(db_config)
+ start_spu_id = max_spu_id + 1
+ print(f" Max SPU ID in DB: {max_spu_id}")
+ print(f" Using start SPU ID: {start_spu_id}")
+ elif start_spu_id is None:
+ start_spu_id = 1
+ print(f"Using default start SPU ID: {start_spu_id}")
+
+ # Generate SPU and SKU data
+ print(f"Generating SPU and SKU data (tenant_id={args.tenant_id})...")
+ spus = []
+ skus = []
+ spu_id = start_spu_id
+
+ for csv_data in csv_data_list:
+ # Generate SPU
+ spu = generate_spu_data(csv_data, spu_id, args.tenant_id)
+ spus.append(spu)
+
+ # Generate SKU - use skuId from CSV as SKU ID
+ try:
+ sku_id = int(csv_data['skuId'])
+ except:
+ # If skuId is not valid, use a generated ID
+ sku_id = 1000000 + spu_id
+
+ sku = generate_sku_data(csv_data, spu_id, sku_id, args.tenant_id)
+ skus.append(sku)
+
+ spu_id += 1
+
+ print(f"Generated {len(spus)} SPUs and {len(skus)} SKUs")
+
+ # Generate SQL file
+ print(f"Generating SQL file: {args.output}")
+ generate_sql_inserts(spus, skus, args.output)
+ print(f"SQL file generated: {args.output}")
+ print(f" - SPUs: {len(spus)}")
+ print(f" - SKUs: {len(skus)}")
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/scripts/indexer__old_2025_11/import_test_data.py b/scripts/indexer__old_2025_11/import_test_data.py
new file mode 100644
index 0000000..97ea83d
--- /dev/null
+++ b/scripts/indexer__old_2025_11/import_test_data.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Import test data into MySQL Shoplazza tables.
+
+Reads SQL file generated by generate_test_data.py and imports into MySQL.
+"""
+
+import sys
+import os
+import argparse
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from utils.db_connector import create_db_connection, test_connection
+
+
+def import_sql_file(db_engine, sql_file: str):
+ """
+ Import SQL file into database using MySQL client (more reliable for large files).
+
+ Args:
+ db_engine: SQLAlchemy database engine (used to get connection info)
+ sql_file: Path to SQL file
+ """
+ import subprocess
+ import os
+ from pathlib import Path
+
+ # Get connection info from engine URL
+ engine_url = str(db_engine.url)
+ # Parse: mysql+pymysql://user:pass@host:port/database
+ import re
+ match = re.match(r'mysql\+pymysql://([^:]+):([^@]+)@([^:]+):(\d+)/(.+)', engine_url)
+ if not match:
+ raise ValueError(f"Cannot parse database URL: {engine_url}")
+
+ username, password, host, port, database = match.groups()
+
+ # Use MySQL client to execute SQL file (more reliable)
+ sql_file_path = Path(sql_file).absolute()
+
+ # Build mysql command
+ mysql_cmd = [
+ 'mysql',
+ f'-h{host}',
+ f'-P{port}',
+ f'-u{username}',
+ f'-p{password}',
+ database
+ ]
+
+ print(f"Executing SQL file using MySQL client...")
+ print(f" File: {sql_file_path}")
+ print(f" Database: {host}:{port}/{database}")
+
+ try:
+ with open(sql_file_path, 'r', encoding='utf-8') as f:
+ result = subprocess.run(
+ mysql_cmd,
+ stdin=f,
+ capture_output=True,
+ text=True,
+ timeout=300 # 5 minute timeout
+ )
+
+ if result.returncode != 0:
+ error_msg = result.stderr or result.stdout
+ print(f"ERROR: MySQL execution failed")
+ print(f"Error output: {error_msg[:500]}")
+ raise Exception(f"MySQL execution failed: {error_msg[:200]}")
+
+ print("SQL file executed successfully")
+ return True
+
+ except FileNotFoundError:
+ # Fallback to SQLAlchemy if mysql client not available
+ print("MySQL client not found, falling back to SQLAlchemy...")
+ return import_sql_file_sqlalchemy(db_engine, sql_file)
+ except subprocess.TimeoutExpired:
+ raise Exception("SQL execution timed out after 5 minutes")
+ except Exception as e:
+ print(f"Error using MySQL client: {e}")
+ print("Falling back to SQLAlchemy...")
+ return import_sql_file_sqlalchemy(db_engine, sql_file)
+
+
+def import_sql_file_sqlalchemy(db_engine, sql_file: str):
+ """
+ Fallback method: Import SQL file using SQLAlchemy (for when mysql client unavailable).
+ """
+ from sqlalchemy import text
+
+ with open(sql_file, 'r', encoding='utf-8') as f:
+ sql_content = f.read()
+
+ # Remove comment lines
+ lines = sql_content.split('\n')
+ cleaned_lines = []
+ for line in lines:
+ stripped = line.lstrip()
+ if stripped.startswith('--'):
+ continue
+ cleaned_lines.append(line)
+
+ sql_content = '\n'.join(cleaned_lines)
+
+ # Split by semicolon - but we need to handle strings properly
+ # Use a state machine to track string boundaries
+ statements = []
+ current = []
+ in_string = False
+ i = 0
+
+ while i < len(sql_content):
+ char = sql_content[i]
+
+ if char == "'":
+ # Check for escaped quote (two single quotes)
+ if i + 1 < len(sql_content) and sql_content[i+1] == "'":
+ current.append("''")
+ i += 1 # Skip next quote
+ elif not in_string:
+ in_string = True
+ current.append(char)
+ else:
+ in_string = False
+ current.append(char)
+ else:
+ current.append(char)
+
+ # Split on semicolon only if not in string
+ if char == ';' and not in_string:
+ stmt = ''.join(current).strip()
+ if stmt and stmt.upper().startswith('INSERT INTO'):
+ statements.append(stmt)
+ current = []
+
+ i += 1
+
+ # Handle last statement
+ if current:
+ stmt = ''.join(current).strip()
+ if stmt and stmt.upper().startswith('INSERT INTO'):
+ statements.append(stmt)
+
+ print(f"Parsed {len(statements)} SQL statements")
+ print(f"Executing {len(statements)} SQL statements...")
+
+ # Use raw connection to avoid SQLAlchemy parameter parsing
+ raw_conn = db_engine.raw_connection()
+ try:
+ cursor = raw_conn.cursor()
+ try:
+ for i, statement in enumerate(statements, 1):
+ try:
+ # Execute raw SQL directly using pymysql cursor
+ cursor.execute(statement)
+ raw_conn.commit()
+ if i % 1000 == 0 or i == len(statements):
+ print(f" [{i}/{len(statements)}] Executed successfully")
+ except Exception as e:
+ print(f" [{i}/{len(statements)}] ERROR: {e}")
+ error_start = max(0, statement.find('VALUES') - 100)
+ error_end = min(len(statement), error_start + 500)
+ print(f" Statement context: ...{statement[error_start:error_end]}...")
+ raise
+ finally:
+ cursor.close()
+ finally:
+ raw_conn.close()
+
+ return True
+
+
+def verify_import(db_engine, tenant_id: str):
+ """
+ Verify imported data.
+
+ Args:
+ db_engine: SQLAlchemy database engine
+ tenant_id: Tenant ID to verify
+ """
+ from sqlalchemy import text
+
+ with db_engine.connect() as conn:
+ # Count SPUs
+ result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_spu WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
+ spu_count = result.scalar()
+
+ # Count SKUs
+ result = conn.execute(text("SELECT COUNT(*) FROM shoplazza_product_sku WHERE tenant_id = :tenant_id"), {"tenant_id": tenant_id})
+ sku_count = result.scalar()
+
+ print(f"\nVerification:")
+ print(f" SPUs: {spu_count}")
+ print(f" SKUs: {sku_count}")
+
+ return spu_count, sku_count
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Import test data into MySQL')
+
+ # Database connection
+ parser.add_argument('--db-host', required=True, help='MySQL host')
+ parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
+ parser.add_argument('--db-database', required=True, help='MySQL database name')
+ parser.add_argument('--db-username', required=True, help='MySQL username')
+ parser.add_argument('--db-password', required=True, help='MySQL password')
+
+ # Import options
+ parser.add_argument('--sql-file', required=True, help='SQL file to import')
+ parser.add_argument('--tenant-id', help='Tenant ID to verify (optional)')
+
+ args = parser.parse_args()
+
+ print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
+
+ # Connect to database
+ try:
+ db_engine = create_db_connection(
+ host=args.db_host,
+ port=args.db_port,
+ database=args.db_database,
+ username=args.db_username,
+ password=args.db_password
+ )
+ except Exception as e:
+ print(f"ERROR: Failed to connect to MySQL: {e}")
+ return 1
+
+ # Test connection
+ if not test_connection(db_engine):
+ print("ERROR: Database connection test failed")
+ return 1
+
+ print("Database connection successful")
+
+ # Clean existing data if tenant_id provided
+ if args.tenant_id:
+ print(f"\nCleaning existing data for tenant_id: {args.tenant_id}")
+ from sqlalchemy import text
+ try:
+ with db_engine.connect() as conn:
+ # Delete SKUs first (foreign key constraint)
+ conn.execute(text(f"DELETE FROM shoplazza_product_sku WHERE tenant_id = '{args.tenant_id}'"))
+ # Delete SPUs
+ conn.execute(text(f"DELETE FROM shoplazza_product_spu WHERE tenant_id = '{args.tenant_id}'"))
+ conn.commit()
+ print("✓ Existing data cleaned")
+ except Exception as e:
+ print(f"⚠ Warning: Failed to clean existing data: {e}")
+ # Continue anyway
+
+ # Import SQL file
+ print(f"\nImporting SQL file: {args.sql_file}")
+ try:
+ import_sql_file(db_engine, args.sql_file)
+ print("Import completed successfully")
+ except Exception as e:
+ print(f"ERROR: Failed to import SQL file: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+ # Verify import if tenant_id provided
+ if args.tenant_id:
+ verify_import(db_engine, args.tenant_id)
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
+
diff --git a/scripts/indexer__old_2025_11/ingest.sh b/scripts/indexer__old_2025_11/ingest.sh
new file mode 100755
index 0000000..a420fe3
--- /dev/null
+++ b/scripts/indexer__old_2025_11/ingest.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Unified data ingestion script for SearchEngine
+# Ingests data from MySQL to Elasticsearch
+
+cd "$(dirname "$0")/.."
+source /home/tw/miniconda3/etc/profile.d/conda.sh
+conda activate searchengine
+
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+echo -e "${GREEN}========================================${NC}"
+echo -e "${GREEN}数据灌入脚本${NC}"
+echo -e "${GREEN}========================================${NC}"
+
+# Load config from .env file if it exists
+if [ -f .env ]; then
+ set -a
+ source .env
+ set +a
+fi
+
+# Parameters
+TENANT_ID=${1:-""}
+RECREATE_INDEX=${2:-"false"}
+
+DB_HOST=${DB_HOST:-"120.79.247.228"}
+DB_PORT=${DB_PORT:-"3316"}
+DB_DATABASE=${DB_DATABASE:-"saas"}
+DB_USERNAME=${DB_USERNAME:-"saas"}
+DB_PASSWORD=${DB_PASSWORD:-"P89cZHS5d7dFyc9R"}
+ES_HOST=${ES_HOST:-"http://localhost:9200"}
+BATCH_SIZE=${BATCH_SIZE:-500}
+
+echo -e "\n${YELLOW}Configuration:${NC}"
+echo " Tenant ID: $TENANT_ID"
+echo " Recreate Index: $RECREATE_INDEX"
+echo " MySQL: $DB_HOST:$DB_PORT/$DB_DATABASE"
+echo " Elasticsearch: $ES_HOST"
+echo " Batch Size: $BATCH_SIZE"
+
+# Validate parameters
+if [ -z "$TENANT_ID" ]; then
+ echo -e "${RED}ERROR: Tenant ID is required${NC}"
+    echo "Usage: $0 <tenant_id> [recreate_index]"
+ echo " tenant_id: Required, tenant ID"
+ echo " recreate_index: Optional, recreate index if exists (true/false, default: false)"
+ exit 1
+fi
+
+if [ -z "$DB_PASSWORD" ]; then
+ echo -e "${RED}ERROR: DB_PASSWORD未设置,请检查.env文件或环境变量${NC}"
+ exit 1
+fi
+
+# Build command
+CMD="python scripts/ingest_shoplazza.py \
+ --db-host $DB_HOST \
+ --db-port $DB_PORT \
+ --db-database $DB_DATABASE \
+ --db-username $DB_USERNAME \
+ --db-password $DB_PASSWORD \
+ --tenant-id $TENANT_ID \
+ --es-host $ES_HOST \
+ --batch-size $BATCH_SIZE"
+
+if [ "$RECREATE_INDEX" = "true" ] || [ "$RECREATE_INDEX" = "1" ]; then
+ CMD="$CMD --recreate"
+ echo -e "\n${YELLOW}Warning: Index will be deleted and recreated!${NC}"
+fi
+
+echo -e "\n${YELLOW}Starting data ingestion...${NC}"
+eval $CMD
+
+if [ $? -eq 0 ]; then
+ echo -e "\n${GREEN}========================================${NC}"
+ echo -e "${GREEN}数据灌入完成!${NC}"
+ echo -e "${GREEN}========================================${NC}"
+else
+ echo -e "\n${RED}========================================${NC}"
+ echo -e "${RED}数据灌入失败!${NC}"
+ echo -e "${RED}========================================${NC}"
+ exit 1
+fi
diff --git a/scripts/indexer__old_2025_11/ingest_shoplazza.py b/scripts/indexer__old_2025_11/ingest_shoplazza.py
new file mode 100644
index 0000000..60699c0
--- /dev/null
+++ b/scripts/indexer__old_2025_11/ingest_shoplazza.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""
+Shoplazza data ingestion script.
+
+Loads SPU and SKU data from MySQL and indexes into Elasticsearch using SPU transformer.
+"""
+
+import sys
+import os
+import argparse
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from utils.db_connector import create_db_connection
+from utils.es_client import ESClient
+from indexer.spu_transformer import SPUTransformer
+from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME
+from indexer.bulk_indexer import BulkIndexer
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch')
+
+ # Database connection
+ parser.add_argument('--db-host', required=True, help='MySQL host')
+ parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
+ parser.add_argument('--db-database', required=True, help='MySQL database name')
+ parser.add_argument('--db-username', required=True, help='MySQL username')
+ parser.add_argument('--db-password', required=True, help='MySQL password')
+
+ # Tenant and index
+ parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)')
+ parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
+
+ # Options
+ parser.add_argument('--recreate', action='store_true', help='Recreate index if exists')
+ parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)')
+
+ args = parser.parse_args()
+
+ print(f"Starting Shoplazza data ingestion for tenant: {args.tenant_id}")
+
+ # Load mapping from JSON file
+ try:
+ mapping = load_mapping()
+ print(f"Loaded mapping configuration")
+ except Exception as e:
+ print(f"ERROR: Failed to load mapping: {e}")
+ return 1
+
+ index_name = DEFAULT_INDEX_NAME
+
+ # Connect to MySQL
+ print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
+ try:
+ db_engine = create_db_connection(
+ host=args.db_host,
+ port=args.db_port,
+ database=args.db_database,
+ username=args.db_username,
+ password=args.db_password
+ )
+ except Exception as e:
+ print(f"ERROR: Failed to connect to MySQL: {e}")
+ return 1
+
+ # Connect to Elasticsearch
+ es_host = args.es_host
+ es_username = os.environ.get('ES_USERNAME')
+ es_password = os.environ.get('ES_PASSWORD')
+
+ print(f"Connecting to Elasticsearch: {es_host}")
+ if es_username and es_password:
+ print(f"Using authentication: {es_username}")
+ es_client = ESClient(hosts=[es_host], username=es_username, password=es_password)
+ else:
+ es_client = ESClient(hosts=[es_host])
+
+ if not es_client.ping():
+ print(f"ERROR: Cannot connect to Elasticsearch at {es_host}")
+ return 1
+
+ # Create index if needed
+ if args.recreate:
+ if es_client.index_exists(index_name):
+ print(f"Deleting existing index: {index_name}")
+ if not es_client.delete_index(index_name):
+ print(f"ERROR: Failed to delete index '{index_name}'")
+ return 1
+
+ if not es_client.index_exists(index_name):
+ print(f"Creating index: {index_name}")
+ if not es_client.create_index(index_name, mapping):
+ print(f"ERROR: Failed to create index '{index_name}'")
+ print("Please check the mapping configuration and try again.")
+ return 1
+ else:
+ print(f"Using existing index: {index_name}")
+
+ # Initialize SPU transformer
+ print(f"Initializing SPU transformer for tenant: {args.tenant_id}")
+ transformer = SPUTransformer(db_engine, args.tenant_id)
+
+ # Transform data
+ print("Transforming SPU and SKU data...")
+ try:
+ documents = transformer.transform_batch()
+ print(f"Transformed {len(documents)} SPU documents")
+ except Exception as e:
+ print(f"ERROR: Failed to transform data: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+ if not documents:
+ print("WARNING: No documents to index")
+ return 0
+
+ # Bulk index
+ print(f"Indexing {len(documents)} documents (batch size: {args.batch_size})...")
+ indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size)
+
+ try:
+ results = indexer.index_documents(documents, id_field="spu_id", show_progress=True)
+ print(f"\nIngestion complete:")
+ print(f" Success: {results['success']}")
+ print(f" Failed: {results['failed']}")
+ print(f" Time: {results.get('elapsed_time', 0):.2f}s")
+
+ if results['failed'] > 0:
+ print(f"\nWARNING: {results['failed']} documents failed to index")
+ return 1
+
+ return 0
+ except Exception as e:
+ print(f"ERROR: Failed to index documents: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+
+if __name__ == '__main__':
+ sys.exit(main())
+
diff --git a/scripts/indexer__old_2025_11/recreate_and_import.py b/scripts/indexer__old_2025_11/recreate_and_import.py
new file mode 100755
index 0000000..af0a448
--- /dev/null
+++ b/scripts/indexer__old_2025_11/recreate_and_import.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+重建索引并导入数据的脚本。
+
+清除旧索引,使用新的mapping重建索引,然后导入数据。
+"""
+
+import sys
+import os
+import argparse
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from utils.db_connector import create_db_connection
+from utils.es_client import ESClient
+from indexer.mapping_generator import load_mapping, delete_index_if_exists, DEFAULT_INDEX_NAME
+from indexer.spu_transformer import SPUTransformer
+from indexer.bulk_indexer import BulkIndexer
+
+
+def main():
+ parser = argparse.ArgumentParser(description='重建ES索引并导入数据')
+
+ # Database connection
+ parser.add_argument('--db-host', help='MySQL host (或使用环境变量 DB_HOST)')
+ parser.add_argument('--db-port', type=int, help='MySQL port (或使用环境变量 DB_PORT, 默认: 3306)')
+ parser.add_argument('--db-database', help='MySQL database (或使用环境变量 DB_DATABASE)')
+ parser.add_argument('--db-username', help='MySQL username (或使用环境变量 DB_USERNAME)')
+ parser.add_argument('--db-password', help='MySQL password (或使用环境变量 DB_PASSWORD)')
+
+ # Tenant and ES
+ parser.add_argument('--tenant-id', required=True, help='Tenant ID (必需)')
+ parser.add_argument('--es-host', help='Elasticsearch host (或使用环境变量 ES_HOST, 默认: http://localhost:9200)')
+
+ # Options
+ parser.add_argument('--batch-size', type=int, default=500, help='批量导入大小 (默认: 500)')
+ parser.add_argument('--skip-delete', action='store_true', help='跳过删除旧索引步骤')
+
+ args = parser.parse_args()
+
+ print("=" * 60)
+ print("重建ES索引并导入数据")
+ print("=" * 60)
+
+ # 加载mapping
+    print("\n[1/6] 加载mapping配置...")
+ try:
+ mapping = load_mapping()
+ print(f"✓ 成功加载mapping配置")
+ except Exception as e:
+ print(f"✗ 加载mapping失败: {e}")
+ return 1
+
+ index_name = DEFAULT_INDEX_NAME
+ print(f"索引名称: {index_name}")
+
+ # 连接Elasticsearch
+    print("\n[2/6] 连接Elasticsearch...")
+ es_host = args.es_host or os.environ.get('ES_HOST', 'http://localhost:9200')
+ es_username = os.environ.get('ES_USERNAME')
+ es_password = os.environ.get('ES_PASSWORD')
+
+ print(f"ES地址: {es_host}")
+ if es_username:
+ print(f"ES用户名: {es_username}")
+
+ try:
+ if es_username and es_password:
+ es_client = ESClient(hosts=[es_host], username=es_username, password=es_password)
+ else:
+ es_client = ESClient(hosts=[es_host])
+
+ if not es_client.ping():
+ print(f"✗ 无法连接到Elasticsearch: {es_host}")
+ return 1
+ print("✓ Elasticsearch连接成功")
+ except Exception as e:
+ print(f"✗ 连接Elasticsearch失败: {e}")
+ return 1
+
+ # 删除旧索引
+ if not args.skip_delete:
+        print("\n[3/6] 删除旧索引...")
+ if es_client.index_exists(index_name):
+ print(f"发现已存在的索引: {index_name}")
+ if delete_index_if_exists(es_client, index_name):
+ print(f"✓ 成功删除索引: {index_name}")
+ else:
+ print(f"✗ 删除索引失败: {index_name}")
+ return 1
+ else:
+ print(f"索引不存在,跳过删除: {index_name}")
+ else:
+        print("\n[3/6] 跳过删除旧索引步骤")
+
+ # 创建新索引
+    print("\n[4/6] 创建新索引...")
+ try:
+ if es_client.index_exists(index_name):
+ print(f"✓ 索引已存在: {index_name},跳过创建")
+ else:
+ print(f"创建索引: {index_name}")
+ if es_client.create_index(index_name, mapping):
+ print(f"✓ 成功创建索引: {index_name}")
+ else:
+ print(f"✗ 创建索引失败: {index_name}")
+ return 1
+ except Exception as e:
+ print(f"✗ 创建索引失败: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+ # 连接MySQL
+    print("\n[5/6] 连接MySQL...")
+ db_host = args.db_host or os.environ.get('DB_HOST')
+ db_port = args.db_port or int(os.environ.get('DB_PORT', 3306))
+ db_database = args.db_database or os.environ.get('DB_DATABASE')
+ db_username = args.db_username or os.environ.get('DB_USERNAME')
+ db_password = args.db_password or os.environ.get('DB_PASSWORD')
+
+ if not all([db_host, db_database, db_username, db_password]):
+ print("✗ MySQL连接参数不完整")
+ print("请提供 --db-host, --db-database, --db-username, --db-password")
+ print("或设置环境变量: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD")
+ return 1
+
+ print(f"MySQL: {db_host}:{db_port}/{db_database}")
+ try:
+ db_engine = create_db_connection(
+ host=db_host,
+ port=db_port,
+ database=db_database,
+ username=db_username,
+ password=db_password
+ )
+ print("✓ MySQL连接成功")
+ except Exception as e:
+ print(f"✗ 连接MySQL失败: {e}")
+ return 1
+
+ # 导入数据
+ print("\n[6/6] 导入数据...")
+ print(f"Tenant ID: {args.tenant_id}")
+ print(f"批量大小: {args.batch_size}")
+
+ try:
+ transformer = SPUTransformer(db_engine, args.tenant_id)
+ print("正在转换数据...")
+ documents = transformer.transform_batch()
+ print(f"✓ 转换完成: {len(documents)} 个文档")
+
+ if not documents:
+ print("⚠ 没有数据需要导入")
+ return 0
+
+ print(f"正在导入数据到ES (批量大小: {args.batch_size})...")
+ indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size)
+ results = indexer.index_documents(documents, id_field="spu_id", show_progress=True)
+
+ print(f"\n{'='*60}")
+ print("导入完成!")
+ print(f"{'='*60}")
+ print(f"成功: {results['success']}")
+ print(f"失败: {results['failed']}")
+ print(f"耗时: {results.get('elapsed_time', 0):.2f}秒")
+
+ if results['failed'] > 0:
+ print(f"\n⚠ 警告: {results['failed']} 个文档导入失败")
+ return 1
+
+ return 0
+ except Exception as e:
+ print(f"✗ 导入数据失败: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+
+if __name__ == '__main__':
+ sys.exit(main())
+
diff --git a/scripts/ingest.sh b/scripts/ingest.sh
deleted file mode 100755
index a420fe3..0000000
--- a/scripts/ingest.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-
-# Unified data ingestion script for SearchEngine
-# Ingests data from MySQL to Elasticsearch
-
-cd "$(dirname "$0")/.."
-source /home/tw/miniconda3/etc/profile.d/conda.sh
-conda activate searchengine
-
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-RED='\033[0;31m'
-NC='\033[0m'
-
-echo -e "${GREEN}========================================${NC}"
-echo -e "${GREEN}数据灌入脚本${NC}"
-echo -e "${GREEN}========================================${NC}"
-
-# Load config from .env file if it exists
-if [ -f .env ]; then
- set -a
- source .env
- set +a
-fi
-
-# Parameters
-TENANT_ID=${1:-""}
-RECREATE_INDEX=${2:-"false"}
-
-DB_HOST=${DB_HOST:-"120.79.247.228"}
-DB_PORT=${DB_PORT:-"3316"}
-DB_DATABASE=${DB_DATABASE:-"saas"}
-DB_USERNAME=${DB_USERNAME:-"saas"}
-DB_PASSWORD=${DB_PASSWORD:-"P89cZHS5d7dFyc9R"}
-ES_HOST=${ES_HOST:-"http://localhost:9200"}
-BATCH_SIZE=${BATCH_SIZE:-500}
-
-echo -e "\n${YELLOW}Configuration:${NC}"
-echo " Tenant ID: $TENANT_ID"
-echo " Recreate Index: $RECREATE_INDEX"
-echo " MySQL: $DB_HOST:$DB_PORT/$DB_DATABASE"
-echo " Elasticsearch: $ES_HOST"
-echo " Batch Size: $BATCH_SIZE"
-
-# Validate parameters
-if [ -z "$TENANT_ID" ]; then
- echo -e "${RED}ERROR: Tenant ID is required${NC}"
- echo "Usage: $0 [recreate_index]"
- echo " tenant_id: Required, tenant ID"
- echo " recreate_index: Optional, recreate index if exists (true/false, default: false)"
- exit 1
-fi
-
-if [ -z "$DB_PASSWORD" ]; then
- echo -e "${RED}ERROR: DB_PASSWORD未设置,请检查.env文件或环境变量${NC}"
- exit 1
-fi
-
-# Build command
-CMD="python scripts/ingest_shoplazza.py \
- --db-host $DB_HOST \
- --db-port $DB_PORT \
- --db-database $DB_DATABASE \
- --db-username $DB_USERNAME \
- --db-password $DB_PASSWORD \
- --tenant-id $TENANT_ID \
- --es-host $ES_HOST \
- --batch-size $BATCH_SIZE"
-
-if [ "$RECREATE_INDEX" = "true" ] || [ "$RECREATE_INDEX" = "1" ]; then
- CMD="$CMD --recreate"
- echo -e "\n${YELLOW}Warning: Index will be deleted and recreated!${NC}"
-fi
-
-echo -e "\n${YELLOW}Starting data ingestion...${NC}"
-eval $CMD
-
-if [ $? -eq 0 ]; then
- echo -e "\n${GREEN}========================================${NC}"
- echo -e "${GREEN}数据灌入完成!${NC}"
- echo -e "${GREEN}========================================${NC}"
-else
- echo -e "\n${RED}========================================${NC}"
- echo -e "${RED}数据灌入失败!${NC}"
- echo -e "${RED}========================================${NC}"
- exit 1
-fi
diff --git a/scripts/ingest_shoplazza.py b/scripts/ingest_shoplazza.py
deleted file mode 100644
index 60699c0..0000000
--- a/scripts/ingest_shoplazza.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/usr/bin/env python3
-"""
-Shoplazza data ingestion script.
-
-Loads SPU and SKU data from MySQL and indexes into Elasticsearch using SPU transformer.
-"""
-
-import sys
-import os
-import argparse
-from pathlib import Path
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from utils.db_connector import create_db_connection
-from utils.es_client import ESClient
-from indexer.spu_transformer import SPUTransformer
-from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME
-from indexer.bulk_indexer import BulkIndexer
-
-
-def main():
- parser = argparse.ArgumentParser(description='Ingest Shoplazza SPU/SKU data into Elasticsearch')
-
- # Database connection
- parser.add_argument('--db-host', required=True, help='MySQL host')
- parser.add_argument('--db-port', type=int, default=3306, help='MySQL port (default: 3306)')
- parser.add_argument('--db-database', required=True, help='MySQL database name')
- parser.add_argument('--db-username', required=True, help='MySQL username')
- parser.add_argument('--db-password', required=True, help='MySQL password')
-
- # Tenant and index
- parser.add_argument('--tenant-id', required=True, help='Tenant ID (required)')
- parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
-
- # Options
- parser.add_argument('--recreate', action='store_true', help='Recreate index if exists')
- parser.add_argument('--batch-size', type=int, default=500, help='Batch size for indexing (default: 500)')
-
- args = parser.parse_args()
-
- print(f"Starting Shoplazza data ingestion for tenant: {args.tenant_id}")
-
- # Load mapping from JSON file
- try:
- mapping = load_mapping()
- print(f"Loaded mapping configuration")
- except Exception as e:
- print(f"ERROR: Failed to load mapping: {e}")
- return 1
-
- index_name = DEFAULT_INDEX_NAME
-
- # Connect to MySQL
- print(f"Connecting to MySQL: {args.db_host}:{args.db_port}/{args.db_database}")
- try:
- db_engine = create_db_connection(
- host=args.db_host,
- port=args.db_port,
- database=args.db_database,
- username=args.db_username,
- password=args.db_password
- )
- except Exception as e:
- print(f"ERROR: Failed to connect to MySQL: {e}")
- return 1
-
- # Connect to Elasticsearch
- es_host = args.es_host
- es_username = os.environ.get('ES_USERNAME')
- es_password = os.environ.get('ES_PASSWORD')
-
- print(f"Connecting to Elasticsearch: {es_host}")
- if es_username and es_password:
- print(f"Using authentication: {es_username}")
- es_client = ESClient(hosts=[es_host], username=es_username, password=es_password)
- else:
- es_client = ESClient(hosts=[es_host])
-
- if not es_client.ping():
- print(f"ERROR: Cannot connect to Elasticsearch at {es_host}")
- return 1
-
- # Create index if needed
- if args.recreate:
- if es_client.index_exists(index_name):
- print(f"Deleting existing index: {index_name}")
- if not es_client.delete_index(index_name):
- print(f"ERROR: Failed to delete index '{index_name}'")
- return 1
-
- if not es_client.index_exists(index_name):
- print(f"Creating index: {index_name}")
- if not es_client.create_index(index_name, mapping):
- print(f"ERROR: Failed to create index '{index_name}'")
- print("Please check the mapping configuration and try again.")
- return 1
- else:
- print(f"Using existing index: {index_name}")
-
- # Initialize SPU transformer
- print(f"Initializing SPU transformer for tenant: {args.tenant_id}")
- transformer = SPUTransformer(db_engine, args.tenant_id)
-
- # Transform data
- print("Transforming SPU and SKU data...")
- try:
- documents = transformer.transform_batch()
- print(f"Transformed {len(documents)} SPU documents")
- except Exception as e:
- print(f"ERROR: Failed to transform data: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
- if not documents:
- print("WARNING: No documents to index")
- return 0
-
- # Bulk index
- print(f"Indexing {len(documents)} documents (batch size: {args.batch_size})...")
- indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size)
-
- try:
- results = indexer.index_documents(documents, id_field="spu_id", show_progress=True)
- print(f"\nIngestion complete:")
- print(f" Success: {results['success']}")
- print(f" Failed: {results['failed']}")
- print(f" Time: {results.get('elapsed_time', 0):.2f}s")
-
- if results['failed'] > 0:
- print(f"\nWARNING: {results['failed']} documents failed to index")
- return 1
-
- return 0
- except Exception as e:
- print(f"ERROR: Failed to index documents: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
-
-if __name__ == '__main__':
- sys.exit(main())
-
diff --git a/scripts/recreate_and_import.py b/scripts/recreate_and_import.py
deleted file mode 100755
index af0a448..0000000
--- a/scripts/recreate_and_import.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-"""
-重建索引并导入数据的脚本。
-
-清除旧索引,使用新的mapping重建索引,然后导入数据。
-"""
-
-import sys
-import os
-import argparse
-from pathlib import Path
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from utils.db_connector import create_db_connection
-from utils.es_client import ESClient
-from indexer.mapping_generator import load_mapping, delete_index_if_exists, DEFAULT_INDEX_NAME
-from indexer.spu_transformer import SPUTransformer
-from indexer.bulk_indexer import BulkIndexer
-
-
-def main():
- parser = argparse.ArgumentParser(description='重建ES索引并导入数据')
-
- # Database connection
- parser.add_argument('--db-host', help='MySQL host (或使用环境变量 DB_HOST)')
- parser.add_argument('--db-port', type=int, help='MySQL port (或使用环境变量 DB_PORT, 默认: 3306)')
- parser.add_argument('--db-database', help='MySQL database (或使用环境变量 DB_DATABASE)')
- parser.add_argument('--db-username', help='MySQL username (或使用环境变量 DB_USERNAME)')
- parser.add_argument('--db-password', help='MySQL password (或使用环境变量 DB_PASSWORD)')
-
- # Tenant and ES
- parser.add_argument('--tenant-id', required=True, help='Tenant ID (必需)')
- parser.add_argument('--es-host', help='Elasticsearch host (或使用环境变量 ES_HOST, 默认: http://localhost:9200)')
-
- # Options
- parser.add_argument('--batch-size', type=int, default=500, help='批量导入大小 (默认: 500)')
- parser.add_argument('--skip-delete', action='store_true', help='跳过删除旧索引步骤')
-
- args = parser.parse_args()
-
- print("=" * 60)
- print("重建ES索引并导入数据")
- print("=" * 60)
-
- # 加载mapping
- print("\n[1/4] 加载mapping配置...")
- try:
- mapping = load_mapping()
- print(f"✓ 成功加载mapping配置")
- except Exception as e:
- print(f"✗ 加载mapping失败: {e}")
- return 1
-
- index_name = DEFAULT_INDEX_NAME
- print(f"索引名称: {index_name}")
-
- # 连接Elasticsearch
- print("\n[2/4] 连接Elasticsearch...")
- es_host = args.es_host or os.environ.get('ES_HOST', 'http://localhost:9200')
- es_username = os.environ.get('ES_USERNAME')
- es_password = os.environ.get('ES_PASSWORD')
-
- print(f"ES地址: {es_host}")
- if es_username:
- print(f"ES用户名: {es_username}")
-
- try:
- if es_username and es_password:
- es_client = ESClient(hosts=[es_host], username=es_username, password=es_password)
- else:
- es_client = ESClient(hosts=[es_host])
-
- if not es_client.ping():
- print(f"✗ 无法连接到Elasticsearch: {es_host}")
- return 1
- print("✓ Elasticsearch连接成功")
- except Exception as e:
- print(f"✗ 连接Elasticsearch失败: {e}")
- return 1
-
- # 删除旧索引
- if not args.skip_delete:
- print("\n[3/4] 删除旧索引...")
- if es_client.index_exists(index_name):
- print(f"发现已存在的索引: {index_name}")
- if delete_index_if_exists(es_client, index_name):
- print(f"✓ 成功删除索引: {index_name}")
- else:
- print(f"✗ 删除索引失败: {index_name}")
- return 1
- else:
- print(f"索引不存在,跳过删除: {index_name}")
- else:
- print("\n[3/4] 跳过删除旧索引步骤")
-
- # 创建新索引
- print("\n[4/4] 创建新索引...")
- try:
- if es_client.index_exists(index_name):
- print(f"✓ 索引已存在: {index_name},跳过创建")
- else:
- print(f"创建索引: {index_name}")
- if es_client.create_index(index_name, mapping):
- print(f"✓ 成功创建索引: {index_name}")
- else:
- print(f"✗ 创建索引失败: {index_name}")
- return 1
- except Exception as e:
- print(f"✗ 创建索引失败: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
- # 连接MySQL
- print("\n[5/5] 连接MySQL...")
- db_host = args.db_host or os.environ.get('DB_HOST')
- db_port = args.db_port or int(os.environ.get('DB_PORT', 3306))
- db_database = args.db_database or os.environ.get('DB_DATABASE')
- db_username = args.db_username or os.environ.get('DB_USERNAME')
- db_password = args.db_password or os.environ.get('DB_PASSWORD')
-
- if not all([db_host, db_database, db_username, db_password]):
- print("✗ MySQL连接参数不完整")
- print("请提供 --db-host, --db-database, --db-username, --db-password")
- print("或设置环境变量: DB_HOST, DB_DATABASE, DB_USERNAME, DB_PASSWORD")
- return 1
-
- print(f"MySQL: {db_host}:{db_port}/{db_database}")
- try:
- db_engine = create_db_connection(
- host=db_host,
- port=db_port,
- database=db_database,
- username=db_username,
- password=db_password
- )
- print("✓ MySQL连接成功")
- except Exception as e:
- print(f"✗ 连接MySQL失败: {e}")
- return 1
-
- # 导入数据
- print("\n[6/6] 导入数据...")
- print(f"Tenant ID: {args.tenant_id}")
- print(f"批量大小: {args.batch_size}")
-
- try:
- transformer = SPUTransformer(db_engine, args.tenant_id)
- print("正在转换数据...")
- documents = transformer.transform_batch()
- print(f"✓ 转换完成: {len(documents)} 个文档")
-
- if not documents:
- print("⚠ 没有数据需要导入")
- return 0
-
- print(f"正在导入数据到ES (批量大小: {args.batch_size})...")
- indexer = BulkIndexer(es_client, index_name, batch_size=args.batch_size)
- results = indexer.index_documents(documents, id_field="spu_id", show_progress=True)
-
- print(f"\n{'='*60}")
- print("导入完成!")
- print(f"{'='*60}")
- print(f"成功: {results['success']}")
- print(f"失败: {results['failed']}")
- print(f"耗时: {results.get('elapsed_time', 0):.2f}秒")
-
- if results['failed'] > 0:
- print(f"\n⚠ 警告: {results['failed']} 个文档导入失败")
- return 1
-
- return 0
- except Exception as e:
- print(f"✗ 导入数据失败: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
-
-if __name__ == '__main__':
- sys.exit(main())
-
diff --git a/scripts/run_tests.py b/scripts/run_tests.py
deleted file mode 100755
index 67d900f..0000000
--- a/scripts/run_tests.py
+++ /dev/null
@@ -1,705 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试执行脚本
-
-运行完整的测试流水线,包括:
-- 环境检查
-- 单元测试
-- 集成测试
-- 性能测试
-- 测试报告生成
-"""
-
-import os
-import sys
-import subprocess
-import time
-import json
-import argparse
-import logging
-from pathlib import Path
-from typing import Dict, List, Optional, Any
-from dataclasses import dataclass, asdict
-from datetime import datetime
-
-
-# 添加项目根目录到Python路径
-project_root = Path(__file__).parent.parent
-sys.path.insert(0, str(project_root))
-
-
-@dataclass
-class TestResult:
- """测试结果数据结构"""
- name: str
- status: str # "passed", "failed", "skipped", "error"
- duration: float
- details: Optional[Dict[str, Any]] = None
- output: Optional[str] = None
- error: Optional[str] = None
-
-
-@dataclass
-class TestSuiteResult:
- """测试套件结果"""
- name: str
- total_tests: int
- passed: int
- failed: int
- skipped: int
- errors: int
- duration: float
- results: List[TestResult]
-
-
-class TestRunner:
- """测试运行器"""
-
- def __init__(self, config: Dict[str, Any]):
- self.config = config
- self.logger = self._setup_logger()
- self.results: List[TestSuiteResult] = []
- self.start_time = time.time()
-
- def _setup_logger(self) -> logging.Logger:
- """设置日志记录器"""
- log_level = getattr(logging, self.config.get('log_level', 'INFO').upper())
- logging.basicConfig(
- level=log_level,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- handlers=[
- logging.StreamHandler(),
- logging.FileHandler(
- project_root / 'test_logs' / f'test_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
- )
- ]
- )
- return logging.getLogger(__name__)
-
- def _run_command(self, cmd: List[str], cwd: Optional[Path] = None, env: Optional[Dict[str, str]] = None) -> subprocess.CompletedProcess:
- """运行命令"""
- try:
- self.logger.info(f"执行命令: {' '.join(cmd)}")
-
- # 设置环境变量
- process_env = os.environ.copy()
- if env:
- process_env.update(env)
-
- result = subprocess.run(
- cmd,
- cwd=cwd or project_root,
- env=process_env,
- capture_output=True,
- text=True,
- timeout=self.config.get('test_timeout', 300)
- )
-
- self.logger.debug(f"命令返回码: {result.returncode}")
- if result.stdout:
- self.logger.debug(f"标准输出: {result.stdout[:500]}...")
- if result.stderr:
- self.logger.debug(f"标准错误: {result.stderr[:500]}...")
-
- return result
-
- except subprocess.TimeoutExpired:
- self.logger.error(f"命令执行超时: {' '.join(cmd)}")
- raise
- except Exception as e:
- self.logger.error(f"命令执行失败: {e}")
- raise
-
- def check_environment(self) -> bool:
- """检查测试环境"""
- self.logger.info("检查测试环境...")
-
- checks = []
-
- # 检查Python环境
- try:
- python_version = sys.version
- self.logger.info(f"Python版本: {python_version}")
- checks.append(("Python", True, f"版本 {python_version}"))
- except Exception as e:
- checks.append(("Python", False, str(e)))
-
- # 检查conda环境
- try:
- result = self._run_command(['conda', '--version'])
- if result.returncode == 0:
- conda_version = result.stdout.strip()
- self.logger.info(f"Conda版本: {conda_version}")
- checks.append(("Conda", True, conda_version))
- else:
- checks.append(("Conda", False, "未找到conda"))
- except Exception as e:
- checks.append(("Conda", False, str(e)))
-
- # 检查依赖包
- required_packages = [
- 'pytest', 'fastapi', 'elasticsearch', 'numpy',
- 'torch', 'transformers', 'pyyaml'
- ]
-
- for package in required_packages:
- try:
- result = self._run_command(['python', '-c', f'import {package}'])
- if result.returncode == 0:
- checks.append((package, True, "已安装"))
- else:
- checks.append((package, False, "导入失败"))
- except Exception as e:
- checks.append((package, False, str(e)))
-
- # 检查Elasticsearch
- try:
- es_host = os.getenv('ES_HOST', 'http://localhost:9200')
- result = self._run_command(['curl', '-s', f'{es_host}/_cluster/health'])
- if result.returncode == 0:
- health_data = json.loads(result.stdout)
- status = health_data.get('status', 'unknown')
- self.logger.info(f"Elasticsearch状态: {status}")
- checks.append(("Elasticsearch", True, f"状态: {status}"))
- else:
- checks.append(("Elasticsearch", False, "连接失败"))
- except Exception as e:
- checks.append(("Elasticsearch", False, str(e)))
-
- # 检查API服务
- try:
- api_host = os.getenv('API_HOST', '127.0.0.1')
- api_port = os.getenv('API_PORT', '6003')
- result = self._run_command(['curl', '-s', f'http://{api_host}:{api_port}/health'])
- if result.returncode == 0:
- health_data = json.loads(result.stdout)
- status = health_data.get('status', 'unknown')
- self.logger.info(f"API服务状态: {status}")
- checks.append(("API服务", True, f"状态: {status}"))
- else:
- checks.append(("API服务", False, "连接失败"))
- except Exception as e:
- checks.append(("API服务", False, str(e)))
-
- # 输出检查结果
- self.logger.info("环境检查结果:")
- all_passed = True
- for name, passed, details in checks:
- status = "✓" if passed else "✗"
- self.logger.info(f" {status} {name}: {details}")
- if not passed:
- all_passed = False
-
- return all_passed
-
- def run_unit_tests(self) -> TestSuiteResult:
- """运行单元测试"""
- self.logger.info("运行单元测试...")
-
- start_time = time.time()
- cmd = [
- 'python', '-m', 'pytest',
- 'tests/unit/',
- '-v',
- '--tb=short',
- '--json-report',
- '--json-report-file=test_logs/unit_test_results.json'
- ]
-
- try:
- result = self._run_command(cmd)
- duration = time.time() - start_time
-
- # 解析测试结果
- if result.returncode == 0:
- status = "passed"
- else:
- status = "failed"
-
- # 尝试解析JSON报告
- test_results = []
- passed = failed = skipped = errors = 0
-
- try:
- with open(project_root / 'test_logs' / 'unit_test_results.json', 'r') as f:
- report_data = json.load(f)
-
- summary = report_data.get('summary', {})
- total = summary.get('total', 0)
- passed = summary.get('passed', 0)
- failed = summary.get('failed', 0)
- skipped = summary.get('skipped', 0)
- errors = summary.get('error', 0)
-
- # 获取详细结果
- for test in report_data.get('tests', []):
- test_results.append(TestResult(
- name=test.get('nodeid', ''),
- status=test.get('outcome', 'unknown'),
- duration=test.get('duration', 0.0),
- details=test
- ))
-
- except Exception as e:
- self.logger.warning(f"无法解析单元测试JSON报告: {e}")
-
- suite_result = TestSuiteResult(
- name="单元测试",
- total_tests=passed + failed + skipped + errors,
- passed=passed,
- failed=failed,
- skipped=skipped,
- errors=errors,
- duration=duration,
- results=test_results
- )
-
- self.results.append(suite_result)
- self.logger.info(f"单元测试完成: {suite_result.total_tests}个测试, "
- f"{suite_result.passed}通过, {suite_result.failed}失败, "
- f"{suite_result.skipped}跳过, {suite_result.errors}错误")
-
- return suite_result
-
- except Exception as e:
- self.logger.error(f"单元测试执行失败: {e}")
- raise
-
- def run_integration_tests(self) -> TestSuiteResult:
- """运行集成测试"""
- self.logger.info("运行集成测试...")
-
- start_time = time.time()
- cmd = [
- 'python', '-m', 'pytest',
- 'tests/integration/',
- '-v',
- '--tb=short',
- '-m', 'not slow', # 排除慢速测试
- '--json-report',
- '--json-report-file=test_logs/integration_test_results.json'
- ]
-
- try:
- result = self._run_command(cmd)
- duration = time.time() - start_time
-
- # 解析测试结果
- if result.returncode == 0:
- status = "passed"
- else:
- status = "failed"
-
- # 尝试解析JSON报告
- test_results = []
- passed = failed = skipped = errors = 0
-
- try:
- with open(project_root / 'test_logs' / 'integration_test_results.json', 'r') as f:
- report_data = json.load(f)
-
- summary = report_data.get('summary', {})
- total = summary.get('total', 0)
- passed = summary.get('passed', 0)
- failed = summary.get('failed', 0)
- skipped = summary.get('skipped', 0)
- errors = summary.get('error', 0)
-
- for test in report_data.get('tests', []):
- test_results.append(TestResult(
- name=test.get('nodeid', ''),
- status=test.get('outcome', 'unknown'),
- duration=test.get('duration', 0.0),
- details=test
- ))
-
- except Exception as e:
- self.logger.warning(f"无法解析集成测试JSON报告: {e}")
-
- suite_result = TestSuiteResult(
- name="集成测试",
- total_tests=passed + failed + skipped + errors,
- passed=passed,
- failed=failed,
- skipped=skipped,
- errors=errors,
- duration=duration,
- results=test_results
- )
-
- self.results.append(suite_result)
- self.logger.info(f"集成测试完成: {suite_result.total_tests}个测试, "
- f"{suite_result.passed}通过, {suite_result.failed}失败, "
- f"{suite_result.skipped}跳过, {suite_result.errors}错误")
-
- return suite_result
-
- except Exception as e:
- self.logger.error(f"集成测试执行失败: {e}")
- raise
-
- def run_api_tests(self) -> TestSuiteResult:
- """运行API测试"""
- self.logger.info("运行API测试...")
-
- start_time = time.time()
- cmd = [
- 'python', '-m', 'pytest',
- 'tests/integration/test_api_integration.py',
- '-v',
- '--tb=short',
- '--json-report',
- '--json-report-file=test_logs/api_test_results.json'
- ]
-
- try:
- result = self._run_command(cmd)
- duration = time.time() - start_time
-
- # 解析测试结果
- if result.returncode == 0:
- status = "passed"
- else:
- status = "failed"
-
- # 尝试解析JSON报告
- test_results = []
- passed = failed = skipped = errors = 0
-
- try:
- with open(project_root / 'test_logs' / 'api_test_results.json', 'r') as f:
- report_data = json.load(f)
-
- summary = report_data.get('summary', {})
- total = summary.get('total', 0)
- passed = summary.get('passed', 0)
- failed = summary.get('failed', 0)
- skipped = summary.get('skipped', 0)
- errors = summary.get('error', 0)
-
- for test in report_data.get('tests', []):
- test_results.append(TestResult(
- name=test.get('nodeid', ''),
- status=test.get('outcome', 'unknown'),
- duration=test.get('duration', 0.0),
- details=test
- ))
-
- except Exception as e:
- self.logger.warning(f"无法解析API测试JSON报告: {e}")
-
- suite_result = TestSuiteResult(
- name="API测试",
- total_tests=passed + failed + skipped + errors,
- passed=passed,
- failed=failed,
- skipped=skipped,
- errors=errors,
- duration=duration,
- results=test_results
- )
-
- self.results.append(suite_result)
- self.logger.info(f"API测试完成: {suite_result.total_tests}个测试, "
- f"{suite_result.passed}通过, {suite_result.failed}失败, "
- f"{suite_result.skipped}跳过, {suite_result.errors}错误")
-
- return suite_result
-
- except Exception as e:
- self.logger.error(f"API测试执行失败: {e}")
- raise
-
- def run_performance_tests(self) -> TestSuiteResult:
- """运行性能测试"""
- self.logger.info("运行性能测试...")
-
- start_time = time.time()
-
- # 简单的性能测试 - 测试搜索响应时间
- test_queries = [
- "红色连衣裙",
- "智能手机",
- "笔记本电脑 AND (游戏 OR 办公)",
- "无线蓝牙耳机"
- ]
-
- test_results = []
- passed = failed = 0
-
- for query in test_queries:
- try:
- query_start = time.time()
- result = self._run_command([
- 'curl', '-s',
- f'http://{os.getenv("API_HOST", "127.0.0.1")}:{os.getenv("API_PORT", "6003")}/search',
- '-d', f'q={query}'
- ])
- query_duration = time.time() - query_start
-
- if result.returncode == 0:
- response_data = json.loads(result.stdout)
- took_ms = response_data.get('took_ms', 0)
-
- # 性能阈值:响应时间不超过2秒
- if took_ms <= 2000:
- test_results.append(TestResult(
- name=f"搜索性能测试: {query}",
- status="passed",
- duration=query_duration,
- details={"took_ms": took_ms, "response_size": len(result.stdout)}
- ))
- passed += 1
- else:
- test_results.append(TestResult(
- name=f"搜索性能测试: {query}",
- status="failed",
- duration=query_duration,
- details={"took_ms": took_ms, "threshold": 2000}
- ))
- failed += 1
- else:
- test_results.append(TestResult(
- name=f"搜索性能测试: {query}",
- status="failed",
- duration=query_duration,
- error=result.stderr
- ))
- failed += 1
-
- except Exception as e:
- test_results.append(TestResult(
- name=f"搜索性能测试: {query}",
- status="error",
- duration=0.0,
- error=str(e)
- ))
- failed += 1
-
- duration = time.time() - start_time
-
- suite_result = TestSuiteResult(
- name="性能测试",
- total_tests=len(test_results),
- passed=passed,
- failed=failed,
- skipped=0,
- errors=0,
- duration=duration,
- results=test_results
- )
-
- self.results.append(suite_result)
- self.logger.info(f"性能测试完成: {suite_result.total_tests}个测试, "
- f"{suite_result.passed}通过, {suite_result.failed}失败")
-
- return suite_result
-
- def generate_report(self) -> str:
- """生成测试报告"""
- self.logger.info("生成测试报告...")
-
- # 计算总体统计
- total_tests = sum(suite.total_tests for suite in self.results)
- total_passed = sum(suite.passed for suite in self.results)
- total_failed = sum(suite.failed for suite in self.results)
- total_skipped = sum(suite.skipped for suite in self.results)
- total_errors = sum(suite.errors for suite in self.results)
- total_duration = sum(suite.duration for suite in self.results)
-
- # 生成报告数据
- report_data = {
- "timestamp": datetime.now().isoformat(),
- "summary": {
- "total_tests": total_tests,
- "passed": total_passed,
- "failed": total_failed,
- "skipped": total_skipped,
- "errors": total_errors,
- "success_rate": (total_passed / total_tests * 100) if total_tests > 0 else 0,
- "total_duration": total_duration
- },
- "suites": [asdict(suite) for suite in self.results]
- }
-
- # 保存JSON报告
- report_file = project_root / 'test_logs' / f'test_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
- with open(report_file, 'w', encoding='utf-8') as f:
- json.dump(report_data, f, indent=2, ensure_ascii=False)
-
- # 生成文本报告
- text_report = self._generate_text_report(report_data)
-
- report_file_text = project_root / 'test_logs' / f'test_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
- with open(report_file_text, 'w', encoding='utf-8') as f:
- f.write(text_report)
-
- self.logger.info(f"测试报告已保存: {report_file}")
- self.logger.info(f"文本报告已保存: {report_file_text}")
-
- return text_report
-
- def _generate_text_report(self, report_data: Dict[str, Any]) -> str:
- """生成文本格式的测试报告"""
- lines = []
-
- # 标题
- lines.append("=" * 60)
- lines.append("搜索引擎测试报告")
- lines.append("=" * 60)
- lines.append(f"时间: {report_data['timestamp']}")
- lines.append("")
-
- # 摘要
- summary = report_data['summary']
- lines.append("测试摘要")
- lines.append("-" * 30)
- lines.append(f"总测试数: {summary['total_tests']}")
- lines.append(f"通过: {summary['passed']}")
- lines.append(f"失败: {summary['failed']}")
- lines.append(f"跳过: {summary['skipped']}")
- lines.append(f"错误: {summary['errors']}")
- lines.append(f"成功率: {summary['success_rate']:.1f}%")
- lines.append(f"总耗时: {summary['total_duration']:.2f}秒")
- lines.append("")
-
- # 各测试套件详情
- lines.append("测试套件详情")
- lines.append("-" * 30)
-
- for suite in report_data['suites']:
- lines.append(f"\n{suite['name']}:")
- lines.append(f" 总数: {suite['total_tests']}, 通过: {suite['passed']}, "
- f"失败: {suite['failed']}, 跳过: {suite['skipped']}, 错误: {suite['errors']}")
- lines.append(f" 耗时: {suite['duration']:.2f}秒")
-
- # 显示失败的测试
- failed_tests = [r for r in suite['results'] if r['status'] in ['failed', 'error']]
- if failed_tests:
- lines.append(" 失败的测试:")
- for test in failed_tests[:5]: # 只显示前5个
- lines.append(f" - {test['name']}: {test['status']}")
- if test.get('error'):
- lines.append(f" 错误: {test['error'][:100]}...")
- if len(failed_tests) > 5:
- lines.append(f" ... 还有 {len(failed_tests) - 5} 个失败的测试")
-
- return "\n".join(lines)
-
- def run_all_tests(self) -> bool:
- """运行所有测试"""
- try:
- # 确保日志目录存在
- (project_root / 'test_logs').mkdir(exist_ok=True)
-
- # 加载环境变量
- env_file = project_root / 'test_env.sh'
- if env_file.exists():
- self.logger.info("加载测试环境变量...")
- result = self._run_command(['bash', str(env_file)])
- if result.returncode != 0:
- self.logger.warning("环境变量加载失败,继续使用默认配置")
-
- # 检查环境
- if not self.check_environment():
- self.logger.error("环境检查失败,请先启动测试环境")
- return False
-
- # 运行各类测试
- test_suites = [
- ("unit", self.run_unit_tests),
- ("integration", self.run_integration_tests),
- ("api", self.run_api_tests),
- ("performance", self.run_performance_tests)
- ]
-
- failed_suites = []
-
- for suite_name, suite_func in test_suites:
- if suite_name in self.config.get('skip_suites', []):
- self.logger.info(f"跳过 {suite_name} 测试")
- continue
-
- try:
- suite_result = suite_func()
- if suite_result.failed > 0 or suite_result.errors > 0:
- failed_suites.append(suite_name)
- except Exception as e:
- self.logger.error(f"{suite_name} 测试执行失败: {e}")
- failed_suites.append(suite_name)
-
- # 生成报告
- report = self.generate_report()
- print(report)
-
- # 返回测试结果
- return len(failed_suites) == 0
-
- except Exception as e:
- self.logger.error(f"测试执行失败: {e}")
- return False
-
-
-def main():
- """主函数"""
- parser = argparse.ArgumentParser(description="运行搜索引擎测试流水线")
- parser.add_argument('--skip-suites', nargs='+',
- choices=['unit', 'integration', 'api', 'performance'],
- help='跳过指定的测试套件')
- parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
- default='INFO', help='日志级别')
- parser.add_argument('--test-timeout', type=int, default=300,
- help='单个测试超时时间(秒)')
- parser.add_argument('--start-env', action='store_true',
- help='启动测试环境后运行测试')
- parser.add_argument('--stop-env', action='store_true',
- help='测试完成后停止测试环境')
-
- args = parser.parse_args()
-
- # 配置
- config = {
- 'skip_suites': args.skip_suites or [],
- 'log_level': args.log_level,
- 'test_timeout': args.test_timeout
- }
-
- # 启动环境
- if args.start_env:
- print("启动测试环境...")
- result = subprocess.run([
- 'bash', str(project_root / 'scripts' / 'start_test_environment.sh')
- ], capture_output=True, text=True)
-
- if result.returncode != 0:
- print(f"测试环境启动失败: {result.stderr}")
- return 1
-
- print("测试环境启动成功")
- time.sleep(5) # 等待服务完全启动
-
- try:
- # 运行测试
- runner = TestRunner(config)
- success = runner.run_all_tests()
-
- if success:
- print("\n🎉 所有测试通过!")
- return_code = 0
- else:
- print("\n❌ 部分测试失败,请查看日志")
- return_code = 1
-
- finally:
- # 停止环境
- if args.stop_env:
- print("\n停止测试环境...")
- subprocess.run([
- 'bash', str(project_root / 'scripts' / 'stop_test_environment.sh')
- ])
-
- return return_code
-
-
-if __name__ == "__main__":
- sys.exit(main())
\ No newline at end of file
diff --git a/scripts/start_test_environment.sh b/scripts/start_test_environment.sh
deleted file mode 100755
index e9d3727..0000000
--- a/scripts/start_test_environment.sh
+++ /dev/null
@@ -1,275 +0,0 @@
-#!/bin/bash
-
-# 启动测试环境脚本
-# 用于在commit前自动化测试时启动必要的依赖服务
-
-set -e
-
-# 颜色定义
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-# 配置
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-TEST_LOG_DIR="$PROJECT_ROOT/test_logs"
-PID_FILE="$PROJECT_ROOT/test_environment.pid"
-
-# 日志文件
-LOG_FILE="$TEST_LOG_DIR/test_environment.log"
-ES_LOG_FILE="$TEST_LOG_DIR/elasticsearch.log"
-API_LOG_FILE="$TEST_LOG_DIR/api_test.log"
-
-echo -e "${GREEN}========================================${NC}"
-echo -e "${GREEN}启动测试环境${NC}"
-echo -e "${GREEN}========================================${NC}"
-
-# 创建日志目录
-mkdir -p "$TEST_LOG_DIR"
-
-# 检查是否已经运行
-if [ -f "$PID_FILE" ]; then
- OLD_PID=$(cat "$PID_FILE")
- if ps -p $OLD_PID > /dev/null 2>&1; then
- echo -e "${YELLOW}测试环境已在运行 (PID: $OLD_PID)${NC}"
- echo -e "${BLUE}如需重启,请先运行: ./scripts/stop_test_environment.sh${NC}"
- exit 0
- else
- rm -f "$PID_FILE"
- fi
-fi
-
-# 激活conda环境
-echo -e "${BLUE}激活conda环境...${NC}"
-source /home/tw/miniconda3/etc/profile.d/conda.sh
-conda activate searchengine
-
-# 设置环境变量
-echo -e "${BLUE}设置测试环境变量...${NC}"
-export PYTHONPATH="$PROJECT_ROOT:$PYTHONPATH"
-export TESTING_MODE=true
-export LOG_LEVEL=DEBUG
-
-# Elasticsearch配置
-export ES_HOST="http://localhost:9200"
-export ES_USERNAME="elastic"
-export ES_PASSWORD="changeme"
-
-# API配置
-export API_HOST="127.0.0.1"
-export API_PORT="6003" # 使用不同的端口避免冲突
-export TENANT_ID="test_tenant"
-
-# 测试配置
-export TEST_TIMEOUT=60
-export TEST_RETRY_COUNT=3
-
-echo -e "${BLUE}环境配置:${NC}"
-echo " ES_HOST: $ES_HOST"
-echo " API_HOST: $API_HOST:$API_PORT"
-echo " TENANT_ID: $TENANT_ID"
-echo " LOG_LEVEL: $LOG_LEVEL"
-echo " TESTING_MODE: $TESTING_MODE"
-
-# 检查Elasticsearch是否运行
-echo -e "${BLUE}检查Elasticsearch状态...${NC}"
-if curl -s "$ES_HOST/_cluster/health" > /dev/null; then
- echo -e "${GREEN}✓ Elasticsearch正在运行${NC}"
-else
- echo -e "${YELLOW}⚠ Elasticsearch未运行,尝试启动...${NC}"
-
- # 尝试启动Elasticsearch(如果安装了本地版本)
- if command -v elasticsearch &> /dev/null; then
- echo -e "${BLUE}启动本地Elasticsearch...${NC}"
- elasticsearch -d -p "$TEST_LOG_DIR/es.pid"
- sleep 10
-
- # 再次检查
- if curl -s "$ES_HOST/_cluster/health" > /dev/null; then
- echo -e "${GREEN}✓ Elasticsearch启动成功${NC}"
- else
- echo -e "${RED}✗ Elasticsearch启动失败${NC}"
- echo -e "${YELLOW}请手动启动Elasticsearch或配置远程ES地址${NC}"
- exit 1
- fi
- else
- echo -e "${RED}✗ 未找到本地Elasticsearch${NC}"
- echo -e "${YELLOW}请启动Elasticsearch服务或修改ES_HOST配置${NC}"
- exit 1
- fi
-fi
-
-# 等待Elasticsearch就绪
-echo -e "${BLUE}等待Elasticsearch就绪...${NC}"
-for i in {1..30}; do
- if curl -s "$ES_HOST/_cluster/health?wait_for_status=yellow&timeout=1s" | grep -q '"status":"green\|yellow"'; then
- echo -e "${GREEN}✓ Elasticsearch已就绪${NC}"
- break
- fi
- if [ $i -eq 30 ]; then
- echo -e "${RED}✗ Elasticsearch就绪超时${NC}"
- exit 1
- fi
- sleep 1
-done
-
-# 创建测试索引(如果需要)
-echo -e "${BLUE}准备测试数据索引...${NC}"
-curl -X PUT "$ES_HOST/test_products" -H 'Content-Type: application/json' -d'
-{
- "settings": {
- "number_of_shards": 1,
- "number_of_replicas": 0,
- "analysis": {
- "analyzer": {
- "ansj": {
- "type": "custom",
- "tokenizer": "keyword"
- }
- }
- }
- },
- "mappings": {
- "properties": {
- "name": {
- "type": "text",
- "analyzer": "ansj"
- },
- "brand_name": {
- "type": "text",
- "analyzer": "ansj"
- },
- "tags": {
- "type": "text",
- "analyzer": "ansj"
- },
- "price": {
- "type": "double"
- },
- "category_id": {
- "type": "integer"
- },
- "spu_id": {
- "type": "keyword"
- },
- "text_embedding": {
- "type": "dense_vector",
- "dims": 1024
- }
- }
- }
-}' > /dev/null 2>&1 || echo -e "${YELLOW}索引可能已存在${NC}"
-
-# 插入测试数据
-echo -e "${BLUE}插入测试数据...${NC}"
-curl -X POST "$ES_HOST/test_products/_bulk" -H 'Content-Type: application/json' -d'
-{"index": {"_id": "1"}}
-{"name": "红色连衣裙", "brand_name": "测试品牌", "tags": ["红色", "连衣裙", "女装"], "price": 299.0, "category_id": 1, "spu_id": "dress_001"}
-{"index": {"_id": "2"}}
-{"name": "蓝色连衣裙", "brand_name": "测试品牌", "tags": ["蓝色", "连衣裙", "女装"], "price": 399.0, "category_id": 1, "spu_id": "dress_002"}
-{"index": {"_id": "3"}}
-{"name": "智能手机", "brand_name": "科技品牌", "tags": ["智能", "手机", "数码"], "price": 2999.0, "category_id": 2, "spu_id": "phone_001"}
-{"index": {"_id": "4"}}
-{"name": "笔记本电脑", "brand_name": "科技品牌", "tags": ["笔记本", "电脑", "办公"], "price": 5999.0, "category_id": 3, "spu_id": "laptop_001"}
-' > /dev/null 2>&1 || echo -e "${YELLOW}测试数据可能已存在${NC}"
-
-# 启动测试API服务
-echo -e "${BLUE}启动测试API服务...${NC}"
-cd "$PROJECT_ROOT"
-
-# 使用后台模式启动API
-python -m api.app \
- --host $API_HOST \
- --port $API_PORT \
- --tenant $TENANT_ID \
- --es-host $ES_HOST \
- > "$API_LOG_FILE" 2>&1 &
-
-API_PID=$!
-echo $API_PID > "$PID_FILE"
-
-# 等待API服务启动
-echo -e "${BLUE}等待API服务启动...${NC}"
-for i in {1..30}; do
- if curl -s "http://$API_HOST:$API_PORT/health" > /dev/null; then
- echo -e "${GREEN}✓ API服务已就绪 (PID: $API_PID)${NC}"
- break
- fi
- if [ $i -eq 30 ]; then
- echo -e "${RED}✗ API服务启动超时${NC}"
- kill $API_PID 2>/dev/null || true
- rm -f "$PID_FILE"
- exit 1
- fi
- sleep 1
-done
-
-# 验证测试环境
-echo -e "${BLUE}验证测试环境...${NC}"
-
-# 测试Elasticsearch连接
-if curl -s "$ES_HOST/_cluster/health" | grep -q '"status":"green\|yellow"'; then
- echo -e "${GREEN}✓ Elasticsearch连接正常${NC}"
-else
- echo -e "${RED}✗ Elasticsearch连接失败${NC}"
- exit 1
-fi
-
-# 测试API健康检查
-if curl -s "http://$API_HOST:$API_PORT/health" | grep -q '"status"'; then
- echo -e "${GREEN}✓ API服务健康检查通过${NC}"
-else
- echo -e "${RED}✗ API服务健康检查失败${NC}"
- exit 1
-fi
-
-# 测试基本搜索功能
-if curl -s "http://$API_HOST:$API_PORT/search?q=红色连衣裙" | grep -q '"hits"'; then
- echo -e "${GREEN}✓ 基本搜索功能正常${NC}"
-else
- echo -e "${YELLOW}⚠ 基本搜索功能可能有问题,但继续进行${NC}"
-fi
-
-# 输出环境信息
-echo -e "${GREEN}========================================${NC}"
-echo -e "${GREEN}测试环境启动完成!${NC}"
-echo -e "${GREEN}========================================${NC}"
-echo -e "${BLUE}服务信息:${NC}"
-echo " Elasticsearch: $ES_HOST"
-echo " API服务: http://$API_HOST:$API_PORT"
-echo " 测试客户: $TENANT_ID"
-echo -e "${BLUE}进程信息:${NC}"
-echo " API PID: $API_PID"
-echo " PID文件: $PID_FILE"
-echo -e "${BLUE}日志文件:${NC}"
-echo " 环境日志: $LOG_FILE"
-echo " API日志: $API_LOG_FILE"
-echo " ES日志: $ES_LOG_FILE"
-echo -e "${BLUE}测试命令:${NC}"
-echo " 运行所有测试: python scripts/run_tests.py"
-echo " 单元测试: pytest tests/unit/ -v"
-echo " 集成测试: pytest tests/integration/ -v"
-echo " API测试: pytest tests/integration/test_api_integration.py -v"
-echo "e${NC}"
-echo -e "${BLUE}停止环境: ./scripts/stop_test_environment.sh${NC}"
-
-# 保存环境变量到文件供测试脚本使用
-cat > "$PROJECT_ROOT/test_env.sh" << EOF
-#!/bin/bash
-export ES_HOST="$ES_HOST"
-export ES_USERNAME="$ES_USERNAME"
-export ES_PASSWORD="$ES_PASSWORD"
-export API_HOST="$API_HOST"
-export API_PORT="$API_PORT"
-export TENANT_ID="$TENANT_ID"
-export TESTING_MODE="$TESTING_MODE"
-export LOG_LEVEL="$LOG_LEVEL"
-export PYTHONPATH="$PROJECT_ROOT:\$PYTHONPATH"
-EOF
-
-chmod +x "$PROJECT_ROOT/test_env.sh"
-
-echo -e "${GREEN}测试环境已准备就绪!${NC}"
\ No newline at end of file
diff --git a/scripts/stop_test_environment.sh b/scripts/stop_test_environment.sh
deleted file mode 100755
index c17e744..0000000
--- a/scripts/stop_test_environment.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-
-# 停止测试环境脚本
-
-set -e
-
-# 颜色定义
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-# 配置
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-PID_FILE="$PROJECT_ROOT/test_environment.pid"
-ES_PID_FILE="$PROJECT_ROOT/test_logs/es.pid"
-
-echo -e "${BLUE}========================================${NC}"
-echo -e "${BLUE}停止测试环境${NC}"
-echo -e "${BLUE}========================================${NC}"
-
-# 停止API服务
-if [ -f "$PID_FILE" ]; then
- API_PID=$(cat "$PID_FILE")
- if ps -p $API_PID > /dev/null 2>&1; then
- echo -e "${BLUE}停止API服务 (PID: $API_PID)...${NC}"
- kill $API_PID
-
- # 等待进程结束
- for i in {1..10}; do
- if ! ps -p $API_PID > /dev/null 2>&1; then
- echo -e "${GREEN}✓ API服务已停止${NC}"
- break
- fi
- if [ $i -eq 10 ]; then
- echo -e "${YELLOW}强制停止API服务...${NC}"
- kill -9 $API_PID 2>/dev/null || true
- fi
- sleep 1
- done
- else
- echo -e "${YELLOW}API服务进程不存在${NC}"
- fi
- rm -f "$PID_FILE"
-else
- echo -e "${YELLOW}未找到API服务PID文件${NC}"
-fi
-
-# 停止Elasticsearch(如果是本地启动的)
-if [ -f "$ES_PID_FILE" ]; then
- ES_PID=$(cat "$ES_PID_FILE")
- if ps -p $ES_PID > /dev/null 2>&1; then
- echo -e "${BLUE}停止本地Elasticsearch (PID: $ES_PID)...${NC}"
- kill $ES_PID
- rm -f "$ES_PID_FILE"
- echo -e "${GREEN}✓ Elasticsearch已停止${NC}"
- else
- echo -e "${YELLOW}Elasticsearch进程不存在${NC}"
- rm -f "$ES_PID_FILE"
- fi
-else
- echo -e "${BLUE}跳过本地Elasticsearch停止(未找到PID文件)${NC}"
-fi
-
-# 清理测试环境文件
-echo -e "${BLUE}清理测试环境文件...${NC}"
-rm -f "$PROJECT_ROOT/test_env.sh"
-
-# 清理测试索引(可选)
-read -p "是否删除测试索引? (y/N): " -n 1 -r
-echo
-if [[ $REPLY =~ ^[Yy]$ ]]; then
- echo -e "${BLUE}删除测试索引...${NC}"
- curl -X DELETE "http://localhost:9200/test_products" 2>/dev/null || true
- echo -e "${GREEN}✓ 测试索引已删除${NC}"
-fi
-
-echo -e "${GREEN}========================================${NC}"
-echo -e "${GREEN}测试环境已停止!${NC}"
-echo -e "${GREEN}========================================${NC}"
\ No newline at end of file
diff --git a/scripts/test_base.py b/scripts/test_base.py
deleted file mode 100644
index 3e80dcd..0000000
--- a/scripts/test_base.py
+++ /dev/null
@@ -1,242 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script for base configuration.
-
-Tests data ingestion, search API, response format, and tenant isolation.
-"""
-
-import sys
-import os
-import argparse
-import requests
-import json
-from pathlib import Path
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def test_search_api(base_url: str, tenant_id: str, query: str = "耳机"):
- """
- Test search API.
-
- Args:
- base_url: API base URL
- tenant_id: Tenant ID
- query: Search query
-
- Returns:
- Response JSON or None if failed
- """
- url = f"{base_url}/search/"
- headers = {
- "X-Tenant-ID": tenant_id,
- "Content-Type": "application/json"
- }
- payload = {
- "query": query,
- "size": 10,
- "from": 0
- }
-
- print(f"\nTesting search API:")
- print(f" URL: {url}")
- print(f" Query: {query}")
- print(f" Tenant ID: {tenant_id}")
-
- try:
- response = requests.post(url, json=payload, headers=headers, timeout=30)
- response.raise_for_status()
- data = response.json()
-
- print(f" Status: {response.status_code}")
- print(f" Total: {data.get('total', 0)}")
- print(f" Results: {len(data.get('results', []))}")
-
- return data
- except Exception as e:
- print(f" ERROR: {e}")
- return None
-
-
-def validate_response_format(data: dict):
- """
- Validate response format.
-
- Args:
- data: Response data
-
- Returns:
- List of validation errors (empty if valid)
- """
- errors = []
-
- # Check for results field (not hits)
- if 'hits' in data:
- errors.append("Response contains 'hits' field (should be 'results')")
-
- if 'results' not in data:
- errors.append("Response missing 'results' field")
- else:
- results = data['results']
- if not isinstance(results, list):
- errors.append("'results' should be a list")
- else:
- # Validate first result structure
- if results:
- result = results[0]
- required_fields = ['spu_id', 'title', 'skus', 'relevance_score']
- for field in required_fields:
- if field not in result:
- errors.append(f"Result missing required field: {field}")
-
- # Check for ES internal fields
- es_internal_fields = ['_id', '_score', '_source']
- for field in es_internal_fields:
- if field in result:
- errors.append(f"Result contains ES internal field: {field}")
-
- # Validate skus
- if 'skus' in result:
- skus = result['skus']
- if not isinstance(skus, list):
- errors.append("'skus' should be a list")
- elif skus:
- sku = skus[0]
- sku_required = ['sku_id', 'price', 'sku', 'stock']
- for field in sku_required:
- if field not in sku:
- errors.append(f"SKU missing required field: {field}")
-
- # Check for suggestions and related_searches
- if 'suggestions' not in data:
- errors.append("Response missing 'suggestions' field")
- if 'related_searches' not in data:
- errors.append("Response missing 'related_searches' field")
-
- return errors
-
-
-def test_facets(base_url: str, tenant_id: str):
- """
- Test facets aggregation.
-
- Args:
- base_url: API base URL
- tenant_id: Tenant ID
-
- Returns:
- Response JSON or None if failed
- """
- url = f"{base_url}/search/"
- headers = {
- "X-Tenant-ID": tenant_id,
- "Content-Type": "application/json"
- }
- payload = {
- "query": "商品",
- "size": 10,
- "facets": ["category.keyword", "vendor.keyword"]
- }
-
- print(f"\nTesting facets:")
- print(f" Facets: {payload['facets']}")
-
- try:
- response = requests.post(url, json=payload, headers=headers, timeout=30)
- response.raise_for_status()
- data = response.json()
-
- if 'facets' in data and data['facets']:
- print(f" Facets returned: {len(data['facets'])}")
- for facet in data['facets']:
- print(f" - {facet.get('field')}: {len(facet.get('values', []))} values")
- else:
- print(" WARNING: No facets returned")
-
- return data
- except Exception as e:
- print(f" ERROR: {e}")
- return None
-
-
-def test_tenant_isolation(base_url: str, tenant_id_1: str, tenant_id_2: str):
- """
- Test tenant isolation.
-
- Args:
- base_url: API base URL
- tenant_id_1: First tenant ID
- tenant_id_2: Second tenant ID
- """
- print(f"\nTesting tenant isolation:")
- print(f" Tenant 1: {tenant_id_1}")
- print(f" Tenant 2: {tenant_id_2}")
-
- # Search for tenant 1
- data1 = test_search_api(base_url, tenant_id_1, "商品")
- # Search for tenant 2
- data2 = test_search_api(base_url, tenant_id_2, "商品")
-
- if data1 and data2:
- results1 = set(r.get('spu_id') for r in data1.get('results', []))
- results2 = set(r.get('spu_id') for r in data2.get('results', []))
-
- overlap = results1 & results2
- if overlap:
- print(f" WARNING: Found {len(overlap)} overlapping results between tenants")
- else:
- print(f" OK: No overlapping results (tenant isolation working)")
-
-
-def main():
- parser = argparse.ArgumentParser(description='Test base configuration')
- parser.add_argument('--api-url', default='http://localhost:8000', help='API base URL')
- parser.add_argument('--tenant-id', default='1', help='Tenant ID for testing')
- parser.add_argument('--test-tenant-2', help='Second tenant ID for isolation test')
-
- args = parser.parse_args()
-
- print("=" * 60)
- print("Base Configuration Test Suite")
- print("=" * 60)
-
- # Test 1: Basic search
- print("\n[Test 1] Basic Search")
- data = test_search_api(args.api_url, args.tenant_id)
- if not data:
- print("FAILED: Basic search test")
- return 1
-
- # Test 2: Response format validation
- print("\n[Test 2] Response Format Validation")
- errors = validate_response_format(data)
- if errors:
- print("FAILED: Response format validation")
- for error in errors:
- print(f" - {error}")
- return 1
- else:
- print("PASSED: Response format is correct")
-
- # Test 3: Facets
- print("\n[Test 3] Facets Aggregation")
- facet_data = test_facets(args.api_url, args.tenant_id)
- if not facet_data:
- print("WARNING: Facets test failed (may be expected if no data)")
-
- # Test 4: Tenant isolation (if second tenant provided)
- if args.test_tenant_2:
- print("\n[Test 4] Tenant Isolation")
- test_tenant_isolation(args.api_url, args.tenant_id, args.test_tenant_2)
-
- print("\n" + "=" * 60)
- print("All tests completed")
- print("=" * 60)
-
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
-
diff --git a/scripts/test_cloud_embedding.py b/scripts/test_cloud_embedding.py
deleted file mode 100644
index 67b358c..0000000
--- a/scripts/test_cloud_embedding.py
+++ /dev/null
@@ -1,183 +0,0 @@
-"""
-Test script for cloud text embedding using Aliyun DashScope API.
-
-Reads queries from queries.txt and tests embedding generation,
-logging send time, receive time, and duration for each request.
-"""
-
-import os
-import sys
-import time
-from datetime import datetime
-from pathlib import Path
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from embeddings.cloud_text_encoder import CloudTextEncoder
-
-
-def format_timestamp(ts: float) -> str:
- """Format timestamp to readable string."""
- return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
-
-
-def read_queries(file_path: str, limit: int = 100) -> list:
- """
- Read queries from text file.
-
- Args:
- file_path: Path to queries file
- limit: Maximum number of queries to read
-
- Returns:
- List of query strings
- """
- queries = []
- with open(file_path, 'r', encoding='utf-8') as f:
- for i, line in enumerate(f):
- if i >= limit:
- break
- query = line.strip()
- if query: # Skip empty lines
- queries.append(query)
- return queries
-
-
-def test_cloud_embedding(queries_file: str, num_queries: int = 100):
- """
- Test cloud embedding with queries from file.
-
- Args:
- queries_file: Path to queries file
- num_queries: Number of queries to test
- """
- print("=" * 80)
- print("Cloud Text Embedding Test - Aliyun DashScope API")
- print("=" * 80)
- print()
-
- # Check if API key is set
- api_key = os.getenv("DASHSCOPE_API_KEY")
- if not api_key:
- print("ERROR: DASHSCOPE_API_KEY environment variable is not set!")
- print("Please set it using: export DASHSCOPE_API_KEY='your-api-key'")
- return
-
- print(f"API Key: {api_key[:10]}...{api_key[-4:]}")
- print()
-
- # Read queries
- print(f"Reading queries from: {queries_file}")
- try:
- queries = read_queries(queries_file, limit=num_queries)
- print(f"Successfully read {len(queries)} queries")
- print()
- except Exception as e:
- print(f"ERROR: Failed to read queries file: {e}")
- return
-
- # Initialize encoder
- print("Initializing CloudTextEncoder...")
- try:
- encoder = CloudTextEncoder()
- print("CloudTextEncoder initialized successfully")
- print()
- except Exception as e:
- print(f"ERROR: Failed to initialize encoder: {e}")
- return
-
- # Test embeddings
- print("=" * 80)
- print(f"Testing {len(queries)} queries (one by one)")
- print("=" * 80)
- print()
-
- total_start = time.time()
- success_count = 0
- failure_count = 0
- total_duration = 0.0
-
- for i, query in enumerate(queries, 1):
- try:
- # Record send time
- send_time = time.time()
- send_time_str = format_timestamp(send_time)
-
- # Generate embedding
- embedding = encoder.encode(query)
-
- # Record receive time
- receive_time = time.time()
- receive_time_str = format_timestamp(receive_time)
-
- # Calculate duration
- duration = receive_time - send_time
- total_duration += duration
-
- # Verify embedding
- if embedding.shape[0] > 0:
- success_count += 1
- status = "✓ SUCCESS"
- else:
- failure_count += 1
- status = "✗ FAILED"
-
- # Print result
- query_display = query[:50] + "..." if len(query) > 50 else query
- print(f"[{i:3d}/{len(queries)}] {status}")
- print(f" Query: {query_display}")
- print(f" Send Time: {send_time_str}")
- print(f" Receive Time: {receive_time_str}")
- print(f" Duration: {duration:.3f}s")
- print(f" Embedding Shape: {embedding.shape}")
- print()
-
- except Exception as e:
- failure_count += 1
- receive_time = time.time()
- duration = receive_time - send_time
-
- print(f"[{i:3d}/{len(queries)}] ✗ ERROR")
- print(f" Query: {query[:50]}...")
- print(f" Send Time: {send_time_str}")
- print(f" Receive Time: {format_timestamp(receive_time)}")
- print(f" Duration: {duration:.3f}s")
- print(f" Error: {str(e)}")
- print()
-
- # Print summary
- total_elapsed = time.time() - total_start
- avg_duration = total_duration / len(queries) if queries else 0
-
- print("=" * 80)
- print("Test Summary")
- print("=" * 80)
- print(f"Total Queries: {len(queries)}")
- print(f"Successful: {success_count}")
- print(f"Failed: {failure_count}")
- print(f"Success Rate: {success_count / len(queries) * 100:.1f}%")
- print(f"Total Time: {total_elapsed:.3f}s")
- print(f"Total API Time: {total_duration:.3f}s")
- print(f"Average Duration: {avg_duration:.3f}s per query")
- print(f"Throughput: {len(queries) / total_elapsed:.2f} queries/second")
- print("=" * 80)
-
-
-def main():
- """Main entry point."""
- # Default queries file path
- queries_file = Path(__file__).parent.parent / "data_crawling" / "queries.txt"
-
- # Check if file exists
- if not queries_file.exists():
- print(f"ERROR: Queries file not found: {queries_file}")
- return
-
- # Run test with 100 queries
- test_cloud_embedding(str(queries_file), num_queries=100)
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/scripts/test_cnclip_service.py b/scripts/test_cnclip_service.py
deleted file mode 100755
index 2fcfc7b..0000000
--- a/scripts/test_cnclip_service.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-"""
-CN-CLIP 服务测试脚本
-
-用途:
- 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议)
-
-使用方法:
- python scripts/test_cnclip_service.py [PORT]
-
-参数:
- PORT: 服务端口(默认:51000)
-"""
-
-import sys
-import numpy as np
-from clip_client import Client
-
-
-def test_encoding(client, test_name, inputs):
- """测试编码功能"""
- print(f"\n{test_name}...")
- try:
- result = client.encode(inputs)
- if isinstance(result, np.ndarray):
- print(f"✓ 成功! 形状: {result.shape}")
- print(f" 输入数量: {len(inputs)}")
- print(f" 输出维度: {result.shape[1]}")
-
- # 显示每个 embedding 的维度和前20个数字
- for i in range(min(len(inputs), result.shape[0])):
- emb = result[i]
- first_20 = emb[:20].tolist()
-
- # 计算 L2 归一化
- norm = np.linalg.norm(emb)
- normalized_emb = emb / norm if norm > 0 else emb
- normalized_first_20 = normalized_emb[:20].tolist()
-
- print(f" input: {inputs[i]}")
- print(f" Embedding[{i}] 维度: {len(emb)}")
- print(f" 前20个数字: {first_20}")
- print(f" normalize后的前20个数字: {normalized_first_20}")
- return True
- else:
- print(f"✗ 失败: 返回类型错误: {type(result)}")
- return False
- except Exception as e:
- print(f"✗ 失败: {e}")
- import traceback
- traceback.print_exc()
- return False
-
-
-def main():
- # 获取端口参数
- port = sys.argv[1] if len(sys.argv) > 1 else "51000"
- grpc_url = f"grpc://localhost:{port}"
-
- print("=" * 50)
- print("CN-CLIP 服务测试")
- print("=" * 50)
- print(f"服务地址: {grpc_url} (gRPC 协议)")
- print()
-
- # 创建客户端
- try:
- client = Client(grpc_url)
- except Exception as e:
- print(f"✗ 客户端创建失败: {e}")
- sys.exit(1)
-
- # 运行测试
- results = []
-
- # 测试1: 文本编码
- results.append(test_encoding(
- client,
- "测试1: 编码文本",
- ['这是一个测试文本', '另一个测试文本']
- ))
-
- # 测试2: 图像编码
- results.append(test_encoding(
- client,
- "测试2: 编码图像(远程 URL)",
- ['https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']
- ))
-
- # 测试3: 混合编码
- results.append(test_encoding(
- client,
- "测试3: 混合编码(文本和图像)",
- ['这是一段文本', 'https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']
- ))
-
- # 汇总
- print("\n" + "=" * 50)
- print("测试结果汇总")
- print("=" * 50)
- print(f"总测试数: {len(results)}")
- print(f"通过: {sum(results)}")
- print(f"失败: {len(results) - sum(results)}")
-
- if all(results):
- print("\n✓ 所有测试通过!")
- sys.exit(0)
- else:
- print("\n✗ 部分测试失败")
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()
-
diff --git a/scripts/test_facet_api.py b/scripts/test_facet_api.py
deleted file mode 100755
index e6c7f55..0000000
--- a/scripts/test_facet_api.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试脚本:模拟前端请求,检查后端返回的分面结果
-"""
-
-import sys
-import json
-import requests
-import argparse
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-
-def main():
- parser = argparse.ArgumentParser(description='测试分面API')
- parser.add_argument('--api-url', type=str, default='http://localhost:6002/search/', help='API URL')
- parser.add_argument('--tenant-id', type=str, required=True, help='Tenant ID')
- args = parser.parse_args()
-
- # 模拟前端的分面请求(与frontend/static/js/app.js一致)
- request_data = {
- "query": "", # 空查询,获取所有数据
- "size": 10,
- "from": 0,
- "facets": [
- "category1_name",
- "specifications.color",
- "specifications.size",
- "specifications.material"
- ]
- }
-
- headers = {
- "Content-Type": "application/json",
- "X-Tenant-ID": args.tenant_id
- }
-
- try:
- print(f"发送请求到: {args.api_url}")
- print(f"Tenant ID: {args.tenant_id}")
- print(f"请求数据:")
- print(json.dumps(request_data, indent=2, ensure_ascii=False))
- print("\n" + "="*60)
-
- response = requests.post(args.api_url, json=request_data, headers=headers, timeout=30)
-
- if response.status_code != 200:
- print(f"API错误: {response.status_code}")
- print(response.text)
- return 1
-
- data = response.json()
-
- print("API响应:")
- print(f" 总结果数: {data.get('total', 0)}")
- print(f" 返回结果数: {len(data.get('results', []))}")
-
- facets = data.get('facets', [])
- print(f"\n分面数量: {len(facets)}")
-
- if not facets:
- print("\n⚠ 分面列表为空!")
- return 1
-
- print("\n" + "="*60)
- print("分面详情:")
- print("="*60)
-
- for i, facet in enumerate(facets, 1):
- print(f"\n{i}. {facet.get('field')}")
- print(f" 标签: {facet.get('label')}")
- print(f" 类型: {facet.get('type')}")
- print(f" 值数量: {len(facet.get('values', []))}")
- print(f" 总计数: {facet.get('total_count', 0)}")
-
- values = facet.get('values', [])
- if values:
- print(f" 前5个值:")
- for v in values[:5]:
- print(f" - {v.get('value')}: {v.get('count')}")
- else:
- print(f" ⚠ 值列表为空!")
-
- # 检查specifications.color分面
- print("\n" + "="*60)
- print("检查specifications.color分面:")
- print("="*60)
-
- color_facet = None
- for facet in facets:
- if facet.get('field') == 'specifications.color':
- color_facet = facet
- break
-
- if color_facet:
- print("✓ 找到specifications.color分面")
- print(f" 值数量: {len(color_facet.get('values', []))}")
- if color_facet.get('values'):
- print(" 前10个值:")
- for v in color_facet.get('values', [])[:10]:
- print(f" {v.get('value')}: {v.get('count')}")
- else:
- print(" ⚠ 值列表为空!")
- else:
- print("✗ 未找到specifications.color分面")
- print(f" 可用分面字段: {[f.get('field') for f in facets]}")
-
- # 输出完整JSON(便于调试)
- print("\n" + "="*60)
- print("完整分面JSON(前500字符):")
- print("="*60)
- facets_json = json.dumps(facets, indent=2, ensure_ascii=False)
- print(facets_json[:500])
-
- except requests.exceptions.ConnectionError as e:
- print(f"\n连接错误: 无法连接到API服务器 {args.api_url}")
- print("请确保后端服务正在运行")
- return 1
- except Exception as e:
- print(f"\n错误: {e}")
- import traceback
- traceback.print_exc()
- return 1
-
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
-
diff --git a/scripts/test_frontend.sh b/scripts/test_frontend.sh
deleted file mode 100755
index f5024f3..0000000
--- a/scripts/test_frontend.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/bin/bash
-
-# Test Frontend - Quick verification script
-
-set -e
-
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-RED='\033[0;31m'
-NC='\033[0m'
-
-API_URL="http://120.76.41.98:6002"
-
-echo -e "${GREEN}========================================${NC}"
-echo -e "${GREEN}Frontend Test Script${NC}"
-echo -e "${GREEN}========================================${NC}"
-
-echo -e "\n${YELLOW}Testing API endpoints...${NC}"
-
-# Test 1: Health check
-echo -e "\n1. Testing health endpoint..."
-if curl -s "${API_URL}/health" > /dev/null; then
- echo -e "${GREEN}✓ Health check passed${NC}"
-else
- echo -e "${RED}✗ Health check failed${NC}"
- exit 1
-fi
-
-# Test 2: Frontend HTML
-echo -e "\n2. Testing frontend HTML..."
-if curl -s "${API_URL}/" | grep -q "Product Search"; then
- echo -e "${GREEN}✓ Frontend HTML accessible${NC}"
-else
- echo -e "${RED}✗ Frontend HTML not found${NC}"
- exit 1
-fi
-
-# Test 3: Static CSS
-echo -e "\n3. Testing static CSS..."
-if curl -s "${API_URL}/static/css/style.css" | grep -q "page-container"; then
- echo -e "${GREEN}✓ CSS file accessible${NC}"
-else
- echo -e "${RED}✗ CSS file not found${NC}"
- exit 1
-fi
-
-# Test 4: Static JS
-echo -e "\n4. Testing static JavaScript..."
-if curl -s "${API_URL}/static/js/app.js" | grep -q "performSearch"; then
- echo -e "${GREEN}✓ JavaScript file accessible${NC}"
-else
- echo -e "${RED}✗ JavaScript file not found${NC}"
- exit 1
-fi
-
-# Test 5: Search API
-echo -e "\n5. Testing search API..."
-SEARCH_RESULT=$(curl -s -X POST "${API_URL}/search/" \
- -H "Content-Type: application/json" \
- -d '{"query":"玩具","size":5}')
-
-if echo "$SEARCH_RESULT" | grep -q "hits"; then
- echo -e "${GREEN}✓ Search API working${NC}"
- TOTAL=$(echo "$SEARCH_RESULT" | grep -o '"total":[0-9]*' | cut -d: -f2)
- echo -e " Found ${YELLOW}${TOTAL}${NC} results"
-else
- echo -e "${RED}✗ Search API failed${NC}"
- exit 1
-fi
-
-echo -e "\n${GREEN}========================================${NC}"
-echo -e "${GREEN}All tests passed! ✓${NC}"
-echo -e "${GREEN}========================================${NC}"
-
-echo -e "\n${YELLOW}Frontend is ready!${NC}"
-echo -e "Open in browser: ${GREEN}${API_URL}/${NC}"
-
-echo -e "\n${YELLOW}Quick Start Guide:${NC}"
-echo "1. Open browser and go to: ${API_URL}/"
-echo "2. Enter a search query (e.g., '玩具')"
-echo "3. Click on filter tags to refine results"
-echo "4. Use sort buttons with arrows to sort"
-echo "5. Use pagination at the bottom to browse"
-
-echo -e "\n${YELLOW}Key Features:${NC}"
-echo "- Clean white background design"
-echo "- Horizontal filter tags (categories, brands, suppliers)"
-echo "- Sort buttons with up/down arrows for ascending/descending"
-echo "- Product grid with images, prices, MOQ info"
-echo "- Full pagination support"
-echo "- Responsive design for mobile and desktop"
-
-echo -e "\n${GREEN}Enjoy your new frontend! 🎉${NC}"
-
diff --git a/tests/test_cloud_embedding.py b/tests/test_cloud_embedding.py
new file mode 100644
index 0000000..67b358c
--- /dev/null
+++ b/tests/test_cloud_embedding.py
@@ -0,0 +1,183 @@
+"""
+Test script for cloud text embedding using Aliyun DashScope API.
+
+Reads queries from queries.txt and tests embedding generation,
+logging send time, receive time, and duration for each request.
+"""
+
+import os
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from embeddings.cloud_text_encoder import CloudTextEncoder
+
+
+def format_timestamp(ts: float) -> str:
+ """Format timestamp to readable string."""
+ return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+
+
+def read_queries(file_path: str, limit: int = 100) -> list:
+ """
+ Read queries from text file.
+
+ Args:
+ file_path: Path to queries file
+ limit: Maximum number of queries to read
+
+ Returns:
+ List of query strings
+ """
+ queries = []
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for i, line in enumerate(f):
+ if i >= limit:
+ break
+ query = line.strip()
+ if query: # Skip empty lines
+ queries.append(query)
+ return queries
+
+
+def test_cloud_embedding(queries_file: str, num_queries: int = 100):
+ """
+ Test cloud embedding with queries from file.
+
+ Args:
+ queries_file: Path to queries file
+ num_queries: Number of queries to test
+ """
+ print("=" * 80)
+ print("Cloud Text Embedding Test - Aliyun DashScope API")
+ print("=" * 80)
+ print()
+
+ # Check if API key is set
+ api_key = os.getenv("DASHSCOPE_API_KEY")
+ if not api_key:
+ print("ERROR: DASHSCOPE_API_KEY environment variable is not set!")
+ print("Please set it using: export DASHSCOPE_API_KEY='your-api-key'")
+ return
+
+ print(f"API Key: {api_key[:10]}...{api_key[-4:]}")
+ print()
+
+ # Read queries
+ print(f"Reading queries from: {queries_file}")
+ try:
+ queries = read_queries(queries_file, limit=num_queries)
+ print(f"Successfully read {len(queries)} queries")
+ print()
+ except Exception as e:
+ print(f"ERROR: Failed to read queries file: {e}")
+ return
+
+ # Initialize encoder
+ print("Initializing CloudTextEncoder...")
+ try:
+ encoder = CloudTextEncoder()
+ print("CloudTextEncoder initialized successfully")
+ print()
+ except Exception as e:
+ print(f"ERROR: Failed to initialize encoder: {e}")
+ return
+
+ # Test embeddings
+ print("=" * 80)
+ print(f"Testing {len(queries)} queries (one by one)")
+ print("=" * 80)
+ print()
+
+ total_start = time.time()
+ success_count = 0
+ failure_count = 0
+ total_duration = 0.0
+
+ for i, query in enumerate(queries, 1):
+ try:
+ # Record send time
+ send_time = time.time()
+ send_time_str = format_timestamp(send_time)
+
+ # Generate embedding
+ embedding = encoder.encode(query)
+
+ # Record receive time
+ receive_time = time.time()
+ receive_time_str = format_timestamp(receive_time)
+
+ # Calculate duration
+ duration = receive_time - send_time
+ total_duration += duration
+
+ # Verify embedding
+ if embedding.shape[0] > 0:
+ success_count += 1
+ status = "✓ SUCCESS"
+ else:
+ failure_count += 1
+ status = "✗ FAILED"
+
+ # Print result
+ query_display = query[:50] + "..." if len(query) > 50 else query
+ print(f"[{i:3d}/{len(queries)}] {status}")
+ print(f" Query: {query_display}")
+ print(f" Send Time: {send_time_str}")
+ print(f" Receive Time: {receive_time_str}")
+ print(f" Duration: {duration:.3f}s")
+ print(f" Embedding Shape: {embedding.shape}")
+ print()
+
+ except Exception as e:
+ failure_count += 1
+ receive_time = time.time()
+ duration = receive_time - send_time
+
+ print(f"[{i:3d}/{len(queries)}] ✗ ERROR")
+ print(f" Query: {query[:50]}...")
+ print(f" Send Time: {send_time_str}")
+ print(f" Receive Time: {format_timestamp(receive_time)}")
+ print(f" Duration: {duration:.3f}s")
+ print(f" Error: {str(e)}")
+ print()
+
+ # Print summary
+ total_elapsed = time.time() - total_start
+ avg_duration = total_duration / len(queries) if queries else 0
+
+ print("=" * 80)
+ print("Test Summary")
+ print("=" * 80)
+ print(f"Total Queries: {len(queries)}")
+ print(f"Successful: {success_count}")
+ print(f"Failed: {failure_count}")
+ print(f"Success Rate: {success_count / len(queries) * 100:.1f}%")
+ print(f"Total Time: {total_elapsed:.3f}s")
+ print(f"Total API Time: {total_duration:.3f}s")
+ print(f"Average Duration: {avg_duration:.3f}s per query")
+ print(f"Throughput: {len(queries) / total_elapsed:.2f} queries/second")
+ print("=" * 80)
+
+
+def main():
+ """Main entry point."""
+ # Default queries file path
+ queries_file = Path(__file__).parent.parent / "data_crawling" / "queries.txt"
+
+ # Check if file exists
+ if not queries_file.exists():
+ print(f"ERROR: Queries file not found: {queries_file}")
+ return
+
+ # Run test with 100 queries
+ test_cloud_embedding(str(queries_file), num_queries=100)
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/tests/test_cnclip_service.py b/tests/test_cnclip_service.py
new file mode 100755
index 0000000..2fcfc7b
--- /dev/null
+++ b/tests/test_cnclip_service.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+CN-CLIP 服务测试脚本
+
+用途:
+ 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议)
+
+使用方法:
+    python tests/test_cnclip_service.py [PORT]
+
+参数:
+ PORT: 服务端口(默认:51000)
+"""
+
+import sys
+import numpy as np
+from clip_client import Client
+
+
+def test_encoding(client, test_name, inputs):
+ """测试编码功能"""
+ print(f"\n{test_name}...")
+ try:
+ result = client.encode(inputs)
+ if isinstance(result, np.ndarray):
+ print(f"✓ 成功! 形状: {result.shape}")
+ print(f" 输入数量: {len(inputs)}")
+ print(f" 输出维度: {result.shape[1]}")
+
+ # 显示每个 embedding 的维度和前20个数字
+ for i in range(min(len(inputs), result.shape[0])):
+ emb = result[i]
+ first_20 = emb[:20].tolist()
+
+ # 计算 L2 归一化
+ norm = np.linalg.norm(emb)
+ normalized_emb = emb / norm if norm > 0 else emb
+ normalized_first_20 = normalized_emb[:20].tolist()
+
+ print(f" input: {inputs[i]}")
+ print(f" Embedding[{i}] 维度: {len(emb)}")
+ print(f" 前20个数字: {first_20}")
+ print(f" normalize后的前20个数字: {normalized_first_20}")
+ return True
+ else:
+ print(f"✗ 失败: 返回类型错误: {type(result)}")
+ return False
+ except Exception as e:
+ print(f"✗ 失败: {e}")
+ import traceback
+ traceback.print_exc()
+ return False
+
+
+def main():
+ # 获取端口参数
+ port = sys.argv[1] if len(sys.argv) > 1 else "51000"
+ grpc_url = f"grpc://localhost:{port}"
+
+ print("=" * 50)
+ print("CN-CLIP 服务测试")
+ print("=" * 50)
+ print(f"服务地址: {grpc_url} (gRPC 协议)")
+ print()
+
+ # 创建客户端
+ try:
+ client = Client(grpc_url)
+ except Exception as e:
+ print(f"✗ 客户端创建失败: {e}")
+ sys.exit(1)
+
+ # 运行测试
+ results = []
+
+ # 测试1: 文本编码
+ results.append(test_encoding(
+ client,
+ "测试1: 编码文本",
+ ['这是一个测试文本', '另一个测试文本']
+ ))
+
+ # 测试2: 图像编码
+ results.append(test_encoding(
+ client,
+ "测试2: 编码图像(远程 URL)",
+ ['https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']
+ ))
+
+ # 测试3: 混合编码
+ results.append(test_encoding(
+ client,
+ "测试3: 混合编码(文本和图像)",
+ ['这是一段文本', 'https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg']
+ ))
+
+ # 汇总
+ print("\n" + "=" * 50)
+ print("测试结果汇总")
+ print("=" * 50)
+ print(f"总测试数: {len(results)}")
+ print(f"通过: {sum(results)}")
+ print(f"失败: {len(results) - sum(results)}")
+
+ if all(results):
+ print("\n✓ 所有测试通过!")
+ sys.exit(0)
+ else:
+ print("\n✗ 部分测试失败")
+ sys.exit(1)
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/tests/test_facet_api.py b/tests/test_facet_api.py
new file mode 100755
index 0000000..e6c7f55
--- /dev/null
+++ b/tests/test_facet_api.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+测试脚本:模拟前端请求,检查后端返回的分面结果
+"""
+
+import sys
+import json
+import requests
+import argparse
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+
+def main():
+ parser = argparse.ArgumentParser(description='测试分面API')
+ parser.add_argument('--api-url', type=str, default='http://localhost:6002/search/', help='API URL')
+ parser.add_argument('--tenant-id', type=str, required=True, help='Tenant ID')
+ args = parser.parse_args()
+
+ # 模拟前端的分面请求(与frontend/static/js/app.js一致)
+ request_data = {
+ "query": "", # 空查询,获取所有数据
+ "size": 10,
+ "from": 0,
+ "facets": [
+ "category1_name",
+ "specifications.color",
+ "specifications.size",
+ "specifications.material"
+ ]
+ }
+
+ headers = {
+ "Content-Type": "application/json",
+ "X-Tenant-ID": args.tenant_id
+ }
+
+ try:
+ print(f"发送请求到: {args.api_url}")
+ print(f"Tenant ID: {args.tenant_id}")
+ print(f"请求数据:")
+ print(json.dumps(request_data, indent=2, ensure_ascii=False))
+ print("\n" + "="*60)
+
+ response = requests.post(args.api_url, json=request_data, headers=headers, timeout=30)
+
+ if response.status_code != 200:
+ print(f"API错误: {response.status_code}")
+ print(response.text)
+ return 1
+
+ data = response.json()
+
+ print("API响应:")
+ print(f" 总结果数: {data.get('total', 0)}")
+ print(f" 返回结果数: {len(data.get('results', []))}")
+
+ facets = data.get('facets', [])
+ print(f"\n分面数量: {len(facets)}")
+
+ if not facets:
+ print("\n⚠ 分面列表为空!")
+ return 1
+
+ print("\n" + "="*60)
+ print("分面详情:")
+ print("="*60)
+
+ for i, facet in enumerate(facets, 1):
+ print(f"\n{i}. {facet.get('field')}")
+ print(f" 标签: {facet.get('label')}")
+ print(f" 类型: {facet.get('type')}")
+ print(f" 值数量: {len(facet.get('values', []))}")
+ print(f" 总计数: {facet.get('total_count', 0)}")
+
+ values = facet.get('values', [])
+ if values:
+ print(f" 前5个值:")
+ for v in values[:5]:
+ print(f" - {v.get('value')}: {v.get('count')}")
+ else:
+ print(f" ⚠ 值列表为空!")
+
+ # 检查specifications.color分面
+ print("\n" + "="*60)
+ print("检查specifications.color分面:")
+ print("="*60)
+
+ color_facet = None
+ for facet in facets:
+ if facet.get('field') == 'specifications.color':
+ color_facet = facet
+ break
+
+ if color_facet:
+ print("✓ 找到specifications.color分面")
+ print(f" 值数量: {len(color_facet.get('values', []))}")
+ if color_facet.get('values'):
+ print(" 前10个值:")
+ for v in color_facet.get('values', [])[:10]:
+ print(f" {v.get('value')}: {v.get('count')}")
+ else:
+ print(" ⚠ 值列表为空!")
+ else:
+ print("✗ 未找到specifications.color分面")
+ print(f" 可用分面字段: {[f.get('field') for f in facets]}")
+
+ # 输出完整JSON(便于调试)
+ print("\n" + "="*60)
+ print("完整分面JSON(前500字符):")
+ print("="*60)
+ facets_json = json.dumps(facets, indent=2, ensure_ascii=False)
+ print(facets_json[:500])
+
+ except requests.exceptions.ConnectionError as e:
+ print(f"\n连接错误: 无法连接到API服务器 {args.api_url}")
+ print("请确保后端服务正在运行")
+ return 1
+ except Exception as e:
+ print(f"\n错误: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
+
--
libgit2 0.21.2