#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Shopee API scraper - simplified version.
Fetches Shopee product data via the OneBound API.
"""

import json
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode

import requests

# API configuration
API_KEY = 't8618339029'
API_SECRET = '9029f568'
API_URL = 'https://api-gw.onebound.cn/shopee/item_search'

# Crawl configuration
COUNTRY = '.com.my'   # Site: .vn, .co.th, .tw, .co.id, .sg, .com.my
PAGE = 1              # Page number
DELAY = 2             # Delay between requests (seconds)
MAX_RETRIES = 3       # Maximum retry attempts


def fetch_shopee_data(query, page=1, country=COUNTRY):
    """
    Query the Shopee API and return the parsed response.

    :param query: search keyword
    :param page: page number
    :param country: site suffix
    :return: parsed JSON data, or None on failure
    """
    params = {
        'key': API_KEY,
        'secret': API_SECRET,
        'q': query,
        'page': page,
        'country': country,
        'cache': 'yes',
        'result_type': 'json',
        'lang': 'en',
    }
    url = f"{API_URL}?{urlencode(params)}"

    for retry in range(MAX_RETRIES):
        try:
            print(f"  Requesting... (attempt {retry + 1}/{MAX_RETRIES})")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()

            if data.get('error_code') == '0000':
                items_count = len(data.get('items', {}).get('item', []))
                print(f"  ✓ Success! Retrieved {items_count} items")
                return data
            else:
                print(f"  ✗ API error: {data.get('reason', 'unknown error')}")
        except Exception as e:
            print(f"  ✗ Request failed: {e}")

        # Back off briefly before the next attempt
        if retry < MAX_RETRIES - 1:
            time.sleep(3)

    return None


def save_json(data, filename):
    """Save data as pretty-printed UTF-8 JSON."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    """Entry point: read keywords, fetch each one, save results and a summary."""
    # File paths
    script_dir = Path(__file__).parent
    query_file = script_dir / 'queries.txt'
    results_dir = script_dir / 'shopee_results'

    # Create the results directory
    results_dir.mkdir(exist_ok=True)

    # Read the search keywords
    print("=" * 80)
    print("Shopee API scraper")
    print("=" * 80)

    if not query_file.exists():
        print(f"Error: query file not found: {query_file}")
        return

    with open(query_file, 'r', encoding='utf-8') as f:
        queries = [line.strip() for line in f if line.strip()]

    total = len(queries)
    print(f"Number of queries: {total}")
    print(f"Results directory: {results_dir}")
    print(f"Site: {COUNTRY}")
    print(f"Request delay: {DELAY}s")
    print("=" * 80)

    # Counters
    success_count = 0
    fail_count = 0
    failed_queries = []
    start_time = time.time()

    # Fetch each query in turn
    for idx, query in enumerate(queries, 1):
        if not query:
            continue

        print(f"\n[{idx}/{total}] Query: '{query}'")

        # Fetch the data
        data = fetch_shopee_data(query, PAGE, COUNTRY)

        # Save the result
        if data:
            # Build a filesystem-safe filename
            safe_name = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in query)
            safe_name = safe_name.strip()[:50]
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"{idx:04d}_{safe_name}_{timestamp}.json"
            filepath = results_dir / filename

            save_json(data, filepath)
            print(f"  ✓ Saved: {filename}")
            success_count += 1
        else:
            print("  ✗ Fetch failed, nothing to save")
            fail_count += 1
            failed_queries.append(query)

        # Delay between requests (skip after the last one)
        if idx < total:
            print(f"  Waiting {DELAY}s...")
            time.sleep(DELAY)

    # Report statistics
    elapsed = time.time() - start_time
    print("\n" + "=" * 80)
    print("Crawl complete!")
    print(f"Total: {total} | Succeeded: {success_count} | Failed: {fail_count}")
    print(f"Elapsed: {elapsed:.2f}s ({elapsed/60:.2f} min)")
    print("=" * 80)

    # Save the list of failed queries
    if failed_queries:
        fail_file = results_dir / 'failed_queries.txt'
        with open(fail_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(failed_queries))
        print(f"Failed queries saved to: {fail_file}")

    # Save a run summary
    summary = {
        'crawl_time': datetime.now().isoformat(),
        'total': total,
        'success': success_count,
        'fail': fail_count,
        'elapsed_seconds': elapsed,
        'config': {
            'country': COUNTRY,
            'page': PAGE,
            'delay': DELAY,
        },
        'failed_queries': failed_queries,
    }
    summary_file = results_dir / 'summary.json'
    save_json(summary, summary_file)
    print(f"Summary saved to: {summary_file}")


if __name__ == '__main__':
    main()
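
# ---------------------------------------------------------------------------
# Example `queries.txt` (a sketch, not part of the original script): main()
# expects this file next to the script, with one search keyword per line;
# blank lines are skipped. The keywords below are illustrative placeholders
# only:
#
#   wireless earbuds
#   phone case
#   usb cable
# ---------------------------------------------------------------------------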