#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Shopee API crawler script - simplified version
Crawls Shopee product data via the OneBound (万邦) API
"""

import requests
import json
import time
from pathlib import Path
from datetime import datetime
from urllib.parse import urlencode

# API configuration
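# NOTE: key/secret are the OneBound account credentials sent with every request;
# replace them with your own before running.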
API_KEY = 't8618339029'
API_SECRET = '9029f568'
API_URL = 'https://api-gw.onebound.cn/shopee/item_search'

# Crawl configuration
COUNTRY = '.com.my'  # Site (domain suffix): .vn, .co.th, .tw, .co.id, .sg, .com.my
PAGE = 1  # Page number
DELAY = 2  # Delay between requests (seconds)
MAX_RETRIES = 3  # Maximum retry attempts


def fetch_shopee_data(query, page=1, country=COUNTRY):
    """
    请求Shopee API获取数据
    :param query: 搜索关键词
    :param page: 页码
    :param country: 站点
    :return: JSON数据或None
    """
    params = {
        'key': API_KEY,
        'secret': API_SECRET,
        'q': query,
        'page': page,
        'country': country,
        'cache': 'yes',
        'result_type': 'json',
        'lang': 'en'
    }
    
    url = f"{API_URL}?{urlencode(params)}"
    
    for retry in range(MAX_RETRIES):
        try:
            print(f"  请求中... (尝试 {retry + 1}/{MAX_RETRIES})")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            
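            # Assumed OneBound response envelope (inferred from the checks below):
            #   {"error_code": "0000", "reason": "...", "items": {"item": [...]}}
            # "0000" signals success; otherwise "reason" carries the error message.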
            if data.get('error_code') == '0000':
                items_count = len(data.get('items', {}).get('item', []))
                print(f"  ✓ 成功! 获取 {items_count} 个商品")
                return data
            else:
                print(f"  ✗ API错误: {data.get('reason', '未知错误')}")
                
        except Exception as e:
            print(f"  ✗ 请求失败: {str(e)}")
            if retry < MAX_RETRIES - 1:
                time.sleep(3)
    
    return None

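# Example of calling fetch_shopee_data() directly (hypothetical keyword; valid
# credentials are still required):
#   data = fetch_shopee_data('bluetooth speaker', page=1, country='.sg')
#   if data:
#       for item in data['items']['item']:
#           print(item)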

def save_json(data, filename):
    """保存JSON数据"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    """主函数"""
    # 文件路径
    script_dir = Path(__file__).parent
    query_file = script_dir / 'queries.txt'
    results_dir = script_dir / 'shopee_results'
    
    # Create the results directory
    results_dir.mkdir(exist_ok=True)
    
    # Read search keywords
    print("=" * 80)
    print("Shopee API 爬虫")
    print("=" * 80)
    
    if not query_file.exists():
        print(f"错误: 查询词文件不存在: {query_file}")
        return
    
    with open(query_file, 'r', encoding='utf-8') as f:
        queries = [line.strip() for line in f if line.strip()]
    
    total = len(queries)
    print(f"查询词数量: {total}")
    print(f"结果目录: {results_dir}")
    print(f"站点: {COUNTRY}")
    print(f"请求间隔: {DELAY}秒")
    print("=" * 80)
    
    # Counters
    success_count = 0
    fail_count = 0
    failed_queries = []
    
    start_time = time.time()
    
    # Crawl each keyword in turn
    for idx, query in enumerate(queries, 1):
        if not query:
            continue
        
        print(f"\n[{idx}/{total}] 查询词: '{query}'")
        
        # Fetch data
        data = fetch_shopee_data(query, PAGE, COUNTRY)
        
        # Save the result
        if data:
            # Build a filesystem-safe filename
            safe_name = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in query)
            safe_name = safe_name.strip()[:50]
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"{idx:04d}_{safe_name}_{timestamp}.json"
            filepath = results_dir / filename
            
            save_json(data, filepath)
            print(f"  ✓ 已保存: {filename}")
            success_count += 1
        else:
            print(f"  ✗ 保存失败")
            fail_count += 1
            failed_queries.append(query)
        
        # Delay between requests (skip after the last keyword)
        if idx < total:
            print(f"  等待 {DELAY} 秒...")
            time.sleep(DELAY)
    
    # Summary statistics
    elapsed = time.time() - start_time
    
    print("\n" + "=" * 80)
    print("爬取完成!")
    print(f"总数: {total} | 成功: {success_count} | 失败: {fail_count}")
    print(f"耗时: {elapsed:.2f} 秒 ({elapsed/60:.2f} 分钟)")
    print("=" * 80)
    
    # Save the list of failed keywords
    if failed_queries:
        fail_file = results_dir / 'failed_queries.txt'
        with open(fail_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(failed_queries))
        print(f"失败列表已保存: {fail_file}")
    
    # Save a run summary
    summary = {
        'crawl_time': datetime.now().isoformat(),
        'total': total,
        'success': success_count,
        'fail': fail_count,
        'elapsed_seconds': elapsed,
        'config': {
            'country': COUNTRY,
            'page': PAGE,
            'delay': DELAY
        },
        'failed_queries': failed_queries
    }
    
    summary_file = results_dir / 'summary.json'
    save_json(summary, summary_file)
    print(f"摘要已保存: {summary_file}")


if __name__ == '__main__':
    main()