Implementing a 1688 Toy Data Crawler

Below is a crawler implemented in Python that scrapes toy product data from the 1688 website and stores it in a database. Because 1688 has anti-crawling mechanisms, we use proxies and a random User-Agent to avoid being blocked.

Complete Code

import requests
from bs4 import BeautifulSoup
import pymysql
import time
import random
from fake_useragent import UserAgent
from urllib.parse import quote

# Database configuration
DB_CONFIG = {
    'host': 'localhost',
    'port': 3306,
    'user': 'your_username',
    'password': 'your_password',
    'database': 'toys_data',
    'charset': 'utf8mb4'
}

# Proxy configuration (if needed)
PROXIES = {
    # 'http': 'http://your_proxy:port',
    # 'https': 'https://your_proxy:port'
}

class ToySpider:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
        # Initialize the database connection
        self.conn = pymysql.connect(**DB_CONFIG)
        self.create_table()
    
    def create_table(self):
        """创建数据表"""
        with self.conn.cursor() as cursor:
            sql = """
            CREATE TABLE IF NOT EXISTS toys (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255) NOT NULL,
                price VARCHAR(50),
                min_order VARCHAR(50),
                company VARCHAR(255),
                location VARCHAR(100),
                image_url VARCHAR(512),
                product_url VARCHAR(512),
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
            """
            cursor.execute(sql)
        self.conn.commit()
    
    def get_random_headers(self):
        """生成随机请求头"""
        return {
            'User-Agent': self.ua.random,
            'Referer': 'https://www.1688.com/'
        }
    
    def fetch_page(self, url, max_retries=5):
        """获取页面内容"""
        for attempt in range(max_retries):
            try:
                headers = self.get_random_headers()
                response = self.session.get(url, headers=headers, proxies=PROXIES, timeout=10)
                response.raise_for_status()
                
                # Check whether anti-crawling was triggered
                # ("验证" is the Chinese word shown on 1688's verification/CAPTCHA pages)
                if "验证" in response.text or "security" in response.text.lower():
                    print("Possibly triggered the anti-crawling mechanism, waiting before retrying...")
                    time.sleep(10)
                    continue
                    
                return response.text
            except Exception as e:
                print(f"请求失败 (尝试 {attempt+1}/{max_retries}): {e}")
                time.sleep(2 ** attempt)  # 指数退避策略
        return None
    
    def parse_product_list(self, html):
        """解析产品列表页"""
        soup = BeautifulSoup(html, 'html.parser')
        products = []
        
        # Adjust the selectors according to 1688's actual page structure
        items = soup.select('.component-product-list .list-item')
        if not items:
            # Try other possible selectors
            items = soup.select('.offer-list .offer-item')
        
        for item in items:
            try:
                # Extract product information
                title_elem = item.select_one('.title a')
                title = title_elem.get_text(strip=True) if title_elem else "No title"
                product_url = title_elem['href'] if title_elem and title_elem.has_attr('href') else ""
                
                # Make sure the URL is absolute
                if product_url and not product_url.startswith('http'):
                    product_url = 'https:' + product_url
                
                price_elem = item.select_one('.price')
                price = price_elem.get_text(strip=True) if price_elem else "Price negotiable"
                
                min_order_elem = item.select_one('.moq')
                min_order = min_order_elem.get_text(strip=True) if min_order_elem else "Unknown"
                
                company_elem = item.select_one('.company-name')
                company = company_elem.get_text(strip=True) if company_elem else "Unknown company"
                
                location_elem = item.select_one('.location')
                location = location_elem.get_text(strip=True) if location_elem else "Unknown location"
                
                image_elem = item.select_one('.image img')
                image_url = image_elem['src'] if image_elem and image_elem.has_attr('src') else ""
                if image_url and not image_url.startswith('http'):
                    image_url = 'https:' + image_url
                
                products.append({
                    'title': title,
                    'price': price,
                    'min_order': min_order,
                    'company': company,
                    'location': location,
                    'image_url': image_url,
                    'product_url': product_url
                })
            except Exception as e:
                print(f"解析产品项时出错: {e}")
                continue
        
        return products
    
    def save_to_database(self, products):
        """保存产品数据到数据库"""
        if not products:
            return
        
        with self.conn.cursor() as cursor:
            sql = """
            INSERT INTO toys (title, price, min_order, company, location, image_url, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """
            values = [
                (p['title'], p['price'], p['min_order'], p['company'], 
                 p['location'], p['image_url'], p['product_url'])
                for p in products
            ]
            cursor.executemany(sql, values)
        self.conn.commit()
        print(f"成功保存 {len(products)} 条产品数据")
    
    def crawl(self, base_url, pages=5):
        """爬取多页数据"""
        for page in range(1, pages + 1):
            print(f"正在爬取第 {page} 页...")
            
            # Build the paginated URL (adjust to 1688's actual pagination rules)
            if page == 1:
                url = base_url
            else:
                # Append the page number with '?' or '&' depending on the base URL
                separator = '&' if '?' in base_url else '?'
                url = f"{base_url}{separator}pageNum={page}"
            
            html = self.fetch_page(url)
            if not html:
                print(f"获取第 {page} 页失败")
                continue
            
            products = self.parse_product_list(html)
            if products:
                self.save_to_database(products)
            else:
                print(f"第 {page} 页未找到产品数据")
            
            # Random delay to avoid sending requests too frequently
            time.sleep(random.uniform(1, 3))
    
    def close(self):
        """关闭数据库连接"""
        self.conn.close()

def main():
    # URL-encode the search keyword ("泡泡枪" means "bubble gun")
    keyword = "泡泡枪"
    encoded_keyword = quote(keyword)
    
    # Base URL (adjust to the actual page)
    base_url = f"https://s.1688.com/company/-{encoded_keyword}.html"
    
    # Or use the provided URL
    # base_url = "https://www.1688.com/zw/page.html?spm=a2638t.b_30496503.szyx_divert_popup.26.7403436cZhW0Qr&hpageId=old-sem-pc-list&cosite=baidujj_pz&keywords=泡泡枪&trackid=885662561117990122602&location=re&ptid=017700000004f5cbcd361f3287a80045"
    
    spider = ToySpider()
    try:
        spider.crawl(base_url, pages=3)  # crawl 3 pages
    except Exception as e:
        print(f"爬虫执行出错: {e}")
    finally:
        spider.close()

if __name__ == "__main__":
    main()

Usage Instructions

  1. Install the dependencies

    pip install requests beautifulsoup4 pymysql fake-useragent
    
  2. Database configuration

    • Modify DB_CONFIG in the code with your MySQL connection details
    • Make sure the database exists: CREATE DATABASE toys_data; (see the Python sketch after this list)
  3. Proxy settings (if needed)

    • If accessing 1688 requires a proxy, uncomment the PROXIES section and fill in your proxy details
  4. Run the crawler

    python toy_spider.py
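
If the toys_data database has not been created yet, here is a minimal one-off setup sketch using pymysql; the host and credentials are placeholders, so use the same values as in DB_CONFIG:

import pymysql

# One-time setup: create the database the crawler writes to.
# Replace the placeholder credentials with your own.
conn = pymysql.connect(host='localhost', port=3306,
                       user='your_username', password='your_password',
                       charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(
            "CREATE DATABASE IF NOT EXISTS toys_data "
            "DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"
        )
    conn.commit()
finally:
    conn.close()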
    

Notes

  1. Anti-crawling mechanisms

    • 1688 has strict anti-crawling measures; the code already adds a random User-Agent and request delays
    • If you get blocked frequently, consider using a proxy IP pool (see the sketch after this list)
  2. Page structure changes

    • 1688's page structure may change over time, so adjust the selectors to match the live pages
    • Use the browser developer tools to inspect the latest HTML structure
  3. Legal compliance

    • Make sure your use of the crawler complies with the website's terms of service
    • Do not send requests too frequently, to avoid putting load on the site
  4. Data storage

    • The code stores data in MySQL; you can switch to SQLite or another database as needed
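
As a rough illustration of the proxy IP pool mentioned in point 1, the single PROXIES dict can be replaced by a small pool that is sampled on every request. This is only a sketch: the proxy addresses below are placeholders, and fetch_page would pass the selected proxy instead of the global PROXIES.

import random

# Hypothetical proxy pool; fill in real proxy addresses from your provider.
PROXY_POOL = [
    {'http': 'http://proxy1:port', 'https': 'http://proxy1:port'},
    {'http': 'http://proxy2:port', 'https': 'http://proxy2:port'},
]

def get_random_proxy():
    """Pick a random proxy for each request; return None to connect directly."""
    return random.choice(PROXY_POOL) if PROXY_POOL else None

# In fetch_page, use the selected proxy instead of the global PROXIES:
# response = self.session.get(url, headers=headers,
#                             proxies=get_random_proxy(), timeout=10)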

This crawler provides a basic framework; you will likely need to adapt it to 1688's actual page structure, and the selectors in particular may need updating.
