1688 Toy Data Crawler Implementation
Below is a Python crawler that scrapes toy product data from the 1688 website and stores it in a database. Because 1688 has anti-crawling measures, the crawler uses proxies and a random User-Agent to reduce the chance of being blocked.
Complete Code
import requests
from bs4 import BeautifulSoup
import pymysql
import time
import random
from fake_useragent import UserAgent
from urllib.parse import quote

# Database configuration
DB_CONFIG = {
    'host': 'localhost',
    'port': 3306,
    'user': 'your_username',
    'password': 'your_password',
    'database': 'toys_data',
    'charset': 'utf8mb4'
}

# Proxy configuration (if needed)
PROXIES = {
    # 'http': 'http://your_proxy:port',
    # 'https': 'https://your_proxy:port'
}


class ToySpider:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        # Initialize the database connection
        self.conn = pymysql.connect(**DB_CONFIG)
        self.create_table()

    def create_table(self):
        """Create the data table if it does not exist."""
        with self.conn.cursor() as cursor:
            sql = """
            CREATE TABLE IF NOT EXISTS toys (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255) NOT NULL,
                price VARCHAR(50),
                min_order VARCHAR(50),
                company VARCHAR(255),
                location VARCHAR(100),
                image_url VARCHAR(255),
                product_url VARCHAR(255),
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
            """
            cursor.execute(sql)
            self.conn.commit()

    def get_random_headers(self):
        """Generate randomized request headers."""
        return {
            'User-Agent': self.ua.random,
            'Referer': 'https://www.1688.com/'
        }

    def fetch_page(self, url, max_retries=5):
        """Fetch the page content, retrying with exponential backoff."""
        for attempt in range(max_retries):
            try:
                headers = self.get_random_headers()
                response = self.session.get(url, headers=headers, proxies=PROXIES, timeout=10)
                response.raise_for_status()
                # Check whether the anti-crawling mechanism was triggered
                # ("验证" is the word for "verification" shown on challenge pages)
                if "验证" in response.text or "security" in response.text.lower():
                    print("Anti-crawling mechanism may have been triggered, waiting before retry...")
                    time.sleep(10)
                    continue
                return response.text
            except Exception as e:
                print(f"Request failed (attempt {attempt+1}/{max_retries}): {e}")
                time.sleep(2 ** attempt)  # exponential backoff
        return None

    def parse_product_list(self, html):
        """Parse a product list page."""
        soup = BeautifulSoup(html, 'html.parser')
        products = []
        # Adjust the selectors to match 1688's actual page structure
        items = soup.select('.component-product-list .list-item')
        if not items:
            # Try an alternative selector
            items = soup.select('.offer-list .offer-item')
        for item in items:
            try:
                # Extract the product information
                title_elem = item.select_one('.title a')
                title = title_elem.get_text(strip=True) if title_elem else "无标题"
                product_url = title_elem['href'] if title_elem and title_elem.has_attr('href') else ""
                # Make sure the URL is absolute
                if product_url and not product_url.startswith('http'):
                    product_url = 'https:' + product_url
                price_elem = item.select_one('.price')
                price = price_elem.get_text(strip=True) if price_elem else "价格面议"
                min_order_elem = item.select_one('.moq')
                min_order = min_order_elem.get_text(strip=True) if min_order_elem else "未知"
                company_elem = item.select_one('.company-name')
                company = company_elem.get_text(strip=True) if company_elem else "未知公司"
                location_elem = item.select_one('.location')
                location = location_elem.get_text(strip=True) if location_elem else "未知地区"
                image_elem = item.select_one('.image img')
                image_url = image_elem['src'] if image_elem and image_elem.has_attr('src') else ""
                if image_url and not image_url.startswith('http'):
                    image_url = 'https:' + image_url
                products.append({
                    'title': title,
                    'price': price,
                    'min_order': min_order,
                    'company': company,
                    'location': location,
                    'image_url': image_url,
                    'product_url': product_url
                })
            except Exception as e:
                print(f"Error while parsing a product item: {e}")
                continue
        return products

    def save_to_database(self, products):
        """Save product data to the database."""
        if not products:
            return
        with self.conn.cursor() as cursor:
            sql = """
            INSERT INTO toys (title, price, min_order, company, location, image_url, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """
            values = [
                (p['title'], p['price'], p['min_order'], p['company'],
                 p['location'], p['image_url'], p['product_url'])
                for p in products
            ]
            cursor.executemany(sql, values)
            self.conn.commit()
        print(f"Saved {len(products)} product records")

    def crawl(self, base_url, pages=5):
        """Crawl several pages of results."""
        for page in range(1, pages + 1):
            print(f"Crawling page {page}...")
            # Build the paginated URL
            if page == 1:
                url = base_url
            else:
                # Adjust to match 1688's pagination scheme
                separator = '&' if '?' in base_url else '?'
                url = f"{base_url}{separator}pageNum={page}"
            html = self.fetch_page(url)
            if not html:
                print(f"Failed to fetch page {page}")
                continue
            products = self.parse_product_list(html)
            if products:
                self.save_to_database(products)
            else:
                print(f"No product data found on page {page}")
            # Random delay to avoid sending requests too frequently
            time.sleep(random.uniform(1, 3))

    def close(self):
        """Close the database connection."""
        self.conn.close()


def main():
    # URL-encode the search keyword ("泡泡枪" means "bubble gun")
    keyword = "泡泡枪"
    encoded_keyword = quote(keyword)
    # Base URL (adjust to the actual page)
    base_url = f"https://s.1688.com/company/-{encoded_keyword}.html"
    # Or use the provided URL instead
    # base_url = "https://www.1688.com/zw/page.html?spm=a2638t.b_30496503.szyx_divert_popup.26.7403436cZhW0Qr&hpageId=old-sem-pc-list&cosite=baidujj_pz&keywords=泡泡枪&trackid=885662561117990122602&location=re&ptid=017700000004f5cbcd361f3287a80045"

    spider = ToySpider()
    try:
        spider.crawl(base_url, pages=3)  # crawl 3 pages
    except Exception as e:
        print(f"Spider execution error: {e}")
    finally:
        spider.close()


if __name__ == "__main__":
    main()
Usage Instructions
- Install the dependencies:
  pip install requests beautifulsoup4 pymysql fake-useragent
- Database configuration:
  - Modify DB_CONFIG in the code and fill in your MySQL connection details
  - Make sure the database has been created: CREATE DATABASE toys_data;
- Proxy settings (if needed):
  - If accessing 1688 requires a proxy, uncomment the PROXIES section and fill in your proxy details (a filled-in example follows this list)
- Run the crawler:
  python toy_spider.py
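If you do route traffic through a proxy, PROXIES only needs the scheme-to-URL mapping that requests expects. A minimal sketch, assuming a hypothetical local proxy on 127.0.0.1:7890 (replace with your own endpoint and credentials):

# Hypothetical proxy endpoint -- replace host/port (and credentials) with your own.
PROXIES = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890',
    # With authentication it would look like:
    # 'https': 'http://user:password@proxy.example.com:8080',
}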
Notes
- Anti-crawling measures:
  - 1688 has strict anti-crawling measures; the code already adds a random User-Agent and request delays
  - If you are blocked frequently, consider using a proxy IP pool (see the sketch after this list)
- Page structure changes:
  - 1688's page structure may change, so adjust the selectors to match the current pages
  - Use your browser's developer tools to inspect the latest HTML structure
- Legal compliance:
  - Make sure your use of the crawler complies with the site's terms of use
  - Do not send requests too frequently, to avoid putting a burden on the site
- Data storage:
  - The code stores data in MySQL; you can switch to SQLite or another database as needed (a SQLite variant is sketched at the end of this post)
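If a single proxy is not enough, a simple rotation over a pool of proxies can be dropped into fetch_page. A minimal sketch, assuming you maintain your own list of proxy URLs (the addresses below are placeholders, not real proxies):

import random

# Placeholder proxy pool -- fill in with proxies you actually control or rent.
PROXY_POOL = [
    'http://111.111.111.111:8080',
    'http://222.222.222.222:8080',
]

def get_random_proxies():
    """Pick one proxy from the pool, in the mapping format requests expects."""
    proxy = random.choice(PROXY_POOL)
    return {'http': proxy, 'https': proxy}

# Inside fetch_page, instead of the fixed PROXIES dict:
# response = self.session.get(url, headers=headers, proxies=get_random_proxies(), timeout=10)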
This crawler provides a basic framework; you will likely need to adapt it to 1688's actual page structure, and the selectors in particular may need updating.
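As one example of swapping the storage layer, here is a minimal sketch of an SQLite-based equivalent of save_to_database, using only the standard library (the save_to_sqlite name and the toys.db file name are arbitrary choices, not part of the original code; the column layout mirrors the MySQL table above):

import sqlite3

def save_to_sqlite(products, db_path='toys.db'):
    """Store the parsed product dicts in a local SQLite file instead of MySQL."""
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS toys (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            price TEXT,
            min_order TEXT,
            company TEXT,
            location TEXT,
            image_url TEXT,
            product_url TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.executemany(
        "INSERT INTO toys (title, price, min_order, company, location, image_url, product_url) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        [(p['title'], p['price'], p['min_order'], p['company'],
          p['location'], p['image_url'], p['product_url']) for p in products]
    )
    conn.commit()
    conn.close()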