Scraping Douban Movie Reviews with BeautifulSoup and Building a Word Cloud
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
# Scraping a single page of reviews
'''
url = "https://movie.douban.com/subject/34780991/comments"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
r = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(r, 'lxml')
# prettify() outputs the parsed document with standard indentation
# soup.b is the <b> tag, b.string is the text inside it; find_all returns every match, attrs filters by attribute
print(soup.title.string)
# print(soup.find_all('p'))
# print(soup.find_all(attrs={'class': 'short'}))  # approach 1
# print(soup.find_all(class_='short'))            # approach 2
comments = soup.find_all(attrs={'class': 'short'})
for i in comments:
    print(i.string)
'''
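As a side note on the selection methods shown in the block above, the two find_all variants and a CSS selector all return the same elements carrying the class 'short'; a minimal sketch, assuming soup has been built as in that block:
# short_by_attrs = soup.find_all(attrs={'class': 'short'})  # approach 1: attribute dict
# short_by_kwarg = soup.find_all(class_='short')            # approach 2: class_ keyword
# short_by_css = soup.select('.short')                      # approach 3: CSS selector, same elements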
def get_ip_list(url, headers):
    # Fetch a page that lists proxies in an HTML table and collect "ip:port" strings
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):          # skip the header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip(ip_list):
    # Turn the "ip:port" strings into proxy URLs and pick one at random
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
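get_ip_list and get_random_ip are defined but never called in the main block below, which falls back to a hard-coded proxy_list; a minimal sketch of how they could be wired together, where the proxy-list URL is a placeholder for any site that publishes proxies in an HTML table with the IP in the second <td> and the port in the third:
# proxy_site_url = 'https://example.com/free-proxies'    # placeholder, not a real endpoint
# headers = {'User-Agent': 'Mozilla/5.0 ...'}
# ip_list = get_ip_list(proxy_site_url, headers)          # e.g. ['1.2.3.4:8080', ...]
# proxies = get_random_ip(ip_list)                        # e.g. {'http': 'http://1.2.3.4:8080'}
# requests.get('https://movie.douban.com/subject/34780991/comments', headers=headers, proxies=proxies, timeout=(2, 3))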
if __name__ == '__main__':
    # Walk the paginated review list: start=0, 20, ..., 100 (20 reviews per page)
    for i in range(0, 101, 20):
        url = 'https://movie.douban.com/subject/34780991/comments?start=' + str(i) + '&limit=20&status=P&sort=new_score'
        try:
            proxy_list = [
                {'http': 'http://111.230.129.54:3128'},
                {'http': 'http://121.199.6.124:8080'},
                {'http': 'http://118.178.227.171:80'},
                {'http': 'http://124.205.155.154:9090'},
                {'http': 'http://39.108.57.218:80'}
            ]
            proxy = random.choice(proxy_list)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
            r = requests.get(url=url, headers=headers, timeout=(2, 3), proxies=proxy).text
            soup = BeautifulSoup(r, 'lxml')
            comments = soup.find_all(attrs={'class': 'short'})
            # print(comments)
            for j in comments:
                comment = j.string
                print(comment)
                if comment:                      # skip entries whose text could not be extracted
                    with open('影评.txt', 'a', encoding='utf-8') as f:
                        f.write(comment + '\n')  # one review per line
        except Exception as e:
            print('Request failed:', e)
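One note on the proxy dictionaries above: the Douban URL is https, and requests only routes a request through a proxy whose key matches the URL scheme, so a dict that contains only an 'http' key is silently ignored for these requests. A minimal sketch of an entry covering both schemes (reusing one of the sample addresses above, which is not a verified endpoint):
# proxy = {
#     'http':  'http://111.230.129.54:3128',
#     'https': 'http://111.230.129.54:3128',
# }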
# Word cloud construction
# coding=utf-8
# jieba handles Chinese segmentation; custom tokens can be added later with jieba.add_word
import jieba
from os import path  # used to resolve file paths
jieba.setLogLevel(jieba.logging.INFO)
# word cloud dependencies
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
# word cloud generator
from wordcloud import WordCloud, ImageColorGenerator
# needed for rendering Chinese characters
import matplotlib.font_manager as fm
# background image
bg = np.array(Image.open("china.jpg"))
# directory of the current script
d = path.dirname(__file__)
# stopword list
stopwords_path = 'stopwords.txt'
# add custom tokens to the segmenter if needed
# jieba.add_word("叶文洁")
# text to analyze
text_path = "影评.txt"
# read the text to analyze, with the encoding specified
text = open(path.join(d, text_path), encoding="utf8").read()
# a function that segments the text and removes stopwords
def jiebaclearText(text):
    # list that will hold the tokens kept after stopword removal
    mywordList = []
    # segment with jieba's default (accurate) mode
    seg_list = jieba.cut(text, cut_all=False)
    # join the generator's output with '/'
    listStr = '/'.join(seg_list)
    # open the stopword list
    f_stop = open(stopwords_path, encoding="utf8")
    # read it
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()  # release the file handle
    # one stopword per line
    f_stop_seg_list = f_stop_text.split("\n")
    # walk the tokens, dropping stopwords and single-character tokens
    for myword in listStr.split('/'):
        if myword.strip() not in f_stop_seg_list and len(myword.strip()) > 1:
            mywordList.append(myword)
    return ' '.join(mywordList)
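A quick sanity check of the segmentation step, using the example sentence from jieba's own README (this only exercises jieba.cut; jiebaclearText additionally drops anything listed in stopwords.txt as well as single-character tokens):
# print('/'.join(jieba.cut('我来到北京清华大学', cut_all=False)))
# expected output: 我/来到/北京/清华大学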
text1 = jiebaclearText(text)
# build the word cloud
wc = WordCloud(
    background_color="white",  # white background (default is black)
    width=990,                 # image width
    height=440,                # image height
    margin=10,                 # margin around the words
    max_font_size=50,
    random_state=30,
    font_path='C:/Windows/Fonts/simkai.ttf'  # a Chinese font shipped with Windows, needed to render Chinese
).generate(text1)
# font properties for any matplotlib labels
my_font = fm.FontProperties(fname='C:/Windows/Fonts/simkai.ttf')
# color generator based on the background image's colors
image_colors = ImageColorGenerator(bg)
# draw the word cloud
plt.imshow(wc)
# hide the axes
plt.axis("off")
# display the figure
plt.show()
# save the word cloud to disk
wc.to_file("wordcloud.png")