Scraping Douban Movie Reviews with BeautifulSoup and Building a Word Cloud
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
# Scraping a single page of reviews
'''
url = "https://movie.douban.com/subject/34780991/comments"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
r = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(r, 'lxml')
# prettify() outputs the parsed document with standard indentation
# soup.b is the <b> tag, b.string is the text inside it; find_all returns every match, attrs filters by attribute
print(soup.title.string)
# print(soup.find_all('p'))
# print(soup.find_all(attrs={'class': 'short'}))  # approach 1
# print(soup.find_all(class_='short'))            # approach 2
comments = soup.find_all(attrs={'class': 'short'})
for i in comments:
    print(i.string)
'''
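As a side note on the selection methods shown in the block above, the two find_all variants and a CSS selector all return the same elements carrying the class 'short'; a minimal sketch, assuming soup has been built as in that block:
# short_by_attrs = soup.find_all(attrs={'class': 'short'})  # approach 1: attribute dict
# short_by_kwarg = soup.find_all(class_='short')            # approach 2: class_ keyword
# short_by_css = soup.select('.short')                      # approach 3: CSS selector, same elements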
def get_ip_list(url, headers):
    # Fetch a page that lists proxies in an HTML table and collect "ip:port" strings
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):          # skip the header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip(ip_list):
    # Turn the "ip:port" strings into proxy URLs and pick one at random
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
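get_ip_list and get_random_ip are defined but never called in the main block below, which falls back to a hard-coded proxy_list; a minimal sketch of how they could be wired together, where the proxy-list URL is a placeholder for any site that publishes proxies in an HTML table with the IP in the second <td> and the port in the third:
# proxy_site_url = 'https://example.com/free-proxies'    # placeholder, not a real endpoint
# headers = {'User-Agent': 'Mozilla/5.0 ...'}
# ip_list = get_ip_list(proxy_site_url, headers)          # e.g. ['1.2.3.4:8080', ...]
# proxies = get_random_ip(ip_list)                        # e.g. {'http': 'http://1.2.3.4:8080'}
# requests.get('https://movie.douban.com/subject/34780991/comments', headers=headers, proxies=proxies, timeout=(2, 3))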
if __name__ == '__main__':
    # Walk the paginated review list: start=0, 20, ..., 100 (20 reviews per page)
    for i in range(0, 101, 20):
        url = 'https://movie.douban.com/subject/34780991/comments?start=' + str(i) + '&limit=20&status=P&sort=new_score'
        try:
            proxy_list = [
                {'http': 'http://111.230.129.54:3128'},
                {'http': 'http://121.199.6.124:8080'},
                {'http': 'http://118.178.227.171:80'},
                {'http': 'http://124.205.155.154:9090'},
                {'http': 'http://39.108.57.218:80'}
            ]
            proxy = random.choice(proxy_list)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
            r = requests.get(url=url, headers=headers, timeout=(2, 3), proxies=proxy).text
            soup = BeautifulSoup(r, 'lxml')
            comments = soup.find_all(attrs={'class': 'short'})
            # print(comments)
            for j in comments:
                comment = j.string
                print(comment)
                if comment:                      # skip entries whose text could not be extracted
                    with open('影评.txt', 'a', encoding='utf-8') as f:
                        f.write(comment + '\n')  # one review per line
        except Exception as e:
            print('Request failed:', e)
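One note on the proxy dictionaries above: the Douban URL is https, and requests only routes a request through a proxy whose key matches the URL scheme, so a dict that contains only an 'http' key is silently ignored for these requests. A minimal sketch of an entry covering both schemes (reusing one of the sample addresses above, which is not a verified endpoint):
# proxy = {
#     'http':  'http://111.230.129.54:3128',
#     'https': 'http://111.230.129.54:3128',
# }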
# Word cloud construction
# coding=utf-8
# jieba handles Chinese segmentation; custom tokens can be added later with jieba.add_word
import jieba
from os import path  # used to resolve file paths
jieba.setLogLevel(jieba.logging.INFO)
# word cloud dependencies
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
# word cloud generator
from wordcloud import WordCloud, ImageColorGenerator
# needed for rendering Chinese characters
import matplotlib.font_manager as fm
# background image
bg = np.array(Image.open("china.jpg"))
# directory of the current script
d = path.dirname(__file__)
# stopword list
stopwords_path = 'stopwords.txt'
# add custom tokens to the segmenter if needed
# jieba.add_word("叶文洁")
# text to analyze
text_path = "影评.txt"
# read the text to analyze, with the encoding specified
text = open(path.join(d, text_path), encoding="utf8").read()
# a function that segments the text and removes stopwords
def jiebaclearText(text):
    # list that will hold the tokens kept after stopword removal
    mywordList = []
    # segment with jieba's default (accurate) mode
    seg_list = jieba.cut(text, cut_all=False)
    # join the generator's output with '/'
    listStr = '/'.join(seg_list)
    # open the stopword list
    f_stop = open(stopwords_path, encoding="utf8")
    # read it
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()  # release the file handle
    # one stopword per line
    f_stop_seg_list = f_stop_text.split("\n")
    # walk the tokens, dropping stopwords and single-character tokens
    for myword in listStr.split('/'):
        if myword.strip() not in f_stop_seg_list and len(myword.strip()) > 1:
            mywordList.append(myword)
    return ' '.join(mywordList)
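A quick sanity check of the segmentation step, using the example sentence from jieba's own README (this only exercises jieba.cut; jiebaclearText additionally drops anything listed in stopwords.txt as well as single-character tokens):
# print('/'.join(jieba.cut('我来到北京清华大学', cut_all=False)))
# expected output: 我/来到/北京/清华大学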
text1 = jiebaclearText(text)
# build the word cloud
wc = WordCloud(
    background_color="white",  # white background (default is black)
    width=990,                 # image width
    height=440,                # image height
    margin=10,                 # margin around the words
    max_font_size=50,
    random_state=30,
    font_path='C:/Windows/Fonts/simkai.ttf'  # a Chinese font shipped with Windows, needed to render Chinese
).generate(text1)
# font properties for any matplotlib labels
my_font = fm.FontProperties(fname='C:/Windows/Fonts/simkai.ttf')
# color generator based on the background image's colors
image_colors = ImageColorGenerator(bg)
# draw the word cloud
plt.imshow(wc)
# hide the axes
plt.axis("off")
# display the figure
plt.show()
# save the word cloud to disk
wc.to_file("wordcloud.png")