import random

import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
#一页影评的爬取
'''
url = "https://movie.douban.com/subject/34780991/comments"
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
r = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(r,'lxml')
#prettify( )方法。这个方法可以把要解析的字符串以标准的缩进格式输出
#soup.b就是b标签b.string标签下文字#find_all找全标签内容,attrs选择属性
print(soup.title.string)
#print(soup.find_all('p'))
#print(soup.find_all(attrs={'class':'short'}))#方法1
#print(soup.find_all(class_='short'))#方法2
comments = soup.find_all(attrs={'class':'short'})
for i in comments:
    print(i.string)
'''
def get_ip_list(url, headers):
    """Scrape proxy addresses from a proxy-list web page.

    Assumes the page lays proxies out in an HTML table where each data
    row's 2nd <td> is the IP and the 3rd is the port -- TODO confirm
    against the actual site being scraped.

    Args:
        url: Address of the proxy-list page.
        headers: HTTP headers (e.g. User-Agent) for the request.

    Returns:
        list[str]: Entries formatted as "ip:port".
    """
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    rows = soup.find_all('tr')
    ip_list = []
    # rows[0] is the table header and has no usable <td> cells, so skip it.
    for row in rows[1:]:
        tds = row.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    """Pick one proxy at random, formatted for requests' ``proxies=``.

    Args:
        ip_list: Sequence of "ip:port" strings (as from get_ip_list).

    Returns:
        dict: ``{'http': 'http://ip:port'}`` for a randomly chosen entry.

    Raises:
        IndexError: If ip_list is empty (random.choice on an empty sequence).
    """
    proxy_list = ['http://' + ip for ip in ip_list]
    return {'http': random.choice(proxy_list)}

if __name__ == '__main__':
    # Candidate proxies and request headers are loop-invariant: build them
    # once instead of recreating them on every page fetch (original rebuilt
    # them inside the loop body).
    proxy_list = [
        {'http': 'http://111.230.129.54:3128'},
        {'http': 'http://121.199.6.124:8080'},
        {'http': 'http://118.178.227.171:80'},
        {'http': 'http://124.205.155.154:9090'},
        {'http': 'http://39.108.57.218:80'}
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
    # Douban pages comments 20 at a time via the `start` query parameter.
    for i in range(0, 101, 20):
        url = 'https://movie.douban.com/subject/34780991/comments?start=' + str(i) + '&limit=20&status=P&sort=new_score'
        try:
            proxy = random.choice(proxy_list)
            r = requests.get(url=url, headers=headers, timeout=(2, 3), proxies=proxy).text
            soup = BeautifulSoup(r, 'lxml')
            comments = soup.find_all(attrs={'class': 'short'})
            # Open the output file once per page instead of once per comment.
            with open('影评.txt', 'a', encoding='utf-8') as f:
                for j in comments:
                    comment = j.string
                    # .string is None for tags with nested markup; writing
                    # None would raise TypeError (silently eaten before).
                    if comment is None:
                        continue
                    print(comment)
                    f.write(comment)
        # Was a bare `except:` that also swallowed programming errors and
        # KeyboardInterrupt; only network/HTTP failures are expected here.
        except requests.RequestException:
            print('出错')


# --- Word-cloud generation code ---

# coding=utf-8

# Chinese word segmentation support.

import jieba
from os import path  # used to resolve file paths relative to this script
jieba.setLogLevel(jieba.logging.INFO)
# Word-cloud image dependencies.
from PIL import Image
import numpy as  np
import matplotlib.pyplot as plt
# Word-cloud generation tool.
from wordcloud import WordCloud,ImageColorGenerator
# Font handling, needed so Chinese characters render correctly.
import matplotlib.font_manager as fm

# Background/mask image that shapes and colours the cloud.
bg=np.array(Image.open("china.jpg"))

# Directory containing this script.
d=path.dirname(__file__)
# Path to the stop-word list (one word per line -- see jiebaclearText).
stopwords_path='stopwords.txt'
# Add custom words to jieba's dictionary here if needed, e.g.:
# jieba.add_word("叶文洁")

# Text file to analyse (the comments scraped above).
text_path="影评.txt"
# Read the whole text as UTF-8.
text=open(path.join(d,text_path),encoding="utf8").read()
#定义个函数式用于分词
def jiebaclearText(text):
    """Segment Chinese *text* with jieba and remove stop words.

    Reads the stop-word list from the module-level ``stopwords_path``
    file (one word per line, UTF-8).

    Args:
        text: Raw text to segment.

    Returns:
        str: Surviving words (length > 1 after stripping, not in the
        stop-word list), joined by single spaces -- the input format
        WordCloud.generate expects.
    """
    # Precise-mode segmentation (cut_all=False avoids overlapping cuts).
    seg_list = jieba.cut(text, cut_all=False)
    # `with` guarantees the stop-word file is closed; a set gives O(1)
    # membership tests instead of scanning a list per word.
    with open(stopwords_path, encoding="utf8") as f_stop:
        stopwords = set(f_stop.read().split("\n"))
    # BUG FIX: the original tested `not(myword.split()) in f_stop_seg_list`,
    # which parses as `(myword.split()) not in ...` and compares a *list*
    # against the stop-word strings -- always true, so stop words were
    # never actually filtered. Compare the stripped word itself.
    mywordList = []
    for myword in seg_list:
        word = myword.strip()
        if word not in stopwords and len(word) > 1:
            mywordList.append(word)
    return ' '.join(mywordList)
# Clean and segment the scraped text before feeding it to WordCloud.
text1=jiebaclearText(text)
# Build the word cloud from the space-separated words.
wc=WordCloud(
        background_color="white", # background colour (default is black)
        width=990,              # image width in pixels
        height=440,              # image height in pixels
        margin=10,               # margin around the rendered words

    max_font_size=50,
    random_state=30,
    font_path='C:/Windows/Fonts/simkai.ttf'   # Windows system font that can render Chinese
    ).generate(text1)
# Font for matplotlib-rendered text (must also support Chinese).
my_font=fm.FontProperties(fname='C:/Windows/Fonts/simkai.ttf')
# Colour generator derived from the background image's colours.
image_colors=ImageColorGenerator(bg)
# Draw the cloud.
plt.imshow(wc)
# Hide the axes on the figure.
plt.axis("off")
# Save the finished cloud image to disk.
wc.to_file("wordcloud.png")
# NOTE(review): removed stray blog-platform footer text ("Logo", promo
# lines) that was pasted after the code and made this file a syntax error.