创建 BeautifulSoup 对象

import re

from bs4 import BeautifulSoup
# Parse the markup held in `html` (defined elsewhere) using the "html.parser"
# backend; every later snippet operates on this `soup` object.
soup = BeautifulSoup(html, features="html.parser")
1. 查找
# Find every tag with a given name.
imgs = soup.find_all("img")
# Narrow the search by attribute value (here: class="avatar").
imgs = soup.find_all("img", attrs={"class": "avatar"})
# Filter on the inline style with a compiled regex. bs4 applies the pattern
# with search(), so no leading/trailing ".*" is needed; "\s*" also catches the
# common "display: none" spelling.
tabs = soup.find_all(style=re.compile(r"display:\s*none"))
2. 删除标签
# Remove every tag whose inline style hides it (display:none).
# Uses the module-level `soup` like the other snippets; there is no `self`
# at this scope. decompose() removes the tag and its contents from the tree.
for hidden in soup.find_all(style=re.compile(r"display:\s*none")):
    hidden.decompose()
3. 获取标签的属性
# Print the `src` attribute of every <img> tag.
# Tag.get() yields None for a missing attribute instead of raising.
for image in soup.find_all("img"):
    print(image.get("src"))
4. 获取标签内的文本
# Extract the document's text: pieces are joined with a single space
# (separator=" ") and each piece is stripped of surrounding whitespace
# (strip=True). The result is returned, not stored.
soup.get_text(separator=" ", strip=True)
5. 根据css选择器选择标签
 # Select every <a> tag that carries an href attribute (CSS attribute selector).
soup.select('a[href]')
6. 正则匹配标签
# Flatten every heading (h1-h6) to plain text: assigning to .string replaces
# the tag's children (nested tags included) with a single text node.
# The pattern is anchored with "$" so only the six heading names match, not
# any tag whose name merely starts with h1..h6.
for heading in soup.find_all(re.compile(r"^h[1-6]$")):
    heading.string = heading.get_text()
7. 去除空标签
# Remove tags that contain no text (e.g. <p></p> or <p><br></p>), while
# keeping <img>, <video> and <hr>, which are meaningful without any text.
# A bare <br> is never removed on its own, but it does not protect its
# otherwise-empty parent from removal.
soup = BeautifulSoup(clean_content, features="html.parser")
keep_tags = {"img", "video", "hr"}
for tag in soup.find_all(
    lambda t: not t.get_text() and t.name not in keep_tags and t.name != "br"
):
    # A textless container must survive if it wraps one of the kept tags.
    if not any(child.name in keep_tags for child in tag.descendants):
        tag.decompose()
Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐