BeautifulSoup查找、选择、删除标签,获取标签属性、文本等
摘要:创建对象:`from bs4 import BeautifulSoup`,`soup = BeautifulSoup(html, features="html.parser")`。1. 查找:根据标签查找 `imgs = soup.find_all("img")`;根据属性查找 `imgs = soup.find_all("img", attrs={"class": "avatar"})`;根据样式查找(支持正则)……
·
创建一个 BeautifulSoup 对象
import re

from bs4 import BeautifulSoup
# Parse the HTML string with the standard-library-backed "html.parser" builder.
soup = BeautifulSoup(markup=html, features="html.parser")
1. 查找
# Find by tag name.
imgs = soup.find_all("img")
# Find by attribute value.
imgs = soup.find_all("img", attrs={"class": "avatar"})
# Find by inline style (attribute filters accept a compiled regex).
# The pattern is searched anywhere in the attribute value, so no leading or
# trailing ".*" is needed; optional whitespace and case differences
# ("display: none", "DISPLAY:NONE") are tolerated.
tabs = soup.find_all(style=re.compile(r"display\s*:\s*none", re.IGNORECASE))
2. 删除标签
# Remove every tag whose inline style hides it (display:none).
# find_all() returns a list, so decomposing tags while looping is safe.
hidden_style = re.compile(r"display\s*:\s*none", re.IGNORECASE)
for hidden_tag in soup.find_all(style=hidden_style):
    # decompose() removes the tag from the tree and destroys it and its contents.
    hidden_tag.decompose()
3. 获取标签的属性
# Read the "src" attribute of every <img> tag.
imgs = soup.find_all("img")
for img in imgs:
    # Tag.get() returns None instead of raising when the attribute is absent.
    url = img.get("src")
    print(url)
4. 获取标签内的文本
# Extract all text: pieces are joined with a single space and each piece is
# stripped of surrounding whitespace.
soup.get_text(" ", strip=True)
5. 根据css选择器选择标签
# CSS selector: every <a> tag that carries an href attribute.
soup.select("a[href]")
6. 正则匹配标签
# Select all heading tags (h1-h6) and replace their contents with plain text,
# dropping any nested markup inside the heading.
# The pattern is anchored at both ends so only the six heading names match,
# not longer tag names that merely start with "h1".."h6".
heading_name = re.compile(r"^h[1-6]$")
for heading in soup.find_all(heading_name):
    # Assigning .string replaces the tag's children with that single string.
    heading.string = heading.get_text()
7. 去除空标签
# Remove empty tags (e.g. <p></p>): a tag is dropped when its text is empty and
# neither the tag itself nor any of its descendants is one of KEEP_TAGS.
# NOTE(review): the original comment listed "hr" as an exception and gave
# <p><br></p> as something to delete, but the code protects "br" (not "hr"),
# so <p><br></p> is kept and a bare <hr> is removed. Behavior is left
# unchanged here; confirm which one is intended.
# clean_content is not defined in this snippet; presumably the HTML string
# produced by earlier cleanup steps — confirm.
KEEP_TAGS = ("img", "video", "br")
soup = BeautifulSoup(clean_content, features="html.parser")
empty_tags = soup.find_all(
    lambda tag: not tag.get_text() and tag.name not in KEEP_TAGS
)
for tag in empty_tags:
    # Keep the tag if anything nested inside it is a protected tag.
    if not any(child.name in KEEP_TAGS for child in tag.descendants):
        tag.decompose()
更多推荐


所有评论(0)