# Sample HTML markup; the examples in this file parse it with BeautifulSoup.
# The literal previously opened with five quote characters, which left two
# stray apostrophes at the very start of the markup.
html = ''' <html><head><title>The Domouse's story</title></head> <body> <p class="title"name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a> <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a> and they lived at bottom of a well.</p> <p class="story">...</p> '''
# Sample HTML markup, assembled from adjacent string literals (one per
# element) so each piece of the document is readable on its own line.
html = (
    " <html><head><title>The Domouse's story</title></head>"
    " <body>"
    ' <p class="title"name="dromouse"><b>The Dormouse\'s story</b></p>'
    ' <p class="story">Once upon a time there were little sisters;and their names were'
    ' <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>'
    ' <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and'
    ' <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>'
    ' and they lived at bottom of a well.</p>'
    ' <p class="story">...</p> '
)
# Example: read the name of a tag.
# The three statements were fused onto a single line (a SyntaxError); they are
# split back into separate statements here.
from bs4 import BeautifulSoup

html = ''' <html><head><title>The Domouse's story</title></head> <body> <p class="title"name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a> <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a> and they lived at bottom of a well.</p> <p class="story">...</p> '''
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)  # print the tag's name
# (line-number gutter from the original listing removed)
# Get attributes
# The import used to sit behind the "#" of the heading comment and the
# statements below were fused onto one line; both are restored here.
from bs4 import BeautifulSoup

html = ''' <html><head><title>The Domouse's story</title></head> <body> <p class="title"name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a> <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a> and they lived at bottom of a well.</p> <p class="story">...</p> '''
soup = BeautifulSoup(html, 'lxml')
# Both forms below retrieve the tag's attribute.
print(soup.p.attrs['name'])
print(soup.p['name'])
# (line-number gutter from the original listing removed)
# Get content
# The import used to sit behind the "#" of the heading comment and the
# statements below were fused onto one line; both are restored here.
from bs4 import BeautifulSoup

html = ''' <html><head><title>The Domouse's story</title></head> <body> <p class="title"name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a> <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a> and they lived at bottom of a well.</p> <p class="story">...</p> '''
soup = BeautifulSoup(html, 'lxml')
# Print the text held by the first <p> tag.
print(soup.p.string)
# 6、嵌套选择 (6. Nested selection)
# (line-number gutter from the original listing removed)
from bs4 import BeautifulSoup
# Example: nested selection. Tag names are chained to walk from <head> down
# to <title>. The statements were fused onto one line (a SyntaxError) and are
# split back into separate statements here.
html = ''' <html><head><title>The Domouse's story</title></head> <body> <p class="title"name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a> <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a> and they lived at bottom of a well.</p> <p class="story">...</p> '''
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string)
# Example: direct children of the first <p> tag.
# The statements were fused onto one line (a SyntaxError) and are split back
# into separate statements here.
html = ''' <html> <head> <title>The Domouse's story</title> </head> <body> <p class="story"> Once upon a time there were little sisters;and their names were <a href="http://example.com/elsie" class="sister"id="link1"> <span>Elsle</span> </a> <a hred="http://example.com/lacle"class="sister" id="link2">Lacle</a> and <a hred="http://example.com/tilie"class="sister" id="link3">Tillie</a> and they lived at bottom of a well. </p> <p class="story">...</p> '''
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents)  # child nodes are returned as a list
# The difference from .contents: .children is actually an iterator, so a loop
# is needed to pull the items out. The loop used to sit inside the trailing
# comment and never ran; it is restored as real code here.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
# Get all descendant nodes (children, grandchildren, ...); this is also an
# iterator. The loop used to sit inside the trailing comment and never ran;
# it is restored as real code here.
print(soup.p.descendants)
for idx, descendant in enumerate(soup.p.descendants):
    print(idx, descendant)
# Sample HTML markup, assembled from adjacent string literals so each element
# of the document is readable on its own line.
html = (
    " <html> <head> <title>The Domouse's story</title> </head> <body>"
    ' <p class="story"> Once upon a time there were little sisters;and their names were'
    ' <a href="http://example.com/elsie" class="sister"id="link1"> <span>Elsle</span> </a>'
    ' <a hred="http://example.com/lacle"class="sister" id="link2">Lacle</a> and'
    ' <a hred="http://example.com/tilie"class="sister" id="link3">Tillie</a>'
    ' and they lived at bottom of a well. </p>'
    ' <p class="story">...</p> '
)
# Sample HTML markup, assembled from adjacent string literals so each element
# of the document is readable on its own line.
html = (
    " <html> <head> <title>The Domouse's story</title> </head> <body>"
    ' <p class="story"> Once upon a time there were little sisters;and their names were'
    ' <a href="http://example.com/elsie" class="sister"id="link1"> <span>Elsle</span> </a>'
    ' <a hred="http://example.com/lacle"class="sister" id="link2">Lacle</a> and'
    ' <a hred="http://example.com/tilie"class="sister" id="link3">Tillie</a>'
    ' and they lived at bottom of a well. </p>'
    ' <p class="story">...</p> '
)