|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests
- import re
def parse_page(url, html=None):
    """Extract poem records from one listing page, print them and return them.

    Args:
        url: Address of the page to download with ``requests.get``.
        html: Optional page markup. When given, it is parsed directly and
            ``url`` is not fetched.

    Returns:
        A list of dicts with the keys ``'title'``, ``'dynasty'``,
        ``'author'`` and ``'content'``, one per poem, in page order.

    Raises:
        requests.RequestException: If the download fails or times out
            (only when ``html`` is not supplied).
    """
    if html is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
        }
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.text

    # Every pattern is non-greedy and uses DOTALL so a match may span lines
    # but still stops at the first closing tag.
    titles = re.findall(
        r'<div\sclass="cont">.*?<b>(.*?)</b>', html, re.DOTALL)
    # First link inside the "source" paragraph.
    dynasties = re.findall(
        r'<p class="source">.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    # Second link inside the "source" paragraph.
    authors = re.findall(
        r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    content_tags = re.findall(
        r'<div class="contson".*?>(.*?)</div>', html, re.DOTALL)

    # Drop inline markup (e.g. <br />) that is left inside the poem body.
    contents = [re.sub(r'<.*?>', '', tag).strip() for tag in content_tags]

    # zip stops at the shortest list, so a poem missing any field is dropped
    # rather than raising.
    poems = [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]

    for poem in poems:
        print(poem)
        print('=' * 40)
    return poems
def main():
    """Run parse_page on listing pages 1 through 9 of gushiwen.org."""
    url_template = 'https://www.gushiwen.org/default_%s.aspx'
    # range(1, 10) covers page numbers 1..9 inclusive.
    for page in range(1, 10):
        parse_page(url_template % page)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
复制代码 |
|