|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests
- from bs4 import BeautifulSoup
- import json
def get_html_text(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body, or an empty string when the request
        fails (the error is printed).
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never answers.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Covers connection errors, timeouts, malformed URLs and the
        # HTTPError raised by raise_for_status().
        print(e)
        return ''
    return response.text
def parse_url(html):
    """Extract the section/link index from *html* and save it as JSON.

    Each element with class ``mulu`` that contains an ``<h2>`` produces one
    entry ``{'title': <h2 text>, 'content': [links...]}``, where every link
    found inside that element's ``box`` child is recorded as
    ``{'herf': <href attribute>, 'box_title': <title attribute>}``.
    The resulting list is written to ``novel.json`` in the current
    working directory.

    Args:
        html: HTML markup to parse.

    Returns:
        None. The result is written to disk.
    """
    soup = BeautifulSoup(html, 'lxml')
    content = []
    for mulu in soup.find_all(class_='mulu'):
        h2 = mulu.find('h2')
        if h2 is None:
            # Sections without a heading are skipped.
            continue
        box = mulu.find(class_='box')
        # find() returns None when nothing matches; treat a missing box as
        # "no links" instead of failing with AttributeError.
        anchors = box.find_all('a') if box is not None else []
        # NOTE: the 'herf' key spelling is kept as-is because it is part of
        # the emitted JSON.
        links = [
            {'herf': a.get('href'), 'box_title': a.get('title')}
            for a in anchors
        ]
        content.append({'title': h2.string, 'content': links})
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned rather than left to the platform default.
    with open('novel.json', 'w', encoding='utf-8') as f:
        json.dump(content, fp=f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Script entry point: fetch the index page and hand the markup
    # straight to the parser.
    parse_url(get_html_text('http://seputu.com/'))
复制代码 |
|