马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
from bs4 import BeautifulSoup
import json
def get_html_text(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, invalid URL, or an HTTP error
        status). The failure is printed rather than raised.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they take the same
        # failure path as network errors.
        r.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return ''
    # r.text is decoded with the encoding that requests inferred for the
    # response; it is not overridden here.
    return r.text
def parse_url(html):
    """Extract the table of contents from *html* and save it as JSON.

    Every element with class ``mulu`` that contains an ``<h2>`` becomes one
    entry of the form ``{'title': ..., 'content': [...]}``, where the list
    holds one ``{'herf': ..., 'box_title': ...}`` dict per ``<a>`` found
    inside that element's first ``box``-class child. Elements without an
    ``<h2>`` are skipped. The result is written to ``novel.json`` in the
    current working directory, replacing any existing file.

    Args:
        html: Markup of the page to parse.

    Returns:
        None. The only output is the ``novel.json`` file.
    """
    soup = BeautifulSoup(html, 'lxml')
    content = []
    for mulu in soup.find_all(class_='mulu'):
        h2 = mulu.find('h2')
        if h2 is None:
            continue
        # find() returns None when there is no match; keep the section with
        # an empty chapter list instead of failing on a missing box.
        box = mulu.find(class_='box')
        anchors = box.find_all('a') if box is not None else []
        chapters = [
            {'herf': a.get('href'), 'box_title': a.get('title')}
            for a in anchors
        ]
        content.append({'title': h2.string, 'content': chapters})
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be fixed explicitly rather than left to the platform
    # default.
    with open('novel.json', 'w', encoding='utf-8') as f:
        json.dump(content, fp=f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Script entry point: download the index page and hand its markup
    # straight to the parser, which writes the resulting JSON file.
    parse_url(get_html_text('http://seputu.com/'))
|