|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 wongyusing 于 2018-3-12 17:49 编辑
代码如下:
- #!/usr/bin/env python
- # encoding: utf-8
- # Sing
- from bs4 import BeautifulSoup as Soup
- import re
- import urllib
- import requests
- def url_open(url): # 网页打开函数,以防被禁
- headers = {
- 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36"}
- req = requests.get(url, headers=headers)
- req.encoding = 'utf-8'
- return req.text
- def get_book_list(url):#获取书籍的url并提取出修订版的url后缀列表
- ret = url_open(url)
- reg = r'<p class="title"><a href="(.*?)">.*?</a></p>'
- req = re.findall(reg, ret)
- return req[0:15]
- def get_chapter(url_char):#获取章节后缀和书名
- ret = url_open(url_char)
- reg = r'<li><a href="(.*?)">.*?</a></li>'
- char = re.findall(reg, ret)#章节后缀
- reg = r'<h1 class="title"><span>(.*?)</span>.*?</h1>'
- name = re.findall(reg, ret)#书名
- return char,name
- def getContent(html):#获取章节名和章节内容
- ret = url_open(html)
- soup = Soup(ret,'html.parser')
- title = soup.find('div', class_='mbtitle').contents[-1].string or soup.find('h1', class_='title').contents[-1].string #获取标签 #获取标签
- #上面是获取章节名这里总是报错误,说无法获取内容
- content = soup.find('div',class_='vcon').strings #获取内容
-
- return title,content
- def writeFile(title,content,name):
- with open(name +'.txt','a',encoding='utf-8')as ganrong:
- #设置文件编码,避免写入时乱码
- ganrong.write('\n'+title+'\n')
- for line in content:
- ganrong.write(line)
- print('%s was writed ..'%title,name)
- def getganrong(): #主函数
- url = 'http://www.jinyongwang.com/book/'
- baseurl ='http://www.jinyongwang.com'
- #ppp = url_open(url)
- #print(ppp)
- book_list = get_book_list(url)
- #print(book_list)
- for i in book_list:
- url_char = baseurl + i
- char, name = get_chapter(url_char)
- for page in char:
- html = baseurl + page
- title, content = getContent(html)
- writeFile(title, content,name[0])
- if __name__ == "__main__":
- getganrong()
复制代码
问题如下:
1.小说文件爬取下来后,文本格式不好看,不能分好段落,全部连成一坨,求解决方案。
2.如果我想用scrapy框架的话,该如何写?我根据网上教程去写,运行的爬虫全部都报“DNS”错误,有好心人给个可以运行的scrapy爬虫让我测试和参考一下吗??
3.在获取标签的BS4表达式中总是报错误,具体位置在37行代码中。
错误信息如下:
第05章 老鼠汤 was writed .. 连城诀小说
第06章 血刀老祖 was writed .. 连城诀小说
Traceback (most recent call last):
File "/home/sing/桌面/untitled1/bbbbb.py", line 74, in <module>
getganrong()
File "/home/sing/桌面/untitled1/bbbbb.py", line 64, in getganrong
title, content = getContent(html)
File "/home/sing/桌面/untitled1/bbbbb.py", line 37, in getContent
title = soup.find('div', class_='mbtitle').contents[-1].string or soup.find('h1', class_='title').contents[-1].string #获取标签 #获取标签
AttributeError: 'NoneType' object has no attribute 'contents'
本帖最后由 纳兰小寒 于 2018-3-12 23:37 编辑
写入每一行数据的时候,末尾加个换行符 - def writeFile(title,content,name):
- with open(name +'.txt','a',encoding='utf-8')as ganrong:
- #设置文件编码,避免写入时乱码
- ganrong.write('\n'+title+'\n')#加个换行符
- for line in content:
- ganrong.write(line+'\n')
- print('%s was writed ..'%title,name)
复制代码
抓到了你说的第二个问题,爬虫返回的是“<title>500 Internal Server Error</title>”
换IP或者换个浏览器头就好了,再或者在采集的时候加个间隔时间
- def url_open(url): # 网页打开函数,以防被禁
- headers = {
- 'User-Agent': rand_ua()}
- req = requests.get(url, headers=headers)
- req.encoding = 'utf-8'
- return req.text
- def rand_ua():
- headers = ["Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36"]
- headers.append('Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0')
- headers.append('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
- headers.append('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')
- headers.append('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/57.0.2987.98 Chrome/57.0.2987.98 Safari/537.36')
- return headers[rd.randint(0,4)]
复制代码
|
|