Uh, I followed a book and wrote a crawler for Baidu Baike. Mine never managed to crawl anything, but I found a version on GitHub that was said to work, downloaded it out of curiosity, and sure enough his code runs fine...
Looking through it, when he crawls Baidu Baike he just calls urllib.request.urlopen(url) without even adding a header, yet it works.
My version uses the requests library instead, and it fails with requests.exceptions.TooManyRedirects: Exceeded 30 redirects.
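To make the comparison concrete, the two calls boil down to roughly this (just a sketch; the URL is the root URL from my code below and the headers are the ones my HtmlDownloader sends):

import urllib.request
import requests

url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'

# What the GitHub version does: plain urlopen with no headers at all -- this works for him.
html = urllib.request.urlopen(url).read().decode('utf-8')
print(len(html))

# Roughly what my HtmlDownloader below does, with redirects left enabled --
# this is the kind of call that raises TooManyRedirects for me.
headers = {
    'Host': 'baike.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
}
r = requests.get(url, headers=headers)
print(r.status_code, len(r.text))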
Can anyone explain what is going on?
Uh, here is the code:
import re
import codecs
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup


class UrlManager():
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def has_new_url(self):  # are there URLs still waiting to be crawled?
        return self.new_url_size() != 0

    def get_new_url(self):  # pop one un-crawled URL and mark it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):  # add a single new URL
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):  # add a batch of new URLs
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)


class HtmlDownloader():  # downloads a page and returns its HTML
    def download(self, url):
        if url is None:
            return None
        headers = {
            'Host': 'baike.baidu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
            'Upgrade-Insecure-Requests': '1'
        }
        r = requests.get(url, headers=headers, allow_redirects=False)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None


class HtmlParser():
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):  # extract entry URLs from the page
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):  # extract the title and summary
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        data['summary'] = summary.get_text()
        return data


class DataOutput():
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):  # dump collected entries into a simple HTML table
        fout = codecs.open('baike.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        self.datas = []
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()


class SpiderMan():
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def craw(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 1):
            # try:
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)
            self.output.store_data(data)
            print('crawled %s links so far' % self.manager.old_url_size())
            # except Exception as e:
            #     print('crawl failed', e)
        self.output.output_html()


if __name__ == '__main__':
    spider_man = SpiderMan()
    spider_man.craw('https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')