|
|
import io
import formatter
import html.parser
import http.client
import os
import sys
import urllib.parse, urllib.request


class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filepath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filepath = os.path.join(filepath, default)
        linkdir = os.path.dirname(filepath)
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):
                os.unlink(linkdir)
            os.makedirs(linkdir)
        return url, filepath

    def download(self):
        'Download URL to specific named file'
        try:
            retval = urllib.request.urlretrieve(self.url, self.file)
        except (IOError, http.client.InvalidURL) as e:
            retval = (('***ERROR: BAD URL "%s": %s ' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        with open(self.file, 'r') as f:
            data = f.read()
        parser = html.parser.HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(io.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist


class Crawler(object):
    count = 0

    def __init__(self, url):
        self.q = [url]
        self.seen = set()
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])

    def get_page(self, url, media=False):
        'Download page and parse links, add to queue if nec'
        r = Retriever(url)
        fname = r.download()[0]
        if fname[0] == '*':
            print(fname, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', fname)
        self.seen.add(url)
        ftype = os.path.splitext(fname)[1]
        if ftype not in ('.htm', '.html'):
            return

        for link in r.parse_links():
            if link.startswith('mailto'):
                print('... discarded, mailto link')
                continue
            # skip mailto links

            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print('... discarded, media file')
                    continue
            # skip media files

            if not link.startswith('http://'):
                link = urllib.parse.urljoin(url, link)
            print('*', link, )
            # normalize the URL and print it

            if link not in self.seen:
                if self.dom not in link:
                    print('... discarded, not in domain')
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print('... new, added to Q')
                    else:
                        print('... discarded, already in Q')
            else:
                print('... discarded, already processed')

    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)


def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = 'http://www.null.com/home/index.html'
        except KeyboardInterrupt as EOFError:
            url = ''
    if not url:
        return
    if not url.startswith('http://') and \
            not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
This is the crawler example from Core Python Programming, 3rd edition. I basically typed it in as printed, only converting it from Python 2 to Python 3, and it fails with the error below:
( 1 )
URL: http://www.null.com/home/index.html
FILE: www.null.com/home/index.html
Traceback (most recent call last):
  File "C:/Users/77409/PycharmProjects/untitled/crawl.py", line 122, in <module>
    main()
  File "C:/Users/77409/PycharmProjects/untitled/crawl.py", line 119, in main
    robot.go()
  File "C:/Users/77409/PycharmProjects/untitled/crawl.py", line 103, in go
    self.get_page(url, media)
  File "C:/Users/77409/PycharmProjects/untitled/crawl.py", line 72, in get_page
    for link in r.parse_links():
  File "C:/Users/77409/PycharmProjects/untitled/crawl.py", line 41, in parse_links
    formatter.DumbWriter(io.StringIO())))
TypeError: __init__() takes 1 positional argument but 2 were given
What I don't understand is how the number of arguments to __init__ has anything to do with line 41 here, and what exactly is being returned, and why the count is wrong. Any help would be appreciated.
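For reference, my guess at the cause: the Python 2 version of this example uses htmllib.HTMLParser, whose constructor takes a formatter object and which collects links into an anchorlist attribute. Python 3's html.parser.HTMLParser.__init__ takes no positional arguments at all, so the "1" in the message is just self, and the AbstractFormatter passed on line 41 is the unexpected second argument; the Python 3 class also has no anchorlist. Below is a minimal sketch of a Python 3 link collector (the LinkParser name is made up here), assuming the goal is only to gather href values from <a> tags:

from html.parser import HTMLParser

class LinkParser(HTMLParser):
    'Collect href values from <a> tags, mimicking the old anchorlist'
    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag's attributes
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

With something like this, parse_links could create a LinkParser, feed the page data into it, and return parser.anchorlist, with no need for the formatter module (deprecated in Python 3 and removed in 3.10) or io.StringIO.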
|