|

楼主 |
发表于 2022-4-25 00:20:57
|
显示全部楼层
'''
0. 是定位符
1. 是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
2. 搜索的准确性,速度
3.
4. <class 'http.client.HTTPResponse'>
5. urllib.error.URLError
6.
7. Unicode
'''
import chardet as cha ##查询编码器
import urllib.request
class Urlread:
    """Fetch a web page and decode it with an automatically detected codec.

    After ``url_read`` succeeds the instance exposes:
      * ``html`` - the URL that was requested,
      * ``response`` - the raw body bytes,
      * ``encoding_method`` - the codec name used to decode the body.
    """

    def url_read(self, html=''):
        """Download the page at URL ``html`` and return its body as ``str``.

        Errors from ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) and ``UnicodeDecodeError`` from the final
        decode are propagated to the caller.
        """
        self.html = html
        # Use the response as a context manager so the connection is closed
        # as soon as the body has been read instead of being leaked.
        with urllib.request.urlopen(self.html) as reply:
            self.response = reply.read()
        self.load_encode()
        return self.response.decode(self.encoding_method)

    def load_encode(self):
        """Detect the encoding of ``self.response``; store and return its name."""
        detected = cha.detect(self.response)['encoding']
        if detected is None:
            # The detector reports no encoding for empty or undecidable
            # input; decoding with None would raise TypeError, so fall back
            # to UTF-8.
            detected = 'utf-8'
        elif detected.upper() == 'GB2312':
            # GBK extends GB2312, so it also decodes pages that are labelled
            # GB2312 but contain characters outside that set.
            detected = 'GBK'
        self.encoding_method = detected
        return self.encoding_method
class File():
    """Download every URL listed in a text file and save each page locally."""

    def read_file(self, path=r'C:\Users\zhuzj\Desktop\python\test\老教材\urls.txt'):
        """Fetch each URL listed in ``path`` and write it to ``url_<n>.txt``.

        ``path`` is a text file holding one URL per line; it defaults to the
        location this script was originally written for. Output files are
        numbered from 1 in the order pages are saved, are created in the
        current working directory and are always written as UTF-8. Blank
        lines in the list are skipped.

        Returns the raw lines read from ``path`` as a list.
        """
        self.time = 1
        # The list is only read, so plain read mode is enough; 'r+' would
        # fail needlessly on a read-only file.
        with open(path, 'r') as self.file:
            self.file_url = self.file.readlines()
        for line in self.file_url:
            # strip() also removes '\r' and stray spaces, not just '\n'.
            url = line.strip()
            if not url:
                # An empty URL would make urlopen raise; ignore blank lines.
                continue
            self.file_words = self.url_read(url)
            # Pages arrive in different encodings; normalise the saved copy
            # to UTF-8.
            with open(f'url_{self.time}.txt', 'w', encoding='utf-8') as new_file:
                new_file.write(self.file_words)
            self.time += 1
        # All lines from the URL list, as a list.
        return self.file_url

    def url_read(self, html=''):
        """Download the page at URL ``html`` and return its body as ``str``.

        Stores the URL in ``self.html``, the raw bytes in ``self.response``
        and the codec name in ``self.encoding_method``. Errors from
        ``urllib.request.urlopen`` and from decoding are propagated.
        """
        self.html = html
        # Close the connection once the body is read instead of leaking it.
        with urllib.request.urlopen(self.html) as reply:
            self.response = reply.read()
        detected = cha.detect(self.response)['encoding']
        if detected is None:
            # No encoding could be detected (e.g. empty body); decoding with
            # None would raise TypeError, so fall back to UTF-8.
            detected = 'utf-8'
        elif detected.upper() == 'GB2312':
            # GBK extends GB2312 and decodes mislabelled pages as well.
            detected = 'GBK'
        self.encoding_method = detected
        return self.response.decode(self.encoding_method)
if __name__ == '__main__':
    # Exercise 0: show the first 300 characters of the FishC home page.
    home_reader = Urlread()
    home_page = home_reader.url_read(r'http://www.fishc.com')
    print(home_page[:300])

    # Exercise 1: report the encoding detected for a user-supplied URL.
    probe = Urlread()
    requested_url = input('请输入URL')
    probe.url_read(requested_url)
    print(f'该网页使用的编码是{probe.encoding_method}')

    # Exercise 2: save every page listed in the URL file, then print the list.
    saver = File()
    listed_urls = saver.read_file()
    print(listed_urls)
|
|