关于这个gbk报警
谁知道我是哪里错了啊?一直消不了。。
import random
from easygui import *
import os
from pathlib import Path
def countsuffix(path):
    """Print the number of lines of every regular file found under *path*.

    Args:
        path: A ``pathlib.Path`` directory; it is searched recursively.

    Files are opened in text mode with the platform's default encoding, so a
    file stored in another encoding can raise ``UnicodeDecodeError`` (this is
    the 'gbk' error the thread is asking about).
    """
    for file in path.glob('**/*'):
        # '**/*' also yields directories, which cannot be opened as files.
        if not file.is_file():
            continue
        # The context manager closes each handle; a bare open() leaked one
        # handle per file.
        with open(file) as f:
            countlines = sum(1 for _ in f)
        print(countlines)
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)
import random
from easygui import *
import os
from pathlib import Path
def countsuffix(path):
    """Print the number of lines of every regular file found under *path*.

    Args:
        path: A ``pathlib.Path`` directory; it is searched recursively.

    Every file is decoded as UTF-8, so a file stored in another encoding
    raises ``UnicodeDecodeError``.
    """
    for file in path.glob('**/*'):
        # '**/*' also yields directories, which cannot be opened as files.
        if not file.is_file():
            continue
        # Explicit UTF-8 instead of the platform default; the context manager
        # closes each handle (a bare open() leaked one per file).
        with open(file, encoding='utf-8') as f:
            countlines = sum(1 for _ in f)
        print(countlines)
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)
isdkz 发表于 2022-4-16 00:34
他这个不是默认的读取方式么。。。。我后面又报了个这个 ,这个意思是说他读不了二进制的意思么?
File "C:\Users\zhuzj\Desktop\python\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 2: invalid continuation byte
本帖最后由 isdkz 于 2022-4-16 11:17 编辑
Victor0321 发表于 2022-4-16 11:11
他这个不是默认的读取方式么。。。。我后面又报了个这个 ,这个意思是说他读不了二进制的意思么?
F ...
因为你的文本文件的编码格式不一定统一,建议你使用 cchardet 检测一下编码。
使用前先在 cmd 执行以下命令安装:
pip install cchardet -i https://mirrors.aliyun.com/pypi/simple
import random
from easygui import *
import os
import cchardet
from pathlib import Path
def countsuffix(path):
    """Print the number of lines of every regular file found under *path*.

    Args:
        path: A ``pathlib.Path`` directory; it is searched recursively.

    The encoding of each file is guessed with ``cchardet`` from its first
    2048 bytes, then the file is re-opened in text mode with that encoding.
    """
    for file in path.glob('**/*'):
        # '**/*' also yields directories, which cannot be opened as files.
        if not file.is_file():
            continue
        # Read a raw sample for detection.
        with open(file, 'rb') as f:
            sample = f.read(2048)
        # detect() returns a dict, not an encoding name, so the 'encoding'
        # entry must be extracted. It is None when no guess could be made
        # (e.g. an empty file), hence the UTF-8 fallback.
        encoding = cchardet.detect(sample)['encoding'] or 'utf-8'
        # The guess comes from a sample only, so undecodable bytes are
        # replaced instead of aborting the whole count.
        with open(file, encoding=encoding, errors='replace') as f:
            countlines = sum(1 for _ in f)
        print(countlines)
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)
isdkz 发表于 2022-4-16 11:16
因为你的文本文件的编码格式不一定统一,建议你使用 cchardet 检测一下编码。
使用前先在 cmd 执行 ...
# Count the lines of an already-open text file and silently stop counting as
# soon as a line cannot be decoded.
# NOTE(review): `f` and `lines` are defined outside this quoted fragment;
# presumably `f` is an open text file and `lines` a running counter — confirm.
try:
    for each_line in f:
        lines += 1
except UnicodeDecodeError:
    pass
我看小甲鱼是这样写的
Victor0321 发表于 2022-4-16 12:09
try:
for each_line in f:
lines += 1
这样也行,直接跳过编码错误
'''
0. 是定位符
1. 是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
2. 搜索的准确性,速度
3.
4. <class 'http.client.HTTPResponse'>
5. urllib.error.URLError
6.
7. Unicode
'''
import chardet as cha ##查询编码器
import urllib.request
class Urlread:
    """Download a web page and decode it with an automatically detected encoding."""

    def url_read(self, html=''):
        """Fetch the URL *html* and return the page body as text.

        The URL is stored in ``self.html``, the raw bytes in
        ``self.response`` and the codec used in ``self.encoding_method``.

        Args:
            html: The URL to download.

        Returns:
            The decoded page content.
        """
        self.html = html
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(self.html) as resp:
            self.response = resp.read()
        self.load_encode()
        return self.response.decode(self.encoding_method)

    def load_encode(self):
        """Detect the encoding of ``self.response`` and return its name.

        The result is also stored in ``self.encoding_method``.
        """
        detected = cha.detect(self.response)['encoding']
        if detected is None:
            # No guess was possible (e.g. an empty body); decode(None) would
            # raise TypeError, so fall back to UTF-8.
            detected = 'utf-8'
        elif detected.upper() == 'GB2312':
            # GBK is a superset of GB2312, so it also decodes pages that use
            # characters outside the narrower GB2312 set.
            detected = 'GBK'
        self.encoding_method = detected
        return self.encoding_method
class File():
    """Download every URL listed in a text file and save each page as UTF-8."""

    def read_file(self, url_list_path=r'C:\Users\zhuzj\Desktop\python\test\老教材\urls.txt'):
        """Fetch each URL listed in *url_list_path* and save it to ``url_<n>.txt``.

        Args:
            url_list_path: Text file with one URL per line. Defaults to the
                path that was previously hard-coded here.

        Returns:
            The raw lines of the URL list, as returned by ``readlines()``.
        """
        self.time = 1
        # The list is only read, so plain 'r' is enough; 'r+' needlessly
        # required write permission on the file.
        with open(url_list_path, 'r') as url_list:
            self.file_url = url_list.readlines()
        for raw_line in self.file_url:
            url = raw_line.strip()
            if not url:
                # A blank line would make urlopen() raise ValueError.
                continue
            self.file_words = self.url_read(url)
            # The pages come in different encodings; the saved copies are
            # normalised to UTF-8.
            with open(f'url_{self.time}.txt', 'w', encoding='utf-8') as new_file:
                new_file.write(self.file_words)
            self.time += 1
        return self.file_url

    def url_read(self, html=''):
        """Fetch the URL *html* and return the page body as text.

        The URL is stored in ``self.html``, the raw bytes in
        ``self.response`` and the codec used in ``self.encoding_method``.
        """
        self.html = html
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(self.html) as resp:
            self.response = resp.read()
        detected = cha.detect(self.response)['encoding']
        if detected is None:
            # No guess was possible; decode(None) would raise TypeError.
            detected = 'utf-8'
        elif detected.upper() == 'GB2312':
            # GBK is a superset of GB2312.
            detected = 'GBK'
        self.encoding_method = detected
        return self.response.decode(self.encoding_method)
if __name__ == '__main__':
    # Exercise 0: download the FishC home page and print its decoded text.
    home_reader = Urlread()
    home_page = home_reader.url_read(r'http://www.fishc.com')
    print(home_page)

    # Exercise 1: report the encoding detected for a URL typed by the user.
    prompt_reader = Urlread()
    user_url = input('请输入URL')
    prompt_reader.url_read(user_url)
    print(f'该网页使用的编码是{prompt_reader.encoding_method}')

    # Exercise 2: save every page listed in the URL file, then print the list.
    url_saver = File()
    saved_urls = url_saver.read_file()
    print(saved_urls)
页:
[1]