关于这个gbk报警,Python交流,编程语言专区,鱼C论坛

Victor0321 发表于 2022-4-16 00:33:55

关于这个gbk报警

谁知道我是哪里写错了啊？这个 gbk 报错一直消不掉。

import random
from easygui import *
import os
from pathlib import Path

# Print the number of lines of every entry found below the given folder.
# NOTE(review): the forum archive appears to have stripped the first
# indentation level; the statements down to print(...) presumably belong
# inside countsuffix().
def countsuffix(path):
# Recursive pattern: yields every entry at any depth below `path`.
all_files = path.glob('**/*')

for file in all_files:
   countlines = 0
   # No encoding is passed, so open() falls back to the platform default
   # text encoding -- presumably GBK on this machine, per the thread title;
   # TODO confirm.
   f = open(file)
   for lines in f:
         countlines += 1
   print(countlines)


# Ask for a folder through the easygui dialog, then count lines beneath it.
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)

isdkz 发表于 2022-4-16 00:34:56

import random
from easygui import *
import os
from pathlib import Path

# Print the number of lines of every entry found below the given folder,
# decoding each one as UTF-8.
# NOTE(review): the forum archive appears to have stripped the first
# indentation level; the statements down to print(...) presumably belong
# inside countsuffix().
def countsuffix(path):
# Recursive pattern: yields every entry at any depth below `path`.
all_files = path.glob('**/*')

for file in all_files:
   countlines = 0
   # Decode as UTF-8 explicitly instead of relying on the platform default.
   f = open(file, encoding='utf-8')                            # changed this line
   for lines in f:
         countlines += 1
   print(countlines)


# Ask for a folder through the easygui dialog, then count lines beneath it.
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)

Victor0321 发表于 2022-4-16 11:11:49

isdkz 发表于 2022-4-16 00:34

这个不是默认的读取方式么？我后面又报了下面这个错误，意思是说它读不了二进制文件么？

File "C:\Users\zhuzj\Desktop\python\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 2: invalid continuation byte

isdkz 发表于 2022-4-16 11:16:26

本帖最后由 isdkz 于 2022-4-16 11:17 编辑

Victor0321 发表于 2022-4-16 11:11
这个不是默认的读取方式么？我后面又报了下面这个错误，意思是说它读不了二进制文件么？

F ...

因为你的文本文件的编码格式不一定统一，建议你使用 cchardet 检测一下编码。

使用前先在 cmd 执行以下命令安装：
pip install cchardet -i https://mirrors.aliyun.com/pypi/simple
import random
from easygui import *
import os
import cchardet
from pathlib import Path

def countsuffix(path):
    """Print the line count of every regular file found below ``path``.

    The text encoding of each file is guessed with ``cchardet`` from its
    first 2048 bytes, and the file is then read as text with that encoding.

    Args:
        path: A ``pathlib.Path`` pointing at the folder to scan recursively.
    """
    for file in path.glob('**/*'):
        # '**/*' also yields directories, which cannot be opened as files.
        if not file.is_file():
            continue

        with open(file, 'rb') as raw:  # note: sniff the encoding from raw bytes
            guess = cchardet.detect(raw.read(2048))

        # detect() returns a dict ({'encoding': ..., 'confidence': ...}), while
        # open() needs the codec name itself. The name is None when nothing
        # could be detected (e.g. an empty file), so fall back to UTF-8.
        encoding = guess.get('encoding') or 'utf-8'

        countlines = 0
        # errors='replace': the guess is based on a 2048-byte sample only, so
        # a wrong guess must not abort the count for the remaining files.
        with open(file, encoding=encoding, errors='replace') as f:
            for _ in f:
                countlines += 1
        print(countlines)


# Ask for a folder through the easygui dialog, then count lines beneath it.
# NOTE(review): diropenbox presumably returns None when the dialog is
# cancelled, which would make Path(...) raise -- confirm against easygui docs.
findpath = Path(diropenbox('请选择您的代码库:','浏览文件夹'))
countsuffix(findpath)

Victor0321 发表于 2022-4-16 12:09:15

isdkz 发表于 2022-4-16 11:16
因为你的文本文件的编码格式不一定统一，建议你使用 cchardet 检测一下编码。

使用前先在 cmd 执行 ...

# Count lines until the first undecodable byte: on UnicodeDecodeError the loop
# ends and whatever was counted so far is kept instead of raising.
try:
         for each_line in f:
            lines += 1
   except UnicodeDecodeError:
         pass

我看小甲鱼这样写的

isdkz 发表于 2022-4-16 12:27:40

Victor0321 发表于 2022-4-16 12:09
try:
for each_line in f:
lines += 1

这样也行，遇到编码错误就直接跳过（不过该文件中出错位置之后的行不会被计入）。

Victor0321 发表于 2022-4-25 00:20:57

'''
0. 是定位符
1. 是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。
2. 搜索的准确性，速度
3.
4. <class 'http.client.HTTPResponse'>
5. urllib.error.URLError
6.
7. Unicode
'''

import chardet as cha ##查询编码器
import urllib.request

class Urlread:
    """Download a web page and decode it with an auto-detected encoding.

    After ``url_read`` the instance exposes ``html`` (the URL), ``response``
    (the raw bytes) and ``encoding_method`` (the codec used for decoding).
    """

    @staticmethod
    def _normalize_encoding(detected):
        """Turn a detected encoding name into the codec used for decoding.

        Args:
            detected: Encoding name reported by the detector, or None when
                nothing could be detected.

        Returns:
            'utf-8' when nothing was detected, 'GBK' for GB2312 (GBK is a
            superset of GB2312), otherwise ``detected`` unchanged.
        """
        if not detected:
            return 'utf-8'
        if detected.upper() == 'GB2312':
            return 'GBK'
        return detected

    def url_read(self, html=''):
        """Fetch ``html`` (a URL) and return the page content as text.

        Args:
            html: The URL to download.

        Returns:
            The response body decoded with the detected encoding.
        """
        self.html = html
        # Close the connection as soon as the body has been read.
        with urllib.request.urlopen(self.html) as page:
            self.response = page.read()  # raw bytes of the page
        self.load_encode()
        return self.response.decode(self.encoding_method)

    def load_encode(self):
        """Detect the encoding of ``self.response`` and store it.

        Returns:
            The codec name, also stored in ``self.encoding_method``.
        """
        detected = cha.detect(self.response)['encoding']
        self.encoding_method = self._normalize_encoding(detected)
        return self.encoding_method

class File():
    """Fetch every URL listed in a text file and save each page as UTF-8."""

    def read_file(self, path=r'C:\Users\zhuzj\Desktop\python\test\老教材\urls.txt'):
        """Download each URL in the list file into ``url_<n>.txt``.

        Args:
            path: Text file with one URL per line. Defaults to the location
                that was previously hard-coded.

        Returns:
            The lines of the list file exactly as read (line endings
            included), one entry per line.
        """
        self.time = 1  # running number used in the output file names
        # Read-only access is enough; utf-8-sig also drops a BOM that some
        # editors put in front of the first URL.
        with open(path, 'r', encoding='utf-8-sig') as listing:
            self.file = listing  # kept: the handle was exposed as an attribute
            self.file_url = listing.readlines()

        for line in self.file_url:
            url = line.strip()
            if not url:
                # A blank line is not a URL; urlopen('') would raise.
                continue
            self.file_words = self.url_read(url)
            # Pages arrive in mixed encodings; store them uniformly as UTF-8.
            with open(f'url_{self.time}.txt', 'w', encoding='utf-8') as new_file:
                new_file.write(self.file_words)
            self.time += 1
        return self.file_url

    def url_read(self, html=''):
        """Fetch ``html`` (a URL) and return the page content as text.

        Args:
            html: The URL to download.

        Returns:
            The response body decoded with the detected encoding.
        """
        self.html = html
        # Close the connection as soon as the body has been read.
        with urllib.request.urlopen(self.html) as page:
            self.response = page.read()  # raw bytes of the page
        detected = cha.detect(self.response)['encoding']
        if not detected:
            # The detector reports None when it cannot decide.
            detected = 'utf-8'
        elif detected.upper() == 'GB2312':
            # GBK is a superset of GB2312.
            detected = 'GBK'
        self.encoding_method = detected
        return self.response.decode(self.encoding_method)

if __name__ == '__main__':
    # Exercise 0: fetch a page and print its decoded text.
    u0 = Urlread()
    print((u0.url_read(r'http://www.fishc.com')))
    # Exercise 1: report the encoding detected for a user-supplied URL.
    u1 = Urlread()
    u1.url_read(input('请输入URL'))
    print(f'该网页使用的编码是{u1.encoding_method}')

    # Exercise 2: download every URL in the list file, then print the list.
    f = File()
    print(f.read_file())

页: [1]

鱼C论坛's Archiver

关于这个gbk报警