爬取文本DOXC写入内容报错,Python交流,编程语言专区,鱼C论坛

fatalman 发表于 2022-5-5 11:11:50

爬取文本DOXC写入内容报错

docx添加段落那一步总是报错TypeError: 'in <string>' requires string as left operand, not int
如果取消编码ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters

import requests
from bs4 import BeautifulSoup
from docx import Document

def get_page_code(url, header):
page_code = requests.get(url, headers=header)
return page_code.text

def get_word_content(page_code):
word_code = BeautifulSoup(page_code, 'html.parser')
word_content = word_code.find('div', attrs={"style": "border:1px solid #C8DBD3;padding:20px;line-height:24px;"})
word_title = word_code.find('title')
# print(word_title.text, word_content.text)
# return word_content
data = word_title.text, word_content.text
return data

def save_word(data):
document = Document()
document.add_heading(data)
print(data)
for detail in data:
   document.add_paragraph(detail.encode("gb2312"))# 添加段落

document.save(f'{data}.docx')
# with open(f'{data}.docx', mode='w', encoding='utf-8') as f:
#       f.write(data)
# f.close()

def main(url, header):
page_code = get_page_code(url, header)
word_content = get_word_content(page_code)
save_word(word_content)

if __name__ == "__main__":
url = "https://wenku.baidu.com/view/fb6324d8fa0f76c66137ee06eff9aef8941e48b0.html"
header = {
   'User-agent': 'Googlebot'
}
main(url, header)

suchocolate 发表于 2022-5-6 11:49:54

本帖最后由 suchocolate 于 2022-5-6 13:14 编辑

def get_page_code(url, header):
page_code = requests.get(url, headers=header)
page_code.encoding = 'utf-8'
return page_code.text

fatalman 发表于 2022-5-6 17:06:23

还是不行啊

ketai 发表于 2022-5-6 18:51:01

def save_word(data):
document = Document()
document.add_heading(data)

# print(len(data.split('/t')))
file_doc = open("file.txt", 'w', encoding='gbk')
file_doc.write(data)

file_doc_ = open("file.txt", 'r', encoding='gbk')
for detail in file_doc_.readlines():
document.add_paragraph(detail.strip())
document.save(f'{data}.docx')
报的错是传入类型错误，但是强转str类型结果是一堆byte类型的字符串，得不到想要的结果。能确定的是document.add_paragraph(detail.encode("gb2312")) 这行用法有问题，你看看有啥好办法解决把。

页: [1]

鱼C论坛's Archiver

爬取文本DOXC写入内容报错