|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
docx添加段落那一步总是报错TypeError: 'in <string>' requires string as left operand, not int
如果取消编码ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
import requests
from bs4 import BeautifulSoup
from docx import Document
def get_page_code(url, header):
page_code = requests.get(url, headers=header)
return page_code.text
def get_word_content(page_code):
word_code = BeautifulSoup(page_code, 'html.parser')
word_content = word_code.find('div', attrs={"style": "border:1px solid #C8DBD3;padding:20px;line-height:24px;"})
word_title = word_code.find('title')
# print(word_title.text, word_content.text)
# return word_content
data = word_title.text, word_content.text
return data
def save_word(data):
document = Document()
document.add_heading(data[0])
print(data[0])
for detail in data[1]:
document.add_paragraph(detail.encode("gb2312")) # 添加段落
document.save(f'{data[0]}.docx')
# with open(f'{data[0]}.docx', mode='w', encoding='utf-8') as f:
# f.write(data[1])
# f.close()
def main(url, header):
page_code = get_page_code(url, header)
word_content = get_word_content(page_code)
save_word(word_content)
if __name__ == "__main__":
url = "https://wenku.baidu.com/view/fb6324d8fa0f76c66137ee06eff9aef8941e48b0.html"
header = {
'User-agent': 'Googlebot'
}
main(url, header)
|
|