I used a regular expression to scrape all the article links from a web page,
then used bs4 (with requests) to download the content behind those links,
and saved the content to a file.
My skills are probably lacking:
all the scraped articles end up in a single text file. Please teach me how to split them into separate articles.
The code is as follows:
#coding=utf-8
import os
from bs4 import BeautifulSoup
import re
import requests

url = '**************************'

def geturl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    openurl = requests.get(url, headers=headers)
    text = openurl.content.decode('gbk', "ignore")
    return text

def openurl(text):
    reg = r'<a href="(.*html)" target='
    rew = re.findall(reg, text)
    urlz = []
    #print(rew)
    for i in rew:
        #print(i)
        urlz.append('*********/' + i)
    #print(urlz[-1])  # without the [-1] it prints the whole list; with it, only the last item
    print(urlz[-1])
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    openev = requests.get(urlz[-1], headers=headers)
    textev = openev.content.decode('gbk', "ignore")
    soup = BeautifulSoup(textev, 'html.parser')
    every = soup.findAll('div', {'class': 'tpc_content do_not_catch'})
    for t in every:
        x = 1
        try:
            with open('D:\新建文件夹\小说%d.txt' % x, 'a', encoding='utf-8') as f:
                x += 1
                f.write(str(t.get_text()))
        except UnicodeDecodeError as reason:
            print(str(reason))
        finally:
            pass

openurl(geturl(url))
Python 3.5. I won't paste the actual URL, you know why. How can I save each article to its own file?
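Here is roughly what I think the fix might look like, just a sketch based on my code above (the base-URL placeholder, the div class name, and the save folder are copied from my code; I'm not sure this is the right approach, please advise):

#coding=utf-8
# Sketch only: a reworked openurl() that fetches every collected link and
# writes each article div to its own numbered file.
import re
import requests
from bs4 import BeautifulSoup

def save_each(text):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    reg = r'<a href="(.*html)" target='
    urlz = ['*********/' + i for i in re.findall(reg, text)]  # same base-URL placeholder as above
    x = 1  # counter lives outside the loops so it never resets
    for u in urlz:  # fetch every article link, not just urlz[-1]
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content.decode('gbk', 'ignore'), 'html.parser')
        for t in soup.findAll('div', {'class': 'tpc_content do_not_catch'}):
            # open with 'w' instead of 'a', so each numbered file holds exactly one article
            with open('D:\\新建文件夹\\小说%d.txt' % x, 'w', encoding='utf-8') as f:
                f.write(t.get_text())
            x += 1

# called the same way as before:
# save_each(geturl(url))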