|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
import re
class TiebaSpider:
    """Interactive crawler that prints image URLs found in a Baidu Tieba forum.

    Instantiating the class prompts on stdin for a forum name and a number
    of listing pages, then immediately calls :meth:`save` and :meth:`run`.
    """

    # Endpoint of the forum listing page; query parameters are added per request.
    LIST_URL = "http://tieba.baidu.com/f"
    # Prefix prepended to the relative thread paths found on a listing page.
    THREAD_BASE_URL = "https://tieba.baidu.com"
    # Offset step between consecutive listing pages (the original code
    # computed ``i * 50`` but never sent it).
    THREADS_PER_PAGE = 50
    # Seconds to wait for a response; without a timeout ``requests`` can block forever.
    REQUEST_TIMEOUT = 10

    # Relative thread links such as ``href="/p/1234567890"``; the group is the path.
    _THREAD_PATH_RE = re.compile(r'href="(/p/\d+)"')
    # Image tags served from the forum image host; the group is the full URL.
    _IMAGE_URL_RE = re.compile(r'src="(https://imgsa\.baidu\.com/forum.*?)"')

    def __init__(self):
        """Prompt for the crawl settings, then run the crawl.

        Raises:
            ValueError: if the page count entered is not an integer.
        """
        self.tiebaname = input("请输入爬取贴吧:")
        self.selfpage = int(input("请输入爬取页码:"))
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                                      " AppleWebKit/537.36 (KHTML, like Gecko)"
                                      " Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"}
        # The separator was missing, which produced ".../Desktop<name>"
        # instead of a folder inside the desktop directory.
        self.img_dir = "C:/Users/亮晶晶/Desktop/" + self.tiebaname
        self.save()
        self.run()

    def run(self):
        """Fetch each listing page, visit every thread on it, and print image URLs.

        One image URL is printed per line. Network errors raised by
        ``requests`` (including timeouts) propagate to the caller.
        """
        for page_index in range(self.selfpage):
            listing_html = requests.get(
                self.LIST_URL,
                # Passing params lets requests percent-encode the forum name.
                params=self._list_page_params(page_index),
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            ).text
            for thread_path in self._extract_thread_paths(listing_html):
                thread_html = requests.get(
                    self.THREAD_BASE_URL + thread_path,
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                ).text
                for image_url in self._extract_image_urls(thread_html):
                    print(image_url)

    def save(self):
        """Placeholder for saving images; currently does nothing."""
        pass

    def _list_page_params(self, page_index):
        """Return the query parameters for the zero-based listing page *page_index*."""
        return {
            "ie": "utf-8",
            "kw": self.tiebaname,
            "pn": page_index * self.THREADS_PER_PAGE,
            "fr": "search",
        }

    @staticmethod
    def _extract_thread_paths(html):
        """Return every relative thread path (``/p/<digits>``) linked in *html*."""
        return TiebaSpider._THREAD_PATH_RE.findall(html)

    @staticmethod
    def _extract_image_urls(html):
        """Return the full URL of every forum-hosted image referenced in *html*."""
        return TiebaSpider._IMAGE_URL_RE.findall(html)
if __name__ == "__main__":
    # Constructing the spider prompts for its settings and runs the crawl.
    TiebaSpider()
|