# 爬取当当新书24小时排名 (Scrape Dangdang's 24-hour new-book bestseller ranking)
import random
import re

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook
# Request headers shared by every page fetch. ``User_Agent.random`` is read
# once here, so all requests in a single run send the same user-agent value.
User_Agent = UserAgent(use_cache_server=False)
headers = {"user-agent": User_Agent.random}

# Output workbook: the active sheet ("mysheet") receives the scraped rows;
# a second sheet named "New Title" is created alongside it.
wb = Workbook()
ws = wb.active
ws.title = "mysheet"
worksheet2 = wb.create_sheet(title="New Title")
# Page-fetching helper
class htmlL:
    """Namespace for downloading a web page and parsing it with BeautifulSoup."""

    @staticmethod
    def html(url, timeout=10):
        """Download *url* and return the parsed document.

        Declared as a static method so it can be called on the class
        (``htmlL.html(url=...)``) as well as on an instance.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Returns:
            A ``BeautifulSoup`` tree built from the response text with the
            built-in ``'html.parser'`` backend.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        # ``headers`` is the module-level dict carrying the user-agent.
        response = requests.get(url, headers=headers, timeout=timeout)
        return BeautifulSoup(response.text, 'html.parser')
# Collect one row per book: [name, publisher info, print price,
# e-book price, product link].
book_list = []
# NOTE(review): range(25) requests page suffixes 0-24; confirm whether the
# site numbers its pages from 1 (in which case this should be range(1, 26)).
for i in range(25):
    url = 'http://bang.dangdang.com/books/newhotsales/01.00.00.00.00.00-24hours-0-0-1-' + str(i)
    html_url = htmlL.html(url=url)
    listing = html_url.find(class_='bang_list clearfix bang_list_mode')
    if listing is None:
        # No ranking list on this page; skip it rather than abort the run
        # and lose the rows gathered so far.
        continue
    for item in listing.find_all('li'):
        # Product link(s) found in the markup of the "pic" element.
        pic = re.findall('http:.*.?html', str(item.find_all(class_="pic")))
        # Book title, with everything from the first "(" onwards removed.
        name = re.sub(r'\(.*', '', item.find(class_='name').text)
        # Author / publisher line.
        publisher_info = item.find(class_='publisher_info').text
        # Price text with line breaks and the button captions stripped out.
        price = re.sub('\n|加入购物车|购买电子书|收藏|\r', '', item.find(class_='price').text)
        # Print price: the price text up to the e-book marker.
        price_yj = re.sub('电子书:.*', '', price)
        # E-book price: the part starting at the e-book marker, if present.
        price_dz = re.findall('电子书:.*', price)

        book_list.append([
            name,
            publisher_info,
            price_yj,
            # findall returns a list; store the match itself, or a blank
            # placeholder when the book has no e-book price.
            price_dz[0] if price_dz else ' ',
            # Same for the link: a plain string so the cell holds one value.
            pic[0] if pic else '',
        ])

# One spreadsheet row per book and one column per field (rows and columns
# are 1-based in openpyxl).
for row_idx, row in enumerate(book_list, start=1):
    for col_idx, value in enumerate(row, start=1):
        ws.cell(row=row_idx, column=col_idx, value=value)
wb.save("当当排名.xlsx")
页:
[1]