# 外国的爬虫 (crawler for an overseas site)
# -*- coding: utf-8 -*-
import re
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook
# Output workbook: each scraped book is appended as one row of this sheet.
wb = Workbook()
ws = wb.active
ws.title = "Sheet1"

# Load the collected barcodes, one per line.  The context manager closes the
# file as soon as the lines are in memory; previously the handle was opened
# and never closed.
with open('book0906.csv', 'r') as csv_str:
    list_d = csv_str.readlines()

# Proxy used for every HTTPS request made by this script.
proxies = {
    'https': 'http://127.0.0.1:1080',
}

# Source of a random User-Agent string for each request.
# NOTE(review): recent fake_useragent releases may no longer accept the
# use_cache_server keyword -- confirm against the installed version.
user_agent = UserAgent(use_cache_server=False)
def url(self):
    """Resolve a search-results URL to the URL of the first listed book.

    Args:
        self: The search URL to fetch.  Despite its name this is a plain
            string, not an instance; the name is kept for existing callers.

    Returns:
        'https://www.sanmin.com.tw/' followed by the first ``href`` value
        found inside the element whose class is
        'resultBooksImg resultBooksImg'.

    Raises:
        LookupError: If no ``href`` is found in that element (including the
            case where the element itself is missing).
    """
    headers = {"user-agent": user_agent.random}
    html = requests.get(self, proxies=proxies, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    result = soup.find(class_='resultBooksImg resultBooksImg')
    # re.findall returns a list, which re.sub cannot accept (TypeError), so
    # capture the first href value directly instead.
    match = re.search(r'href="(.*?)"', str(result))
    if match is None:
        raise LookupError('no result link found for ' + self)
    return 'https://www.sanmin.com.tw/' + match.group(1)
class book:
    """Helpers that pull fields out of a parsed product page.

    The functions are called on the class itself (``book.book_xx(node)``),
    so the parameter named ``self`` receives the caller's argument rather
    than an instance of this class.
    """

    def book_lr(self):
        """Return the text of each 'productContent' element.

        Args:
            self: A parsed node supporting ``find``; the elements are looked
                up inside its 'bookProject tabsview' child.

        Returns:
            A list with the ``.text`` of every matching element, in order.
        """
        container = self.find(class_='bookProject tabsview')
        return [node.text for node in container.find_all(class_='productContent')]

    def book_xx(self):
        """Return the text of each 'mainText ga' element under ``self``.

        Args:
            self: A parsed node supporting ``find_all``.

        Returns:
            A list with the ``.text`` of every matching element, in order.
        """
        return [node.text for node in self.find_all(class_='mainText ga')]

    def book_sx(self, entries=None):
        """Return the value of the first entry that mentions a label once.

        Args:
            self: The label to look for.  It is used as a regular-expression
                pattern, not as a literal string.
            entries: Strings to search.  When omitted, the module-level
                ``bookxx`` list is used, as before.

        Returns:
            The first entry in which the label matches exactly once, with
            the label and every ':' removed; ``None`` if no entry qualifies.
        """
        if entries is None:
            entries = bookxx
        for entry in entries:
            if len(re.findall(self, entry)) == 1:
                return re.sub(self + r'|\:', '', entry)
        return None
def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or '' if there is none.

    A ``None`` text (a field that was not found on the page) counts as no
    match, so callers never hand ``None`` or a list to the ``re`` functions.
    """
    if text is None:
        return ''
    found = re.findall(pattern, text)
    return found[0] if found else ''


# Look up every barcode, scrape its product page and append one row per book.
for str_list_num in list_d:
    str_list = str_list_num.strip()
    if not str_list:
        # Blank line in the CSV: nothing to look up.
        continue
    try:
        headers = {"user-agent": user_agent.random}
        url1 = url('https://www.sanmin.com.tw/search/index/?ct=ISBN&qu=' + str_list + '&ls=SD')
        html = requests.get(url1, proxies=proxies, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        book_name = soup.find(class_='bookStatusInfor')
        # book.book_sx reads this module-level list, so set it before use.
        bookxx = book.book_xx(book_name)
        status_text = soup.find(class_='bookStatusAddCtrl').text
        binding = book.book_sx('裝訂/頁數')

        # Worksheet cells cannot hold lists, so every regex result is reduced
        # to its first match (or '' when there is none).
        books = [
            str_list,
            book_name.find('li').text,
            book.book_sx('作者'),
            _first_match('.*', _first_match('定.*.?元', status_text)),
            book.book_sx('規格'),
            _first_match('.*', binding),
            re.sub('/', '', _first_match('.*/', binding)),
            book.book_sx('出版日'),
            _first_match('預購中.*|庫存.*', status_text),
            book.book_sx('圖書分類'),
        ]

        # Parse the content sections once and spread them over exactly three
        # columns, padding with '' when the page has fewer.
        sections = book.book_lr(soup)
        books.extend((sections + ['', '', ''])[:3])

        ws.append(books)
        # Saved after every row; presumably so that rows already scraped
        # survive an interrupted run.
        wb.save('sanmin.xlsx')
    except Exception as exc:
        # Best effort: report the failed barcode and carry on with the next
        # one instead of silently hiding the error.
        print('skipped', str_list, repr(exc))
wb.close()
# 页: [1]  (forum pagination footer, not part of the script)