|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- # -*- coding: utf-8 -*-
- import requests
- from bs4 import BeautifulSoup
- from fake_useragent import UserAgent
- import re
- from openpyxl import Workbook
- import time
# Workbook that collects one row per scraped book.
wb = Workbook()
ws = wb.active
ws.title = "Sheet1"
# Read the collected barcodes, one per line. The with-block closes the file
# as soon as the lines are in memory instead of leaking the handle.
with open('book0906.csv', 'r') as csv_str:
    list_d = csv_str.readlines()
# Proxy used for HTTPS requests.
proxies = {
    'https': 'http://127.0.0.1:1080',
}
# Source of randomised User-Agent header values.
# NOTE(review): `use_cache_server` is a keyword of older fake_useragent
# releases -- confirm the installed version still accepts it.
user_agent = UserAgent(use_cache_server=False)
# Resolve a search-results URL to the URL of the first book's detail page.
def url(self):
    """Return the detail-page URL of the first hit on a search-results page.

    Args:
        self: URL of the search-results page to fetch, as a string. The
            name is kept for backward compatibility; this is a module-level
            function, not a method.

    Returns:
        'https://www.sanmin.com.tw/' concatenated with the value of the
        first ``href="..."`` found in the serialised markup of the first
        element whose class is 'resultBooksImg resultBooksImg'.

    Raises:
        IndexError: if no such element or no href is present on the page.
        Exceptions raised by ``requests.get`` (including timeouts)
        propagate unchanged.
    """
    headers = {"user-agent": user_agent.random}
    # Without a timeout a stalled proxy or server would block forever.
    html = requests.get(self, proxies=proxies, headers=headers, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    result_img = soup.find(class_='resultBooksImg resultBooksImg')
    # str(None) is 'None', so a missing element simply yields no matches.
    hrefs = re.findall('href=".*?"', str(result_img))
    if not hrefs:
        # Same exception type as the bare [0] lookup, with a usable message.
        raise IndexError('no search result link found for %s' % self)
    return 'https://www.sanmin.com.tw/' + re.sub('href=|"', '', hrefs[0])
class book():
    """Namespace of helpers that extract book details from parsed HTML.

    The visible script calls these on the class itself, e.g.
    ``book.book_xx(node)``, so they are static methods. Each first
    parameter keeps its original name ``self`` for backward compatibility,
    but it is an ordinary argument, not an instance of this class.
    """

    # Book content sections.
    @staticmethod
    def book_lr(self):
        """Return the text of each 'productContent' element.

        Args:
            self: parsed node exposing ``find``; the elements are searched
                inside its first 'bookProject tabsview' descendant.

        Returns:
            List of ``.text`` values, in document order.
        """
        container = self.find(class_='bookProject tabsview')
        return [section.text
                for section in container.find_all(class_='productContent')]

    # Book information lines.
    @staticmethod
    def book_xx(self):
        """Return the text of every 'mainText ga' element under ``self``."""
        return [item.text for item in self.find_all(class_='mainText ga')]

    @staticmethod
    def book_sx(self, fields=None):
        """Return the value of the information line labelled ``self``.

        Args:
            self: label to look for; it is used as a regular expression.
            fields: strings to search. Defaults to the module-level
                ``bookxx`` list, which is what the original code always
                read.

        Returns:
            The first entry in which the label matches exactly once, with
            the label and every ':' removed, or None if no entry qualifies.
        """
        if fields is None:
            fields = bookxx
        for entry in fields:
            if len(re.findall(self, entry)) == 1:
                # Raw string: same pattern text as before, but without the
                # invalid '\:' escape in a normal string literal.
                return re.sub(self + r'|\:', '', entry)
        return None
# Look up every ISBN, scrape its detail page and append one spreadsheet row.
for str_list_num in list_d:
    str_list = str_list_num.strip('\n')
    if not str_list.strip():
        # A blank line carries no ISBN to search for.
        continue
    try:
        books = []
        headers = {
            "user-agent": user_agent.random
        }
        url1 = url('https://www.sanmin.com.tw/search/index/?ct=ISBN&qu=' + str_list + '&ls=SD')
        # Without a timeout a stalled proxy or server would block forever.
        html = requests.get(url1, proxies=proxies, headers=headers, timeout=30).text
        bs4 = BeautifulSoup(html, 'html.parser')
        book_name = bs4.find(class_='bookStatusInfor')
        # book.book_sx reads this module-level list, so it must be set
        # before any book_sx call below.
        bookxx = book.book_xx(book_name)
        # Fetched once and reused for both the price and the stock status.
        status_text = bs4.find(class_='bookStatusAddCtrl').text
        # Fetched once and reused for both the page count and the binding.
        binding_pages = book.book_sx('裝訂/頁數')
        # Parsed once and reused for all three content columns.
        contents = book.book_lr(bs4)

        books.append(str_list)
        books.append(book_name.find('li').text)
        books.append(book.book_sx('作者'))
        mony = re.findall('[0-9].*[0-9]', re.findall('定.*.?元', status_text)[0])
        books.append(mony[0])
        books.append(book.book_sx('規格'))
        books.append(re.findall('[0-9].*[0-9]', binding_pages)[0])
        books.append(re.sub('/', '', re.findall('.*/', binding_pages)[0]))
        books.append(book.book_sx('出版日'))
        kc = re.findall('預購中.*|庫存.*', status_text)[0]
        books.append(kc)
        books.append(book.book_sx('圖書分類'))
        # The first content section is required; the second and fourth are
        # optional and default to an empty cell.
        books.append(contents[0])
        books.append(contents[1] if len(contents) > 1 else '')
        books.append(contents[3] if len(contents) > 3 else '')
        ws.append(books)
        # Saved after every row so progress survives an aborted run.
        wb.save('sanmin.xlsx')
    except Exception as exc:
        # Best-effort scrape: report the failing ISBN and carry on.
        # Catching Exception rather than using a bare except lets Ctrl-C
        # stop the run.
        print('skipped %s: %r' % (str_list, exc))
wb.close()
复制代码 |
|