|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- # -*- coding:utf-8 -*-
- import requests
- from lxml import etree
- import re
- import xlwt
- datalist = []
- for i in range(1,9):
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
- }
- url = 'http://sem.bjtu.edu.cn/lists-szjs.html?szyx=184&leixing=0&zhicheng=0&zimu=&k=&page=%d'
- num_url = format(url%i)
- response = requests.get(url=num_url,headers=headers)
- page_textw = response.content.decode()
- tree = etree.HTML(page_textw)
- tec = tree.xpath('/html/body/div[3]/div[2]/div[2]/ul/li')
-
- for i in tec:
- # 老师地址
- tec_url = 'http://sem.bjtu.edu.cn' + i.xpath('./div[2]/div/h6/a/@href')[0]
- # print(tec_url)
-
- tec_page = requests.get(url=tec_url,headers=headers).text
-
- # 名字
- tec_tree = etree.HTML(tec_page)
- name = tec_tree.xpath('/html/body/div[3]/div[2]/div[1]/div[2]/h6/text()')[0]
-
- datalist.append(name)
- # print(name)
- # 职位
- zhi_wei = re.findall(r'<p><span>教师职称:</span><span>(.*?)</span></p>',tec_page)
- if len(zhi_wei) == 0:
- zhi_wei = ['/']
- datalist.append(zhi_wei)
- # print(zhi_wei)
-
- # 院系
- yuan_xi = re.findall(r'<p><span>所属系 :</span><span>(.*?)</span></p>',tec_page)[0]
- if len(yuan_xi) == 0:
- yuan_xi = '/'
- datalist.append(yuan_xi)
- # print(yuan_xi)
- # 邮箱
- e_maile = re.findall(r'<p><span>邮箱:</span><span>(.*?)</span></p>',tec_page)
- if len(e_maile) == 0:
- e_maile = ['/']
- datalist.append(e_maile)
- # print(e_maile)
- # 存储
- book = xlwt.Workbook(encoding="utf-8",style_compression=0)
- sheet = book.add_sheet('北京经济系老师信息.xls',cell_overwrite_ok=True)
- col = ['名字','职位','院系','邮箱']
- for i in range(len(col)):
- sheet.write(0,i,col[i])
- for i in range(40):
-
- for j in range(0,4):
- sheet.write(i+1,datalist[j])
- book.save('./教师.xls')
- print(datalist)
- print('保存成功')
- print('老师共:',len(tec_url))
复制代码 我保存过程中老是报出:
D:\迅雷下载\python\爬虫>D:/迅雷下载/python/python.exe d:/迅雷下载/python/爬虫/练习2.py
Traceback (most recent call last):
File "d:/迅雷下载/python/爬虫/练习2.py", line 70, in <module>
sheet.write(i+1,datalist[j])
File "D:\迅雷下载\python\lib\site-packages\xlwt\Worksheet.py", line 1088, in write
self.row(r).write(c, label, style)
File "D:\迅雷下载\python\lib\site-packages\xlwt\Row.py", line 230, in write
self.__adjust_bound_col_idx(col)
File "D:\迅雷下载\python\lib\site-packages\xlwt\Row.py", line 71, in __adjust_bound_col_idx
iarg = int(arg)
ValueError: invalid literal for int() with base 10: '卜伟
求大佬解决我这个小白问题
|
|