# coding=utf-8
import requests, re, os, time, pymysql
from lxml import etree

down_path = '新余快速人才网'
if not os.path.exists(down_path):
    os.makedirs(down_path)

url3 = []   # company homepage links collected from the job-list pages
def nextpage(lastpage):
    global url3
    for i in range(1, 2):   # only the first list page for now; use range(1, int(lastpage) + 1) to crawl them all
        print('页码:', i)
        nexturl = f'https://www.xinyurc.com/index.php?m=&c=jobs&a=jobs_list&page={i}'
        r = s.get(url=nexturl).text
        html = etree.HTML(r)
        href = html.xpath('/html/body/div[5]/div[1]/div[2]/div/div[3]/a/@href')   # company homepage links
        url3.extend(href)
    url3 = set(url3)   # deduplicate the company links
    print(url3)
    for h in url3:
        print('hurl:', h)
        time.sleep(1)
        r = s.get(url=h).text
        jobs = re.findall('<a href="(.*?)">全部职位', r)   # link to the company's "all positions" page
        jobs = 'https://www.xinyurc.com' + jobs[0]         # make the link absolute
        r2 = s.get(url=jobs).text
        print('jobs是:', jobs)
        html2 = etree.HTML(r2)
        # alljobs = html2.xpath('/html/body//div[@class="jname"]/a/text()')   # titles of all positions
        # xpath() returns an empty list when nothing matches, so no try/except is needed here
        allurl = html2.xpath('/html/body//div[@class="jname"]/a/@href')   # URL of every posted position
        for url in allurl:   # visit each job posting
            r = s.get(url).text
            print('Url:', url)
            html = etree.HTML(r)
            job = html.xpath('/html/body/div[3]/div/div/div[2]/div[1]/text()')[0]           # job title
            cname = html.xpath('/html/body//div[@class="comname"]/a/text()')[0]             # company name
            jobin = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[2]/text()')[0]   # full-time / part-time
            joba = html.xpath('/html/body//div[@class="itemli"][2]/text()')                 # job category
            if len(joba) == 0:
                joba = ['无']
            jobnum = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[4]/text()')[0]  # number of openings
            # jobsc = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[6]/text()')[0]     # education requirement
            # jobyear = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[7]/text()')[0]   # work experience
            sex = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[8]/text()')[0]     # gender requirement
            if sex == '不限':
                sex = 0
            elif sex == '女':
                sex = 2
            elif sex == '男':
                sex = 1
            # old = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[10]/text()')[0]      # age requirement
            # intype = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[11]/text()')[0]   # hiring department
            jobaddress = html.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div[13]/text()')[0]   # work location
            # whos = html.xpath('/html/body/div[4]/div[1]/div[1]/div[2]/div[2]/text()[1]')[0]    # contact person
            andtel = html.xpath('/html/body/div[4]/div[1]/div[1]/div[2]/div[4]/span/text()')      # contact phone
            classjob = html.xpath('/html/body//div[@class="describe"]/div[@class="txt"]/text()')  # job description
            money = html.xpath('/html/body//div[@class="jobstit"]/div[@class="wage"]/text()')[0]  # salary text
            # map the salary text to the site's salaryindex; the higher brackets are
            # checked first so that e.g. '11K' is not caught by the '1K' substring test
            if ('13K' in money or '14K' in money or '15K' in money or '16K' in money
                    or '17K' in money or '18K' in money or '19K' in money or '20K' in money):
                money = 8
            elif '8K' in money or '9K' in money or '10K' in money or '11K' in money or '12K' in money:
                money = 7
            elif '5K' in money or '6K' in money or '7K' in money:
                money = 6
            elif '4K' in money:
                money = 5
            elif '3K' in money:
                money = 4
            elif '2K' in money:
                money = 3
            elif '1K' in money:
                money = 2
            else:
                money = 0
            if len(andtel) != 0:   # only store postings that list a contact phone number
                db = pymysql.connect(host='localhost', user='root', password='6330055', port=3306, db='spiders')
                cursor = db.cursor()
                data = {
                    'uniacid': 1,
                    'uid': 24,
                    'isc': 2,
                    'cid': 0,
                    'title': job,
                    'jobcatindex': 0,
                    'salaryindex': money,
                    'flow': 0,
                    'recruitnum': jobnum,
                    'sexindex': sex,
                    'degreesindex': 0,
                    'experiencesindex': 0,
                    'des': '\n'.join(classjob),   # the xpath result is a list of text nodes; join it into one string
                    'chosewelfare': 'a:1:{i:0;s:12:"其他补助";}',
                    'imgs': 'a:1:{i:0;s:0:"";}',
                    'time': int(time.time()),
                    'status': 1,
                    'istop': 0,
                    'toptime': None,   # None is sent as SQL NULL
                    'istou': 0,
                    'isting': 0,
                    'citycode': '360500',
                    'telnum': andtel[0],
                    'address': jobaddress,
                    'state': '1'
                }
                table = 'ims_lshd_zhaopinhign_zpxx'
                keys = ','.join(data.keys())
                values = ','.join(['%s'] * len(data))
                sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
                try:
                    if cursor.execute(sql, tuple(data.values())):
                        print('Successful')
                        db.commit()
                except Exception:
                    print('Failed')
                    db.rollback()
                db.close()
                # earlier version that wrote the same fields to a text file instead of MySQL:
                # with open(f'{down_path}/新余快速人才网数据222222.txt', 'a') as f:
                #     f.write(cname)
                #     f.write(' ')
                #     f.write('\n\n')
                #     f.write('岗位名称:' + job)
                #     f.write(' ')
                #     f.write('职位类别:' + joba[0])
                #     f.write(' ')
                #     f.write('招聘人数:' + jobnum)
                #     f.write(' ')
                #     f.write('性别要求' + sex)
                #     f.write(' ')
                #     f.write(jobaddress)
                #     f.write(' ')
                #     f.write('联系电话' + andtel[0])
                #     f.write('\n\n')
                #     f.write('岗位职责:' + classjob)
                #     f.write('\r\n\n')
            # else:
            #     pass
if __name__ == '__main__':
    url = 'https://www.xinyurc.com/index.php?m=Home&c=Members&a=login'
    url2 = 'https://www.xinyurc.com/index.php?m=Home&c=jobs&a=jobs_list'
    # request headers for the login POST; Content-Length and response-only headers
    # (Date, Expires, Transfer-Encoding, Vary) must not be set by hand, and a stale
    # hard-coded Content-Length in particular can make the POST fail
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.xinyurc.com',
        'Referer': 'https://www.xinyurc.com/members/login',
        'Accept-Encoding': 'gzip, deflate, br',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'Host': 'www.xinyurc.com',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    data = {
        'username': '15579001118',
        'password': '6330055',
        'expire': '1'
    }
    s = requests.session()
    s.headers.update(headers)   # attach the headers to the session so every request sends them
    r = s.post(url=url, data=data, allow_redirects=True)
    r = s.get(url=url2).text
    print(r, '*****' * 10)
    html = etree.HTML(r)
    lastpage = html.xpath('/html/body/div[5]/div[1]/div[2]/div[17]/a[7]/@href')[0]
    lastpage = re.findall(r'page=(\d+)', lastpage)[0]   # the last-page number may have any number of digits
    nextpage(lastpage)
I've been poking at this for quite a while, and I can't work out why the login no longer seems to succeed.
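One way to narrow it down is to look at what the login endpoint actually returns before moving on to the job list. The snippet below is only a minimal check, not a fix: it reuses the session s, the login url and the data dict already defined above, and simply prints what comes back without assuming any particular response format.

    # minimal login check: reuse the session, login URL and form data from __main__
    resp = s.post(url=url, data=data, allow_redirects=True)
    print('status code:', resp.status_code)
    print('cookies after login:', s.cookies.get_dict())   # an empty cookie jar usually means the login was rejected
    print('start of the response body:')
    print(resp.text[:500])                                 # look for an error message, captcha hint or JSON status here

If the body shows a captcha, a changed form field name, or a redirect to a different login page, that would explain why a POST that used to work now fails; comparing it with the request the browser sends on a manual login (DevTools, Network tab) should make the difference visible.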