Approach: scrape the search-results pages for the job-listing URLs, crawl each of those sub-URLs, pull the target fields out with regular expressions, and store the results in MySQL.
import urllib.request
import re
import random
import mysql.connector

def use_proxy_1(url, proxy_add):
    # Fetch a page through the given proxy with a browser User-Agent.
    # Note: only http:// URLs are routed through this proxy; add an 'https'
    # key to the handler dict if https traffic should be proxied as well.
    proxy = urllib.request.ProxyHandler({'http': proxy_add})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode(encoding='UTF-8', errors='replace')
    return data

for page in range(1, 90):
    # Search-results page on zhaopin.com for the keyword "数据分析师" (data analyst).
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=489&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&sm=0&sg=5c6e7451ff0d48a6852cd7e35bb08be0&p=' + str(page)
    # Free proxies from the time of writing; they will go stale and need replacing.
    iplist = ['124.128.39.138:8088', '223.241.78.1064:8010', '119.183.220.224:8088']
    proxy_add = random.choice(iplist)
    response = use_proxy_1(url, proxy_add)
    # Strip all spaces so the listing-URL pattern below matches reliably.
    response = response.replace(' ', '')
    pat = r'weight:.*?so=\d*"href="(.*?)"target="'
    res = re.compile(pat, re.DOTALL).findall(response)
    print(page, len(res))
    for i in range(len(res)):  # one results page carries up to 60 postings
        # Normalize the scheme to https without producing "httpss".
        url1 = res[i] if res[i].startswith('https') else res[i].replace('http', 'https', 1)
        response1 = use_proxy_1(url1, proxy_add)
        # These patterns match the Chinese field labels on the job-detail page.
        pat_company = 'var Str_CompName = "(.*?)";'
        pat_job = '<h1>(.*?)</h1>'
        pat_location = '<li><span>工作地点:</span><strong><a target=.*?">(.*?)</a>.*?</strong></li>'
        pat_type = '<li><span>公司性质:</span><strong>(.*?)</strong></li>'
        pat_size = '<li><span>公司规模:</span><strong>(.*?)</strong></li>'
        pat_industry = '<li><span>公司行业:</span><strong><a target="_blank" href=".*?">(.*?)</a></strong></li>'
        pat_experience = '<li><span>工作经验:</span><strong>(.*?)</strong></li>'
        pat_education = '<li><span>最低学历:</span><strong>(.*?)</strong></li>'
        pat_salary = '<li><span>职位月薪:</span><strong>(.*?) '
        pat_number = '<li><span>招聘人数:</span><strong>(.*?) </strong></li>'
        pat_categories = '<li><span>职位类别:</span><strong><a target="_blank" href=".*?">(.*?)</a></strong></li>'
        try:
            company = re.compile(pat_company).findall(response1)[0]
            job = re.compile(pat_job).findall(response1)[0]
            location = re.compile(pat_location).findall(response1)[0]
            company_type = re.compile(pat_type).findall(response1)[0]  # renamed from "type" to avoid shadowing the built-in
            size = re.compile(pat_size).findall(response1)[0]
            industry = re.compile(pat_industry).findall(response1)[0]
            experience = re.compile(pat_experience).findall(response1)[0]
            education = re.compile(pat_education).findall(response1)[0]
            salary = re.compile(pat_salary).findall(response1)[0]
            number = re.compile(pat_number).findall(response1)[0]
            categories = re.compile(pat_categories).findall(response1)[0]
        except Exception as e:
            print(e)
            continue  # skip this posting rather than inserting undefined fields
        try:
            conn = mysql.connector.connect(
                user='root',
                password='**************',
                host='127.0.0.1',
                port=3306,
                database='jobs'
            )
            cursor = conn.cursor()
            # Parameterized insert: the driver handles quoting, which also keeps
            # the SQL from breaking when a scraped field contains quote characters.
            cursor.execute(
                "INSERT INTO zhilian(company,job,location,type,size,industry,"
                "experience,education,salary,number,categories) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (company, job, location, company_type, size, industry,
                 experience, education, salary, number, categories))
            conn.commit()
            cursor.close()
            conn.close()
            print('page ' + str(page) + ', record ' + str(i))
            print('************** record saved **************')
        except Exception as e:
            print(e)
            print('page ' + str(page) + ', record ' + str(i))
            print('************* !!!! save failed !!!! **************')
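The INSERT assumes a zhilian table already exists in the jobs database. The original post never shows the schema, so the sketch below is an assumption: the column names are taken from the INSERT statement, while the types and sizes are guesses sized generously for scraped text.

import mysql.connector

# Hypothetical schema: column names come from the INSERT above;
# the types, sizes, and charset are assumptions, not from the post.
ddl = """
CREATE TABLE IF NOT EXISTS zhilian (
    id INT AUTO_INCREMENT PRIMARY KEY,
    company VARCHAR(255),
    job VARCHAR(255),
    location VARCHAR(255),
    type VARCHAR(64),
    size VARCHAR(64),
    industry VARCHAR(255),
    experience VARCHAR(64),
    education VARCHAR(64),
    salary VARCHAR(64),
    number VARCHAR(64),
    categories VARCHAR(255)
) DEFAULT CHARSET = utf8mb4
"""

conn = mysql.connector.connect(user='root', password='**************',
                               host='127.0.0.1', port=3306, database='jobs')
cursor = conn.cursor()
cursor.execute(ddl)
cursor.close()
conn.close()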
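The free proxies hard-coded in iplist were flaky even at the time and are certainly dead by now, so before a run it can help to filter the list down to proxies that still respond. A minimal sketch, where proxy_alive is a hypothetical helper and the probe URL and timeout are arbitrary choices, not anything from the original post:

import urllib.request

def proxy_alive(proxy_add, timeout=5):
    # Hypothetical helper: send one request through the proxy and treat
    # any failure (connection refused, timeout, bad gateway) as "dead".
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': proxy_add}))
    try:
        opener.open('http://www.baidu.com', timeout=timeout)
        return True
    except Exception:
        return False

iplist = ['124.128.39.138:8088', '223.241.78.1064:8010', '119.183.220.224:8088']
iplist = [p for p in iplist if proxy_alive(p)]  # keep only proxies that answer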