Approach: crawl the URL in the code to collect the sub-URLs (job detail pages), crawl each sub-URL and extract the target information with regular expressions, then store the results in MySQL.

import urllib.request
import re
import random
import mysql.connector
def use_proxy_1(url, proxy_add):
    """Fetch a URL through the given HTTP proxy and return the decoded HTML."""
    proxy = urllib.request.ProxyHandler({'http': proxy_add})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Send a browser-like User-Agent so the request is less likely to be blocked
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode(encoding='UTF-8', errors='replace')
    return data
for page in range(1, 90):
    # Search-result list page; p= is the page number
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=489&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&sm=0&sg=5c6e7451ff0d48a6852cd7e35bb08be0&p=' + str(page)
    iplist = ['124.128.39.138:8088', '223.241.78.1064:8010', '119.183.220.224:8088']
    proxy_add = random.choice(iplist)
    response = use_proxy_1(url, proxy_add)
    # Strip spaces so the patterns below (written without spaces) still match
    response = response.replace(' ', '')
    # Extract the detail-page URL of every posting on the list page
    pat = r'weight:.*?so=\d*"href="(.*?)"target="'
    res = re.compile(pat, re.DOTALL).findall(response)
    print(page, len(res))
    for i in range(0, min(60, len(res))):  # at most 60 postings per list page
        # Detail pages are served over https; upgrade http links, leave https ones alone
        url1 = str(res[i]).replace('http://', 'https://', 1)
        response1 = use_proxy_1(url1, proxy_add)
        # Regular expressions for the fields on a job detail page
        pat_company = r'var Str_CompName = "(.*?)";'
        pat_job = r'<h1>(.*?)</h1>'
        pat_location = r'<li><span>工作地点:</span><strong><a target=.*?">(.*?)</a>.*?</strong></li>'
        pat_type = r'<li><span>公司性质:</span><strong>(.*?)</strong></li>'
        pat_size = r'<li><span>公司规模:</span><strong>(.*?)</strong></li>'
        pat_industry = r'<li><span>公司行业:</span><strong><a target="_blank" href=".*?">(.*?)</a></strong></li>'
        pat_experience = r'<li><span>工作经验:</span><strong>(.*?)</strong></li>'
        pat_education = r'<li><span>最低学历:</span><strong>(.*?)</strong></li>'
        pat_salary = r'<li><span>职位月薪:</span><strong>(.*?) '
        pat_number = r'<li><span>招聘人数:</span><strong>(.*?) </strong></li>'
        pat_categories = r'<li><span>职位类别:</span><strong><a target="_blank" href=".*?">(.*?)</a></strong></li>'
        try:
            company = re.compile(pat_company).findall(response1)[0]
            job = re.compile(pat_job).findall(response1)[0]
            location = re.compile(pat_location).findall(response1)[0]
            company_type = re.compile(pat_type).findall(response1)[0]
            size = re.compile(pat_size).findall(response1)[0]
            industry = re.compile(pat_industry).findall(response1)[0]
            experience = re.compile(pat_experience).findall(response1)[0]
            education = re.compile(pat_education).findall(response1)[0]
            salary = re.compile(pat_salary).findall(response1)[0]
            number = re.compile(pat_number).findall(response1)[0]
            categories = re.compile(pat_categories).findall(response1)[0]
        except Exception as e:
            # A field is missing on this page: skip the posting instead of
            # inserting undefined or stale values below
            print(e)
            continue
        try:
            conn = mysql.connector.connect(
                user='root',
                password='**************',
                host='127.0.0.1',
                port=3306,
                database='jobs'
            )
            cursor = conn.cursor()
            # Parameterized query: string concatenation breaks as soon as a scraped
            # value contains a quote character
            cursor.execute(
                "INSERT INTO zhilian(company,job,location,type,size,industry,experience,"
                "education,salary,number,categories) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (company, job, location, company_type, size, industry, experience,
                 education, salary, number, categories)
            )
            conn.commit()
            cursor.close()
            conn.close()
            print('Page ' + str(page) + ', record ' + str(i))
            print('************** record saved successfully **************')
        except Exception as e:
            print(e)
            print('Page ' + str(page) + ', record ' + str(i))
            print('*************!!!! save failed !!!! **************')
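The INSERT above assumes a zhilian table already exists in the jobs database. Below is a minimal sketch of a schema that would accept these rows; the column types and lengths are assumptions for illustration, not taken from the original post.

import mysql.connector

conn = mysql.connector.connect(user='root', password='**************',
                               host='127.0.0.1', port=3306, database='jobs')
cursor = conn.cursor()
# Assumed schema: every scraped field is stored as plain text
cursor.execute("""
    CREATE TABLE IF NOT EXISTS zhilian (
        id INT AUTO_INCREMENT PRIMARY KEY,
        company VARCHAR(255),
        job VARCHAR(255),
        location VARCHAR(255),
        type VARCHAR(100),
        size VARCHAR(100),
        industry VARCHAR(255),
        experience VARCHAR(100),
        education VARCHAR(100),
        salary VARCHAR(100),
        number VARCHAR(100),
        categories VARCHAR(255)
    )
""")
conn.commit()
cursor.close()
conn.close()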