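# A failed login is most likely caused by a wrongly recognised captcha: compare the saved captcha image with the predicted text that is printed.
# If `index` cannot be found, the JS file that holds the city information did not respond; running the script again usually fixes it.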
import requests
import os
os.chdir(r'C:\Users\Administrator\Desktop\checkcode')
import re
import urllib
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn import svm
# Pad one slice of the captcha to a fixed width and flatten it into a row.
def split_checkcode(one):
    """Pad a 2-D slice to 32 columns with ones and flatten it to one row.

    Columns of ones are added alternately on the right and on the left
    (right first) until the slice is 32 columns wide.  The padding height
    follows the slice itself, so slices of any height are accepted.

    Args:
        one: 2-D array holding one slice of the processed captcha image.

    Returns:
        A 2-D array of shape ``(1, rows * width)``; ``width`` is 32 unless
        the slice was already wider, in which case it is left unpadded.
    """
    target_width = 32
    rows, width = one.shape[0], one.shape[1]
    missing = target_width - width
    if missing > 0:
        # Padding alternates right, left, right, ... so the right side
        # receives the extra column when `missing` is odd.
        left = missing // 2
        right = missing - left
        one = np.hstack((
            np.ones((rows, left), dtype=int),
            one,
            np.ones((rows, right), dtype=int),
        ))
    return one.ravel()[np.newaxis, :]
# Pre-process a captcha image into feature rows.
def checkcode_process(image):
    """Convert a captcha image into four stacked feature rows.

    The image is loaded as grayscale, binarised around its mean value,
    cleaned up, reduced to its high-variance columns, split into four
    slices along the width, and each slice is padded and flattened by
    ``split_checkcode``.

    Args:
        image: Value passed straight to ``Image.open`` (callers in this
            file pass a file name).

    Returns:
        The four flattened slices stacked vertically with ``np.vstack``.
    """
    pixels = np.array(Image.open(image).convert('L'))
    threshold = pixels.mean()
    height, width = pixels.shape

    # Binarise: values below the mean become 0, all others become 1.
    pixels = (pixels >= threshold).astype(pixels.dtype)

    # Fill interior pixels that have at least three non-zero direct
    # neighbours.  This runs in place and in scan order on purpose:
    # pixels set earlier influence the neighbours visited later.
    for r in range(1, height - 1):
        for c in range(1, width - 1):
            around = (
                pixels[r - 1, c],
                pixels[r + 1, c],
                pixels[r, c - 1],
                pixels[r, c + 1],
            )
            if sum(1 for value in around if value) >= 3:
                pixels[r, c] = 1

    # Blank every column that contains five or fewer zero pixels.
    zeros_per_column = (pixels == 0).sum(axis=0)
    pixels[:, zeros_per_column <= 5] = 1

    # Keep only the columns whose variance exceeds the threshold.
    selector = VarianceThreshold(0.11)
    informative = selector.fit_transform(pixels)

    slices = np.array_split(informative, 4, axis=1)
    return np.vstack([split_checkcode(piece) for piece in slices])
# Train one classifier on all sample captchas.
# Images 0.png .. 99.png are processed and their feature rows stacked.
x = np.vstack([checkcode_process(str(number) + '.png') for number in range(100)])

# Lookup tables: class index -> letter and letter -> class index.
class_ = 'QWERTYUIOPASDFGHJKLZXCVBNM'
y_class = dict(enumerate(class_))
y_class2 = {letter: position for position, letter in enumerate(class_)}

# 1.txt holds the labels; line breaks are dropped so that `y` is one
# continuous string of letters.
with open('1.txt', 'r') as f:
    str1 = f.read()
y = str1.replace('\n', '')
new_yy = [y_class2[letter] for letter in y]

svc = svm.SVC(gamma=0.001, C=100)
svc.fit(x, new_yy)
# Crawler part.
def crawl(province, cityname, cphm, hpzl_text, enginenumber, classnumber):
    """Run one query against weizhangwang.com and return its result text.

    The function downloads the site's city table, posts the query form,
    fetches a captcha, recognises it with the module-level ``svc`` model,
    submits the recognised code and finally reads the JSON-like answer.

    Args:
        province: Province name; posted in the form and URL-quoted into
            the captcha-check URL.
        cityname: City name; must appear as a field of a record in the
            site's ``cityinfo_all.js`` table.
        cphm: Posted as the ``cphm`` form field (presumably the plate
            number - confirm against the site).
        hpzl_text: Plate type; must be one of the keys of the ``hpzl``
            table below.
        enginenumber: String posted as ``enginenumber``; its length is
            sent as well.
        classnumber: String posted as ``classnumber``; its length is sent
            as well.

    Returns:
        The first text node of ``result_table`` when it contains '恭喜',
        ``'N/A'`` when that text node is non-empty but lacks it, otherwise
        the content of the first ``<b>`` element.

    Raises:
        ValueError: If ``cityname`` is not present in the city table.
        KeyError: If ``hpzl_text`` is not a known plate type.
        IndexError: If an expected token is missing from a response.
    """
    # Function-scope imports: the file only does `import urllib`, which
    # does not guarantee that the `urllib.parse` submodule is loaded, and
    # `ast` is not imported at file level.
    import ast
    from urllib.parse import quote

    data = {}
    # Values supplied by the caller.
    data['province'] = province
    data['cityname'] = cityname
    data['cphm'] = cphm
    data['hpzl_text'] = hpzl_text
    data['enginenumber'] = enginenumber
    data['classnumber'] = classnumber
    # Fixed defaults.
    data['province_id'] = 'undefined'
    data['id360'] = '200'
    data['sourceline'] = 'js'
    data['classlen'] = len(classnumber)
    data['enginelen'] = len(enginenumber)

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
    }

    # Province and area codes live in a JS file: '|'-separated records of
    # ','-separated fields.  The first and last chunk are discarded and
    # empty fields are dropped.
    city_new = requests.get(r'http://static.weizhangwang.com/js/cityinfo_all.js?v=20178420', headers=header)
    city_new.encoding = 'utf-8'
    every_city = [
        [field for field in record.split(',') if field]
        for record in city_new.text.split('|')[1:-1]
    ]
    city = next((record for record in every_city if cityname in record), None)
    if city is None:
        # Previously this surfaced later as an unbound `index` variable.
        raise ValueError(
            'city {!r} not found in cityinfo_all.js '
            '(the file may not have been served correctly)'.format(cityname)
        )

    # Plate-type codes, copied from the site's page source.
    hpzl = {
        '大型汽车': '01', '小型汽车': '02', '使馆汽车': '03', '领馆汽车': '04',
        '境外汽车': '05', '外籍汽车': '06', '两、三轮摩托车': '07', '轻便摩托车': '08',
        '使馆摩托车': '09', '领馆摩托车': '10', '境外摩托车': '11', '外籍摩托车': '12',
        '低速车': '13', '拖拉机': '14', '挂车': '15', '教练汽车': '16', '教练摩托车': '17',
        '临时入境汽车': '20', '临时入境摩托车': '21', '临时行驶车': '22',
        '警用汽车': '23', '警用摩托': '24', '其它': '99',
    }

    # Remaining form fields, taken from the matching city record.
    data['province_sn'] = city[3]
    data['city_sn'] = city[4]
    data['c_id'] = city[0]
    data['hpzl'] = hpzl[hpzl_text]
    data['areacode'] = city[5]
    data['city_id'] = city[6]
    data['pid'] = str(data['province_sn'])
    data['jhcc'] = city[8]
    data['km_jgjId'] = city[13]
    data['sjb_carorg'] = city[13]
    data['js_carorg'] = city[13]

    # The session is closed when the block exits, also on errors.
    with requests.session() as sess:
        r = sess.post(r'http://www3.weizhangwang.com/queryallcar_2.php', headers=header, data=data)
        r.encoding = 'utf-8'
        carIntNO = re.findall(r'carIntNO=(.*?)&', r.text)[0]

        # Download the captcha and let the trained classifier read it.
        check = sess.get(r'http://www3.weizhangwang.com/mysource/validatecode/code_gg.php', headers=header)
        with open('101.png', 'wb') as f:
            f.write(check.content)
        vector = checkcode_process('101.png')
        code = ''.join(y_class[label] for label in svc.predict(vector))
        print(code)

        # Only the province is percent-encoded; every other value is
        # inserted into the query string as-is.
        query = [
            ('act', 'gg'),
            ('carIntNO', carIntNO),
            ('carCode', classnumber),
            ('carCodeLen', str(len(classnumber))),
            ('carEngineCode', enginenumber),
            ('carEngineCodeLen', str(len(enginenumber))),
            ('c_id', data['c_id']),
            ('sourceLine', 'js'),
            ('refer', 'weizhangwang'),
            ('city', data['jhcc']),
            ('km_jgjId', data['km_jgjId']),
            ('js_carorg', data['km_jgjId']),
            ('province', quote(province)),
        ]
        codeurl = (
            'http://www3.weizhangwang.com/mysource/validatecode/chk_code_new.php?'
            + '&'.join('{}={}'.format(key, value) for key, value in query)
        )
        code_post = sess.post(codeurl, data={'code': code}, headers=header)

        # The returned token must have its '%' signs escaped before it is
        # placed in the next URL, otherwise the query is rejected.
        text = code_post.text.replace('%', '%25')
        url = 'http://www3.weizhangwang.com/mysource/querywzjson_code.php?k=' + \
            text + '&flag=-1'
        result = sess.get(url, headers=header)
        result.encoding = 'utf-8'

    # NOTE(security): the body comes from a remote server.  It used to be
    # passed to eval(), which would execute arbitrary code; literal_eval
    # accepts the same literal syntax but cannot run code.
    text1 = ast.literal_eval(result.text.strip())
    finally_str = text1['result_table']
    result_str = re.findall(r'>(.*?)<', finally_str)[0]
    if result_str:
        return result_str if '恭喜' in result_str else 'N/A'
    return re.findall(r'<b>(.*?)<', finally_str)[0]
# Example query; executed whenever the script runs, printing the outcome.
print(crawl(
    province='广东',
    cityname='深圳',
    cphm='F7638',
    hpzl_text='大型汽车',
    enginenumber='144528',
    classnumber='530080'
))