|
发表于 2017-9-5 09:39:37
|
显示全部楼层
本帖最后由 小锟 于 2017-9-6 12:12 编辑
验证码在附件,请解压后放到C:\Users\Administrator\Desktop
因为图片处理的时候没有去掉干扰曲线,所以经常会误判成T以及其他带横线的字母,k和x也不好分辨;最后还有字符切割的问题(我只是用np.array_split等宽切割的)
测试了下准确率,单个字母为0.75 ,四个的话为0.35
- #登录失败的原因可能是验证码识别不正确,请对比下载的验证码和打印出来的预测结果是否一样
- #如果出现index找不到的情况,是因为存放城市信息的js文件没有响应,再运行下就好
- import requests
- import os
- os.chdir(r'C:\Users\Administrator\Desktop\checkcode')
- import re
- import urllib
- import numpy as np
- from PIL import Image
- import matplotlib.pyplot as plt
- from sklearn.feature_selection import VarianceThreshold
- from sklearn import svm
def split_checkcode(one, width=32):
    """Pad one character image to a fixed width and flatten it to a row vector.

    Columns of ones are added alternately, starting on the right-hand side,
    until the array is ``width`` columns wide.  The padded array is then
    flattened row by row.

    Args:
        one: 2-D array holding a single segmented character.
        width: Target number of columns.  Inputs that are already at least
            this wide are not padded (and are flattened as they are).

    Returns:
        A 2-D array of shape ``(1, rows * columns)``.
    """
    rows = one.shape[0]
    missing = width - one.shape[1]
    if missing > 0:
        # Padding step 0, 2, 4, ... goes to the right and step 1, 3, ... to
        # the left, so the right side receives the extra column when
        # ``missing`` is odd.
        right = (missing + 1) // 2
        left = missing // 2
        one = np.hstack((
            np.ones((rows, left), dtype=int),
            one,
            np.ones((rows, right), dtype=int),
        ))
    return one.ravel()[np.newaxis, :]
def checkcode_process(image):
    """Binarize, clean up and segment a captcha image into character vectors.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        A 2-D array with one flattened row per character chunk; the image is
        cut into four chunks, each passed through ``split_checkcode``.
    """
    # Load as 8-bit grayscale; the context manager releases the file handle.
    with Image.open(image) as source:
        image_np = np.array(source.convert('L'))
    means = image_np.mean()
    rows, cols = image_np.shape

    # Threshold at the mean intensity: darker pixels become 0, the rest 1.
    image_np = (image_np >= means).astype(image_np.dtype)

    # Fill interior pixels that have at least three non-zero 4-neighbours.
    # This runs in place and in scan order on purpose: each decision sees the
    # pixels already rewritten above and to the left, so it must stay a
    # sequential loop.
    for i in range(1, rows - 1):
        for j in range(1, cols - 1):
            neighbours = (
                image_np[i - 1, j],
                image_np[i + 1, j],
                image_np[i, j - 1],
                image_np[i, j + 1],
            )
            if sum(1 for value in neighbours if value) >= 3:
                image_np[i, j] = 1

    # Blank out columns that contain five or fewer zero pixels.
    zeros_per_column = (image_np == 0).sum(axis=0)
    image_np[:, zeros_per_column <= 5] = 1

    # Treat every column as a feature and keep only the columns whose
    # variance exceeds 0.11, i.e. drop near-constant columns.
    select = VarianceThreshold(0.11)
    new_image_np = select.fit_transform(image_np)

    # Cut the remaining columns into four chunks of (nearly) equal width and
    # turn each chunk into one fixed-length row.
    all_np = np.array_split(new_image_np, 4, axis=1)
    return np.vstack([split_checkcode(chunk) for chunk in all_np])
# Build the training matrix from the sample captchas 0.png .. 99.png; every
# image contributes the rows returned by checkcode_process.
x = np.vstack([checkcode_process(str(item) + '.png') for item in range(100)])

# Two-way mapping between class indices and the 26 capital letters.
class_ = 'QWERTYUIOPASDFGHJKLZXCVBNM'
y_class = dict(enumerate(class_))
y_class2 = {letter: number for number, letter in enumerate(class_)}

# Labels are the letters stored in 1.txt with the line breaks removed
# (presumably in the same order as the images -- verify against the data set).
with open('1.txt', 'r') as f:
    str1 = f.read()
y = str1.replace('\n', '')
new_yy = [y_class2[letter] for letter in y]

# Fit the classifier that crawl() uses to read live captchas.
svc = svm.SVC(gamma=0.001, C=100)
svc.fit(x, new_yy)
def crawl(province, cityname, cphm, hpzl_text, enginenumber, classnumber):
    """Query weizhangwang.com for a vehicle and return the result message.

    The function posts the vehicle details, downloads the captcha, reads it
    with the module-level ``svc`` classifier, submits the guess and parses the
    returned result table.

    Side effects: writes the captcha to ``101.png`` in the current working
    directory and prints the predicted captcha text.

    Args:
        province: Province name; posted as-is and URL-quoted into the
            captcha check URL.
        cityname: City name; must equal one field of a record in the site's
            city table.
        cphm: Posted as the ``cphm`` form field (presumably the plate number).
        hpzl_text: Vehicle/plate type label; must be a key of the type table
            defined in this function.
        enginenumber: Engine number, sent as ``carEngineCode``.
        classnumber: Sent as ``carCode``.

    Returns:
        The first ``>...<`` text of the result table if it contains '恭喜',
        otherwise ``'N/A'``; when that first text is empty, the first
        ``<b>...<`` text instead.

    Raises:
        ValueError: If ``cityname`` is not found in the city table, or if the
            final response is not a plain literal.
        KeyError: If ``hpzl_text`` is not a known type label.
        IndexError: If an expected pattern is missing from a response.
    """
    # Local imports keep this function self-contained: ``import urllib`` alone
    # does not guarantee that the ``urllib.parse`` submodule is loaded.
    import ast
    from urllib.parse import quote

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
    }

    # User-supplied fields first, then the fixed defaults.  Key order is kept
    # because it determines the order of the posted form fields.
    data = {
        'province': province,
        'cityname': cityname,
        'cphm': cphm,
        'hpzl_text': hpzl_text,
        'enginenumber': enginenumber,
        'classnumber': classnumber,
        'province_id': 'undefined',
        'id360': '200',
        'sourceline': 'js',
        'classlen': len(classnumber),
        'enginelen': len(enginenumber),
    }

    # The province/city codes live in a JS file: records are separated by
    # '|' and fields by ','; empty fields are dropped.
    city_new = requests.get(r'http://static.weizhangwang.com/js/cityinfo_all.js?v=20178420', headers=header)
    city_new.encoding = 'utf-8'
    every_city = [
        [field for field in record.split(',') if field]
        for record in city_new.text.split('|')[1:-1]
    ]
    city = next((record for record in every_city if cityname in record), None)
    if city is None:
        raise ValueError(
            f'city {cityname!r} not found in the city table '
            '(the city-info request may not have answered; try again)'
        )

    # Type codes taken from the page source of the query form.
    hpzl = {
        '大型汽车': '01', '小型汽车': '02', '使馆汽车': '03', '领馆汽车': '04',
        '境外汽车': '05', '外籍汽车': '06', '两、三轮摩托车': '07', '轻便摩托车': '08',
        '使馆摩托车': '09', '领馆摩托车': '10', '境外摩托车': '11', '外籍摩托车': '12',
        '低速车': '13', '拖拉机': '14', '挂车': '15', '教练汽车': '16', '教练摩托车': '17',
        '临时入境汽车': '20', '临时入境摩托车': '21', '临时行驶车': '22', '警用汽车': '23',
        '警用摩托': '24', '其它': '99',
    }

    # Remaining form fields, all derived from the matched city record.
    data.update({
        'province_sn': city[3],
        'city_sn': city[4],
        'c_id': city[0],
        'hpzl': hpzl[hpzl_text],
        'areacode': city[5],
        'city_id': city[6],
        'pid': str(city[3]),
        'jhcc': city[8],
        'km_jgjId': city[13],
        'sjb_carorg': city[13],
        'js_carorg': city[13],
    })

    # One session for the whole exchange so cookies carry over between the
    # requests; the context manager closes its connections afterwards.
    with requests.session() as sess:
        r = sess.post(r'http://www3.weizhangwang.com/queryallcar_2.php', headers=header, data=data)
        r.encoding = 'utf-8'
        carIntNO = re.findall(r'carIntNO=(.*?)&', r.text)[0]

        # Download the captcha and read it with the trained classifier.
        check = sess.get(r'http://www3.weizhangwang.com/mysource/validatecode/code_gg.php', headers=header)
        with open('101.png', 'wb') as f:
            f.write(check.content)
        vector = checkcode_process('101.png')
        code = ''.join(y_class[label] for label in svc.predict(vector))
        print(code)

        # The province name has to be percent-encoded for the URL.
        provience = quote(province)
        codeurl = (
            'http://www3.weizhangwang.com/mysource/validatecode/chk_code_new.php?act=gg'
            f'&carIntNO={carIntNO}'
            f'&carCode={classnumber}'
            f'&carCodeLen={len(classnumber)}'
            f'&carEngineCode={enginenumber}'
            f'&carEngineCodeLen={len(enginenumber)}'
            f"&c_id={data['c_id']}"
            '&sourceLine=js'
            '&refer=weizhangwang'
            f"&city={data['jhcc']}"
            f"&km_jgjId={data['km_jgjId']}"
            f"&js_carorg={data['km_jgjId']}"
            f'&province={provience}'
        )
        code_post = sess.post(codeurl, data={'code': code}, headers=header)

        # Per the original author's note this is the easy-to-miss step: every
        # '%' in the returned token must be re-escaped as '%25', otherwise the
        # follow-up query does not go through.
        text = code_post.text.replace('%', '%25')
        url = 'http://www3.weizhangwang.com/mysource/querywzjson_code.php?k=' + text + '&flag=-1'
        result = sess.get(url, headers=header)
        result.encoding = 'utf-8'
        body = result.text

    # SECURITY: the original code ran eval() on this remote response, which
    # lets the server execute arbitrary code here.  literal_eval accepts the
    # same literal syntax but nothing executable.
    # NOTE(review): if the endpoint returns strict JSON, json.loads would be
    # the better parser -- confirm against a live response.
    text1 = ast.literal_eval(body)
    finally_str = text1['result_table']
    result_str = re.findall(r'>(.*?)<', finally_str)[0]
    if result_str:
        return result_str if '恭喜' in result_str else 'N/A'
    return re.findall(r'<b>(.*?)<', finally_str)[0]
# Example query; prints the message returned by crawl().
print(
    crawl(
        province='广东',
        cityname='深圳',
        cphm='F7638',
        hpzl_text='大型汽车',
        enginenumber='144528',
        classnumber='530080',
    )
)
复制代码
|
评分
-
查看全部评分
|