| 
 | 
 
 
发表于 2017-9-5 09:39:37
|
显示全部楼层
 
 
 
 本帖最后由 小锟 于 2017-9-6 12:12 编辑  
 
验证码在附件,请解压后放到C:\Users\Administrator\Desktop 
因为图片处理的时候没有去掉干扰曲线,所以经常会误判成T以及其他带横线的字母,k和x也不好分辨;另外字符切割也有问题(我只是用np.array_split等分切割的) 
测试了下准确率,单个字母为0.75 ,四个的话为0.35 
 
- #登陆失败的原因可能是验证码输入不正确,请看以下验证码和预测的是否一样
 
 - #如果出现index找不到的情况,是因为存放城市信息的js文件没有响应,再运行下就好
 
 - import requests
 
 - import os
 
 - os.chdir(r'C:\Users\Administrator\Desktop\checkcode')
 
 - import re
 
 - import urllib
 
 - import numpy as np
 
 - from PIL import Image
 
 - import matplotlib.pyplot as plt
 
 - from sklearn.feature_selection import VarianceThreshold
 
 - from sklearn import svm
 
  
# Pad one character slice to a fixed width and flatten it into a feature row
def split_checkcode(one, width=32):
    """Pad a 2-D character slice with 1-valued columns and flatten it.

    Columns of ones are added until the slice is ``width`` columns wide.
    The padding is distributed exactly as an alternating "right, left,
    right, ..." fill would: the left side gets ``missing // 2`` columns and
    the right side gets the remainder (so the right side receives the extra
    column when the amount is odd).

    Args:
        one: 2-D array holding one character slice.
        width: target number of columns (default 32).

    Returns:
        A 2-D array of shape ``(1, rows * columns)``. A slice that is
        already ``width`` columns wide or wider is flattened without padding.
    """
    missing = width - one.shape[1]
    if missing > 0:
        # Take the row count from the slice itself instead of assuming 40 rows.
        rows = one.shape[0]
        left = np.ones((rows, missing // 2), dtype=int)
        right = np.ones((rows, missing - missing // 2), dtype=int)
        one = np.hstack((left, one, right))
    return one.ravel()[np.newaxis, :]
 
 - #处理验证码
 
 - def checkcode_process(image):
 
 -     image = Image.open(image).convert('L')
 
 -     image_np = np.array(image)
 
 -     means = image_np.mean()
 
 -     rows , cols = image_np.shape
 
 -     for row in range(rows):
 
 -         for col in range(cols) :
 
 -             if image_np[row,col] < means:
 
 -                 image_np[row,col] = 0
 
 -             else:
 
 -                 image_np[row,col] = 1
 
 -     for i in range(1,rows-1):
 
 -         for j in range(1,cols-1) :
 
 -             num = 0
 
 -             if image_np[i - 1,j]:num +=1
 
 -             if image_np[i + 1,j]:num +=1
 
 -             if image_np[i,j - 1]:num +=1
 
 -             if image_np[i,j + 1]:num +=1
 
 -             if num >= 3 :
 
 -                     image_np[i,j] = 1
 
 -     for col in range(cols):
 
 -         if len(np.where(image_np[:,col] == 0)[0]) <= 5:
 
 -             image_np[:,col] = 1
 
 -     select = VarianceThreshold(0.11)
 
 -     new_image_np = select.fit_transform(image_np)
 
 -     all_np = np.array_split(new_image_np,4,axis=1)
 
 -     #Image.fromarray(all_np[0])
 
 -     new_g = [split_checkcode(i) for i in all_np]
 
 -     vector = np.vstack(new_g)
 
 -     return vector
 
 - #将所有验证码放一起训练
 
 - x = []
 
 - for item in range(100):
 
 -     image = str(item) + '.png'
 
 -     vector = checkcode_process(image)
 
 -     x.append(vector)
 
 - x = np.vstack( [x[i] for i in range(100)] )
 
  
- class_ = 'QWERTYUIOPASDFGHJKLZXCVBNM'
 
 - y_class = {i : class_[i]for i in range(26)}
 
 - y_class2 = {class_[i] : i for i in range(26)}
 
 - str1 = ''
 
 - with open('1.txt', 'r') as f :
 
 -     for i in f.readlines():
 
 -         str1 += i
 
 - y = str1.replace('\n','')
 
 - new_yy = [y_class2[i] for i in y]
 
 - svc = svm.SVC(gamma=0.001,C=100)
 
 - svc.fit(x[:],new_yy[:])
 
  
 
# Crawler section
def crawl(province, cityname, cphm, hpzl_text, enginenumber, classnumber):
    """Query weizhangwang.com for one vehicle and return the result text.

    The function downloads the site's city table, posts the vehicle form,
    fetches a captcha image (saved as ``101.png`` in the current directory),
    predicts its text with the module-level ``svc`` classifier, submits the
    captcha and finally requests the result.

    Args:
        province: province name; posted with the form and URL-quoted into
            the captcha-check URL.
        cityname: value used to locate the city's record in the city table.
        cphm: posted as the ``cphm`` form field (presumably the plate number).
        hpzl_text: plate-type label; must be a key of the plate-type table
            defined in this function.
        enginenumber: posted as ``enginenumber`` and sent as ``carEngineCode``.
        classnumber: posted as ``classnumber`` and sent as ``carCode``.

    Returns:
        The first text captured between ``>`` and ``<`` in the response's
        ``result_table`` if it contains '恭喜', otherwise ``'N/A'``. When that
        first capture is empty, the first ``<b>`` element's text is returned.

    Raises:
        ValueError: if ``cityname`` is not present in the downloaded city
            table (for example because the table request returned an
            incomplete response; retrying usually helps).
        KeyError: if ``hpzl_text`` is not a known plate type.
    """
    # Imported locally so the block does not rely on another package having
    # loaded the ``urllib.parse`` submodule (a bare ``import urllib`` does
    # not guarantee it).
    import ast
    from urllib.parse import quote

    data = {
        # Supplied by the caller
        'province': province,
        'cityname': cityname,
        'cphm': cphm,
        'hpzl_text': hpzl_text,
        'enginenumber': enginenumber,
        'classnumber': classnumber,
        # Fixed defaults
        'province_id': 'undefined',
        'id360': '200',
        'sourceline': 'js',
        'classlen': len(classnumber),
        'enginelen': len(enginenumber),
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
    }

    # Province / area codes live in a JS file: records are separated by '|'
    # and fields by ','. Empty fields are dropped before indexing.
    city_new = requests.get(
        r'http://static.weizhangwang.com/js/cityinfo_all.js?v=20178420',
        headers=header,
    )
    city_new.encoding = 'utf-8'
    records = [
        [field for field in record.split(',') if field]
        for record in city_new.text.split('|')[1:-1]
    ]
    city = next((record for record in records if cityname in record), None)
    if city is None:
        raise ValueError(
            'city %r was not found in the city table; the cityinfo_all.js '
            'request may have returned an incomplete response, try again'
            % (cityname,)
        )

    # Plate-type codes, copied by hand from the site's page source.
    hpzl = {
        '大型汽车': '01', '小型汽车': '02', '使馆汽车': '03', '领馆汽车': '04',
        '境外汽车': '05', '外籍汽车': '06', '两、三轮摩托车': '07',
        '轻便摩托车': '08', '使馆摩托车': '09', '领馆摩托车': '10',
        '境外摩托车': '11', '外籍摩托车': '12', '低速车': '13', '拖拉机': '14',
        '挂车': '15', '教练汽车': '16', '教练摩托车': '17',
        '临时入境汽车': '20', '临时入境摩托车': '21', '临时行驶车': '22',
        '警用汽车': '23', '警用摩托': '24', '其它': '99',
    }

    # Remaining form fields, taken from the matching city record.
    data['province_sn'] = city[3]
    data['city_sn'] = city[4]
    data['c_id'] = city[0]
    data['hpzl'] = hpzl[hpzl_text]
    data['areacode'] = city[5]
    data['city_id'] = city[6]
    data['pid'] = str(data['province_sn'])
    data['jhcc'] = city[8]
    data['km_jgjId'] = city[13]
    data['sjb_carorg'] = city[13]
    data['js_carorg'] = city[13]

    # The session is closed on exit so its connections are released.
    with requests.session() as sess:
        r = sess.post(
            r'http://www3.weizhangwang.com/queryallcar_2.php',
            headers=header,
            data=data,
        )
        r.encoding = 'utf-8'
        carIntNO = re.findall(r'carIntNO=(.*?)&', r.text)[0]

        check = sess.get(
            r'http://www3.weizhangwang.com/mysource/validatecode/code_gg.php',
            headers=header,
        )
        with open('101.png', 'wb') as f:
            f.write(check.content)

        # Predict the captcha text; it is printed so a failed login can be
        # compared against the saved image.
        vector = checkcode_process('101.png')
        code = ''.join(y_class[i] for i in svc.predict(vector))
        print(code)

        # Only the province is percent-encoded; every other value is
        # inserted verbatim, in the same order as before.
        provience = quote(province)
        codeurl = ''.join([
            'http://www3.weizhangwang.com/mysource/validatecode/chk_code_new.php?act=gg&carIntNO=', carIntNO,
            '&carCode=', classnumber,
            '&carCodeLen=', str(len(classnumber)),
            '&carEngineCode=', enginenumber,
            '&carEngineCodeLen=', str(len(enginenumber)),
            '&c_id=', data['c_id'],
            '&sourceLine=js',
            '&refer=weizhangwang',
            '&city=', data['jhcc'],
            '&km_jgjId=', data['km_jgjId'],
            '&js_carorg=', data['km_jgjId'],
            '&province=', provience,
        ])

        code_post = sess.post(codeurl, data={'code': code}, headers=header)
        # Gotcha noted by the original author: every '%' in the returned
        # token has to be escaped as '%25' before it goes into the next URL,
        # otherwise the query does not succeed.
        text = code_post.text.replace('%', '%25')
        url = ('http://www3.weizhangwang.com/mysource/querywzjson_code.php?k='
               + text + '&flag=-1')
        result = sess.get(url, headers=header)
        result.encoding = 'utf-8'
        payload = result.text

    # SECURITY: this used to be ``eval(result.text)``, which would execute
    # arbitrary code sent by the remote server. ``ast.literal_eval`` accepts
    # the same literal dict syntax but never runs code.
    text1 = ast.literal_eval(payload)
    finally_str = text1['result_table']
    result_str = re.findall(r'>(.*?)<', finally_str)[0]
    if result_str:
        return result_str if '恭喜' in result_str else 'N/A'
    return re.findall(r'<b>(.*?)<', finally_str)[0]
 
  
 
# Example run: query one vehicle and print whatever crawl() returns.
example_result = crawl('广东', '深圳', 'F7638', '大型汽车', '144528', '530080')
print(example_result)
 
 
  复制代码 
 |   
 
评分
- 
查看全部评分
 
 
 
 
 
 |