| 
 | 
 
 
 楼主 |
发表于 2020-11-15 15:19:45
|
显示全部楼层
 
 
 
 
 
你好,还在吗?能帮我试试这代码吗? 
我连续在www.goubanjia.com上爬了数百个代理IP，没有一个可以用，很奇怪！弄得我怀疑是自己电脑的问题。 
 
- from multiprocessing.pool import ThreadPool
 
 - import requests
 
 - from lxml import etree
 
 - import time
 
  
 
 
def open_url(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Raises requests.HTTPError for non-2xx responses and
    requests.Timeout if the server does not answer in time.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    }
    # timeout prevents the request from hanging forever; the local was also
    # renamed from `re` to `resp` so it no longer shadows the stdlib `re` name.
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text
 
 -     
 
 - def open_url_http(proxy):
 
 -     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',}
 
 -     try:
 
 -         res = requests.get(url=url_http, headers=headers, proxy=proxy)
 
 -         assert res.status_code == 200
 
 -         print('http://'+proxy, end='\t')
 
 -         con = res.json()['origin']
 
 -         if con == proxy:
 
 -             print('http://'+proxy+' 可以使用')
 
 -         else:
 
 -             print('error')
 
 -     except Exception as e:
 
 -             pass
 
  
def open_url_https(proxy):
    """Verify an HTTPS proxy by requesting an IP-echo page through it.

    proxy -- requests-style proxies dict such as {"https": "ip:port"}.
    Prints the proxy followed by ' 可以使用' when the echo page reports
    the proxy's IP; prints a short failure note otherwise.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    # 'ip:port' with any scheme prefix stripped, for display and comparison.
    address = proxy.get('https', '').split('//')[-1]
    try:
        # BUG FIX: the keyword is `proxies`, not `proxy`.  `proxy=` made
        # requests.get() raise TypeError on every call, silently swallowed
        # by the bare `except` — so no proxy could ever appear usable.
        res = requests.get(url=url_https, headers=headers, proxies=proxy, timeout=10)
        if res.status_code != 200:
            return
        print('https://' + address, end='\t')
        tree = etree.HTML(res.text)
        con = tree.xpath('//body/p//text()')
        # con[1] presumably holds the IP echoed by ip138 — TODO confirm
        # against the live page layout.  The original compared it against
        # the whole proxies dict, which can never be equal.
        if len(con) > 1 and con[1] == address.split(':')[0]:
            print('https://' + address + ' 可以使用')
        else:
            print('error')
    except Exception as e:
        # Report the failure instead of hiding it completely.
        print('https://' + address + ' failed: ' + type(e).__name__)
 
  
def get_proxies():
    """Scrape www.goubanjia.com and return ([http proxies], [https proxies]).

    Each entry is an 'ip:port' string, de-duplicated per protocol.

    NOTE(review): goubanjia is known to obfuscate listings client-side;
    the port assembled from the static HTML may not be the real one, which
    alone would explain proxies that never work — verify against the
    rendered page.
    """
    proxies_http = []
    proxies_https = []
    url = 'http://www.goubanjia.com/'
    re_text = open_url(url=url)

    tree = etree.HTML(re_text)
    for row in tree.xpath('//tbody/tr'):
        parts = []
        for node in row.xpath('./td[1]//*'):
            # Skip decoy elements hidden via inline CSS (anti-scraping trick).
            if node.xpath('./@style') in (["display: none;"], ["display:none;"]):
                continue
            text = node.xpath('./text()')
            if text:
                parts.append(text[0])
        # The last visible fragment is the port, everything before it the IP.
        # Guard: the original crashed with IndexError on a row that yielded
        # fewer than two fragments.
        if len(parts) < 2:
            continue
        address = ''.join(parts[:-1]) + ':' + parts[-1]

        protocol_texts = row.xpath('./td[3]//text()')
        if not protocol_texts:
            continue
        protocol = protocol_texts[0]
        if protocol == 'http':
            if address not in proxies_http:
                proxies_http.append(address)
        elif protocol == 'https':
            if address not in proxies_https:
                proxies_https.append(address)

    print(proxies_http)
    print(proxies_https)
    return proxies_http, proxies_https
 
 -     
 
  
 
if __name__ == '__main__':

    # Echo services used by the checker functions (module globals).
    url_https = 'https://202020.ip138.com/'
    url_http = 'http://httpbin.org/ip'

    def main():
        """Fetch a fresh proxy list and verify every entry on 5 worker threads."""
        proxies_http, proxies_https = get_proxies()
        pool = ThreadPool(5)
        print('main start')
        for each_proxy in proxies_http:
            # requests expects proxies-dict values to carry an explicit
            # scheme ('http://ip:port'); a bare 'ip:port' is ambiguous.
            pool.apply_async(open_url_http, ({"http": "http://" + each_proxy},))
        for each_proxy in proxies_https:
            pool.apply_async(open_url_https, ({"https": "https://" + each_proxy},))
        pool.close()
        pool.join()
        print('main over')

    for _ in range(5):
        main()
        time.sleep(0.5)
 
 -     
 
 
  复制代码 |   
 
 
 
 |