|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 花蝴蝶¤ 于 2017-9-1 21:56 编辑
用python2.7爬取股票信息的时候遇到了如下编码问题:
转码前打印了一次发现是ascii格式,然后通过.decode('ascii').encode('utf-8')进行转码,但是转码后还是ascii格式的。并且输出文件里面的内容格式也是有点奇怪 。麻烦帮忙看一下哪里写错了,谢谢
完整代码如下:
- # -*- coding: utf-8 -*-
- import requests
- from bs4 import BeautifulSoup
- import traceback
- import re
- import random
- import chardet
- def getHTMLText(url):
- headers_list = [
- {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},
- {
- "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11"},
- {
- "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.17 Safari/537.11"}
- ]
- header = random.choice(headers_list)
- try:
- r = requests.get(url = url, headers = header)
- r.raise_for_status()
- r.encoding = r.apparent_encoding
- return r.text
- #print(r.text)
- except:
- return ""
-
-
- def getStockList(lst, stockURL):
- html = getHTMLText(stockURL)
- # print(chardet.detect(html))
- soup = BeautifulSoup(html, "html.parser")
- a = soup.find_all("a")
- for i in a:
- try:
- href = i.attrs['href']
- lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
- except:
- continue
- # print(chardet.detect(lst))
- def getStockInfo(lst, stockURL, fpath):
- count = 0
- for stock in lst:
- url = stockURL + stock + ".html"
- html = getHTMLText(url)
- # print(chardet.detect(html))
- try:
- if html == "":
- continue
- infoDict = {}
- soup = BeautifulSoup(html, "html.parser")
- stockInfo = soup.find('div', attrs={'class':'stock-bets'})
- name = stockInfo.find_all(attrs = {'class':'bets-name'})[0]
- # print(name.text.split()[0])
- infoDict.update({'股票名称':name.text.split()[0]})
- # print(infoDict)
- keyList = stockInfo.find_all('dt')
- valueList = stockInfo.find_all('dd')
- for i in range(len(keyList)):
- key = keyList[i].text
- val = valueList[i].text
- # print(chardet.detect(key))
- # print(chardet.detect(val))
- infoDict[key] = val
- # print(infoDict)
- with open(fpath, 'a') as f:
- print(chardet.detect(str(infoDict)))
- content = str(infoDict).decode('ascii').encode('utf-8')
- print(chardet.detect(content))
- f.write( content + '\n')
- count += 1
- print("\r当前进度:{:.2f}%".format(count*100/len(lst)))
- except:
- count += 1
- print ("\r当前进度:{:.2f}%".format(count*100/len(lst)))
- continue
-
- def main():
- stock_list_url = "http://quote.eastmoney.com/stocklist.html"
- stock_info_url = "https://gupiao.baidu.com/stock/"
- output_file = './gupiao.txt'
- slist = []
- getStockList(slist, stock_list_url)
- getStockInfo(slist, stock_info_url, output_file)
-
- main()
复制代码
没注意你的python是2.7。
直接用str将字典转换成字符串这个方法在2.7是有问题的。
你可以换个方法。
- with .. as f:
- f.write('{')
- for i in dict:
- f.write(i+':'+dict[i])
- f.write('}')
复制代码
|
|