马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 花蝴蝶¤ 于 2017-9-1 21:56 编辑
用python2.7爬取股票信息的时候遇到了如下编码问题:
转码前打印了一次发现是ascii格式,然后通过.decode('ascii').encode('utf-8')进行转码,但是转码后还是ascii格式的。并且输出文件里面的内容格式也是有点奇怪 。麻烦帮忙看一下哪里写错了,谢谢
完整代码如下:# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import traceback
import re
import random
import chardet
def getHTMLText(url):
headers_list = [
{
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},
{
"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11"},
{
"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.17 Safari/537.11"}
]
header = random.choice(headers_list)
try:
r = requests.get(url = url, headers = header)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
#print(r.text)
except:
return ""
def getStockList(lst, stockURL):
html = getHTMLText(stockURL)
# print(chardet.detect(html))
soup = BeautifulSoup(html, "html.parser")
a = soup.find_all("a")
for i in a:
try:
href = i.attrs['href']
lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
except:
continue
# print(chardet.detect(lst))
def getStockInfo(lst, stockURL, fpath):
count = 0
for stock in lst:
url = stockURL + stock + ".html"
html = getHTMLText(url)
# print(chardet.detect(html))
try:
if html == "":
continue
infoDict = {}
soup = BeautifulSoup(html, "html.parser")
stockInfo = soup.find('div', attrs={'class':'stock-bets'})
name = stockInfo.find_all(attrs = {'class':'bets-name'})[0]
# print(name.text.split()[0])
infoDict.update({'股票名称':name.text.split()[0]})
# print(infoDict)
keyList = stockInfo.find_all('dt')
valueList = stockInfo.find_all('dd')
for i in range(len(keyList)):
key = keyList[i].text
val = valueList[i].text
# print(chardet.detect(key))
# print(chardet.detect(val))
infoDict[key] = val
# print(infoDict)
with open(fpath, 'a') as f:
print(chardet.detect(str(infoDict)))
content = str(infoDict).decode('ascii').encode('utf-8')
print(chardet.detect(content))
f.write( content + '\n')
count += 1
print("\r当前进度:{:.2f}%".format(count*100/len(lst)))
except:
count += 1
print ("\r当前进度:{:.2f}%".format(count*100/len(lst)))
continue
def main():
stock_list_url = "http://quote.eastmoney.com/stocklist.html"
stock_info_url = "https://gupiao.baidu.com/stock/"
output_file = './gupiao.txt'
slist = []
getStockList(slist, stock_list_url)
getStockInfo(slist, stock_info_url, output_file)
main()
没注意你的python是2.7。
直接用str将字典转换成字符串这个方法在2.7是有问题的。
你可以换个方法。 with .. as f:
f.write('{')
for i in dict:
f.write(i+':'+dict[i])
f.write('}')
|