import matplotlib.pyplot as plt
import pandas as pd
from collections import OrderedDict
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Sample data set: study time ('学习时间') paired position-by-position with
# the exam score ('分数') obtained; 20 observations in total.
examDict = {
    '学习时间': [
        0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5,
        2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.0,
    ],
    '分数': [
        10, 22, 13, 43, 20, 22, 33, 50, 62, 48,
        55, 75, 62, 73, 81, 76, 64, 82, 90, 93,
    ],
}
# Copy the mapping into an OrderedDict (same key order, same list objects)
# and tabulate it, one DataFrame column per key.
examOrderedDict = OrderedDict((key, examDict[key]) for key in examDict)
examDf = pd.DataFrame(examOrderedDict)
# Feature column and label column, each selected as a Series over all rows.
exam_X, exam_y = (examDf.loc[:, column] for column in ('学习时间', '分数'))
# Visualise the complete data set: one blue point per (exam_X, exam_y) pair.
plt.scatter(exam_X, exam_y, label='exam data', color='b')
plt.xlabel('Hours')
plt.ylabel('Score')
# 'upper left' is the string form of legend location code 2.
plt.legend(loc='upper left')
plt.show()
# Build the training and test data: 80% of the samples go to the training
# subset. No random_state is given, so the partition differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    exam_X,
    exam_y,
    train_size=0.8,
)

# Report the shape of the full feature column and of each subset.
for _caption, _subset in (
    ('X的原始数据特征:', exam_X),
    ('X的训练数据特征:', X_train),
    ('X的测试数据特征:', X_test),
):
    print(_caption, _subset.shape)

# Draw both subsets on the same axes: training points in blue, test in red.
for _xs, _ys, _colour, _tag in (
    (X_train, y_train, 'b', 'train data'),
    (X_test, y_test, 'r', 'test data'),
):
    plt.scatter(_xs, _ys, color=_colour, label=_tag)
plt.legend(loc='best')
plt.show()