简单线性回归

import pandas as pd
#  导入orderedDict，为了建立有序的字典
from collections import OrderedDict
# 导入绘图包
import matplotlib.pyplot as plt
import matplotlib

准备数据集

# Prepare the data set.
# '学习时间' (study hours) is the feature column; '分数' (score) is the label.
examDict = {
    '学习时间': [
        0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
        2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50,
    ],
    '分数': [
        10, 22, 13, 43, 20, 22, 33, 50, 62, 48,
        55, 75, 62, 73, 81, 76, 64, 82, 90, 93,
    ],
}
# Copy the pairs into an OrderedDict so the columns keep their insertion order.
examOrderDict = OrderedDict(examDict.items())
# Build the DataFrame from the ordered mapping (one column per key).
examDf = pd.DataFrame(data=examOrderDict)
# Preview the first five rows; the result is only displayed in an
# interactive session such as a notebook.
examDf.head()

	学习时间	分数
0	0.50	10
1	0.75	22
2	1.00	13
3	1.25	43
4	1.50	20

提取特征和标签数据

# Extract the feature and label data from the exam DataFrame.
# examX: the '学习时间' (study hours) column, used as the feature.
# examY: the '分数' (score) column, used as the label.
examX, examY = examDf['学习时间'], examDf['分数']

绘制散点图

# Draw a scatter plot of the raw exam data: study hours on the x axis,
# score on the y axis, blue circular markers.
plt.scatter(examX, examY, marker='o', color='b', label='exam data')

# Label the axes.
plt.ylabel('Score')
plt.xlabel('Hours')

# Display the figure.
plt.show()

相关系数矩阵（应为 examDf.corr() 的输出，原文未给出对应代码）：

	学习时间	分数
学习时间	1.000000	0.923985
分数	0.923985	1.000000

回归分析

一般问题发生的原因有很多种，很难全部找到，实际业务中更是如此。通过相关分析可以确定<学习时间>和<分数>高度相关，但仅凭相关性无法采取具体行动。此时，就需要用到回归分析。

回归方程：y=a+bx,即一组数据的最佳拟合线

a:截距
b:回归系数

求a和b：

误差平方和  SSE=∑(实际值-预测值)²
通过最小二乘法使得误差平方和SSE最小，即可得出a和b

1、提取特征和标签

# Step 1: extract the feature and label data.
# Labels: the '分数' (score) column.
examY = examDf['分数']
# Features: the '学习时间' (study hours) column.
examX = examDf['学习时间']

2、建立训练数据和测试数据

train_test_split是交叉验证中常用的函数，功能是从样本中随机选取训练数据（train）和测试数据（test）

train_test_split(*arrays, **options)

第一个参数：所要划分的样本特征
第二个参数：所要划分的样本标签    
test_size : float, int or None, optional (default=None)，测试数据的量
    - If float,  0.0 ~1.0，表示占比
    - If int,表示测试数据的绝对数量 
    - If None, train_size的剩余部分作为测试数据
    - If ``train_size`` is also None, it will be set to 0.25.   
train_size : float, int, or None, (default=None)，训练数据的量
    - If float,  0.0 ~1.0，表示占比
    - If int,表示训练数据的绝对数量 
    - If None, test_size的剩余部分作为训练数据

random_state : int, RandomState instance or None, optional (default=None)
    - If int, random_state is the seed used by the random number generator;
    - If RandomState instance, random_state is the random number generator;
    - If None, the random number generator is the RandomState instance used
    by `np.random`.
shuffle : boolean, optional (default=True)，是否在分割前先打乱数据。If shuffle=False then stratify must be None.
stratify : array-like or None (default=None)，如果不为None，则以该数组作为类别标签进行分层抽样

# Build the training and test data sets.
from sklearn.model_selection import train_test_split

# Keep 80% of the samples for training (train_size=0.8); the rest become the
# test set. No random_state is passed, so the split is not fixed by a seed.
# The returned values are, in order: training features, test features,
# training labels, test labels.
(x_train, x_test,
 y_train, y_test) = train_test_split(examX, examY, train_size=0.8)

# Print the size of each data set.
# Features: original, training and test.
print(
    '原始数据特征:', examX.shape,
    '\n训练数据特征：', x_train.shape,
    '\n测试数据特征： ', x_test.shape,
)
# Labels: original, training and test.
print(
    '\n原始数据标签:', examY.shape,
    '\n训练数据标签：', y_train.shape,
    '\n测试数据标签： ', y_test.shape,
)

原始数据特征: (20,) 
训练数据特征： (16,) 
测试数据特征：  (4,)

原始数据标签: (20,) 
训练数据标签： (16,) 
测试数据标签：  (4,)

3、绘制散点图

# Draw a scatter plot of the split data.
# Training samples in blue.
plt.scatter(x_train, y_train, label='train data', color='b')
# Test samples in red.
plt.scatter(x_test, y_test, label='test data', color='r')

# Add a legend showing the 'train data' / 'test data' labels.
# The loc argument selects where the legend is placed; it accepts either the
# location string or the matching integer code:
#
#   Location string   Code
#   ===============   ====
#   'best'            0
#   'upper right'     1
#   'upper left'      2
#   'lower left'      3
#   'lower right'     4
#   'right'           5
#   'center left'     6
#   'center right'    7
#   'lower center'    8
#   'upper center'    9
#   'center'          10
#
# 'lower right' is the string form of location code 4.
plt.legend(loc='lower right')

# Label the x and y axes.
plt.ylabel('score')
plt.xlabel('Hours')

# Display the figure.
plt.show()

在这里插入图片描述

3、训练模型（使用训练数据）

# Linear regression, first attempt.
#
# The estimator expects the feature data as a 2-D array, whereas x_train and
# y_train are pandas Series whose .values are 1-D. The library's advice is:
#   - reshape with array.reshape(-1, 1) if the data has a single feature;
#   - reshape with array.reshape(1, -1) if it contains a single sample.
from sklearn.linear_model import LinearRegression

# Create the linear regression model.
model = LinearRegression()

# Fit on the un-reshaped 1-D arrays. This is the call that triggers the
# "Expected 2D array, got 1D array instead" ValueError shown after this cell.
model.fit(
    x_train.values,
    y_train.values,
)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)
…（省略报错中间信息）
ValueError: Expected 2D array, got 1D array instead:
array=[5.5  1.25 2.5  4.75 2.25 0.5  3.25 1.75 4.5  3.5  0.75 4.   2.   3.
 1.5  4.25].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Show the Python types of the training feature/label objects.
print(
    'x_train type:', type(x_train),
    '\ny_train type:', type(y_train),
)
# Show the shapes of the underlying arrays (both 1-D).
print(
    'x_train.values为一维数组,shape:\n', x_train.values.shape,
    '\ny_train.values为一维数组,shape:\n', y_train.values.shape,
)
# Show the shape after reshaping the features into a single column.
print(
    'x_train reshape后，XX行，1列:\n',
    x_train.values.reshape(-1, 1).shape,
)

x_train type: <class 'pandas.core.series.Series'> 
y_train type: <class 'pandas.core.series.Series'>
x_train.values为一维数组,shape:
 (16,) 
y_train.values为一维数组,shape:
 (16,)
x_train reshape后，XX行，1列:
 (16, 1)

# Linear regression, with the features reshaped to 2-D.
from sklearn.linear_model import LinearRegression

# Turn the 1-D training features into a 2-D array with one column
# (as many rows as there are samples).
X_train = x_train.values.reshape((-1, 1))

# Create the linear regression model.
model = LinearRegression()

# Train the model on the reshaped features and the training labels.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Regression function: y = a + b*x
#   a: intercept          (model.intercept_)
#   b: regression coefficient (model.coef_)
a, b = model.intercept_, model.coef_
print(
    '最佳拟合线的截距：a=', a,
    '最佳拟合线的回归系数:b=', b,
)

最佳拟合线的截距：a= 13.229672225416905 最佳拟合线的回归系数:b= [15.07901093]

# Plot the training scatter together with the best-fit line.

# Predicted values for the training features; these points define the line.
y_train_pred = model.predict(X_train)

# 1. Scatter plot of the training data.
plt.scatter(X_train, y_train, label='train data', color='b')

# 2. Best-fit line through the predicted values.
plt.plot(X_train, y_train_pred, label='best line', color='y', linewidth=2)

# 3. Legend and axis labels.
plt.legend(loc='upper left')
plt.ylabel('score')
plt.xlabel('Hours')

# 4. Display the figure.
plt.show()

4、评估模型（使用测试数据）

score(self, X, y, sample_weight=None)返回决定系数 R²，也称判定系数或拟合优度，反映了y的波动有多少百分比能被x的波动所描述。

R²=1-残差平方和/总平方和

当数据确定时，总平方和是确定的
残差平方和越小，R²越接近于1，拟合情况越好

# Turn the 1-D test features into a 2-D array with one column
# (as many rows as there are test samples).
X_test = x_test.values.reshape((-1, 1))

# Coefficient of determination R² on the test data. The run recorded in this
# document gave about 0.85, which indicates a reasonably good fit.
model.score(X_test, y_test)

0.8513510464294525

# Plot the training and test scatters together with the best-fit line.

# Predicted values for the training features; these points define the line.
y_train_pred = model.predict(X_train)

# 1. Scatter plot of the training data (blue).
plt.scatter(X_train, y_train, label='train data', color='b')

# 2. Best-fit line through the predicted training values.
plt.plot(X_train, y_train_pred, label='best line', color='y', linewidth=2)

# 3. Scatter plot of the test data (red).
plt.scatter(X_test, y_test, label='test data', color='r')

# 4. Legend and axis labels ('upper left' is the string form of loc code 2).
plt.legend(loc='upper left')
plt.ylabel('Score')
plt.xlabel('Hours')

# 5. Display the figure.
plt.show()