3.2.1 Principle Overview
1. Ordinary Linear Regression
2. Generalized Linear Models
3. Logistic Regression
3.2.2 Algorithm Steps
3.2.3 Hands-On Practice
1. Dataset
2. sklearn Implementation
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import numpy as np
import pytest

def load_data():
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.25, random_state=0)

@pytest.fixture
def data():
    """Fixture to load data."""
    return load_data()

def test_LinearRegression(data):
    X_train, X_test, y_train, y_test = data
    regression = linear_model.LinearRegression()
    regression.fit(X_train, y_train)
    print('\nCoefficients:%s, intercept:%.2f' % (regression.coef_, regression.intercept_))
    print("Residual sum of squares:%.2f" % np.mean((regression.predict(X_test) - y_test) ** 2))
    print('Score:%.2f' % regression.score(X_test, y_test))
The output is as follows:
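For reference, the "Residual sum of squares" printed above is in fact the mean squared error over the m test samples, and score is the coefficient of determination R² that LinearRegression.score returns:

\[ \mathrm{MSE} = \frac{1}{m}\sum_{i=1}^{m}(\hat{y}_i - y_i)^2, \qquad R^2 = 1 - \frac{\sum_{i=1}^{m}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{m}(y_i - \bar{y})^2}. \]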
def load_iris():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    return train_test_split(x, y, test_size=0.3, random_state=1, shuffle=True, stratify=y)

@pytest.fixture
def data2():
    """Fixture to load data."""
    return load_iris()

def test_LogisticRegression(data2):
    X_train, X_test, y_train, y_test = data2
    regression = linear_model.LogisticRegression()
    regression.fit(X_train, y_train)
    print('\nCoefficients:%s, intercept:%s' % (regression.coef_, regression.intercept_))
    print('Score:%.2f' % regression.score(X_test, y_test))
The output is as follows:
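For reference, binary logistic regression models the probability of class 1 by passing a linear score through the sigmoid function,

\[ P(y=1 \mid x) = \sigma(w^\top x + b) = \frac{1}{1 + e^{-(w^\top x + b)}}, \]

and LogisticRegression estimates w and b by regularized maximum likelihood.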
def test_LogisticRegression_multinomial(data2):
    X_train, X_test, y_train, y_test = data2
    regression = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
    regression.fit(X_train, y_train)
    print('\nCoefficients:%s, intercept:%s' % (regression.coef_, regression.intercept_))
    print('Score:%.2f' % regression.score(X_test, y_test))
The output is as follows:
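With multi_class='multinomial', a single softmax model is fit jointly over all K classes instead of K one-vs-rest binary classifiers:

\[ P(y=k \mid x) = \frac{\exp(w_k^\top x + b_k)}{\sum_{j=1}^{K} \exp(w_j^\top x + b_j)}. \]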
import matplotlib.pyplot as plt

def test_LogisticRegression_C(data2):
    X_train, X_test, y_train, y_test = data2
    # Sweep the inverse regularization strength C over a log-spaced grid
    Cs = np.logspace(-2, 4, num=100)
    scores = []
    for C in Cs:
        regression = linear_model.LogisticRegression(C=C)
        regression.fit(X_train, y_train)
        scores.append(regression.score(X_test, y_test))
    # Plot test accuracy against C on a logarithmic x-axis
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(Cs, scores)
    ax.set_xlabel(r"C")
    ax.set_ylabel(r"score")
    ax.set_xscale('log')
    ax.set_title(r"Logistic Regression")
    plt.show()
The output is as follows:
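Here C is the inverse regularization strength: smaller values of C regularize more heavily. For L2-penalized logistic regression, scikit-learn minimizes

\[ \min_{w,b}\; \frac{1}{2} w^\top w + C \sum_{i=1}^{m} \log\bigl(1 + \exp(-y_i (w^\top x_i + b))\bigr), \qquad y_i \in \{-1, +1\}, \]

so the curve above shows how test accuracy responds as the penalty is relaxed.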
3. Algorithm Implementation
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np

iris = load_iris()
data = iris.data
target = iris.target
# Keep the first 100 samples (classes 0 and 1) and two features:
# sepal length (column 0) and petal length (column 2)
X = data[0:100, [0, 2]]
y = target[0:100]
label = np.array(y)
# Plot each class with its own marker and color
index_0 = np.where(label == 0)
plt.scatter(X[index_0, 0], X[index_0, 1], marker='x', color='b', label='0', s=15)
index_1 = np.where(label == 1)
plt.scatter(X[index_1, 0], X[index_1, 1], marker='o', color='r', label='1', s=15)
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()
The output is as follows:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris

# The target y in the Iris dataset is a vector of integer class labels,
# but this logistic regression model expects y in binary form (0 and 1),
# so the labels must be converted accordingly.
class LogisticRegressionBinary(object):
    def __init__(self):
        self.W = None

    def train(self, X, y, lr=0.01, num_iters=5000):
        num_train, num_feature = X.shape
        # Initialize the weights with small random values
        self.W = 0.001 * np.random.randn(num_feature, 1).reshape((-1, 1))
        loss = []
        for i in range(num_iters):
            error, dW = self.compute_loss(X, y)
            self.W += -lr * dW
            loss.append(error)
            if i % 200 == 0:
                print('i= %d, error= %f' % (i, error))
        return loss

    def compute_loss(self, X, y):
        num_train = X.shape[0]
        h = self.output(X)
        # Cross-entropy loss and its gradient with respect to W
        loss = -np.sum((y * np.log(h) + (1 - y) * np.log(1 - h))) / num_train
        dW = X.T.dot(h - y) / num_train
        return loss, dW

    def output(self, X):
        g = np.dot(X, self.W)
        return self.sigmoid(g)

    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    def predict(self, X_test):
        h = self.output(X_test)
        return np.where(h >= 0.5, 1, 0)

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# The iris dataset has three classes (0, 1, 2); keep only classes 0 and 1
# for binary classification
binary_filter = y < 2
X = X[binary_filter]
y = y[binary_filter].reshape((-1, 1))
# Prepend a column of ones to X to account for the intercept term
one = np.ones((X.shape[0], 1))
X_train = np.hstack((one, X))
# Train the logistic regression model for binary classification
classify = LogisticRegressionBinary()
loss = classify.train(X_train, y)
# Print the learned weights
print("Learned weights:\n", classify.W)
# Plot the loss curve over iterations
plt.plot(loss)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.title('Loss curve for Logistic Regression')
plt.show()
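As a quick follow-up, a minimal sketch (reusing the names classify, X_train, and y from the listing above; it is not part of the original code) that checks the model's accuracy on its own training data with the class's predict method:

y_hat = classify.predict(X_train)   # 0/1 predictions, shape (100, 1)
train_acc = np.mean(y_hat == y)     # fraction of correct predictions
print('Training accuracy: %.2f' % train_acc)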
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris

# Reuse the LogisticRegressionBinary class defined above, this time with only
# two features so that the decision boundary can be drawn in the plane.
iris = load_iris()
data = iris.data
target = iris.target
X = data[0:100, [0, 2]]
y = target[0:100]
y = y.reshape((-1, 1))
# Prepend a column of ones for the intercept term
one = np.ones((X.shape[0], 1))
X_train = np.hstack((one, X))
classify = LogisticRegressionBinary()
loss = classify.train(X_train, y)

label = np.array(y)
index_0 = np.where(label == 0)
plt.scatter(X[index_0, 0], X[index_0, 1], marker='x', c='b', label='0', s=15)
index_1 = np.where(label == 1)
plt.scatter(X[index_1, 0], X[index_1, 1], marker='o', c='r', label='1', s=15)
# Draw the decision boundary W0 + W1*x1 + W2*x2 = 0
x1 = np.arange(4, 7.5, 0.5)
x2 = (-classify.W[0] - classify.W[1] * x1) / classify.W[2]
plt.plot(x1, x2, color='black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()
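The boundary line comes from setting the predicted probability to 0.5: since \(\sigma(z) = 0.5\) exactly when \(z = 0\), the boundary is the set of points satisfying

\[ W_0 + W_1 x_1 + W_2 x_2 = 0 \quad\Longrightarrow\quad x_2 = \frac{-W_0 - W_1 x_1}{W_2}, \]

which is exactly the expression used for x2 in the listing above.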
3.2.4 Experiment
1. Purpose
2. Data
3. Requirements
4. Code
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target
# Map the integer labels to class names
iris_df['species'] = iris_df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
# Draw the scatter plot
sns.scatterplot(data=iris_df, x='sepal length (cm)', y='sepal width (cm)', hue='species')
plt.title('Iris Dataset Scatter Plot')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.legend(title='Species')
plt.show()
The output is as follows:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load the diabetes dataset
diabetes = load_diabetes()
X = diabetes.data    # feature matrix
y = diabetes.target  # target values
# Reshape y into a column vector
y = y.reshape(-1, 1)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Initialize the weights W and bias b to zero
W = np.zeros((X_train.shape[1], 1))  # X_train.shape[1] is the number of features
b = 0
# With W and b both zero, the initial predictions h(x) are all zero
y_pred = np.dot(X_train, W) + b
# Compute the initial loss (mean squared error with a 1/2 factor)
initial_loss = (1 / (2 * len(y_train))) * np.sum((y_pred - y_train) ** 2)
print("Initial loss on the diabetes dataset:", initial_loss)
The output is as follows:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Keep only the Setosa and Versicolor classes (target == 0 is Setosa, target == 1 is Versicolor)
mask = y < 2
X = X[mask]
y = y[mask]
# Recode the target as 0/1 (Setosa becomes 1, Versicolor becomes 0)
y = (y == 0).astype(int)
# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Gradient of the average cross-entropy loss
def compute_gradient(X, y, theta):
    m = X.shape[0]
    h = sigmoid(X @ theta)
    gradient = (X.T @ (h - y)) / m
    return gradient

# Gradient descent
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    for i in range(num_iterations):
        gradient = compute_gradient(X, y, theta)
        theta = theta - learning_rate * gradient
    return theta
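# Each gradient-descent step moves theta a small distance against the gradient:
#     theta <- theta - learning_rate * (1/m) * X^T (sigmoid(X @ theta) - y)
# Many small first-order steps are needed, hence num_iterations=1000 below.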
# Hessian of the average cross-entropy loss
def compute_hessian(X, theta):
    m = X.shape[0]
    h = sigmoid(X @ theta)
    R = np.diag(h * (1 - h))
    H = X.T @ R @ X / m
    return H

# Newton's method
def newton_method(X, y, theta, num_iterations):
    for i in range(num_iterations):
        gradient = compute_gradient(X, y, theta)
        hessian = compute_hessian(X, theta)
        theta = theta - np.linalg.inv(hessian) @ gradient
    return theta
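# Newton's method rescales the gradient by the inverse curvature: with
# R = diag(h * (1 - h)), the Hessian is H = (1/m) * X^T R X and each update is
#     theta <- theta - H^{-1} * gradient,
# which is why 10 Newton iterations suffice where gradient descent needs ~1000.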
# Prediction: classify as 1 when the predicted probability reaches 0.5
def predict(X, theta):
    return (sigmoid(X @ theta) >= 0.5).astype(int)

# k-fold cross-validation
def cross_validate_model(X, y, method, num_folds=5, learning_rate=0.01, num_iterations=1000):
    kf = KFold(n_splits=num_folds)
    accuracies = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        theta = np.zeros(X_train.shape[1])
        # Optimize the parameters with the chosen method
        if method == 'gd':
            theta_optimal = gradient_descent(X_train, y_train, theta, learning_rate, num_iterations)
        elif method == 'newton':
            theta_optimal = newton_method(X_train, y_train, theta, num_iterations=10)
        # Predict on the validation fold
        y_pred = predict(X_val, theta_optimal)
        # Record the fold accuracy
        accuracy = np.mean(y_pred == y_val)
        accuracies.append(accuracy)
    avg_accuracy = np.mean(accuracies)
    return avg_accuracy

# Plot the decision boundary
def plot_decision_boundary(X, y, theta, title):
    # Build a grid covering the data range
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))
    # Predicted probability at every grid point
    Z = sigmoid(np.c_[xx.ravel(), yy.ravel()] @ theta)
    Z = Z.reshape(xx.shape)
    # Shade the two decision regions and overlay the data points
    plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], alpha=0.5, cmap='coolwarm')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', cmap='coolwarm')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.title(title)
    plt.show()
# Main program
if __name__ == "__main__":
    # Cross-validate both gradient descent and Newton's method
    accuracy_gd = cross_validate_model(X, y, method='gd', num_folds=5)
    accuracy_newton = cross_validate_model(X, y, method='newton', num_folds=5)
    print(f"Average cross-validation accuracy (gradient descent): {accuracy_gd:.9f}")
    print(f"Average cross-validation accuracy (Newton's method): {accuracy_newton:.9f}")
    # Keep only the first two features for visualization
    X_selected = X[:, :2]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    # Optimize with gradient descent
    theta = np.zeros(X_train.shape[1])
    theta_optimal_gd = gradient_descent(X_train, y_train, theta, learning_rate=0.01, num_iterations=1000)
    # Optimize with Newton's method
    theta_optimal_newton = newton_method(X_train, y_train, theta, num_iterations=10)
    # Plot the decision boundaries
    plot_decision_boundary(X_train, y_train, theta_optimal_gd, 'Decision boundary (gradient descent)')
    plot_decision_boundary(X_train, y_train, theta_optimal_newton, "Decision boundary (Newton's method)")
Results: