[Машинное обучение от нуля до одного sklearn] · 0.1.1 · Линейная аппроксимация
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# SGDRegressor is imported from the public package path. The private
# module path sklearn.linear_model.stochastic_gradient is not a stable
# import location and is unavailable in later scikit-learn releases.
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
class Args:
    """Configuration constants for the linear-regression demo."""

    seed = 1234  # passed to np.random.seed and to train_test_split
    data_file = "sample_data.csv"  # not referenced in the visible script
    num_samples = 100  # number of synthetic points to generate
    train_size = 0.75  # not referenced in the visible script
    test_size = 0.25  # fraction of rows held out by train_test_split
    num_epochs = 100  # passed to SGDRegressor as max_iter


args = Args()
# Seed NumPy's global random generator so that results are reproducible
np.random.seed(args.seed)
# Generate the data
def generate_data(num_samples, *, slope=3.65, intercept=10, noise=20):
    """Return a noisy linear dataset ``(X, y)``.

    ``X`` is ``0, 1, ..., num_samples - 1`` and
    ``y = slope * X + intercept + e``, where ``e`` is drawn uniformly from
    ``[-noise, noise)`` using NumPy's global random generator (seed it
    beforehand for reproducible output).

    Args:
        num_samples: Number of points to generate.
        slope: Slope of the underlying line.
        intercept: Intercept of the underlying line.
        noise: Half-width of the uniform noise interval.

    Returns:
        A tuple ``(X, y)`` of 1-D arrays of length ``num_samples``.
    """
    X = np.arange(num_samples)
    random_noise = np.random.uniform(-noise, noise, size=num_samples)
    y = slope * X + intercept + random_noise  # add some noise
    return X, y
# Generate random-but-linear samples and collect them in a two-column frame
X, y = generate_data(num_samples=args.num_samples)
data = np.vstack((X, y)).transpose()
df = pd.DataFrame(data=data, columns=["X", "y"])
print(df.head(n=5))
# Split the data into training and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    df[["X"]].values,
    df["y"],
    test_size=args.test_size,
    random_state=args.seed,
)
# Report the shape of every partition
for split_name, split_values in (
    ("X_train:", X_train),
    ("y_train:", y_train),
    ("X_test:", X_test),
    ("y_test:", y_test),
):
    print(split_name, split_values.shape)
# Fit the scalers on the training partition only
X_scaler = StandardScaler()
X_scaler.fit(X_train)
y_scaler = StandardScaler()
y_scaler.fit(y_train.values.reshape(-1, 1))

# Apply the training-set statistics to both the training and test partitions
standardized_X_train = X_scaler.transform(X_train)
standardized_X_test = X_scaler.transform(X_test)
standardized_y_train = y_scaler.transform(
    y_train.values.reshape(-1, 1)).ravel()
standardized_y_test = y_scaler.transform(
    y_test.values.reshape(-1, 1)).ravel()

# Sanity check on the training partition: mean should be ~0, std should be 1
print("mean:",
      standardized_X_train.mean(axis=0),
      standardized_y_train.mean(axis=0))
print("std:",
      standardized_X_train.std(axis=0),
      standardized_y_train.std(axis=0))
# Linear model fitted by stochastic gradient descent, squared loss, no penalty.
# NOTE(review): recent scikit-learn releases spell this loss "squared_error";
# confirm the option names against the installed version before upgrading.
lm = SGDRegressor(
    loss="squared_loss",
    penalty="none",
    max_iter=args.num_epochs,
)
# Show the model configuration
print(lm)
lm.fit(standardized_X_train, standardized_y_train)
# Map predictions back to the original target scale (undo the y standardisation)
pred_train = y_scaler.mean_ + np.sqrt(y_scaler.var_) * lm.predict(standardized_X_train)
pred_test = y_scaler.mean_ + np.sqrt(y_scaler.var_) * lm.predict(standardized_X_test)
# Set the figure size
plt.figure(figsize=(16, 9))
# Draw one panel per partition: the training data first, then the test data
for panel_index, (panel_title, xs, ys, preds, scatter_label) in enumerate(
    (
        ("Train", X_train, y_train, pred_train, "y_train"),
        ("Test", X_test, y_test, pred_test, "y_test"),
    ),
    start=1,
):
    plt.subplot(1, 2, panel_index)
    plt.title(panel_title)
    plt.scatter(xs, ys, label=scatter_label)
    plt.plot(xs, preds, color="red", linewidth=1, linestyle="-", label="lm")
    plt.legend(loc='lower right')
plt.show()
X y
0 0.0 -2.339222
1 1.0 18.534351
2 2.0 14.809110
3 3.0 32.364343
4 4.0 35.799032
X_train: (75, 1)
y_train: (75,)
X_test: (25, 1)
y_test: (25,)
mean: [8.22952817e-17] 3.1798637796972193e-16
std: [1.] 1.0000000000000002
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
eta0=0.01, fit_intercept=True, l1_ratio=0.15,
learning_rate='invscaling', loss='squared_loss', max_iter=100,
n_iter=None, n_iter_no_change=5, penalty='none', power_t=0.25,
random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
verbose=0, warm_start=False)
- image out: