Project1
UCI数据集: http://archive.ics.uci.edu/ml/index.php
任选一个数据集,任选一种ML算法:逻辑回归、决策树、神经网络、SVM等
源码+实验报告 交给助教
Deadline: 学期末考试前
实验准备¶
In [11]:
Copied!
import pandas as pd
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split ,StratifiedKFold,StratifiedShuffleSplit
import pydotplus
from io import StringIO
import pandas as pd
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split ,StratifiedKFold,StratifiedShuffleSplit
import pydotplus
from io import StringIO
In [2]:
Copied!
Data = pd.read_csv("data/wine.data")
print(Data)
Data = pd.read_csv("data/wine.data")
print(Data)
1 14.23 1.71 2.43 15.6 127 2.8 3.06 .28 2.29 5.64 1.04 \
0 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05
1 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03
2 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86
3 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04
4 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05
.. .. ... ... ... ... ... ... ... ... ... ... ...
172 3 13.71 5.65 2.45 20.5 95 1.68 0.61 0.52 1.06 7.70 0.64
173 3 13.40 3.91 2.48 23.0 102 1.80 0.75 0.43 1.41 7.30 0.70
174 3 13.27 4.28 2.26 20.0 120 1.59 0.69 0.43 1.35 10.20 0.59
175 3 13.17 2.59 2.37 20.0 120 1.65 0.68 0.53 1.46 9.30 0.60
176 3 14.13 4.10 2.74 24.5 96 2.05 0.76 0.56 1.35 9.20 0.61
3.92 1065
0 3.40 1050
1 3.17 1185
2 3.45 1480
3 2.93 735
4 2.85 1450
.. ... ...
172 1.74 740
173 1.56 750
174 1.56 835
175 1.62 840
176 1.60 560
[177 rows x 14 columns]
In [3]:
Copied!
Data = pd.read_csv("data/wine.data")
Data.columns = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "Diluted wines", "Proline"]
print(Data.info())
print(Data.head())
n = Data.shape[0]
p = Data.shape[1] - 1
print(f"数据集样本数量:{n},特征数量:{p}")
Data = pd.read_csv("data/wine.data")
Data.columns = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "Diluted wines", "Proline"]
print(Data.info())
print(Data.head())
n = Data.shape[0]
p = Data.shape[1] - 1
print(f"数据集样本数量:{n},特征数量:{p}")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177 entries, 0 to 176 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 class 177 non-null int64 1 Alcohol 177 non-null float64 2 Malic acid 177 non-null float64 3 Ash 177 non-null float64 4 Alcalinity of ash 177 non-null float64 5 Magnesium 177 non-null int64 6 Total phenols 177 non-null float64 7 Flavanoids 177 non-null float64 8 Nonflavanoid phenols 177 non-null float64 9 Proanthocyanins 177 non-null float64 10 Color intensity 177 non-null float64 11 Hue 177 non-null float64 12 Diluted wines 177 non-null float64 13 Proline 177 non-null int64 dtypes: float64(11), int64(3) memory usage: 19.5 KB None class Alcohol Malic acid Ash Alcalinity of ash Magnesium \ 0 1 13.20 1.78 2.14 11.2 100 1 1 13.16 2.36 2.67 18.6 101 2 1 14.37 1.95 2.50 16.8 113 3 1 13.24 2.59 2.87 21.0 118 4 1 14.20 1.76 2.45 15.2 112 Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins \ 0 2.65 2.76 0.26 1.28 1 2.80 3.24 0.30 2.81 2 3.85 3.49 0.24 2.18 3 2.80 2.69 0.39 1.82 4 3.27 3.39 0.34 1.97 Color intensity Hue Diluted wines Proline 0 4.38 1.05 3.40 1050 1 5.68 1.03 3.17 1185 2 7.80 0.86 3.45 1480 3 4.32 1.04 2.93 735 4 6.75 1.05 2.85 1450 数据集样本数量:177,特征数量:13
In [4]:
Copied!
count = dict(Data['class'].value_counts())
print(count)
count = dict(Data['class'].value_counts())
print(count)
{2: 71, 1: 58, 3: 48}
特征的初步观察&降维¶
特征之间的相关系数观察
In [5]:
Copied!
f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(Data.corr(), annot=True, linewidths=.5, fmt= '.2f',ax=ax)
f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(Data.corr(), annot=True, linewidths=.5, fmt= '.2f',ax=ax)
Out[5]:
<Axes: >
PCA降维
In [6]:
Copied!
X = Data.iloc[:,1:].values
X = StandardScaler().fit_transform(X)
pca = PCA(n_components=0.95) #保证降维后保持95%的信息
pca.fit(X)
X = pca.transform(X)
Y = Data['class']
print(X.shape)
X = Data.iloc[:,1:].values
X = StandardScaler().fit_transform(X)
pca = PCA(n_components=0.95) #保证降维后保持95%的信息
pca.fit(X)
X = pca.transform(X)
Y = Data['class']
print(X.shape)
(177, 10)
数据集划分¶
刚刚PCA降维之后从原本的13个特征变成了现在的10个特征,现在用留出法来划分数据集为训练集和验证集。
In [7]:
Copied!
seed = 666666
random.seed(seed)
data = np.column_stack((Data['class'], X))
Data = pd.DataFrame(data = data[0:,0:], columns=['Y','X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])
# 80%用于训练
n_train = int(n * 0.8)
n_test = n - n_train
index = range(0,n)
index_selected = random.sample(index,n_train)
index_selected.sort()
Data_train = Data.loc[index_selected]
Data_test = Data.drop(index = index_selected)
print(Data_train.head())
print(Data_test.head())
seed = 666666
random.seed(seed)
data = np.column_stack((Data['class'], X))
Data = pd.DataFrame(data = data[0:,0:], columns=['Y','X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])
# 80%用于训练
n_train = int(n * 0.8)
n_test = n - n_train
index = range(0,n)
index_selected = random.sample(index,n_train)
index_selected.sort()
Data_train = Data.loc[index_selected]
Data_test = Data.drop(index = index_selected)
print(Data_train.head())
print(Data_test.head())
Y X1 X2 X3 X4 X5 X6 X7 \
0 1.0 2.230243 0.302313 -2.032920 -0.281906 -0.259540 -0.927615 0.079724
1 1.0 2.531922 -1.062257 0.976724 0.735727 -0.198602 0.557252 0.432350
2 1.0 3.754677 -2.805309 -0.180370 0.577125 -0.257871 0.100109 -0.364924
5 1.0 2.458228 -1.207065 -0.988236 -0.004678 -1.030103 -0.613463 0.066139
6 1.0 2.061605 -1.640485 0.143768 -1.199707 0.010484 -1.445137 0.058393
X8 X9 X10
0 1.026470 -0.313145 0.131256
1 -0.335611 -1.179069 0.006752
2 0.646847 0.067906 0.373320
5 -0.375278 -0.534132 0.925159
6 0.228268 0.081620 0.793148
Y X1 X2 X3 X4 X5 X6 X7 \
3 1.0 1.020131 -0.888380 2.023870 -0.432792 0.275235 -0.403139 0.454721
4 1.0 3.049199 -2.170007 -0.638747 -0.487628 -0.631358 0.130817 0.421297
17 1.0 3.540130 -2.573414 -0.498949 -0.852835 -1.168261 0.306130 -0.006409
18 1.0 2.092741 -1.088861 -0.164086 0.428353 0.910059 -1.405046 -0.070578
20 1.0 1.108045 -0.252482 0.936185 1.008533 0.362160 -1.210140 0.323099
X8 X9 X10
3 0.412033 0.338059 -0.096320
4 0.398731 -0.113454 -0.020025
17 -0.019910 -0.356006 -0.668649
18 0.578653 0.065942 -0.232590
20 -0.101735 -0.579496 -0.220181
在数据集划分之后,要进行一些预处理,主要是对X进行标准化。
In [8]:
Copied!
X_train = Data_train.drop(columns = ['Y'],axis = 1)
Y_train = Data_train.Y
X_test = Data_test.drop(columns = ['Y'],axis=1)
Y_test = Data_test.Y
X_train_standardized = preprocessing.scale(X_train, with_mean = True, with_std=True) / np.sqrt(n_train)
X_test_standardized = preprocessing.scale(X_test, with_mean = True, with_std=True) / np.sqrt(n_test)
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')
X_train = Data_train.drop(columns = ['Y'],axis = 1)
Y_train = Data_train.Y
X_test = Data_test.drop(columns = ['Y'],axis=1)
Y_test = Data_test.Y
X_train_standardized = preprocessing.scale(X_train, with_mean = True, with_std=True) / np.sqrt(n_train)
X_test_standardized = preprocessing.scale(X_test, with_mean = True, with_std=True) / np.sqrt(n_test)
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')
使用各模型进行分类¶
SVM¶
In [24]:
Copied!
params = {
'kernel':['linear', 'rbf', 'poly', 'sigmoid'],
'C': [1e-3,0.01,0.1, 1, 10, 100,1000],
}
svm = GridSearchCV(SVC(),params)
svm = svm.fit(X_train_standardized, Y_train)
print('best params:', svm.best_params_)
y_pred = svm.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_svm = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_svm}')
print(classification_report(Y_test, y_pred))
params = {
'kernel':['linear', 'rbf', 'poly', 'sigmoid'],
'C': [1e-3,0.01,0.1, 1, 10, 100,1000],
}
svm = GridSearchCV(SVC(),params)
svm = svm.fit(X_train_standardized, Y_train)
print('best params:', svm.best_params_)
y_pred = svm.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_svm = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_svm}')
print(classification_report(Y_test, y_pred))
best params: {'C': 100, 'kernel': 'linear'}
预测正确的数目为33,准确率:0.9166666666666666
precision recall f1-score support
1 1.00 1.00 1.00 12
2 1.00 0.80 0.89 15
3 0.75 1.00 0.86 9
accuracy 0.92 36
macro avg 0.92 0.93 0.92 36
weighted avg 0.94 0.92 0.92 36
In [25]:
Copied!
#五折交叉验证
svm=SVC(C=100,kernel='linear')
scores = cross_val_score(svm, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
#五折交叉验证
svm=SVC(C=100,kernel='linear')
scores = cross_val_score(svm, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
五折交叉验证结果:[0.91666667 0.97222222 1. 1. 0.94285714]
决策树¶
In [32]:
Copied!
tree_model = tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_leaf=4, ccp_alpha=0.0)
tree_model.fit(X_train_standardized, Y_train)
y_pred = tree_model.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_tree = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_tree}')
print(classification_report(Y_test, y_pred))
tree_model = tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_leaf=4, ccp_alpha=0.0)
tree_model.fit(X_train_standardized, Y_train)
y_pred = tree_model.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_tree = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_tree}')
print(classification_report(Y_test, y_pred))
预测正确的数目为32,准确率:0.8888888888888888
precision recall f1-score support
1 0.85 0.92 0.88 12
2 0.92 0.80 0.86 15
3 0.90 1.00 0.95 9
accuracy 0.89 36
macro avg 0.89 0.91 0.89 36
weighted avg 0.89 0.89 0.89 36
In [33]:
Copied!
#五折交叉验证
scores = cross_val_score(tree_model, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
#五折交叉验证
scores = cross_val_score(tree_model, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
五折交叉验证结果:[0.97222222 0.91666667 0.91428571 0.94285714 0.91428571]
决策树可视化
In [34]:
Copied!
dot_data = StringIO()
feature_names = ['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10']
target_names = ['1', '2', '3']
tree.export_graphviz(tree_model, out_file=dot_data, feature_names=feature_names, class_names=target_names, filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("tree.pdf")
dot_data = StringIO()
feature_names = ['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10']
target_names = ['1', '2', '3']
tree.export_graphviz(tree_model, out_file=dot_data, feature_names=feature_names, class_names=target_names, filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("tree.pdf")
Out[34]:
True
多层感知机¶
In [39]:
Copied!
mlp = Pipeline([('scale', StandardScaler()), ('MLPRegressor', MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6, 2), max_iter=5000, random_state=1))])
mlp = mlp.fit(X_train_standardized, Y_train)
y_pred = mlp.predict(X_test_standardized)
y_pred = np.around(y_pred)
for i in range(len(y_pred)):
if y_pred[i] >= 3:
y_pred[i] = 3
num_true = np.sum(y_pred == Y_test)
acc_mlp = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_mlp}')
print(classification_report(Y_test, y_pred))
mlp = Pipeline([('scale', StandardScaler()), ('MLPRegressor', MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6, 2), max_iter=5000, random_state=1))])
mlp = mlp.fit(X_train_standardized, Y_train)
y_pred = mlp.predict(X_test_standardized)
y_pred = np.around(y_pred)
for i in range(len(y_pred)):
if y_pred[i] >= 3:
y_pred[i] = 3
num_true = np.sum(y_pred == Y_test)
acc_mlp = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_mlp}')
print(classification_report(Y_test, y_pred))
预测正确的数目为28,准确率:0.7777777777777778
precision recall f1-score support
1 0.92 0.92 0.92 12
2 0.89 0.53 0.67 15
3 0.60 1.00 0.75 9
accuracy 0.78 36
macro avg 0.80 0.82 0.78 36
weighted avg 0.83 0.78 0.77 36
逻辑回归¶
In [40]:
Copied!
lr = LogisticRegression().fit(X_train_standardized, Y_train)
y_pred = lr.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_lr = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_lr}')
print(classification_report(Y_test, y_pred))
lr = LogisticRegression().fit(X_train_standardized, Y_train)
y_pred = lr.predict(X_test_standardized)
num_true = np.sum(y_pred == Y_test)
acc_lr = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_lr}')
print(classification_report(Y_test, y_pred))
预测正确的数目为35,准确率:0.9722222222222222
precision recall f1-score support
1 0.92 1.00 0.96 12
2 1.00 0.93 0.97 15
3 1.00 1.00 1.00 9
accuracy 0.97 36
macro avg 0.97 0.98 0.98 36
weighted avg 0.97 0.97 0.97 36
In [41]:
Copied!
#五折交叉验证
scores = cross_val_score(lr, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
#五折交叉验证
scores = cross_val_score(lr, X, Y, cv=5, scoring='accuracy')
print(f'五折交叉验证结果:{scores}')
五折交叉验证结果:[0.97222222 0.97222222 1. 1. 1. ]
随机森林¶
In [42]:
Copied!
forest = RandomForestRegressor(n_estimators=100, min_samples_split=5, max_depth=10)
forest = forest.fit(X_train_standardized, Y_train)
y_pred = forest.predict(X_test_standardized)
y_pred = np.around(y_pred)
num_true = np.sum(y_pred == Y_test)
acc_forest = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_forest}')
print(classification_report(Y_test, y_pred))
forest = RandomForestRegressor(n_estimators=100, min_samples_split=5, max_depth=10)
forest = forest.fit(X_train_standardized, Y_train)
y_pred = forest.predict(X_test_standardized)
y_pred = np.around(y_pred)
num_true = np.sum(y_pred == Y_test)
acc_forest = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_forest}')
print(classification_report(Y_test, y_pred))
预测正确的数目为34,准确率:0.9444444444444444
precision recall f1-score support
1 1.00 0.92 0.96 12
2 0.93 0.93 0.93 15
3 0.90 1.00 0.95 9
accuracy 0.94 36
macro avg 0.94 0.95 0.95 36
weighted avg 0.95 0.94 0.94 36
随机森林的可视化
In [43]:
Copied!
estimators = forest.estimators_
for index, model in enumerate(estimators):
dot_data = tree.export_graphviz(model , out_file=None,
feature_names=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'],
class_names=['1', '2', '3'],
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("forest.pdf")
estimators = forest.estimators_
for index, model in enumerate(estimators):
dot_data = tree.export_graphviz(model , out_file=None,
feature_names=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'],
class_names=['1', '2', '3'],
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("forest.pdf")
adaboost¶
In [44]:
Copied!
adaboost = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
adaboost = adaboost.fit(X_train_standardized, Y_train)
y_pred = adaboost.predict(X_test_standardized)
y_pred = np.around(y_pred)
num_true = np.sum(y_pred == Y_test)
acc_adaboost = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_adaboost}')
print(classification_report(Y_test, y_pred))
adaboost = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
adaboost = adaboost.fit(X_train_standardized, Y_train)
y_pred = adaboost.predict(X_test_standardized)
y_pred = np.around(y_pred)
num_true = np.sum(y_pred == Y_test)
acc_adaboost = num_true / Y_test.shape[0]
print(f'预测正确的数目为{num_true},准确率:{acc_adaboost}')
print(classification_report(Y_test, y_pred))
预测正确的数目为33,准确率:0.9166666666666666
precision recall f1-score support
1 0.92 0.92 0.92 12
2 0.93 0.87 0.90 15
3 0.90 1.00 0.95 9
accuracy 0.92 36
macro avg 0.92 0.93 0.92 36
weighted avg 0.92 0.92 0.92 36
In [48]:
Copied!
x = ['SVM', 'tree', 'mlp','lr', 'forest', 'adaboost']
y = [acc_svm, acc_tree, acc_mlp,acc_lr,acc_forest,acc_adaboost]
plt.bar(x, y)
plt.xticks(x)
plt.xlabel("model")
plt.ylabel("accuracy")
plt.title("model comparison")
plt.show()
x = ['SVM', 'tree', 'mlp','lr', 'forest', 'adaboost']
y = [acc_svm, acc_tree, acc_mlp,acc_lr,acc_forest,acc_adaboost]
plt.bar(x, y)
plt.xticks(x)
plt.xlabel("model")
plt.ylabel("accuracy")
plt.title("model comparison")
plt.show()