Our dataset is the covtype dataset.
The dataset can be downloaded from the UCI Machine Learning Repository (Covertype data set).
Overall description:
- m x n = 581,012 x 55. Key features include Elevation, Aspect, Slope, etc. The last column is the cover_type class label, ranging from 1 to 7.
#1. Import the input_path fot the dataset as csv file and necessary library
import os
import time
#Please fill in the input folder path and the output folder path
input_path = '/Users/phupham/Desktop/UniSyd/Semester3/MachineLearning/Assignment2/covtype.csv'
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
#2. Read the data: attr_value holds the features (X) and cat the class labels (Y)
# The CSV has no header row, so pandas labels the columns by position.
df = pd.read_csv(input_path, header=None)
# Of the 55 columns, the first 54 (positions 0-53) are features. The slice end
# is exclusive, so it has to be 54; the former 0:53 dropped the last feature.
attr_value = df.iloc[:, 0:54].values
# The last column (label 54) is the cover_type class.
cat = df[54].values
(581012, 55)
#3. Hold out 20% of the rows for testing; the other 80% is the training set
Trainset, Testset, Trainset_cat, Testset_cat = train_test_split(
    attr_value,
    cat,
    test_size=0.2,
    random_state=0,
)
# Halve the training set to obtain a smaller train/test pair that is only used
# to try out the preprocessing options
X_train, X_test, y_train, y_test = train_test_split(
    Trainset,
    Trainset_cat,
    test_size=0.5,
    random_state=0,
)
#3A. Preprocessing with PCA. Below is for testing of the preprocessing
# Reduce the features to 5 principal components. The projection is fitted on
# the training half only and then re-used, unchanged, on the test half.
pca = PCA(n_components=5)
newtraining = pca.fit_transform(X_train)
newtest = pca.transform(X_test)
# The timer covers both fitting the 1-nearest-neighbour model and predicting
start_time = time.time()
neigh = KNeighborsClassifier(n_neighbors=1, algorithm='kd_tree').fit(newtraining, y_train)
pred = neigh.predict(newtest)
print("Processing time is %s seconds" % (time.time() - start_time))
a = accuracy_score(y_test, pred)
print("Accuracy is %0.5f "%a)
Processing time is 3.619994878768921 seconds
Accuracy is 0.94701
#3B. Preprocessing with Variance Feature Selection
# Drop every feature whose variance on the training half is below 0.1
selector = VarianceThreshold(threshold=0.1)
newtraining = selector.fit_transform(X_train)
# Only transform the test half: re-fitting the selector on it could keep a
# different set of columns than the one the classifier is trained on.
newtest = selector.transform(X_test)
# The timer covers both fitting the 1-nearest-neighbour model and predicting
start_time = time.time()
neigh = KNeighborsClassifier(n_neighbors=1, algorithm='kd_tree').fit(newtraining, y_train)
pred = neigh.predict(newtest)
print("Processing time is %s seconds" % (time.time() - start_time))
a = accuracy_score(y_test, pred)
print("Accuracy is %0.5f "%a)
Processing time is 6.361757040023804 seconds
Accuracy is 0.95658
#4. 10fold-Cross-validation and apply KNN
# Fit the variance filter on the full training set ...
selector = VarianceThreshold(threshold=0.1)
newtraining = selector.fit_transform(Trainset)
# ... and apply the same column selection to the held-out test set. Fitting
# again on Testset could select different columns than the training data has.
newtest = selector.transform(Testset)
#This may take a few minutes to run
# Test the appropriate number of k
# cv_acc collects the mean cross-validated accuracy for each candidate k,
# in the same order as k_group.
cv_acc = []
k_group = [1, 3, 5, 13, 15, 21]
for k in k_group:
    neigh = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')
    start_time = time.time()
    # 3-fold CV keeps this parameter sweep affordable
    score = cross_val_score(neigh, newtraining, Trainset_cat, cv=3, scoring='accuracy')
    elapsed = time.time() - start_time
    cv_acc.append(score.mean())
    print("k = %d: mean accuracy is %0.5f (%s seconds)" % (k, score.mean(), elapsed))
# 10-fold stratified cross-validation of a 3-nearest-neighbour classifier.
# acc and process record the accuracy and the fit+predict time of every fold.
skf = StratifiedKFold(n_splits=10)
acc = []
process = []
for n, (train_index, test_index) in enumerate(skf.split(newtraining, Trainset_cat)):
    X_train, X_test = newtraining[train_index], newtraining[test_index]
    y_train, y_test = Trainset_cat[train_index], Trainset_cat[test_index]
    start_time = time.time()
    neigh = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree').fit(X_train, y_train)
    pred = neigh.predict(X_test)
    t = time.time() - start_time
    print("Processing time is %s seconds" % t)
    a = accuracy_score(y_test, pred)
    print("Accuracy for %d fold is %0.5f "%(n, a))
    acc.append(a)
    process.append(t)
Processing time is 3.100414991378784 seconds
Accuracy for 0 fold is 0.96799
Processing time is 2.786444902420044 seconds
Accuracy for 1 fold is 0.96648
Processing time is 2.910871744155884 seconds
Accuracy for 2 fold is 0.96743
Processing time is 2.9090850353240967 seconds
Accuracy for 3 fold is 0.96812
Processing time is 3.82912278175354 seconds
Accuracy for 4 fold is 0.96685
Processing time is 3.389090061187744 seconds
Accuracy for 5 fold is 0.96781
Processing time is 3.58927583694458 seconds
Accuracy for 6 fold is 0.96818
Processing time is 3.0815889835357666 seconds
Accuracy for 7 fold is 0.96725
Processing time is 3.61145281791687 seconds
Accuracy for 8 fold is 0.96603
Processing time is 3.541551113128662 seconds
Accuracy for 9 fold is 0.96684
# Plot accuracy and processing time per cross-validation fold on one figure.
# acc and process are expected to hold one value per fold; they are not
# assigned in this cell.
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
line1 = ax1.plot(acc, 'r', label="accuracy_rate", linewidth=2)
ax1.set_ylabel("Accuracy rate")
ax1.set_xlabel("Cross validation time nth")
# twinx() shares the x axis and puts the second y axis on the right, so the
# two scales no longer overlap on the left-hand side.
ax2 = ax1.twinx()
# The '--' line style is already part of the 'c--' format string.
line2 = ax2.plot(process, 'c--', label="processing_time", linewidth=2)
ax2.set_ylabel("Processing time")
# One legend covering the lines of both axes
lines = line1 + line2
ax1.legend(lines, [line.get_label() for line in lines])
<matplotlib.legend.Legend at 0x10f2e2898>
#5. Apply the model to the Testset
# Train 3-NN on the whole (feature-selected) training set and score it on the
# held-out test set.
neigh = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree').fit(newtraining, Trainset_cat)
pred = neigh.predict(newtest)
a = accuracy_score(Testset_cat, pred)
# Per-class precision / recall / f1-score / support
report = metrics.classification_report(Testset_cat, pred)
print(report)
precision recall f1-score support
1 0.97 0.97 0.97 42212
2 0.97 0.98 0.97 56849
3 0.96 0.97 0.96 7094
4 0.91 0.79 0.85 569
5 0.91 0.89 0.90 1886
6 0.94 0.94 0.94 3502
7 0.97 0.97 0.97 4091
avg / total 0.97 0.97 0.97 116203
#Table of the confusion matrix
# Wrap labels and predictions as Series so they can be cross-tabulated;
# margins=True adds the "All" row and column with the totals.
y_true = pd.Series(Testset_cat)
y_pred = pd.Series(pred)
pd.crosstab(
    index=y_true,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)
Predicted | 1 | 2 | 3 | 4 | 5 | 6 | 7 | All |
True | ||||||||
1 | 40931 | 1171 | 0 | 0 | 12 | 3 | 95 | 42212 |
2 | 1129 | 55428 | 77 | 0 | 138 | 60 | 17 | 56849 |
3 | 1 | 108 | 6848 | 29 | 10 | 98 | 0 | 7094 |
4 | 0 | 1 | 87 | 452 | 0 | 29 | 0 | 569 |
5 | 24 | 175 | 10 | 0 | 1672 | 5 | 0 | 1886 |
6 | 5 | 73 | 109 | 18 | 5 | 3292 | 0 | 3502 |
7 | 99 | 19 | 0 | 0 | 0 | 0 | 3973 | 4091 |
All | 42189 | 56975 | 7131 | 499 | 1837 | 3487 | 4085 | 116203 |
# Class names '1'..'7', used as tick labels, and the confusion matrix of the
# test-set predictions
target_name = [str(label) for label in range(1, 8)]
confusion = metrics.confusion_matrix(Testset_cat, pred)
def plot_cmatrix(cm, classes, title='Confusion matrix for Covtype',
                 cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    cm      -- 2-D array of counts; rows are drawn as true labels and columns
               as predicted labels.
    classes -- tick labels, one per row/column of cm.
    title   -- title shown above the plot.
    cmap    -- matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Cells above half of the largest count get white text so the number
    # stays readable on the darker background.
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Open a 10x5 figure and render the KNN confusion matrix into it
plt.figure(figsize=(10, 5))
plot_cmatrix(
    confusion,
    classes=target_name,
    title='Confusion matrix KNN',
)
#This may take a minute to run
# One-vs-rest wrapper around the KNN model: gives one probability column per class
classifier = OneVsRestClassifier(neigh)
predicted_prob = classifier.fit(newtraining, Trainset_cat).predict_proba(newtest)
# Binarize the true labels into one indicator column per class (1..7)
accurate_y = label_binarize(Testset_cat, classes=[1, 2, 3, 4, 5, 6, 7])
n_classes = accurate_y.shape[1]
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(accurate_y[:, i], predicted_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Plot one ROC curve per class plus the chance diagonal
lw = 2  # line width shared by all curves
for i in range(n_classes):
    # Column i corresponds to class label i + 1
    plt.plot(fpr[i], tpr[i], lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i + 1, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN')
plt.legend(loc="lower right")