KNN
Our dataset is the Forest Covertype (covtype) dataset.
Download link for the dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
Overall description:
- m × n = 581,012 × 55. Key features include Elevation, Aspect, Slope, etc.; the last column is the Cover_Type label, an integer from 1 to 7.
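If covtype.csv is not already on disk, it can be produced from the UCI link above; covtype.data is already comma-separated, so decompressing the archive yields a usable CSV. The snippet below is a minimal sketch (the output filename covtype.csv is an assumption taken from the input_path used later):
# Sketch: download covtype.data.gz from the UCI repository and unpack it to covtype.csv
import urllib.request, gzip, shutil
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
urllib.request.urlretrieve(url, 'covtype.data.gz')
with gzip.open('covtype.data.gz', 'rb') as f_in, open('covtype.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)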
#1. Set the input_path for the dataset CSV file and import the necessary libraries
import os
import time
#Please fill in the input folder path and the output folder path
input_path = '/Users/phupham/Desktop/UniSyd/Semester3/MachineLearning/Assignment2/covtype.csv'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
#2. Read the values into attr_value (X) and cat (y) and take a preliminary look at the data
df = pd.read_csv(input_path,header = None )
df.shape
attr_value = df.iloc[:, 0:54].values   # the 54 feature columns
cat = df[54].values                    # Cover_Type label in the last column
(581012, 55)
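The heading above mentions a preliminary look at the data; a minimal sketch of a class-distribution check (added here, not part of the original run) could look like this:
# Sketch: distribution of the 7 cover types in the label column
class_counts = pd.Series(cat).value_counts().sort_index()
print(class_counts)
class_counts.plot(kind='bar', title='Cover_Type class distribution')
plt.show()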
#3. Split the data into 80% training and 20% testing, then preprocess
Trainset,Testset,Trainset_cat, Testset_cat = train_test_split(attr_value, cat,test_size=0.2, random_state=0)
# Smaller subset used for quick tests of the preprocessing options
X_train, X_test, y_train, y_test = train_test_split(Trainset, Trainset_cat, test_size=0.5, random_state=0)
#3A. Preprocessing with PCA. The block below tests this preprocessing option
'''
#Apply PCA to decrease the dimension of the dataset
pca = PCA(n_components=5)
newtraining = pca.fit_transform(X_train)
newtest = pca.transform(X_test)
start_time = time.time()
neigh = KNeighborsClassifier(n_neighbors =1,algorithm='kd_tree')
neigh.fit(newtraining,y_train)
pred = neigh.predict(newtest)
print("Processing time is %s seconds" % (time.time() - start_time))
a = accuracy_score(y_test, pred)
print("Accuracy is %0.5f "%a)
'''
Processing time is 3.619994878768921 seconds
Accuracy is 0.94701
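The run above uses five principal components without further justification; a quick check of the cumulative explained variance (a sketch added here, not part of the original run) can help choose n_components:
# Sketch: cumulative explained variance of a PCA fitted on the small training subset
pca = PCA(n_components=5)
pca.fit(X_train)
print(pca.explained_variance_ratio_.cumsum())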
#3B. Preprocessing with Variance Feature Selection
'''
#Apply Variance Feature Selection
selector = VarianceThreshold(threshold = 0.1)
newtraining = selector.fit_transform(X_train)
newtest = selector.transform(X_test)   # reuse the selector fitted on the training data
start_time = time.time()
neigh = KNeighborsClassifier(n_neighbors =1,algorithm='kd_tree')
neigh.fit(newtraining,y_train)
pred = neigh.predict(newtest)
print("Processing time is %s seconds" % (time.time() - start_time))
a = accuracy_score(y_test, pred)
print("Accuracy is %0.5f "%a)
'''
Processing time is 6.361757040023804 seconds
Accuracy is 0.95658
Variance feature selection is applied from here on, consistent with the other methods.
#4. 10-fold cross-validation and KNN
selector = VarianceThreshold(threshold = 0.1)
newtraining = selector.fit_transform(Trainset)
newtest = selector.transform(Testset)   # reuse the selector fitted on the training set
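As a sanity check on the selector (a sketch added here, not from the original run), get_support() shows which of the feature columns pass the 0.1 variance threshold:
# Sketch: indices of the features kept by the VarianceThreshold selector
kept = np.where(selector.get_support())[0]
print("Kept %d of %d features:" % (len(kept), Trainset.shape[1]), kept)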
'''
#This may take a few minutes to run
# Test the appropriate number of k
cv_acc =[]
k_group = [1,3,5,13,15,21]
for k in k_group:
    neigh = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')
    score = cross_val_score(neigh, newtraining, Trainset_cat, cv=3, scoring='accuracy')
    cv_acc.append(score.mean())
print(cv_acc)
'''
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(Trainset,Trainset_cat)
print(skf)
n=0
acc=[]
process=[]
for train_index, test_index in skf.split(newtraining, Trainset_cat):
    X_train, X_test = newtraining[train_index], newtraining[test_index]
    y_train, y_test = Trainset_cat[train_index], Trainset_cat[test_index]
    start_time = time.time()
    neigh = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
    neigh.fit(X_train, y_train)
    pred = neigh.predict(X_test)
    t = time.time() - start_time
    print("Processing time is %s seconds" % t)
    a = accuracy_score(y_test, pred)
    print("Accuracy for %d fold is %0.5f " % (n, a))
    n = n + 1
    acc.append(a)
    process.append(t)
Processing time is 3.100414991378784 seconds
Accuracy for 0 fold is 0.96799
Processing time is 2.786444902420044 seconds
Accuracy for 1 fold is 0.96648
Processing time is 2.910871744155884 seconds
Accuracy for 2 fold is 0.96743
Processing time is 2.9090850353240967 seconds
Accuracy for 3 fold is 0.96812
Processing time is 3.82912278175354 seconds
Accuracy for 4 fold is 0.96685
Processing time is 3.389090061187744 seconds
Accuracy for 5 fold is 0.96781
Processing time is 3.58927583694458 seconds
Accuracy for 6 fold is 0.96818
Processing time is 3.0815889835357666 seconds
Accuracy for 7 fold is 0.96725
Processing time is 3.61145281791687 seconds
Accuracy for 8 fold is 0.96603
Processing time is 3.541551113128662 seconds
Accuracy for 9 fold is 0.96684
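A short summary over the ten folds (a sketch added here, not part of the original output) can be computed from the acc and process lists filled in the loop above:
# Sketch: mean and standard deviation of accuracy and timing over the 10 folds
print("Mean CV accuracy: %0.5f (+/- %0.5f)" % (np.mean(acc), np.std(acc)))
print("Mean processing time per fold: %0.2f seconds" % np.mean(process))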
'''
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
line1 = ax1.plot(acc, 'r', label="accuracy_rate", linewidth=2)
ax1.set_ylabel("Accuracy rate")
ax2 = fig1.add_subplot(111, sharex=ax1, frameon=False)
line2 = ax2.plot(process, 'c--', label="processing_time", linewidth=2)
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position("right")
ax2.set_ylabel("Processing time (seconds)")
ax1.set_xlabel("Cross-validation fold")
ax1.legend(loc=0)
'''
[Figure: accuracy rate (left axis) and processing time (right axis) across the 10 cross-validation folds]
#5. Apply the model to the Testset
neigh = KNeighborsClassifier(n_neighbors =3,algorithm='kd_tree')
neigh.fit(newtraining, Trainset_cat)
pred = neigh.predict(newtest)
a = accuracy_score(Testset_cat, pred)
print(a)
0.968959493301
report = metrics.classification_report(Testset_cat,pred)
print(report)
precision recall f1-score support
1 0.97 0.97 0.97 42212
2 0.97 0.98 0.97 56849
3 0.96 0.97 0.96 7094
4 0.91 0.79 0.85 569
5 0.91 0.89 0.90 1886
6 0.94 0.94 0.94 3502
7 0.97 0.97 0.97 4091
avg / total 0.97 0.97 0.97 116203
#Table of the confusion matrix
y_true = pd.Series(Testset_cat)
y_pred = pd.Series(pred)
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
| True \ Predicted | 1 | 2 | 3 | 4 | 5 | 6 | 7 | All |
|---|---|---|---|---|---|---|---|---|
| 1 | 40931 | 1171 | 0 | 0 | 12 | 3 | 95 | 42212 |
| 2 | 1129 | 55428 | 77 | 0 | 138 | 60 | 17 | 56849 |
| 3 | 1 | 108 | 6848 | 29 | 10 | 98 | 0 | 7094 |
| 4 | 0 | 1 | 87 | 452 | 0 | 29 | 0 | 569 |
| 5 | 24 | 175 | 10 | 0 | 1672 | 5 | 0 | 1886 |
| 6 | 5 | 73 | 109 | 18 | 5 | 3292 | 0 | 3502 |
| 7 | 99 | 19 | 0 | 0 | 0 | 0 | 3973 | 4091 |
| All | 42189 | 56975 | 7131 | 499 | 1837 | 3487 | 4085 | 116203 |
'''
confusion = metrics.confusion_matrix(Testset_cat, pred)
target_name = ['1', '2', '3', '4', '5', '6', '7']
def plot_cmatrix(cm, classes, title='Confusion matrix for Covtype', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
np.set_printoptions(precision=2)
plt.figure(figsize=(10, 5))
plot_cmatrix(confusion, classes=target_name, title='Confusion matrix KNN')
plt.show()
'''
#This may take a minute to run
classifier = OneVsRestClassifier(neigh)
predicted_prob = classifier.fit(newtraining, Trainset_cat).predict_proba(newtest)
accurate_y = label_binarize(Testset_cat, classes=[1,2,3,4,5,6,7])
n_classes = accurate_y.shape[1]
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(accurate_y[:, i], predicted_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
lw = 1
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i+1, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN')
plt.legend(loc="lower right")
plt.show()
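The per-class curves can be summarized with a micro-averaged ROC (a sketch added here, not part of the original analysis), which pools all class/instance decisions before computing a single curve:
# Sketch: micro-averaged ROC AUC over all 7 classes
fpr_micro, tpr_micro, _ = roc_curve(accurate_y.ravel(), predicted_prob.ravel())
print("Micro-averaged ROC AUC: %0.3f" % auc(fpr_micro, tpr_micro))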