# Hide warnings
# Installs a blanket 'ignore' filter, so every warning raised after this point
# is suppressed. NOTE(review): this also hides warnings that may point at real
# problems (e.g. deprecations or convergence issues) -- confirm this is wanted.
import warnings
warnings.simplefilter('ignore')
The following code cells import the necessary libraries and load the dataset from the UCI repository as a pandas DataFrame.
#import and change module name
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Remote location of the promoter gene sequences data file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
# Column labels are supplied explicitly; with `names` given and no header row
# requested, the first line of the file is read as data, not as a header.
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, header=None, names=names)
# Quick inspection of the loaded frame: column labels, first rows, size, dtypes
data.columns
data.head()
data.shape
data.dtypes
The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms.
# Build our dataset using custom pandas dataframe
# Reshapes `data` into one row per sample with one column per nucleotide
# position plus a trailing class column, then one-hot encodes everything.
clases = data.loc[:, 'Class']
clases.head()
# generate list of DNA sequence
sequence = list(data.loc[:, 'Sequence'])
sequence
# Remove tab from each sequence
# Each sequence string is split into single characters (tabs dropped) and the
# class label is appended as the final element. zip() pairs sequences and
# labels by position, so the pairing does not depend on the index labels of
# `data`.
dic = {}
for i, (seq, label) in enumerate(zip(sequence, clases)):
    nucleotides = [char for char in seq if char != '\t']
    # append class assignment
    nucleotides.append(label)
    dic[i] = nucleotides
dic[0]
# Convert Dict object into dataframe (one column per sample at this point)
df = pd.DataFrame(dic)
df.head()
# transpose dataframe into correct format (one row per sample)
df = df.transpose()
df.head()
df.columns
# Rename
# The class label was appended last, so it sits in the final column whatever
# the sequence length is; no hard-coded position is needed.
df.rename(columns={df.columns[-1]: 'Class'}, inplace=True)
df.columns
df.head()
# Encoding
# One-hot encode every column. The dtype is pinned because the default output
# dtype of get_dummies differs between pandas versions; 'uint8' keeps the
# encoded values numeric 0/1 everywhere.
numerical_df = pd.get_dummies(df, dtype='uint8')
numerical_df.head()
# Drop class_- or Class_+ either of one
# The two class indicator columns are complementary, so one is redundant.
numerical_df.drop('Class_-', axis=1, inplace=True)
numerical_df.head()
# rename Class_+ to Class
numerical_df.rename(columns={'Class_+': 'Class'}, inplace=True)
Now that we have preprocessed the data, we can split it into training and testing datasets and start to deploy different classification algorithms. It's relatively easy to test multiple models; as a result, we will compare and contrast the performance of ten different algorithms.
#Importing different classifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
# Target vector: the single 'Class' indicator column
y = numerical_df['Class'].values
# Feature matrix: every remaining one-hot column
X = numerical_df.drop(columns=['Class']).values
# define a seed for reproducibility
seed = 1
# Hold out a quarter of the samples for testing; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed,
)
# Define scoring method
scoring = 'accuracy'
# Model building to train
# Display names, listed in the same order as the estimators in `Classifiers`.
names = [
    'K Nearest Neighbors',
    'Gaussian Process',
    'Decision Tree',
    'Random Forest',
    'Neural Net',
    'AddaBoost',
    'Naive Bayes',
    'SVM Linear',
    'SVM RBF',
    'SVM Sigmoid',
]
Classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    svm.SVC(kernel='linear'),
    svm.SVC(kernel='rbf'),
    svm.SVC(kernel='sigmoid'),
]
# Materialised as a list: a bare zip() is a one-shot iterator that would be
# empty after the first pass over it.
models = list(zip(names, Classifiers))
# import KFold
from sklearn.model_selection import KFold, cross_val_score
# The splitter is the same for every model, so it is built once. Recent
# scikit-learn releases reject random_state unless shuffle=True, so shuffling
# is enabled and driven by the shared seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Collect the per-fold scores and the names of the evaluated models, in order.
names = []
result = []
for name, model in models:
    # 10-fold cross-validation on the training split only
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    result.append(cv_results)
    names.append(name)
    # Report mean and standard deviation of the fold scores
    msg = "{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)
Now we will evaluate our classification algorithms on the test set using the accuracy score and the classification report.
# Test the algorithm on the test data set
# Each classifier is refit on the full training split and scored on the
# held-out test split.
models = zip(names, Classifiers)
for name, model in models:
    # fit() hands back the fitted estimator, so predict() can be chained on it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # One print call, newline-separated: model name, accuracy, per-class report
    print(
        name,
        accuracy_score(y_test, y_pred),
        classification_report(y_test, y_pred),
        sep='\n',
    )