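"""Train and save a random-forest classifier for labelled network traffic.

Reads 'training4tuplabeled.csv', label-encodes the source/destination IP
address columns, balances the classes with SMOTE, fits a
RandomForestClassifier, prints classification reports and pickles the model
to 'network_traffic_classifier.sav'.
"""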
import numpy as np
|
|
import pandas as pd
|
|
import pickle
|
|
from imblearn.over_sampling import SMOTE
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.metrics import classification_report, confusion_matrix
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
|
|
# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
|
|
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder``. The encoders are fitted inside ``transform`` on
    the data passed to it and are not kept afterwards, so the value-to-code
    mapping is NOT remembered between calls: transforming two different
    frames may assign different codes to the same value.
    """

    def __init__(self, columns=None):
        """Store the column selection.

        Args:
            columns: iterable of column names to encode, or ``None`` to
                encode every column of the frame given to ``transform``.
        """
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present only so the class offers the usual fit/transform interface;
        no state is learned here.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Args:
            X: DataFrame to encode. It is copied, never modified in place.

        Returns:
            A new DataFrame in which each column named in ``self.columns``
            (or every column when ``self.columns`` is ``None``) holds the
            output of ``LabelEncoder().fit_transform`` for that column.
        """
        output = X.copy()
        # Snapshot the names first so the loop never iterates over a
        # structure that is being assigned to. This also replaces the old
        # DataFrame.iteritems() call, which no longer exists in pandas 2.x.
        if self.columns is None:
            selected = list(output.columns)
        else:
            selected = self.columns
        for col in selected:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)
|
|
|
|
|
|
# Load the labelled data: 'sublabel' is the target, every other column is a
# feature.
df = pd.read_csv('training4tuplabeled.csv')
y = df['sublabel']
df = df.drop(columns=['sublabel'])
# Replace the two IP address columns by integer codes so the model can use
# them.
df = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(df)

# Split BEFORE oversampling. Resampling the whole dataset first would place
# synthetic points derived from held-out rows into the training set, which
# leaks test information and inflates the validation scores.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, stratify=y, random_state=0)

# Balance the classes on the training portion only; the test portion keeps
# the original class distribution.
oversampler = SMOTE()
X_train, y_train = oversampler.fit_resample(X_train, y_train)

model = RandomForestClassifier(max_depth=None, n_estimators=30, n_jobs=-1)
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print('Accuracy: ', accuracy)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("\n *************** TRAINING ****************")
print(classification_report(y_train, y_pred_train))

print("\n ************** VALIDATION ***************")
print(classification_report(y_test, y_pred_test))

# Use a context manager so the file is flushed and closed even if pickling
# raises.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
|