"""Train and persist a random-forest classifier for labelled network traffic.

Pipeline: read ``training4tuplabeled.csv``, label-encode the two IP address
columns, split into train/validation partitions, oversample the training
partition with SMOTE, fit a ``RandomForestClassifier``, print classification
reports, and pickle the fitted model to ``network_traffic_classifier.sav``.
"""
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
class MultiColumnLabelEncoder:
    """Apply ``LabelEncoder`` independently to several DataFrame columns.

    NOTE(review): ``fit`` is a no-op and ``transform`` fits a brand-new
    ``LabelEncoder`` on every call, so the value -> integer mapping is not
    stored anywhere. Data encoded in a later call (e.g. at prediction time)
    may receive different codes for the same values.
    """

    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for a fit/transform-style API)."""
        return self  # not relevant here

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms columns of ``X`` specified in ``self.columns`` using
        ``LabelEncoder()``. If no columns are specified, transforms all
        columns in ``X``. The input frame itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent (column name, Series) iterator.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)


df = pd.read_csv('training4tuplabeled.csv')
y = df['sublabel']
df = df.drop(columns=['sublabel'])
df = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(df)

# Split BEFORE oversampling: SMOTE synthesises points by interpolating between
# neighbours, so resampling the full data set first would leak information
# from validation rows into training and put synthetic rows into the
# validation set, inflating the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=0
)

# Balance the classes in the training partition only.
oversampler = SMOTE()
X_train, y_train = oversampler.fit_resample(X_train, y_train)

model = RandomForestClassifier(max_depth=None, n_estimators=30, n_jobs=-1)
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print('Accuracy: ', accuracy)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("\n *************** TRAINING ****************")
# cm_train = confusion_matrix(y_train, y_pred_train)
# print(cm_train)
print(classification_report(y_train, y_pred_train))

print("\n ************** VALIDATION ***************")
# cm_test = confusion_matrix(y_test, y_pred_test)
# print(cm_test)
print(classification_report(y_test, y_pred_test))

# Use a context manager so the file handle is flushed and closed
# deterministically once the model has been written.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(model, model_file)