NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy. Blank context lines are inferred from the hunk
line counts; confirm them against the target file before applying.

diff --git a/competition/random_forest.py b/competition/random_forest.py
index 923c4ce..0540f0a 100644
--- a/competition/random_forest.py
+++ b/competition/random_forest.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sn
+import pickle
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import classification_report, confusion_matrix
 from sklearn.model_selection import train_test_split
@@ -21,9 +20,9 @@ df.drop(['flowStartMilliseconds'], 1, inplace=True)
 X = np.array(df.drop(columns=['sublabel']))
 y = np.array(df['sublabel'])
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
-clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='gini', random_state=0, class_weight="balanced")
+clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='gini', random_state=0)
 clf.fit(X_train, y_train)
 
 accuracy = clf.score(X_test, y_test)
@@ -32,19 +31,13 @@ y_pred_train = clf.predict(X_train)
 y_pred_test = clf.predict(X_test)
 print("\n *************** TRAINING ****************")
 cm_train = confusion_matrix(y_train, y_pred_train)
-plt.figure(figsize=(10, 7))
-sn.heatmap(cm_train, annot=True)
-plt.xlabel('Truth')
-plt.ylabel('Predicted')
-plt.show()
+print(cm_train)
 print(classification_report(y_train, y_pred_train))
 print("\n ************** VALIDATION ***************")
 cm_test = confusion_matrix(y_test, y_pred_test)
-plt.figure(figsize=(10, 7))
-sn.heatmap(cm_test, annot=True)
-plt.xlabel('Truth')
-plt.ylabel('Predicted')
-plt.show()
+print(cm_test)
 print(classification_report(y_test, y_pred_test))
 
-example_measure = np.array([ip_to_bin('2.1.1.1'), ip_to_bin('2.1.1.2'), 0, 0, 1])
+# Persist the fitted classifier; the context manager closes the file even if pickling fails.
+with open('network_traffic_classifier.sav', 'wb') as model_file:
+    pickle.dump(clf, model_file)