From ddc9ce5c20af86cb696376c27fa5ad5864378f9e Mon Sep 17 00:00:00 2001
From: Tobias Eidelpes
Date: Sat, 5 Jun 2021 15:16:36 +0200
Subject: [PATCH] Add random forest classifier

---
Reviewer note: the file contents were revised during review; the blob id on
the index line below still refers to the originally submitted contents.

 competition/random_forest.py | 70 ++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 competition/random_forest.py

diff --git a/competition/random_forest.py b/competition/random_forest.py
new file mode 100644
index 0000000..923c4ce
--- /dev/null
+++ b/competition/random_forest.py
@@ -0,0 +1,70 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sn
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split
+
+
+def ip_to_bin(x):
+    """Convert a dotted-quad IPv4 string into its 32-bit integer value.
+
+    Raises ValueError unless x has exactly four '.'-separated integer
+    fields; the range of the individual octets is not checked.
+    """
+    a, b, c, d = (int(part) for part in x.split('.'))
+    return (a << 24) + (b << 16) + (c << 8) + d
+
+
+def plot_confusion(y_true, y_pred, labels):
+    """Show the confusion matrix of y_pred against y_true and return it."""
+    cm = confusion_matrix(y_true, y_pred, labels=labels)
+    plt.figure(figsize=(10, 7))
+    # fmt='d' prints the integer counts in full instead of the default '.2g'.
+    sn.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
+    # confusion_matrix puts the true classes on the rows (heatmap y axis)
+    # and the predicted classes on the columns (heatmap x axis).
+    plt.xlabel('Predicted')
+    plt.ylabel('Truth')
+    plt.show()
+    return cm
+
+
+# Load the labelled flows; both IP columns become integers while parsing.
+df = pd.read_csv('training4tuplabeled.csv',
+                 converters={
+                     'sourceIPAddress': ip_to_bin,
+                     'destinationIPAddress': ip_to_bin
+                 })
+# Use the columns= keyword: the positional axis argument of DataFrame.drop
+# was deprecated in pandas 1.3 and removed in pandas 2.0.
+df.drop(columns=['flowStartMilliseconds'], inplace=True)
+X = np.array(df.drop(columns=['sublabel']))
+y = np.array(df['sublabel'])
+# Every class in the data set, so both confusion matrices share one layout.
+class_labels = np.unique(y).tolist()
+
+# Fixed seed so the split, and with it the reported scores, is repeatable.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, shuffle=True, random_state=0)
+
+clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='gini',
+                             random_state=0, class_weight="balanced")
+clf.fit(X_train, y_train)
+
+accuracy = clf.score(X_test, y_test)
+print('Accuracy: ', accuracy)
+y_pred_train = clf.predict(X_train)
+y_pred_test = clf.predict(X_test)
+print("\n *************** TRAINING ****************")
+cm_train = plot_confusion(y_train, y_pred_train, class_labels)
+print(classification_report(y_train, y_pred_train))
+print("\n ************** VALIDATION ***************")
+cm_test = plot_confusion(y_test, y_pred_test, class_labels)
+print(classification_report(y_test, y_pred_test))
+
+# Hand-written sample flow, not used yet. scikit-learn's predict() expects
+# a 2-D array, so call reshape(1, -1) on it before passing it to clf.
+# NOTE(review): the five values presumably line up with the columns of X.
+example_measure = np.array([ip_to_bin('2.1.1.1'), ip_to_bin('2.1.1.2'), 0, 0, 1])