From a0adc4e7a992eedf272f5ea4e89400e9760a3edb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Windsperger?=
Date: Sat, 5 Jun 2021 17:26:02 +0200
Subject: [PATCH] Add draft of trainer and classifier script

---
Review notes (not part of the commit message):
- classifier.py: write the labelled DataFrame with DataFrame.to_csv instead of
  pickling the pandas module into output.csv; close the model file after loading.
- trainer.py: fit on the numpy array that classifier.py also predicts on, fix the
  step 4 comment (the model is k-nearest neighbours), close the model file.
- The index hashes below are those of the original commit.

 competition/classifier.py | 34 ++++++++------
 competition/trainer.py    | 96 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 14 deletions(-)
 create mode 100644 competition/trainer.py

diff --git a/competition/classifier.py b/competition/classifier.py
index 023bcc5..0e0878a 100644
--- a/competition/classifier.py
+++ b/competition/classifier.py
@@ -1,24 +1,30 @@
-# 1. Importing new CSV data in pandas dataframes
 import pandas as pd
 import pickle
-from sklearn import preprocessing
 
-data = pd.read_csv("input.csv")
-x = data.to_numpy()
 
 # Preprocessing data - encode ip addresses to numerical values
-le = preprocessing.LabelEncoder()
-le.fit(data['sourceIPAddress'])
-data['sourceIPAddress'] = le.transform(data['sourceIPAddress'])
-le.fit(data['destinationIPAddress'])
-data['destinationIPAddress'] = le.transform(data['destinationIPAddress'])
+def ip_to_bin(ip):
+    """Return the 32-bit integer value of a dotted-quad IPv4 string, e.g. '1.2.3.4' -> 16909060."""
+    parts = ip.split('.')
+    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
 
-# 3. Loading a trained model
-model = pickle.load(open('network_traffic_classifier.sav', 'rb'))
+
+# 1. Import data
+data = pd.read_csv('input.csv',
+                   converters={
+                       'sourceIPAddress': ip_to_bin,
+                       'destinationIPAddress': ip_to_bin
+                   })
+x = data.to_numpy()
+
+# 2. Loading a trained model and predict
+# NOTE: unpickling can execute arbitrary code - only load model files written by trainer.py
+with open('network_traffic_classifier.sav', 'rb') as model_file:
+    model = pickle.load(model_file)
 y_pred = model.predict(x)
+data['label'] = y_pred
 print(data)
 
 
-data.append(y_pred)
-
-
+# 3. Save output file
+data.to_csv('output.csv', index=False)
diff --git a/competition/trainer.py b/competition/trainer.py
new file mode 100644
index 0000000..c8e0bcc
--- /dev/null
+++ b/competition/trainer.py
@@ -0,0 +1,96 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.neighbors import KNeighborsClassifier
+import pickle
+from sklearn import preprocessing
+
+
+# Preprocessing data - encode ip addresses to numerical values
+def ip_to_bin(ip):
+    """Return the 32-bit integer value of a dotted-quad IPv4 string, e.g. '1.2.3.4' -> 16909060."""
+    parts = ip.split('.')
+    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
+
+
+# 1. Importing data
+data = pd.read_csv('training4tuplabeled.csv',
+                   converters={
+                       'sourceIPAddress': ip_to_bin,
+                       'destinationIPAddress': ip_to_bin
+                   })
+
+# 2. Separating labels from data
+y = data["sublabel"]
+data = data.drop(columns=["sublabel"])
+x = data.to_numpy()
+
+print(data)
+
+# 3. Splitting data into training/test subsets for model training and validation.
+# The numpy array (not the DataFrame) is split, so the model is fitted on the same
+# kind of input that classifier.py later passes to model.predict().
+x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
+
+# 4. Fitting a k-nearest-neighbours classifier (k=3) with the training split
+model = KNeighborsClassifier(3)
+model.fit(x_train, y_train)
+
+# 5. The obtained model is tested with both the training and test split
+# to ensure no underfitting and overfitting issues
+y_pred_train = model.predict(x_train)
+y_pred_test = model.predict(x_test)
+
+print("\n *************** TRAINING ****************")
+print("\n Confusion matrix:")
+print(confusion_matrix(y_train, y_pred_train))
+print(classification_report(y_train, y_pred_train))
+print("\n ************** VALIDATION ***************")
+print("\n Confusion matrix:")
+print(confusion_matrix(y_test, y_pred_test))
+print(classification_report(y_test, y_pred_test))
+
+# 6. Saving the obtained model
+with open('network_traffic_classifier.sav', 'wb') as model_file:
+    pickle.dump(model, model_file)
+
+
+# *************** TRAINING ****************
+#
+# Confusion matrix:
+# [[913963    262    239      0    282]
+#  [  1998    670      1      0      4]
+#  [   916      1    613      0      0]
+#  [    97      0      0      1      0]
+#  [  3618      4      0      0   1023]]
+#                                  precision    recall  f1-score   support
+#
+#                               0       0.99      1.00      1.00    914746
+#      Attempted Information Leak       0.72      0.25      0.37      2673
+# Generic Protocol Command Decode       0.72      0.40      0.51      1530
+#                   Misc activity       1.00      0.01      0.02        98
+#         Potentially Bad Traffic       0.78      0.22      0.34      4645
+#
+#                        accuracy                           0.99    923692
+#                       macro avg       0.84      0.38      0.45    923692
+#                    weighted avg       0.99      0.99      0.99    923692
+
+# ************** VALIDATION ***************
+#
+# Confusion matrix:
+# [[228207    164    136      1    180]
+#  [   591     74      0      0      3]
+#  [   307      0     75      0      0]
+#  [    24      0      0      0      0]
+#  [  1096      2      0      0     64]]
+#                                  precision    recall  f1-score   support
+#
+#                               0       0.99      1.00      0.99    228688
+#      Attempted Information Leak       0.31      0.11      0.16       668
+# Generic Protocol Command Decode       0.36      0.20      0.25       382
+#                   Misc activity       0.00      0.00      0.00        24
+#         Potentially Bad Traffic       0.26      0.06      0.09      1162
+#
+#                        accuracy                           0.99    230924
+#                       macro avg       0.38      0.27      0.30    230924
+#                    weighted avg       0.98      0.99      0.99    230924