From a0adc4e7a992eedf272f5ea4e89400e9760a3edb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Windsperger?= <e1302775@student.tuwien.ac.at>
Date: Sat, 5 Jun 2021 17:26:02 +0200
Subject: [PATCH] Add draft of trainer and classifier script

---
 competition/classifier.py | 29 ++++++------
 competition/trainer.py    | 92 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 13 deletions(-)
 create mode 100644 competition/trainer.py

diff --git a/competition/classifier.py b/competition/classifier.py
index 023bcc5..0e0878a 100644
--- a/competition/classifier.py
+++ b/competition/classifier.py
@@ -1,24 +1,27 @@
-# 1. Importing new CSV data in pandas dataframes
 import pandas as pd
 import pickle
-from sklearn import preprocessing
 
-data = pd.read_csv("input.csv")
-x = data.to_numpy()
 
 # Preprocessing data - encode ip addresses to numerical values
-le = preprocessing.LabelEncoder()
-le.fit(data['sourceIPAddress'])
-data['sourceIPAddress'] = le.transform(data['sourceIPAddress'])
-le.fit(data['destinationIPAddress'])
-data['destinationIPAddress'] = le.transform(data['destinationIPAddress'])
+def ip_to_bin(ip):
+    parts = ip.split('.')
+    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
 
-# 3. Loading a trained model
+
+# 1. Import data
+data = pd.read_csv('input.csv',
+                   converters={
+                       'sourceIPAddress': lambda x1: ip_to_bin(x1),
+                       'destinationIPAddress': lambda x2: ip_to_bin(x2)
+                   })
+x = data.to_numpy()
+
+# 2. Load the trained model and predict a label for every input row
 model = pickle.load(open('network_traffic_classifier.sav', 'rb'))
 y_pred = model.predict(x)
 
+data['label'] = y_pred
 print(data)
 
-data.append(y_pred)
-
-
+# 3. Save the input rows plus the predicted 'label' column as CSV
+data.to_csv('output.csv', index=False)
diff --git a/competition/trainer.py b/competition/trainer.py
new file mode 100644
index 0000000..c8e0bcc
--- /dev/null
+++ b/competition/trainer.py
@@ -0,0 +1,92 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix
+import pickle
+from sklearn import preprocessing
+
+
+# Preprocessing data - encode ip addresses to numerical values
+def ip_to_bin(ip):
+    parts = ip.split('.')
+    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
+
+
+# 1. Importing data
+data = pd.read_csv('training4tuplabeled.csv',
+                   converters={
+                       'sourceIPAddress': lambda x1: ip_to_bin(x1),
+                       'destinationIPAddress': lambda x2: ip_to_bin(x2)
+                   })
+
+# 2. Separating labels from data
+y = data["sublabel"]
+data = data.drop(columns=["sublabel"])
+x = data.to_numpy()
+
+print(data)
+
+# 3. Splitting data into training/test subsets for model training and validation
+x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.2, stratify=y)
+
+# 4. Fitting a k-nearest-neighbours classifier (k=3) with the training split
+from sklearn.neighbors import KNeighborsClassifier
+model = KNeighborsClassifier(3)
+model.fit(x_train, y_train)
+
+# 5. The obtained model is evaluated on both the training and the test split
+# to check for underfitting and overfitting
+y_pred_train = model.predict(x_train)
+y_pred_test = model.predict(x_test)
+
+print("\n *************** TRAINING ****************")
+print("\n Confusion matrix:")
+print(confusion_matrix(y_train, y_pred_train))
+print(classification_report(y_train, y_pred_train))
+print("\n ************** VALIDATION ***************")
+print("\n Confusion matrix:")
+print(confusion_matrix(y_test, y_pred_test))
+print(classification_report(y_test, y_pred_test))
+
+# 6. Saving the obtained model (the context manager flushes and closes the file)
+with open('network_traffic_classifier.sav', 'wb') as model_file:
+    pickle.dump(model, model_file)
+
+# *************** TRAINING ****************
+#
+# Confusion matrix:
+# [[913963    262    239      0    282]
+#  [  1998    670      1      0      4]
+# [   916      1    613      0      0]
+# [    97      0      0      1      0]
+# [  3618      4      0      0   1023]]
+# precision    recall  f1-score   support
+#
+# 0       0.99      1.00      1.00    914746
+# Attempted Information Leak       0.72      0.25      0.37      2673
+# Generic Protocol Command Decode       0.72      0.40      0.51      1530
+# Misc activity       1.00      0.01      0.02        98
+# Potentially Bad Traffic       0.78      0.22      0.34      4645
+#
+# accuracy                           0.99    923692
+# macro avg       0.84      0.38      0.45    923692
+# weighted avg       0.99      0.99      0.99    923692
+
+# ************** VALIDATION ***************
+#
+# Confusion matrix:
+# [[228207    164    136      1    180]
+#  [   591     74      0      0      3]
+# [   307      0     75      0      0]
+# [    24      0      0      0      0]
+# [  1096      2      0      0     64]]
+# precision    recall  f1-score   support
+#
+# 0       0.99      1.00      0.99    228688
+# Attempted Information Leak       0.31      0.11      0.16       668
+# Generic Protocol Command Decode       0.36      0.20      0.25       382
+# Misc activity       0.00      0.00      0.00        24
+# Potentially Bad Traffic       0.26      0.06      0.09      1162
+#
+# accuracy                           0.99    230924
+# macro avg       0.38      0.27      0.30    230924
+# weighted avg       0.98      0.99      0.99    230924