import pickle

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Preprocessing helper - encode dotted-quad IPv4 addresses as 32-bit integers
def ip_to_bin(ip):
    parts = ip.split('.')
    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
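# Worked example of the encoding above:
#   ip_to_bin('192.168.0.1') == (192 << 24) + (168 << 16) + (0 << 8) + 1 == 3232235521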

# 1. Importing data
data = pd.read_csv('training4tuplabeled.csv',
                   converters={
                       'sourceIPAddress': ip_to_bin,
                       'destinationIPAddress': ip_to_bin
                   })

# 2. Separating labels from data
y = data["sublabel"]
data = data.drop(columns=["sublabel"])
x = data.to_numpy()

print(data)

# 3. Splitting data into training/test subsets for model training and validation
#    (stratify=y keeps the proportions of the imbalanced label classes consistent in both splits)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
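
# Note: the preprocessing module imported above is not used in the flow below.
# If feature scaling were wanted ahead of the distance-based classifier, one
# illustrative option (kept commented out so the recorded results below still
# correspond to unscaled features) would be:
# scaler = preprocessing.StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)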

# 4. Fitting a k-nearest neighbors classifier (k=3) with the training split
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train, y_train)

# 5. The obtained model is evaluated on both the training and the test split
#    to check for underfitting and overfitting issues
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print("\n *************** TRAINING ****************")
|
|
print("\n Confusion matrix:")
|
|
print(confusion_matrix(y_train, y_pred_train))
|
|
print(classification_report(y_train, y_pred_train))
|
|
print("\n ************** VALIDATION ***************")
|
|
print("\n Confusion matrix:")
|
|
print(confusion_matrix(y_test, y_pred_test))
|
|
print(classification_report(y_test, y_pred_test))
|
|
|
|

# 6. Saving the obtained model
with open('network_traffic_classifier.sav', 'wb') as f:
    pickle.dump(model, f)
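
# A minimal sketch of reloading the saved model in a later session
# (illustrative; assumes new records use the same feature layout as the training data):
with open('network_traffic_classifier.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# loaded_model.predict(...) can then be applied to new 4-tuple flow records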

# *************** TRAINING ****************
#
# Confusion matrix:
# [[913963    262    239      0    282]
#  [  1998    670      1      0      4]
#  [   916      1    613      0      0]
#  [    97      0      0      1      0]
#  [  3618      4      0      0   1023]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      1.00    914746
#      Attempted Information Leak       0.72      0.25      0.37      2673
# Generic Protocol Command Decode       0.72      0.40      0.51      1530
#                   Misc activity       1.00      0.01      0.02        98
#         Potentially Bad Traffic       0.78      0.22      0.34      4645
#
#                        accuracy                           0.99    923692
#                       macro avg       0.84      0.38      0.45    923692
#                    weighted avg       0.99      0.99      0.99    923692

# ************** VALIDATION ***************
#
# Confusion matrix:
# [[228207    164    136      1    180]
#  [   591     74      0      0      3]
#  [   307      0     75      0      0]
#  [    24      0      0      0      0]
#  [  1096      2      0      0     64]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      0.99    228688
#      Attempted Information Leak       0.31      0.11      0.16       668
# Generic Protocol Command Decode       0.36      0.20      0.25       382
#                   Misc activity       0.00      0.00      0.00        24
#         Potentially Bad Traffic       0.26      0.06      0.09      1162
#
#                        accuracy                           0.99    230924
#                       macro avg       0.38      0.27      0.30    230924
#                    weighted avg       0.98      0.99      0.99    230924