import pickle

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Preprocessing helper - encode dotted-quad IPv4 addresses as 32-bit integers
def ip_to_bin(ip):
    parts = ip.split('.')
    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
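# Worked example of the encoding above:
#   ip_to_bin('192.168.0.1') == (192 << 24) + (168 << 16) + (0 << 8) + 1 == 3232235521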

# 1. Importing data
data = pd.read_csv('training4tuplabeled.csv',
                   converters={
                       'sourceIPAddress': ip_to_bin,
                       'destinationIPAddress': ip_to_bin
                   })

# 2. Separating labels from data
y = data["sublabel"]
data = data.drop(columns=["sublabel"])
x = data.to_numpy()

print(data)

# 3. Splitting data into training/test subsets for model training and validation
#    (stratify=y keeps the proportions of the imbalanced label classes consistent in both splits)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
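
# Note: the preprocessing module imported above is not used in the flow below.
# If feature scaling were wanted ahead of the distance-based classifier, one
# illustrative option (kept commented out so the recorded results below still
# correspond to unscaled features) would be:
# scaler = preprocessing.StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)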

# 4. Fitting a k-nearest neighbors classifier (k=3) with the training split
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train, y_train)

# 5. The obtained model is evaluated on both the training and the test split
#    to check for underfitting and overfitting issues
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print("\n *************** TRAINING ****************")
|
|
print("\n Confusion matrix:")
|
|
print(confusion_matrix(y_train, y_pred_train))
|
|
print(classification_report(y_train, y_pred_train))
|
|
print("\n ************** VALIDATION ***************")
|
|
print("\n Confusion matrix:")
|
|
print(confusion_matrix(y_test, y_pred_test))
|
|
print(classification_report(y_test, y_pred_test))
|
|
|
|

# 6. Saving the obtained model
with open('network_traffic_classifier.sav', 'wb') as f:
    pickle.dump(model, f)
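
# A minimal sketch of reloading the saved model in a later session
# (illustrative; assumes new records use the same feature layout as the training data):
with open('network_traffic_classifier.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# loaded_model.predict(...) can then be applied to new 4-tuple flow records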

# *************** TRAINING ****************
#
# Confusion matrix:
# [[913963    262    239      0    282]
#  [  1998    670      1      0      4]
#  [   916      1    613      0      0]
#  [    97      0      0      1      0]
#  [  3618      4      0      0   1023]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      1.00    914746
#      Attempted Information Leak       0.72      0.25      0.37      2673
# Generic Protocol Command Decode       0.72      0.40      0.51      1530
#                   Misc activity       1.00      0.01      0.02        98
#         Potentially Bad Traffic       0.78      0.22      0.34      4645
#
#                        accuracy                           0.99    923692
#                       macro avg       0.84      0.38      0.45    923692
#                    weighted avg       0.99      0.99      0.99    923692

# ************** VALIDATION ***************
#
# Confusion matrix:
# [[228207    164    136      1    180]
#  [   591     74      0      0      3]
#  [   307      0     75      0      0]
#  [    24      0      0      0      0]
#  [  1096      2      0      0     64]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      0.99    228688
#      Attempted Information Leak       0.31      0.11      0.16       668
# Generic Protocol Command Decode       0.36      0.20      0.25       382
#                   Misc activity       0.00      0.00      0.00        24
#         Potentially Bad Traffic       0.26      0.06      0.09      1162
#
#                        accuracy                           0.99    230924
#                       macro avg       0.38      0.27      0.30    230924
#                    weighted avg       0.98      0.99      0.99    230924