"""Train and evaluate a k-nearest-neighbours network-traffic classifier.

The script reads the labelled training CSV, encodes the IP address columns
as integers, fits a ``KNeighborsClassifier``, prints confusion matrices and
classification reports for both the training and the held-out split, and
pickles the fitted model to disk.
"""
import pickle

import pandas as pd

TRAINING_CSV = 'training4tuplabeled.csv'
MODEL_PATH = 'network_traffic_classifier.sav'
LABEL_COLUMN = "sublabel"
IP_COLUMNS = ('sourceIPAddress', 'destinationIPAddress')
N_NEIGHBORS = 3
TEST_SIZE = 0.2
# Fixed seed so the train/test split (and therefore the reported metrics)
# can be reproduced from one run to the next.
RANDOM_STATE = 42


# Preprocessing data - encode ip addresses to numerical values
def ip_to_bin(ip):
    """Encode a dotted-quad IPv4 address string as one 32-bit integer.

    The first octet ends up in the most significant byte, e.g.
    ``"192.168.1.1"`` -> ``3232235777``.

    Args:
        ip: Address in ``"a.b.c.d"`` form.

    Returns:
        The address as an ``int`` in the range ``0 .. 2**32 - 1``.

    Raises:
        ValueError: If ``ip`` does not consist of exactly four
            dot-separated integers in the range 0-255.
    """
    parts = ip.split('.')
    if len(parts) != 4:
        raise ValueError(
            "expected 4 dot-separated octets, got {!r}".format(ip))

    value = 0
    for part in parts:
        octet = int(part)  # raises ValueError for non-numeric text
        # An out-of-range octet would bleed into the neighbouring byte and
        # make two different addresses encode to the same integer.
        if not 0 <= octet <= 255:
            raise ValueError(
                "octet {!r} out of range 0-255 in {!r}".format(part, ip))
        value = (value << 8) | octet
    return value


def main():
    """Run the pipeline: load, split, fit, evaluate and save the model."""
    # scikit-learn is imported here rather than at module level so that
    # ip_to_bin can be imported on its own (e.g. to encode addresses the same
    # way at prediction time) without requiring scikit-learn.
    from sklearn import preprocessing  # noqa: F401 - unused, kept from the original script
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    # 1. Importing data
    data = pd.read_csv(
        TRAINING_CSV,
        converters={column: ip_to_bin for column in IP_COLUMNS},
    )

    # 2. Separating labels from data
    y = data[LABEL_COLUMN]
    data = data.drop(columns=[LABEL_COLUMN])
    print(data)

    # 3. Splitting data into training/test subsets for model training and
    #    validation (stratified so both splits keep the label proportions)
    x_train, x_test, y_train, y_test = train_test_split(
        data, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    # 4. Fitting a k-nearest-neighbours classifier with the training split
    # NOTE(review): the features are passed to the classifier unscaled; the
    # imported-but-unused `preprocessing` module suggests scaling may have
    # been intended - confirm before changing the model.
    model = KNeighborsClassifier(N_NEIGHBORS)
    model.fit(x_train, y_train)

    # 5. The obtained model is tested with both the training and test split
    #    to ensure no underfitting and overfitting issues
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    print("\n *************** TRAINING ****************")
    print("\n Confusion matrix:")
    print(confusion_matrix(y_train, y_pred_train))
    print(classification_report(y_train, y_pred_train))

    print("\n ************** VALIDATION ***************")
    print("\n Confusion matrix:")
    print(confusion_matrix(y_test, y_pred_test))
    print(classification_report(y_test, y_pred_test))

    # 6. Saving the obtained model
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(MODEL_PATH, 'wb') as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    main()

# Output recorded from a run of the original script, which did not fix
# random_state; numbers from a new run may differ slightly.
#
# *************** TRAINING ****************
#
# Confusion matrix:
# [[913963    262    239      0    282]
#  [  1998    670      1      0      4]
#  [   916      1    613      0      0]
#  [    97      0      0      1      0]
#  [  3618      4      0      0   1023]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      1.00    914746
#      Attempted Information Leak       0.72      0.25      0.37      2673
# Generic Protocol Command Decode       0.72      0.40      0.51      1530
#                   Misc activity       1.00      0.01      0.02        98
#         Potentially Bad Traffic       0.78      0.22      0.34      4645
#
#                        accuracy                           0.99    923692
#                       macro avg       0.84      0.38      0.45    923692
#                    weighted avg       0.99      0.99      0.99    923692
#
# ************** VALIDATION ***************
#
# Confusion matrix:
# [[228207    164    136      1    180]
#  [   591     74      0      0      3]
#  [   307      0     75      0      0]
#  [    24      0      0      0      0]
#  [  1096      2      0      0     64]]
#                                  precision    recall  f1-score   support
#
#                               0       0.99      1.00      0.99    228688
#      Attempted Information Leak       0.31      0.11      0.16       668
# Generic Protocol Command Decode       0.36      0.20      0.25       382
#                   Misc activity       0.00      0.00      0.00        24
#         Potentially Bad Traffic       0.26      0.06      0.09      1162
#
#                        accuracy                           0.99    230924
#                       macro avg       0.38      0.27      0.30    230924
#                    weighted avg       0.98      0.99      0.99    230924