Add draft of trainer and classifier script

This commit is contained in:
Günter Windsperger 2021-06-05 17:26:02 +02:00
parent 0273476710
commit a0adc4e7a9
2 changed files with 108 additions and 13 deletions

View File

@ -1,24 +1,27 @@
"""Label the network flows in ``input.csv`` with a previously trained model.

Reads ``input.csv``, converts the two IP address columns to integers,
predicts a label for every row with the pickled model stored in
``network_traffic_classifier.sav`` and writes the labelled rows to
``output.csv``.
"""
import pickle

import pandas as pd


# Preprocessing data - encode ip addresses to numerical values
def ip_to_bin(ip):
    """Return the dotted-quad IPv4 address *ip* as an integer.

    The first octet ends up in the most significant byte, so
    ``'1.2.3.4'`` becomes ``0x01020304``.

    Raises:
        ValueError: if *ip* does not consist of exactly four
            dot-separated integers in the range 0-255.
    """
    parts = ip.split('.')
    if len(parts) != 4:
        raise ValueError(f"expected four dot-separated octets, got {ip!r}")
    value = 0
    for part in parts:
        octet = int(part)  # raises ValueError on non-numeric text
        if not 0 <= octet <= 255:
            raise ValueError(f"octet out of range in {ip!r}")
        value = (value << 8) | octet
    return value


def main():
    """Run the import / predict / save pipeline."""
    # 1. Import data
    data = pd.read_csv(
        'input.csv',
        converters={
            'sourceIPAddress': ip_to_bin,
            'destinationIPAddress': ip_to_bin,
        },
    )
    # Taken before the label column is added, so the model only sees the
    # columns that were read from the file.
    x = data.to_numpy()

    # 2. Loading a trained model and predict
    # NOTE: unpickling runs arbitrary code - only load model files you trust.
    with open('network_traffic_classifier.sav', 'rb') as model_file:
        model = pickle.load(model_file)
    y_pred = model.predict(x)
    data['label'] = y_pred
    print(data)

    # 3. Save output file
    # Write real CSV text; the row index is left out so the output has the
    # input columns plus 'label' only.
    data.to_csv('output.csv', index=False)


if __name__ == '__main__':
    main()

92
competition/trainer.py Normal file
View File

@ -0,0 +1,92 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from sklearn import preprocessing
# Preprocessing data - encode ip addresses to numerical values
def ip_to_bin(ip):
    """Return the dotted-quad IPv4 address *ip* as an integer.

    The first octet ends up in the most significant byte, so
    ``'1.2.3.4'`` becomes ``0x01020304``.

    Raises:
        ValueError: if *ip* does not consist of exactly four
            dot-separated integers in the range 0-255.
    """
    parts = ip.split('.')
    if len(parts) != 4:
        raise ValueError(f"expected four dot-separated octets, got {ip!r}")
    value = 0
    for part in parts:
        octet = int(part)  # raises ValueError on non-numeric text
        if not 0 <= octet <= 255:
            # Without this check an oversized octet would spill into the
            # neighbouring byte and collide with a different address.
            raise ValueError(f"octet out of range in {ip!r}")
        value = (value << 8) | octet
    return value
# 1. Importing data
# Both IP address columns are run through ip_to_bin while the CSV is parsed.
data = pd.read_csv(
    'training4tuplabeled.csv',
    converters={
        'sourceIPAddress': ip_to_bin,
        'destinationIPAddress': ip_to_bin,
    },
)

# 2. Separating labels from data
y = data["sublabel"]
data = data.drop(columns=["sublabel"])
x = data.to_numpy()
print(data)

# 3. Splitting data into training/test subsets for model training and validation
# The split is stratified on the label. The plain array `x` is used (not the
# DataFrame) so the model is fitted without column names, matching a caller
# that predicts on `data.to_numpy()`.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)

# 4. Fitting a k-nearest-neighbours classifier (k=3) with the training split
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(3)
model.fit(x_train, y_train)

# 5. The obtained model is tested with both the training and test split
#    to ensure no underfitting and overfitting issues
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print("\n *************** TRAINING ****************")
print("\n Confusion matrix:")
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

print("\n ************** VALIDATION ***************")
print("\n Confusion matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

# 6. Saving the obtained model
# The with-block closes the file so the pickle is fully flushed to disk.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
# *************** TRAINING ****************
#
# Confusion matrix:
# [[913963 262 239 0 282]
# [ 1998 670 1 0 4]
# [ 916 1 613 0 0]
# [ 97 0 0 1 0]
# [ 3618 4 0 0 1023]]
# precision recall f1-score support
#
# 0 0.99 1.00 1.00 914746
# Attempted Information Leak 0.72 0.25 0.37 2673
# Generic Protocol Command Decode 0.72 0.40 0.51 1530
# Misc activity 1.00 0.01 0.02 98
# Potentially Bad Traffic 0.78 0.22 0.34 4645
#
# accuracy 0.99 923692
# macro avg 0.84 0.38 0.45 923692
# weighted avg 0.99 0.99 0.99 923692
# ************** VALIDATION ***************
#
# Confusion matrix:
# [[228207 164 136 1 180]
# [ 591 74 0 0 3]
# [ 307 0 75 0 0]
# [ 24 0 0 0 0]
# [ 1096 2 0 0 64]]
# precision recall f1-score support
#
# 0 0.99 1.00 0.99 228688
# Attempted Information Leak 0.31 0.11 0.16 668
# Generic Protocol Command Decode 0.36 0.20 0.25 382
# Misc activity 0.00 0.00 0.00 24
# Potentially Bad Traffic 0.26 0.06 0.09 1162
#
# accuracy 0.99 230924
# macro avg 0.38 0.27 0.30 230924
# weighted avg 0.98 0.99 0.99 230924