Add draft of trainer and classifier script
This commit is contained in:
parent
0273476710
commit
a0adc4e7a9
@ -1,24 +1,27 @@
|
||||
# 1. Importing new CSV data in pandas dataframes
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from sklearn import preprocessing
|
||||
|
||||
# Load the raw records. Note that x is taken here, before the IP address
# columns are encoded below.
data = pd.read_csv("input.csv")
x = data.to_numpy()

# Preprocessing data - encode ip addresses to numerical values
# One encoder instance is re-fitted for each column in turn, so after this
# section it remains fitted on the destination addresses.
le = preprocessing.LabelEncoder()
data['sourceIPAddress'] = le.fit_transform(data['sourceIPAddress'])
data['destinationIPAddress'] = le.fit_transform(data['destinationIPAddress'])
|
||||
def ip_to_bin(ip):
    """Convert a dotted-quad IPv4 string into its 32-bit integer value.

    Args:
        ip: Address text such as ``"192.168.1.1"``.

    Returns:
        The address as an int in ``[0, 2**32 - 1]``; the first octet is the
        most significant byte.

    Raises:
        ValueError: If ``ip`` does not consist of exactly four dot-separated
            integer octets, or if any octet is outside ``0..255``.
    """
    parts = ip.split('.')
    # Reject short and long inputs explicitly instead of raising IndexError
    # or silently ignoring trailing octets.
    if len(parts) != 4:
        raise ValueError(f"expected 4 octets in IPv4 address, got {ip!r}")

    value = 0
    for part in parts:
        octet = int(part)  # int() raises ValueError for non-numeric text
        # An out-of-range octet would spill into the neighbouring bytes and
        # make distinct addresses collide.
        if not 0 <= octet <= 255:
            raise ValueError(f"octet out of range in IPv4 address {ip!r}")
        value = (value << 8) | octet
    return value
|
||||
|
||||
# 3. Loading a trained model
|
||||
|
||||
# 1. Import data
# Both address columns are run through ip_to_bin while the CSV is parsed,
# so they arrive in the frame already converted to integers.
data = pd.read_csv(
    'input.csv',
    converters=dict.fromkeys(
        ('sourceIPAddress', 'destinationIPAddress'),
        ip_to_bin,
    ),
)
x = data.to_numpy()
|
||||
|
||||
# 2. Loading a trained model and predict
# SECURITY: unpickling executes arbitrary code from the file; only load a
# model file that was produced by the trainer script on a trusted machine.
with open('network_traffic_classifier.sav', 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(x)

# Attach the predictions as a new column. x was taken before this point, so
# the label column never leaks into the feature matrix.
data['label'] = y_pred
print(data)

# 3. Save output file
# Write the labelled records as real CSV. The previous code pickled the
# pandas module itself into 'output.csv', which cannot produce usable output.
data.to_csv('output.csv', index=False)
|
||||
|
||||
92
competition/trainer.py
Normal file
92
competition/trainer.py
Normal file
@ -0,0 +1,92 @@
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
import pickle
|
||||
from sklearn import preprocessing
|
||||
|
||||
|
||||
# Preprocessing data - encode ip addresses to numerical values
|
||||
def ip_to_bin(ip):
    """Convert a dotted-quad IPv4 string into its 32-bit integer value.

    Args:
        ip: Address text such as ``"192.168.1.1"``.

    Returns:
        The address as an int in ``[0, 2**32 - 1]``; the first octet is the
        most significant byte.

    Raises:
        ValueError: If ``ip`` does not consist of exactly four dot-separated
            integer octets, or if any octet is outside ``0..255``.
    """
    parts = ip.split('.')
    # Reject short and long inputs explicitly instead of raising IndexError
    # or silently ignoring trailing octets.
    if len(parts) != 4:
        raise ValueError(f"expected 4 octets in IPv4 address, got {ip!r}")

    value = 0
    for part in parts:
        octet = int(part)  # int() raises ValueError for non-numeric text
        # An out-of-range octet would spill into the neighbouring bytes and
        # make distinct addresses collide.
        if not 0 <= octet <= 255:
            raise ValueError(f"octet out of range in IPv4 address {ip!r}")
        value = (value << 8) | octet
    return value
|
||||
|
||||
|
||||
# 1. Importing data
# The IP address columns are converted to integers by ip_to_bin while the
# CSV is parsed.
data = pd.read_csv('training4tuplabeled.csv',
                   converters={
                       'sourceIPAddress': ip_to_bin,
                       'destinationIPAddress': ip_to_bin,
                   })

# 2. Separating labels from data
y = data["sublabel"]
data = data.drop(columns=["sublabel"])
x = data.to_numpy()

print(data)

# 3. Splitting data into training/test subsets for model training and validation
# Split the NumPy matrix rather than the DataFrame, so the model is fitted on
# the same kind of input (data.to_numpy()) that the classifier script passes
# to predict(). stratify=y keeps the class proportions equal in both subsets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)

# 4. Fitting a k-nearest-neighbours classifier (k=3) with the training split
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(3)
model.fit(x_train, y_train)
|
||||
|
||||
# 5. Evaluate the fitted model on both the training and the test split;
# comparing the two reports is how underfitting and overfitting are spotted.
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# (banner, true labels, predicted labels) for each report, in print order.
reports = (
    ("\n *************** TRAINING ****************", y_train, y_pred_train),
    ("\n ************** VALIDATION ***************", y_test, y_pred_test),
)
for banner, y_true, y_hat in reports:
    print(banner)
    print("\n Confusion matrix:")
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat))
|
||||
|
||||
# 6. Saving the obtained model
# The context manager flushes and closes the file even if pickling fails;
# the previous bare open() left the handle for the garbage collector.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
|
||||
|
||||
|
||||
# *************** TRAINING ****************
|
||||
#
|
||||
# Confusion matrix:
|
||||
# [[913963 262 239 0 282]
|
||||
# [ 1998 670 1 0 4]
|
||||
# [ 916 1 613 0 0]
|
||||
# [ 97 0 0 1 0]
|
||||
# [ 3618 4 0 0 1023]]
|
||||
# precision recall f1-score support
|
||||
#
|
||||
# 0 0.99 1.00 1.00 914746
|
||||
# Attempted Information Leak 0.72 0.25 0.37 2673
|
||||
# Generic Protocol Command Decode 0.72 0.40 0.51 1530
|
||||
# Misc activity 1.00 0.01 0.02 98
|
||||
# Potentially Bad Traffic 0.78 0.22 0.34 4645
|
||||
#
|
||||
# accuracy 0.99 923692
|
||||
# macro avg 0.84 0.38 0.45 923692
|
||||
# weighted avg 0.99 0.99 0.99 923692
|
||||
|
||||
# ************** VALIDATION ***************
|
||||
#
|
||||
# Confusion matrix:
|
||||
# [[228207 164 136 1 180]
|
||||
# [ 591 74 0 0 3]
|
||||
# [ 307 0 75 0 0]
|
||||
# [ 24 0 0 0 0]
|
||||
# [ 1096 2 0 0 64]]
|
||||
# precision recall f1-score support
|
||||
#
|
||||
# 0 0.99 1.00 0.99 228688
|
||||
# Attempted Information Leak 0.31 0.11 0.16 668
|
||||
# Generic Protocol Command Decode 0.36 0.20 0.25 382
|
||||
# Misc activity 0.00 0.00 0.00 24
|
||||
# Potentially Bad Traffic 0.26 0.06 0.09 1162
|
||||
#
|
||||
# accuracy 0.99 230924
|
||||
# macro avg 0.38 0.27 0.30 230924
|
||||
# weighted avg 0.98 0.99 0.99 230924
|
||||
Loading…
x
Reference in New Issue
Block a user