Add draft of trainer and classifier script

2021-06-05 17:26:02 +02:00 · 2021-06-05 17:26:02 +02:00 · a0adc4e7a9
commit a0adc4e7a9
parent 0273476710
2 changed files with 108 additions and 13 deletions
--- a/competition/classifier.py
+++ b/competition/classifier.py
@ -1,24 +1,27 @@
 # 1. Importing new CSV data in pandas dataframes
 import pandas as pd
 import pickle
 from sklearn import preprocessing
 data = pd.read_csv("input.csv")
 x = data.to_numpy()
 # Preprocessing data - encode ip addresses to numerical values
-le = preprocessing.LabelEncoder()
+def ip_to_bin(ip):
-le.fit(data['sourceIPAddress'])
+    parts = ip.split('.')
-data['sourceIPAddress'] = le.transform(data['sourceIPAddress'])
+    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
 le.fit(data['destinationIPAddress'])
 data['destinationIPAddress'] = le.transform(data['destinationIPAddress'])
-# 3. Loading a trained model
+
 # 1. Import data, converting both dotted-quad address columns to integers while parsing
 data = pd.read_csv('input.csv',
                    converters={
                        'sourceIPAddress': ip_to_bin,
                        'destinationIPAddress': ip_to_bin
                    })
 x = data.to_numpy()
 # 2. Load the trained model and predict a label for every input row
 # NOTE: pickle.load can execute arbitrary code - only load a model file you trust.
 # The with-block makes sure the model file is closed once it has been read.
 with open('network_traffic_classifier.sav', 'rb') as model_file:
     model = pickle.load(model_file)
 y_pred = model.predict(x)
 # Attach the predictions as a new column and show the labelled table
 data['label'] = y_pred
 print(data)
-data.append(y_pred)
+# 3. Save output file
-
+data.to_csv('output.csv', index=False)  # write the labelled rows as CSV text; the old line tried to pickle the pandas module itself
--- a/competition/trainer.py
+++ b/competition/trainer.py
@ -0,0 +1,92 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report, confusion_matrix
 import pickle
 from sklearn import preprocessing
 # Preprocessing data - encode ip addresses to numerical values
 def ip_to_bin(ip):
     """Pack a dotted-quad IPv4 string into a single integer.

     The first four dot-separated fields are read as decimal integers and
     combined most-significant first, 8 bits apiece, so '10.0.0.1'
     becomes 167772161. Fields after the fourth are ignored and the
     range of each field is not checked.
     """
     octets = ip.split('.')
     packed = 0
     for position in range(4):
         # Move what has been accumulated up one byte, then add the next field.
         packed = (packed << 8) + int(octets[position])
     return packed
 # 1. Load the labelled training set; both address columns are converted
 # to integers by ip_to_bin while the CSV is parsed
 data = pd.read_csv(
     'training4tuplabeled.csv',
     converters={'sourceIPAddress': ip_to_bin, 'destinationIPAddress': ip_to_bin},
 )
 # 2. Move the target column out of the feature table
 y = data.pop("sublabel")
 x = data.to_numpy()
 print(data)
 # 3. Hold out 20% of the rows for validation, stratified on the label column
 x_train, x_test, y_train, y_test = train_test_split(
     data, y, test_size=0.2, stratify=y)
 # 4. Fit a k-nearest-neighbours classifier (k = 3) on the training split
 from sklearn.neighbors import KNeighborsClassifier
 model = KNeighborsClassifier(n_neighbors=3)
 model.fit(x_train, y_train)
 # 5. Score the fitted model on both the training and the validation split
 # so underfitting and overfitting can be spotted by comparing the reports
 y_pred_train = model.predict(x_train)
 y_pred_test = model.predict(x_test)
 for banner, y_true, y_hat in (
         ("\n *************** TRAINING ****************", y_train, y_pred_train),
         ("\n ************** VALIDATION ***************", y_test, y_pred_test)):
     print(banner)
     print("\n Confusion matrix:")
     print(confusion_matrix(y_true, y_hat))
     print(classification_report(y_true, y_hat))
 # 6. Save the fitted model; the with-block closes the file (flushing it
 # to disk) even if pickling raises
 with open('network_traffic_classifier.sav', 'wb') as model_file:
     pickle.dump(model, model_file)
 # *************** TRAINING ****************
 #
 # Confusion matrix:
 # [[913963    262    239      0    282]
 #  [  1998    670      1      0      4]
 # [   916      1    613      0      0]
 # [    97      0      0      1      0]
 # [  3618      4      0      0   1023]]
 # precision    recall  f1-score   support
 #
 # 0       0.99      1.00      1.00    914746
 # Attempted Information Leak       0.72      0.25      0.37      2673
 # Generic Protocol Command Decode       0.72      0.40      0.51      1530
 # Misc activity       1.00      0.01      0.02        98
 # Potentially Bad Traffic       0.78      0.22      0.34      4645
 #
 # accuracy                           0.99    923692
 # macro avg       0.84      0.38      0.45    923692
 # weighted avg       0.99      0.99      0.99    923692
 # ************** VALIDATION ***************
 #
 # Confusion matrix:
 # [[228207    164    136      1    180]
 #  [   591     74      0      0      3]
 # [   307      0     75      0      0]
 # [    24      0      0      0      0]
 # [  1096      2      0      0     64]]
 # precision    recall  f1-score   support
 #
 # 0       0.99      1.00      0.99    228688
 # Attempted Information Leak       0.31      0.11      0.16       668
 # Generic Protocol Command Decode       0.36      0.20      0.25       382
 # Misc activity       0.00      0.00      0.00        24
 # Potentially Bad Traffic       0.26      0.06      0.09      1162
 #
 # accuracy                           0.99    230924
 # macro avg       0.38      0.27      0.30    230924
 # weighted avg       0.98      0.99      0.99    230924