46 lines
1.6 KiB
Python
46 lines
1.6 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
import pickle
|
|
from imblearn.over_sampling import SMOTE
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.metrics import classification_report, confusion_matrix
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
def ip_to_bin(x):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value.

    The first octet becomes the most significant byte, so ``'10.0.0.1'``
    maps to ``167772161``.

    Args:
        x: Address text made of four decimal octets separated by dots.

    Returns:
        The address as a non-negative ``int`` in the range 0..2**32 - 1.

    Raises:
        ValueError: If ``x`` does not have exactly four octets, if an octet
            is not an integer, or if an octet is outside 0..255.
    """
    parts = x.split('.')
    # Reject short/long addresses explicitly instead of leaking an IndexError
    # or silently ignoring trailing octets.
    if len(parts) != 4:
        raise ValueError(
            'expected 4 dot-separated octets, got {}: {!r}'.format(len(parts), x)
        )

    value = 0
    for part in parts:
        octet = int(part)  # non-numeric text raises ValueError here
        # An out-of-range octet would spill into the neighbouring byte and
        # make distinct addresses collide on the same integer.
        if not 0 <= octet <= 255:
            raise ValueError('octet out of range 0..255 in {!r}'.format(x))
        value = (value << 8) | octet
    return value
|
|
|
|
|
|
# Load the labelled flow records. Both address columns are converted from
# dotted-quad text to integers while the CSV is parsed.
df = pd.read_csv(
    'training4tuplabeled.csv',
    converters={
        'sourceIPAddress': ip_to_bin,
        'destinationIPAddress': ip_to_bin,
    },
)

# Discard the flow start timestamp before building the feature matrix.
# `columns=` is used because passing the axis positionally
# (`df.drop([...], 1)`) was removed in pandas 2.0 and raises TypeError there.
df.drop(columns=['flowStartMilliseconds'], inplace=True)

# Features are every remaining column except the target; the target is
# the 'sublabel' column.
X = np.array(df.drop(columns=['sublabel']))
y = np.array(df['sublabel'])
|
|
|
|
# Hold out 20% of the rows for validation, stratified on the labels.
# NOTE(review): no random_state is passed to train_test_split, so the split
# changes from run to run even though SMOTE and the forest are seeded --
# confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)

# Oversample the training partition only; X_test / y_test are not resampled.
oversampler = SMOTE(k_neighbors=5, random_state=777)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Fit a 20-tree random forest on the resampled training data.
clf = RandomForestClassifier(
    n_estimators=20,
    criterion='gini',
    random_state=0,
    n_jobs=-1,
).fit(X_train, y_train)
|
|
|
|
# Mean accuracy of the fitted classifier on the held-out split.
accuracy = clf.score(X_test, y_test)
print('Accuracy: ', accuracy)

# Predictions and confusion matrices for both partitions.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

# Print the training section first, then the validation section: banner,
# confusion matrix, per-class report.
for banner, matrix, truth, predicted in (
    ("\n *************** TRAINING ****************", cm_train, y_train, y_pred_train),
    ("\n ************** VALIDATION ***************", cm_test, y_test, y_pred_test),
):
    print(banner)
    print(matrix)
    print(classification_report(truth, predicted))
|
|
|
|
# Persist the fitted classifier. The context manager closes the file even
# if pickling raises, instead of leaving the handle open.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)
|