"""Train a random-forest classifier on labeled network-flow records.

Reads ``training4tuplabeled.csv``, encodes the source/destination IPv4
addresses as integers, fits a ``RandomForestClassifier`` to predict the
``sublabel`` column, prints training/validation metrics, and pickles the
fitted model to ``network_traffic_classifier.sav``.
"""

import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


def ip_to_bin(x: str) -> int:
    """Return the integer value of a dotted-quad IPv4 address string.

    The first four dot-separated fields are parsed as integers and packed
    big-endian (first field in the most significant byte). Fields are not
    range-checked, and any fields beyond the fourth are ignored.

    Raises:
        IndexError: if *x* has fewer than four dot-separated fields.
        ValueError: if one of the first four fields is not an integer.
    """
    parts = x.split('.')
    return (
        (int(parts[0]) << 24)
        + (int(parts[1]) << 16)
        + (int(parts[2]) << 8)
        + int(parts[3])
    )


# Convert both address columns to integers while parsing so the resulting
# frame can be handed to scikit-learn as a numeric array.
df = pd.read_csv(
    'training4tuplabeled.csv',
    converters={
        'sourceIPAddress': ip_to_bin,
        'destinationIPAddress': ip_to_bin,
    },
)

# The flow start timestamp is excluded from the features. Use the keyword
# form: a positional ``axis`` is rejected by pandas >= 2.0.
df.drop(columns=['flowStartMilliseconds'], inplace=True)

# Every remaining column except the target is a feature.
X = np.array(df.drop(columns=['sublabel']))
y = np.array(df['sublabel'])

# NOTE(review): the split is not seeded, so the reported metrics vary from
# run to run even though the forest itself uses random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    criterion='gini',
    random_state=0,
)
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print('Accuracy: ', accuracy)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print("\n *************** TRAINING ****************")
cm_train = confusion_matrix(y_train, y_pred_train)
print(cm_train)
print(classification_report(y_train, y_pred_train))

print("\n ************** VALIDATION ***************")
cm_test = confusion_matrix(y_test, y_pred_test)
print(cm_test)
print(classification_report(y_test, y_pred_test))

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('network_traffic_classifier.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)