Update RF trainer + classifier
+ Change preprocessing method + Do sampling before data splitting
This commit is contained in:
parent
2656724bb4
commit
f87eb289e4
@ -1,45 +1,45 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pickle
|
import pickle
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
|
||||||
# Preprocessing data - encode ip addresses to numerical values
|
# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
|
||||||
def ip_to_bin(ip):
|
class MultiColumnLabelEncoder:
|
||||||
parts = ip.split('.')
|
def __init__(self, columns=None):
|
||||||
return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
|
self.columns = columns # array of column names to encode
|
||||||
|
|
||||||
|
def fit(self, X, y=None):
|
||||||
|
return self # not relevant here
|
||||||
|
|
||||||
# Postprocessing data - decode numerical values to ip addresses
|
def transform(self, X):
|
||||||
def bin_to_ip(ipnum):
|
'''
|
||||||
o1 = int(ipnum / 16777216) % 256
|
Transforms columns of X specified in self.columns using
|
||||||
o2 = int(ipnum / 65536) % 256
|
LabelEncoder(). If no columns specified, transforms all
|
||||||
o3 = int(ipnum / 256) % 256
|
columns in X.
|
||||||
o4 = int(ipnum) % 256
|
'''
|
||||||
return '%(o1)s.%(o2)s.%(o3)s.%(o4)s' % locals()
|
output = X.copy()
|
||||||
|
if self.columns is not None:
|
||||||
|
for col in self.columns:
|
||||||
|
output[col] = LabelEncoder().fit_transform(output[col])
|
||||||
|
else:
|
||||||
|
for colname, col in output.iteritems():
|
||||||
|
output[colname] = LabelEncoder().fit_transform(col)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def fit_transform(self, X, y=None):
|
||||||
|
return self.fit(X, y).transform(X)
|
||||||
|
|
||||||
|
|
||||||
# 1. Import data
|
# 1. Import data
|
||||||
data = pd.read_csv('input.csv',
|
data = pd.read_csv('input.csv')
|
||||||
converters={
|
data_end = pd.read_csv('input.csv')
|
||||||
'sourceIPAddress': lambda x1: ip_to_bin(x1),
|
data = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(data)
|
||||||
'destinationIPAddress': lambda x2: ip_to_bin(x2)
|
|
||||||
})
|
|
||||||
# Save flowStartMilliseconds column for later insertion
|
|
||||||
flowStartMillisecondsColumn = data['flowStartMilliseconds']
|
|
||||||
# Remove flowStartMilliseconds column
|
|
||||||
data.drop(['flowStartMilliseconds'], 1, inplace=True)
|
|
||||||
x = data.to_numpy()
|
x = data.to_numpy()
|
||||||
|
|
||||||
# 2. Load a trained model and predict
|
# 2. Load a trained model and predict
|
||||||
model = pickle.load(open('network_traffic_classifier.sav', 'rb'))
|
model = pickle.load(open('network_traffic_classifier.sav', 'rb'))
|
||||||
y_pred = model.predict(x)
|
y_pred = model.predict(x)
|
||||||
|
|
||||||
data['sublabel'] = y_pred
|
# 3. Add predictions and save output file
|
||||||
# Insert flowStartMilliseconds column again
|
data_end['sublabel'] = y_pred
|
||||||
data.insert(loc=0, column='flowStartMilliseconds', value=flowStartMillisecondsColumn)
|
data_end.to_csv('output.csv', index=False)
|
||||||
# Convert numerical IPs back to proper IP addresses
|
|
||||||
data['sourceIPAddress'] = data['sourceIPAddress'].apply(bin_to_ip)
|
|
||||||
data['destinationIPAddress'] = data['destinationIPAddress'].apply(bin_to_ip)
|
|
||||||
print(data)
|
|
||||||
|
|
||||||
# 3. Save output file
|
|
||||||
data.to_csv('output.csv', index=False)
|
|
||||||
|
|||||||
@ -5,41 +5,60 @@ from imblearn.over_sampling import SMOTE
|
|||||||
from sklearn.ensemble import RandomForestClassifier
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
from sklearn.metrics import classification_report, confusion_matrix
|
from sklearn.metrics import classification_report, confusion_matrix
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
|
||||||
def ip_to_bin(x):
|
# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
|
||||||
parts = x.split('.')
|
class MultiColumnLabelEncoder:
|
||||||
return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
|
def __init__(self, columns=None):
|
||||||
|
self.columns = columns # array of column names to encode
|
||||||
|
|
||||||
|
def fit(self, X, y=None):
|
||||||
|
return self # not relevant here
|
||||||
|
|
||||||
|
def transform(self, X):
|
||||||
|
'''
|
||||||
|
Transforms columns of X specified in self.columns using
|
||||||
|
LabelEncoder(). If no columns specified, transforms all
|
||||||
|
columns in X.
|
||||||
|
'''
|
||||||
|
output = X.copy()
|
||||||
|
if self.columns is not None:
|
||||||
|
for col in self.columns:
|
||||||
|
output[col] = LabelEncoder().fit_transform(output[col])
|
||||||
|
else:
|
||||||
|
for colname, col in output.iteritems():
|
||||||
|
output[colname] = LabelEncoder().fit_transform(col)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def fit_transform(self, X, y=None):
|
||||||
|
return self.fit(X, y).transform(X)
|
||||||
|
|
||||||
|
|
||||||
df = pd.read_csv('training4tuplabeled.csv',
|
df = pd.read_csv('training4tuplabeled.csv')
|
||||||
converters={
|
y = df['sublabel']
|
||||||
'sourceIPAddress': lambda x: ip_to_bin(x),
|
df = df.drop(columns=['sublabel'])
|
||||||
'destinationIPAddress': lambda x: ip_to_bin(x)
|
df = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(df)
|
||||||
})
|
|
||||||
df.drop(['flowStartMilliseconds'], 1, inplace=True)
|
|
||||||
X = np.array(df.drop(columns=['sublabel']))
|
|
||||||
y = np.array(df['sublabel'])
|
|
||||||
|
|
||||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
|
oversampler = SMOTE()
|
||||||
|
X, y = oversampler.fit_resample(df, y)
|
||||||
|
|
||||||
oversampler = SMOTE(random_state=777, k_neighbors=5)
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
|
||||||
X_train, y_train = oversampler.fit_resample(X_train, y_train)
|
|
||||||
|
|
||||||
clf = RandomForestClassifier(n_estimators=20, n_jobs=-1, criterion='gini', random_state=0)
|
model = RandomForestClassifier(max_depth=None, n_estimators=30, n_jobs=-1)
|
||||||
clf.fit(X_train, y_train)
|
model.fit(X_train, y_train)
|
||||||
|
|
||||||
accuracy = clf.score(X_test, y_test)
|
accuracy = model.score(X_test, y_test)
|
||||||
print('Accuracy: ', accuracy)
|
print('Accuracy: ', accuracy)
|
||||||
y_pred_train = clf.predict(X_train)
|
y_pred_train = model.predict(X_train)
|
||||||
y_pred_test = clf.predict(X_test)
|
y_pred_test = model.predict(X_test)
|
||||||
print("\n *************** TRAINING ****************")
|
print("\n *************** TRAINING ****************")
|
||||||
cm_train = confusion_matrix(y_train, y_pred_train)
|
# cm_train = confusion_matrix(y_train, y_pred_train)
|
||||||
print(cm_train)
|
# print(cm_train)
|
||||||
print(classification_report(y_train, y_pred_train))
|
print(classification_report(y_train, y_pred_train))
|
||||||
print("\n ************** VALIDATION ***************")
|
print("\n ************** VALIDATION ***************")
|
||||||
cm_test = confusion_matrix(y_test, y_pred_test)
|
# cm_test = confusion_matrix(y_test, y_pred_test)
|
||||||
print(cm_test)
|
# print(cm_test)
|
||||||
print(classification_report(y_test, y_pred_test))
|
print(classification_report(y_test, y_pred_test))
|
||||||
|
|
||||||
pickle.dump(clf, open('network_traffic_classifier.sav', 'wb'))
|
pickle.dump(model, open('network_traffic_classifier.sav', 'wb'))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user