diff --git a/competition/classifier.py b/competition/classifier.py
index ef5aa56..aaae160 100644
--- a/competition/classifier.py
+++ b/competition/classifier.py
@@ -1,45 +1,58 @@
 import pandas as pd
 import pickle
+from sklearn.preprocessing import LabelEncoder
 
 
-# Preprocessing data - encode ip addresses to numerical values
-def ip_to_bin(ip):
-    parts = ip.split('.')
-    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
+# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
+class MultiColumnLabelEncoder:
+    """Label-encode several DataFrame columns, each with its own LabelEncoder."""
+
+    def __init__(self, columns=None):
+        self.columns = columns  # array of column names to encode
 
+    def fit(self, X, y=None):
+        return self  # nothing is learned here; transform() refits every time
 
-# Postprocessing data - decode numerical values to ip addresses
-def bin_to_ip(ipnum):
-    o1 = int(ipnum / 16777216) % 256
-    o2 = int(ipnum / 65536) % 256
-    o3 = int(ipnum / 256) % 256
-    o4 = int(ipnum) % 256
-    return '%(o1)s.%(o2)s.%(o3)s.%(o4)s' % locals()
+    def transform(self, X):
+        """
+        Return a copy of X with the columns named in self.columns replaced
+        by LabelEncoder codes. If self.columns is None, every column of X
+        is encoded.
+
+        A new LabelEncoder is fitted on each call, so the codes depend
+        only on the values present in X.
+        """
+        output = X.copy()
+        if self.columns is not None:
+            for col in self.columns:
+                output[col] = LabelEncoder().fit_transform(output[col])
+        else:
+            # DataFrame.iteritems() was removed in pandas 2.0; items() is
+            # the equivalent (column name, Series) iterator.
+            for colname, col in output.items():
+                output[colname] = LabelEncoder().fit_transform(col)
+        return output
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X, y).transform(X)
 
 
 # 1. Import data
-data = pd.read_csv('input.csv',
-                   converters={
-                       'sourceIPAddress': lambda x1: ip_to_bin(x1),
-                       'destinationIPAddress': lambda x2: ip_to_bin(x2)
-                   })
-# Save flowStartMilliseconds column for later insertion
-flowStartMillisecondsColumn = data['flowStartMilliseconds']
-# Remove flowStartMilliseconds column
-data.drop(['flowStartMilliseconds'], 1, inplace=True)
+# Read the file once; data_end keeps the raw values for the output file.
+data_end = pd.read_csv('input.csv')
+# NOTE(review): the encoder is refit on this file, so the integer given to
+# an IP address here need not match the integer it had when the model was
+# trained; persisting the fitted encoders with the model would fix that.
+data = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(data_end)
 
 x = data.to_numpy()
 
 # 2. Loading a trained model and predict
-model = pickle.load(open('network_traffic_classifier.sav', 'rb'))
+# NOTE: unpickling runs arbitrary code; only load a model file you trust.
+with open('network_traffic_classifier.sav', 'rb') as model_file:
+    model = pickle.load(model_file)
 y_pred = model.predict(x)
-data['sublabel'] = y_pred
-# Insert flowStartMilliseconds column again
-data.insert(loc=0, column='flowStartMilliseconds', value=flowStartMillisecondsColumn)
-# Convert numerical IPs back to proper IP addresses
-data['sourceIPAddress'] = data['sourceIPAddress'].apply(bin_to_ip)
-data['destinationIPAddress'] = data['destinationIPAddress'].apply(bin_to_ip)
-print(data)
 
-# 3. Save output file
-data.to_csv('output.csv', index=False)
+# 3. Add predictions and save output file
+data_end['sublabel'] = y_pred
+data_end.to_csv('output.csv', index=False)
diff --git a/competition/random_forest.py b/competition/random_forest.py
index ed51684..ba22fa8 100644
--- a/competition/random_forest.py
+++ b/competition/random_forest.py
@@ -5,41 +5,71 @@ from imblearn.over_sampling import SMOTE
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import classification_report, confusion_matrix
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
 
-def ip_to_bin(x):
-    parts = x.split('.')
-    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
+# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
+class MultiColumnLabelEncoder:
+    """Label-encode several DataFrame columns, each with its own LabelEncoder."""
+
+    def __init__(self, columns=None):
+        self.columns = columns  # array of column names to encode
+
+    def fit(self, X, y=None):
+        return self  # nothing is learned here; transform() refits every time
+
+    def transform(self, X):
+        """
+        Return a copy of X with the columns named in self.columns replaced
+        by LabelEncoder codes. If self.columns is None, every column of X
+        is encoded.
+
+        A new LabelEncoder is fitted on each call, so the codes depend
+        only on the values present in X.
+        """
+        output = X.copy()
+        if self.columns is not None:
+            for col in self.columns:
+                output[col] = LabelEncoder().fit_transform(output[col])
+        else:
+            # DataFrame.iteritems() was removed in pandas 2.0; items() is
+            # the equivalent (column name, Series) iterator.
+            for colname, col in output.items():
+                output[colname] = LabelEncoder().fit_transform(col)
+        return output
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X, y).transform(X)
 
-df = pd.read_csv('training4tuplabeled.csv',
-                 converters={
-                     'sourceIPAddress': lambda x: ip_to_bin(x),
-                     'destinationIPAddress': lambda x: ip_to_bin(x)
-                 })
-df.drop(['flowStartMilliseconds'], 1, inplace=True)
-X = np.array(df.drop(columns=['sublabel']))
-y = np.array(df['sublabel'])
+df = pd.read_csv('training4tuplabeled.csv')
+y = df['sublabel']
+df = df.drop(columns=['sublabel'])
+df = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(df)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
+# Split before oversampling: SMOTE interpolates between neighbouring rows,
+# so resampling the full set first would leak validation rows into the
+# synthetic training samples and inflate the validation scores.
+X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, stratify=y, random_state=0)
 
-oversampler = SMOTE(random_state=777, k_neighbors=5)
+oversampler = SMOTE()
 X_train, y_train = oversampler.fit_resample(X_train, y_train)
 
-clf = RandomForestClassifier(n_estimators=20, n_jobs=-1, criterion='gini', random_state=0)
-clf.fit(X_train, y_train)
+model = RandomForestClassifier(max_depth=None, n_estimators=30, n_jobs=-1)
+model.fit(X_train, y_train)
 
-accuracy = clf.score(X_test, y_test)
+accuracy = model.score(X_test, y_test)
 print('Accuracy: ', accuracy)
 
-y_pred_train = clf.predict(X_train)
-y_pred_test = clf.predict(X_test)
+y_pred_train = model.predict(X_train)
+y_pred_test = model.predict(X_test)
 print("\n *************** TRAINING ****************")
-cm_train = confusion_matrix(y_train, y_pred_train)
-print(cm_train)
+# cm_train = confusion_matrix(y_train, y_pred_train)
+# print(cm_train)
 print(classification_report(y_train, y_pred_train))
 
 print("\n ************** VALIDATION ***************")
-cm_test = confusion_matrix(y_test, y_pred_test)
-print(cm_test)
+# cm_test = confusion_matrix(y_test, y_pred_test)
+# print(cm_test)
 print(classification_report(y_test, y_pred_test))
 
-pickle.dump(clf, open('network_traffic_classifier.sav', 'wb'))
+with open('network_traffic_classifier.sav', 'wb') as model_file:
+    pickle.dump(model, model_file)