46 lines
1.4 KiB
Python
46 lines
1.4 KiB
Python
import pandas as pd
|
|
import pickle
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
|
|
# Source: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
|
|
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Every selected column is encoded independently with its own, freshly
    created ``LabelEncoder``. No fitted encoder is stored on the instance,
    so each ``transform`` call derives its integer codes solely from the
    data it is given.
    """

    def __init__(self, columns=None):
        """Store the column selection.

        Args:
            columns: Iterable of column names to encode, or ``None`` to
                encode every column of the frame passed to ``transform``.
        """
        self.columns = columns

    def fit(self, X, y=None):
        """Return ``self`` without learning anything.

        The method only exists so the class offers the usual
        fit/transform interface; the actual fitting happens inside
        ``transform``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Columns listed in ``self.columns`` are replaced by the output of
        ``LabelEncoder().fit_transform``. If ``self.columns`` is ``None``,
        every column of ``X`` is encoded. ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is
            # the equivalent (label, Series) iterator.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on ``X``; ``y`` is unused by both."""
        return self.fit(X, y).transform(X)
|
|
|
|
|
|
# 1. Import data
# The CSV is read once: data_end keeps the raw rows for the output file,
# while the encoder works on (and returns) a copy, leaving data_end intact.
data_end = pd.read_csv('input.csv')
# NOTE(review): the label codes are fitted on this input file alone, so they
# only line up with the codes used at training time if both data sets
# contain the same set of addresses - confirm against the training script.
data = MultiColumnLabelEncoder(columns=['sourceIPAddress', 'destinationIPAddress']).fit_transform(data_end)
x = data.to_numpy()

# 2. Loading a trained model and predict
# NOTE(security): unpickling can execute arbitrary code; only load model
# files that come from a trusted source.
with open('network_traffic_classifier.sav', 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(x)

# 3. Add predictions and save output file
data_end['sublabel'] = y_pred
data_end.to_csv('output.csv', index=False)
|