46 lines
1.5 KiB
Python
46 lines
1.5 KiB
Python
import pandas as pd
|
|
import pickle
|
|
|
|
|
|
# Preprocessing data - encode ip addresses to numerical values
|
|
def ip_to_bin(ip):
    """Encode a dotted-quad IPv4 address string as a single integer.

    The four octets are packed big-endian, so ``'1.2.3.4'`` becomes
    ``(1 << 24) + (2 << 16) + (3 << 8) + 4``.

    Args:
        ip: IPv4 address in dotted-decimal notation, e.g. ``'192.168.1.1'``.

    Returns:
        The address as an integer in the range ``0 .. 2**32 - 1``.

    Raises:
        ValueError: If ``ip`` does not consist of exactly four decimal
            octets, each in the range 0-255.
    """
    parts = ip.split('.')
    # Reject short/long addresses explicitly instead of failing with an
    # unrelated IndexError or silently ignoring surplus components.
    if len(parts) != 4:
        raise ValueError('invalid IPv4 address: %r' % (ip,))

    value = 0
    for part in parts:
        octet = int(part)  # raises ValueError for non-numeric components
        # An out-of-range octet would spill into the neighbouring octet's
        # bits and yield a number that decodes to a different address.
        if not 0 <= octet <= 255:
            raise ValueError('invalid IPv4 address: %r' % (ip,))
        value = (value << 8) | octet
    return value
|
|
|
|
|
|
# Postprocessing data - decode numerical values to ip addresses
|
|
def bin_to_ip(ipnum):
    """Decode a numerically encoded IPv4 address into dotted-decimal form.

    Args:
        ipnum: The address as a number, most significant octet first
            (the inverse of the packing done by the encoding step).

    Returns:
        The address as a string such as ``'192.168.1.1'``.
    """
    # Divisors that bring the first three octets down to the lowest byte:
    # 256**3, 256**2 and 256**1.
    octets = [int(ipnum / divisor) % 256 for divisor in (16777216, 65536, 256)]
    # The last octet is simply the lowest byte of the number itself.
    octets.append(int(ipnum) % 256)
    return '%s.%s.%s.%s' % tuple(octets)
|
|
|
|
|
|
# 1. Import data
# Both IP address columns are converted to integers while the CSV is parsed,
# so the frame holds numerical values instead of dotted-quad strings.
data = pd.read_csv(
    'input.csv',
    converters={
        'sourceIPAddress': ip_to_bin,
        'destinationIPAddress': ip_to_bin,
    },
)
# Save flowStartMilliseconds column for later insertion
flowStartMillisecondsColumn = data['flowStartMilliseconds']
# Remove flowStartMilliseconds column so it is not part of the feature matrix.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# positional argument (deprecated in pandas 1.3, removed in pandas 2.0).
data.drop(columns=['flowStartMilliseconds'], inplace=True)
# Feature matrix handed to the model: every remaining column, in file order.
x = data.to_numpy()
|
|
|
|
# 2. Loading a trained model and predict
# NOTE(review): unpickling executes arbitrary code contained in the file;
# only load a model file that comes from a trusted source.
# The context manager closes the file handle even if unpickling fails.
with open('network_traffic_classifier.sav', 'rb') as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(x)

# Attach the predictions as a new column.
data['sublabel'] = y_pred
# Insert flowStartMilliseconds column again (as the first column)
data.insert(loc=0, column='flowStartMilliseconds', value=flowStartMillisecondsColumn)
# Convert numerical IPs back to proper IP addresses
data['sourceIPAddress'] = data['sourceIPAddress'].apply(bin_to_ip)
data['destinationIPAddress'] = data['destinationIPAddress'].apply(bin_to_ip)
print(data)
|
|
|
|
# 3. Save output file
# Write the labelled frame as real CSV. The previous code pickled the pandas
# module object itself, which raises TypeError and never stores the results.
# index=False keeps the automatically generated row index out of the file.
data.to_csv('output.csv', index=False)
|