"""Classify network flows from a CSV file with a previously trained model.

Pipeline: read ``input.csv`` (encoding the IP address columns as integers),
run the pickled classifier on every column except ``flowStartMilliseconds``,
attach the predictions as a ``sublabel`` column, restore dotted-quad IP
addresses, and write the result to ``output.csv``.
"""

import pickle

import pandas as pd

INPUT_PATH = 'input.csv'
MODEL_PATH = 'network_traffic_classifier.sav'
OUTPUT_PATH = 'output.csv'

IP_COLUMNS = ('sourceIPAddress', 'destinationIPAddress')
TIMESTAMP_COLUMN = 'flowStartMilliseconds'


def ip_to_bin(ip):
    """Encode a dotted-quad IPv4 string as an integer.

    Preprocessing step: the four octets are packed most-significant first,
    e.g. ``'192.168.1.1'`` -> ``3232235777``.

    Raises:
        IndexError: if *ip* has fewer than four dot-separated parts.
        ValueError: if one of the first four parts is not an integer.
    """
    parts = ip.split('.')
    return (
        (int(parts[0]) << 24)
        + (int(parts[1]) << 16)
        + (int(parts[2]) << 8)
        + int(parts[3])
    )


def bin_to_ip(ipnum):
    """Decode an integer produced by :func:`ip_to_bin` back to dotted-quad form.

    Postprocessing step: the value is converted to ``int`` first so that
    numpy integers (as stored in a DataFrame column) are handled, then the
    octets are extracted with exact integer shifts instead of float division.
    """
    value = int(ipnum)
    octets = [(value >> shift) & 0xFF for shift in (24, 16, 8, 0)]
    return f'{octets[0]}.{octets[1]}.{octets[2]}.{octets[3]}'


def main(input_path=INPUT_PATH, model_path=MODEL_PATH, output_path=OUTPUT_PATH):
    """Run the full load -> predict -> save pipeline.

    Args:
        input_path: CSV file with the flows to classify.
        model_path: Pickled model object exposing ``predict``.
        output_path: Destination CSV for the labelled flows.

    Returns:
        The labelled ``DataFrame`` that was written to *output_path*.
    """
    # 1. Import data, encoding the IP address columns as integers.
    data = pd.read_csv(
        input_path,
        converters={column: ip_to_bin for column in IP_COLUMNS},
    )

    # The timestamp is not a model feature: take it out for prediction and
    # keep it so it can be re-inserted afterwards.
    flow_start = data.pop(TIMESTAMP_COLUMN)
    features = data.to_numpy()

    # 2. Load the trained model and predict.
    # SECURITY: unpickling executes arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    data['sublabel'] = model.predict(features)

    # Put the timestamp back as the first column.
    data.insert(loc=0, column=TIMESTAMP_COLUMN, value=flow_start)

    # Convert numerical IPs back to proper IP addresses.
    for column in IP_COLUMNS:
        data[column] = data[column].apply(bin_to_ip)

    print(data)

    # 3. Save output file as CSV. The row index is omitted because the input
    # file is read without an index column.
    data.to_csv(output_path, index=False)
    return data


if __name__ == '__main__':
    main()