from datetime import datetime
from statistics import median

import matplotlib.dates as mdate
import matplotlib.pyplot as plt
import pandas as pd

# Summary statistics (rep-14 .. rep-16) for the time series stored in
# ./global_last10years.csv: column correlations, the ratio between median
# unique destination IPs and median unique source IPs, and the date of the
# peak in unique source IPs.
#
# IMPORTANT: Remove the leading space from column '# Bytes' in the csv file
# or pandas won't read it correctly.
# NOTE(review): read_csv(..., skipinitialspace=True) might make this manual
# step unnecessary -- not verified against the actual file.

# The first CSV column becomes the index (used below as unix timestamps);
# missing values are replaced with 0 so every statistic sees complete columns.
dataset = pd.read_csv('./global_last10years.csv', index_col=0).fillna(0)

# Convert unix timestamps to Matplotlib date numbers.
# `mdate.epoch2num` was deprecated and later removed from Matplotlib, so the
# index is converted to datetime64 values first and handed to `date2num`,
# which yields the same "days since the Matplotlib epoch" floats.
timestamps = mdate.date2num(pd.to_datetime(dataset.index, unit='s').values)
date_formatter = mdate.DateFormatter('%y-%m-%d %H:%M:%S')

# Columns are selected by position (after the index column), so the header
# names in the CSV do not matter here.
# List of packet counts per hour (daily avg)
ts_packets = dataset.iloc[:, 1]
# List of bytes per hour (daily avg)
ts_bytes = dataset.iloc[:, 0]
# List of unique source IPs per hour (daily avg)
ts_uIPs = dataset.iloc[:, 2]
# List of unique destination IPs per hour (daily avg)
ts_uIPd = dataset.iloc[:, 3]

### rep-14: Signals correlation ###
# Pairwise Pearson correlation of all columns, rounded to two decimals.
print('Correlations between different columns (NaNs replace with 0):')
print(dataset.corr(method='pearson').round(decimals=2))

### rep-15: Destinations/Sources ratio ###
ratio_uIPd_uIPs = median(ts_uIPd) / median(ts_uIPs)
print('Ratio between median IP destinations and median IP sources: ',
      ratio_uIPd_uIPs)

### rep-16: Peak in unique sources ###
# idxmax() returns the index label (the unix timestamp) of the largest value.
# NOTE(review): fromtimestamp() yields a naive datetime in the machine's local
# time zone, so the printed date depends on where the script runs -- confirm
# whether UTC is intended.
date = datetime.fromtimestamp(ts_uIPs.idxmax())
print('Peak in unique sources: ', date)