Split exercise3 into multiple files

This commit is contained in:
Tobias Eidelpes 2021-05-14 19:01:29 +02:00
parent c4dc23a9f8
commit aac0e096f7
6 changed files with 40 additions and 3729 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
data data
venv venv
.idea .idea
*.ipynb
.ipynb_checkpoints

View File

@ -1,43 +0,0 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from datetime import datetime
from statistics import median
# NOTE: the original instructions asked for the leading space in front of the
# '# Bytes' column to be removed from the CSV by hand. ``skipinitialspace``
# makes pandas ignore whitespace after a delimiter instead, which should make
# that manual step unnecessary (verify against the raw data file).
DATASET_PATH = './global_last10years.csv'

# Column positions after the timestamp index, as described by the original
# script: 0 = bytes, 1 = packets, 2 = unique source IPs,
# 3 = unique destination IPs (each noted as "per hour (daily avg)").
SOURCES_COLUMN = 2
DESTINATIONS_COLUMN = 3


def load_dataset(path=DATASET_PATH):
    """Read the traffic CSV and replace missing values with zeroes.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by the first CSV column, with NaNs set to 0.
    """
    return pd.read_csv(path, index_col=0, skipinitialspace=True).fillna(0)


def correlations(dataset):
    """Return the pairwise Pearson correlations, rounded to two decimals."""
    return dataset.corr(method='pearson').round(decimals=2)


def destination_source_ratio(dataset):
    """Return median(unique destination IPs) / median(unique source IPs)."""
    ts_uIPs = dataset.iloc[:, SOURCES_COLUMN]
    ts_uIPd = dataset.iloc[:, DESTINATIONS_COLUMN]
    return median(ts_uIPd) / median(ts_uIPs)


def peak_time(series):
    """Return the UTC datetime of the index label at which *series* peaks.

    The index is assumed to hold Unix timestamps in seconds (the original
    script passed the label straight to ``datetime.fromtimestamp``).
    """
    # Imported locally so the block does not depend on a new file-level import.
    from datetime import timezone

    # An explicit UTC zone keeps the result independent of the machine's
    # local timezone; a bare fromtimestamp() converts to local time.
    return datetime.fromtimestamp(series.idxmax(), tz=timezone.utc)


def main():
    """Print the rep-14, rep-15 and rep-16 results for the dataset."""
    dataset = load_dataset()

    ### rep-14: Signals correlation ###
    print('Correlations between different columns (NaNs replace with 0):')
    print(correlations(dataset))

    ### rep-15: Destinations/Sources ratio ###
    ratio_uIPd_uIPs = destination_source_ratio(dataset)
    print('Ratio between median IP destinations and median IP sources: ', ratio_uIPd_uIPs)

    ### rep-16: Peak in unique sources ###
    date = peak_time(dataset.iloc[:, SOURCES_COLUMN])
    print('Peak in unique sources: ', date)


if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff

12
ex3/rep-14.py Normal file
View File

@ -0,0 +1,12 @@
import pandas as pd
# NOTE: the original instructions asked for the leading space in front of the
# '# Bytes' column to be removed from the CSV by hand. ``skipinitialspace``
# makes pandas ignore whitespace after a delimiter instead, which should make
# that manual step unnecessary (verify against the raw data file).
DATASET_PATH = 'data/global_last10years.csv'


def load_dataset(path=DATASET_PATH):
    """Read the traffic CSV and replace missing values with zeroes.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by the first CSV column, with NaNs set to 0.
    """
    return pd.read_csv(path, index_col=0, skipinitialspace=True).fillna(0)


def correlations(dataset):
    """Return the pairwise Pearson correlations, rounded to two decimals."""
    return dataset.corr(method='pearson').round(decimals=2)


def main():
    """Print the correlation matrix of the dataset columns."""
    print('Correlations between different columns (NaNs replace with 0):')
    print(correlations(load_dataset()))


if __name__ == '__main__':
    main()

14
ex3/rep-15.py Normal file
View File

@ -0,0 +1,14 @@
import pandas as pd
from statistics import median
DATASET_PATH = 'data/global_last10years.csv'

# Column positions after the index column, as described by the original
# script: unique source IPs and unique destination IPs
# (each noted as "per hour (daily avg)").
SOURCES_COLUMN = 2
DESTINATIONS_COLUMN = 3


def load_dataset(path=DATASET_PATH):
    """Read the traffic CSV and replace missing values with zeroes.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by the first CSV column, with NaNs set to 0.
    """
    # skipinitialspace tolerates a space after the delimiter, so stray
    # leading blanks in headers or cells do not corrupt parsing.
    return pd.read_csv(path, index_col=0, skipinitialspace=True).fillna(0)


def destination_source_ratio(dataset):
    """Return median(unique destination IPs) / median(unique source IPs)."""
    ts_uIPs = dataset.iloc[:, SOURCES_COLUMN]
    ts_uIPd = dataset.iloc[:, DESTINATIONS_COLUMN]
    return median(ts_uIPd) / median(ts_uIPs)


def main():
    """Print the destinations/sources median ratio, rounded to 2 decimals."""
    ratio_uIPd_uIPs = destination_source_ratio(load_dataset())
    print('Ratio between median IP destinations and median IP sources: ', round(ratio_uIPd_uIPs, 2))


if __name__ == '__main__':
    main()

12
ex3/rep-16.py Normal file
View File

@ -0,0 +1,12 @@
import pandas as pd
import matplotlib.dates as mdate
from datetime import datetime
DATASET_PATH = 'data/global_last10years.csv'

# Position (after the index column) of the unique-source-IP column, noted
# in the original script as "per hour (daily avg)".
SOURCES_COLUMN = 2


def load_dataset(path=DATASET_PATH):
    """Read the traffic CSV and replace missing values with zeroes.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by the first CSV column, with NaNs set to 0.
    """
    # skipinitialspace tolerates a space after the delimiter, so stray
    # leading blanks in headers or cells do not corrupt parsing.
    return pd.read_csv(path, index_col=0, skipinitialspace=True).fillna(0)


def peak_time(series):
    """Return the UTC datetime of the index label at which *series* peaks.

    The index is assumed to hold Unix timestamps in seconds (the original
    script passed the label straight to ``datetime.fromtimestamp``).
    """
    # Imported locally so the block does not depend on a new file-level import.
    from datetime import timezone

    # An explicit UTC zone keeps the result independent of the machine's
    # local timezone; a bare fromtimestamp() converts to local time.
    return datetime.fromtimestamp(series.idxmax(), tz=timezone.utc)


def main():
    """Print when the number of unique source IPs peaked."""
    ts_uIPs = load_dataset().iloc[:, SOURCES_COLUMN]
    date = peak_time(ts_uIPs)
    print('Peak in unique sources: ', date)


if __name__ == '__main__':
    main()