Split exercise3 into multiple files
This commit is contained in:
parent
c4dc23a9f8
commit
aac0e096f7
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
data
|
||||
venv
|
||||
.idea
|
||||
*.ipynb
|
||||
.ipynb_checkpoints
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from datetime import datetime
from statistics import median

'''
IMPORTANT: Remove leading space from column '# Bytes' in csv file or pandas
won't read it correctly.
'''

# Load the traffic dataset, treating the first column (unix timestamps) as
# the index and replacing missing cells with zeroes.
dataset = pd.read_csv('./global_last10years.csv', index_col=0).fillna(0)

# Convert unix timestamps to matplotlib date numbers and prepare a formatter.
# NOTE(review): mdate.epoch2num was deprecated and later removed in newer
# matplotlib releases — confirm the pinned matplotlib version still has it.
timestamps = mdate.epoch2num(dataset.index)
date_formatter = mdate.DateFormatter('%y-%m-%d %H:%M:%S')

# Column layout (positional): 0 = bytes/hour, 1 = packets/hour,
# 2 = unique source IPs/hour, 3 = unique destination IPs/hour (daily avgs).
ts_bytes, ts_packets, ts_uIPs, ts_uIPd = (dataset.iloc[:, i] for i in range(4))

### rep-14: Signals correlation ###

print('Correlations between different columns (NaNs replace with 0):')
print(dataset.corr(method='pearson').round(decimals=2))

### rep-15: Destinations/Sources ratio ###

ratio_uIPd_uIPs = median(ts_uIPd) / median(ts_uIPs)
print('Ratio between median IP destinations and median IP sources: ', ratio_uIPd_uIPs)

### rep-16: Peak in unique sources ###

# idxmax() yields the index label (a unix timestamp) of the busiest hour.
date = datetime.fromtimestamp(ts_uIPs.idxmax())
print('Peak in unique sources: ', date)
|
||||
File diff suppressed because it is too large
Load Diff
12
ex3/rep-14.py
Normal file
12
ex3/rep-14.py
Normal file
@ -0,0 +1,12 @@
|
||||
import pandas as pd

'''
IMPORTANT: Remove leading space from column '# Bytes' in csv file or pandas
won't read it correctly.
'''

# rep-14: pairwise Pearson correlation between all traffic columns.

# Read dataset and fill missing rows with zeroes
dataset = pd.read_csv('data/global_last10years.csv', index_col=0).fillna(0)

# Fixed typo in the report message ("replace" -> "replaced").
print('Correlations between different columns (NaNs replaced with 0):')
print(dataset.corr(method='pearson').round(decimals=2))
|
||||
14
ex3/rep-15.py
Normal file
14
ex3/rep-15.py
Normal file
@ -0,0 +1,14 @@
|
||||
import pandas as pd
from statistics import median

# rep-15: ratio between the median hourly count of unique destination IPs
# and the median hourly count of unique source IPs.

# Read dataset and fill missing rows with zeroes
dataset = pd.read_csv('data/global_last10years.csv', index_col=0).fillna(0)

# Positional columns: 2 = unique source IPs/hour, 3 = unique destination
# IPs/hour (daily averages).
ts_uIPs, ts_uIPd = dataset.iloc[:, 2], dataset.iloc[:, 3]

ratio_uIPd_uIPs = median(ts_uIPd) / median(ts_uIPs)
print('Ratio between median IP destinations and median IP sources: ', round(ratio_uIPd_uIPs, 2))
|
||||
12
ex3/rep-16.py
Normal file
12
ex3/rep-16.py
Normal file
@ -0,0 +1,12 @@
|
||||
import pandas as pd
from datetime import datetime

# rep-16: find the hour with the highest number of unique source IPs.
# Removed `import matplotlib.dates as mdate`: the whole 12-line file is
# visible in this change and nothing in it uses mdate.

# Read dataset and fill missing rows with zeroes
dataset = pd.read_csv('data/global_last10years.csv', index_col=0).fillna(0)

# List of unique source IPs per hour (daily avg)
ts_uIPs = dataset.iloc[:, 2]

# idxmax() returns the index label (a unix timestamp) of the peak hour.
# NOTE(review): fromtimestamp converts using the local timezone — confirm
# the report is not expected in UTC.
date = datetime.fromtimestamp(ts_uIPs.idxmax())
print('Peak in unique sources: ', date)
|
||||
Loading…
x
Reference in New Issue
Block a user