From d88a6766c956c883ceeff40c251623d83a542c89 Mon Sep 17 00:00:00 2001 From: Tobias Eidelpes Date: Sun, 26 Feb 2023 16:45:25 +0100 Subject: [PATCH] Add object detection evaluation --- thesis/references.bib | 32 ++++++++++++++ thesis/thesis.tex | 100 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/thesis/references.bib b/thesis/references.bib index 9a0e33c..cca9a43 100644 --- a/thesis/references.bib +++ b/thesis/references.bib @@ -141,6 +141,24 @@ keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning} } +@article{kuznetsova2020, + title = {The {{Open Images Dataset V4}}: {{Unified}} Image Classification, Object Detection, and Visual Relationship Detection at Scale}, + shorttitle = {The {{Open Images Dataset V4}}}, + author = {Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and Duerig, Tom and Ferrari, Vittorio}, + date = {2020-07}, + journaltitle = {International Journal of Computer Vision}, + shortjournal = {Int J Comput Vis}, + volume = {128}, + number = {7}, + eprint = {1811.00982}, + eprinttype = {arxiv}, + pages = {1956--1981}, + issn = {0920-5691, 1573-1405}, + doi = {10.1007/s11263-020-01316-z}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Computer Vision and Pattern Recognition} +} + @article{lopez-garcia2022, title = {Machine {{Learning-Based Processing}} of {{Multispectral}} and {{RGB UAV Imagery}} for the {{Multitemporal Monitoring}} of {{Vineyard Water Status}}}, author = {López-García, Patricia and Intrigliolo, Diego and Moreno, Miguel A. 
and Martínez-Moreno, Alejandro and Ortega, José Fernando and Pérez-Álvarez, Eva Pilar and Ballesteros, Rocío}, @@ -274,6 +292,20 @@ keywords = {Image processing,Multimodal deep learning,Plant water stress,Time-series modeling} } +@misc{zheng2019, + title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}}, + shorttitle = {Distance-{{IoU Loss}}}, + author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei}, + date = {2019-11-19}, + number = {arXiv:1911.08287}, + eprint = {1911.08287}, + eprinttype = {arxiv}, + publisher = {{arXiv}}, + doi = {10.48550/arXiv.1911.08287}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Computer Vision and Pattern Recognition} +} + @article{zhong2022, title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}}, author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei}, diff --git a/thesis/thesis.tex b/thesis/thesis.tex index 398a977..713a459 100644 --- a/thesis/thesis.tex +++ b/thesis/thesis.tex @@ -65,6 +65,10 @@ % For bachelor and master: \setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum. +\newacronym{xai}{XAI}{Explainable Artificial Intelligence} +\newacronym{lime}{LIME}{Local Interpretable Model-agnostic Explanations} +\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping} + \begin{document} \frontmatter % Switches to roman numbering. @@ -105,7 +109,103 @@ \chapter{Evaluation} +The following sections contain a detailed evaluation of the model in +various scenarios. First, we present metrics from the training phases +of the constituent models. Second, we employ methods from the field of +\gls{xai} such as \gls{lime} and \gls{grad-cam} to get a better +understanding of the models' abstractions. 
Finally, we turn to the +models' aggregate performance on the test set and discuss whether the +initial goals set by the problem description have been met or not. +\section{Object Detection} +\label{sec:eval-yolo} + +The object detection model was trained for 300 epochs and the weights +from the best-performing epoch were saved. The model's fitness for +each epoch is calculated as the weighted average of \textsf{mAP}@0.5 +and \textsf{mAP}@0.5:0.95: + +\begin{equation} + \label{eq:fitness} + f_{epoch} = 0.1 \cdot \mathsf{mAP}@0.5 + 0.9 \cdot \mathsf{mAP}@0.5\mathrm{:}0.95 +\end{equation} + +Figure~\ref{fig:fitness} shows the model's fitness over the training +period of 300 epochs. The gray vertical line indicates the maximum +fitness of 0.61 at epoch 133. The weights of that epoch were frozen to +be the final model parameters. Since the fitness metric assigns the +\textsf{mAP} at the higher range the overwhelming weight, the +\textsf{mAP}@0.5 starts to decrease after epoch 30, but the +\textsf{mAP}@0.5:0.95 picks up the slack until the maximum fitness at +epoch 133. This is an indication that the model achieves good +performance early on and continues to gain higher confidence values +until performance deteriorates due to overfitting. + +\begin{figure} + \centering + \includegraphics{graphics/model_fitness.pdf} + \caption[Model fitness per epoch.]{Model fitness for each epoch + calculated as in equation~\ref{eq:fitness}.} + \label{fig:fitness} +\end{figure} + +Overall precision and recall per epoch are shown in +figure~\ref{fig:prec-rec}. The values indicate that neither precision +nor recall change materially during training. In fact, precision +starts to decrease from the beginning, while recall experiences a +barely noticeable increase. Taken together with the box and object +loss from figure~\ref{fig:box-obj-loss}, we speculate that the +pre-trained model already generalizes well to plant detection. 
Any +further training solely impacts the confidence of detection, but does +not lead to higher detection rates. This conclusion is supported by +the increasing \textsf{mAP}@0.5:0.95. + +\begin{figure} + \centering + \includegraphics{graphics/precision_recall.pdf} + \caption{Overall precision and recall during training for each epoch.} + \label{fig:prec-rec} +\end{figure} + +Further culprits for the flat precision and recall values may be found +in bad ground truth data. The labels from the Open Images +Dataset~\cite{kuznetsova2020} are sometimes not fine-grained +enough. Images which contain multiple individual—often +overlapping—plants are labeled with one large bounding box instead of +multiple smaller ones. The model recognizes the individual plants and +returns tighter bounding boxes even if that is not what is specified +in the ground truth. Therefore, it is prudent to limit the training +phase to relatively few epochs in order to not penalize the more +accurate detections of the model. The smaller bounding boxes make more +sense considering the fact that the cutout is passed to the classifier +in a later stage. Smaller bounding boxes help the classifier to only +focus on one plant at a time and to not get distracted by multiple +plants in potentially different stages of wilting. + +The box loss +decreases slightly during training which indicates that the bounding +boxes become tighter around objects of interest. With increasing +training time, however, the object loss increases, indicating that +fewer and fewer plants are present in the predicted bounding boxes. It +is likely that overfitting is a cause of the increasing object loss +from epoch 40 onward. Since the best weights as measured by fitness +are found at epoch 133 and the object loss accelerates from that +point, epoch 133 is probably the right cutoff before overfitting +occurs. 
+ +\begin{figure} + \centering + \includegraphics{graphics/val_box_obj_loss.pdf} + \caption[Box and object loss.]{Box and object + loss{\protect\footnotemark} measured against the validation set.} + \label{fig:box-obj-loss} +\end{figure} + +\footnotetext{The class loss is omitted because there is only one + class in the dataset and the loss is therefore always 0.} + +\begin{center} +\end{center} \backmatter