From d88a6766c956c883ceeff40c251623d83a542c89 Mon Sep 17 00:00:00 2001 From: Tobias Eidelpes Date: Sun, 26 Feb 2023 16:45:25 +0100 Subject: [PATCH] Add object detection evaluation --- thesis/references.bib | 32 ++++++++++++++ thesis/thesis.tex | 100 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/thesis/references.bib b/thesis/references.bib index 9a0e33c..cca9a43 100644 --- a/thesis/references.bib +++ b/thesis/references.bib @@ -141,6 +141,24 @@ keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning} } +@article{kuznetsova2020, + title = {The {{Open Images Dataset V4}}: {{Unified}} Image Classification, Object Detection, and Visual Relationship Detection at Scale}, + shorttitle = {The {{Open Images Dataset V4}}}, + author = {Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and Duerig, Tom and Ferrari, Vittorio}, + date = {2020-07}, + journaltitle = {International Journal of Computer Vision}, + shortjournal = {Int J Comput Vis}, + volume = {128}, + number = {7}, + eprint = {1811.00982}, + eprinttype = {arxiv}, + pages = {1956--1981}, + issn = {0920-5691, 1573-1405}, + doi = {10.1007/s11263-020-01316-z}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Computer Vision and Pattern Recognition} +} + @article{lopez-garcia2022, title = {Machine {{Learning-Based Processing}} of {{Multispectral}} and {{RGB UAV Imagery}} for the {{Multitemporal Monitoring}} of {{Vineyard Water Status}}}, author = {López-García, Patricia and Intrigliolo, Diego and Moreno, Miguel A. 
and Martínez-Moreno, Alejandro and Ortega, José Fernando and Pérez-Álvarez, Eva Pilar and Ballesteros, Rocío}, @@ -274,6 +292,20 @@ keywords = {Image processing,Multimodal deep learning,Plant water stress,Time-series modeling} } +@misc{zheng2019, + title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}}, + shorttitle = {Distance-{{IoU Loss}}}, + author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei}, + date = {2019-11-19}, + number = {arXiv:1911.08287}, + eprint = {1911.08287}, + eprinttype = {arxiv}, + publisher = {{arXiv}}, + doi = {10.48550/arXiv.1911.08287}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Computer Vision and Pattern Recognition} +} + @article{zhong2022, title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}}, author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei}, diff --git a/thesis/thesis.tex b/thesis/thesis.tex index 398a977..713a459 100644 --- a/thesis/thesis.tex +++ b/thesis/thesis.tex @@ -65,6 +65,10 @@ % For bachelor and master: \setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum. +\newacronym{xai}{XAI}{Explainable Artificial Intelligence} +\newacronym{lime}{LIME}{Local Interpretable Model-agnostic Explanations} +\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping} + \begin{document} \frontmatter % Switches to roman numbering. @@ -105,7 +109,103 @@ \chapter{Evaluation} +The following sections contain a detailed evaluation of the model in +various scenarios. First, we present metrics from the training phases +of the constituent models. Second, we employ methods from the field of +\gls{xai} such as \gls{lime} and \gls{grad-cam} to get a better +understanding of the models' abstractions. 
Finally, we turn to the +models' aggregate performance on the test set and discuss whether the +initial goals set by the problem description have been met or not. +\section{Object Detection} +\label{sec:eval-yolo} + +The object detection model was trained for 300 epochs and the weights +from the best-performing epoch were saved. The model's fitness for +each epoch is calculated as the weighted average of \textsf{mAP}@0.5 +and \textsf{mAP}@0.5:0.95: + +\begin{equation} + \label{eq:fitness} + f_{epoch} = 0.1 \cdot \mathsf{mAP}@0.5 + 0.9 \cdot \mathsf{mAP}@0.5\mathrm{:}0.95 +\end{equation} + +Figure~\ref{fig:fitness} shows the model's fitness over the training +period of 300 epochs. The gray vertical line indicates the maximum +fitness of 0.61 at epoch 133. The weights of that epoch were frozen to +be the final model parameters. Since the fitness metric assigns the +\textsf{mAP} at the higher range the overwhelming weight, the +\textsf{mAP}@0.5 starts to decrease after epoch 30, but the +\textsf{mAP}@0.5:0.95 picks up the slack until the maximum fitness at +epoch 133. This is an indication that the model achieves good +performance early on and continues to gain higher confidence values +until performance deteriorates due to overfitting. + +\begin{figure} + \centering + \includegraphics{graphics/model_fitness.pdf} + \caption[Model fitness per epoch.]{Model fitness for each epoch + calculated as in equation~\ref{eq:fitness}.} + \label{fig:fitness} +\end{figure} + +Overall precision and recall per epoch are shown in +figure~\ref{fig:prec-rec}. The values indicate that neither precision +nor recall change materially during training. In fact, precision +starts to decrease from the beginning, while recall experiences a +barely noticeable increase. Taken together with the box and object +loss from figure~\ref{fig:box-obj-loss}, we speculate that the +pre-trained model already generalizes well to plant detection. 
Any +further training solely impacts the confidence of detection, but does +not lead to higher detection rates. This conclusion is supported by +the increasing \textsf{mAP}@0.5:0.95. + +\begin{figure} + \centering + \includegraphics{graphics/precision_recall.pdf} + \caption{Overall precision and recall during training for each epoch.} + \label{fig:prec-rec} +\end{figure} + +Further culprits for the flat precision and recall values may be found +in bad ground truth data. The labels from the Open Images +Dataset~\cite{kuznetsova2020} are sometimes not fine-grained +enough. Images which contain multiple individual—often +overlapping—plants are labeled with one large bounding box instead of +multiple smaller ones. The model recognizes the individual plants and +returns tighter bounding boxes even if that is not what is specified +in the ground truth. Therefore, it is prudent to limit the training +phase to relatively few epochs in order to not penalize the more +accurate detections of the model. The smaller bounding boxes make more +sense considering the fact that the cutout is passed to the classifier +in a later stage. Smaller bounding boxes help the classifier to only +focus on one plant at a time and to not get distracted by multiple +plants in potentially different stages of wilting. + +The box loss +decreases slightly during training which indicates that the bounding +boxes become tighter around objects of interest. With increasing +training time, however, the object loss increases, indicating that +fewer and fewer plants are present in the predicted bounding boxes. It +is likely that overfitting is a cause of the increasing object loss +from epoch 40 onward. Since the best weights as measured by fitness +are found at epoch 133 and the object loss accelerates from that +point, epoch 133 is probably the right cutoff before overfitting +occurs. 
+ +\begin{figure} + \centering + \includegraphics{graphics/val_box_obj_loss.pdf} + \caption[Box and object loss.]{Box and object + loss{\protect\footnotemark} measured against the validation set.} + \label{fig:box-obj-loss} +\end{figure} + +\footnotetext{The class loss is omitted because there is only one + class in the dataset and the loss is therefore always 0.} + +\begin{center} +\end{center} \backmatter