% Copyright (C) 2014-2020 by Thomas Auzinger <thomas@auzinger.name>

\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information.

% Load packages to allow in- and output of non-ASCII characters.
\usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters.
\usepackage[T1]{fontenc} % Determines font encoding of the output. Font packages have to be included before this line.
\usepackage[utf8]{inputenc} % Determines encoding of the input. All input files have to use UTF8 encoding.

% Extended LaTeX functionality is enabled by including packages with \usepackage{...}.
\usepackage{amsmath} % Extended typesetting of mathematical expressions.
\usepackage{amssymb} % Provides a multitude of mathematical symbols.
\usepackage{mathtools} % Further extensions of mathematical typesetting.
\usepackage{microtype} % Small-scale typographic enhancements.
\usepackage[inline]{enumitem} % User control over the layout of lists (itemize, enumerate, description).
\usepackage{multirow} % Allows table elements to span several rows.
\usepackage{booktabs} % Improves the typesetting of tables.
\usepackage{subcaption} % Allows the use of subfigures and enables their referencing.
\usepackage[ruled,linesnumbered,algochapter]{algorithm2e} % Enables the writing of pseudo code.
\usepackage[usenames,dvipsnames,table]{xcolor} % Allows the definition and use of colors. This package has to be included before tikz.
\usepackage{nag} % Issues warnings when best practices in writing LaTeX documents are violated.
\usepackage{todonotes} % Provides tooltip-like todo notes.
\usepackage{hyperref} % Enables cross linking in the electronic document version. This package has to be included second to last.
\usepackage[acronym,toc]{glossaries} % Enables the generation of glossaries and lists of acronyms. This package has to be included last.

% Define convenience functions to use the author name and the thesis title in the PDF document properties.
\newcommand{\authorname}{Tobias Eidelpes} % The author name without titles.
\newcommand{\thesistitle}{Flower State Classification for Watering System} % The title of the thesis. The English version should be used, if it exists.

% Set PDF document properties
\hypersetup
{
    pdfpagelayout   = TwoPageRight,  % How the document is shown in PDF viewers (optional).
    linkbordercolor = {Melon},       % The color of the borders of boxes around crosslinks (optional).
    pdfauthor       = {\authorname}, % The author's name in the document properties (optional).
    pdftitle        = {\thesistitle}, % The document's title in the document properties (optional).
    pdfsubject      = {Subject},     % The document's subject in the document properties (optional).
    pdfkeywords     = {Object Detection, Image Classification, Machine Learning, Embedded Programming} % The document's keywords in the document properties (optional).
}

\setpnumwidth{2.5em} % Avoid overfull hboxes in the table of contents (see memoir manual).
\setsecnumdepth{subsection} % Enumerate subsections.

\nonzeroparskip % Create space between paragraphs (optional).
\setlength{\parindent}{0pt} % Remove paragraph indentation (optional).

\makeindex % Use an optional index.
\makeglossaries % Use an optional glossary.
%\glstocfalse % Remove the glossaries from the table of contents.

% Set persons with 4 arguments:
% {title before name}{name}{title after name}{gender}
% where both titles are optional (i.e. can be given as empty brackets {}).
\setauthor{}{\authorname}{BSc}{male}
\setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male}

\setregnumber{01527193}
\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}.
\settitle{\thesistitle}{Flower State Classification for Watering System} % Sets English and German version of the title (both can be English or German).

% Select the thesis type: bachelor / master / doctor / phd-school.
% Master:
\setthesis{master}
\setmasterdegree{dipl.} % dipl. / rer.nat. / rer.soc.oec. / master

% For bachelor and master:
\setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum.

\newacronym{xai}{XAI}{Explainable Artificial Intelligence}
\newacronym{lime}{LIME}{Local Interpretable Model-agnostic Explanations}
\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping}
\newacronym{oid}{OID}{Open Images Dataset}
\newacronym{ap}{AP}{Average Precision}
\newacronym{iou}{IOU}{Intersection over Union}
\newacronym{map}{mAP}{mean Average Precision}

\begin{document}

\frontmatter % Switches to roman numbering.
% The structure of the thesis has to conform to the guidelines at
% https://informatics.tuwien.ac.at/study-services

\addtitlepage{naustrian} % German title page (not for dissertations at the PhD School).
\addtitlepage{english} % English title page.
\addstatementpage

\begin{danksagung*}
\todo{Ihr Text hier.}
\end{danksagung*}

\begin{acknowledgements*}
\todo{Enter your text here.}
\end{acknowledgements*}

\begin{kurzfassung}
\todo{Ihr Text hier.}
\end{kurzfassung}

\begin{abstract}
\todo{Enter your text here.}
\end{abstract}

% Select the language of the thesis, e.g., english or naustrian.
\selectlanguage{english}

% Add a table of contents (toc).
\tableofcontents % Starred version, i.e., \tableofcontents*, removes the self-entry.

% Switch to arabic numbering and start the enumeration of chapters in the table of contents.
\mainmatter

% \chapter{Introduction}
% \todo{Enter your text here.}

\chapter{Evaluation}

The following sections contain a detailed evaluation of the model in
various scenarios. First, we present metrics from the training phases
of the constituent models. Second, we employ methods from the field of
\gls{xai}, such as \gls{lime} and \gls{grad-cam}, to gain a better
understanding of the models' abstractions. Finally, we turn to the
models' aggregate performance on the test set and discuss whether the
initial goals set by the problem description have been met.

\section{Object Detection}
\label{sec:yolo-eval}

The object detection model was pre-trained on the COCO~\cite{lin2015}
dataset and fine-tuned with data from the sixth version of the
\gls{oid}~\cite{kuznetsova2020}. Since the full \gls{oid} contains
considerably more classes and samples than would be feasible to train
on a small cluster of GPUs, only images from the two classes
\emph{Plant} and \emph{Houseplant} were downloaded. The samples from
the Houseplant class are merged into the Plant class because the
distinction between the two is not necessary for our model.
Furthermore, the \gls{oid} contains not only bounding box annotations
for object detection tasks, but also instance segmentations,
classification labels and more. These are not needed for our purposes
and are omitted as well. In total, the dataset consists of 91479
images with a roughly 85/5/10 split for training, validation and
testing, respectively.
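
To make the class merge concrete, the following sketch shows one
possible way to relabel the Houseplant boxes as Plant and to convert
the annotations into the plain-text label format used by the YOLO
family of detectors. It is an illustration rather than the exact
preprocessing used for this thesis: the CSV layout follows the
\gls{oid} box-annotation files, the file name is only an example, and
the two label IDs are placeholders for the identifiers listed in the
\gls{oid} class-description file.

\begin{verbatim}
import pandas as pd
from pathlib import Path

PLANT_ID = "/m/xxxxx"       # placeholder: label ID of "Plant"
HOUSEPLANT_ID = "/m/yyyyy"  # placeholder: label ID of "Houseplant"

# Keep only Plant and Houseplant boxes from the OID annotations.
boxes = pd.read_csv("train-annotations-bbox.csv")
boxes = boxes[boxes["LabelName"].isin([PLANT_ID, HOUSEPLANT_ID])]

out = Path("labels/train")
out.mkdir(parents=True, exist_ok=True)
for image_id, group in boxes.groupby("ImageID"):
    lines = []
    for _, b in group.iterrows():
        # OID coordinates are already normalized to [0, 1]; YOLO
        # labels are "class x_center y_center width height" with a
        # single merged class 0 (Plant).
        cx = (b["XMin"] + b["XMax"]) / 2
        cy = (b["YMin"] + b["YMax"]) / 2
        w = b["XMax"] - b["XMin"]
        h = b["YMax"] - b["YMin"]
        lines.append(f"0 {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")
    (out / f"{image_id}.txt").write_text("\n".join(lines))
\end{verbatim}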

\subsection{Training Phase}
\label{sec:yolo-training-phase}

The object detection model was trained for 300 epochs on 79204 images
with 284130 ground truth labels. The weights from the best-performing
epoch were saved. The model's fitness for each epoch is calculated as
the weighted average of \textsf{mAP}@0.5 and \textsf{mAP}@0.5:0.95:

\begin{equation}
  \label{eq:fitness}
  f_{\mathrm{epoch}} = 0.1 \cdot \mathsf{mAP}@0.5 + 0.9 \cdot \mathsf{mAP}@0.5\mathrm{:}0.95
\end{equation}
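
This weighting can be reproduced directly from the per-epoch
validation metrics. The following snippet is a minimal illustration of
equation~\ref{eq:fitness} and of how the best epoch is selected; it
assumes the two \textsf{mAP} series are available as plain Python
lists indexed by epoch and is not the actual training code.

\begin{verbatim}
def fitness(map50, map50_95):
    # Weighted average: 10% mAP@0.5 and 90% mAP@0.5:0.95.
    return 0.1 * map50 + 0.9 * map50_95

def best_epoch(map50s, map50_95s):
    # Index of the epoch with the highest fitness.
    scores = [fitness(m50, m5095)
              for m50, m5095 in zip(map50s, map50_95s)]
    return max(range(len(scores)), key=scores.__getitem__)
\end{verbatim}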

Figure~\ref{fig:fitness} shows the model's fitness over the training
period of 300 epochs. The gray vertical line indicates the maximum
fitness of 0.61 at epoch 133. The weights of that epoch were frozen to
be the final model parameters. Since the fitness metric assigns the
overwhelming weight to the \textsf{mAP} over the stricter \gls{iou}
range, the \textsf{mAP}@0.5 starts to decrease after epoch 30, while
the \textsf{mAP}@0.5:0.95 picks up the slack until the maximum fitness
at epoch 133. This indicates that the model achieves good detection
performance early on and continues to gain higher confidence values
until performance deteriorates due to overfitting.

\begin{figure}
  \centering
  \includegraphics{graphics/model_fitness.pdf}
  \caption[Model fitness per epoch.]{Model fitness for each epoch
    calculated as in equation~\ref{eq:fitness}. The vertical gray line
    at 133 marks the epoch with the highest fitness.}
  \label{fig:fitness}
\end{figure}

Overall precision and recall per epoch are shown in
figure~\ref{fig:prec-rec}. The values indicate that neither precision
nor recall changes materially during training. In fact, precision
starts to decrease from the beginning, while recall experiences a
barely noticeable increase. Taken together with the box and object
loss from figure~\ref{fig:box-obj-loss}, we speculate that the
pre-trained model already generalizes well to plant detection because
one of the categories in the COCO~\cite{lin2015} dataset is
\emph{potted plant}. Any further training solely impacts the
confidence of the detections, but does not lead to higher detection
rates. This conclusion is supported by the \textsf{mAP}@0.5:0.95
increasing until epoch 133.

\begin{figure}
  \centering
  \includegraphics{graphics/precision_recall.pdf}
  \caption[Precision and recall per epoch.]{Overall precision and
    recall during training for each epoch. The vertical gray line at
    133 marks the epoch with the highest fitness.}
  \label{fig:prec-rec}
\end{figure}

Further culprits for the flat precision and recall values may be found
in deficient ground truth data. The labels from the \gls{oid} are
sometimes not fine-grained enough. Images which contain multiple
individual, often overlapping, plants are labeled with one large
bounding box instead of multiple smaller ones. The model recognizes
the individual plants and returns tighter bounding boxes, even if that
is not what is specified in the ground truth. It is therefore prudent
to limit the training phase to relatively few epochs in order to not
penalize the model's more accurate detections. The smaller bounding
boxes are also preferable because the cutout is passed to the
classifier in a later stage. Smaller bounding boxes help the
classifier focus on one plant at a time and avoid being distracted by
multiple plants in potentially different stages of wilting.

The box loss decreases slightly during training, which indicates that
the bounding boxes become tighter around objects of interest. With
increasing training time, however, the object loss increases,
indicating that fewer and fewer plants are present in the predicted
bounding boxes. Overfitting is a likely cause of the increasing object
loss from epoch 40 onward. Since the best weights as measured by
fitness are found at epoch 133 and the object loss accelerates from
that point, epoch 133 is probably the correct cutoff before
overfitting dominates.

\begin{figure}
  \centering
  \includegraphics{graphics/val_box_obj_loss.pdf}
  \caption[Box and object loss.]{Box and object loss measured against
    the validation set of 3091 images and 4092 ground truth labels.
    The class loss is omitted because there is only one class in the
    dataset and the loss is therefore always zero.}
  \label{fig:box-obj-loss}
\end{figure}

\subsection{Test Phase}
\label{ssec:test-phase}

Of the 91479 images, around 10\% were used for the test phase. These
images contain a total of 12238 ground truth labels.
Table~\ref{tab:yolo-metrics} shows precision, recall and the harmonic
mean of both (F1-score). The results indicate that the model errs on
the side of sensitivity because recall is higher than precision.
Although some detections correspond to objects that are not labeled as
plants in the dataset, a plant that is labeled in the ground truth
data is very likely to be detected. This behavior is in line with how
the model's detections are handled in practice. The detections are
drawn on the original image and the user is able to check the bounding
boxes visually. If there are wrong detections, the user can ignore
them and focus on the relevant ones instead. A higher recall thus
serves the user's needs better than a higher precision.

\begin{table}[h]
  \centering
  \begin{tabular}{lrrrr}
    \toprule
    {} & Precision & Recall & F1-score & Support \\
    \midrule
    Plant & 0.547571 & 0.737866 & 0.628633 & 12238 \\
    \bottomrule
  \end{tabular}
  \caption{Precision, recall and F1-score for the object detection model.}
  \label{tab:yolo-metrics}
\end{table}
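
The F1-score in table~\ref{tab:yolo-metrics} is the harmonic mean of
the listed precision and recall; as a quick sanity check:

\begin{verbatim}
def f1_score(precision, recall):
    # Harmonic mean of precision and recall.
    return 2 * precision * recall / (precision + recall)

# f1_score(0.547571, 0.737866) evaluates to roughly 0.6286, which
# matches the value reported in the table.
\end{verbatim}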

Figure~\ref{fig:yolo-ap} shows the \gls{ap} for the \gls{iou}
thresholds of 0.5 and 0.95. Predicted bounding boxes with an \gls{iou}
of less than 0.5 are not taken into account for the precision and
recall values of table~\ref{tab:yolo-metrics}. The main evaluation
metric of COCO~\cite{lin2015} is the \gls{ap} averaged across the
\gls{iou} thresholds from 0.5 to 0.95 in steps of 0.05. This value is
then averaged across all classes and called \gls{map}. The object
detection model achieves a state-of-the-art \gls{map} of 0.5727 for
the \emph{Plant} class.
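
The relationship between the \gls{iou} threshold, the per-threshold
\gls{ap} and the reported \gls{map} can be summarized in a few lines.
The sketch below is illustrative only: it approximates the area under
each precision-recall curve with the trapezoidal rule, whereas the
reported numbers follow the interpolated COCO evaluation protocol, and
it assumes that the precision-recall pairs per threshold are already
available. With a single class, the average over classes equals the
per-class value.

\begin{verbatim}
import numpy as np

def iou(a, b):
    # Boxes given as (x1, y1, x2, y2).
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter)

def average_precision(recall, precision):
    # Area under the precision-recall curve (recall sorted ascending).
    return np.trapz(precision, recall)

def map_50_95(curves):
    # `curves` maps an IoU threshold to its (recall, precision) arrays.
    thresholds = [round(t, 2) for t in np.arange(0.5, 0.96, 0.05)]
    return float(np.mean([average_precision(*curves[t])
                          for t in thresholds]))
\end{verbatim}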

\begin{figure}[h]
  \centering
  \includegraphics{graphics/APpt5-pt95.pdf}
  \caption[Object detection AP@0.5 and AP@0.95.]{Precision-recall
    curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} at a
    specific threshold is defined as the area under the
    precision-recall curve for that threshold. The \gls{map} across
    \gls{iou} thresholds from 0.5 to 0.95 in steps of 0.05
    (\textsf{mAP}@0.5:0.95) is 0.5727.}
  \label{fig:yolo-ap}
\end{figure}

\backmatter

% Use an optional list of figures.
\listoffigures % Starred version, i.e., \listoffigures*, removes the toc entry.

% Use an optional list of tables.
\cleardoublepage % Start list of tables on the next empty right hand page.
\listoftables % Starred version, i.e., \listoftables*, removes the toc entry.

% Use an optional list of algorithms.
\listofalgorithms
\addcontentsline{toc}{chapter}{List of Algorithms}

% Add an index.
\printindex

% Add a glossary.
\printglossaries

% Add a bibliography.
\bibliographystyle{alpha}
\bibliography{references}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
|