% Copyright (C) 2014-2020 by Thomas Auzinger <thomas@auzinger.name>
\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information.

% Load packages to allow in- and output of non-ASCII characters.
\usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters.
\usepackage[T1]{fontenc} % Determines font encoding of the output. Font packages have to be included before this line.
\usepackage[utf8]{inputenc} % Determines encoding of the input. All input files have to use UTF8 encoding.

% Extended LaTeX functionality is enabled by including packages with \usepackage{...}.
\usepackage{amsmath} % Extended typesetting of mathematical expressions.
\usepackage{amssymb} % Provides a multitude of mathematical symbols.
\usepackage{mathtools} % Further extensions of mathematical typesetting.
\usepackage{microtype} % Small-scale typographic enhancements.
\usepackage[inline]{enumitem} % User control over the layout of lists (itemize, enumerate, description).
\usepackage{multirow} % Allows table elements to span several rows.
\usepackage{booktabs} % Improves the typesetting of tables.
\usepackage{subcaption} % Allows the use of subfigures and enables their referencing.
\usepackage[ruled,linesnumbered,algochapter]{algorithm2e} % Enables the writing of pseudo code.
\usepackage[usenames,dvipsnames,table]{xcolor} % Allows the definition and use of colors. This package has to be included before tikz.
\usepackage{nag} % Issues warnings when best practices in writing LaTeX documents are violated.
\usepackage{todonotes} % Provides tooltip-like todo notes.
\usepackage[backend=biber,style=trad-alpha,isbn=false,eprint=false,maxcitenames=3]{biblatex}
\usepackage{hyperref} % Enables cross linking in the electronic document version. This package has to be included second to last.
\usepackage[acronym,toc]{glossaries} % Enables the generation of glossaries and lists of acronyms. This package has to be included last.

\addbibresource{references.bib}

% Define convenience functions to use the author name and the thesis title in the PDF document properties.
\newcommand{\authorname}{Tobias Eidelpes} % The author name without titles.
\newcommand{\thesistitle}{Flower State Classification for Watering System} % The title of the thesis. The English version should be used, if it exists.

% Set PDF document properties.
\hypersetup
{
    pdfpagelayout   = TwoPageRight,  % How the document is shown in PDF viewers (optional).
    linkbordercolor = {Melon},       % The color of the borders of boxes around crosslinks (optional).
    pdfauthor       = {\authorname}, % The author's name in the document properties (optional).
    pdftitle        = {\thesistitle}, % The document's title in the document properties (optional).
    pdfsubject      = {Subject},     % The document's subject in the document properties (optional).
    pdfkeywords     = {Object Detection, Image Classification, Machine Learning, Embedded Programming} % The document's keywords in the document properties (optional).
}

\setpnumwidth{2.5em}        % Avoid overfull hboxes in the table of contents (see memoir manual).
\setsecnumdepth{subsection} % Enumerate subsections.

\nonzeroparskip             % Create space between paragraphs (optional).
\setlength{\parindent}{0pt} % Remove paragraph indentation (optional).

\makeindex      % Use an optional index.
\makeglossaries % Use an optional glossary.
%\glstocfalse   % Remove the glossaries from the table of contents.

% Set persons with 4 arguments:
%  {title before name}{name}{title after name}{gender}
%  where both titles are optional (i.e. can be given as empty brackets {}).
\setauthor{}{\authorname}{BSc}{male}
\setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male}

\setregnumber{01527193}
\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}.
\settitle{\thesistitle}{Flower State Classification for Watering System} % Sets English and German version of the title (both can be English or German).

% Select the thesis type: bachelor / master / doctor / phd-school.
% Master:
\setthesis{master}
\setmasterdegree{dipl.} % dipl. / rer.nat. / rer.soc.oec. / master

% For bachelor and master:
\setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum.

\newacronym{xai}{XAI}{Explainable Artificial Intelligence}
\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping}
\newacronym{cam}{CAM}{Class Activation Mapping}
\newacronym{oid}{OID}{Open Images Dataset}
\newacronym{ap}{AP}{Average Precision}
\newacronym{iou}{IOU}{Intersection over Union}
\newacronym{map}{mAP}{mean average precision}
\newacronym{resnet}{ResNet}{Residual Neural Network}
\newacronym{cnn}{CNN}{Convolutional Neural Network}
\newacronym{sgd}{SGD}{Stochastic Gradient Descent}
\begin{document}

\frontmatter % Switches to roman numbering.
% The structure of the thesis has to conform to the guidelines at
%  https://informatics.tuwien.ac.at/study-services

\addtitlepage{naustrian} % German title page (not for dissertations at the PhD School).
\addtitlepage{english}   % English title page.
\addstatementpage

\begin{danksagung*}
\todo{Ihr Text hier.}
\end{danksagung*}

\begin{acknowledgements*}
\todo{Enter your text here.}
\end{acknowledgements*}

\begin{kurzfassung}
\todo{Ihr Text hier.}
\end{kurzfassung}

\begin{abstract}
\todo{Enter your text here.}
\end{abstract}

% Select the language of the thesis, e.g., english or naustrian.
\selectlanguage{english}

% Add a table of contents (toc).
\tableofcontents % Starred version, i.e., \tableofcontents*, removes the self-entry.

% Switch to arabic numbering and start the enumeration of chapters in the table of contents.
\mainmatter

% \chapter{Introduction}
% \todo{Enter your text here.}

\chapter{Evaluation}

The following sections contain a detailed evaluation of the model in various scenarios. First, we present metrics from the training phases of the constituent models. Second, we employ methods from the field of \gls{xai}, such as \gls{grad-cam}, to gain a better understanding of the models' abstractions. Finally, we turn to the models' aggregate performance on the test set and discuss whether the initial goals set out in the problem description have been met.

\section{Object Detection}
\label{sec:yolo-eval}

The object detection model was pre-trained on the COCO~\cite{lin2015} dataset and fine-tuned with data from the sixth version of the \gls{oid}~\cite{kuznetsova2020}. Since the full \gls{oid} contains considerably more classes and samples than would be feasibly trainable on a small cluster of GPUs, only images from the two classes \emph{Plant} and \emph{Houseplant} were downloaded. The samples from the Houseplant class are merged into the Plant class because the distinction between the two is not necessary for our model. Furthermore, the \gls{oid} contains not only bounding box annotations for object detection tasks, but also instance segmentations, classification labels and more. These are not needed for our purposes and are omitted as well. In total, the dataset consists of 91479 images with a roughly 85/5/10 split for training, validation and testing, respectively.
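
The class filtering and merging step is easy to reproduce. The following is a minimal sketch in Python; the annotation file name and the two machine-generated class identifiers are placeholders that should be checked against the official \gls{oid} class description file:

\begin{verbatim}
import pandas as pd

# Assumed MIDs; verify against class-descriptions-boxable.csv.
PLANT = "/m/05s2s"
HOUSEPLANT = "/m/03fp41"

boxes = pd.read_csv("train-annotations-bbox.csv")

# Keep only Plant and Houseplant boxes.
boxes = boxes[boxes["LabelName"].isin([PLANT, HOUSEPLANT])]

# Merge Houseplant into Plant; the distinction is not needed.
boxes.loc[boxes["LabelName"] == HOUSEPLANT, "LabelName"] = PLANT

boxes.to_csv("plant-boxes.csv", index=False)
\end{verbatim}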

\subsection{Training Phase}
\label{ssec:yolo-training-phase}

The object detection model was trained for 300 epochs on 79204 images with 284130 ground truth labels. The weights from the best-performing epoch were saved. The model's fitness for each epoch is calculated as the weighted average of \textsf{mAP}@0.5 and \textsf{mAP}@0.5:0.95:

\begin{equation}
  \label{eq:fitness}
  f_{epoch} = 0.1 \cdot \mathsf{mAP}@0.5 + 0.9 \cdot \mathsf{mAP}@0.5\mathrm{:}0.95
\end{equation}
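
In code, the fitness criterion of equation~\eqref{eq:fitness} and the selection of the best epoch amount to a few lines. The following minimal sketch assumes that the per-epoch \textsf{mAP} values are available as two equally long lists:

\begin{verbatim}
# Fitness as defined in the equation above; map50 and map50_95
# hold one mAP value per epoch.
def fitness(map50, map50_95):
    return 0.1 * map50 + 0.9 * map50_95

scores = [fitness(a, b) for a, b in zip(map50, map50_95)]
best_epoch = max(range(len(scores)), key=scores.__getitem__)
\end{verbatim}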

Figure~\ref{fig:fitness} shows the model's fitness over the training period of 300 epochs. The gray vertical line indicates the maximum fitness of 0.61 at epoch 133. The weights of that epoch were frozen to be the final model parameters. Since the fitness metric assigns the overwhelming weight to the \textsf{mAP} at the higher range, the \textsf{mAP}@0.5 starts to decrease after epoch 30, but the \textsf{mAP}@0.5:0.95 picks up the slack until the maximum fitness at epoch 133. This is an indication that the model achieves good performance early on and continues to gain higher confidence values until performance deteriorates due to overfitting.

\begin{figure}
  \centering
  \includegraphics{graphics/model_fitness.pdf}
  \caption[Object detection fitness per epoch.]{Object detection model fitness for each epoch calculated as in equation~\ref{eq:fitness}. The vertical gray line at 133 marks the epoch with the highest fitness.}
  \label{fig:fitness}
\end{figure}

Overall precision and recall per epoch are shown in figure~\ref{fig:prec-rec}. The values indicate that neither precision nor recall changes materially during training. In fact, precision starts to decrease from the beginning, while recall experiences a barely noticeable increase. Taken together with the box and object loss from figure~\ref{fig:box-obj-loss}, we speculate that the pre-trained model already generalizes well to plant detection because one of the categories in the COCO~\cite{lin2015} dataset is \emph{potted plant}. Any further training solely impacts the confidence of detection, but does not lead to higher detection rates. This conclusion is supported by the increasing \textsf{mAP}@0.5:0.95 until epoch 133.

\begin{figure}
  \centering
  \includegraphics{graphics/precision_recall.pdf}
  \caption[Object detection precision and recall during training.]{Overall precision and recall during training for each epoch. The vertical gray line at 133 marks the epoch with the highest fitness.}
  \label{fig:prec-rec}
\end{figure}

Further culprits for the flat precision and recall values may be found in bad ground truth data. The labels from the \gls{oid} are sometimes not fine-grained enough. Images which contain multiple individual, often overlapping, plants are labeled with one large bounding box instead of multiple smaller ones. The model recognizes the individual plants and returns tighter bounding boxes even if that is not what is specified in the ground truth. Therefore, it is prudent to limit the training phase to relatively few epochs in order not to penalize the more accurate detections of the model. The smaller bounding boxes make more sense considering that the cutout is passed to the classifier in a later stage. Smaller bounding boxes help the classifier to focus on one plant at a time and to not get distracted by multiple plants in potentially different stages of wilting.

The box loss decreases slightly during training, which indicates that the bounding boxes become tighter around objects of interest. With increasing training time, however, the object loss increases, indicating that fewer and fewer plants are present in the predicted bounding boxes. It is likely that overfitting is a cause of the increasing object loss from epoch 40 onward. Since the best weights as measured by fitness are found at epoch 133 and the object loss accelerates from that point, epoch 133 is probably the correct cutoff before overfitting occurs.

\begin{figure}
  \centering
  \includegraphics{graphics/val_box_obj_loss.pdf}
  \caption[Object detection box and object loss.]{Box and object loss measured against the validation set of 3091 images and 4092 ground truth labels. The class loss is omitted because there is only one class in the dataset and the loss is therefore always zero.}
  \label{fig:box-obj-loss}
\end{figure}

\subsection{Test Phase}
\label{ssec:yolo-test-phase}

Of the 91479 images, around 10\% were used for the test phase. These images contain a total of 12238 ground truth labels. Table~\ref{tab:yolo-metrics} shows precision, recall and the harmonic mean of both (F1-score). The results indicate that the model errs on the side of sensitivity because recall is higher than precision. Although some detections are not labeled as plants in the dataset, if there is a labeled plant in the ground truth data, the chance is high that it will be detected. This behavior is in line with how the model's detections are handled in practice. The detections are drawn on the original image and the user is able to check the bounding boxes visually. If there are wrong detections, the user can ignore them and focus on the relevant ones instead. A higher recall will thus serve the user's needs better than a high precision.

\begin{table}[h]
  \centering
  \begin{tabular}{lrrrr}
    \toprule
    {} & Precision & Recall & F1-score & Support \\
    \midrule
    Plant & 0.548 & 0.738 & 0.629 & 12238 \\
    \bottomrule
  \end{tabular}
  \caption{Precision, recall and F1-score for the object detection model.}
  \label{tab:yolo-metrics}
\end{table}
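
The F1-score in table~\ref{tab:yolo-metrics} is the harmonic mean of the reported precision and recall and can be verified directly:

\begin{verbatim}
# Harmonic mean of precision and recall for the Plant class.
precision, recall = 0.5476, 0.7379
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 4))  # 0.6287, matching the table
\end{verbatim}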

Figure~\ref{fig:yolo-ap} shows the \gls{ap} for the \gls{iou} thresholds of 0.5 and 0.95. Predicted bounding boxes with an \gls{iou} of less than 0.5 are not taken into account for the precision and recall values of table~\ref{tab:yolo-metrics}. The lower the detection threshold, the more plants are detected. Conversely, a higher detection threshold leaves potential plants undetected. The precision-recall curves confirm this behavior because the area under the curve for the threshold of 0.5 is higher than for the threshold of 0.95 ($0.66$ versus $0.41$). These values are combined in COCO's~\cite{lin2015} main evaluation metric, which is the \gls{ap} averaged across the \gls{iou} thresholds from 0.5 to 0.95 in steps of 0.05. This value is then averaged across all classes and called \gls{map}. The object detection model achieves a state-of-the-art \gls{map} of 0.5727 for the \emph{Plant} class.
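
Both building blocks of this metric are straightforward to compute. The sketch below shows the \gls{iou} of two boxes in $(x_1, y_1, x_2, y_2)$ format and the averaging step that yields \textsf{mAP}@0.5:0.95, assuming the per-threshold \gls{ap} values have already been calculated:

\begin{verbatim}
def iou(a, b):
    # Boxes are (x1, y1, x2, y2) tuples.
    ix = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    iy = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = ix * iy
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter)

# ap_values is assumed to hold the AP at the ten thresholds
# 0.50, 0.55, ..., 0.95; their mean is mAP@0.5:0.95.
map_50_95 = sum(ap_values) / len(ap_values)
\end{verbatim}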

\begin{figure}
  \centering
  \includegraphics{graphics/APpt5-pt95.pdf}
  \caption[Object detection AP@0.5 and AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a specific threshold is defined as the area under the precision-recall curve of that threshold. The \gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps (\textsf{mAP}@0.5:0.95) is 0.5727.}
  \label{fig:yolo-ap}
\end{figure}

\section{Classification}
\label{sec:resnet-eval}

The classifier receives cutouts from the object detection model and determines whether the image shows a stressed plant or not. To achieve this goal, we trained a \gls{resnet}~\cite{he2016} on a dataset of 452 images of healthy and 452 images of stressed plants. We chose the \gls{resnet} architecture due to its popularity and ease of implementation as well as its consistently high performance on various classification tasks. While its classification speed is significantly lower than that of networks optimized for mobile and edge devices (e.g., MobileNet), the deeper structure and the additional parameters are necessary for the fairly complex task at hand. Furthermore, the generous time budget for object detection \emph{and} classification allows for more accurate results at the expense of speed. The architecture comes in multiple variants, depending on the number of layers: the smallest has 18 and the largest 152 layers, with 34, 50 and 101 in-between. The larger networks have better accuracy in general, but come with trade-offs regarding training and inference time as well as required space. The 50-layer variant (\gls{resnet}50) is adequate for our use case.
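
Building such a binary classifier from a pre-trained backbone takes only a few lines with torchvision; the following is a minimal sketch (the weight enum and layer names follow the torchvision API):

\begin{verbatim}
import torch.nn as nn
from torchvision import models

# Load a ResNet50 pre-trained on ImageNet and replace the final
# fully-connected layer with a two-class head (healthy/stressed).
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, 2)
\end{verbatim}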

\subsection{Training Phase}
\label{ssec:resnet-training-phase}

The dataset was split 85/15 into training and validation sets. The images in the training set were augmented with a random crop to arrive at the expected input dimensions of $224 \times 224$ pixels. Additionally, the training images were modified with a random horizontal flip to increase the variation in the set and to train a classifier that is invariant to reflections. All images, regardless of their membership in the training or validation set, were normalized with the mean and standard deviation of the ImageNet~\cite{deng2009} dataset, which the original \gls{resnet} model was pre-trained with. Training was done for 50 epochs and the best-performing model as measured by validation accuracy was selected as the final version.
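
The pre-processing described above corresponds to a standard torchvision transform stack. A minimal sketch follows; the exact crop variant and the validation-side resizing are assumptions, while the normalization constants are the usual published ImageNet statistics:

\begin{verbatim}
from torchvision import transforms

# ImageNet channel statistics used for normalization.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

train_tf = transforms.Compose([
    transforms.RandomResizedCrop(224),  # random crop to 224x224
    transforms.RandomHorizontalFlip(),  # reflection augmentation
    transforms.ToTensor(),
    normalize,
])

val_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
\end{verbatim}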

Figure~\ref{fig:classifier-training-metrics} shows accuracy and loss on the training and validation sets. There is a clear upwards trend until epoch 20, when validation accuracy and loss stabilize at around 0.84 and 0.3, respectively. The quick convergence and resistance to overfitting can be attributed to the model already having robust feature extraction capabilities.

\begin{figure}
  \centering
  \includegraphics{graphics/classifier-metrics.pdf}
  \caption[Classifier accuracy and loss during training.]{Accuracy and loss during training of the classifier. The model converges quickly, but additional epochs do not cause validation loss to increase, which would indicate overfitting. The maximum validation accuracy of 0.9118 is achieved at epoch 27.}
  \label{fig:classifier-training-metrics}
\end{figure}

\subsection{Hyper-parameter Optimization}
\label{ssec:resnet-hyp-opt}

In order to improve the aforementioned accuracy values, we perform hyper-parameter optimization across a wide range of parameters. Table~\ref{tab:resnet-hyps} lists the hyper-parameters and their possible values. Since the number of all combinations of values is 11520 and each combination is trained for 10 epochs with a training time of approximately six minutes per combination, exhausting the search space would take 48 days. Due to time limitations, we have chosen not to search exhaustively but to pick random combinations instead. Random search works surprisingly well, especially compared to grid search, in a number of domains, one of which is hyper-parameter optimization~\cite{bergstra2012}.

\begin{table}[h]
  \centering
  \begin{tabular}{lr}
    \toprule
    Parameter & Values \\
    \midrule
    optimizer & adam, sgd \\
    batch size & 4, 8, 16, 32, 64 \\
    learning rate & 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.1 \\
    step size & 2, 3, 5, 7 \\
    gamma & 0.1, 0.5 \\
    beta one & 0.9, 0.99 \\
    beta two & 0.5, 0.9, 0.99, 0.999 \\
    eps & $10^{-8}$, 0.1, 1 \\
    \bottomrule
  \end{tabular}
  \caption{Hyper-parameters and their possible values during optimization.}
  \label{tab:resnet-hyps}
\end{table}
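
A draw from this search space is a uniform, independent choice per parameter. The following sketch shows the sampling loop; \texttt{train\_and\_eval} is a hypothetical stand-in for one ten-epoch training run that returns the validation F1-score:

\begin{verbatim}
import random

space = {
    "optimizer": ["adam", "sgd"],
    "batch_size": [4, 8, 16, 32, 64],
    "lr": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.1],
    "step_size": [2, 3, 5, 7],
    "gamma": [0.1, 0.5],
    "beta1": [0.9, 0.99],
    "beta2": [0.5, 0.9, 0.99, 0.999],
    "eps": [1e-8, 0.1, 1],
}

best = None
for _ in range(138):
    config = {k: random.choice(v) for k, v in space.items()}
    score = train_and_eval(config, epochs=10)  # hypothetical helper
    if best is None or score > best[0]:
        best = (score, config)
\end{verbatim}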

The random search was run for 138 iterations, which equates to a 75\% probability that at least one sampled configuration lies within the top 1\% of the search space. Each sample independently has a 1\% chance of falling into that region, which yields the probability in equation~\eqref{eq:opt-prob}. Figure~\ref{fig:resnet-hyp-results} shows three of the eight parameters and their impact on a high F1-score. \gls{sgd} has less variation in its results than Adam~\cite{kingma2017} and manages to provide eight out of the ten best results. The number of epochs to train for was chosen based on the observation that almost all configurations converge well before reaching the tenth epoch. The assumption that a training run with ten epochs provides a good proxy for final performance is supported by the quick convergence of validation accuracy and loss in figure~\ref{fig:classifier-training-metrics}.

\begin{equation}\label{eq:opt-prob}
  1 - (1 - 0.01)^{138} \approx 0.75
\end{equation}

\begin{figure}
  \centering
  \includegraphics{graphics/classifier-hyp-metrics.pdf}
  \caption[Classifier hyper-parameter optimization results.]{This figure shows three of the eight hyper-parameters and their performance measured by the F1-score during 138 trials. Differently colored markers show the batch size, with darker colors representing a larger batch size. The type of marker (circle or cross) shows which optimizer was used. The x-axis shows the learning rate on a logarithmic scale. In general, a learning rate between 0.003 and 0.01 results in more robust and better F1-scores. Larger batch sizes more often lead to better performance as well. As for the type of optimizer, \gls{sgd} produced the best iteration with an F1-score of 0.9783. Adam tends to require more customization of its parameters than \gls{sgd} to achieve good results.}
  \label{fig:resnet-hyp-results}
\end{figure}

\subsection{Class Activation Maps}
\label{ssec:resnet-cam}

Neural networks are notorious for their black-box behavior: it is possible to observe the inputs and the corresponding outputs, but the stage in-between stays hidden from view. Models are continuously developed and deployed to aid in human decision-making and sometimes supplant it. It is, therefore, crucial to obtain some amount of interpretability of what the model does \emph{inside} to be able to explain why a decision was made in a certain way. The research field of \gls{xai} gained significance during the last few years because of the development of new methods to peek inside these black boxes.

One such method, \gls{cam}~\cite{zhou2015}, is a popular tool to produce visual explanations for decisions made by \glspl{cnn}. Convolutional layers essentially function as object detectors as long as no fully-connected layers perform the classification. This ability to localize the regions of interest which play a significant role in the class the model predicts can be retained until the last layer and used to generate activation maps for the predictions.

A more recent approach to generating a \gls{cam} via gradients is proposed by~\textcite{selvaraju2020}. Their \gls{grad-cam} approach works by computing the gradient of the class score with respect to the feature maps of the last convolutional layer. The last layer is chosen because the authors find that ``[…] Grad-CAM maps become progressively worse as we move to earlier convolutional layers as they have smaller receptive fields and only focus on less semantic local features.''~\cite[p.~5]{selvaraju2020}
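
Applied to our \gls{resnet}50 classifier, the computation reduces to two hooks on the last convolutional block. The following is a compact sketch; \texttt{model}, \texttt{image} (a pre-processed batch of shape $1 \times 3 \times 224 \times 224$) and \texttt{class\_idx} are assumed to be defined:

\begin{verbatim}
import torch

feats, grads = {}, {}
layer = model.layer4  # last convolutional block of ResNet50

layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
layer.register_full_backward_hook(
    lambda m, gi, go: grads.update(a=go[0]))

logits = model(image)
logits[0, class_idx].backward()

# Weight each feature map by its average gradient, sum, clamp.
w = grads["a"].mean(dim=(2, 3), keepdim=True)
cam = torch.relu((w * feats["a"]).sum(dim=1)).squeeze()
cam = cam / cam.max()  # normalize; upsample to overlay on the image
\end{verbatim}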

Turning to our classifier, figure~\ref{fig:resnet-cam} shows the \glspl{cam} for \emph{healthy} and \emph{stressed}. While the regions of interest for the \emph{healthy} class lie on the healthy plant, the \emph{stressed} plant is barely considered and mostly rendered as background information (blue). Conversely, when asked to explain the inputs to the \emph{stressed} classification, the regions of interest predominantly stay on the thirsty as opposed to the healthy plant. In fact, the large hanging leaves play a significant role in determining the class the image belongs to. This is an additional data point confirming that the model focuses on the \emph{right} parts of the image during classification.

\begin{figure}
  \centering
  \includegraphics{graphics/classifier-cam.pdf}
  \caption[Classifier \glspl{cam}.]{The top left image shows the original image of the same plant in a stressed (left) and healthy (right) state. In the top right image, the \gls{cam} for the class \emph{healthy} is laid over the original image. The classifier draws its conclusion mainly from the healthy plant, which is indicated by the red hot spots around the tips of the plant. The bottom right image shows the \gls{cam} for the \emph{stressed} class. The classifier focuses on the hanging leaves of the thirsty plant. The image was classified as \emph{stressed} with a confidence of 70\%.}
  \label{fig:resnet-cam}
\end{figure}

\section{Aggregate Model}
\label{sec:aggregate-model}

In this section we turn to the evaluation of the aggregate model. We have confirmed the performance of the constituent models: the object detection and the classification model. It remains to evaluate the complete pipeline, from gathering detections of potential plants in an image and forwarding them to the classifier, to obtaining the results as either healthy or stressed with their associated confidence scores.
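
Structurally, this pipeline is a simple composition of the two models. The following is a sketch under the assumption that \texttt{detect}, \texttt{crop} and \texttt{classify} wrap the object detector, the cutout step and the classifier described above:

\begin{verbatim}
def run_pipeline(image):
    """Detect plants, then classify each cutout."""
    results = []
    for box, det_conf in detect(image):     # object detections
        cutout = crop(image, box)           # cutout for classifier
        label, cls_conf = classify(cutout)  # healthy / stressed
        results.append((box, label, cls_conf))
    return results
\end{verbatim}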

The test set contains 640 images which were obtained from a Google search using the terms \emph{thirsty plant}, \emph{wilted plant} and \emph{stressed plant}. Images which clearly show one or multiple plants with some amount of visible stress were added to the dataset. Care was taken to include plants with various degrees of stress and in various locations and lighting conditions. The search not only provided images of stressed plants, but also of healthy plants, because articles describing how to care for plants often carry a banner image of healthy plants. The dataset is biased towards potted plants, which are commonly put on display in western households. Furthermore, many plants, such as succulents, are sought after for home environments because of their ease of maintenance. Due to their inclusion in the dataset and the way they exhibit water stress, the test set nevertheless contains a wide variety of scenarios.

After collecting the images, the aggregate model was run on them to obtain initial bounding boxes and classifications for ground truth labeling. Letting the model do the work beforehand and then correcting the labels allowed us to include more images in the test set because they could be labeled more easily. Additionally, going over the detections and classifications provided a comprehensive view of how the models work and what their weaknesses and strengths are. After the labels were corrected, the ground truth of the test set contained 662 bounding boxes of healthy plants and 488 of stressed plants.

\begin{table}
  \centering
  \begin{tabular}{lrrrr}
    \toprule
    {} & Precision & Recall & F1-score & Support \\
    \midrule
    Healthy & 0.824 & 0.745 & 0.783 & 662 \\
    Stressed & 0.707 & 0.783 & 0.743 & 488 \\
    micro avg & 0.769 & 0.761 & 0.765 & 1150 \\
    macro avg & 0.766 & 0.764 & 0.763 & 1150 \\
    weighted avg & 0.775 & 0.761 & 0.766 & 1150 \\
    \bottomrule
  \end{tabular}
  \caption{Precision, recall and F1-score for the aggregate model.}
  \label{tab:model-metrics}
\end{table}

Table~\ref{tab:model-metrics} shows precision, recall and the F1-score for both classes, \emph{Healthy} and \emph{Stressed}. Both precision and recall are balanced and the F1-score is high. Unfortunately, these values do not take the accuracy of the bounding boxes into account and thus have only limited expressive power.

Figure~\ref{fig:aggregate-ap} shows the precision-recall curves for both classes at different \gls{iou} thresholds. The left plot shows the \gls{ap} for each class at the threshold of 0.5 and the right one at 0.95. The \gls{map} is 0.6226, calculated across all classes as the mean over the \gls{iou} thresholds from 0.5 to 0.95 in steps of 0.05. The difference between \gls{map}@0.5 and \gls{map}@0.95 is fairly small, which indicates that the bounding boxes encapsulate the objects of interest well. The cliffs at around 0.77 (left) and 0.7 (right) happen at a detection threshold of 0.5. The classifier's last layer is a softmax layer, which transforms its inputs into a probability of showing either a healthy or a stressed plant. If the probability of an image showing a healthy plant is below 0.5, it is no longer classified as healthy but as stressed. The threshold for discriminating the two classes therefore lies at 0.5, which is the cutoff for either class.
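
The 0.5 cutoff follows directly from the two-class softmax: the two output probabilities sum to one, so whichever class exceeds 0.5 determines the label. A minimal sketch, assuming \texttt{logits} holds the classifier outputs and the class order is healthy, stressed:

\begin{verbatim}
import torch

probs = torch.softmax(logits, dim=1)  # shape (N, 2), rows sum to 1
labels = probs.argmax(dim=1)          # 0 = healthy, 1 = stressed
confidence = probs.max(dim=1).values  # always >= 0.5 for two classes
\end{verbatim}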

\begin{figure}
  \centering
  \includegraphics{graphics/APmodel.pdf}
  \caption[Aggregate model AP@0.5 and AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a specific threshold is defined as the area under the precision-recall curve of that threshold. The \gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps (\textsf{mAP}@0.5:0.95) is 0.6226.}
  \label{fig:aggregate-ap}
\end{figure}

Overall, we believe that the aggregate model shows sufficient predictive performance to be deployed in the field. The detections are accurate, especially for potted plants, and the classification into healthy and stressed is robust.

\backmatter

% Use an optional list of figures.
\listoffigures % Starred version, i.e., \listoffigures*, removes the toc entry.

% Use an optional list of tables.
\cleardoublepage % Start list of tables on the next empty right hand page.
\listoftables % Starred version, i.e., \listoftables*, removes the toc entry.

% Use an optional list of algorithms.
\listofalgorithms
\addcontentsline{toc}{chapter}{List of Algorithms}

% Add an index.
\printindex

% Add a glossary.
\printglossaries

% Add a bibliography.
%\bibliographystyle{alpha}
\printbibliography

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: