Final version of thesis

This commit is contained in:
Tobias Eidelpes 2023-12-27 16:45:43 +01:00
parent ee8f4dec85
commit a6ed18303e
4 changed files with 221 additions and 72 deletions

View File

@ -244,7 +244,7 @@
@online{chan2020, @online{chan2020,
title = {Healthy and {{Wilted Houseplant Images}}}, title = {Healthy and {{Wilted Houseplant Images}}},
author = {Chan, Russell}, author = {Chan, Russell},
date = {2020-01-17}, date = {2020-01},
url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images}, url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images},
urldate = {2023-12-08}, urldate = {2023-12-08},
abstract = {A collection of 904 houseplant images, classified as either healthy or wilted}, abstract = {A collection of 904 houseplant images, classified as either healthy or wilted},
@ -1422,7 +1422,6 @@
pages = {71--83}, pages = {71--83},
publisher = {{World Scientific Publishing Co.}}, publisher = {{World Scientific Publishing Co.}},
issn = {2301-3850}, issn = {2301-3850},
doi = {10.1142/S2301385020500053},
keywords = {Area-wise classification,Support Vector Machine (SVM),Unmanned Aerial Vehicle (UAV),wheat drought mapping}, keywords = {Area-wise classification,Support Vector Machine (SVM),Unmanned Aerial Vehicle (UAV),wheat drought mapping},
file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf} file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
} }

Binary file not shown.

View File

@ -1,6 +1,6 @@
% Copyright (C) 2014-2020 by Thomas Auzinger <thomas@auzinger.name> % Copyright (C) 2014-2020 by Thomas Auzinger <thomas@auzinger.name>
\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information. \documentclass[final]{vutinfth} % Remove option 'final' to obtain debug information.
% Load packages to allow in- and output of non-ASCII characters. % Load packages to allow in- and output of non-ASCII characters.
\usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters. \usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters.
@ -51,7 +51,7 @@
\nonzeroparskip % Create space between paragraphs (optional). \nonzeroparskip % Create space between paragraphs (optional).
\setlength{\parindent}{0pt} % Remove paragraph indentation (optional). \setlength{\parindent}{0pt} % Remove paragraph indentation (optional).
\setcounter{tocdepth}{3} \setcounter{tocdepth}{2}
\makeindex % Use an optional index. \makeindex % Use an optional index.
\makeglossaries % Use an optional glossary. \makeglossaries % Use an optional glossary.
@ -64,7 +64,7 @@
\setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male} \setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male}
\setregnumber{01527193} \setregnumber{01527193}
\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}. \setdate{27}{12}{2023} % Set date with 3 arguments: {day}{month}{year}.
\settitle{\thesistitle}{Plant Detection and State Classification with Machine Learning} % Sets English and German version of the title (both can be English or German). \settitle{\thesistitle}{Plant Detection and State Classification with Machine Learning} % Sets English and German version of the title (both can be English or German).
% Select the thesis type: bachelor / master / doctor / phd-school. % Select the thesis type: bachelor / master / doctor / phd-school.
@ -153,19 +153,73 @@ Challenge}
\addstatementpage \addstatementpage
\begin{danksagung*} \begin{danksagung*}
\todo{Ihr Text hier.} Ich danke vor allem dem Betreuer dieser Arbeit—Horst Eidenberger—für
die rasche und zeitnahe Bearbeitung meiner Anliegen sowie das
aussagekräftige Feedback, das ich laufend erhalten habe. Die von ihm
vorgegebene Roadmap erleichtert das Arbeiten nicht nur in der
Anfangsphase, sondern auch später ungemein.
Meine Eltern sind diejenigen, die mir das Studium überhaupt erst
ermöglicht haben. Die laufende Unterstützung auch in den späteren
Jahren—primär in Form von gemeinsamen Mittagessen—erlaubte es mir,
mich auf das zu konzentrieren, was wichtig ist.
Zu guter Letzt geht noch ein Dank an meine Studienkollegen, die für
Fragen, Diskussionen und zum Entspannen immer erreichbar
waren. Der gelegentliche Austausch bereitet mir immer Freude und ich
hoffe, dass wir dies auch in Zukunft aufrechterhalten können.
\end{danksagung*} \end{danksagung*}
\begin{acknowledgements*} \begin{acknowledgements*}
\todo{Enter your text here.} I would especially like to thank the supervisor of this thesis—Horst
Eidenberger—for the prompt processing of my requests as
well as the meaningful feedback I received on an ongoing basis. The
roadmap he provided not only made this thesis easier in the initial
phase but also later on.
My parents are the ones who made my studies possible in the first
place. The ongoing support, even in the later years—primarily in
the form of shared lunches—allowed me to concentrate on what is
important.
Last but not least, I would like to thank my fellow students, who
have always been available for questions, discussions and
relaxation. I always enjoy the occasional exchange and I hope that
we can continue to do so in the future.
\end{acknowledgements*} \end{acknowledgements*}
\begin{kurzfassung} \begin{kurzfassung}
\todo{Ihr Text hier.} Wassermangel in Zimmerpflanzen kann ihr Wachstum negativ
beeinflussen. Derzeitige Lösungen zur Überwachung von Wasserstress
sind hauptsächlich für landwirtschaftliche Anwendungen
vorgesehen. Wir präsentieren den ersten Deep-Learning-basierten
Prototyp zur Klassifizierung des Wasserstresslevels gängiger
Zimmerpflanzen. Unser zweistufiger Ansatz besteht aus einem
Erkennungs- und einem Klassifizierungsschritt und wird anhand eines
eigens erstellten Datensatzes evaluiert. Die Parameter des Modells
werden mit gängigen Methoden der Hyperparameteroptimierung
ausgewählt. Der Prototyp wird auf einem eingebetteten Computer
bereitgestellt, der eine autonome Pflanzenüberwachung
ermöglicht. Die Vorhersagen unseres Modells werden kontinuierlich
über eine API veröffentlicht, wodurch nachgelagerte
Bewässerungssysteme automatisch Zimmerpflanzen ohne menschliche
Intervention bewässern können. Unser optimiertes Modell erreicht
einen mAP-Wert von \num{0.3838}.
\end{kurzfassung} \end{kurzfassung}
\begin{abstract} \begin{abstract}
\todo{Enter your text here.} Water deficiency in household plants can adversely affect
growth. Existing solutions to monitor water stress are primarily
intended for agricultural contexts. We present the first
deep-learning-based prototype to classify water stress of common
household plants. Our two-stage approach consists of a detection and
a classification step and is evaluated on a new dataset. The model
parameters are optimized with a hyperparameter search. The prototype
is deployed to an embedded device enabling autonomous plant
monitoring. The predictions of our model are published continuously
via an API, allowing downstream watering systems to automatically
water household plants without human intervention. Our optimized
model achieves an mAP of \num{0.3838} on unseen images.
\end{abstract} \end{abstract}
% Select the language of the thesis, e.g., english or naustrian. % Select the language of the thesis, e.g., english or naustrian.
@ -389,7 +443,7 @@ problem further so that additional requirements which arise during
development can be incorporated properly. development can be incorporated properly.
The prototyping process is embedded within the concepts of the The prototyping process is embedded within the concepts of the
\emph{Scientific Method}. This thesis not only produces a prototype, \emph{Scientific Method}. This thesis not only produces a prototype
but also explores the problem of plant detection and classification but also explores the problem of plant detection and classification
scientifically. Exploration of the problem requires making falsifiable scientifically. Exploration of the problem requires making falsifiable
hypotheses (see section~\ref{sec:motivation}), gathering empirical hypotheses (see section~\ref{sec:motivation}), gathering empirical
@ -652,7 +706,7 @@ The simplest activation function is the identity function. It is defined as
If all layers in an artificial neural network use the identity If all layers in an artificial neural network use the identity
activation function, the network is equivalent to a single-layer activation function, the network is equivalent to a single-layer
structure. The identity function is often used for layers which do not structure. The identity function is often used for layers which do not
need an activation function per se, but require one to uphold need an activation function per se but require one to uphold
consistency with the rest of the network structure. consistency with the rest of the network structure.
\subsubsection{Heaviside Step} \subsubsection{Heaviside Step}
@ -865,11 +919,11 @@ network for an input-output pair. The algorithm computes the gradient
iteratively starting from the last layer and works its way backward iteratively starting from the last layer and works its way backward
through the network until it reaches the first layer. through the network until it reaches the first layer.
Strictly speaking, backpropagation only computes the gradient, but Strictly speaking, backpropagation only computes the gradient but does
does not determine how the gradient is used to learn the new not determine how the gradient is used to learn the new weights. Once
weights. Once the backpropagation algorithm has computed the gradient, the backpropagation algorithm has computed the gradient, that gradient
that gradient is passed to an algorithm which finds a local minimum of is passed to an algorithm which finds a local minimum of it. This step
it. This step is usually performed by some variant of gradient descent is usually performed by some variant of gradient descent
\cite{cauchy1847}. \cite{cauchy1847}.
\section{Object Detection} \section{Object Detection}
@ -1129,19 +1183,20 @@ on all levels. \glspl{fpn} are an important building block of many
state-of-the-art object detectors. state-of-the-art object detectors.
A \gls{fpn} first computes the feature pyramid bottom-up with a A \gls{fpn} first computes the feature pyramid bottom-up with a
scaling step of two. The lower levels capture less semantic information scaling step of two. The lower levels capture less semantic
than the higher levels, but include more spatial information due to information than the higher levels but include more spatial
the higher granularity. In a second step, the \gls{fpn} upsamples the information due to the higher granularity. In a second step, the
higher levels such that the dimensions of two consecutive layers are \gls{fpn} upsamples the higher levels such that the dimensions of two
the same. The upsampled top layer is merged with the layer beneath it consecutive layers are the same. The upsampled top layer is merged
via element-wise addition and convolved with a one by one with the layer beneath it via element-wise addition and convolved with
convolutional layer to reduce channel dimensions and to smooth out a one by one convolutional layer to reduce channel dimensions and to
potential artifacts introduced during the upsampling step. The results smooth out potential artifacts introduced during the upsampling
of that operation constitute the new \emph{top layer} and the process step. The results of that operation constitute the new \emph{top
continues with the layer below it until the finest resolution feature layer} and the process continues with the layer below it until the
map is generated. In this way, the features of the different layers at finest resolution feature map is generated. In this way, the features
different scales are fused to obtain a feature map with high semantic of the different layers at different scales are fused to obtain a
information but also high spatial information. feature map with high semantic information but also high spatial
information.
\textcite{lin2017} report results on \gls{coco} with a \gls{map}@0.5 \textcite{lin2017} report results on \gls{coco} with a \gls{map}@0.5
of 59.1\% with a Faster R-\gls{cnn} structure and a ResNet-101 of 59.1\% with a Faster R-\gls{cnn} structure and a ResNet-101
@ -1184,7 +1239,7 @@ architecture.
\gls{ssd} was proposed by \textcite{liu2016} and functions similarly \gls{ssd} was proposed by \textcite{liu2016} and functions similarly
to \gls{yolo} in that it does not need an extra proposal generation to \gls{yolo} in that it does not need an extra proposal generation
step, but instead detects and classifies objects in one go. The aim of step but instead detects and classifies objects in one go. The aim of
one-stage detectors is to be considerably faster and at least as one-stage detectors is to be considerably faster and at least as
accurate as two-stage detectors. While \gls{yolo} paved the way for accurate as two-stage detectors. While \gls{yolo} paved the way for
one-stage detectors, the detection accuracy is significantly lower one-stage detectors, the detection accuracy is significantly lower
@ -1200,7 +1255,7 @@ last fully-connected layer removed such that the output of the
\gls{cnn} is a scaled down representation of the input image. The \gls{cnn} is a scaled down representation of the input image. The
extra layers are intended to capture features at different scales and extra layers are intended to capture features at different scales and
compare them during training to a range of default anchor boxes. This compare them during training to a range of default anchor boxes. This
idea comes from MultiBox \cite{erhan2014}, but is implemented in idea comes from MultiBox \cite{erhan2014} but is implemented in
\gls{ssd} with a slight twist: during matching of default boxes to the \gls{ssd} with a slight twist: during matching of default boxes to the
ground truth, boxes with a Jaccard overlap (\gls{iou}) of less than ground truth, boxes with a Jaccard overlap (\gls{iou}) of less than
$0.5$ are discarded. In one-stage detector terms, the feature $0.5$ are discarded. In one-stage detector terms, the feature
@ -1446,7 +1501,7 @@ rate of 1.6\% over their own replicated AlexNet result of 18.1\%.
GoogLeNet, also known as Inception v1, was proposed by GoogLeNet, also known as Inception v1, was proposed by
\textcite{szegedy2015} to increase the depth of the network without \textcite{szegedy2015} to increase the depth of the network without
introducing too much additional complexity. Since the relevant parts introducing too much additional complexity. Since the relevant parts
of an image can often be of different sizes, but kernels within of an image can often be of different sizes but kernels within
convolutional layers are fixed, there is a mismatch between what can convolutional layers are fixed, there is a mismatch between what can
realistically be detected by the layers and what is present in the realistically be detected by the layers and what is present in the
dataset. Therefore, the authors propose to perform multiple dataset. Therefore, the authors propose to perform multiple
@ -1521,7 +1576,7 @@ The authors of DenseNet \cite{huang2017} go one step further than
ResNets by connecting every convolutional layer to every other layer ResNets by connecting every convolutional layer to every other layer
in the chain. Previously, each layer was connected in sequence with in the chain. Previously, each layer was connected in sequence with
the one before and the one after it. Residual connections establish a the one before and the one after it. Residual connections establish a
link between the previous layer and the next one, but still do not link between the previous layer and the next one but still do not
always propagate enough information forward. These \emph{shortcut always propagate enough information forward. These \emph{shortcut
connections} from earlier layers to later layers are thus only taking connections} from earlier layers to later layers are thus only taking
place in an episodic way for short sections in the chain. DenseNets place in an episodic way for short sections in the chain. DenseNets
@ -1784,7 +1839,7 @@ strategies roughly goes through the following stages
condition is reached. condition is reached.
\end{enumerate} \end{enumerate}
This strategy is more efficient than grid search or random search, but This strategy is more efficient than grid search or random search but
requires a substantial amount of iterations for good solutions and can requires a substantial amount of iterations for good solutions and can
thus be too expensive for hyperparameter optimization thus be too expensive for hyperparameter optimization
\cite{bischl2023}. We use an evolution strategy based on a genetic \cite{bischl2023}. We use an evolution strategy based on a genetic
@ -2101,13 +2156,13 @@ set which only comprises individual plants and their associated
classification (\emph{stressed} or \emph{healthy}). classification (\emph{stressed} or \emph{healthy}).
Both datasets (object detection and classification) only allow us to Both datasets (object detection and classification) only allow us to
train and validate each model separately. A third dataset is needed train and validate each model separately. A third dataset is needed to
to evaluate the detection/classification pipeline as a whole. To this evaluate the detection/classification pipeline as a whole. To this
end, we construct our own dataset where all plants per image are end, we construct our own dataset where all plants per image are
labeled with bounding boxes as well as the classes \emph{stressed} or labeled with bounding boxes as well as the classes \emph{stressed} or
\emph{healthy}. This dataset is small in comparison to the one with \emph{healthy}. This dataset is small in comparison to the one with
which the object detection model is trained, but suffices because it which the object detection model is trained but suffices because it is
is only used for evaluation. Labeling each sample in the evaluation only used for evaluation. Labeling each sample in the evaluation
dataset manually is still a laborious task which is why each image is dataset manually is still a laborious task which is why each image is
\emph{preannotated} by the already existing object detection and \emph{preannotated} by the already existing object detection and
classification model. The task of labeling thus becomes a task of classification model. The task of labeling thus becomes a task of
@ -2120,9 +2175,9 @@ models.
In the following sections we will go into detail about the two In the following sections we will go into detail about the two
selected architectures for our prototype. The object detector we selected architectures for our prototype. The object detector we
chose---\gls{yolo}v7---is part of a larger family of models which all chose---\gls{yolo}v7---is part of a larger family of models which all
function similarly, but have undergone substantial changes from function similarly but have undergone substantial changes from version
version to version. In order to understand the used model, we trace to version. In order to understand the used model, we trace the
the improvements to the \gls{yolo} family from version one to version improvements to the \gls{yolo} family from version one to version
seven. For the classification stage, we have opted for a ResNet seven. For the classification stage, we have opted for a ResNet
architecture which is also described in detail. architecture which is also described in detail.
@ -2162,7 +2217,7 @@ output from a one by one layer. This approach reduces complexity
substantially---as has been demonstrated with GoogleNet. Every block of substantially---as has been demonstrated with GoogleNet. Every block of
convolutional layers is followed by a two by two maxpool layer for convolutional layers is followed by a two by two maxpool layer for
downsampling. The model expects an input image of size $448$ by $448$ downsampling. The model expects an input image of size $448$ by $448$
pixels, but has been pretrained on ImageNet with half that resolution pixels but has been pretrained on ImageNet with half that resolution
(i.e. $224$ by $224$ pixels). After the convolutional layers, the (i.e. $224$ by $224$ pixels). After the convolutional layers, the
authors add two fully-connected layers to produce an output of size authors add two fully-connected layers to produce an output of size
$7 \times 7 \times 30$. This output tensor is chosen because the \gls{voc} data $7 \times 7 \times 30$. This output tensor is chosen because the \gls{voc} data
@ -2227,7 +2282,7 @@ predefined boxes which are called \emph{anchor boxes}. The authors
note that finding a good set of prior anchor boxes by hand is note that finding a good set of prior anchor boxes by hand is
error-prone and suggest finding them via $k$-means clustering error-prone and suggest finding them via $k$-means clustering
(dimension priors). They select five anchor boxes per grid cell which (dimension priors). They select five anchor boxes per grid cell which
still results in high recall, but does not introduce too much still results in high recall but does not introduce too much
complexity. complexity.
These additional details result in an improved \gls{map} of 78.6\% on These additional details result in an improved \gls{map} of 78.6\% on
@ -2245,7 +2300,7 @@ structures of state-of-the-art models at the time, the authors
introduce a deeper feature extractor called Darknet-53. It makes use introduce a deeper feature extractor called Darknet-53. It makes use
of the residual connections popularized by ResNet \cite{he2016} (see of the residual connections popularized by ResNet \cite{he2016} (see
section~\ref{sssec:theory-resnet}). Darknet-53 is more accurate than section~\ref{sssec:theory-resnet}). Darknet-53 is more accurate than
Darknet-19 and compares to ResNet-101, but can process more images per Darknet-19 and compares to ResNet-101 but can process more images per
second (\qty{78}{fps} versus \qty{53}{fps}). The activation function second (\qty{78}{fps} versus \qty{53}{fps}). The activation function
throughout the network is still leaky \gls{relu}, as in earlier throughout the network is still leaky \gls{relu}, as in earlier
versions. versions.
@ -2320,7 +2375,7 @@ the \emph{n} stands for \emph{nano}. Additional versions with
increasing parameters are \gls{yolo}v5s (small), \gls{yolo}v5m increasing parameters are \gls{yolo}v5s (small), \gls{yolo}v5m
(medium), \gls{yolo}v5l (large), and \gls{yolo}v5x (extra large). The (medium), \gls{yolo}v5l (large), and \gls{yolo}v5x (extra large). The
smaller models are intended to be used in resource constrained smaller models are intended to be used in resource constrained
environments such as edge devices, but come with a cost in environments such as edge devices but come with a cost in
accuracy. Conversely, the larger models are for tasks where high accuracy. Conversely, the larger models are for tasks where high
accuracy is paramount and enough computational resources are accuracy is paramount and enough computational resources are
available. The \gls{yolo}v5x model achieves a \gls{map} of 50.7\% on available. The \gls{yolo}v5x model achieves a \gls{map} of 50.7\% on
@ -2570,7 +2625,7 @@ loss from figure~\ref{fig:box-obj-loss}, we speculate that the
pre-trained model already generalizes well to plant detection because pre-trained model already generalizes well to plant detection because
one of the categories in the \gls{coco} \cite{lin2015} dataset is one of the categories in the \gls{coco} \cite{lin2015} dataset is
\emph{potted plant}. Any further training solely impacts the \emph{potted plant}. Any further training solely impacts the
confidence of detection, but does not lead to higher detection confidence of detection but does not lead to higher detection
rates. This conclusion is supported by the increasing rates. This conclusion is supported by the increasing
\gls{map}@0.5:0.95 until epoch \num{133}. \gls{map}@0.5:0.95 until epoch \num{133}.
@ -2668,10 +2723,10 @@ percentage points higher than in the non-optimized version.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics{graphics/precision_recall_final.pdf} \includegraphics{graphics/precision_recall_final.pdf}
\caption[Hyper-parameter optimized object detection precision and \caption[Hyperparameter optimized object detection precision and
recall during training.]{Overall precision and recall during recall during training]{Overall precision and recall during training
training for each epoch of the optimized model. The vertical gray for each epoch of the optimized model. The vertical gray line at
line at \num{27} marks the epoch with the highest fitness.} \num{27} marks the epoch with the highest fitness.}
\label{fig:hyp-opt-prec-rec} \label{fig:hyp-opt-prec-rec}
\end{figure} \end{figure}
@ -2684,8 +2739,8 @@ non-optimized version and recall hovers at the same levels.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics{graphics/val_box_obj_loss_final.pdf} \includegraphics{graphics/val_box_obj_loss_final.pdf}
\caption[Hyper-parameter optimized object detection box and object \caption[Hyperparameter optimized object detection box and object
loss.]{Box and object loss measured against the validation set of loss]{Box and object loss measured against the validation set of
\num{3091} images and \num{4092} ground truth labels. The class \num{3091} images and \num{4092} ground truth labels. The class
loss is omitted because there is only one class in the dataset and loss is omitted because there is only one class in the dataset and
the loss is therefore always zero.} the loss is therefore always zero.}
@ -2925,13 +2980,13 @@ plant}, \emph{wilted plant} and \emph{stressed plant}. Images which
clearly show one or multiple plants with some amount of visible stress clearly show one or multiple plants with some amount of visible stress
were added to the dataset. Care was taken to include plants with were added to the dataset. Care was taken to include plants with
various degrees of stress and in various locations and lighting various degrees of stress and in various locations and lighting
conditions. The search not only provided images of stressed plants, conditions. The search not only provided images of stressed plants but
but also of healthy plants. The dataset is biased towards potted also of healthy plants. The dataset is biased towards potted plants
plants which are commonly put on display in western which are commonly put on display in western households. Furthermore,
households. Furthermore, many plants, such as succulents, are sought many plants, such as succulents, are sought after for home
after for home environments because of their ease of maintenance. Due environments because of their ease of maintenance. Due to their
to their inclusion in the dataset and how they exhibit water stress, inclusion in the dataset and how they exhibit water stress, the test
the test set contains a wide variety of scenarios. set contains a wide variety of scenarios.
After collecting the images, the aggregate model was run on them to After collecting the images, the aggregate model was run on them to
obtain initial bounding boxes and classifications for ground truth obtain initial bounding boxes and classifications for ground truth
@ -3054,8 +3109,8 @@ from table~\ref{tab:yolo-metrics-hyp} are better, the
\begin{figure} \begin{figure}
\centering \centering
\includegraphics{graphics/APpt5-pt95-final.pdf} \includegraphics{graphics/APpt5-pt95-final.pdf}
\caption[Hyper-parameter optimized object detection AP@0.5 and \caption[Hyperparameter optimized object detection AP@0.5 and
AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of AP@0.95]{Precision-recall curves for \gls{iou} thresholds of
\num{0.5} and \num{0.95}. The \gls{ap} of a specific threshold is \num{0.5} and \num{0.95}. The \gls{ap} of a specific threshold is
defined as the area under the precision-recall curve of that defined as the area under the precision-recall curve of that
threshold. The \gls{map} across \gls{iou} thresholds from threshold. The \gls{map} across \gls{iou} thresholds from
@ -3263,7 +3318,7 @@ therefore the cutoff for either class.
So far the metrics shown in table~\ref{tab:model-metrics} are obtained So far the metrics shown in table~\ref{tab:model-metrics} are obtained
with the non-optimized versions of both the object detection and with the non-optimized versions of both the object detection and
classification model. Hyper-parameter optimization of the classifier classification model. Hyperparameter optimization of the classifier
led to significant model improvements, while the object detector has led to significant model improvements, while the object detector has
improved precision but lower recall and slightly lower \gls{map} improved precision but lower recall and slightly lower \gls{map}
values. To evaluate the final aggregate model which consists of the values. To evaluate the final aggregate model which consists of the
@ -3298,7 +3353,7 @@ class. Precision for the stressed class is lower with the optimized
model, but recall is significantly higher (\num{0.502} model, but recall is significantly higher (\num{0.502}
vs. \num{0.623}). The higher recall results in a three percentage vs. \num{0.623}). The higher recall results in a three percentage
point gain for the $\mathrm{F}_1$-score in the stressed point gain for the $\mathrm{F}_1$-score in the stressed
class. Overall, precision is the same but recall has improved class. Overall, precision is the same, but recall has improved
significantly, which also results in a noticeable improvement for the significantly, which also results in a noticeable improvement for the
average $\mathrm{F}_1$-score across both classes. average $\mathrm{F}_1$-score across both classes.
@ -3340,7 +3395,7 @@ optimal for our purposes. The class \emph{plant} does not seem to have
been defined rigorously. Large patches of grass, for example, are
labeled with large bounding boxes. Trees are sometimes labeled, but
only if their size suggests that they could be bushes or similar types
of plant. Large corn fields are also labeled as plants but again with
one large bounding box. If multiple plants are densely packed, the
annotators often label them as belonging to one plant and thus one
bounding box. Sometimes the effort has been made to delineate plants
\chapter{Conclusion} \chapter{Conclusion}
\label{chap:conclusion} \label{chap:conclusion}
In this thesis, we have developed a prototype system for plant
detection and classification using a machine learning model deployed
on an edge device. The model consists of a two-stage approach wherein
the first stage detects plants and the second stage classifies
them. This approach has been chosen because of the limited
availability of data to train one model end-to-end and comes with
downsides such as an increased error rate and additional training,
optimization, and evaluation complexity. Despite these downsides, the
prototype performs well in the homeowner context where the variety of
plants is limited. This conclusion is supported by the metrics
discussed in chapter~\ref{chap:evaluation}.
The optimization of the model has been shown to require a substantial
amount of computational resources and proved to be difficult to get
right. The object detection model in particular needs many iterations
during the hyperparameter search to converge to a global optimum. We
attribute these difficulties to the model complexity of the \gls{yolo}
series and the numerous hyperparameters which are available. The
classifier, however, is comparatively simpler from an architectural
standpoint and lends itself more easily to optimization.
Revisiting the research questions posed in
section~\ref{sec:motivation}, we can now assess the extent to which
our findings have addressed them.
\begin{enumerate}
\item \emph{How well does the model work in theory and how well in
practice?}
The optimized model achieves a \gls{map} of \num{0.3838}, which
suggests that the prototype works well on unseen data. The plant
detection is robust, particularly for household plants, and the
classifier shows strong performance for a wide array of common
plants. Contrary to our expectations, the stress classification is
not more difficult than the detection step. In fact, the problems we
encountered during the optimization of the detection model are
likely to stem from the increased complexity of the detection versus
the classification task. The various different ways in which plants
show water stress does not seem to be a limiting factor for stress
classification.
\item \emph{What are possible reasons for it to work/not work?}
We have demonstrated possible reasons for why either the constituent
models or the aggregate model underperform. In general, we conclude
that the prototype does work and can be used within the context
established in chapter~\ref{chap:introduction}. Our expectation that
dataset curation will play a major role in successfully implementing
the prototype turned out to be true. For example, some of the
problems with the plant detection model can be attributed to the
inconsistent labeling information present in the \gls{oid}. Care had
to be taken during the creation of the dataset the aggregate model
was evaluated on not to introduce a bias which favors the
predictions.
\item \emph{What are possible improvements to the system in the
future?}
Specific improvements to the prototype include curating bigger
datasets to train on, running the hyperparameter optimization for
more iterations and more epochs per iteration, and including experts
such as botanists to establish higher confidence in the ground
truth. Unfortunately, the first two suggestions result in a
significantly higher computational cost during training,
optimization, and evaluation. This observation applies to most
machine learning models, and there is always a trade-off between
model performance and training/optimization time.
\end{enumerate}
\section{Future Work} \section{Future Work}
\label{sec:future-work} \label{sec:future-work}
An interesting further research direction for plant detection and
classification is exploring the viability of single-stage
approaches. Even though our two-stage approach leads to acceptable
results, we believe that incorporating the classification step into
the plant detection step would likely yield better results. A unified
single-stage approach does not fully deal with the problem of
propagated errors but should substantially reduce it. An advantage of
this approach is that the resulting model could be optimized more
easily because the loss function is dependent on object detection as
well as classification. A disadvantage, however—and this is the
reason why we have not adopted such an approach—is that a unified
model also needs large datasets it can be trained on.
Additional datasets to train a plant detection and classification
model on are needed. While we were able to find separate datasets to
train the individual models on, it also meant that we were not able to
implement the aforementioned single-stage approach. If there is enough
interest in this research problem, it should be possible to create
large datasets which encode expert knowledge. Since there is such a
variety of plants and of ways in which they express nutrient
deficiencies or illnesses, only experts are able to create correct
ground truth labels
for all or most of them. In the limited context of this thesis, we
were able to label common household plants with additional information
from the Internet. As soon as more exotic plants are added to the
datasets, layman knowledge reaches its limits. Having more and better
ground truth labels should result in better detection and
classification performance as well as a more robust evaluation.
Future research could add additional information to the datasets such
that models are able to work with more data. For example, including
images of plants in the infrared spectrum would provide a visual
measure of evapotranspiration. This additional perspective might allow
the model to better discriminate between stressed and non-stressed
plants. Other valuable perspectives could be provided by sensor data
which track soil moisture, humidity, and radiant flux. Although this
has been done in single-plant agricultural settings, it has not been
tried in a multi-plant household context.
\backmatter \backmatter

Binary file not shown.