Final version of thesis
This commit is contained in:
parent
ee8f4dec85
commit
a6ed18303e
@ -244,7 +244,7 @@
 @online{chan2020,
   title = {Healthy and {{Wilted Houseplant Images}}},
   author = {Chan, Russell},
-  date = {2020-01-17},
+  date = {2020-01},
   url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images},
   urldate = {2023-12-08},
   abstract = {A collection of 904 houseplant images, classified as either healthy or wilted},
@ -1422,7 +1422,6 @@
  pages = {71--83},
  publisher = {{World Scientific Publishing Co.}},
  issn = {2301-3850},
  doi = {10.1142/S2301385020500053},
  keywords = {Area-wise classification,Support Vector Machine (SVM),Unmanned Aerial Vehicle (UAV),wheat drought mapping},
  file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
 }

Binary file not shown.
@ -1,6 +1,6 @@
 % Copyright (C) 2014-2020 by Thomas Auzinger <thomas@auzinger.name>

-\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information.
+\documentclass[final]{vutinfth} % Remove option 'final' to obtain debug information.

 % Load packages to allow in- and output of non-ASCII characters.
 \usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters.
@ -51,7 +51,7 @@
 \nonzeroparskip % Create space between paragraphs (optional).
 \setlength{\parindent}{0pt} % Remove paragraph indentation (optional).

-\setcounter{tocdepth}{3}
+\setcounter{tocdepth}{2}

 \makeindex % Use an optional index.
 \makeglossaries % Use an optional glossary.
@ -64,7 +64,7 @@
 \setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male}

 \setregnumber{01527193}
-\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}.
+\setdate{27}{12}{2023} % Set date with 3 arguments: {day}{month}{year}.
 \settitle{\thesistitle}{Plant Detection and State Classification with Machine Learning} % Sets English and German version of the title (both can be English or German).

 % Select the thesis type: bachelor / master / doctor / phd-school.
@ -153,19 +153,73 @@ Challenge}
 \addstatementpage

 \begin{danksagung*}
-\todo{Ihr Text hier.}
+Ich danke vor allem dem Betreuer dieser Arbeit—Horst Eidenberger—für
+die rasche und zeitnahe Bearbeitung meiner Anliegen sowie das
+aussagekräftige Feedback, das ich laufend erhalten habe. Die von ihm
+vorgegebene Roadmap erleichtert das Arbeiten nicht nur in der
+Anfangsphase, sondern auch später ungemein.
+
+Meine Eltern sind diejenigen, die mir das Studium überhaupt erst
+ermöglicht haben. Die laufende Unterstützung auch in den späteren
+Jahren—primär in Form von gemeinsamen Mittagessen—ließ mich auf das
+konzentrieren, was wichtig ist.
+
+Zu guter Letzt geht noch ein Dank an meine Studienkollegen, die bei
+Fragen, zu Diskussionen und zum Entspannen immer erreichbar
+waren. Der gelegentliche Austausch macht mir immer Freude und ich
+hoffe, dass wir dies auch in Zukunft aufrechterhalten können.
 \end{danksagung*}

 \begin{acknowledgements*}
-\todo{Enter your text here.}
+I would especially like to thank the supervisor of this thesis—Horst
+Eidenberger—for the prompt processing of my requests as well as the
+meaningful feedback I received on an ongoing basis. The roadmap he
+provided not only made this thesis easier in the initial phase but
+also later on.
+
+My parents are the ones who made my studies possible in the first
+place. The ongoing support, even in the later years—primarily in
+the form of shared lunches—allowed me to concentrate on what is
+important.
+
+Last but not least, I would like to thank my fellow students, who
+have always been available for questions, discussions and
+relaxation. I always enjoy the occasional exchange and I hope that
+we can continue to do so in the future.
 \end{acknowledgements*}

 \begin{kurzfassung}
-\todo{Ihr Text hier.}
+Wassermangel in Zimmerpflanzen kann ihr Wachstum negativ
+beeinflussen. Derzeitige Lösungen zur Überwachung von Wasserstress
+sind hauptsächlich für landwirtschaftliche Anwendungen
+vorgesehen. Wir präsentieren den ersten Deep-Learning-basierten
+Prototyp zur Klassifizierung des Wasserstresslevels gängiger
+Zimmerpflanzen. Unser zweistufiger Ansatz besteht aus einem
+Erkennungs- und einem Klassifizierungsschritt und wird anhand eines
+eigens erstellten Datensatzes evaluiert. Die Parameter des Modells
+werden mit gängigen Methoden der Hyperparameteroptimierung
+ausgewählt. Der Prototyp wird auf einem embedded Computer
+bereitgestellt, der eine autonome Pflanzenüberwachung
+ermöglicht. Die Vorhersagen unseres Modells werden kontinuierlich
+über eine API veröffentlicht, wodurch nachgelagerte
+Bewässerungssysteme automatisch Zimmerpflanzen ohne menschliche
+Intervention bewässern können. Unser optimiertes Modell erreicht
+einen mAP-Wert von \num{0.3838}.
 \end{kurzfassung}

 \begin{abstract}
-\todo{Enter your text here.}
+Water deficiency in household plants can adversely affect
+growth. Existing solutions to monitor water stress are primarily
+intended for agricultural contexts. We present the first
+deep-learning-based prototype to classify water stress of common
+household plants. Our two-stage approach consists of a detection and
+a classification step and is evaluated on a new dataset. The model
+parameters are optimized with a hyperparameter search. The prototype
+is deployed to an embedded device enabling autonomous plant
+monitoring. The predictions of our model are published continuously
+via an API, allowing downstream watering systems to automatically
+water household plants without human intervention. Our optimized
+model achieves an mAP of \num{0.3838} on unseen images.
 \end{abstract}

 % Select the language of the thesis, e.g., english or naustrian.
@ -389,7 +443,7 @@ problem further so that additional requirements which arise during
 development can be incorporated properly.

 The prototyping process is embedded within the concepts of the
-\emph{Scientific Method}. This thesis not only produces a prototype,
+\emph{Scientific Method}. This thesis not only produces a prototype
 but also explores the problem of plant detection and classification
 scientifically. Exploration of the problem requires making falsifiable
 hypotheses (see section~\ref{sec:motivation}), gathering empirical
@ -652,7 +706,7 @@ The simplest activation function is the identity function. It is defined as
 If all layers in an artificial neural network use the identity
 activation function, the network is equivalent to a single-layer
 structure. The identity function is often used for layers which do not
-need an activation function per se, but require one to uphold
+need an activation function per se but require one to uphold
 consistency with the rest of the network structure.
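The equivalence to a single-layer structure mentioned above follows from the associativity of matrix multiplication and can be checked numerically. This is an illustrative sketch only (shapes and random values are arbitrary, biases omitted), not code from the thesis:

```python
import numpy as np

rng = np.random.default_rng(0)

# Two dense layers whose activation is the identity (biases omitted).
W1 = rng.normal(size=(4, 8))
W2 = rng.normal(size=(8, 3))

x = rng.normal(size=(1, 4))

# Forward pass through both layers: identity(x @ W1) @ W2.
two_layer = (x @ W1) @ W2

# The same mapping expressed as one layer with the combined weights.
single_layer = x @ (W1 @ W2)

assert np.allclose(two_layer, single_layer)
```

The same argument extends to any number of stacked identity-activated layers.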

 \subsubsection{Heaviside Step}
@ -865,11 +919,11 @@ network for an input-output pair. The algorithm computes the gradient
 iteratively starting from the last layer and works its way backward
 through the network until it reaches the first layer.

-Strictly speaking, backpropagation only computes the gradient, but
-does not determine how the gradient is used to learn the new
-weights. Once the backpropagation algorithm has computed the gradient,
-that gradient is passed to an algorithm which finds a local minimum of
-it. This step is usually performed by some variant of gradient descent
+Strictly speaking, backpropagation only computes the gradient but does
+not determine how the gradient is used to learn the new weights. Once
+the backpropagation algorithm has computed the gradient, that gradient
+is passed to an algorithm which uses it to find a local minimum of the
+loss. This step is usually performed by some variant of gradient descent
 \cite{cauchy1847}.
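The division of labor described above (backpropagation supplies the gradient; a separate optimizer uses it) can be sketched as follows. Here `grad_fn` stands in for a backpropagation pass and the quadratic loss is a toy assumption, not the thesis's training setup:

```python
import numpy as np

def gradient_descent(grad_fn, w, lr=0.1, steps=100):
    """Repeatedly step against the gradient returned by grad_fn
    (in a real network, computed by backpropagation)."""
    for _ in range(steps):
        w = w - lr * grad_fn(w)
    return w

# Toy loss L(w) = ||w||^2 with gradient 2w; its minimum is at w = 0.
w_opt = gradient_descent(lambda w: 2 * w, np.array([3.0, -2.0]))
```

In practice the variant would be stochastic gradient descent with momentum or a relative, but the update rule has the same shape.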

 \section{Object Detection}
@ -1129,19 +1183,20 @@ on all levels. \glspl{fpn} are an important building block of many
 state-of-the-art object detectors.

 A \gls{fpn} first computes the feature pyramid bottom-up with a
-scaling step of two. The lower levels capture less semantic information
-than the higher levels, but include more spatial information due to
-the higher granularity. In a second step, the \gls{fpn} upsamples the
-higher levels such that the dimensions of two consecutive layers are
-the same. The upsampled top layer is merged with the layer beneath it
-via element-wise addition and convolved with a one by one
-convolutional layer to reduce channel dimensions and to smooth out
-potential artifacts introduced during the upsampling step. The results
-of that operation constitute the new \emph{top layer} and the process
-continues with the layer below it until the finest resolution feature
-map is generated. In this way, the features of the different layers at
-different scales are fused to obtain a feature map with high semantic
-information but also high spatial information.
+scaling step of two. The lower levels capture less semantic
+information than the higher levels but include more spatial
+information due to the higher granularity. In a second step, the
+\gls{fpn} upsamples the higher levels such that the dimensions of two
+consecutive layers are the same. The upsampled top layer is merged
+with the layer beneath it via element-wise addition and convolved with
+a one by one convolutional layer to reduce channel dimensions and to
+smooth out potential artifacts introduced during the upsampling
+step. The results of that operation constitute the new \emph{top
+layer} and the process continues with the layer below it until the
+finest resolution feature map is generated. In this way, the features
+of the different layers at different scales are fused to obtain a
+feature map with high semantic information but also high spatial
+information.
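A minimal sketch of one top-down merge step as described in the paragraph: nearest-neighbour upsampling by two, element-wise addition, and a one by one convolution modeled as a per-pixel linear map over channels. The array shapes and random weights are illustrative assumptions, not the dimensions of any real detector:

```python
import numpy as np

def upsample2x(x):
    """Nearest-neighbour upsampling with a scaling step of two."""
    return x.repeat(2, axis=0).repeat(2, axis=1)

def conv1x1(x, w):
    """A one by one convolution is a linear map applied at every pixel."""
    return x @ w  # (H, W, C_in) @ (C_in, C_out) -> (H, W, C_out)

rng = np.random.default_rng(0)
c = 8  # channel dimension shared across pyramid levels

# Bottom-up pyramid: each level halves the spatial resolution.
p3 = rng.normal(size=(8, 8, c))  # finer level, more spatial detail
p4 = rng.normal(size=(4, 4, c))  # coarser level, more semantic detail

# Top-down step: upsample the coarse level, merge by element-wise
# addition, then smooth with a one by one convolution.
w = rng.normal(size=(c, c))
merged = conv1x1(p3 + upsample2x(p4), w)

assert merged.shape == (8, 8, c)
```

The `merged` map would then serve as the new top layer for the next, finer merge step.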

 \textcite{lin2017} report results on \gls{coco} with a \gls{map}@0.5
 of 59.1\% with a Faster R-\gls{cnn} structure and a ResNet-101
@ -1184,7 +1239,7 @@ architecture.

 \gls{ssd} was proposed by \textcite{liu2016} and functions similarly
 to \gls{yolo} in that it does not need an extra proposal generation
-step, but instead detects and classifies objects in one go. The aim of
+step but instead detects and classifies objects in one go. The aim of
 one-stage detectors is to be considerably faster and at least as
 accurate as two-stage detectors. While \gls{yolo} paved the way for
 one-stage detectors, the detection accuracy is significantly lower
@ -1200,7 +1255,7 @@ last fully-connected layer removed such that the output of the
 \gls{cnn} is a scaled down representation of the input image. The
 extra layers are intended to capture features at different scales and
 compare them during training to a range of default anchor boxes. This
-idea comes from MultiBox \cite{erhan2014}, but is implemented in
+idea comes from MultiBox \cite{erhan2014} but is implemented in
 \gls{ssd} with a slight twist: during matching of default boxes to the
 ground truth, boxes with a Jaccard overlap (\gls{iou}) of less than
 $0.5$ are discarded. In one-stage detector terms, the feature
@ -1446,7 +1501,7 @@ rate of 1.6\% over their own replicated AlexNet result of 18.1\%.

 GoogLeNet, also known as Inception v1, was proposed by
 \textcite{szegedy2015} to increase the depth of the network without
 introducing too much additional complexity. Since the relevant parts
-of an image can often be of different sizes, but kernels within
+of an image can often be of different sizes but kernels within
 convolutional layers are fixed, there is a mismatch between what can
 realistically be detected by the layers and what is present in the
 dataset. Therefore, the authors propose to perform multiple
@ -1521,7 +1576,7 @@ The authors of DenseNet \cite{huang2017} go one step further than
 ResNets by connecting every convolutional layer to every other layer
 in the chain. Previously, each layer was connected in sequence with
 the one before and the one after it. Residual connections establish a
-link between the previous layer and the next one, but still do not
+link between the previous layer and the next one but still do not
 always propagate enough information forward. These \emph{shortcut
 connections} from earlier layers to later layers are thus only taking
 place in an episodic way for short sections in the chain. DenseNets
@ -1784,7 +1839,7 @@ strategies roughly goes through the following stages
   condition is reached.
 \end{enumerate}
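The stages enumerated above can be sketched as a toy evolution strategy. The hyperparameter names, the surrogate fitness function, and all constants are illustrative assumptions, not the search actually run in this thesis:

```python
import random

random.seed(0)

def fitness(hyp):
    """Surrogate fitness (e.g. validation mAP); peaks at lr=0.01, momentum=0.9."""
    return -((hyp["lr"] - 0.01) ** 2 + (hyp["momentum"] - 0.9) ** 2)

def mutate(hyp, scale=0.2):
    """Perturb each hyperparameter multiplicatively."""
    return {k: v * random.uniform(1 - scale, 1 + scale) for k, v in hyp.items()}

# 1. Initialize a population of candidate hyperparameter sets.
population = [mutate({"lr": 0.05, "momentum": 0.5}, scale=0.9) for _ in range(20)]

# 2.-4. Evaluate, select the fittest parents, mutate to produce
# offspring, and repeat until a stopping condition (here: a fixed
# generation budget). Keeping the parents implements elitism.
for _ in range(30):
    parents = sorted(population, key=fitness, reverse=True)[:5]
    population = parents + [mutate(random.choice(parents)) for _ in range(15)]

best = max(population, key=fitness)
```

In the real search, evaluating `fitness` means training the model for a number of epochs, which is what makes this strategy expensive.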

-This strategy is more efficient than grid search or random search, but
+This strategy is more efficient than grid search or random search but
 requires a substantial number of iterations for good solutions and can
 thus be too expensive for hyperparameter optimization
 \cite{bischl2023}. We use an evolution strategy based on a genetic
@ -2101,13 +2156,13 @@ set which only comprises individual plants and their associated
 classification (\emph{stressed} or \emph{healthy}).

 Both datasets (object detection and classification) only allow us to
-train and validate each model separately. A third dataset is needed
-to evaluate the detection/classification pipeline as a whole. To this
+train and validate each model separately. A third dataset is needed to
+evaluate the detection/classification pipeline as a whole. To this
 end, we construct our own dataset where all plants per image are
 labeled with bounding boxes as well as the classes \emph{stressed} or
 \emph{healthy}. This dataset is small in comparison to the one with
-which the object detection model is trained, but suffices because it
-is only used for evaluation. Labeling each sample in the evaluation
+which the object detection model is trained but suffices because it is
+only used for evaluation. Labeling each sample in the evaluation
 dataset manually is still a laborious task which is why each image is
 \emph{preannotated} by the already existing object detection and
 classification model. The task of labeling thus becomes a task of
@ -2120,9 +2175,9 @@ models.
 In the following sections we will go into detail about the two
 selected architectures for our prototype. The object detector we
 chose---\gls{yolo}v7---is part of a larger family of models which all
-function similarly, but have undergone substantial changes from
-version to version. In order to understand the used model, we trace
-the improvements to the \gls{yolo} family from version one to version
+function similarly but have undergone substantial changes from version
+to version. In order to understand the used model, we trace the
+improvements to the \gls{yolo} family from version one to version
 seven. For the classification stage, we have opted for a ResNet
 architecture which is also described in detail.

@ -2162,7 +2217,7 @@ output from a one by one layer. This approach reduces complexity
 substantially---as has been demonstrated with GoogLeNet. Every block of
 convolutional layers is followed by a two by two maxpool layer for
 downsampling. The model expects an input image of size $448$ by $448$
-pixels, but has been pretrained on ImageNet with half that resolution
+pixels but has been pretrained on ImageNet with half that resolution
 (i.e. $224$ by $224$ pixels). After the convolutional layers, the
 authors add two fully-connected layers to produce an output of size
 $7 \times 7 \times 30$. This output tensor is chosen because the \gls{voc} data
@ -2227,7 +2282,7 @@ predefined boxes which are called \emph{anchor boxes}. The authors
 note that finding a good set of prior anchor boxes by hand is
 error-prone and suggest finding them via $k$-means clustering
 (dimension priors). They select five anchor boxes per grid cell which
-still results in high recall, but does not introduce too much
+still results in high recall but does not introduce too much
 complexity.
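The dimension-priors idea can be sketched as $k$-means over ground-truth box dimensions, using IoU rather than Euclidean distance so that large boxes do not dominate the clustering. The box data below is random and purely illustrative:

```python
import numpy as np

def iou_wh(wh, centroids):
    """IoU between one (w, h) pair and centroid (w, h) pairs, treating
    all boxes as if they shared the same top-left corner."""
    inter = np.minimum(wh[0], centroids[:, 0]) * np.minimum(wh[1], centroids[:, 1])
    union = wh[0] * wh[1] + centroids[:, 0] * centroids[:, 1] - inter
    return inter / union

def kmeans_anchors(boxes, k=5, iters=50, seed=0):
    """k-means over box dimensions with 1 - IoU as the distance,
    yielding k anchor boxes (dimension priors)."""
    rng = np.random.default_rng(seed)
    centroids = boxes[rng.choice(len(boxes), k, replace=False)]
    for _ in range(iters):
        # Assign each box to the centroid with the highest IoU.
        assign = np.array([np.argmax(iou_wh(b, centroids)) for b in boxes])
        for j in range(k):
            if np.any(assign == j):
                centroids[j] = boxes[assign == j].mean(axis=0)
    return centroids

rng = np.random.default_rng(1)
boxes = rng.uniform(0.05, 1.0, size=(200, 2))  # normalized (w, h) pairs
anchors = kmeans_anchors(boxes, k=5)

assert anchors.shape == (5, 2)
```

Each of the resulting `anchors` plays the role of one prior box per grid cell.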

 These additional details result in an improved \gls{map} of 78.6\% on
@ -2245,7 +2300,7 @@ structures of state-of-the-art models at the time, the authors
 introduce a deeper feature extractor called Darknet-53. It makes use
 of the residual connections popularized by ResNet \cite{he2016} (see
 section~\ref{sssec:theory-resnet}). Darknet-53 is more accurate than
-Darknet-19 and compares to ResNet-101, but can process more images per
+Darknet-19 and compares to ResNet-101 but can process more images per
 second (\qty{78}{fps} versus \qty{53}{fps}). The activation function
 throughout the network is still leaky \gls{relu}, as in earlier
 versions.
@ -2320,7 +2375,7 @@ the \emph{n} stands for \emph{nano}. Additional versions with
 increasing parameters are \gls{yolo}v5s (small), \gls{yolo}v5m
 (medium), \gls{yolo}v5l (large), and \gls{yolo}v5x (extra large). The
 smaller models are intended to be used in resource-constrained
-environments such as edge devices, but come with a cost in
+environments such as edge devices but come with a cost in
 accuracy. Conversely, the larger models are for tasks where high
 accuracy is paramount and enough computational resources are
 available. The \gls{yolo}v5x model achieves a \gls{map} of 50.7\% on
@ -2570,7 +2625,7 @@ loss from figure~\ref{fig:box-obj-loss}, we speculate that the
 pre-trained model already generalizes well to plant detection because
 one of the categories in the \gls{coco} \cite{lin2015} dataset is
 \emph{potted plant}. Any further training solely impacts the
-confidence of detection, but does not lead to higher detection
+confidence of detection but does not lead to higher detection
 rates. This conclusion is supported by the increasing
 \gls{map}@0.5:0.95 until epoch \num{133}.

@ -2668,10 +2723,10 @@ percentage points higher than in the non-optimized version.
 \begin{figure}
 \centering
 \includegraphics{graphics/precision_recall_final.pdf}
-\caption[Hyper-parameter optimized object detection precision and
-recall during training.]{Overall precision and recall during
-training for each epoch of the optimized model. The vertical gray
-line at \num{27} marks the epoch with the highest fitness.}
+\caption[Hyperparameter optimized object detection precision and
+recall during training]{Overall precision and recall during training
+for each epoch of the optimized model. The vertical gray line at
+\num{27} marks the epoch with the highest fitness.}
 \label{fig:hyp-opt-prec-rec}
 \end{figure}

@ -2684,8 +2739,8 @@ non-optimized version and recall hovers at the same levels.
 \begin{figure}
 \centering
 \includegraphics{graphics/val_box_obj_loss_final.pdf}
-\caption[Hyper-parameter optimized object detection box and object
-loss.]{Box and object loss measured against the validation set of
+\caption[Hyperparameter optimized object detection box and object
+loss]{Box and object loss measured against the validation set of
 \num{3091} images and \num{4092} ground truth labels. The class
 loss is omitted because there is only one class in the dataset and
 the loss is therefore always zero.}
@ -2925,13 +2980,13 @@ plant}, \emph{wilted plant} and \emph{stressed plant}. Images which
 clearly show one or multiple plants with some amount of visible stress
 were added to the dataset. Care was taken to include plants with
 various degrees of stress and in various locations and lighting
-conditions. The search not only provided images of stressed plants,
-but also of healthy plants. The dataset is biased towards potted
-plants which are commonly put on display in western
-households. Furthermore, many plants, such as succulents, are sought
-after for home environments because of their ease of maintenance. Due
-to their inclusion in the dataset and how they exhibit water stress,
-the test set contains a wide variety of scenarios.
+conditions. The search not only provided images of stressed plants but
+also of healthy plants. The dataset is biased towards potted plants
+which are commonly put on display in western households. Furthermore,
+many plants, such as succulents, are sought after for home
+environments because of their ease of maintenance. Due to their
+inclusion in the dataset and how they exhibit water stress, the test
+set contains a wide variety of scenarios.

 After collecting the images, the aggregate model was run on them to
 obtain initial bounding boxes and classifications for ground truth
@ -3054,8 +3109,8 @@ from table~\ref{tab:yolo-metrics-hyp} are better, the
 \begin{figure}
 \centering
 \includegraphics{graphics/APpt5-pt95-final.pdf}
-\caption[Hyper-parameter optimized object detection AP@0.5 and
-AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of
+\caption[Hyperparameter optimized object detection AP@0.5 and
+AP@0.95]{Precision-recall curves for \gls{iou} thresholds of
 \num{0.5} and \num{0.95}. The \gls{ap} of a specific threshold is
 defined as the area under the precision-recall curve of that
 threshold. The \gls{map} across \gls{iou} thresholds from
@ -3263,7 +3318,7 @@ therefore the cutoff for either class.

 So far the metrics shown in table~\ref{tab:model-metrics} are obtained
 with the non-optimized versions of both the object detection and
-classification model. Hyper-parameter optimization of the classifier
+classification model. Hyperparameter optimization of the classifier
 led to significant model improvements, while the object detector has
 improved precision but lower recall and slightly lower \gls{map}
 values. To evaluate the final aggregate model which consists of the
@ -3298,7 +3353,7 @@ class. Precision for the stressed class is lower with the optimized
 model, but recall is significantly higher (\num{0.502}
 vs. \num{0.623}). The higher recall results in a three percentage
 point gain for the $\mathrm{F}_1$-score in the stressed
-class. Overall, precision is the same but recall has improved
+class. Overall, precision is the same, but recall has improved
 significantly, which also results in a noticeable improvement for the
 average $\mathrm{F}_1$-score across both classes.
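The effect described above, where higher recall at constant precision lifts the F1-score, follows directly from the standard formulas. The counts below are invented for illustration and are not the evaluation results of the thesis:

```python
def f1_score(tp, fp, fn):
    """Harmonic mean of precision and recall from raw detection counts."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# Recall rises while precision stays fixed at 2/3: F1 improves.
low_recall = f1_score(tp=50, fp=25, fn=50)   # precision 0.667, recall 0.50
high_recall = f1_score(tp=62, fp=31, fn=38)  # precision 0.667, recall 0.62
assert high_recall > low_recall
```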
@ -3340,7 +3395,7 @@ optimal for our purposes. The class \emph{plant} does not seem to have
 been defined rigorously. Large patches of grass, for example, are
 labeled with large bounding boxes. Trees are sometimes labeled, but
 only if their size suggests that they could be bushes or similar types
-of plant. Large corn fields are also labeled as plants, but again with
+of plant. Large corn fields are also labeled as plants but again with
 one large bounding box. If multiple plants are densely packed, the
 annotators often label them as belonging to one plant and thus one
 bounding box. Sometimes the effort has been made to delineate plants
@ -3462,20 +3517,115 @@ direction.
 \chapter{Conclusion}
 \label{chap:conclusion}

-Conclude the thesis with a short recap of the results and the
-discussion. Establish whether the research questions from
-section~\ref{sec:methods} can be answered successfully.
+In this thesis, we have developed a prototype system for plant
+detection and classification using a machine learning model deployed
+on an edge device. The model consists of a two-stage approach wherein
+the first stage detects plants and the second stage classifies
+them. This approach has been chosen because of the limited
+availability of data to train one model end-to-end and comes with
+downsides such as an increased error rate and additional training,
+optimization, and evaluation complexity. Despite these downsides, the
+prototype performs well in the homeowner context where the variety of
+plants is limited. This conclusion is supported by the metrics
+discussed in chapter~\ref{chap:evaluation}.

-Estimated 2 pages for this chapter.
+The optimization of the model has been shown to require a substantial
+amount of computational resources and proved to be difficult to get
+right. The object detection model in particular needs many iterations
+during the hyperparameter search to converge to a global optimum. We
+attribute these difficulties to the model complexity of the \gls{yolo}
+series and the numerous hyperparameters which are available. The
+classifier, however, is comparatively simpler from an architectural
+standpoint and lends itself more easily to optimization.

+Revisiting the research questions posed in
+section~\ref{sec:motivation}, we can now assess the extent to which
+our findings have addressed them.

+\begin{enumerate}
+\item \emph{How well does the model work in theory and how well in
+  practice?}
+
+  The optimized model achieves a \gls{map} of \num{0.3838} which
+  suggests that the prototype works well on unseen data. The plant
+  detection is robust, particularly for household plants, and the
+  classifier shows strong performance for a wide array of common
+  plants. Contrary to our expectations, the stress classification is
+  not more difficult than the detection step. In fact, the problems we
+  encountered during the optimization of the detection model likely
+  stem from the increased complexity of the detection versus the
+  classification task. The various ways in which plants show water
+  stress do not seem to be a limiting factor for stress
+  classification.
+
+\item \emph{What are possible reasons for it to work/not work?}
+
+  We have discussed possible reasons why either the constituent
+  models or the aggregate model underperform. In general, we conclude
+  that the prototype does work and can be used within the context
+  established in chapter~\ref{chap:introduction}. Our expectation that
+  dataset curation would play a major role in successfully
+  implementing the prototype turned out to be true. For example, some
+  of the problems with the plant detection model can be attributed to
+  the inconsistent labeling information present in the \gls{oid}. Care
+  had to be taken during the creation of the dataset on which the
+  aggregate model was evaluated not to introduce a bias which favors
+  the predictions.
+
+\item \emph{What are possible improvements to the system in the
+  future?}
+
+  Specific improvements to the prototype include curating bigger
+  datasets to train on, running the hyperparameter optimization for
+  more iterations and more epochs per iteration, and including experts
+  such as botanists to establish higher confidence in the ground
+  truth. Unfortunately, the first two suggestions result in a
+  significantly higher computational cost during training,
+  optimization, and evaluation. This observation applies to most
+  machine learning models and there is always a trade-off between
+  model performance and training/optimization time.
+\end{enumerate}

 \section{Future Work}
 \label{sec:future-work}

-Suggest further research directions regarding the approach. Give an
-outlook on further possibilities in this research field with respect
-to object detection and plant classification.
+An interesting further research direction for plant detection and
+classification is exploring the viability of single-stage
+approaches. Even though our two-stage approach leads to acceptable
+results, we believe that incorporating the classification step into
+the plant detection step would likely yield better results. A unified
+single-stage approach does not fully deal with the problem of
+propagated errors but should substantially reduce it. An advantage of
+this approach is that the resulting model could be optimized more
+easily because the loss function depends on object detection as well
+as classification. A disadvantage, however (and this is the reason
+why we have not adopted such an approach), is that a unified model
+also needs large datasets it can be trained on.

-Estimated 1 page for this section
+Additional datasets to train a plant detection and classification
+model on are needed. While we were able to find separate datasets to
+train the individual models on, it also meant that we were not able to
+implement the aforementioned single-stage approach. If there is enough
+interest in this research problem, it should be possible to create
+large datasets which encode expert knowledge. Since there is such a
+variety of plants and of ways in which they express nutrient
+deficiencies or illnesses, only experts are able to create correct
+ground truth labels for all or most of them. In the limited context of
+this thesis, we were able to label common household plants with
+additional information from the Internet. As soon as more exotic
+plants are added to the datasets, lay knowledge reaches its
+limits. Having more and better ground truth labels should result in
+better detection and classification performance as well as a more
+robust evaluation.
+
+Future research could add additional information to the datasets such
+that models are able to work with more data. For example, including
+images of plants in the infrared spectrum would provide a visual
+measure of evapotranspiration. This additional perspective might allow
+the model to better discriminate between stressed and non-stressed
+plants. Other valuable perspectives could be provided by sensor data
+which track soil moisture, humidity and radiant flux. Although this
+has been done in single-plant agricultural settings, it has not been
+tried in a multi-plant household context.

 \backmatter

Binary file not shown.