diff --git a/thesis/references.bib b/thesis/references.bib index 5de391a..de49568 100644 --- a/thesis/references.bib +++ b/thesis/references.bib @@ -244,7 +244,7 @@ @online{chan2020, title = {Healthy and {{Wilted Houseplant Images}}}, author = {Chan, Russell}, - date = {2020-01-17}, + date = {2020-01}, url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images}, urldate = {2023-12-08}, abstract = {A collection of 904 houseplant images, classified as either healthy or wilted}, @@ -1422,7 +1422,6 @@ pages = {71--83}, publisher = {{World Scientific Publishing Co.}}, issn = {2301-3850}, - doi = {10.1142/S2301385020500053}, keywords = {Area-wise classification,Support Vector Machine (SVM),Unmanned Aerial Vehicle (UAV),wheat drought mapping}, file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf} } diff --git a/thesis/thesis.pdf b/thesis/thesis.pdf index 7271d59..a0b7fec 100644 Binary files a/thesis/thesis.pdf and b/thesis/thesis.pdf differ diff --git a/thesis/thesis.tex b/thesis/thesis.tex index 79cbb4a..532f369 100644 --- a/thesis/thesis.tex +++ b/thesis/thesis.tex @@ -1,6 +1,6 @@ % Copyright (C) 2014-2020 by Thomas Auzinger -\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information. +\documentclass[final]{vutinfth} % Remove option 'final' to obtain debug information. % Load packages to allow in- and output of non-ASCII characters. \usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters. @@ -51,7 +51,7 @@ \nonzeroparskip % Create space between paragraphs (optional). \setlength{\parindent}{0pt} % Remove paragraph identation (optional). -\setcounter{tocdepth}{3} +\setcounter{tocdepth}{2} \makeindex % Use an optional index. \makeglossaries % Use an optional glossary. @@ -64,7 +64,7 @@ \setadvisor{Ao.Univ.-Prof. 
Dr.}{Horst Eidenberger}{}{male} \setregnumber{01527193} -\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}. +\setdate{27}{12}{2023} % Set date with 3 arguments: {day}{month}{year}. \settitle{\thesistitle}{Plant Detection and State Classification with Machine Learning} % Sets English and German version of the title (both can be English or German). % Select the thesis type: bachelor / master / doctor / phd-school. @@ -153,19 +153,73 @@ Challenge} \addstatementpage \begin{danksagung*} -\todo{Ihr Text hier.} + Ich danke vor allem dem Betreuer dieser Arbeit—Horst Eidenberger—für + die rasche und zeitnahe Bearbeitung meiner Anliegen sowie das + aussagekräftige Feedback, das ich laufend erhalten habe. Die von ihm + vorgegebene Roadmap erleichtert das Arbeiten nicht nur in der + Anfangsphase, sondern auch später ungemein. + + Meine Eltern sind diejenigen, die mir das Studium überhaupt erst + ermöglicht haben. Die laufende Unterstützung auch in den späteren + Jahren—primär in Form von gemeinsamen Mittagessen—ließ mich auf das + konzentrieren, was wichtig ist. + + Zu guter Letzt geht noch ein Dank an meine Studienkollegen, die bei + Fragen, zu Diskussionen und zum Entspannen immer erreichbar + waren. Der gelegentliche Austausch macht mir immer Freude und ich + hoffe, dass wir dies auch in Zukunft aufrechterhalten können. \end{danksagung*} \begin{acknowledgements*} -\todo{Enter your text here.} + I would especially like to thank the supervisor of this thesis—Horst + Eidenberger—for the quick and prompt processing of my requests as + well as the meaningful feedback I received on an ongoing basis. The + roadmap he provided not only made this thesis easier in the initial + phase but also later on. + + My parents are the ones who made my studies possible in the first + place. The ongoing support, even in the later years—primarily in + the form of shared lunches—allowed me to concentrate on what is + important. 
+ + Last but not least, I would like to thank my fellow colleagues, who + have always been available for questions, discussions and + relaxation. I always enjoy the occasional exchange and I hope that + we can continue to do those in the future. \end{acknowledgements*} \begin{kurzfassung} -\todo{Ihr Text hier.} + Wassermangel in Zimmerpflanzen kann ihr Wachstum negativ + beeinflussen. Derzeitige Lösungen zur Überwachung von Wasserstress + sind hauptsächlich für landwirtschaftliche Anwendungen + vorgesehen. Wir präsentieren den ersten Deep-Learning-basierten + Prototyp zur Klassifizierung des Wasserstresslevels gängiger + Zimmerpflanzen. Unser zweistufiger Ansatz besteht aus einem + Erkennungs- und einem Klassifizierungsschritt und wird anhand eines + eigens erstellten Datensatzes evaluiert. Die Parameter des Modells + werden mit gängigen Methoden der Hyperparameteroptimierung + ausgewählt. Der Prototyp wird auf einem embedded Computer + bereitgestellt, der eine autonome Pflanzenüberwachung + ermöglicht. Die Vorhersagen unseres Modells werden kontinuierlich + über eine API veröffentlicht, wodurch nachgelagerte + Bewässerungssysteme automatisch Zimmerpflanzen ohne menschliche + Intervention bewässern können. Unser optimiertes Modell erreicht + einen mAP-Wert von \num{0.3838}. \end{kurzfassung} \begin{abstract} -\todo{Enter your text here.} + Water deficiency in household plants can adversely affect + growth. Existing solutions to monitor water stress are primarily + intended for agricultural contexts. We present the first deep + learning based prototype to classify water stress of common + household plants. Our two-stage approach consists of a detection and + a classification step and is evaluated on a new dataset. The model + parameters are optimized with a hyperparameter search. The prototype + is deployed to an embedded device enabling autonomous plant + monitoring. 
The predictions of our model are published continuously + via an API, allowing downstream watering systems to automatically + water household plants without human intervention. Our optimized + model achieves a mAP of \num{0.3838} on unseen images. \end{abstract} % Select the language of the thesis, e.g., english or naustrian. @@ -389,7 +443,7 @@ problem further so that additional requirements which arise during development can be incorporated properly. The prototyping process is embedded within the concepts of the -\emph{Scientific Method}. This thesis not only produces a prototype, +\emph{Scientific Method}. This thesis not only produces a prototype but also explores the problem of plant detection and classification scientifically. Exploration of the problem requires making falsifiable hypotheses (see section~\ref{sec:motivation}), gathering empirical @@ -652,7 +706,7 @@ The simplest activation function is the identity function. It is defined as If all layers in an artificial neural network use the identity activation function, the network is equivalent to a single-layer structure. The identity function is often used for layers which do not -need an activation function per se, but require one to uphold +need an activation function per se but require one to uphold consistency with the rest of the network structure. \subsubsection{Heaviside Step} @@ -865,11 +919,11 @@ network for an input-output pair. The algorithm computes the gradient iteratively starting from the last layer and works its way backward through the network until it reaches the first layer. -Strictly speaking, backpropagation only computes the gradient, but -does not determine how the gradient is used to learn the new -weights. Once the backpropagation algorithm has computed the gradient, -that gradient is passed to an algorithm which finds a local minimum of -it. 
This step is usually performed by some variant of gradient descent +Strictly speaking, backpropagation only computes the gradient but does +not determine how the gradient is used to learn the new weights. Once +the backpropagation algorithm has computed the gradient, that gradient +is passed to an algorithm which finds a local minimum of it. This step +is usually performed by some variant of gradient descent \cite{cauchy1847}. \section{Object Detection} @@ -1129,19 +1183,20 @@ on all levels. \glspl{fpn} are an important building block of many state-of-the-art object detectors. A \gls{fpn} first computes the feature pyramid bottom-up with a -scaling step of two. The lower levels capture less semantic information -than the higher levels, but include more spatial information due to -the higher granularity. In a second step, the \gls{fpn} upsamples the -higher levels such that the dimensions of two consecutive layers are -the same. The upsampled top layer is merged with the layer beneath it -via element-wise addition and convolved with a one by one -convolutional layer to reduce channel dimensions and to smooth out -potential artifacts introduced during the upsampling step. The results -of that operation constitute the new \emph{top layer} and the process -continues with the layer below it until the finest resolution feature -map is generated. In this way, the features of the different layers at -different scales are fused to obtain a feature map with high semantic -information but also high spatial information. +scaling step of two. The lower levels capture less semantic +information than the higher levels but include more spatial +information due to the higher granularity. In a second step, the +\gls{fpn} upsamples the higher levels such that the dimensions of two +consecutive layers are the same. 
The upsampled top layer is merged +with the layer beneath it via element-wise addition and convolved with +a one by one convolutional layer to reduce channel dimensions and to +smooth out potential artifacts introduced during the upsampling +step. The results of that operation constitute the new \emph{top + layer} and the process continues with the layer below it until the +finest resolution feature map is generated. In this way, the features +of the different layers at different scales are fused to obtain a +feature map with high semantic information but also high spatial +information. \textcite{lin2017} report results on \gls{coco} with a \gls{map}@0.5 of 59.1\% with a Faster R-\gls{cnn} structure and a ResNet-101 @@ -1184,7 +1239,7 @@ architecture. \gls{ssd} was proposed by \textcite{liu2016} and functions similarly to \gls{yolo} in that it does not need an extra proposal generation -step, but instead detects and classifies objects in one go. The aim of +step but instead detects and classifies objects in one go. The aim of one-stage detectors is to be considerably faster and at least as accurate as two-stage detectors. While \gls{yolo} paved the way for one-stage detectors, the detection accuracy is significantly lower @@ -1200,7 +1255,7 @@ last fully-connected layer removed such that the output of the \gls{cnn} is a scaled down representation of the input image. The extra layers are intended to capture features at different scales and compare them during training to a range of default anchor boxes. This -idea comes from MultiBox \cite{erhan2014}, but is implemented in +idea comes from MultiBox \cite{erhan2014} but is implemented in \gls{ssd} with a slight twist: during matching of default boxes to the ground truth, boxes with a Jaccard overlap (\gls{iou}) of less than $0.5$ are discarded. In one-stage detector terms, the feature @@ -1446,7 +1501,7 @@ rate of 1.6\% over their own replicated AlexNet result of 18.1\%. 
GoogLeNet, also known as Inception v1, was proposed by \textcite{szegedy2015} to increase the depth of the network without introducing too much additional complexity. Since the relevant parts -of an image can often be of different sizes, but kernels within +of an image can often be of different sizes but kernels within convolutional layers are fixed, there is a mismatch between what can realistically be detected by the layers and what is present in the dataset. Therefore, the authors propose to perform multiple @@ -1521,7 +1576,7 @@ The authors of DenseNet \cite{huang2017} go one step further than ResNets by connecting every convolutional layer to every other layer in the chain. Previously, each layer was connected in sequence with the one before and the one after it. Residual connections establish a -link between the previous layer and the next one, but still do not +link between the previous layer and the next one but still do not always propagate enough information forward. These \emph{shortcut connections} from earlier layers to later layers are thus only taking place in an episodic way for short sections in the chain. DenseNets @@ -1784,7 +1839,7 @@ strategies roughly goes through the following stages condition is reached. \end{enumerate} -This strategy is more efficient than grid search or random search, but +This strategy is more efficient than grid search or random search but requires a substantial amount of iterations for good solutions and can thus be too expensive for hyperparameter optimization \cite{bischl2023}. We use an evolution strategy based on a genetic @@ -2101,13 +2156,13 @@ set which only comprises individual plants and their associated classification (\emph{stressed} or \emph{healthy}). Both datasets (object detection and classification) only allow us to -train and validate each model separately. A third dataset is needed -to evaluate the detection/classification pipeline as a whole. To this +train and validate each model separately. 
A third dataset is needed to +evaluate the detection/classification pipeline as a whole. To this end, we construct our own dataset where all plants per image are labeled with bounding boxes as well as the classes \emph{stressed} or \emph{healthy}. This dataset is small in comparison to the one with -which the object detection model is trained, but suffices because it -is only used for evaluation. Labeling each sample in the evaluation +which the object detection model is trained but suffices because it is +only used for evaluation. Labeling each sample in the evaluation dataset manually is still a laborious task which is why each image is \emph{preannotated} by the already existing object detection and classification model. The task of labeling thus becomes a task of @@ -2120,9 +2175,9 @@ models. In the following sections we will go into detail about the two selected architectures for our prototype. The object detector we chose---\gls{yolo}v7---is part of a larger family of models which all -function similarly, but have undergone substantial changes from -version to version. In order to understand the used model, we trace -the improvements to the \gls{yolo} family from version one to version +function similarly but have undergone substantial changes from version +to version. In order to understand the used model, we trace the +improvements to the \gls{yolo} family from version one to version seven. For the classification stage, we have opted for a ResNet architecture which is also described in detail. @@ -2162,7 +2217,7 @@ output from a one by one layer. This approach reduces complexity substantially---as has been demonstrated with GoogleNet. Every block of convolutional layers is followed by a two by two maxpool layer for downsampling. The model expects an input image of size $448$ by $448$ -pixels, but has been pretrained on ImageNet with half that resolution +pixels but has been pretrained on ImageNet with half that resolution (i.e. $224$ by $224$ pixels). 
After the convolutional layers, the authors add two fully-connected layers to produce an output of size $7 \times 7 \times 30$. This output tensor is chosen because the \gls{voc} data @@ -2227,7 +2282,7 @@ predefined boxes which are called \emph{anchor boxes}. The authors note that finding a good set of prior anchor boxes by hand is error-prone and suggest finding them via $k$-means clustering (dimension priors). They select five anchor boxes per grid cell which -still results in high recall, but does not introduce too much +still results in high recall but does not introduce too much complexity. These additional details result in an improved \gls{map} of 78.6\% on @@ -2245,7 +2300,7 @@ structures of state-of-the-art models at the time, the authors introduce a deeper feature extractor called Darknet-53. It makes use of the residual connections popularized by ResNet \cite{he2016} (see section~\ref{sssec:theory-resnet}). Darknet-53 is more accurate than -Darknet-19 and compares to ResNet-101, but can process more images per +Darknet-19 and compares to ResNet-101 but can process more images per second (\qty{78}{fps} versus \qty{53}{fps}). The activation function throughout the network is still leaky \gls{relu}, as in earlier versions. @@ -2320,7 +2375,7 @@ the \emph{n} stands for \emph{nano}. Additional versions with increasing parameters are \gls{yolo}v5s (small), \gls{yolo}v5m (medium), \gls{yolo}v5l (large), and \gls{yolo}v5x (extra large). The smaller models are intended to be used in resource constrained -environments such as edge devices, but come with a cost in +environments such as edge devices but come with a cost in accuracy. Conversely, the larger models are for tasks where high accuracy is paramount and enough computational resources are available. 
The \gls{yolo}v5x model achieves a \gls{map} of 50.7\% on @@ -2570,7 +2625,7 @@ loss from figure~\ref{fig:box-obj-loss}, we speculate that the pre-trained model already generalizes well to plant detection because one of the categories in the \gls{coco} \cite{lin2015} dataset is \emph{potted plant}. Any further training solely impacts the -confidence of detection, but does not lead to higher detection +confidence of detection but does not lead to higher detection rates. This conclusion is supported by the increasing \gls{map}@0.5:0.95 until epoch \num{133}. @@ -2668,10 +2723,10 @@ percentage points higher than in the non-optimized version. \begin{figure} \centering \includegraphics{graphics/precision_recall_final.pdf} - \caption[Hyper-parameter optimized object detection precision and - recall during training.]{Overall precision and recall during - training for each epoch of the optimized model. The vertical gray - line at \num{27} marks the epoch with the highest fitness.} + \caption[Hyperparameter optimized object detection precision and + recall during training]{Overall precision and recall during training + for each epoch of the optimized model. The vertical gray line at + \num{27} marks the epoch with the highest fitness.} \label{fig:hyp-opt-prec-rec} \end{figure} @@ -2684,8 +2739,8 @@ non-optimized version and recall hovers at the same levels. \begin{figure} \centering \includegraphics{graphics/val_box_obj_loss_final.pdf} - \caption[Hyper-parameter optimized object detection box and object - loss.]{Box and object loss measured against the validation set of + \caption[Hyperparameter optimized object detection box and object + loss]{Box and object loss measured against the validation set of \num{3091} images and \num{4092} ground truth labels. The class loss is omitted because there is only one class in the dataset and the loss is therefore always zero.} @@ -2925,13 +2980,13 @@ plant}, \emph{wilted plant} and \emph{stressed plant}. 
Images which clearly show one or multiple plants with some amount of visible stress were added to the dataset. Care was taken to include plants with various degrees of stress and in various locations and lighting -conditions. The search not only provided images of stressed plants, -but also of healthy plants. The dataset is biased towards potted -plants which are commonly put on display in western -households. Furthermore, many plants, such as succulents, are sought -after for home environments because of their ease of maintenance. Due -to their inclusion in the dataset and how they exhibit water stress, -the test set contains a wide variety of scenarios. +conditions. The search not only provided images of stressed plants but +also of healthy plants. The dataset is biased towards potted plants +which are commonly put on display in western households. Furthermore, +many plants, such as succulents, are sought after for home +environments because of their ease of maintenance. Due to their +inclusion in the dataset and how they exhibit water stress, the test +set contains a wide variety of scenarios. After collecting the images, the aggregate model was run on them to obtain initial bounding boxes and classifications for ground truth @@ -3054,8 +3109,8 @@ from table~\ref{tab:yolo-metrics-hyp} are better, the \begin{figure} \centering \includegraphics{graphics/APpt5-pt95-final.pdf} - \caption[Hyper-parameter optimized object detection AP@0.5 and - AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of + \caption[Hyperparameter optimized object detection AP@0.5 and + AP@0.95]{Precision-recall curves for \gls{iou} thresholds of \num{0.5} and \num{0.95}. The \gls{ap} of a specific threshold is defined as the area under the precision-recall curve of that threshold. The \gls{map} across \gls{iou} thresholds from @@ -3263,7 +3318,7 @@ therefore the cutoff for either class. 
So far the metrics shown in table~\ref{tab:model-metrics} are obtained with the non-optimized versions of both the object detection and -classification model. Hyper-parameter optimization of the classifier +classification model. Hyperparameter optimization of the classifier led to significant model improvements, while the object detector has improved precision but lower recall and slightly lower \gls{map} values. To evaluate the final aggregate model which consists of the @@ -3298,7 +3353,7 @@ class. Precision for the stressed class is lower with the optimized model, but recall is significantly higher (\num{0.502} vs. \num{0.623}). The higher recall results in a three percentage point gain for the $\mathrm{F}_1$-score in the stressed -class. Overall, precision is the same but recall has improved +class. Overall, precision is the same, but recall has improved significantly, which also results in a noticeable improvement for the average $\mathrm{F}_1$-score across both classes. @@ -3340,7 +3395,7 @@ optimal for our purposes. The class \emph{plant} does not seem to have been defined rigorously. Large patches of grass, for example, are labeled with large bounding boxes. Trees are sometimes labeled, but only if their size suggests that they could be bushes or similar types -of plant. Large corn fields are also labeled as plants, but again with +of plant. Large corn fields are also labeled as plants but again with one large bounding box. If multiple plants are densely packed, the annotators often label them as belonging to one plant and thus one bounding box. Sometimes the effort has been made to delineate plants @@ -3462,20 +3517,115 @@ direction. \chapter{Conclusion} \label{chap:conclusion} -Conclude the thesis with a short recap of the results and the -discussion. Establish whether the research questions from -section~\ref{sec:methods} can be answered successfully. 
+In this thesis, we have developed a prototype system for plant +detection and classification using a machine learning model deployed +on an edge device. The model consists of a two-stage approach wherein +the first stage detects plants and the second stage classifies +them. This approach has been chosen because of the limited +availability of data to train one model end-to-end and comes with +downsides such as an increased error rate and additional training, +optimization, and evaluation complexity. Despite these downsides, the +prototype performs well in the homeowner context where the variety of +plants is limited. This conclusion is supported by the metrics +discussed in chapter~\ref{chap:evaluation}. -Estimated 2 pages for this chapter. +The optimization of the model has been shown to require a substantial +amount of computational resources and proved to be difficult to get +right. The object detection model in particular needs many iterations +during the hyperparameter search to converge to a global optimum. We +attribute these difficulties to the model complexity of the \gls{yolo} +series and the numerous hyperparameters which are available. The +classifier, however, is comparatively simpler from an architectural +standpoint and lends itself more easily to optimization. + +Revisiting the research questions posed in +section~\ref{sec:motivation}, we can now assess the extent to which +our findings have addressed them. + +\begin{enumerate} +\item \emph{How well does the model work in theory and how well in + practice?} + + The optimized model achieves a \gls{map} of \num{0.3838} which + suggests that the prototype works well on unseen data. The plant + detection is robust, particularly for household plants and the + classifier shows strong performance for a wide array of common + plants. Contrary to our expectations, the stress classification is + not more difficult than the detection step. 
In fact, the problems we + encountered during the optimization of the detection model are + likely to stem from the increased complexity of the detection versus + the classification task. The various different ways in which plants + show water stress do not seem to be a limiting factor for stress + classification. + +\item \emph{What are possible reasons for it to work/not work?} + + We have demonstrated possible reasons for why either the constituent + models or the aggregate model underperform. In general, we conclude + that the prototype does work and can be used within the context + established in chapter~\ref{chap:introduction}. Our expectation that + dataset curation will play a major role in successfully implementing + the prototype turned out to be true. For example, some of the + problems with the plant detection model can be attributed to the + inconsistent labeling information present in the \gls{oid}. Care had + to be taken during the creation of the dataset the aggregate model + was evaluated on to not introduce a bias which favors the + predictions. + +\item \emph{What are possible improvements to the system in the + future?} + + Specific improvements to the prototype include curating bigger + datasets to train on, running the hyperparameter optimization for + more iterations and more epochs per iteration, and including experts + such as botanists to establish higher confidence in the ground + truth. Unfortunately, the first two suggestions result in a + significantly higher computational cost during training, + optimization, and evaluation. This observation applies to most + machine learning models and there is always a trade-off between + model performance and training/optimization time. +\end{enumerate} \section{Future Work} \label{sec:future-work} -Suggest further research directions regarding the approach. Give an -outlook on further possibilities in this research field with respect -to object detection and plant classification. 
+An interesting further research direction for plant +detection and +classification is exploring the viability of single-stage +approaches. Even though our two-stage approach leads to acceptable +results, we believe that incorporating the classification step into +the plant detection step would likely yield better results. A unified +single-stage approach does not fully deal with the problem of +propagated errors but should substantially reduce it. An advantage of +this approach is that the resulting model could be optimized more +easily because the loss function is dependent on object detection as +well as classification. A disadvantage, however—and this is the +reason why we have not adopted such an approach—is that a unified +model also needs large datasets it can be trained on. -Estimated 1 page for this section +Additional datasets to train a plant detection and classification +model on are needed. While we were able to find separate datasets to +train the individual models on, it also meant that we were not able to +implement the aforementioned single-stage approach. If there is enough +interest in this research problem, it should be possible to create +large datasets which encode expert knowledge. Since there is such a +variety of plants and how they express nutrient deficiencies or +illnesses, only experts are able to create correct ground truth labels +for all or most of them. In the limited context of this thesis, we +were able to label common household plants with additional information +from the Internet. As soon as more exotic plants are added to the +datasets, layman knowledge reaches its limits. Having more and better +ground truth labels should result in better detection and +classification performance as well as a more robust evaluation. + +Future research could add additional information to the datasets such +that models are able to work with more data. 
For example, including +images of plants in the infrared spectrum would provide a visual +measure of evapotranspiration. This additional perspective might allow +the model to better discriminate between stressed and non-stressed +plants. Other valuable perspectives could be provided by sensor data +which track soil moisture, humidity and radiant flux. Although this +has been done in single-plant agricultural settings, it has not been +tried in a multi-plant household context. \backmatter diff --git a/thesis/vutinfth.pdf b/thesis/vutinfth.pdf index 627a237..9221b80 100644 Binary files a/thesis/vutinfth.pdf and b/thesis/vutinfth.pdf differ