% Copyright (C) 2014-2020 by Thomas Auzinger

\documentclass[draft,final]{vutinfth} % Remove option 'final' to obtain debug information.

% Load packages to allow in- and output of non-ASCII characters.
\usepackage{lmodern} % Use an extension of the original Computer Modern font to minimize the use of bitmapped letters.
\usepackage[T1]{fontenc} % Determines font encoding of the output. Font packages have to be included before this line.
\usepackage[utf8]{inputenc} % Determines encoding of the input. All input files have to use UTF8 encoding.

% Extended LaTeX functionality is enabled by including packages with \usepackage{...}.
\usepackage{amsmath} % Extended typesetting of mathematical expressions.
\usepackage{amssymb} % Provides a multitude of mathematical symbols.
\usepackage{mathtools} % Further extensions of mathematical typesetting.
\usepackage{microtype} % Small-scale typographic enhancements.
\usepackage[inline]{enumitem} % User control over the layout of lists (itemize, enumerate, description).
\usepackage{multirow} % Allows table elements to span several rows.
\usepackage{booktabs} % Improves the typesetting of tables.
\usepackage{subcaption} % Allows the use of subfigures and enables their referencing.
\usepackage[ruled,linesnumbered,algochapter]{algorithm2e} % Enables the writing of pseudo code.
\usepackage[usenames,dvipsnames,table]{xcolor} % Allows the definition and use of colors. This package has to be included before tikz.
\usepackage{nag} % Issues warnings when best practices in writing LaTeX documents are violated.
\usepackage{todonotes} % Provides tooltip-like todo notes.
\usepackage[backend=biber,style=trad-alpha,isbn=false,eprint=false,maxcitenames=3]{biblatex} % Bibliography handling with the biber backend.
\usepackage{hyperref} % Enables cross linking in the electronic document version. This package has to be included second to last.
\usepackage[acronym,toc]{glossaries} % Enables the generation of glossaries and lists of acronyms. This package has to be included last.
\addbibresource{references.bib}

% Define convenience functions to use the author name and the thesis title in the PDF document properties.
\newcommand{\authorname}{Tobias Eidelpes} % The author name without titles.
\newcommand{\thesistitle}{Flower State Classification for Watering System} % The title of the thesis. The English version should be used, if it exists.

% Set PDF document properties.
\hypersetup{
    pdfpagelayout   = TwoPageRight, % How the document is shown in PDF viewers (optional).
    linkbordercolor = {Melon}, % The color of the borders of boxes around crosslinks (optional).
    pdfauthor       = {\authorname}, % The author's name in the document properties (optional).
    pdftitle        = {\thesistitle}, % The document's title in the document properties (optional).
    pdfsubject      = {Subject}, % The document's subject in the document properties (optional).
    pdfkeywords     = {Object Detection, Image Classification, Machine Learning, Embedded Programming} % The document's keywords in the document properties (optional).
}

\setpnumwidth{2.5em} % Avoid overfull hboxes in the table of contents (see memoir manual).
\setsecnumdepth{subsection} % Enumerate subsections.

\nonzeroparskip % Create space between paragraphs (optional).
\setlength{\parindent}{0pt} % Remove paragraph indentation (optional).

\makeindex % Use an optional index.
\makeglossaries % Use an optional glossary.
%\glstocfalse % Remove the glossaries from the table of contents.

% Set persons with 4 arguments:
%  {title before name}{name}{title after name}{gender}
%  where both titles are optional (i.e. can be given as empty brackets {}).
\setauthor{}{\authorname}{BSc}{male}
\setadvisor{Ao.Univ.-Prof. Dr.}{Horst Eidenberger}{}{male}

\setregnumber{01527193}
\setdate{20}{02}{2023} % Set date with 3 arguments: {day}{month}{year}.
\settitle{\thesistitle}{Flower State Classification for Watering System} % Sets English and German version of the title (both can be English or German).
% Select the thesis type: bachelor / master / doctor / phd-school.
% Master:
\setthesis{master}
\setmasterdegree{dipl.} % dipl. / rer.nat. / rer.soc.oec. / master

% For bachelor and master:
\setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum.

% Acronyms used throughout the thesis; referenced in the text with \gls{<label>} / \glspl{<label>}.
\newacronym{xai}{XAI}{Explainable Artificial Intelligence}
\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping}
\newacronym{cam}{CAM}{Class Activation Mapping}
\newacronym{oid}{OID}{Open Images Dataset}
\newacronym{ap}{AP}{Average Precision}
\newacronym{iou}{IoU}{Intersection over Union}
\newacronym{map}{mAP}{Mean Average Precision}
\newacronym{resnet}{ResNet}{Residual Neural Network}
\newacronym{cnn}{CNN}{Convolutional Neural Network}

\begin{document}

\frontmatter % Switches to roman numbering.
% The structure of the thesis has to conform to the guidelines at
%  https://informatics.tuwien.ac.at/study-services

\addtitlepage{naustrian} % German title page (not for dissertations at the PhD School).
\addtitlepage{english} % English title page.
\addstatementpage

\begin{danksagung*}
\todo{Ihr Text hier.}
\end{danksagung*}

\begin{acknowledgements*}
\todo{Enter your text here.}
\end{acknowledgements*}

\begin{kurzfassung}
\todo{Ihr Text hier.}
\end{kurzfassung}

\begin{abstract}
\todo{Enter your text here.}
\end{abstract}

% Select the language of the thesis, e.g., english or naustrian.
\selectlanguage{english}

% Add a table of contents (toc).
\tableofcontents % Starred version, i.e., \tableofcontents*, removes the self-entry.

% Switch to arabic numbering and start the enumeration of chapters in the table of contents.
\mainmatter

% \chapter{Introduction}
% \todo{Enter your text here.}

\chapter{Evaluation}

The following sections contain a detailed evaluation of the model in various scenarios. First, we present metrics from the training phases of the constituent models.
Second, we employ methods from the field of \gls{xai} such as \gls{grad-cam} to get a better understanding of the models' abstractions. Finally, we turn to the models' aggregate performance on the test set and discuss whether the initial goals set by the problem description have been met or not. \section{Object Detection} \label{sec:yolo-eval} The object detection model was pre-trained on the COCO~\cite{lin2015} dataset and fine-tuned with data from the \gls{oid}~\cite{kuznetsova2020} in its sixth version. Since the full \gls{oid} contains considerably more classes and samples than could feasibly be trained on with a small cluster of GPUs, only images from the two classes \emph{Plant} and \emph{Houseplant} have been downloaded. The samples from the Houseplant class are merged into the Plant class because the distinction between the two is not necessary for our model. Furthermore, the \gls{oid} contains not only bounding box annotations for object detection tasks, but also instance segmentations, classification labels and more. These are not needed for our purposes and are omitted as well. In total, the dataset consists of 91479 images with a roughly 85/5/10 split for training, validation and testing, respectively. \subsection{Training Phase} \label{ssec:yolo-training-phase} The object detection model was trained for 300 epochs on 79204 images with 284130 ground truth labels. The weights from the best-performing epoch were saved. The model's fitness for each epoch is calculated as the weighted average of \textsf{mAP}@0.5 and \textsf{mAP}@0.5:0.95: \begin{equation} \label{eq:fitness} f_{\mathrm{epoch}} = 0.1 \cdot \mathsf{mAP}@0.5 + 0.9 \cdot \mathsf{mAP}@0.5\mathrm{:}0.95 \end{equation} Figure~\ref{fig:fitness} shows the model's fitness over the training period of 300 epochs. The gray vertical line indicates the maximum fitness of 0.61 at epoch 133. The weights of that epoch were frozen to be the final model parameters.
Since the fitness metric assigns the \textsf{mAP} at the higher range the overwhelming weight, the \textsf{mAP}@0.5 starts to decrease after epoch 30, but the \textsf{mAP}@0.5:0.95 picks up the slack until the maximum fitness at epoch 133. This is an indication that the model achieves good performance early on and continues to gain higher confidence values until performance deteriorates due to overfitting. \begin{figure} \centering \includegraphics{graphics/model_fitness.pdf} \caption[Object detection fitness per epoch.]{Object detection model fitness for each epoch calculated as in equation~\ref{eq:fitness}. The vertical gray line at 133 marks the epoch with the highest fitness.} \label{fig:fitness} \end{figure} Overall precision and recall per epoch are shown in figure~\ref{fig:prec-rec}. The values indicate that neither precision nor recall changes materially during training. In fact, precision starts to decrease from the beginning, while recall experiences a barely noticeable increase. Taken together with the box and object loss from figure~\ref{fig:box-obj-loss}, we speculate that the pre-trained model already generalizes well to plant detection because one of the categories in the COCO~\cite{lin2015} dataset is \emph{potted plant}. Any further training solely impacts the confidence of detection, but does not lead to higher detection rates. This conclusion is supported by the increasing \textsf{mAP}@0.5:0.95 until epoch 133. \begin{figure} \centering \includegraphics{graphics/precision_recall.pdf} \caption[Object detection precision and recall during training.]{Overall precision and recall during training for each epoch. The vertical gray line at 133 marks the epoch with the highest fitness.} \label{fig:prec-rec} \end{figure} Further culprits for the flat precision and recall values may be found in bad ground truth data. The labels from the \gls{oid} are sometimes not fine-grained enough.
Images which contain multiple individual—often overlapping—plants are labeled with one large bounding box instead of multiple smaller ones. The model recognizes the individual plants and returns tighter bounding boxes even if that is not what is specified in the ground truth. Therefore, it is prudent to limit the training phase to relatively few epochs in order to not penalize the more accurate detections of the model. The smaller bounding boxes make more sense considering the fact that the cutout is passed to the classifier in a later stage. Smaller bounding boxes help the classifier to only focus on one plant at a time and to not get distracted by multiple plants in potentially different stages of wilting. The box loss decreases slightly during training which indicates that the bounding boxes become tighter around objects of interest. With increasing training time, however, the object loss increases, indicating that fewer and fewer plants are present in the predicted bounding boxes. It is likely that overfitting is a cause for the increasing object loss from epoch 40 onward. Since the best weights as measured by fitness are found at epoch 133 and the object loss accelerates from that point, epoch 133 is probably the correct cutoff before overfitting occurs. \begin{figure} \centering \includegraphics{graphics/val_box_obj_loss.pdf} \caption[Object detection box and object loss.]{Box and object loss measured against the validation set of 3091 images and 4092 ground truth labels. The class loss is omitted because there is only one class in the dataset and the loss is therefore always zero.} \label{fig:box-obj-loss} \end{figure} \subsection{Test Phase} \label{ssec:yolo-test-phase} Of the 91479 images around 10\% were used for the test phase. These images contain a total of 12238 ground truth labels. Table~\ref{tab:yolo-metrics} shows precision, recall and the harmonic mean of both (F1-score).
The results indicate that the model errs on the side of sensitivity because recall is higher than precision. Although some detections are not labeled as plants in the dataset, if there is a labeled plant in the ground truth data, the chance is high that it will be detected. This behavior is in line with how the model's detections are handled in practice. The detections are drawn on the original image and the user is able to check the bounding boxes visually. If there are wrong detections, the user can ignore them and focus on the relevant ones instead. A higher recall will thus serve the user's needs better than a high precision. \begin{table}[htbp] \centering \begin{tabular}{lrrrr} \toprule {} & Precision & Recall & F1-score & Support \\ \midrule Plant & 0.547571 & 0.737866 & 0.628633 & 12238.0 \\ \bottomrule \end{tabular} \caption{Precision, recall and F1-score for the object detection model.} \label{tab:yolo-metrics} \end{table} Figure~\ref{fig:yolo-ap} shows the \gls{ap} for the \gls{iou} thresholds of 0.5 and 0.95. Predicted bounding boxes with an \gls{iou} of less than 0.5 are not taken into account for the precision and recall values of table~\ref{tab:yolo-metrics}. The lower the detection threshold, the more plants are detected. Conversely, a higher detection threshold leaves potential plants undetected. The precision-recall curves confirm this behavior because the area under the curve for the threshold of 0.5 is higher than for the threshold of 0.95 ($0.66$ versus $0.41$). These values are combined in COCO's~\cite{lin2015} main evaluation metric which is the \gls{ap} averaged across the \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps. This value is then averaged across all classes and called \gls{map}. The object detection model achieves a state-of-the-art \gls{map} of 0.5727 for the \emph{Plant} class.
\begin{figure} \centering \includegraphics{graphics/APpt5-pt95.pdf} \caption[Object detection AP@0.5 and AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a specific threshold is defined as the area under the precision-recall curve of that threshold. The \gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps \textsf{mAP}@0.5:0.95 is 0.5727.} \label{fig:yolo-ap} \end{figure} \section{Classification} \label{sec:resnet-eval} The classifier receives cutouts from the object detection model and determines whether the image shows a stressed plant or not. To achieve this goal, we trained a \gls{resnet}~\cite{he2016} on a dataset of 452 images of healthy and 452 images of stressed plants. We chose the \gls{resnet} architecture due to its popularity and ease of implementation as well as its consistently high performance on various classification tasks. While its classification speed in comparison with networks optimized for mobile and edge devices (e.g.\ MobileNet) is significantly lower, the deeper structure and the additional parameters are necessary for the fairly complex task at hand. Furthermore, the generous time budget for object detection \emph{and} classification allows for more accurate results at the expense of speed. The architecture allows for multiple different structures, depending on the number of layers. The smallest one has 18 and the largest 152 layers with 34, 50 and 101 in-between. The larger networks have better accuracy in general, but come with trade-offs regarding training and inference time as well as required space. The 50-layer architecture (\gls{resnet}50) is adequate for our use case. \subsection{Training Phase} \label{ssec:resnet-training-phase} The dataset was split 85/15 into training and validation sets. The images in the training set were augmented with a random crop to arrive at the expected image dimensions of 224 pixels.
Additionally, the training images were modified with a random horizontal flip to increase the variation in the set and to make the classifier invariant to horizontal mirroring. All images, regardless of their membership in the training or validation set, were normalized with the mean and standard deviation of the ImageNet~\cite{deng2009} dataset, which the original \gls{resnet} model was pre-trained with. Training was done for 50 epochs and the best-performing model as measured by validation accuracy was selected as the final version. Figure~\ref{fig:classifier-training-metrics} shows accuracy and loss on the training and validation sets. There is a clear upwards trend until epoch 20 when validation accuracy and loss stabilize at around 0.84 and 0.3, respectively. The quick convergence and resistance to overfitting can be attributed to the model already having robust feature extraction capabilities. \begin{figure} \centering \includegraphics{graphics/classifier-metrics.pdf} \caption[Classifier accuracy and loss during training.]{Accuracy and loss during training of the classifier. The model converges quickly, but additional epochs do not cause validation loss to increase, which would indicate overfitting. The maximum validation accuracy of 0.9118 is achieved at epoch 27.} \label{fig:classifier-training-metrics} \end{figure} \subsection{Class Activation Maps} \label{ssec:resnet-cam} Neural networks are notorious for their black-box behavior, where it is possible to observe the inputs and the corresponding outputs, but the stage in-between stays hidden from view. Models are continuously developed and deployed to aid in human decision-making and sometimes supplant it. It is, therefore, crucial to obtain some amount of interpretability of what the model does \emph{inside} to be able to explain why a decision was made in a certain way.
The research field of \gls{xai} gained significance during the last few years because of the development of new methods to peek inside these black boxes. One such method, \gls{cam}~\cite{zhou2015}, is a popular tool to produce visual explanations for decisions made by \glspl{cnn}. Convolutional layers essentially function as object detectors as long as no fully-connected layers perform the classification. This ability to localize regions of interest which play a significant role in the type of class the model predicts can be retained until the last layer and used to generate activation maps for the predictions. A more recent approach to generating a \gls{cam} via gradients is proposed by~\textcite{selvaraju2020}. Their \gls{grad-cam} approach works by computing the gradient of the feature maps of the last convolutional layer with respect to the specified class. The last layer is chosen because the authors find that ``[…] Grad-CAM maps become progressively worse as we move to earlier convolutional layers as they have smaller receptive fields and only focus on less semantic local features.''~\cite[5]{selvaraju2020} Turning to our classifier, figure~\ref{fig:resnet-cam} shows the \glspl{cam} for \emph{healthy} and \emph{stressed}. While the regions of interest for the \emph{healthy} class lie on the healthy plant, the \emph{stressed} plant is barely considered and mostly rendered as background information (blue). Conversely, when asked to explain the inputs to the \emph{stressed} classification, the regions of interest predominantly stay on the thirsty as opposed to the healthy plant. In fact, the large hanging leaves play a significant role in determining the class the image belongs to. This is an additional data point confirming that the model focuses on the \emph{right} parts of the image during classification.
\begin{figure} \centering \includegraphics{graphics/classifier-cam.pdf} \caption[Classifier \glspl{cam}.]{The top left image shows the original image of the same plant in a stressed (left) and healthy (right) state. In the top right image, the \gls{cam} for the class \emph{healthy} is laid over the original image. The classifier draws its conclusion mainly from the healthy plant, which is indicated by the red hot spots around the tips of the plant. The bottom right image shows the \gls{cam} for the \emph{stressed} class. The classifier focuses on the hanging leaves of the thirsty plant. The image was classified as \emph{stressed} with a confidence of 70\%.} \label{fig:resnet-cam} \end{figure} \section{Aggregate Model} \label{sec:aggregate-model} In this section we turn to the evaluation of the aggregate model. We have confirmed the performance of the constituent models: the object detection and the classification model. It remains to evaluate the complete pipeline from gathering detections of potential plants in an image and forwarding them to the classifier to obtaining the results as either healthy or stressed with their associated confidence scores. The test set contains 640 images which were obtained from a Google search using the terms \emph{thirsty plant}, \emph{wilted plant} and \emph{stressed plant}. Images which clearly show one or multiple plants with some amount of visible stress were added to the dataset. Care was taken to include plants with various degrees of stress and in various locations and lighting conditions. The search not only provided images of stressed plants, but also of healthy plants, because articles that describe how to care for plants have a banner image of healthy plants. The dataset is biased towards potted plants which are commonly put on display in Western households. Furthermore, many plants, such as succulents, are sought after for home environments because of their ease of maintenance.
Due to their inclusion in the dataset and how they exhibit water stress, the test set nevertheless contains a wide variety of scenarios. After collecting the images, the aggregate model was run on them to obtain initial bounding boxes and classifications for ground truth labeling. Letting the model do the work beforehand and then correcting the labels made it possible to include more images in the test set because they could be labeled more easily. Additionally, going over the detections and classifications provided a comprehensive view on how the models work and what their weaknesses and strengths are. After the labels have been corrected, the ground truth of the test set contains 662 bounding boxes of healthy plants and 488 of stressed plants. \begin{table} \centering \begin{tabular}{lrrrr} \toprule {} & Precision & Recall & F1-score & Support \\ \midrule Healthy & 0.824 & 0.745 & 0.783 & 662.0 \\ Stressed & 0.707 & 0.783 & 0.743 & 488.0 \\ micro avg & 0.769 & 0.761 & 0.765 & 1150.0 \\ macro avg & 0.766 & 0.764 & 0.763 & 1150.0 \\ weighted avg & 0.775 & 0.761 & 0.766 & 1150.0 \\ \bottomrule \end{tabular} \caption{Precision, recall and F1-score for the aggregate model.} \label{tab:model-metrics} \end{table} Table~\ref{tab:model-metrics} shows precision, recall and the F1-score for both classes \emph{Healthy} and \emph{Stressed}. Both precision and recall are balanced and the F1-score is high. Unfortunately, these values do not take the accuracy of bounding boxes into account and thus have only limited expressive power. Figure~\ref{fig:aggregate-ap} shows the precision and recall curves for both classes at different \gls{iou} thresholds. The left plot shows the \gls{ap} for each class at the threshold of 0.5 and the right one at 0.95. The \gls{map} is 0.6226 and calculated across all classes as the mean over the \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps.
The difference between \gls{map}@0.5 and \gls{map}@0.95 is fairly small which indicates that the bounding boxes encapsulate the objects of interest well. The cliffs at around 0.77 (left) and 0.7 (right) happen at a detection threshold of 0.5. The classifier's last layer is a softmax layer which necessarily transforms the input into a probability of showing either a healthy or stressed plant. If the probability of an image showing a healthy plant is below 0.5, it is no longer classified as healthy but as stressed. The threshold for discriminating the two classes lies at the 0.5 value and is therefore the cutoff for either class. \begin{figure} \centering \includegraphics{graphics/APmodel.pdf} \caption[Aggregate model AP@0.5 and AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a specific threshold is defined as the area under the precision-recall curve of that threshold. The \gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps \textsf{mAP}@0.5:0.95 is 0.6226.} \label{fig:aggregate-ap} \end{figure} Overall, we believe that the aggregate model shows sufficient predictive performance to be deployed in the field. The detections are accurate, especially for potted plants, and the classification into healthy and stressed is robust.

\backmatter

% Use an optional list of figures.
\listoffigures % Starred version, i.e., \listoffigures*, removes the toc entry.

% Use an optional list of tables.
\cleardoublepage % Start list of tables on the next empty right hand page.
\listoftables % Starred version, i.e., \listoftables*, removes the toc entry.

% Use an optional list of algorithms.
\listofalgorithms
\addcontentsline{toc}{chapter}{List of Algorithms}

% Add an index.
\printindex

% Add a glossary.
\printglossaries

% Add a bibliography.
%\bibliographystyle{alpha}
\printbibliography

\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: