Switch to biber and add classifier eval

This commit is contained in:
Tobias Eidelpes 2023-03-08 17:05:27 +01:00
parent 0e37585e57
commit 088d6647c1
12 changed files with 1395 additions and 420 deletions

View File

@ -0,0 +1,51 @@
epoch,train_loss,train_acc,val_loss,val_acc
0,0.6290,0.6920,0.4647,0.7843
1,0.7473,0.6970,0.5752,0.7941
2,0.5318,0.7531,0.4026,0.8137
3,0.5383,0.7693,0.4723,0.7745
4,0.8356,0.7282,0.9150,0.6569
5,0.5942,0.7693,0.4992,0.7745
6,0.4339,0.8092,0.4625,0.7451
7,0.3620,0.8454,0.3219,0.8431
8,0.3350,0.8579,0.2812,0.8725
9,0.3237,0.8653,0.3355,0.8333
10,0.3264,0.8516,0.2956,0.8627
11,0.3434,0.8541,0.3215,0.8235
12,0.3129,0.8641,0.2990,0.8333
13,0.3072,0.8616,0.2975,0.8725
14,0.2892,0.8815,0.2844,0.9020
15,0.3042,0.8616,0.3124,0.8235
16,0.3007,0.8703,0.3064,0.8431
17,0.2580,0.8928,0.3006,0.8431
18,0.2844,0.8778,0.3441,0.8137
19,0.2994,0.8815,0.2690,0.8824
20,0.3063,0.8653,0.3406,0.8137
21,0.2839,0.8766,0.2964,0.8431
22,0.2871,0.8815,0.3140,0.8333
23,0.2874,0.8766,0.3014,0.8725
24,0.2694,0.8890,0.3236,0.8333
25,0.3031,0.8778,0.3001,0.8431
26,0.2996,0.8641,0.3217,0.8235
27,0.3109,0.8766,0.2569,0.9118
28,0.2949,0.8703,0.3114,0.8333
29,0.3067,0.8728,0.3476,0.8235
30,0.3084,0.8828,0.2851,0.8627
31,0.3158,0.8653,0.2967,0.8431
32,0.3153,0.8603,0.2872,0.8529
33,0.3017,0.8716,0.2775,0.8431
34,0.3171,0.8791,0.3527,0.8235
35,0.3027,0.8666,0.2903,0.8333
36,0.2997,0.8840,0.2729,0.8627
37,0.2867,0.8840,0.2564,0.8922
38,0.3044,0.8628,0.3305,0.8333
39,0.2978,0.8840,0.3722,0.8235
40,0.2906,0.8678,0.2804,0.8627
41,0.2822,0.8778,0.2952,0.8725
42,0.3403,0.8466,0.3014,0.8529
43,0.3040,0.8803,0.3133,0.8333
44,0.3107,0.8815,0.2954,0.8431
45,0.2785,0.8828,0.3147,0.8431
46,0.3109,0.8778,0.2670,0.8725
47,0.2848,0.8766,0.3000,0.8529
48,0.2832,0.8791,0.2768,0.8627
49,0.2606,0.8890,0.2756,0.8824
1 epoch train_loss train_acc val_loss val_acc
2 0 0.6290 0.6920 0.4647 0.7843
3 1 0.7473 0.6970 0.5752 0.7941
4 2 0.5318 0.7531 0.4026 0.8137
5 3 0.5383 0.7693 0.4723 0.7745
6 4 0.8356 0.7282 0.9150 0.6569
7 5 0.5942 0.7693 0.4992 0.7745
8 6 0.4339 0.8092 0.4625 0.7451
9 7 0.3620 0.8454 0.3219 0.8431
10 8 0.3350 0.8579 0.2812 0.8725
11 9 0.3237 0.8653 0.3355 0.8333
12 10 0.3264 0.8516 0.2956 0.8627
13 11 0.3434 0.8541 0.3215 0.8235
14 12 0.3129 0.8641 0.2990 0.8333
15 13 0.3072 0.8616 0.2975 0.8725
16 14 0.2892 0.8815 0.2844 0.9020
17 15 0.3042 0.8616 0.3124 0.8235
18 16 0.3007 0.8703 0.3064 0.8431
19 17 0.2580 0.8928 0.3006 0.8431
20 18 0.2844 0.8778 0.3441 0.8137
21 19 0.2994 0.8815 0.2690 0.8824
22 20 0.3063 0.8653 0.3406 0.8137
23 21 0.2839 0.8766 0.2964 0.8431
24 22 0.2871 0.8815 0.3140 0.8333
25 23 0.2874 0.8766 0.3014 0.8725
26 24 0.2694 0.8890 0.3236 0.8333
27 25 0.3031 0.8778 0.3001 0.8431
28 26 0.2996 0.8641 0.3217 0.8235
29 27 0.3109 0.8766 0.2569 0.9118
30 28 0.2949 0.8703 0.3114 0.8333
31 29 0.3067 0.8728 0.3476 0.8235
32 30 0.3084 0.8828 0.2851 0.8627
33 31 0.3158 0.8653 0.2967 0.8431
34 32 0.3153 0.8603 0.2872 0.8529
35 33 0.3017 0.8716 0.2775 0.8431
36 34 0.3171 0.8791 0.3527 0.8235
37 35 0.3027 0.8666 0.2903 0.8333
38 36 0.2997 0.8840 0.2729 0.8627
39 37 0.2867 0.8840 0.2564 0.8922
40 38 0.3044 0.8628 0.3305 0.8333
41 39 0.2978 0.8840 0.3722 0.8235
42 40 0.2906 0.8678 0.2804 0.8627
43 41 0.2822 0.8778 0.2952 0.8725
44 42 0.3403 0.8466 0.3014 0.8529
45 43 0.3040 0.8803 0.3133 0.8333
46 44 0.3107 0.8815 0.2954 0.8431
47 45 0.2785 0.8828 0.3147 0.8431
48 46 0.3109 0.8778 0.2670 0.8725
49 47 0.2848 0.8766 0.3000 0.8529
50 48 0.2832 0.8791 0.2768 0.8627
51 49 0.2606 0.8890 0.2756 0.8824

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -523,7 +523,8 @@
],
"source": [
"fig, ax = plt.subplots(1, 1, figsize=set_size(width, subplots=(1,1)))\n",
"sns.lineplot(x='epoch', y='value', style='metric', dashes=[\"\", (2,1)], data=df_aranged, color='black', linewidth=1)\n",
"sns.lineplot(x='epoch', y='value', style='metric', dashes=[\"\", (2,1)], data=df_aranged,\n",
" color='black', linewidth=1)\n",
"ax.set_ylim([0, 1])\n",
"ax.set_xticks(np.arange(0, 350, 50))\n",
"ax.set_ylabel('')\n",

View File

@ -6,7 +6,7 @@ SOURCE=thesis
# Build the thesis document
pdflatex $SOURCE
bibtex $SOURCE
biber $SOURCE
pdflatex $SOURCE
pdflatex $SOURCE
makeindex -t $SOURCE.glg -s $SOURCE.ist -o $SOURCE.gls $SOURCE.glo

BIN
thesis/graphics/APmodel.pdf Normal file

Binary file not shown.

BIN
thesis/graphics/CMmodel.pdf Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -99,30 +99,28 @@
keywords = {artificial intelligence,crop management,livestock management,machine learning,precision agriculture,precision livestock farming,soil management,water management}
}
@misc{bochkovskiy2020,
@online{bochkovskiy2020,
title = {{{YOLOv4}}: {{Optimal Speed}} and {{Accuracy}} of {{Object Detection}}},
shorttitle = {{{YOLOv4}}},
author = {Bochkovskiy, Alexey and Wang, Chien-Yao and Liao, Hong-Yuan Mark},
date = {2020-04-22},
number = {arXiv:2004.10934},
eprint = {2004.10934},
eprint = {arXiv:2004.10934},
eprinttype = {arxiv},
publisher = {{arXiv}},
doi = {10.48550/arXiv.2004.10934},
archiveprefix = {arXiv},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Electrical Engineering and Systems Science - Image and Video Processing}
}
@misc{brown2020,
@online{brown2020,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
number = {arXiv:2005.14165},
eprint = {2005.14165},
eprint = {arXiv:2005.14165},
eprinttype = {arxiv},
publisher = {{arXiv}},
doi = {10.48550/arXiv.2005.14165},
archiveprefix = {arXiv},
pubstate = {preprint},
keywords = {Computer Science - Computation and Language}
}
@ -141,6 +139,31 @@
keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning}
}
@inproceedings{deng2009,
title = {{{ImageNet}}: {{A}} Large-Scale Hierarchical Image Database},
shorttitle = {{{ImageNet}}},
booktitle = {2009 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
date = {2009-06},
pages = {248--255},
issn = {1063-6919},
doi = {10.1109/CVPR.2009.5206848},
eventtitle = {2009 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
keywords = {Explosions,Image databases,Image retrieval,Information retrieval,Internet,Large-scale systems,Multimedia databases,Ontologies,Robustness,Spine}
}
@inproceedings{he2016,
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
date = {2016-06},
pages = {770--778},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.90},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
keywords = {Complexity theory,Degradation,Image recognition,Image segmentation,Neural networks,Training,Visualization}
}
@article{kuznetsova2020,
title = {The {{Open Images Dataset V4}}: {{Unified}} Image Classification, Object Detection, and Visual Relationship Detection at Scale},
shorttitle = {The {{Open Images Dataset V4}}},
@ -152,24 +175,23 @@
number = {7},
eprint = {1811.00982},
eprinttype = {arxiv},
eprintclass = {cs},
pages = {1956--1981},
issn = {0920-5691, 1573-1405},
doi = {10.1007/s11263-020-01316-z},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition}
}
@misc{lin2015,
@online{lin2015,
title = {Microsoft {{COCO}}: {{Common Objects}} in {{Context}}},
shorttitle = {Microsoft {{COCO}}},
author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Bourdev, Lubomir and Girshick, Ross and Hays, James and Perona, Pietro and Ramanan, Deva and Zitnick, C. Lawrence and Dollár, Piotr},
date = {2015-02-20},
number = {arXiv:1405.0312},
eprint = {1405.0312},
eprint = {arXiv:1405.0312},
eprinttype = {arxiv},
publisher = {{arXiv}},
doi = {10.48550/arXiv.1405.0312},
archiveprefix = {arXiv},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition}
}
@ -261,6 +283,24 @@
keywords = {Agriculture,Cameras,Computational modeling,computer vision,edge and cloud computing,IoT,machine learning,Sensor systems,Sensors,smart farming,Stress,Temperature sensors}
}
@article{selvaraju2020,
title = {Grad-{{CAM}}: {{Visual Explanations}} from {{Deep Networks}} via {{Gradient-based Localization}}},
shorttitle = {Grad-{{CAM}}},
author = {Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},
date = {2020-02},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
volume = {128},
number = {2},
eprint = {1610.02391},
eprinttype = {arxiv},
eprintclass = {cs},
pages = {336--359},
issn = {0920-5691, 1573-1405},
doi = {10.1007/s11263-019-01228-7},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning}
}
@article{su2020,
title = {Machine {{Learning-Based Crop Drought Mapping System}} by {{UAV Remote Sensing RGB Imagery}}},
author = {Su, Jinya and Coombes, Matthew and Liu, Cunjia and Zhu, Yongchao and Song, Xingyang and Fang, Shibo and Guo, Lei and Chen, Wen-Hua},
@ -306,17 +346,16 @@
keywords = {Image processing,Multimodal deep learning,Plant water stress,Time-series modeling}
}
@misc{zheng2019,
@online{zheng2019,
title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
shorttitle = {Distance-{{IoU Loss}}},
author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
date = {2019-11-19},
number = {arXiv:1911.08287},
eprint = {1911.08287},
eprint = {arXiv:1911.08287},
eprinttype = {arxiv},
publisher = {{arXiv}},
doi = {10.48550/arXiv.1911.08287},
archiveprefix = {arXiv},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition}
}
@ -336,6 +375,18 @@
keywords = {cassava diseases,convolutional neural network,focal angular margin penalty softmax loss (FAMP-Softmax),intelligent agricultural engineering,transformer-embedded ResNet (T-RNet),unbalanced image samples}
}
@online{zhou2015,
title = {Learning {{Deep Features}} for {{Discriminative Localization}}},
author = {Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and Oliva, Aude and Torralba, Antonio},
date = {2015-12-13},
number = {arXiv:1512.04150},
eprint = {arXiv:1512.04150},
eprinttype = {arxiv},
doi = {10.48550/arXiv.1512.04150},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition}
}
@article{zhuang2017,
title = {Early Detection of Water Stress in Maize Based on Digital Images},
author = {Zhuang, Shuo and Wang, Ping and Jiang, Boran and Li, Maosong and Gong, Zhihong},

Binary file not shown.

View File

@ -20,9 +20,12 @@
\usepackage[usenames,dvipsnames,table]{xcolor} % Allows the definition and use of colors. This package has to be included before tikz.
\usepackage{nag} % Issues warnings when best practices in writing LaTeX documents are violated.
\usepackage{todonotes} % Provides tooltip-like todo notes.
\usepackage[backend=biber,style=trad-alpha,isbn=false,eprint=false,maxcitenames=3]{biblatex}
\usepackage{hyperref} % Enables cross linking in the electronic document version. This package has to be included second to last.
\usepackage[acronym,toc]{glossaries} % Enables the generation of glossaries and lists of acronyms. This package has to be included last.
\addbibresource{references.bib}
% Define convenience functions to use the author name and the thesis title in the PDF document properties.
\newcommand{\authorname}{Tobias Eidelpes} % The author name without titles.
\newcommand{\thesistitle}{Flower State Classification for Watering System} % The title of the thesis. The English version should be used, if it exists.
@ -67,12 +70,14 @@
\setcurriculum{Software Engineering \& Internet Computing}{Software Engineering \& Internet Computing} % Sets the English and German name of the curriculum.
\newacronym{xai}{XAI}{Explainable Artificial Intelligence}
\newacronym{lime}{LIME}{Local Interpretable Model Agnostic Explanation}
\newacronym{grad-cam}{Grad-CAM}{Gradient-weighted Class Activation Mapping}
\newacronym{cam}{CAM}{Class Activation Mapping}
\newacronym{oid}{OID}{Open Images Dataset}
\newacronym{ap}{AP}{Average Precision}
\newacronym{iou}{IOU}{Intersection over Union}
\newacronym{map}{mAP}{mean average precision}
\newacronym{resnet}{ResNet}{Residual Neural Network}
\newacronym{cnn}{CNN}{Convolutional Neural Network}
\begin{document}
@ -117,10 +122,10 @@
The following sections contain a detailed evaluation of the model in
various scenarios. First, we present metrics from the training phases
of the constituent models. Second, we employ methods from the field of
\gls{xai} such as \gls{lime} and \gls{grad-cam} to get a better
understanding of the models' abstractions. Finally, we turn to the
models' aggregate performance on the test set and discuss whether the
initial goals set by the problem description have been met or not.
\gls{xai} such as \gls{grad-cam} to get a better understanding of the
models' abstractions. Finally, we turn to the models' aggregate
performance on the test set and discuss whether the initial goals set
by the problem description have been met or not.
\section{Object Detection}
\label{sec:yolo-eval}
@ -141,7 +146,7 @@ consists of 91479 images with a roughly 85/5/10 split for training,
validation and testing, respectively.
\subsection{Training Phase}
\label{sec:yolo-training-phase}
\label{ssec:yolo-training-phase}
The object detection model was trained for 300 epochs on 79204 images
with 284130 ground truth labels. The weights from the best-performing
@ -167,9 +172,10 @@ until performance deteriorates due to overfitting.
\begin{figure}
\centering
\includegraphics{graphics/model_fitness.pdf}
\caption[Model fitness per epoch.]{Model fitness for each epoch
calculated as in equation~\ref{eq:fitness}. The vertical gray line
at 133 marks the epoch with the highest fitness.}
\caption[Object detection fitness per epoch.]{Object detection model
fitness for each epoch calculated as in
equation~\ref{eq:fitness}. The vertical gray line at 133 marks the
epoch with the highest fitness.}
\label{fig:fitness}
\end{figure}
@ -189,7 +195,8 @@ rates. This conclusion is supported by the increasing
\begin{figure}
\centering
\includegraphics{graphics/precision_recall.pdf}
\caption{Overall precision and recall during training for each
\caption[Object detection precision and recall during
training.]{Overall precision and recall during training for each
epoch. The vertical gray line at 133 marks the epoch with the
highest fitness.}
\label{fig:prec-rec}
@ -222,15 +229,15 @@ before overfitting occurs.
\begin{figure}
\centering
\includegraphics{graphics/val_box_obj_loss.pdf}
\caption[Box and object loss.]{Box and object loss measured against
the validation set of 3091 images and 4092 ground truth
labels. The class loss is omitted because there is only one class
in the dataset and the loss is therefore always zero.}
\caption[Object detection box and object loss.]{Box and object loss
measured against the validation set of 3091 images and 4092 ground
truth labels. The class loss is omitted because there is only one
class in the dataset and the loss is therefore always zero.}
\label{fig:box-obj-loss}
\end{figure}
\subsection{Test Phase}
\label{ssec:test-phase}
\label{ssec:yolo-test-phase}
Of the 91479 images around 10\% were used for the test phase. These
images contain a total of 12238 ground truth
@ -262,14 +269,19 @@ serve the user's needs better than a high precision.
Figure~\ref{fig:yolo-ap} shows the \gls{ap} for the \gls{iou}
thresholds of 0.5 and 0.95. Predicted bounding boxes with an \gls{iou}
of less than 0.5 are not taken into account for the precision and
recall values of table~\ref{tab:yolo-metrics}. COCO's \cite{lin2015}
main evaluation metric is the \gls{ap} averaged across the \gls{iou}
thresholds from 0.5 to 0.95 in 0.05 steps. This value is then averaged
across all classes and called \gls{map}. The object detection model
achieves a state-of-the-art \gls{map} of 0.5727 for the \emph{Plant}
class.
recall values of table~\ref{tab:yolo-metrics}. The lower the detection
threshold, the more plants are detected. Conversely, a higher
detection threshold leaves potential plants undetected. The
precision-recall curves confirm this behavior because the area under
the curve for the threshold of 0.5 is higher than for the threshold of
0.95 ($0.66$ versus $0.41$). These values are combined in COCO's
\cite{lin2015} main evaluation metric which is the \gls{ap} averaged
across the \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps. This
value is then averaged across all classes and called \gls{map}. The
object detection model achieves a state-of-the-art \gls{map} of 0.5727
for the \emph{Plant} class.
\begin{figure}[h]
\begin{figure}
\centering
\includegraphics{graphics/APpt5-pt95.pdf}
\caption[Object detection AP@0.5 and AP@0.95.]{Precision-recall
@ -281,6 +293,211 @@ class.
\label{fig:yolo-ap}
\end{figure}
\section{Classification}
\label{sec:resnet-eval}
The classifier receives cutouts from the object detection model and
determines whether the image shows a stressed plant or not. To achieve
this goal, we trained a \gls{resnet} \cite{he2016} on a dataset of 452
images of healthy and 452 stressed plants. We chose the \gls{resnet}
architecture due to its popularity and ease of implementation as well
as its consistently high performance on various classification
tasks. While its classification speed in comparison with networks
optimized for mobile and edge devices (e.g. MobileNet) is
significantly lower, the deeper structure and the additional
parameters are necessary for the fairly complex task at
hand. Furthermore, the generous time budget for object detection
\emph{and} classification allows for more accurate results at the
expense of speed. The architecture allows for multiple different
structures, depending on the number of layers. The smallest one has 18
and the largest 152 layers with 34, 50 and 101 in-between. The larger
networks have better accuracy in general, but come with trade-offs
regarding training and inference time as well as required space. The
50 layer architecture (\gls{resnet}50) is adequate for our use case.
\subsection{Training Phase}
\label{ssec:resnet-training-phase}
The dataset was split 85/15 into training and validation sets. The
images in the training set were augmented with a random crop to arrive
at the expected image dimensions of 224 pixels. Additionally, the
training images were modified with a random horizontal flip to
increase the variation in the set and to train a rotation invariant
classifier. All images, regardless of their membership in the training
or validation set, were normalized with the mean and standard
deviation of the ImageNet~\cite{deng2009} dataset, which the original
\gls{resnet} model was pre-trained with. Training was done for 50
epochs and the best-performing model as measured by validation
accuracy was selected as the final version.
Figure~\ref{fig:classifier-training-metrics} shows accuracy and loss
on the training and validation sets. There is a clear improvement
until epoch 20 when validation accuracy and loss stabilize at around
0.84 and 0.3, respectively. The quick convergence and resistance to
overfitting can be attributed to the model already having robust
feature extraction capabilities.
\begin{figure}
\centering
\includegraphics{graphics/classifier-metrics.pdf}
\caption[Classifier accuracy and loss during training.]{Accuracy and
loss during training of the classifier. The model converges
quickly, but additional epochs do not cause validation loss to
increase, which would indicate overfitting. The maximum validation
accuracy of 0.9118 is achieved at epoch 27.}
\label{fig:classifier-training-metrics}
\end{figure}
\subsection{Class Activation Maps}
\label{ssec:resnet-cam}
Neural networks are notorious for their black-box behavior, where it
is possible to observe the inputs and the corresponding outputs, but
the stage in-between stays hidden from view. Models are continuously
developed and deployed to aid in human decision-making and sometimes
supplant it. It is, therefore, crucial to obtain some amount of
interpretability of what the model does \emph{inside} to be able to
explain why a decision was made in a certain way. The research field
of \gls{xai} gained significance during the last few years because of
the development of new methods to peek inside these black boxes.
One such method, \gls{cam}~\cite{zhou2015}, is a popular tool to
produce visual explanations for decisions made by
\glspl{cnn}. Convolutional layers essentially function as object
detectors as long as no fully-connected layers perform the
classification. This ability to localize regions of interest that
play a significant role in the class the model predicts can
be retained until the last layer and used to generate activation maps
for the predictions.
A more recent approach to generating a \gls{cam} via gradients is
proposed by~\textcite{selvaraju2020}. Their \gls{grad-cam} approach
works by computing the gradient of the feature maps of the last
convolutional layer with respect to the specified class. The last
layer is chosen because the authors find that ``[…] Grad-CAM maps
become progressively worse as we move to earlier convolutional layers
as they have smaller receptive fields and only focus on less semantic
local features.''~\cite[p.5]{selvaraju2020}
Turning to our classifier, figure~\ref{fig:resnet-cam} shows the
\glspl{cam} for \emph{healthy} and \emph{stressed}. While the regions
of interest for the \emph{healthy} class lie on the healthy plant, the
\emph{stressed} plant is barely considered and mostly rendered as
background information (blue). Conversely, when asked to explain the
inputs to the \emph{stressed} classification, the regions of interest
predominantly stay on the thirsty as opposed to the healthy plant. In
fact, the large hanging leaves play a significant role in determining
the class the image belongs to. This is an additional data point
confirming that the model focuses on the \emph{right} parts of the
image during classification.
\begin{figure}
\centering
\includegraphics{graphics/classifier-cam.pdf}
\caption[Classifier \glspl{cam}.]{The top left image shows the
original image of the same plant in a stressed (left) and healthy
(right) state. In the top right image, the \gls{cam} for the class
\emph{healthy} is laid over the original image. The classifier
draws its conclusion mainly from the healthy plant, which is
indicated by the red hot spots around the tips of the plant. The
bottom right image shows the \gls{cam} for the \emph{stressed}
class. The classifier focuses on the hanging leaves of the thirsty
plant. The image was classified as \emph{stressed} with a
confidence of 70\%.}
\label{fig:resnet-cam}
\end{figure}
\section{Aggregate Model}
\label{sec:aggregate-model}
In this section we turn to the evaluation of the aggregate model. We
have confirmed the performance of the constituent models: the object
detection and the classification model. It remains to evaluate the
complete pipeline from gathering detections of potential plants in an
image and forwarding them to the classifier to obtaining the results
as either healthy or stressed with their associated confidence scores.
The test set contains 640 images which were obtained from a Google
search using the terms \emph{thirsty plant}, \emph{wilted plant} and
\emph{stressed plant}. Images which clearly show one or multiple
plants with some amount of visible stress were added to the
dataset. Care was taken to include plants with various degrees of
stress and in various locations and lighting conditions. The search
not only provided images of stressed plants, but also of healthy
plants, because articles that describe how to care for plants feature
a banner image of healthy plants. The dataset is biased towards potted
plants which are commonly put on display in western
households. Furthermore, many plants, such as succulents, are sought
after for home environments because of their ease of maintenance. Due
to their inclusion in the dataset and how they exhibit water stress,
the test set nevertheless contains a wide variety of scenarios.
After collecting the images, the aggregate model was run on them to
obtain initial bounding boxes and classifications for ground truth
labeling. Letting the model do the work beforehand and then correcting
the labels allowed us to include more images in the test set because they
could be labeled more easily. Additionally, going over the detections
and classifications provided a comprehensive view on how the models
work and what their weaknesses and strengths are. After the labels
have been corrected, the ground truth of the test set contains 662
bounding boxes of healthy plants and 488 of stressed plants.
\begin{table}
\centering
\begin{tabular}{lrrrr}
\toprule
{} & Precision & Recall & F1-score & Support \\
\midrule
Healthy & 0.824 & 0.745 & 0.783 & 662.0 \\
Stressed & 0.707 & 0.783 & 0.743 & 488.0 \\
micro avg & 0.769 & 0.761 & 0.765 & 1150.0 \\
macro avg & 0.766 & 0.764 & 0.763 & 1150.0 \\
weighted avg & 0.775 & 0.761 & 0.766 & 1150.0 \\
\bottomrule
\end{tabular}
\caption{Precision, recall and F1-score for the aggregate model.}
\label{tab:model-metrics}
\end{table}
Table~\ref{tab:model-metrics} shows precision, recall and the F1-score
for both classes \emph{Healthy} and \emph{Stressed}. Both precision
and recall are balanced and the F1-score is high. Unfortunately, these
values do not take the accuracy of bounding boxes into account and
thus have only limited expressive power.
Figure~\ref{fig:aggregate-ap} shows the precision and recall curves
for both classes at different \gls{iou} thresholds. The left plot
shows the \gls{ap} for each class at the threshold of 0.5 and the
right one at 0.95. The \gls{map} is 0.6226 and is calculated as the
mean of the \gls{ap} over all classes and over the \gls{iou}
thresholds from 0.5 to 0.95 in 0.05 steps. The difference between
\gls{map}@0.5 and \gls{map}@0.95 is
fairly small which indicates that the bounding boxes encapsulate the
objects of interest well. The cliffs at around 0.77 (left) and 0.7
(right) happen at a detection threshold of 0.5. The classifier's last
layer is a softmax layer which necessarily transforms the input into a
probability of showing either a healthy or stressed plant. If the
probability of an image showing a healthy plant is below 0.5, it is no
longer classified as healthy but as stressed. The threshold for
discriminating the two classes lies at the 0.5 value and is therefore
the cutoff for either class.
\begin{figure}
\centering
\includegraphics{graphics/APmodel.pdf}
\caption[Aggregate model AP@0.5 and AP@0.95.]{Precision-recall
curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a
specific threshold is defined as the area under the
precision-recall curve of that threshold. The \gls{map} across
\gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps
\textsf{mAP}@0.5:0.95 is 0.6226.}
\label{fig:aggregate-ap}
\end{figure}
Overall, we believe that the aggregate model shows sufficient
predictive performance to be deployed in the field. The detections are
accurate, especially for potted plants, and the classification into
healthy and stressed is robust.
\backmatter
@ -291,7 +508,7 @@ class.
\cleardoublepage % Start list of tables on the next empty right hand page.
\listoftables % Starred version, i.e., \listoftables*, removes the toc entry.
% Use an optional list of alogrithms.
% Use an optional list of algorithms.
\listofalgorithms
\addcontentsline{toc}{chapter}{List of Algorithms}
@ -302,8 +519,8 @@ class.
\printglossaries
% Add a bibliography.
\bibliographystyle{alpha}
\bibliography{references}
%\bibliographystyle{alpha}
\printbibliography
\end{document}
%%% Local Variables: