Add preliminary thesis structure

This commit is contained in:
Tobias Eidelpes 2023-07-30 17:17:31 +02:00
parent 9f4143be50
commit 32adb47b78
9 changed files with 349 additions and 135 deletions

File diff suppressed because one or more lines are too long

View File

@ -33,7 +33,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 2,
"id": "cfd472e0", "id": "cfd472e0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -99727,16 +99727,48 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"id": "9e57cd86", "id": "9e57cd86",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading labels from Label Studio...\n",
"Download complete\n",
"Loading labels for field 'ground_truth'...\n",
" 100% |█████████████████| 639/639 [1.1s elapsed, 0s remaining, 576.6 samples/s] \n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"100%\"\n",
" height=\"800\"\n",
" src=\"http://localhost:5151/?context=ipython&subscription=f8354b3b-60f0-418a-a49d-f664312d58cc\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7fe94ac02bd0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [ "source": [
"anno_key = \"labelstudio_basic_recipe\"\n", "anno_key = \"labelstudio_basic_recipe\"\n",
"\n", "\n",
"# Merge annotations back into FiftyOne dataset\n", "# Merge annotations back into FiftyOne dataset\n",
"dataset = fo.load_dataset(\"dataset\")\n", "dataset = fo.load_dataset(\"dataset\")\n",
"dataset.load_annotations(anno_key)\n", "dataset.load_annotations(anno_key, url=LABEL_STUDIO_URL, api_key=API_KEY)\n",
"\n", "\n",
"# Load the view that was annotated in the App\n", "# Load the view that was annotated in the App\n",
"view = dataset.load_annotation_view(anno_key)\n", "view = dataset.load_annotation_view(anno_key)\n",
@ -99745,12 +99777,45 @@
"# Step 6: Cleanup\n", "# Step 6: Cleanup\n",
"\n", "\n",
"# Delete tasks from Label Studio\n", "# Delete tasks from Label Studio\n",
"results = dataset.load_annotation_results(anno_key)\n", "#results = dataset.load_annotation_results(anno_key)\n",
"results.cleanup()\n", "#results.cleanup()\n",
"\n",
"# Delete run record (not the labels) from FiftyOne\n",
"#dataset.delete_annotation_run(anno_key)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "65f64f8b",
"metadata": {},
"outputs": [],
"source": [
"# Delete tasks from Label Studio\n",
"#results = dataset.load_annotation_results(anno_key)\n",
"#results.cleanup()\n",
"\n", "\n",
"# Delete run record (not the labels) from FiftyOne\n", "# Delete run record (not the labels) from FiftyOne\n",
"dataset.delete_annotation_run(anno_key)" "dataset.delete_annotation_run(anno_key)"
] ]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ef4fd54f",
"metadata": {},
"outputs": [],
"source": [
"dataset.save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b099682d",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -47,6 +47,8 @@
\nonzeroparskip % Create space between paragraphs (optional). \nonzeroparskip % Create space between paragraphs (optional).
\setlength{\parindent}{0pt} % Remove paragraph indentation (optional). \setlength{\parindent}{0pt} % Remove paragraph indentation (optional).
\setcounter{tocdepth}{3}
\makeindex % Use an optional index. \makeindex % Use an optional index.
\makeglossaries % Use an optional glossary. \makeglossaries % Use an optional glossary.
%\glstocfalse % Remove the glossaries from the table of contents. %\glstocfalse % Remove the glossaries from the table of contents.
@ -117,18 +119,45 @@
% Switch to arabic numbering and start the enumeration of chapters in the table of content. % Switch to arabic numbering and start the enumeration of chapters in the table of content.
\mainmatter \mainmatter
% \chapter{Introduction} \chapter{Introduction}
% \todo{Enter your text here.} \label{chap:introduction}
\chapter{Evaluation} \section{Motivation and Problem Statement}
\label{sec:motivation}
\section{Thesis Structure}
\label{sec:structure}
\chapter{Theoretical Background}
\label{chap:background}
\section{Object Detection}
\label{sec:background-detection}
\section{Classification}
\label{sec:background-classification}
\section{Related Work}
\label{sec:related-work}
\chapter{Prototype Development}
\label{chap:development}
\section{Object Detection}
\label{sec:development-detection}
\section{Classification}
\label{sec:development-classification}
\chapter{Results}
\label{chap:results}
The following sections contain a detailed evaluation of the model in The following sections contain a detailed evaluation of the model in
various scenarios. First, we present metrics from the training phases various scenarios. First, we present metrics from the training phases
of the constituent models. Second, we employ methods from the field of of the constituent models. Second, we employ methods from the field of
\gls{xai} such as \gls{grad-cam} to get a better understanding of the \gls{xai} such as \gls{grad-cam} to get a better understanding of the
models' abstractions. Finally, we turn to the models' aggregate models' abstractions. Finally, we turn to the models' aggregate
performance on the test set and discuss whether the initial goals set performance on the test set.
by the problem description have been met or not.
\section{Object Detection} \section{Object Detection}
\label{sec:yolo-eval} \label{sec:yolo-eval}
@ -149,7 +178,7 @@ consists of 91479 images with a roughly 85/5/10 split for training,
validation and testing, respectively. validation and testing, respectively.
\subsection{Training Phase} \subsection{Training Phase}
\label{ssec:yolo-training-phase} \label{ssec:yolo-training}
The object detection model was trained for 300 epochs on 79204 images The object detection model was trained for 300 epochs on 79204 images
with 284130 ground truth labels. The weights from the best-performing with 284130 ground truth labels. The weights from the best-performing
@ -240,7 +269,7 @@ before overfitting occurs.
\end{figure} \end{figure}
\subsection{Test Phase} \subsection{Test Phase}
\label{ssec:yolo-test-phase} \label{ssec:yolo-test}
Of the 91479 images around 10\% were used for the test phase. These Of the 91479 images around 10\% were used for the test phase. These
images contain a total of 12238 ground truth images contain a total of 12238 ground truth
@ -337,11 +366,10 @@ Figure~\ref{fig:hyp-opt-fitness} shows the model's fitness during
training for each epoch. After the highest fitness of 0.6172 at epoch training for each epoch. After the highest fitness of 0.6172 at epoch
27, the performance quickly declines and shows that further training 27, the performance quickly declines and shows that further training
would likely not yield improved results. The model converges to its would likely not yield improved results. The model converges to its
highest fitness much earlier than the non-optimized version discussed highest fitness much earlier than the non-optimized version, which
in section~\ref{ssec:yolo-training-phase}, which indicates that the indicates that the adjusted parameters provide a better starting point
adjusted parameters provide a better starting point in general. in general. Furthermore, the maximum fitness is 0.74\% higher than in
Furthermore, the maximum fitness is 0.74\% higher than in the the non-optimized version.
non-optimized version.
\begin{figure} \begin{figure}
\centering \centering
@ -426,7 +454,7 @@ is lower by 1.8\%.
\end{figure} \end{figure}
\section{Classification} \section{Classification}
\label{sec:resnet-eval} \label{sec:classifier-eval}
The classifier receives cutouts from the object detection model and The classifier receives cutouts from the object detection model and
determines whether the image shows a stressed plant or not. To achieve determines whether the image shows a stressed plant or not. To achieve
@ -448,7 +476,7 @@ regarding training and inference time as well as required space. The
50 layer architecture (\gls{resnet}50) is adequate for our use case. 50 layer architecture (\gls{resnet}50) is adequate for our use case.
\subsection{Training Phase} \subsection{Training Phase}
\label{ssec:resnet-training-phase} \label{ssec:classifier-training}
The dataset was split 85/15 into training and validation sets. The The dataset was split 85/15 into training and validation sets. The
images in the training set were augmented with a random crop to arrive images in the training set were augmented with a random crop to arrive
@ -481,15 +509,15 @@ feature extraction capabilities.
\end{figure} \end{figure}
\subsection{Hyper-parameter Optimization} \subsection{Hyper-parameter Optimization}
\label{ssec:resnet-hyp-opt} \label{ssec:classifier-hyp-opt}
In order to improve the aforementioned accuracy values, we perform In order to improve the aforementioned accuracy values, we perform
hyper-parameter optimization across a wide range of hyper-parameter optimization across a wide range of
parameters. Table~\ref{tab:resnet-hyps} lists the hyper-parameters and parameters. Table~\ref{tab:classifier-hyps} lists the hyper-parameters
their possible values. Since the number of all combinations of values and their possible values. Since the number of all combinations of
is 11520 and each combination is trained for 10 epochs with a training values is 11520 and each combination is trained for 10 epochs with a
time of approximately six minutes per combination, exhausting the training time of approximately six minutes per combination, exhausting
search space would take 48 days. Due to time limitations, we have the search space would take 48 days. Due to time limitations, we have
chosen to not search exhaustively but to pick random combinations chosen to not search exhaustively but to pick random combinations
instead. Random search works surprisingly well---especially compared to instead. Random search works surprisingly well---especially compared to
grid search---in a number of domains, one of which is hyper-parameter grid search---in a number of domains, one of which is hyper-parameter
@ -513,13 +541,13 @@ optimization~\cite{bergstra2012}.
\end{tabular} \end{tabular}
\caption{Hyper-parameters and their possible values during \caption{Hyper-parameters and their possible values during
optimization.} optimization.}
\label{tab:resnet-hyps} \label{tab:classifier-hyps}
\end{table} \end{table}
The random search was run for 138 iterations which equates to a 75\% The random search was run for 138 iterations which equates to a 75\%
probability that the best solution lies within 1\% of the theoretical probability that the best solution lies within 1\% of the theoretical
maximum~\eqref{eq:opt-prob}. Figure~\ref{fig:resnet-hyp-results} shows maximum~\eqref{eq:opt-prob}. Figure~\ref{fig:classifier-hyp-results}
three of the eight parameters and their impact on a high shows three of the eight parameters and their impact on a high
F1-score. \gls{sgd} has less variation in its results than F1-score. \gls{sgd} has less variation in its results than
Adam~\cite{kingma2017} and manages to provide eight out of the ten Adam~\cite{kingma2017} and manages to provide eight out of the ten
best results. The number of epochs to train for was chosen based on best results. The number of epochs to train for was chosen based on
@ -549,10 +577,10 @@ figure~\ref{fig:classifier-training-metrics}.
produced the best iteration with an F1-score of 0.9783. Adam tends produced the best iteration with an F1-score of 0.9783. Adam tends
to require more customization of its parameters than \gls{sgd} to to require more customization of its parameters than \gls{sgd} to
achieve good results.} achieve good results.}
\label{fig:resnet-hyp-results} \label{fig:classifier-hyp-results}
\end{figure} \end{figure}
Table~\ref{tab:resnet-final-hyps} lists the final hyper-parameters Table~\ref{tab:classifier-final-hyps} lists the final hyper-parameters
which were chosen to train the improved model. In order to confirm which were chosen to train the improved model. In order to confirm
that the model does not suffer from overfitting or is a product of that the model does not suffer from overfitting or is a product of
chance due to a coincidentally advantageous train/test split, we chance due to a coincidentally advantageous train/test split, we
@ -579,10 +607,10 @@ is robust against variations in the training set.
\end{tabular} \end{tabular}
\caption[Hyper-parameters for the optimized classifier.]{Chosen \caption[Hyper-parameters for the optimized classifier.]{Chosen
hyper-parameters for the final, improved model. The difference to hyper-parameters for the final, improved model. The difference to
the parameters listed in Table~\ref{tab:resnet-hyps} comes as a the parameters listed in Table~\ref{tab:classifier-hyps} comes as
result of choosing \gls{sgd} over Adam. The missing four a result of choosing \gls{sgd} over Adam. The missing four
parameters are only required for Adam and not \gls{sgd}.} parameters are only required for Adam and not \gls{sgd}.}
\label{tab:resnet-final-hyps} \label{tab:classifier-final-hyps}
\end{table} \end{table}
\begin{figure} \begin{figure}
@ -636,7 +664,7 @@ F1-score of 1 on the training set.
\subsection{Class Activation Maps} \subsection{Class Activation Maps}
\label{ssec:resnet-cam} \label{ssec:classifier-cam}
Neural networks are notorious for their black-box behavior, where it Neural networks are notorious for their black-box behavior, where it
is possible to observe the inputs and the corresponding outputs, but is possible to observe the inputs and the corresponding outputs, but
@ -666,7 +694,7 @@ become progressively worse as we move to earlier convolutional layers
as they have smaller receptive fields and only focus on less semantic as they have smaller receptive fields and only focus on less semantic
local features.''~\cite[p.5]{selvaraju2020} local features.''~\cite[p.5]{selvaraju2020}
Turning to our classifier, figure~\ref{fig:resnet-cam} shows the Turning to our classifier, figure~\ref{fig:classifier-cam} shows the
\glspl{cam} for \emph{healthy} and \emph{stressed}. While the regions \glspl{cam} for \emph{healthy} and \emph{stressed}. While the regions
of interest for the \emph{healthy} class lie on the healthy plant, the of interest for the \emph{healthy} class lie on the healthy plant, the
\emph{stressed} plant is barely considered and mostly rendered as \emph{stressed} plant is barely considered and mostly rendered as
@ -675,8 +703,8 @@ inputs to the \emph{stressed} classification, the regions of interest
predominantly stay on the thirsty as opposed to the healthy plant. In predominantly stay on the thirsty as opposed to the healthy plant. In
fact, the large hanging leaves play a significant role in determining fact, the large hanging leaves play a significant role in determining
the class the image belongs to. This is an additional data point the class the image belongs to. This is an additional data point
confirming that the model focuses on the \emph{right} parts of the confirming that the model focuses on the semantically meaningful parts
image during classification. of the image during classification.
\begin{figure} \begin{figure}
\centering \centering
@ -691,7 +719,7 @@ image during classification.
class. The classifier focuses on the hanging leaves of the thirsty class. The classifier focuses on the hanging leaves of the thirsty
plant. The image was classified as \emph{stressed} with a plant. The image was classified as \emph{stressed} with a
confidence of 70\%.} confidence of 70\%.}
\label{fig:resnet-cam} \label{fig:classifier-cam}
\end{figure} \end{figure}
@ -727,20 +755,23 @@ the labels allowed to include more images in the test set because they
could be labeled more easily. Additionally, going over the detections could be labeled more easily. Additionally, going over the detections
and classifications provided a comprehensive view on how the models and classifications provided a comprehensive view on how the models
work and what their weaknesses and strengths are. After the labels work and what their weaknesses and strengths are. After the labels
have been corrected, the ground truth of the test set contains 662 have been corrected, the ground truth of the test set contains 766
bounding boxes of healthy plants and 488 of stressed plants. bounding boxes of healthy plants and 494 of stressed plants.
\subsection{Non-optimized Model}
\label{ssec:model-non-optimized}
\begin{table} \begin{table}
\centering \centering
\begin{tabular}{lrrrr} \begin{tabular}{lrrrr}
\toprule \toprule
{} & Precision & Recall & F1-score & Support \\ {} & precision & recall & f1-score & support \\
\midrule \midrule
Healthy & 0.824 & 0.745 & 0.783 & 662.0 \\ Healthy & 0.665 & 0.554 & 0.604 & 766 \\
Stressed & 0.707 & 0.783 & 0.743 & 488.0 \\ Stressed & 0.639 & 0.502 & 0.562 & 494 \\
micro avg & 0.769 & 0.761 & 0.765 & 1150.0 \\ micro avg & 0.655 & 0.533 & 0.588 & 1260 \\
macro avg & 0.766 & 0.764 & 0.763 & 1150.0 \\ macro avg & 0.652 & 0.528 & 0.583 & 1260 \\
weighted avg & 0.775 & 0.761 & 0.766 & 1150.0 \\ weighted avg & 0.655 & 0.533 & 0.588 & 1260 \\
\bottomrule \bottomrule
\end{tabular} \end{tabular}
\caption{Precision, recall and F1-score for the aggregate model.} \caption{Precision, recall and F1-score for the aggregate model.}
@ -748,41 +779,39 @@ bounding boxes of healthy plants and 488 of stressed plants.
\end{table} \end{table}
Table~\ref{tab:model-metrics} shows precision, recall and the F1-score Table~\ref{tab:model-metrics} shows precision, recall and the F1-score
for both classes \emph{Healthy} and \emph{Stressed}. Both precision for both classes \emph{Healthy} and \emph{Stressed}. Precision is
and recall are balanced and the F1-score is high. Unfortunately, these higher than recall for both classes and the F1-score is at
values do not take the accuracy of bounding boxes into account and 0.59. Unfortunately, these values do not take the accuracy of bounding
thus have only limited expressive power. boxes into account and thus have only limited expressive power.
Figure~\ref{fig:aggregate-ap} shows the precision and recall curves Figure~\ref{fig:aggregate-ap} shows the precision and recall curves
for both classes at different \gls{iou} thresholds. The left plot for both classes at different \gls{iou} thresholds. The left plot
shows the \gls{ap} for each class at the threshold of 0.5 and the shows the \gls{ap} for each class at the threshold of 0.5 and the
right one at 0.95. The \gls{map} is 0.6226 and calculated across all right one at 0.95. The \gls{map} is 0.3581 and calculated across all
classes as the mean of the \gls{iou} thresholds from 0.5 to 0.95 in classes as the mean of the \gls{iou} thresholds from 0.5 to 0.95 in
0.05 steps. The difference between \gls{map}@0.5 and \gls{map}@0.95 is 0.05 steps. The cliffs at around 0.6 (left) and 0.3 (right) happen at
fairly small which indicates that the bounding boxes encapsulate the a detection threshold of 0.5. The classifier's last layer is a softmax
objects of interest well. The cliffs at around 0.77 (left) and 0.7 layer which necessarily transforms the input into a probability of
(right) happen at a detection threshold of 0.5. The classifier's last showing either a healthy or stressed plant. If the probability of an
layer is a softmax layer which necessarily transforms the input into a image showing a healthy plant is below 0.5, it is no longer classified
probability of showing either a healthy or stressed plant. If the as healthy but as stressed. The threshold for discriminating the two
probability of an image showing a healthy plant is below 0.5, it is no classes lies at the 0.5 value and is therefore the cutoff for either
longer classified as healthy but as stressed. The threshold for class.
discriminating the two classes lies at the 0.5 value and is therefore
the cutoff for either class.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics{graphics/APmodel.pdf} \includegraphics{graphics/APmodel-model-optimized-relabeled.pdf}
\caption[Aggregate model AP@0.5 and AP@0.95.]{Precision-recall \caption[Aggregate model AP@0.5 and AP@0.95.]{Precision-recall
curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a curves for \gls{iou} thresholds of 0.5 and 0.95. The \gls{ap} of a
specific threshold is defined as the area under the specific threshold is defined as the area under the
precision-recall curve of that threshold. The \gls{map} across precision-recall curve of that threshold. The \gls{map} across
\gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps \gls{iou} thresholds from 0.5 to 0.95 in 0.05 steps
\textsf{mAP}@0.5:0.95 is 0.6226.} \textsf{mAP}@0.5:0.95 is 0.3581.}
\label{fig:aggregate-ap} \label{fig:aggregate-ap}
\end{figure} \end{figure}
\subsection{Hyper-parameter Optimization} \subsection{Optimized Model}
\label{ssec:model-hyp-opt} \label{ssec:model-optimized}
So far the metrics shown in table~\ref{tab:model-metrics} are obtained So far the metrics shown in table~\ref{tab:model-metrics} are obtained
with the non-optimized versions of both the object detection and with the non-optimized versions of both the object detection and
@ -790,7 +819,7 @@ classification model. Hyper-parameter optimization of the classifier
led to significant model improvements, while the object detector has led to significant model improvements, while the object detector has
improved precision but lower recall and slightly lower \gls{map} improved precision but lower recall and slightly lower \gls{map}
values. To evaluate the final aggregate model which consists of the values. To evaluate the final aggregate model which consists of the
individual optimized models, we run the same test as in individual optimized models, we run the same test described in
section~\ref{sec:aggregate-model}. section~\ref{sec:aggregate-model}.
\begin{table} \begin{table}
@ -799,11 +828,11 @@ section~\ref{sec:aggregate-model}.
\toprule \toprule
{} & precision & recall & f1-score & support \\ {} & precision & recall & f1-score & support \\
\midrule \midrule
Healthy & 0.664 & 0.640 & 0.652 & 662.0 \\ Healthy & 0.711 & 0.555 & 0.623 & 766 \\
Stressed & 0.680 & 0.539 & 0.601 & 488.0 \\ Stressed & 0.570 & 0.623 & 0.596 & 494 \\
micro avg & 0.670 & 0.597 & 0.631 & 1150.0 \\ micro avg & 0.644 & 0.582 & 0.611 & 1260 \\
macro avg & 0.672 & 0.590 & 0.626 & 1150.0 \\ macro avg & 0.641 & 0.589 & 0.609 & 1260 \\
weighted avg & 0.670 & 0.597 & 0.630 & 1150.0 \\ weighted avg & 0.656 & 0.582 & 0.612 & 1260 \\
\bottomrule \bottomrule
\end{tabular} \end{tabular}
\caption{Precision, recall and F1-score for the optimized aggregate \caption{Precision, recall and F1-score for the optimized aggregate
@ -813,63 +842,38 @@ section~\ref{sec:aggregate-model}.
Table~\ref{tab:model-metrics-hyp} shows precision, recall and F1-score Table~\ref{tab:model-metrics-hyp} shows precision, recall and F1-score
for the optimized model on the same test dataset of 640 images. All of for the optimized model on the same test dataset of 640 images. All of
the metrics are significantly worse than for the non-optimized the metrics are better for the optimized model. In particular,
model. Considering that the optimized classifier performs better than precision for the healthy class could be improved significantly while
the non-optimized version this is a surprising result. There are recall remains at the same level. This results in a better F1-score
multiple possible explanations for this behavior: for the healthy class. Precision for the stressed class is lower with
the optimized model, but recall is significantly higher (0.502
\begin{enumerate} vs. 0.623). The higher recall results in a 3\% gain for the F1-score
\item The optimized classifier has worse generalizability than the in the stressed class. Overall, precision is the same but recall has
non-optimized version. improved significantly, which also results in a noticeable improvement
\item The small difference in the \gls{map} values for the object for the average F1-score across both classes.
detection model result in significantly higher error rates
overall. This might be the case because a large number of plants is
not detected in the first place and/or those which are detected are
more often not classified correctly by the classifier. As mentioned
in section~\ref{ssec:yolo-hyp-opt}, running the evolution of the
hyper-parameters for more generations could better the performance
overall.
\item The test dataset is tailored to the non-optimized version and
does not provide an accurate measure of real-world performance. The
test dataset was labeled by running the individual models on the
images and taking the predicted bounding boxes and labels as a
starting point for the labeling process. If the labels were not
rigorously corrected, the dataset will allow the non-optimized model
to achieve high scores because the labels are already in line with
what it predicts. Conversely, the optimized model might get closer
to the actual ground truth, but that truth is not what is specified
by the labels to begin with. If that is the case, the evaluation of
the non-optimized model is too favorable and should be corrected
down.
\end{enumerate}
Of these three possibilities, the second and third points are the most
likely culprits. The first scenario is unlikely because the optimized
classifier has been evaluated in a cross validation setting and the
results do not lend themselves easily to such an
interpretation. Dealing with the second scenario could allow the
object detection model to perform better on its own, but would
probably not explain the big difference in performance. Scenario three
is the most likely one because the process of creating the test
dataset can lead to favorable labels for the non-optimized model.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics{graphics/APmodel-final.pdf} \includegraphics{graphics/APModel-model-original-relabeled.pdf}
\caption[Optimized aggregate model AP@0.5 and \caption[Optimized aggregate model AP@0.5 and
AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5 AP@0.95.]{Precision-recall curves for \gls{iou} thresholds of 0.5
and 0.95. The \gls{ap} of a specific threshold is defined as the and 0.95. The \gls{ap} of a specific threshold is defined as the
area under the precision-recall curve of that threshold. The area under the precision-recall curve of that threshold. The
\gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05 \gls{map} across \gls{iou} thresholds from 0.5 to 0.95 in 0.05
steps \textsf{mAP}@0.5:0.95 is 0.4426.} steps \textsf{mAP}@0.5:0.95 is 0.3838.}
\label{fig:aggregate-ap-hyp} \label{fig:aggregate-ap-hyp}
\end{figure} \end{figure}
Figure~\ref{fig:aggregate-ap-hyp} confirms the suspicions raised by Figure~\ref{fig:aggregate-ap-hyp} confirms the performance increase of
the lower metrics from table~\ref{tab:model-metrics-hyp}. More the optimized model established in
iterations for the evolution of the object detection model would table~\ref{tab:model-metrics-hyp}. The \textsf{mAP}@0.5 is higher for
likely have a significant effect on \gls{iou} and the confidence both classes, indicating that the model better detects plants in
values associated with the bounding boxes. general. The \textsf{mAP}@0.95 is slightly lower for the healthy
class, which means that the confidence for the healthy class is
slightly lower compared to the non-optimized model. The result is that
more plants are correctly detected and classified overall, but the
confidence scores tend to be lower with the optimized model. The
\textsf{mAP}@0.5:0.95 could be improved by about 0.025.
\backmatter \backmatter
@ -898,4 +902,7 @@ values associated with the bounding boxes.
%%% Local Variables: %%% Local Variables:
%%% mode: latex %%% mode: latex
%%% TeX-master: t %%% TeX-master: t
%%% TeX-master: "thesis"
%%% End: %%% End: