Add YOLOv1 to selected methods

This commit is contained in:
Tobias Eidelpes 2023-11-26 18:02:19 +01:00
parent bfc9488602
commit 80c1d98bf7
3 changed files with 130 additions and 1 deletions


@@ -1051,6 +1051,35 @@
file = {/home/zenon/Zotero/storage/YMA63KNY/Redmon et al. - 2016 - You Only Look Once Unified, Real-Time Object Dete.pdf;/home/zenon/Zotero/storage/DJ3JER52/7780460.html}
}
@inproceedings{redmon2017,
title = {{{YOLO9000}}: {{Better}}, {{Faster}}, {{Stronger}}},
shorttitle = {{{YOLO9000}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2017-07},
pages = {6517--6525},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.690},
abstract = {We introduce YOLO9000, a state-of-the-art, real-time object detection system that can detect over 9000 object categories. First we propose various improvements to the YOLO detection method, both novel and drawn from prior work. The improved model, YOLOv2, is state-of-the-art on standard detection tasks like PASCAL VOC and COCO. Using a novel, multi-scale training method the same YOLOv2 model can run at varying sizes, offering an easy tradeoff between speed and accuracy. At 67 FPS, YOLOv2 gets 76.8 mAP on VOC 2007. At 40 FPS, YOLOv2 gets 78.6 mAP, outperforming state-of-the-art methods like Faster RCNN with ResNet and SSD while still running significantly faster. Finally we propose a method to jointly train on object detection and classification. Using this method we train YOLO9000 simultaneously on the COCO detection dataset and the ImageNet classification dataset. Our joint training allows YOLO9000 to predict detections for object classes that don't have labelled detection data. We validate our approach on the ImageNet detection task. YOLO9000 gets 19.7 mAP on the ImageNet detection validation set despite only having detection data for 44 of the 200 classes. On the 156 classes not in COCO, YOLO9000 gets 16.0 mAP. YOLO9000 predicts detections for more than 9000 different object categories, all in real-time.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/MUTDKJCG/Redmon and Farhadi - 2017 - YOLO9000 Better, Faster, Stronger.pdf;/home/zenon/Zotero/storage/D5XNDDQC/8100173.html}
}
@online{redmon2018,
title = {{{YOLOv3}}: {{An Incremental Improvement}}},
shorttitle = {{{YOLOv3}}},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2018-04-08},
eprint = {1804.02767},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1804.02767},
abstract = {We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster. As always, all the code is online at https://pjreddie.com/yolo/},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/3EJMSL2T/Redmon and Farhadi - 2018 - YOLOv3 An Incremental Improvement.pdf;/home/zenon/Zotero/storage/BJVAIALX/1804.html}
}
@inproceedings{ren2015,
title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}},
shorttitle = {Faster {{R-CNN}}},
@@ -1314,6 +1343,21 @@
file = {/home/zenon/Zotero/storage/JQVR2G3M/Szegedy et al. - 2017 - Inception-v4, Inception-ResNet and the Impact of R.pdf}
}
@online{terven2023,
title = {A {{Comprehensive Review}} of {{YOLO}}: {{From YOLOv1}} and {{Beyond}}},
shorttitle = {A {{Comprehensive Review}} of {{YOLO}}},
author = {Terven, Juan and Cordova-Esparza, Diana},
date = {2023-10-07},
eprint = {2304.00501},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2304.00501},
abstract = {YOLO has become a central real-time object detection system for robotics, driverless cars, and video monitoring applications. We present a comprehensive analysis of YOLO's evolution, examining the innovations and contributions in each iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with Transformers. We start by describing the standard metrics and postprocessing; then, we discuss the major changes in network architecture and training tricks for each model. Finally, we summarize the essential lessons from YOLO's development and provide a perspective on its future, highlighting potential research directions to enhance real-time object detection systems.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,I.2.10},
file = {/home/zenon/Zotero/storage/QT68D7SY/Terven and Cordova-Esparza - 2023 - A Comprehensive Review of YOLO From YOLOv1 and Be.pdf;/home/zenon/Zotero/storage/DJ5QRQMW/2304.html}
}
@inproceedings{turner2021,
title = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}: {{Analysis}} of the {{Black-Box Optimization Challenge}} 2020},
shorttitle = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}},

Binary file not shown.


@@ -26,6 +26,7 @@
\usepackage{siunitx}
\usepackage{float}
\usepackage{csquotes}
\usepackage{dsfont}
\addbibresource{references.bib}
@@ -1456,7 +1457,7 @@ two auxiliary classifiers which help retain the gradient during
backpropagation. The auxiliary classifiers are only used during
training. The authors submitted multiple model versions to the 2014
\gls{ilsvrc} and their ensemble prediction model consisting of 7
GoogleNet's achieved a top-5 error rate of 6.67\%, which resulted in
GoogleNets achieved a top-5 error rate of 6.67\%, which resulted in
first place.
\subsubsection{VGGNet}
@@ -2097,6 +2098,90 @@ YOLOv4~\cite{bochkovskiy2020}).
Estimated 2 pages for this section.
The \gls{yolo} family of object detection models started in 2015 when
Redmon et al.~\cite{redmon2016} published the first version. Since
then, up to 16 updated versions have appeared, depending on how one
counts. The
original \gls{yolo} model marked a shift from two-stage detectors to
one-stage detectors as is evident in its name. Two-stage detectors
(see section~\ref{ssec:theory-two-stage}) rely on a proposal
generation step and then subsequent rejection or approval of each
proposal to detect objects. Generating proposals, however, is an
expensive procedure which limits the number of object detections per
second. \gls{yolo} dispenses with the extra proposal generation step
and instead provides a unified \emph{one-stage} detection approach.
The first version of \gls{yolo}~\cite{redmon2016} framed object
detection as a single regression problem, which allows the model to
directly infer bounding boxes and class probabilities from image
pixels. This approach has the added benefit that \gls{yolo} sees an
entire image at once, allowing it to capture more contextual
information than with sliding window or region proposal
methods. \gls{yolo} still divides an image into regions, called
\emph{grid cells}, but this division is a simple operation and does
not rely on external algorithms such as selective
search~\cite{uijlings2013}. The number of bounding boxes predicted by
\gls{yolo} is also much lower than the number of proposals generated
by selective search ($98$ versus roughly $2000$ per image).
The architecture of \gls{yolo} is similar to GoogleNet (see
section~\ref{sssec:theory-googlenet}), but the authors do not use
inception modules directly. The network contains $24$ convolutional
layers in total, where most three by three layers receive a
channel-reduced input from a preceding one by one layer. This
reduction lowers the computational cost substantially, as has been
demonstrated with GoogleNet. Most blocks of convolutional layers are
followed by a two by two maxpool layer for downsampling. The model
expects an input image of size $448$ by $448$
pixels, but has been pretrained on ImageNet with half that resolution
(i.e. $224$ by $224$ pixels). After the convolutional layers, the
authors add two fully-connected layers to produce an output of size
$7 \times 7 \times 30$. This output size follows from the \gls{voc} data
set having $C = 20$ classes and each grid cell predicting $B = 2$
bounding boxes, where each bounding box is described by $x$, $y$, $w$,
$h$, and a confidence score. With a grid size of $S = 7$, the output is
thus $S \times S \times (B \cdot 5 + C) = 7 \times 7 \times 30$.
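To make this layout concrete, the following Python sketch decodes the
predictions of a single grid cell from a tensor of shape
$7 \times 7 \times 30$. The channel ordering is an assumption for
illustration and not necessarily the one used in the original
implementation.
\begin{verbatim}
import numpy as np

S, B, C = 7, 2, 20                     # grid size, boxes per cell, classes
output = np.zeros((S, S, B * 5 + C))   # dummy network output (7 x 7 x 30)

row, col = 3, 4                        # some grid cell
cell = output[row, col]
boxes = cell[:B * 5].reshape(B, 5)     # per box: x, y, w, h, confidence
class_probs = cell[B * 5:]             # 20 conditional class probabilities

# In total the model predicts S * S * B = 98 boxes per image.
print(boxes.shape, class_probs.shape)  # (2, 5) (20,)
\end{verbatim}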
Each grid cell is responsible for a detected object if the object's
center coordinates $(x,y)$ fall within the bounds of the
cell. Furthermore, every cell can only predict \emph{one} object,
which leads to problems with images of densely packed objects; in
that case, a finer grid is needed. The $w$ and $h$ of a bounding box
are relative to the image as a whole, which allows the bounding box
to span more than one grid cell.
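The assignment of an object to its responsible cell can be sketched
as follows, assuming the object's center coordinates are normalized
to the image dimensions:
\begin{verbatim}
def responsible_cell(x, y, S=7):
    """Return the (row, col) of the grid cell whose bounds contain the
    object's center (x, y), with coordinates normalized to [0, 1]."""
    col = min(int(x * S), S - 1)
    row = min(int(y * S), S - 1)
    return row, col

# An object centered at (0.62, 0.31) falls into cell (2, 4); its width
# and height remain relative to the whole image, so the box may extend
# beyond this single cell.
print(responsible_cell(0.62, 0.31))  # (2, 4)
\end{verbatim}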
Since the authors frame object detection as a regression problem of
bounding box coordinates (center point $(x,y)$, width $w$, and height
$h$), object probabilities per box, and class probabilities, they
develop a loss function which is a sum of five parts. The first part
describes the regression for the bounding box center coordinates (sum
of squared differences), the second part the width and height of the
box, the third part the confidence of there being an object in a box,
the fourth part the confidence for boxes that contain no object, and
the fifth part the individual class probabilities (see
equation~\ref{eq:yolo-loss}). The two constants
$\lambda_{\mathrm{coord}}$ and $\lambda_{\mathrm{noobj}}$ are weighting factors
which increase the loss from bounding box coordinate predictions and
decrease the loss from confidence predictions for boxes without
objects. These are set to $\lambda_{\mathrm{coord}} = 5$ and
$\lambda_{\mathrm{noobj}} = 0.5$.
\begin{multline}
\label{eq:yolo-loss}
\lambda_{\mathrm{coord}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ (x_{i} - \hat{x}_{i})^{2} + (y_{i} - \hat{y}_{i})^{2} \biggr] \\
+ \lambda_{\mathrm{coord}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ \biggl(\sqrt{w_{i}} - \sqrt{\hat{w}_{i}}\biggr)^{2} + \biggl(\sqrt{h_{i}} - \sqrt{\hat{h}_{i}}\biggr)^{2} \biggr] \\
+ \sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ C_{i} - \hat{C}_{i} \biggr]^{2} \\
+ \lambda_{\mathrm{noobj}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{noobj}}\biggl[ C_{i} - \hat{C}_{i} \biggr]^{2} \\
+ \sum_{i=0}^{S^{2}}\mathds{1}_{i}^{\mathrm{obj}}\sum_{c\in\mathrm{classes}}\biggl[ p_{i}(c) - \hat{p}_{i}(c) \biggr]^{2}
\end{multline}
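For illustration, the following sketch spells out the five terms of
equation~\ref{eq:yolo-loss} with NumPy. The array names, their layout,
and the precomputed responsibility mask are assumptions made for this
sketch and do not correspond to the authors' implementation.
\begin{verbatim}
import numpy as np

def yolo_v1_loss(pred_boxes, true_boxes, pred_classes, true_classes,
                 obj_mask, lambda_coord=5.0, lambda_noobj=0.5):
    # pred_boxes, true_boxes: (S, S, B, 5) with (x, y, w, h, confidence),
    #                         widths and heights assumed non-negative
    # pred_classes, true_classes: (S, S, C) conditional class probabilities
    # obj_mask: (S, S, B) boolean, True where box j of cell i is
    #           responsible for an object
    noobj_mask = ~obj_mask
    cell_has_obj = obj_mask.any(axis=-1)          # (S, S)

    # 1) center coordinates of responsible boxes
    xy_err = ((pred_boxes[..., :2] - true_boxes[..., :2]) ** 2).sum(-1)
    loss = lambda_coord * (obj_mask * xy_err).sum()

    # 2) width and height (square roots dampen the effect of box size)
    wh_err = ((np.sqrt(pred_boxes[..., 2:4]) -
               np.sqrt(true_boxes[..., 2:4])) ** 2).sum(-1)
    loss += lambda_coord * (obj_mask * wh_err).sum()

    # 3) confidence of boxes that contain an object
    conf_err = (pred_boxes[..., 4] - true_boxes[..., 4]) ** 2
    loss += (obj_mask * conf_err).sum()

    # 4) confidence of boxes that contain no object (down-weighted)
    loss += lambda_noobj * (noobj_mask * conf_err).sum()

    # 5) class probabilities of cells that contain an object
    cls_err = ((pred_classes - true_classes) ** 2).sum(-1)
    loss += (cell_has_obj * cls_err).sum()
    return loss
\end{verbatim}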
The original \gls{yolo} model has a few limitations. It only predicts
one set of class probabilities per grid cell and can only accommodate
two bounding boxes per grid cell. \gls{yolo} thus has problems
detecting small and densely packed objects. The most severe problem,
however, is the localization accuracy. The loss function treats
errors in small bounding boxes the same as errors in large bounding
boxes, even though an error of a given size reduces the \gls{iou} of
a small box far more than that of a large one. Errors that matter
most for small boxes are therefore penalized too leniently, which
results in worse localization.
\subsection{ResNet}
\label{sec:methods-classification}