Add YOLOv1 to selected methods

This commit is contained in:
Tobias Eidelpes 2023-11-26 18:02:19 +01:00
parent bfc9488602
commit 80c1d98bf7
3 changed files with 130 additions and 1 deletions


@@ -1051,6 +1051,35 @@
file = {/home/zenon/Zotero/storage/YMA63KNY/Redmon et al. - 2016 - You Only Look Once Unified, Real-Time Object Dete.pdf;/home/zenon/Zotero/storage/DJ3JER52/7780460.html}
}
@inproceedings{redmon2017,
title = {{{YOLO9000}}: {{Better}}, {{Faster}}, {{Stronger}}},
shorttitle = {{{YOLO9000}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2017-07},
pages = {6517--6525},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.690},
abstract = {We introduce YOLO9000, a state-of-the-art, real-time object detection system that can detect over 9000 object categories. First we propose various improvements to the YOLO detection method, both novel and drawn from prior work. The improved model, YOLOv2, is state-of-the-art on standard detection tasks like PASCAL VOC and COCO. Using a novel, multi-scale training method the same YOLOv2 model can run at varying sizes, offering an easy tradeoff between speed and accuracy. At 67 FPS, YOLOv2 gets 76.8 mAP on VOC 2007. At 40 FPS, YOLOv2 gets 78.6 mAP, outperforming state-of-the-art methods like Faster RCNN with ResNet and SSD while still running significantly faster. Finally we propose a method to jointly train on object detection and classification. Using this method we train YOLO9000 simultaneously on the COCO detection dataset and the ImageNet classification dataset. Our joint training allows YOLO9000 to predict detections for object classes that don't have labelled detection data. We validate our approach on the ImageNet detection task. YOLO9000 gets 19.7 mAP on the ImageNet detection validation set despite only having detection data for 44 of the 200 classes. On the 156 classes not in COCO, YOLO9000 gets 16.0 mAP. YOLO9000 predicts detections for more than 9000 different object categories, all in real-time.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/MUTDKJCG/Redmon and Farhadi - 2017 - YOLO9000 Better, Faster, Stronger.pdf;/home/zenon/Zotero/storage/D5XNDDQC/8100173.html}
}
@online{redmon2018,
title = {{{YOLOv3}}: {{An Incremental Improvement}}},
shorttitle = {{{YOLOv3}}},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2018-04-08},
eprint = {1804.02767},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1804.02767},
abstract = {We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster. As always, all the code is online at https://pjreddie.com/yolo/},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/3EJMSL2T/Redmon and Farhadi - 2018 - YOLOv3 An Incremental Improvement.pdf;/home/zenon/Zotero/storage/BJVAIALX/1804.html}
}
@inproceedings{ren2015,
title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}},
shorttitle = {Faster {{R-CNN}}},
@@ -1314,6 +1343,21 @@
file = {/home/zenon/Zotero/storage/JQVR2G3M/Szegedy et al. - 2017 - Inception-v4, Inception-ResNet and the Impact of R.pdf}
}
@online{terven2023,
title = {A {{Comprehensive Review}} of {{YOLO}}: {{From YOLOv1}} and {{Beyond}}},
shorttitle = {A {{Comprehensive Review}} of {{YOLO}}},
author = {Terven, Juan and Cordova-Esparza, Diana},
date = {2023-10-07},
eprint = {2304.00501},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2304.00501},
abstract = {YOLO has become a central real-time object detection system for robotics, driverless cars, and video monitoring applications. We present a comprehensive analysis of YOLO's evolution, examining the innovations and contributions in each iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with Transformers. We start by describing the standard metrics and postprocessing; then, we discuss the major changes in network architecture and training tricks for each model. Finally, we summarize the essential lessons from YOLO's development and provide a perspective on its future, highlighting potential research directions to enhance real-time object detection systems.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,I.2.10},
file = {/home/zenon/Zotero/storage/QT68D7SY/Terven and Cordova-Esparza - 2023 - A Comprehensive Review of YOLO From YOLOv1 and Be.pdf;/home/zenon/Zotero/storage/DJ5QRQMW/2304.html}
}
@inproceedings{turner2021,
title = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}: {{Analysis}} of the {{Black-Box Optimization Challenge}} 2020},
shorttitle = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}},

Binary file not shown.


@@ -26,6 +26,7 @@
\usepackage{siunitx}
\usepackage{float}
\usepackage{csquotes}
\usepackage{dsfont}
\addbibresource{references.bib}
@@ -1456,7 +1457,7 @@ two auxiliary classifiers which help retain the gradient during
backpropagation. The auxiliary classifiers are only used during
training. The authors submitted multiple model versions to the 2014
\gls{ilsvrc} and their ensemble prediction model consisting of 7
GoogleNet's achieved a top-5 error rate of 6.67\%, which resulted in
GoogleNets achieved a top-5 error rate of 6.67\%, which resulted in
first place.
\subsubsection{VGGNet}
@@ -2097,6 +2098,90 @@ YOLOv4~\cite{bochkovskiy2020}).
Estimated 2 pages for this section.
The \gls{yolo} family of object detection models started in 2015 when
Redmon et al.~\cite{redmon2016} published the first version. Since
then, up to 16 updated versions have appeared, depending on how one
counts. The
original \gls{yolo} model marked a shift from two-stage detectors to
one-stage detectors as is evident in its name. Two-stage detectors
(see section~\ref{ssec:theory-two-stage}) rely on a proposal
generation step and then subsequent rejection or approval of each
proposal to detect objects. Generating proposals, however, is an
expensive procedure which limits the number of object detections per
second. \gls{yolo} dispenses with the extra proposal generation step
and instead provides a unified \emph{one-stage} detection approach.
The first version of \gls{yolo}~\cite{redmon2016} framed object
detection as a single regression problem, which allows the model to
directly infer bounding boxes and class probabilities from image
pixels. This approach has the added benefit that \gls{yolo} sees an
entire image at once, allowing it to capture more contextual
information than with sliding window or region proposal
methods. \gls{yolo} still divides an image into regions, called
\emph{grid cells}, but this division is a simple operation and does
not rely on external algorithms such as selective
search~\cite{uijlings2013}. The number of bounding boxes predicted by
\gls{yolo} is also much lower than the number of proposals generated
by selective search ($98$ versus roughly $2000$ per image).
The architecture of \gls{yolo} is similar to GoogleNet (see
section~\ref{sssec:theory-googlenet}), but the authors do not use
inception modules directly. The network contains $24$ convolutional
layers in total, where most three by three layers receive a
channel-reduced input from a preceding one by one layer. This
reduction lowers the computational cost substantially, as has been
demonstrated with GoogleNet. Most blocks of convolutional layers are
followed by a two by two maxpool layer for downsampling. The model
expects an input image of size $448$ by $448$
pixels, but has been pretrained on ImageNet with half that resolution
(i.e. $224$ by $224$ pixels). After the convolutional layers, the
authors add two fully-connected layers to produce an output of size
$7 \times 7 \times 30$. This output size follows from the \gls{voc} data
set having $C = 20$ classes and each grid cell predicting $B = 2$
bounding boxes, where each bounding box is described by $x$, $y$, $w$,
$h$, and a confidence score. With a grid size of $S = 7$, the output is
thus $S \times S \times (B \cdot 5 + C) = 7 \times 7 \times 30$.
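To make this layout concrete, the following Python sketch decodes the
predictions of a single grid cell from a tensor of shape
$7 \times 7 \times 30$. The channel ordering is an assumption for
illustration and not necessarily the one used in the original
implementation.
\begin{verbatim}
import numpy as np

S, B, C = 7, 2, 20                     # grid size, boxes per cell, classes
output = np.zeros((S, S, B * 5 + C))   # dummy network output (7 x 7 x 30)

row, col = 3, 4                        # some grid cell
cell = output[row, col]
boxes = cell[:B * 5].reshape(B, 5)     # per box: x, y, w, h, confidence
class_probs = cell[B * 5:]             # 20 conditional class probabilities

# In total the model predicts S * S * B = 98 boxes per image.
print(boxes.shape, class_probs.shape)  # (2, 5) (20,)
\end{verbatim}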
Each grid cell is responsible for a detected object if the object's
center coordinates $(x,y)$ fall within the bounds of the
cell. Furthermore, every cell can only predict \emph{one} object,
which leads to problems with images of densely packed objects; in
that case, a finer grid is needed. The $w$ and $h$ of a bounding box
are relative to the image as a whole, which allows the bounding box
to span more than one grid cell.
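The assignment of an object to its responsible cell can be sketched
as follows, assuming the object's center coordinates are normalized
to the image dimensions:
\begin{verbatim}
def responsible_cell(x, y, S=7):
    """Return the (row, col) of the grid cell whose bounds contain the
    object's center (x, y), with coordinates normalized to [0, 1]."""
    col = min(int(x * S), S - 1)
    row = min(int(y * S), S - 1)
    return row, col

# An object centered at (0.62, 0.31) falls into cell (2, 4); its width
# and height remain relative to the whole image, so the box may extend
# beyond this single cell.
print(responsible_cell(0.62, 0.31))  # (2, 4)
\end{verbatim}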
Since the authors frame object detection as a regression problem of
bounding box coordinates (center point $(x,y)$, width $w$, and height
$h$), object probabilities per box, and class probabilities, they
develop a loss function which is a sum of five parts. The first part
describes the regression for the bounding box center coordinates (sum
of squared differences), the second part the width and height of the
box, the third part the confidence of there being an object in a box,
the fourth part the confidence for boxes that contain no object, and
the fifth part the individual class probabilities (see
equation~\ref{eq:yolo-loss}). The two constants
$\lambda_{\mathrm{coord}}$ and $\lambda_{\mathrm{noobj}}$ are weighting factors
which increase the loss from bounding box coordinate predictions and
decrease the loss from confidence predictions for boxes without
objects. These are set to $\lambda_{\mathrm{coord}} = 5$ and
$\lambda_{\mathrm{noobj}} = 0.5$.
\begin{multline}
\label{eq:yolo-loss}
\lambda_{\mathrm{coord}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ (x_{i} - \hat{x}_{i})^{2} + (y_{i} - \hat{y}_{i})^{2} \biggr] \\
+ \lambda_{\mathrm{coord}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ \biggl(\sqrt{w_{i}} - \sqrt{\hat{w}_{i}}\biggr)^{2} + \biggl(\sqrt{h_{i}} - \sqrt{\hat{h}_{i}}\biggr)^{2} \biggr] \\
+ \sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{obj}}\biggl[ C_{i} - \hat{C}_{i} \biggr]^{2} \\
+ \lambda_{\mathrm{noobj}}\sum_{i=0}^{S^{2}}\sum_{j=0}^{B}\mathds{1}_{ij}^{\mathrm{noobj}}\biggl[ C_{i} - \hat{C}_{i} \biggr]^{2} \\
+ \sum_{i=0}^{S^{2}}\mathds{1}_{i}^{\mathrm{obj}}\sum_{c\in\mathrm{classes}}\biggl[ p_{i}(c) - \hat{p}_{i}(c) \biggr]^{2}
\end{multline}
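For illustration, the following sketch spells out the five terms of
equation~\ref{eq:yolo-loss} with NumPy. The array names, their layout,
and the precomputed responsibility mask are assumptions made for this
sketch and do not correspond to the authors' implementation.
\begin{verbatim}
import numpy as np

def yolo_v1_loss(pred_boxes, true_boxes, pred_classes, true_classes,
                 obj_mask, lambda_coord=5.0, lambda_noobj=0.5):
    # pred_boxes, true_boxes: (S, S, B, 5) with (x, y, w, h, confidence),
    #                         widths and heights assumed non-negative
    # pred_classes, true_classes: (S, S, C) conditional class probabilities
    # obj_mask: (S, S, B) boolean, True where box j of cell i is
    #           responsible for an object
    noobj_mask = ~obj_mask
    cell_has_obj = obj_mask.any(axis=-1)          # (S, S)

    # 1) center coordinates of responsible boxes
    xy_err = ((pred_boxes[..., :2] - true_boxes[..., :2]) ** 2).sum(-1)
    loss = lambda_coord * (obj_mask * xy_err).sum()

    # 2) width and height (square roots dampen the effect of box size)
    wh_err = ((np.sqrt(pred_boxes[..., 2:4]) -
               np.sqrt(true_boxes[..., 2:4])) ** 2).sum(-1)
    loss += lambda_coord * (obj_mask * wh_err).sum()

    # 3) confidence of boxes that contain an object
    conf_err = (pred_boxes[..., 4] - true_boxes[..., 4]) ** 2
    loss += (obj_mask * conf_err).sum()

    # 4) confidence of boxes that contain no object (down-weighted)
    loss += lambda_noobj * (noobj_mask * conf_err).sum()

    # 5) class probabilities of cells that contain an object
    cls_err = ((pred_classes - true_classes) ** 2).sum(-1)
    loss += (cell_has_obj * cls_err).sum()
    return loss
\end{verbatim}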
The original \gls{yolo} model has a few limitations. It only predicts
one set of class probabilities per grid cell and can only accommodate
two bounding boxes per grid cell. \gls{yolo} thus has problems
detecting small and densely packed objects. The most severe problem,
however, is the localization accuracy. The loss function treats
errors in small bounding boxes the same as errors in large bounding
boxes, even though an error of a given size reduces the \gls{iou} of
a small box far more than that of a large one. Errors that matter
most for small boxes are therefore penalized too leniently, which
results in worse localization.
\subsection{ResNet}
\label{sec:methods-classification}