Add Viola-Jones, HOG and DPM

2023-09-21 11:35:55 +02:00 · 2023-09-21 11:35:55 +02:00 · 5917923de3
commit 5917923de3
parent 40422aee74
3 changed files with 229 additions and 3 deletions
--- a/thesis/references.bib
+++ b/thesis/references.bib
@ -149,6 +149,21 @@
  keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning}
 }
@inproceedings{dalal2005,
  title = {Histograms of Oriented Gradients for Human Detection},
  booktitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
  author = {Dalal, N. and Triggs, B.},
  date = {2005-06},
  volume = {1},
  pages = {886--893},
  issn = {1063-6919},
  doi = {10.1109/CVPR.2005.177},
  abstract = {We study the question of feature sets for robust visual object recognition; adopting linear SVM based human detection as a test case. After reviewing existing edge and gradient based descriptors, we show experimentally that grids of histograms of oriented gradient (HOG) descriptors significantly outperform existing feature sets for human detection. We study the influence of each stage of the computation on performance, concluding that fine-scale gradients, fine orientation binning, relatively coarse spatial binning, and high-quality local contrast normalization in overlapping descriptor blocks are all important for good results. The new approach gives near-perfect separation on the original MIT pedestrian database, so we introduce a more challenging dataset containing over 1800 annotated human images with a large range of pose variations and backgrounds.},
  eventtitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
  keywords = {High performance computing,Histograms,Humans,Image databases,Image edge detection,Object detection,Object recognition,Robustness,Support vector machines,Testing},
  file = {/home/zenon/Zotero/storage/EJFMAW4Z/Dalal and Triggs - 2005 - Histograms of oriented gradients for human detecti.pdf;/home/zenon/Zotero/storage/G6CK9G7D/1467360.html}
 }
@article{davis1992,
  title = {Operational Prototyping: A New Development Approach},
  shorttitle = {Operational Prototyping},
@ -197,6 +212,65 @@
  file = {/home/zenon/Zotero/storage/FCRT6NYG/Everingham et al. - 2010 - The Pascal Visual Object Classes (VOC) Challenge.pdf}
 }
% Single record for the DPM paper. The former duplicate key `felzenszwalb2008`
% is kept as an alias via `ids` so that citations using either key still resolve.
@inproceedings{felzenszwalb2008a,
  ids = {felzenszwalb2008},
  title = {A Discriminatively Trained, Multiscale, Deformable Part Model},
  booktitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
  author = {Felzenszwalb, Pedro and McAllester, David and Ramanan, Deva},
  date = {2008-06},
  pages = {1--8},
  issn = {1063-6919},
  doi = {10.1109/CVPR.2008.4587597},
  abstract = {This paper describes a discriminatively trained, multiscale, deformable part model for object detection. Our system achieves a two-fold improvement in average precision over the best performance in the 2006 PASCAL person detection challenge. It also outperforms the best results in the 2007 challenge in ten out of twenty categories. The system relies heavily on deformable parts. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL challenge. Our system also relies heavily on new methods for discriminative training. We combine a margin-sensitive approach for data mining hard negative examples with a formalism we call latent SVM. A latent SVM, like a hidden CRF, leads to a non-convex training problem. However, a latent SVM is semi-convex and the training problem becomes convex once latent information is specified for the positive examples. We believe that our training methods will eventually make possible the effective use of more latent information such as hierarchical (grammar) models and models involving latent three dimensional pose.},
  eventtitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
  keywords = {Computer vision,Costs,Data mining,Deformable models,Filters,Histograms,Object detection,Spatial resolution,Support vector machine classification,Support vector machines},
  file = {/home/zenon/Zotero/storage/5NMZ5V8B/Felzenszwalb et al. - 2008 - A discriminatively trained, multiscale, deformable.pdf;/home/zenon/Zotero/storage/3P3CRTV7/4587597.html;/home/zenon/Zotero/storage/Q4LTEZL7/4587597.html}
 }
@inproceedings{freund1995,
  title = {A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting},
  booktitle = {Computational {{Learning Theory}}},
  author = {Freund, Yoav and Schapire, Robert E.},
  editor = {Vitányi, Paul},
  date = {1995},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  pages = {23--37},
  publisher = {{Springer}},
  location = {{Berlin, Heidelberg}},
  doi = {10.1007/3-540-59119-2_166},
  abstract = {We consider the problem of dynamically apportioning resources among a set of options in a worst-case on-line framework. The model we study can be interpreted as a broad, abstract extension of the well-studied on-line prediction model to a general decision-theoretic setting. We show that the multiplicative weight-update rule of Littlestone and Warmuth [10] can be adapted to this mode yielding bounds that are slightly weaker in some cases, but applicable to a considerably more general class of learning problems. We show how the resulting learning algorithm can be applied to a variety of problems, including gambling, multiple-outcome prediction, repeated games and prediction of points in ℝn. We also show how the weight-update rule can be used to derive a new boosting algorithm which does not require prior knowledge about the performance of the weak learning algorithm.},
  isbn = {978-3-540-49195-8},
  langid = {english},
  keywords = {Algorithm AdaBoost,Cumulative Loss,Final Hypothesis,Loss Function,Weak Hypothesis}
 }
@inproceedings{girshick2015,
  author     = {Girshick, Ross and Iandola, Forrest and Darrell, Trevor and Malik, Jitendra},
  title      = {Deformable Part Models Are Convolutional Neural Networks},
  booktitle  = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  date       = {2015-06},
  pages      = {437--446},
  issn       = {1063-6919},
  doi        = {10.1109/CVPR.2015.7298641},
  keywords   = {Convolution,Detectors,Feature extraction,Geometry,Inference algorithms,Object detection,Transforms},
  abstract   = {Deformable part models (DPMs) and convolutional neural networks (CNNs) are two widely used tools for visual recognition. They are typically viewed as distinct approaches: DPMs are graphical models (Markov random fields), while CNNs are “black-box” non-linear classifiers. In this paper, we show that a DPM can be formulated as a CNN, thus providing a synthesis of the two ideas. Our construction involves unrolling the DPM inference algorithm and mapping each step to an equivalent CNN layer. From this perspective, it is natural to replace the standard image features used in DPMs with a learned feature extractor. We call the resulting model a DeepPyramid DPM and experimentally validate it on PASCAL VOC object detection. We find that DeepPyramid DPMs significantly outperform DPMs based on histograms of oriented gradients features (HOG) and slightly outperforms a comparable version of the recently introduced R-CNN detection system, while running significantly faster.},
  file       = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html}
 }
@inproceedings{he2016,
  title = {Deep {{Residual Learning}} for {{Image Recognition}}},
  booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@ -454,6 +528,34 @@
  file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
 }
@inproceedings{viola2001,
  title = {Rapid Object Detection Using a Boosted Cascade of Simple Features},
  booktitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
  author = {Viola, P. and Jones, M.},
  date = {2001-12},
  volume = {1},
  pages = {I-511--I-518},
  issn = {1063-6919},
  doi = {10.1109/CVPR.2001.990517},
  abstract = {This paper describes a machine learning approach for visual object detection which is capable of processing images extremely rapidly and achieving high detection rates. This work is distinguished by three key contributions. The first is the introduction of a new image representation called the "integral image" which allows the features used by our detector to be computed very quickly. The second is a learning algorithm, based on AdaBoost, which selects a small number of critical visual features from a larger set and yields extremely efficient classifiers. The third contribution is a method for combining increasingly more complex classifiers in a "cascade" which allows background regions of the image to be quickly discarded while spending more computation on promising object-like regions. The cascade can be viewed as an object specific focus-of-attention mechanism which unlike previous approaches provides statistical guarantees that discarded regions are unlikely to contain the object of interest. In the domain of face detection the system yields detection rates comparable to the best previous systems. Used in real-time applications, the detector runs at 15 frames per second without resorting to image differencing or skin color detection.},
  eventtitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
  keywords = {Detectors,Face detection,Filters,Focusing,Image representation,Machine learning,Object detection,Pixel,Robustness,Skin},
  file = {/home/zenon/Zotero/storage/7EMWJGGB/Viola and Jones - 2001 - Rapid object detection using a boosted cascade of .pdf;/home/zenon/Zotero/storage/PT4TV455/990517.html}
 }
@inproceedings{viola2001a,
  title = {Robust Real-Time Face Detection},
  booktitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
  author = {Viola, P. and Jones, M.},
  date = {2001-07},
  volume = {2},
  pages = {747},
  doi = {10.1109/ICCV.2001.937709},
  eventtitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
  keywords = {Boosting,Color,Detectors,Face detection,Information resources,Laboratories,Object detection,Pixel,Robustness,Video sequences},
  file = {/home/zenon/Zotero/storage/MX2PJDWC/Viola and Jones - 2001 - Robust real-time face detection.pdf;/home/zenon/Zotero/storage/NCMDRQ53/937709.html}
 }
@article{virnodkar2020,
  title = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}: {{A Critical Review}}},
  shorttitle = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}},
@ -547,3 +649,20 @@
  doi = {10.1016/j.compag.2017.06.022},
  keywords = {Early maize,Feature extraction,Gradient boosting decision tree,Image segmentation,Water stress}
 }
@article{zou2023,
  title = {Object {{Detection}} in 20 {{Years}}: {{A Survey}}},
  shorttitle = {Object {{Detection}} in 20 {{Years}}},
  author = {Zou, Zhengxia and Chen, Keyan and Shi, Zhenwei and Guo, Yuhong and Ye, Jieping},
  date = {2023-03},
  journaltitle = {Proceedings of the {{IEEE}}},
  volume = {111},
  number = {3},
  pages = {257--276},
  issn = {1558-2256},
  doi = {10.1109/JPROC.2023.3238524},
  abstract = {Object detection, as of one the most fundamental and challenging problems in computer vision, has received great attention in recent years. Over the past two decades, we have seen a rapid technological evolution of object detection and its profound impact on the entire computer vision field. If we consider today’s object detection technique as a revolution driven by deep learning, then, back in the 1990s, we would see the ingenious thinking and long-term perspective design of early computer vision. This article extensively reviews this fast-moving research field in the light of technical evolution, spanning over a quarter-century’s time (from the 1990s to 2022). A number of topics have been covered in this article, including the milestone detectors in history, detection datasets, metrics, fundamental building blocks of the detection system, speedup techniques, and recent state-of-the-art detection methods.},
  keywords = {Computer vision,Convolutional neural networks,convolutional neural networks (CNNs),deep learning,Deep learning,Detectors,Feature extraction,object detection,Object detection,technical evolution},
  file = {/home/zenon/Zotero/storage/TFBCMNKC/Zou et al. - 2023 - Object Detection in 20 Years A Survey.pdf;/home/zenon/Zotero/storage/A5ENIFX3/10028728.html}
 }
--- a/thesis/thesis.pdf
+++ b/thesis/thesis.pdf
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@ -90,6 +90,13 @@
 \newacronym{sbc}{SBC}{single-board computer}
 \newacronym{api}{API}{Application Programming Interface}
 \newacronym{rest}{REST}{Representational State Transfer}
 \newacronym{dl}{DL}{Deep Learning}
 \newacronym{gpu}{GPU}{Graphics Processing Unit}
 \newacronym{tpu}{TPU}{Tensor Processing Unit}
 \newacronym{hog}{HOG}{Histogram of Oriented Gradients}
 \newacronym{sift}{SIFT}{Scale-Invariant Feature Transform}
 \newacronym{svm}{SVM}{Support Vector Machine}
 \newacronym{dpm}{DPM}{Deformable Part-Based Model}
 \begin{document}
@ -182,7 +189,7 @@ by gardeners to survey plants and recommend watering or not. To this
 end, a machine learning model will be trained to first identify the
 plants in the field of view and then to determine if the plants need
 water or not. The model should be suitable for edge devices equipped
-with a TPU or GPU but with otherwise limited processing
+with a \gls{tpu} or \gls{gpu} but with otherwise limited processing
 capabilities. Examples of such systems include Google's Coral
 development board and the Nvidia Jetson series of~\glspl{sbc}. The
 model should make use of state-of-the-art algorithms from either
@ -414,6 +421,104 @@ YOLO and SSDnet.
 Estimated 8 pages for this section.
 From facial detection to fully automated driving—object detection
 provides the basis for a wide variety of tasks within the computer
 vision world. While most implementations in the 1990s and early 2000s
 relied on cumbersome manual feature extraction, current methods almost
 exclusively leverage a deep learning based approach. This chapter
 gives an introduction to object detection, explains common problems
 researchers have faced and how they have been solved, and discusses
 the two main approaches to object detection via deep learning.
 \subsection{Definition}
 \label{ssec:obj-definition}
 \subsection{Traditional Methods}
 \label{ssec:obj-traditional}
 Before the advent of powerful \glspl{gpu}, object detection was
 commonly done by manually extracting features from images and passing
 these features on to a classical machine learning algorithm. Early
 methods were generally far from being able to detect objects in real
 time.
 \subsubsection{Viola-Jones Detector}
 \label{sssec:obj-viola-jones}
 The first milestone was the face detector
 by~\textcite{viola2001,viola2001a}, which is able to perform face
 detection on $384$ by $288$ pixel (grayscale) images at
 \qty{15}{fps} on a \qty{700}{\MHz} Intel Pentium III processor. The
 authors use an integral image representation where every pixel is the
 summation of the pixels above and to the left of it. This
 representation allows them to quickly and efficiently calculate
 Haar-like features.
 The Haar-like features are passed to a modified AdaBoost
 algorithm~\cite{freund1995} which only selects the (presumably) most
 important features. At the end there is a cascading stage of
 classifiers where regions are only considered further if they are
 promising. Every additional classifier adds complexity, but once a
 classifier rejects a sub-window, the processing stops and the
 algorithm moves on to the next window. Despite their final structure
 containing 32 classifiers, the sliding-window approach is fast and
 achieves comparable results to the state of the art in 2001.
 \subsubsection{HOG Detector}
 \label{sssec:obj-hog}
 The \gls{hog}~\cite{dalal2005} is a feature descriptor used in
 computer vision and image processing to detect objects in images.
 Like other methods such as \gls{sift}, it describes objects by their
 shape. The idea is to use the distribution of local intensity
 gradients or edge directions to describe an object. To this end, the
 authors divide the image into a grid of cells and calculate a
 histogram of edge orientations within each cell. Additionally, each
 histogram is normalized by taking a larger region and adjusting the
 local histograms based on the larger region's intensity levels. The
 resulting blocks of normalized gradients are evenly spaced out across
 the image with some overlap. These patches are then passed as a
 feature vector to a classifier.
 \textcite{dalal2005} successfully use the \gls{hog} with a linear
 \gls{svm} for classification to detect humans in images. They work
 with images of 64 by 128 pixels and make sure that the image contains
 a margin of 16 pixels around the person. Decreasing the border by
 either enlarging the person or reducing the overall image size results
 in worse performance. Unfortunately, their method is far from being
 able to process images in real time—a 320 by 240 image takes roughly a
 second to process.
 \subsubsection{Deformable Part-Based Model}
 \label{sssec:obj-dpm}
 \glspl{dpm}~\cite{felzenszwalb2008a} were the winners of the
 \gls{pascal-voc} challenge in the years 2007, 2008 and 2009. The
 method is heavily based on the previously discussed \gls{hog} since it
 also uses \gls{hog} descriptors internally. The authors' addition is
 the idea of learning how to decompose objects during training and
 classifying/detecting the decomposed parts during inference. The
 \gls{hog} descriptors are computed on different scales to form a
 \gls{hog} feature pyramid. Coarse features are more easily identified
 at the top of the pyramid while details are present at the lower end
 of the pyramid. The coarse features are obtained by calculating the
 histograms over fairly large areas, whereas smaller image patches are
 used for the detailed levels. A root filter works on the coarse levels
 by detecting general features of the object of interest. If the goal
 is to detect a face, for example, the root filter detects the contours
 of the face. Smaller part filters provide additional information about
 the individual parts of the object. For the face example, these
 filters capture information about the eyes, mouth and nose.
 The idea of detecting detail at different scales is not unlike what
 happens with the later \glspl{cnn}. A \gls{cnn} typically captures
 low-level features such as edges in its earlier layers and
 increasingly abstract, higher-level features as the network
 increases in depth. \textcite{girshick2015} argue that \glspl{dpm}
 \emph{are} in fact \glspl{cnn} because they can be formulated as
 \glspl{cnn} by unrolling each step of the algorithm into a
 corresponding \gls{cnn} layer.
 \section{Classification}
 \label{sec:background-classification}
@ -891,8 +996,8 @@ The object detection model was pre-trained on the COCO~\cite{lin2015}
 dataset and fine-tuned with data from the \gls{oid}
 \cite{kuznetsova2020} in its sixth version. Since the full \gls{oid}
 dataset contains considerably more classes and samples than would be
-feasibly trainable on a small cluster of GPUs, only images from the
+feasibly trainable on a small cluster of \glspl{gpu}, only images from
-two classes \emph{Plant} and \emph{Houseplant} have been
+the two classes \emph{Plant} and \emph{Houseplant} have been
 downloaded. The samples from the Houseplant class are merged into the
 Plant class because the distinction between the two is not necessary
 for our model. Furthermore, the \gls{oid} contains not only bounding
@ -1547,4 +1652,6 @@ Estimated 1 page for this section
 %%% TeX-master: t
 %%% TeX-master: t
 %%% TeX-master: t
 %%% TeX-master: t
 %%% TeX-master: t
 %%% End: