Add Viola-Jones, HOG and DPM
This commit is contained in:
parent
40422aee74
commit
5917923de3
@ -149,6 +149,21 @@
|
|||||||
keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning}
|
keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{dalal2005,
|
||||||
|
title = {Histograms of Oriented Gradients for Human Detection},
|
||||||
|
booktitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
|
||||||
|
author = {Dalal, Navneet and Triggs, Bill},
|
||||||
|
date = {2005-06},
|
||||||
|
volume = {1},
|
||||||
|
pages = {886--893},
|
||||||
|
issn = {1063-6919},
|
||||||
|
doi = {10.1109/CVPR.2005.177},
|
||||||
|
abstract = {We study the question of feature sets for robust visual object recognition; adopting linear SVM based human detection as a test case. After reviewing existing edge and gradient based descriptors, we show experimentally that grids of histograms of oriented gradient (HOG) descriptors significantly outperform existing feature sets for human detection. We study the influence of each stage of the computation on performance, concluding that fine-scale gradients, fine orientation binning, relatively coarse spatial binning, and high-quality local contrast normalization in overlapping descriptor blocks are all important for good results. The new approach gives near-perfect separation on the original MIT pedestrian database, so we introduce a more challenging dataset containing over 1800 annotated human images with a large range of pose variations and backgrounds.},
|
||||||
|
eventtitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
|
||||||
|
keywords = {High performance computing,Histograms,Humans,Image databases,Image edge detection,Object detection,Object recognition,Robustness,Support vector machines,Testing},
|
||||||
|
file = {/home/zenon/Zotero/storage/EJFMAW4Z/Dalal and Triggs - 2005 - Histograms of oriented gradients for human detecti.pdf;/home/zenon/Zotero/storage/G6CK9G7D/1467360.html}
|
||||||
|
}
|
||||||
|
|
||||||
@article{davis1992,
|
@article{davis1992,
|
||||||
title = {Operational Prototyping: A New Development Approach},
|
title = {Operational Prototyping: A New Development Approach},
|
||||||
shorttitle = {Operational Prototyping},
|
shorttitle = {Operational Prototyping},
|
||||||
@ -197,6 +212,65 @@
|
|||||||
file = {/home/zenon/Zotero/storage/FCRT6NYG/Everingham et al. - 2010 - The Pascal Visual Object Classes (VOC) Challenge.pdf}
|
file = {/home/zenon/Zotero/storage/FCRT6NYG/Everingham et al. - 2010 - The Pascal Visual Object Classes (VOC) Challenge.pdf}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{felzenszwalb2008,
|
||||||
|
title = {A Discriminatively Trained, Multiscale, Deformable Part Model},
|
||||||
|
booktitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
|
||||||
|
author = {Felzenszwalb, Pedro and McAllester, David and Ramanan, Deva},
|
||||||
|
date = {2008-06},
|
||||||
|
pages = {1--8},
|
||||||
|
issn = {1063-6919},
|
||||||
|
doi = {10.1109/CVPR.2008.4587597},
|
||||||
|
abstract = {This paper describes a discriminatively trained, multiscale, deformable part model for object detection. Our system achieves a two-fold improvement in average precision over the best performance in the 2006 PASCAL person detection challenge. It also outperforms the best results in the 2007 challenge in ten out of twenty categories. The system relies heavily on deformable parts. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL challenge. Our system also relies heavily on new methods for discriminative training. We combine a margin-sensitive approach for data mining hard negative examples with a formalism we call latent SVM. A latent SVM, like a hidden CRF, leads to a non-convex training problem. However, a latent SVM is semi-convex and the training problem becomes convex once latent information is specified for the positive examples. We believe that our training methods will eventually make possible the effective use of more latent information such as hierarchical (grammar) models and models involving latent three dimensional pose.},
|
||||||
|
eventtitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
|
||||||
|
keywords = {Computer vision,Costs,Data mining,Deformable models,Filters,Histograms,Object detection,Spatial resolution,Support vector machine classification,Support vector machines},
|
||||||
|
file = {/home/zenon/Zotero/storage/Q4LTEZL7/4587597.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{felzenszwalb2008a,
|
||||||
|
title = {A Discriminatively Trained, Multiscale, Deformable Part Model},
|
||||||
|
booktitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
|
||||||
|
author = {Felzenszwalb, Pedro and McAllester, David and Ramanan, Deva},
|
||||||
|
date = {2008-06},
|
||||||
|
pages = {1--8},
|
||||||
|
issn = {1063-6919},
|
||||||
|
doi = {10.1109/CVPR.2008.4587597},
|
||||||
|
abstract = {This paper describes a discriminatively trained, multiscale, deformable part model for object detection. Our system achieves a two-fold improvement in average precision over the best performance in the 2006 PASCAL person detection challenge. It also outperforms the best results in the 2007 challenge in ten out of twenty categories. The system relies heavily on deformable parts. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL challenge. Our system also relies heavily on new methods for discriminative training. We combine a margin-sensitive approach for data mining hard negative examples with a formalism we call latent SVM. A latent SVM, like a hidden CRF, leads to a non-convex training problem. However, a latent SVM is semi-convex and the training problem becomes convex once latent information is specified for the positive examples. We believe that our training methods will eventually make possible the effective use of more latent information such as hierarchical (grammar) models and models involving latent three dimensional pose.},
|
||||||
|
eventtitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
|
||||||
|
keywords = {Computer vision,Costs,Data mining,Deformable models,Filters,Histograms,Object detection,Spatial resolution,Support vector machine classification,Support vector machines},
|
||||||
|
file = {/home/zenon/Zotero/storage/5NMZ5V8B/Felzenszwalb et al. - 2008 - A discriminatively trained, multiscale, deformable.pdf;/home/zenon/Zotero/storage/3P3CRTV7/4587597.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{freund1995,
|
||||||
|
title = {A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting},
|
||||||
|
booktitle = {Computational {{Learning Theory}}},
|
||||||
|
author = {Freund, Yoav and Schapire, Robert E.},
|
||||||
|
editor = {Vitányi, Paul},
|
||||||
|
date = {1995},
|
||||||
|
series = {Lecture {{Notes}} in {{Computer Science}}},
|
||||||
|
pages = {23--37},
|
||||||
|
publisher = {{Springer}},
|
||||||
|
location = {{Berlin, Heidelberg}},
|
||||||
|
doi = {10.1007/3-540-59119-2_166},
|
||||||
|
abstract = {We consider the problem of dynamically apportioning resources among a set of options in a worst-case on-line framework. The model we study can be interpreted as a broad, abstract extension of the well-studied on-line prediction model to a general decision-theoretic setting. We show that the multiplicative weight-update rule of Littlestone and Warmuth [10] can be adapted to this model yielding bounds that are slightly weaker in some cases, but applicable to a considerably more general class of learning problems. We show how the resulting learning algorithm can be applied to a variety of problems, including gambling, multiple-outcome prediction, repeated games and prediction of points in $\mathbb{R}^n$. We also show how the weight-update rule can be used to derive a new boosting algorithm which does not require prior knowledge about the performance of the weak learning algorithm.},
|
||||||
|
isbn = {978-3-540-49195-8},
|
||||||
|
langid = {english},
|
||||||
|
keywords = {Algorithm AdaBoost,Cumulative Loss,Final Hypothesis,Loss Function,Weak Hypothesis}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{girshick2015,
|
||||||
|
title = {Deformable Part Models Are Convolutional Neural Networks},
|
||||||
|
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
|
||||||
|
author = {Girshick, Ross and Iandola, Forrest and Darrell, Trevor and Malik, Jitendra},
|
||||||
|
date = {2015-06},
|
||||||
|
pages = {437--446},
|
||||||
|
issn = {1063-6919},
|
||||||
|
doi = {10.1109/CVPR.2015.7298641},
|
||||||
|
abstract = {Deformable part models (DPMs) and convolutional neural networks (CNNs) are two widely used tools for visual recognition. They are typically viewed as distinct approaches: DPMs are graphical models (Markov random fields), while CNNs are “black-box” non-linear classifiers. In this paper, we show that a DPM can be formulated as a CNN, thus providing a synthesis of the two ideas. Our construction involves unrolling the DPM inference algorithm and mapping each step to an equivalent CNN layer. From this perspective, it is natural to replace the standard image features used in DPMs with a learned feature extractor. We call the resulting model a DeepPyramid DPM and experimentally validate it on PASCAL VOC object detection. We find that DeepPyramid DPMs significantly outperform DPMs based on histograms of oriented gradients features (HOG) and slightly outperforms a comparable version of the recently introduced R-CNN detection system, while running significantly faster.},
|
||||||
|
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
|
||||||
|
keywords = {Convolution,Detectors,Feature extraction,Geometry,Inference algorithms,Object detection,Transforms},
|
||||||
|
file = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html}
|
||||||
|
}
|
||||||
|
|
||||||
@inproceedings{he2016,
|
@inproceedings{he2016,
|
||||||
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
|
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
|
||||||
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
|
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
|
||||||
@ -454,6 +528,34 @@
|
|||||||
file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
|
file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{viola2001,
|
||||||
|
title = {Rapid Object Detection Using a Boosted Cascade of Simple Features},
|
||||||
|
booktitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
|
||||||
|
author = {Viola, P. and Jones, M.},
|
||||||
|
date = {2001-12},
|
||||||
|
volume = {1},
|
||||||
|
pages = {{I-511}--{I-518}},
|
||||||
|
issn = {1063-6919},
|
||||||
|
doi = {10.1109/CVPR.2001.990517},
|
||||||
|
abstract = {This paper describes a machine learning approach for visual object detection which is capable of processing images extremely rapidly and achieving high detection rates. This work is distinguished by three key contributions. The first is the introduction of a new image representation called the "integral image" which allows the features used by our detector to be computed very quickly. The second is a learning algorithm, based on AdaBoost, which selects a small number of critical visual features from a larger set and yields extremely efficient classifiers. The third contribution is a method for combining increasingly more complex classifiers in a "cascade" which allows background regions of the image to be quickly discarded while spending more computation on promising object-like regions. The cascade can be viewed as an object specific focus-of-attention mechanism which unlike previous approaches provides statistical guarantees that discarded regions are unlikely to contain the object of interest. In the domain of face detection the system yields detection rates comparable to the best previous systems. Used in real-time applications, the detector runs at 15 frames per second without resorting to image differencing or skin color detection.},
|
||||||
|
eventtitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
|
||||||
|
keywords = {Detectors,Face detection,Filters,Focusing,Image representation,Machine learning,Object detection,Pixel,Robustness,Skin},
|
||||||
|
file = {/home/zenon/Zotero/storage/7EMWJGGB/Viola and Jones - 2001 - Rapid object detection using a boosted cascade of .pdf;/home/zenon/Zotero/storage/PT4TV455/990517.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{viola2001a,
|
||||||
|
title = {Robust Real-Time Face Detection},
|
||||||
|
booktitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
|
||||||
|
author = {Viola, P. and Jones, M.},
|
||||||
|
date = {2001-07},
|
||||||
|
volume = {2},
|
||||||
|
pages = {747--747},
|
||||||
|
doi = {10.1109/ICCV.2001.937709},
|
||||||
|
eventtitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
|
||||||
|
keywords = {Boosting,Color,Detectors,Face detection,Information resources,Laboratories,Object detection,Pixel,Robustness,Video sequences},
|
||||||
|
file = {/home/zenon/Zotero/storage/MX2PJDWC/Viola and Jones - 2001 - Robust real-time face detection.pdf;/home/zenon/Zotero/storage/NCMDRQ53/937709.html}
|
||||||
|
}
|
||||||
|
|
||||||
@article{virnodkar2020,
|
@article{virnodkar2020,
|
||||||
title = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}: {{A Critical Review}}},
|
title = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}: {{A Critical Review}}},
|
||||||
shorttitle = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}},
|
shorttitle = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}},
|
||||||
@ -547,3 +649,20 @@
|
|||||||
doi = {10.1016/j.compag.2017.06.022},
|
doi = {10.1016/j.compag.2017.06.022},
|
||||||
keywords = {Early maize,Feature extraction,Gradient boosting decision tree,Image segmentation,Water stress}
|
keywords = {Early maize,Feature extraction,Gradient boosting decision tree,Image segmentation,Water stress}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@article{zou2023,
|
||||||
|
title = {Object {{Detection}} in 20 {{Years}}: {{A Survey}}},
|
||||||
|
shorttitle = {Object {{Detection}} in 20 {{Years}}},
|
||||||
|
author = {Zou, Zhengxia and Chen, Keyan and Shi, Zhenwei and Guo, Yuhong and Ye, Jieping},
|
||||||
|
date = {2023-03},
|
||||||
|
journaltitle = {Proceedings of the IEEE},
|
||||||
|
volume = {111},
|
||||||
|
number = {3},
|
||||||
|
pages = {257--276},
|
||||||
|
issn = {1558-2256},
|
||||||
|
doi = {10.1109/JPROC.2023.3238524},
|
||||||
|
abstract = {Object detection, as one of the most fundamental and challenging problems in computer vision, has received great attention in recent years. Over the past two decades, we have seen a rapid technological evolution of object detection and its profound impact on the entire computer vision field. If we consider today’s object detection technique as a revolution driven by deep learning, then, back in the 1990s, we would see the ingenious thinking and long-term perspective design of early computer vision. This article extensively reviews this fast-moving research field in the light of technical evolution, spanning over a quarter-century’s time (from the 1990s to 2022). A number of topics have been covered in this article, including the milestone detectors in history, detection datasets, metrics, fundamental building blocks of the detection system, speedup techniques, and recent state-of-the-art detection methods.},
|
||||||
|
eventtitle = {Proceedings of the {{IEEE}}},
|
||||||
|
keywords = {Computer vision,Convolutional neural networks,convolutional neural networks (CNNs),deep learning,Deep learning,Detectors,Feature extraction,object detection,Object detection,technical evolution},
|
||||||
|
file = {/home/zenon/Zotero/storage/TFBCMNKC/Zou et al. - 2023 - Object Detection in 20 Years A Survey.pdf;/home/zenon/Zotero/storage/A5ENIFX3/10028728.html}
|
||||||
|
}
|
||||||
|
|||||||
Binary file not shown.
@ -90,6 +90,13 @@
|
|||||||
\newacronym{sbc}{SBC}{single-board computer}
|
\newacronym{sbc}{SBC}{single-board computer}
|
||||||
\newacronym{api}{API}{Application Programming Interface}
|
\newacronym{api}{API}{Application Programming Interface}
|
||||||
\newacronym{rest}{REST}{Representational State Transfer}
|
\newacronym{rest}{REST}{Representational State Transfer}
|
||||||
|
\newacronym{dl}{DL}{Deep Learning}
|
||||||
|
\newacronym{gpu}{GPU}{Graphics Processing Unit}
|
||||||
|
\newacronym{tpu}{TPU}{Tensor Processing Unit}
|
||||||
|
\newacronym{hog}{HOG}{Histogram of Oriented Gradients}
|
||||||
|
\newacronym{sift}{SIFT}{Scale-Invariant Feature Transform}
|
||||||
|
\newacronym{svm}{SVM}{Support Vector Machine}
|
||||||
|
\newacronym{dpm}{DPM}{Deformable Part-Based Model}
|
||||||
|
|
||||||
\begin{document}
|
\begin{document}
|
||||||
|
|
||||||
@ -182,7 +189,7 @@ by gardeners to survey plants and recommend watering or not. To this
|
|||||||
end, a machine learning model will be trained to first identify the
|
end, a machine learning model will be trained to first identify the
|
||||||
plants in the field of view and then to determine if the plants need
|
plants in the field of view and then to determine if the plants need
|
||||||
water or not. The model should be suitable for edge devices equipped
|
water or not. The model should be suitable for edge devices equipped
|
||||||
with a TPU or GPU but with otherwise limited processing
|
with a \gls{tpu} or \gls{gpu} but with otherwise limited processing
|
||||||
capabilities. Examples of such systems include Google's Coral
|
capabilities. Examples of such systems include Google's Coral
|
||||||
development board and the Nvidia Jetson series of~\glspl{sbc}. The
|
development board and the Nvidia Jetson series of~\glspl{sbc}. The
|
||||||
model should make use of state-of-the-art algorithms from either
|
model should make use of state-of-the-art algorithms from either
|
||||||
@ -414,6 +421,104 @@ YOLO and SSDnet.
|
|||||||
|
|
||||||
Estimated 8 pages for this section.
|
Estimated 8 pages for this section.
|
||||||
|
|
||||||
|
From facial detection to fully automated driving—object detection
|
||||||
|
provides the basis for a wide variety of tasks within the computer
|
||||||
|
vision world. While most implementations in the 1990s and early 2000s
|
||||||
|
relied on cumbersome manual feature extraction, current methods almost
|
||||||
|
exclusively leverage a deep learning based approach. This chapter
|
||||||
|
gives an introduction to object detection, explains common problems
|
||||||
|
researchers have faced and how they have been solved, and discusses
|
||||||
|
the two main approaches to object detection via deep learning.
|
||||||
|
|
||||||
|
\subsection{Definition}
|
||||||
|
\label{ssec:obj-definition}
|
||||||
|
|
||||||
|
\subsection{Traditional Methods}
|
||||||
|
\label{ssec:obj-traditional}
|
||||||
|
|
||||||
|
Before the advent of powerful \glspl{gpu}, object detection was
|
||||||
|
commonly done by manually extracting features from images and passing
|
||||||
|
these features on to a classical machine learning algorithm. Early
|
||||||
|
methods were generally far from being able to detect objects in real
|
||||||
|
time.
|
||||||
|
|
||||||
|
\subsubsection{Viola-Jones Detector}
|
||||||
|
\label{sssec:obj-viola-jones}
|
||||||
|
|
||||||
|
The first milestone was the face detector by
|
||||||
|
\textcite{viola2001,viola2001a} which is able to perform face
|
||||||
|
detection on $384$ by $288$ pixel (grayscale) images with
|
||||||
|
\qty{15}{fps} on a \qty{700}{\MHz} Intel Pentium III processor. The
|
||||||
|
authors use an integral image representation where every pixel is the
|
||||||
|
summation of the pixels above and to the left of it. This
|
||||||
|
representation allows them to quickly and efficiently calculate
|
||||||
|
Haar-like features.
|
||||||
|
|
||||||
|
The Haar-like features are passed to a modified AdaBoost
|
||||||
|
algorithm~\cite{freund1995} which only selects the (presumably) most
|
||||||
|
important features. At the end there is a cascading stage of
|
||||||
|
classifiers where regions are only considered further if they are
|
||||||
|
promising. Every additional classifier adds complexity, but once a
|
||||||
|
classifier rejects a sub-window, the processing stops and the
|
||||||
|
algorithm moves on to the next window. Despite their final structure
|
||||||
|
containing 32 classifiers, the sliding-window approach is fast and
|
||||||
|
achieves comparable results to the state of the art in 2001.
|
||||||
|
|
||||||
|
\subsubsection{HOG Detector}
|
||||||
|
\label{sssec:obj-hog}
|
||||||
|
|
||||||
|
The \gls{hog}~\cite{dalal2005} is a feature descriptor used in
|
||||||
|
computer vision and image processing to detect objects in images. It
|
||||||
|
describes local shape, similar to other descriptors such as
|
||||||
|
\gls{sift}. The idea is to use the distribution of local intensity
|
||||||
|
gradients or edge directions to describe an object. To this end, the
|
||||||
|
authors divide the image into a grid of cells and calculate a
|
||||||
|
histogram of edge orientations within each cell. Additionally, each
|
||||||
|
histogram is normalized by taking a larger region and adjusting the
|
||||||
|
local histograms based on the larger region's intensity levels. The
|
||||||
|
resulting blocks of normalized gradients are evenly spaced out across
|
||||||
|
the image with some overlap. These patches are then passed as a
|
||||||
|
feature vector to a classifier.
|
||||||
|
|
||||||
|
\textcite{dalal2005} successfully use the \gls{hog} with a linear
|
||||||
|
\gls{svm} for classification to detect humans in images. They work
|
||||||
|
with images of 64 by 128 pixels and make sure that the image contains
|
||||||
|
a margin of 16 pixels around the person. Decreasing the border by
|
||||||
|
either enlarging the person or reducing the overall image size results
|
||||||
|
in worse performance. Unfortunately, their method is far from being
|
||||||
|
able to process images in real time—a 320 by 240 image takes roughly a
|
||||||
|
second to process.
|
||||||
|
|
||||||
|
\subsubsection{Deformable Part-Based Model}
|
||||||
|
\label{sssec:obj-dpm}
|
||||||
|
|
||||||
|
\glspl{dpm}~\cite{felzenszwalb2008a} were the winners of the
|
||||||
|
\gls{pascal-voc} challenge in the years 2007, 2008 and 2009. The
|
||||||
|
method is heavily based on the previously discussed \gls{hog} since it
|
||||||
|
also uses \gls{hog} descriptors internally. The authors' addition is
|
||||||
|
the idea of learning how to decompose objects during training and
|
||||||
|
classifying/detecting the decomposed parts during inference. The
|
||||||
|
\gls{hog} descriptors are computed on different scales to form a
|
||||||
|
\gls{hog} feature pyramid. Coarse features are more easily identified
|
||||||
|
at the top of the pyramid while details are present at the lower end
|
||||||
|
of the pyramid. The coarse features are obtained by calculating the
|
||||||
|
histograms over fairly large areas, whereas smaller image patches are
|
||||||
|
used for the detailed levels. A root filter works on the coarse levels
|
||||||
|
by detecting general features of the object of interest. If the goal
|
||||||
|
is to detect a face, for example, the root filter detects the contours
|
||||||
|
of the face. Smaller part filters provide additional information about
|
||||||
|
the individual parts of the object. For the face example, these
|
||||||
|
filters capture information about the eyes, mouth and nose.
|
||||||
|
|
||||||
|
The idea of detecting detail at different scales is not unlike what
|
||||||
|
happens with the later \glspl{cnn}. The individual layers of a
|
||||||
|
\gls{cnn} often describe lower level features in the earlier layers
|
||||||
|
and provide additional higher level information as the network
|
||||||
|
increases in depth. \textcite{girshick2015} argue that \glspl{dpm}
|
||||||
|
\emph{are} in fact \glspl{cnn} because they can be formulated as
|
||||||
|
\glspl{cnn} by unrolling each step of the algorithm into a
|
||||||
|
corresponding \gls{cnn} layer.
|
||||||
|
|
||||||
\section{Classification}
|
\section{Classification}
|
||||||
\label{sec:background-classification}
|
\label{sec:background-classification}
|
||||||
|
|
||||||
@ -891,8 +996,8 @@ The object detection model was pre-trained on the COCO~\cite{lin2015}
|
|||||||
dataset and fine-tuned with data from the \gls{oid}
|
dataset and fine-tuned with data from the \gls{oid}
|
||||||
\cite{kuznetsova2020} in its sixth version. Since the full \gls{oid}
|
\cite{kuznetsova2020} in its sixth version. Since the full \gls{oid}
|
||||||
dataset contains considerably more classes and samples than would be
|
dataset contains considerably more classes and samples than would be
|
||||||
feasibly trainable on a small cluster of GPUs, only images from the
|
feasibly trainable on a small cluster of \glspl{gpu}, only images from
|
||||||
two classes \emph{Plant} and \emph{Houseplant} have been
|
the two classes \emph{Plant} and \emph{Houseplant} have been
|
||||||
downloaded. The samples from the Houseplant class are merged into the
|
downloaded. The samples from the Houseplant class are merged into the
|
||||||
Plant class because the distinction between the two is not necessary
|
Plant class because the distinction between the two is not necessary
|
||||||
for our model. Furthermore, the \gls{oid} contains not only bounding
|
for our model. Furthermore, the \gls{oid} contains not only bounding
|
||||||
@ -1547,4 +1652,6 @@ Estimated 1 page for this section
|
|||||||
%%% TeX-master: t
|
%%% TeX-master: t
|
||||||
%%% TeX-master: t
|
%%% TeX-master: t
|
||||||
%%% TeX-master: t
|
%%% TeX-master: t
|
||||||
|
%%% TeX-master: t
|
||||||
|
%%% TeX-master: t
|
||||||
%%% End:
|
%%% End:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user