diff --git a/thesis/references.bib b/thesis/references.bib
index 2679240..ccc6853 100644
--- a/thesis/references.bib
+++ b/thesis/references.bib
@@ -110,6 +110,21 @@
   keywords = {deep learning,global optimization,model selection,neural networks,response surface modeling}
 }
 
+@book{bishop2006,
+  title = {Pattern {{Recognition}} and {{Machine Learning}}},
+  author = {Bishop, Christopher M.},
+  date = {2006-08-17},
+  eprint = {qWPwnQEACAAJ},
+  eprinttype = {googlebooks},
+  publisher = {{Springer}},
+  abstract = {This is the first textbook on pattern recognition to present the Bayesian viewpoint. The book presents approximate inference algorithms that permit fast approximate answers in situations where exact answers are not feasible. It uses graphical models to describe probability distributions when no other books apply graphical models to machine learning. No previous knowledge of pattern recognition or machine learning concepts is assumed. Familiarity with multivariate calculus and basic linear algebra is required, and some experience in the use of probabilities would be helpful though not essential as the book includes a self-contained introduction to basic probability theory.},
+  isbn = {978-0-387-31073-2},
+  langid = {english},
+  pagetotal = {738},
+  keywords = {Computers / Computer Graphics,Computers / Computer Vision \& Pattern Recognition,Computers / Intelligence (AI) \& Semantics,Computers / Optical Data Processing,Computers / Software Development \& Engineering / General,Mathematics / Probability \& Statistics / General},
+  file = {/home/zenon/Zotero/storage/VTDMDZPT/Bishop - 2006 - Pattern Recognition and Machine Learning.pdf}
+}
+
 @online{bochkovskiy2020,
   title = {{{YOLOv4}}: {{Optimal Speed}} and {{Accuracy}} of {{Object Detection}}},
   shorttitle = {{{YOLOv4}}},
@@ -135,6 +150,15 @@
   file = {/home/zenon/Zotero/storage/56LE395G/Brown et al. - 2020 - Language Models Are Few-Shot Learners.pdf}
 }
 
+@article{cauchy1847,
+  title = {Méthode générale pour la résolution des systèmes d’équations simultanées},
+  author = {Cauchy, Augustin-Louis},
+  date = {1847-10-18},
+  journaltitle = {Comptes rendus hebdomadaires des séances de l’Académie des sciences},
+  volume = {25},
+  pages = {399--402}
+}
+
 @article{chandel2021,
   title = {Identifying {{Crop Water Stress Using Deep Learning Models}}},
   author = {Chandel, Narendra Singh and Chakraborty, Subir Kumar and Rajwade, Yogesh Anand and Dubey, Kumkum and Tiwari, Mukesh K. and Jat, Dilip},
@@ -240,6 +264,22 @@
   file = {/home/zenon/Zotero/storage/5NMZ5V8B/Felzenszwalb et al. - 2008 - A discriminatively trained, multiscale, deformable.pdf;/home/zenon/Zotero/storage/3P3CRTV7/4587597.html}
 }
 
+@article{felzenszwalb2010,
+  title = {Object {{Detection}} with {{Discriminatively Trained Part-Based Models}}},
+  author = {Felzenszwalb, Pedro F. and Girshick, Ross B. and McAllester, David and Ramanan, Deva},
+  date = {2010-09},
+  journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+  volume = {32},
+  number = {9},
+  pages = {1627--1645},
+  issn = {1939-3539},
+  doi = {10.1109/TPAMI.2009.167},
+  urldate = {2023-10-26},
+  abstract = {We describe an object detection system based on mixtures of multiscale deformable part models. Our system is able to represent highly variable object classes and achieves state-of-the-art results in the PASCAL object detection challenges. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL data sets.
Our system relies on new methods for discriminative training with partially labeled data. We combine a margin-sensitive approach for data-mining hard negative examples with a formalism we call latent SVM. A latent SVM is a reformulation of MI–SVM in terms of latent variables. A latent SVM is semiconvex, and the training problem becomes convex once latent information is specified for the positive examples. This leads to an iterative training algorithm that alternates between fixing latent values for positive examples and optimizing the latent SVM objective function.}, + eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}}, + file = {/home/zenon/Zotero/storage/P5378A3K/Felzenszwalb et al. - 2010 - Object Detection with Discriminatively Trained Par.pdf;/home/zenon/Zotero/storage/HYLEIZJU/5255236.html} +} + @inproceedings{freund1995, title = {A Desicion-Theoretic Generalization of on-Line Learning and an Application to Boosting}, booktitle = {Computational {{Learning Theory}}}, @@ -305,6 +345,28 @@ file = {/home/zenon/Zotero/storage/B9KGZ7N2/Ge et al. - 2021 - YOLOX Exceeding YOLO Series in 2021.pdf;/home/zenon/Zotero/storage/XQTJLGLZ/2107.html} } +@online{girshick, + title = {Discriminatively {{Trained Deformable Part Models}} ({{Release}} 5)}, + author = {Girshick, Ross B. and Felzenszwalb, Pedro F. and McAllester, David}, + url = {https://web.archive.org/web/20231026094412/https://www.rossgirshick.info/latent/}, + urldate = {2023-10-26}, + file = {/home/zenon/Zotero/storage/HQTS6PW6/latent.html} +} + +@inproceedings{girshick2014, + title = {Rich {{Feature Hierarchies}} for {{Accurate Object Detection}} and {{Semantic Segmentation}}}, + booktitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}}, + author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra}, + date = {2014-06}, + pages = {580--587}, + issn = {1063-6919}, + doi = {10.1109/CVPR.2014.81}, + urldate = {2023-10-22}, + abstract = {Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30\% relative to the previous best result on VOC 2012 – achieving a mAP of 53.3\%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also present experiments that provide insight into what the network learns, revealing a rich hierarchy of image features. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.}, + eventtitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}}, + file = {/home/zenon/Zotero/storage/EL92YEYD/Girshick et al. 
- 2014 - Rich Feature Hierarchies for Accurate Object Detec.pdf;/home/zenon/Zotero/storage/TX9APXST/6909475.html} +} + @inproceedings{girshick2015, title = {Deformable Part Models Are Convolutional Neural Networks}, booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, @@ -319,6 +381,36 @@ file = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html} } +@inproceedings{girshick2015a, + title = {Fast {{R-CNN}}}, + booktitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + author = {Girshick, Ross}, + date = {2015-12}, + pages = {1440--1448}, + issn = {2380-7504}, + doi = {10.1109/ICCV.2015.169}, + urldate = {2023-10-22}, + abstract = {This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.}, + eventtitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + file = {/home/zenon/Zotero/storage/I4Q5NJCT/Girshick - 2015 - Fast R-CNN.pdf;/home/zenon/Zotero/storage/VQZF2I7Z/7410526.html} +} + +@article{girshick2016, + title = {Region-{{Based Convolutional Networks}} for {{Accurate Object Detection}} and {{Segmentation}}}, + author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra}, + date = {2016-01}, + journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + volume = {38}, + number = {1}, + pages = {142--158}, + issn = {1939-3539}, + doi = {10.1109/TPAMI.2015.2437384}, + urldate = {2023-10-22}, + abstract = {Object detection performance, as measured on the canonical PASCAL VOC Challenge datasets, plateaued in the final years of the competition. The best-performing methods were complex ensemble systems that typically combined multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 50 percent relative to the previous best result on VOC 2012-achieving a mAP of 62.4 percent. Our approach combines two ideas: (1) one can apply high-capacity convolutional networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data are scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, boosts performance significantly. Since we combine region proposals with CNNs, we call the resulting model an R-CNN or Region-based Convolutional Network. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.}, + eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}}, + file = {/home/zenon/Zotero/storage/MQPF5MGW/Girshick et al. 
- 2016 - Region-Based Convolutional Networks for Accurate O.pdf;/home/zenon/Zotero/storage/EKC4WHDQ/7112511.html} +} + @book{goodfellow2016, title = {Deep {{Learning}}}, author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron}, @@ -333,6 +425,22 @@ keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science} } +@article{he2015, + title = {Spatial {{Pyramid Pooling}} in {{Deep Convolutional Networks}} for {{Visual Recognition}}}, + author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + date = {2015-09}, + journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + volume = {37}, + number = {9}, + pages = {1904--1916}, + issn = {1939-3539}, + doi = {10.1109/TPAMI.2015.2389824}, + urldate = {2023-10-26}, + abstract = {Existing deep convolutional neural networks (CNNs) require a fixed-size (e.g., 224\textbackslash times 224) input image. This requirement is “artificial” and may reduce the recognition accuracy for the images or sub-images of an arbitrary size/scale. In this work, we equip the networks with another pooling strategy, “spatial pyramid pooling”, to eliminate the above requirement. The new network structure, called SPP-net, can generate a fixed-length representation regardless of image size/scale. Pyramid pooling is also robust to object deformations. With these advantages, SPP-net should in general improve all CNN-based image classification methods. On the ImageNet 2012 dataset, we demonstrate that SPP-net boosts the accuracy of a variety of CNN architectures despite their different designs. On the Pascal VOC 2007 and Caltech101 datasets, SPP-net achieves state-of-the-art classification results using a single full-image representation and no fine-tuning. The power of SPP-net is also significant in object detection. Using SPP-net, we compute the feature maps from the entire image only once, and then pool features in arbitrary regions (sub-images) to generate fixed-length representations for training the detectors. This method avoids repeatedly computing the convolutional features. In processing test images, our method is 24-102 \textbackslash times faster than the R-CNN method, while achieving better or comparable accuracy on Pascal VOC 2007. In ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2014, our methods rank \#2 in object detection and \#3 in image classification among all 38 teams. This manuscript also introduces the improvement made for this competition.}, + eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}}, + file = {/home/zenon/Zotero/storage/4ZANQDJR/He et al. - 2015 - Spatial Pyramid Pooling in Deep Convolutional Netw.pdf;/home/zenon/Zotero/storage/MYNCND4W/7005506.html} +} + @inproceedings{he2016, title = {Deep {{Residual Learning}} for {{Image Recognition}}}, booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, @@ -346,6 +454,20 @@ file = {/home/zenon/Zotero/storage/JDX3S8QK/He et al. 
- 2016 - Deep Residual Learning for Image Recognition.pdf} } +@inproceedings{he2017, + title = {Mask {{R-CNN}}}, + booktitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + author = {He, Kaiming and Gkioxari, Georgia and Dollár, Piotr and Girshick, Ross}, + date = {2017-10}, + pages = {2980--2988}, + issn = {2380-7504}, + doi = {10.1109/ICCV.2017.322}, + urldate = {2023-10-22}, + abstract = {We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without tricks, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code will be made available.}, + eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + file = {/home/zenon/Zotero/storage/Z6CBZ8AI/He et al. - 2017 - Mask R-CNN.pdf;/home/zenon/Zotero/storage/GW42F6UG/8237584.html} +} + @article{hornik1989, title = {Multilayer Feedforward Networks Are Universal Approximators}, author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert}, @@ -385,6 +507,19 @@ file = {/home/zenon/Zotero/storage/DQAJEA4B/Kingma and Ba - 2017 - Adam A Method for Stochastic Optimization.pdf} } +@inproceedings{krizhevsky2012, + title = {{{ImageNet Classification}} with {{Deep Convolutional Neural Networks}}}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + date = {2012}, + volume = {25}, + publisher = {{Curran Associates, Inc.}}, + url = {https://papers.nips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html}, + urldate = {2023-10-22}, + abstract = {We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\textbackslash\% and 18.9\textbackslash\% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.}, + file = {/home/zenon/Zotero/storage/ANJ8P844/Krizhevsky et al. 
- 2012 - ImageNet Classification with Deep Convolutional Ne.pdf} +} + @article{krosney2023, title = {Inside {{Out}}: {{Transforming Images}} of {{Lab-Grown Plants}} for {{Machine Learning Applications}} in {{Agriculture}}}, shorttitle = {Inside {{Out}}}, @@ -451,6 +586,34 @@ file = {/home/zenon/Zotero/storage/8BBA7R4F/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/KUPLTHRQ/1612.html} } +@inproceedings{lin2017a, + title = {Feature {{Pyramid Networks}} for {{Object Detection}}}, + booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge}, + date = {2017-07}, + pages = {936--944}, + issn = {1063-6919}, + doi = {10.1109/CVPR.2017.106}, + urldate = {2023-10-22}, + abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.}, + eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + file = {/home/zenon/Zotero/storage/ZBT2Z36R/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/N9EQUFC2/8099589.html} +} + +@inproceedings{lin2017b, + title = {Focal {{Loss}} for {{Dense Object Detection}}}, + booktitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + author = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Dollár, Piotr}, + date = {2017-10}, + pages = {2999--3007}, + issn = {2380-7504}, + doi = {10.1109/ICCV.2017.324}, + urldate = {2023-10-22}, + abstract = {The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. 
Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors.}, + eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})}, + file = {/home/zenon/Zotero/storage/LL8HFKFW/Lin et al. - 2017 - Focal Loss for Dense Object Detection.pdf;/home/zenon/Zotero/storage/982Z922B/8237586.html} +} + @incollection{liu2016, title = {{{SSD}}: {{Single Shot MultiBox Detector}}}, shorttitle = {{{SSD}}}, @@ -557,6 +720,22 @@ pagetotal = {432} } +@book{murphy2012, + title = {Machine {{Learning}}: {{A Probabilistic Perspective}}}, + shorttitle = {Machine {{Learning}}}, + author = {Murphy, Kevin P.}, + date = {2012-08-24}, + eprint = {NZP6AQAAQBAJ}, + eprinttype = {googlebooks}, + publisher = {{MIT Press}}, + abstract = {A comprehensive introduction to machine learning that uses probabilistic models and inference as a unifying approach.Today's Web-enabled deluge of electronic data calls for automated methods of data analysis. Machine learning provides these, developing methods that can automatically detect patterns in data and then use the uncovered patterns to predict future data. This textbook offers a comprehensive and self-contained introduction to the field of machine learning, based on a unified, probabilistic approach.The coverage combines breadth and depth, offering necessary background material on such topics as probability, optimization, and linear algebra as well as discussion of recent developments in the field, including conditional random fields, L1 regularization, and deep learning. The book is written in an informal, accessible style, complete with pseudo-code for the most important algorithms. All topics are copiously illustrated with color images and worked examples drawn from such application domains as biology, text processing, computer vision, and robotics. Rather than providing a cookbook of different heuristic methods, the book stresses a principled model-based approach, often using the language of graphical models to specify models in a concise and intuitive way. Almost all the models described have been implemented in a MATLAB software package—PMTK (probabilistic modeling toolkit)—that is freely available online. 
The book is suitable for upper-level undergraduates with an introductory-level college math background and beginning graduate students.}, + isbn = {978-0-262-01802-9}, + langid = {english}, + pagetotal = {1102}, + keywords = {Computers / Artificial Intelligence / General}, + file = {/home/zenon/Zotero/storage/T2BMVXG9/Murphy - 2012 - Machine Learning A Probabilistic Perspective.pdf} +} + @article{nadafzadeh2019, title = {Design and {{Fabrication}} of an {{Intelligent Control System}} for {{Determination}} of {{Watering Time}} for {{Turfgrass Plant Using Computer Vision System}} and {{Artificial Neural Network}}}, author = {Nadafzadeh, Maryam and Abdanan Mehdizadeh, Saman}, @@ -597,6 +776,52 @@ keywords = {Agriculture,Cameras,Computational modeling,computer vision,edge and cloud computing,IoT,machine learning,Sensor systems,Sensors,smart farming,Stress,Temperature sensors} } +@inproceedings{redmon2016, + title = {You {{Only Look Once}}: {{Unified}}, {{Real-Time Object Detection}}}, + shorttitle = {You {{Only Look Once}}}, + booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + author = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali}, + date = {2016-06}, + pages = {779--788}, + issn = {1063-6919}, + doi = {10.1109/CVPR.2016.91}, + urldate = {2023-10-22}, + abstract = {We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.}, + eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + file = {/home/zenon/Zotero/storage/YMA63KNY/Redmon et al. - 2016 - You Only Look Once Unified, Real-Time Object Dete.pdf;/home/zenon/Zotero/storage/DJ3JER52/7780460.html} +} + +@inproceedings{ren2015, + title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}}, + shorttitle = {Faster {{R-CNN}}}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + date = {2015}, + volume = {28}, + publisher = {{Curran Associates, Inc.}}, + url = {https://proceedings.neurips.cc/paper/2015/hash/14bfa6bb14875e45bba028a21ed38046-Abstract.html}, + urldate = {2023-10-27}, + abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. 
Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully-convolutional network that simultaneously predicts object bounds and objectness scores at each position. RPNs are trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. With a simple alternating optimization, RPN and Fast R-CNN can be trained to share convolutional features. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007 (73.2\% mAP) and 2012 (70.4\% mAP) using 300 proposals per image. Code is available at https://github.com/ShaoqingRen/faster\_rcnn.}, + file = {/home/zenon/Zotero/storage/4XB3KRE8/Ren et al. - 2015 - Faster R-CNN Towards Real-Time Object Detection w.pdf} +} + +@article{ren2017, + title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}}, + shorttitle = {Faster {{R-CNN}}}, + author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + date = {2017-06}, + journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + volume = {39}, + number = {6}, + pages = {1137--1149}, + issn = {1939-3539}, + doi = {10.1109/TPAMI.2016.2577031}, + urldate = {2023-10-22}, + abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet [1] and Fast R-CNN [2] have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network(RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features-using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model [3], our detection system has a frame rate of 5 fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.}, + eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}}, + file = {/home/zenon/Zotero/storage/NBA8U8VS/Ren et al. 
- 2017 - Faster R-CNN Towards Real-Time Object Detection w.pdf;/home/zenon/Zotero/storage/FJKQTY4F/7485869.html} +} + @article{rico-chavez2022, title = {Machine {{Learning}} for {{Plant Stress Modeling}}: {{A Perspective}} towards {{Hormesis Management}}}, shorttitle = {Machine {{Learning}} for {{Plant Stress Modeling}}}, @@ -641,19 +866,54 @@ pagetotal = {648} } -@article{samuel2000, - title = {Some Studies in Machine Learning Using the Game of Checkers}, +@article{rumelhart1986, + title = {Learning Representations by Back-Propagating Errors}, + author = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.}, + date = {1986-10}, + journaltitle = {Nature}, + volume = {323}, + number = {6088}, + pages = {533--536}, + publisher = {{Nature Publishing Group}}, + issn = {1476-4687}, + doi = {10.1038/323533a0}, + urldate = {2023-09-29}, + abstract = {We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal ‘hidden’ units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure1.}, + issue = {6088}, + langid = {english}, + keywords = {Humanities and Social Sciences,multidisciplinary,Science}, + file = {/home/zenon/Zotero/storage/G59XYHFP/Rumelhart et al. - 1986 - Learning representations by back-propagating error.pdf} +} + +@online{russakovsky2015, + title = {{{ImageNet Large Scale Visual Recognition Challenge}}}, + author = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and Berg, Alexander C. and Fei-Fei, Li}, + date = {2015-01-29}, + eprint = {1409.0575}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.1409.0575}, + urldate = {2023-10-22}, + abstract = {The ImageNet Large Scale Visual Recognition Challenge is a benchmark in object category classification and detection on hundreds of object categories and millions of images. The challenge has been run annually from 2010 to present, attracting participation from more than fifty institutions. This paper describes the creation of this benchmark dataset and the advances in object recognition that have been possible as a result. We discuss the challenges of collecting large-scale ground truth annotation, highlight key breakthroughs in categorical object recognition, provide a detailed analysis of the current state of the field of large-scale image classification and object detection, and compare the state-of-the-art computer vision accuracy with human accuracy. We conclude with lessons learned in the five years of the challenge, and propose future directions and improvements.}, + pubstate = {preprint}, + keywords = {Computer Science - Computer Vision and Pattern Recognition,I.4.8,I.5.2}, + file = {/home/zenon/Zotero/storage/MF8K4TPL/Russakovsky et al. 
- 2015 - ImageNet Large Scale Visual Recognition Challenge.pdf;/home/zenon/Zotero/storage/EZS75GZV/1409.html} +} + +@article{samuel1959, + title = {Some {{Studies}} in {{Machine Learning Using}} the {{Game}} of {{Checkers}}}, author = {Samuel, A. L.}, - date = {2000-01}, + date = {1959-07}, journaltitle = {IBM Journal of Research and Development}, - volume = {44}, - number = {1.2}, - pages = {206--226}, + volume = {3}, + number = {3}, + pages = {210--229}, issn = {0018-8646}, - doi = {10.1147/rd.441.0206}, + doi = {10.1147/rd.33.0210}, + urldate = {2023-10-01}, abstract = {Two machine-learning procedures have been investigated in some detail using the game of checkers. Enough work has been done to verify the fact that a computer can be programmed so that it will learn to play a better game of checkers than can be played by the person who wrote the program. Furthermore, it can learn to do this in a remarkably short period of time (8 or 10 hours of machine-playing time) when given only the rules of the game, a sense of direction, and a redundant and incomplete list of parameters which are thought to have something to do with the game, but whose correct signs and relative weights are unknown and unspecified. The principles of machine learning verified by these experiments are, of course, applicable to many other situations.}, eventtitle = {{{IBM Journal}} of {{Research}} and {{Development}}}, - file = {/home/zenon/Zotero/storage/CQD65S78/5389202.html} + file = {/home/zenon/Zotero/storage/9YJSG7IJ/Samuel - 1959 - Some Studies in Machine Learning Using the Game of.pdf;/home/zenon/Zotero/storage/6XF4QCUQ/5392560.html} } @inproceedings{sears2007, @@ -688,6 +948,18 @@ file = {/home/zenon/Zotero/storage/QC22JBMX/Selvaraju et al. - 2020 - Grad-CAM Visual Explanations from Deep Networks v.pdf} } +@inproceedings{simard2003, + title = {Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis}, + booktitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.}, + author = {Simard, P.Y. and Steinkraus, D. and Platt, J.C.}, + date = {2003-08}, + pages = {958--963}, + doi = {10.1109/ICDAR.2003.1227801}, + urldate = {2023-10-01}, + eventtitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.}, + file = {/home/zenon/Zotero/storage/S6SE8F56/Simard et al. - 2003 - Best practices for convolutional neural networks a.pdf;/home/zenon/Zotero/storage/FQHDISEK/1227801.html} +} + @article{su2020, title = {Machine {{Learning-Based Crop Drought Mapping System}} by {{UAV Remote Sensing RGB Imagery}}}, author = {Su, Jinya and Coombes, Matthew and Liu, Cunjia and Zhu, Yongchao and Song, Xingyang and Fang, Shibo and Guo, Lei and Chen, Wen-Hua}, @@ -704,6 +976,24 @@ file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf} } +@article{uijlings2013, + title = {Selective {{Search}} for {{Object Recognition}}}, + author = {Uijlings, J. R. R. and family=Sande, given=K. E. A., prefix=van de, useprefix=true and Gevers, T. and Smeulders, A. W. M.}, + date = {2013-09-01}, + journaltitle = {International Journal of Computer Vision}, + shortjournal = {Int J Comput Vis}, + volume = {104}, + number = {2}, + pages = {154--171}, + issn = {1573-1405}, + doi = {10.1007/s11263-013-0620-5}, + urldate = {2023-10-22}, + abstract = {This paper addresses the problem of generating possible object locations for use in object recognition. 
We introduce selective search which combines the strength of both an exhaustive search and segmentation. Like segmentation, we use the image structure to guide our sampling process. Like exhaustive search, we aim to capture all possible object locations. Instead of a single technique to generate possible object locations, we diversify our search and use a variety of complementary image partitionings to deal with as many image conditions as possible. Our selective search results in a small set of data-driven, class-independent, high quality locations, yielding 99~\% recall and a Mean Average Best Overlap of 0.879 at 10,097 locations. The reduced number of locations compared to an exhaustive search enables the use of stronger machine learning techniques and stronger appearance models for object recognition. In this paper we show that our selective search enables the use of the powerful Bag-of-Words model for recognition. The selective search software is made publicly available (Software: http://disi.unitn.it/\textasciitilde uijlings/SelectiveSearch.html).}, + langid = {english}, + keywords = {Appearance Model,Colour Space,Exhaustive Search,Object Location,Object Recognition}, + file = {/home/zenon/Zotero/storage/P39PKRXR/Uijlings et al. - 2013 - Selective Search for Object Recognition.pdf} +} + @inproceedings{viola2001, title = {Rapid Object Detection Using a Boosted Cascade of Simple Features}, booktitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001}, @@ -773,6 +1063,21 @@ file = {/home/zenon/Zotero/storage/G27M4VFA/Wang et al. - 2022 - YOLOv7 Trainable Bag-of-Freebies Sets New State-o.pdf} } +@online{zeiler2013, + title = {Visualizing and {{Understanding Convolutional Networks}}}, + author = {Zeiler, Matthew D. and Fergus, Rob}, + date = {2013-11-28}, + eprint = {1311.2901}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.1311.2901}, + urldate = {2023-10-27}, + abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we address both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. We also perform an ablation study to discover the performance contribution from different model layers. This enables us to find model architectures that outperform Krizhevsky \textbackslash etal on the ImageNet classification benchmark. 
We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.},
+  pubstate = {preprint},
+  keywords = {Computer Science - Computer Vision and Pattern Recognition},
+  file = {/home/zenon/Zotero/storage/XIE8AWCP/Zeiler and Fergus - 2013 - Visualizing and Understanding Convolutional Networ.pdf;/home/zenon/Zotero/storage/2SFHRHUU/1311.html}
+}
+
 @online{zheng2019,
   title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
   shorttitle = {Distance-{{IoU Loss}}},
diff --git a/thesis/thesis.pdf b/thesis/thesis.pdf
index da776cb..761d274 100644
Binary files a/thesis/thesis.pdf and b/thesis/thesis.pdf differ
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index d7626de..5d23c71 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -25,6 +25,7 @@
 \usepackage[acronym,toc]{glossaries} % Enables the generation of glossaries and lists fo acronyms. This package has to be included last.
 \usepackage{siunitx}
 \usepackage{float}
+\usepackage{csquotes}
 
 \addbibresource{references.bib}
@@ -86,12 +86,13 @@
 \newacronym{roc}{ROC}{Receiver Operating Characteristic}
 \newacronym{auc}{AUC}{Area Under the Curve}
 \newacronym{coco}{COCO}{Common Objects in Context}
-\newacronym{pascal-voc}{\textsc{PASCAL} VOC}{\textsc{PASCAL} Visual Object Classes}
+\newacronym{voc}{VOC}{\textsc{PASCAL} Visual Object Classes}
 \newacronym{sbc}{SBC}{single-board computer}
 \newacronym{api}{API}{Application Programming Interface}
 \newacronym{rest}{REST}{Representational State Transfer}
 \newacronym{dl}{DL}{Deep Learning}
 \newacronym{gpu}{GPU}{Graphics Processing Unit}
+\newacronym{cpu}{CPU}{Central Processing Unit}
 \newacronym{tpu}{TPU}{Tensor Processing Unit}
 \newacronym{hog}{HOG}{Histogram of Oriented Gradients}
 \newacronym{sift}{SIFT}{Scale-Invariant Feature Transform}
@@ -104,6 +105,11 @@
 \newacronym{elu}{ELU}{Exponential Linear Unit}
 \newacronym{silu}{SiLU}{Sigmoid Linear Unit}
 \newacronym{mse}{MSE}{mean squared error}
+\newacronym{ilsvrc2012}{ILSVRC2012}{ImageNet Large Scale Visual Recognition Challenge}
+\newacronym{lrn}{LRN}{Local Response Normalization}
+\newacronym[plural=ROIs,longplural=Regions of Interest]{roi}{ROI}{Region of Interest}
+\newacronym{spp}{SPP}{Spatial Pyramid Pooling}
+\newacronym{rpn}{RPN}{Region Proposal Network}
 
 \begin{document}
 
@@ -300,11 +307,11 @@ further insights about the type of models which are commonly used.
 In order to find and select appropriate datasets to train the models
 on, we will survey the existing big datasets for classes we can use.
 Datasets such as the \gls{coco}~\cite{lin2015} and
-\gls{pascal-voc}~\cite{everingham2010} contain the highly relevant
-class \emph{Potted Plant}. By extracting only these classes from
-multiple datasets and concatenating them together, it is possible to
-create one unified dataset which only contains the classes necessary
-for training the model.
+\gls{voc}~\cite{everingham2010} contain the highly relevant class
+\emph{Potted Plant}. By extracting only these classes from multiple
+datasets and concatenating them together, it is possible to create one
+unified dataset which only contains the classes necessary for training
+the model.
 
 The training of the models will happen in an environment where more
 computational resources are available than what the~\gls{sbc}
@@ -414,7 +421,7 @@ Estimated 25 pages for this chapter.
 \section{Machine Learning}
 \label{sec:theory-ml}
 
-The term machine learning was first used by \textcite{samuel2000} in
+The term machine learning was first used by \textcite{samuel1959} in
 1959 in the context of teaching a machine how to play the game
 Checkers. \textcite{mitchell1997a} defines learning in the context of
 programs as:
@@ -661,8 +668,8 @@ network and is, therefore, not suitable for complex intra-data
 relationships.
 
 A major downside to using the Heaviside step function is that it is
 not differentiable at $x = 0$ and has a $0$ derivative
 elsewhere. These properties make it unsuitable for use with gradient
-descent during \todo[noline]{link to backpropagation section}
-backpropagation.
+descent during back-propagation
+(section~\ref{ssec:theory-back-propagation}).
 
 \subsubsection{Sigmoid}
 \label{sssec:theory-sigmoid}
@@ -781,8 +788,8 @@ and updating the weights within the network is it possible to gain
 experience $E$ at carrying out a task $T$. How the weights are updated
 depends on the algorithm which is used during the \emph{backward pass}
 to minimize the error. This type of procedure is referred to as
-\emph{backpropagation} (see
-section~\ref{ssec:theory-backpropagation}).
+\emph{back-propagation} (see
+section~\ref{ssec:theory-back-propagation}).
 
 One common type of loss function is the \gls{mse} which is widely used
 in regression problems. The \gls{mse} is a popular choice because it
@@ -817,29 +824,41 @@ set, it is likely that the model is suffering from
 error rates on the training set, it is likely that the model is
 suffering from \emph{underfitting}.
 
-Another popular loss function is the cross-entropy loss, which is
-commonly used in classification problems. Cross-entropy loss measures
-the difference between predicted probabilities and actual labels. It
-is a good choice for classification problems because it takes into
-account the class imbalance issue and it is less sensitive to
-outliers. However, cross-entropy loss has its own limitations. For
-instance, it may not be appropriate for problems with more than two
-classes, and it can be sensitive to the choice of the softmax
-function.
+\textcite{goodfellow2016} writes that ``\gls{mse} was popular in the
+1980s and 1990s but was gradually replaced by cross-entropy losses and
+the principle of maximum likelihood as ideas spread between the
+statistics community and the machine learning
+community''~\cite[p.~222]{goodfellow2016}. Cross-entropy measures the
+difference in information between two probability
+distributions. Specifically, it quantifies the average number of bits
+needed to encode events drawn from the first distribution when the
+code is optimized for the second distribution. In the case of binary
+random variables, i.e. when only two classes are to be distinguished,
+the measure is called binary cross-entropy. Cross-entropy loss is
+known to outperform \gls{mse} for classification tasks and allows the
+model to be trained faster~\cite{simard2003}.
 
-In recent years, there has been an increasing interest in using
-alternative loss functions that can better handle complex
-problems. For example, the Huber loss is a modification of the MSE
-loss that is more robust to outliers. The Smooth L1 loss is another
-alternative that is less sensitive to outliers and can handle
-non-normal distributions. These alternative loss functions have been
-shown to be effective in various applications such as image
-classification, object detection, and speech recognition.
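+
+To make this concrete, consider a classifier which outputs a
+probability $\hat{y}_i \in (0, 1)$ for the positive class of example
+$i$ with true label $y_i \in \{0, 1\}$. In its standard form, the
+binary cross-entropy loss over $n$ examples is given by
+\begin{equation}
+  L = -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log \hat{y}_i
+    + (1 - y_i) \log(1 - \hat{y}_i) \right].
+\end{equation}
+The loss vanishes as $\hat{y}_i$ approaches $y_i$ and grows without
+bound as the prediction approaches the opposite label, which helps
+explain the faster training reported by \textcite{simard2003}.
+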
-\subsection{Backpropagation}
-\label{ssec:theory-backpropagation}
+\subsection{Back-Propagation}
+\label{ssec:theory-back-propagation}
+
+So far, information only flows forward through the network whenever a
+prediction for a particular input should be made. In order for a
+neural network to learn, information about the computed loss has to
+flow backward through the network. Only then can the weights at the
+individual neurons be updated. This type of information flow is
+termed \emph{back-propagation} \cite{rumelhart1986}. Back-propagation
+computes the gradient of the loss function with respect to the
+weights of the network for an input-output pair. The algorithm
+computes the gradient iteratively, starting from the last layer and
+working its way backward through the network until it reaches the
+first layer. Strictly speaking, back-propagation only computes the
+gradient; it does not determine how the gradient is used to update
+the weights. Once the back-propagation algorithm has computed the
+gradient, the gradient is handed to an optimization algorithm which
+uses it to search for a local minimum of the loss function. This step
+is usually performed by some variant of gradient descent
+\cite{cauchy1847}.
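+
+In its simplest form, the update repeatedly takes a small step
+against the gradient. As an illustrative sketch (stated here in its
+generic textbook form, not necessarily the exact variant used
+elsewhere in this thesis), a weight vector $\mathbf{w}$ is updated in
+iteration $t$ as
+\begin{equation}
+  \mathbf{w}^{(t+1)} = \mathbf{w}^{(t)}
+  - \eta \, \nabla_{\mathbf{w}} L\bigl(\mathbf{w}^{(t)}\bigr),
+\end{equation}
+where $L$ is the loss function, $\nabla_{\mathbf{w}} L$ is the
+gradient computed by back-propagation and the learning rate $\eta$
+controls the step size.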
 
 \section{Object Detection}
 \label{sec:background-detection}
@@ -926,11 +945,11 @@ second to process.
 
 \subsubsection{Deformable Part-Based Model}
 \label{sssec:obj-dpm}
 
-\glspl{dpm}~\cite{felzenszwalb2008a} were the winners of the
-\gls{pascal-voc} challenge in the years 2007, 2008 and 2009. The
-method is heavily based on the previously discussed \gls{hog} since it
-also uses \gls{hog} descriptors internally. The authors addition is
-the idea of learning how to decompose objects during training and
+\glspl{dpm}~\cite{felzenszwalb2008a} were the winners of the \gls{voc}
+challenge in the years 2007, 2008 and 2009. The method is heavily
+based on the previously discussed \gls{hog} since it also uses
+\gls{hog} descriptors internally. The authors' addition is the idea of
+learning how to decompose objects during training and
 classifying/detecting the decomposed parts during inference. The
 \gls{hog} descriptors are computed on different scales to form a
 \gls{hog} feature pyramid. Coarse features are more easily identified
@@ -953,6 +972,175 @@ increases in depth. \textcite{girshick2015} argue that \glspl{dpm}
 \glspl{cnn} by unrolling each step of the algorithm into a
 corresponding \gls{cnn} layer.
 
+\subsection{Deep Learning Based Methods}
+\label{ssec:theory-dl-based}
+
+After the publication of the \gls{dpm}, the field of object detection
+did not make significant advances regarding speed or accuracy. Only
+the (re-)introduction of \glspl{cnn} by \textcite{krizhevsky2012} with
+their AlexNet architecture and their subsequent win of the
+\gls{ilsvrc2012} gave the field a new influx of ideas. The
+availability of millions of labeled images in the ImageNet dataset
+\cite{deng2009} allowed a shift from focusing on better methods to
+using more data to train models. Earlier models had difficulties
+making use of such a large dataset since training them on it was
+computationally infeasible. AlexNet, however, provided an
+architecture which could be trained on two \glspl{gpu} within six
+days.
+
+AlexNet's main contributions are the use of \glspl{relu}, training on
+multiple \glspl{gpu}, \gls{lrn} and overlapping pooling
+\cite{krizhevsky2012}. As mentioned in
+section~\ref{sssec:theory-relu}, \glspl{relu} introduce non-linearity
+into the network. Instead of using the traditional non-linear
+activation function $\tanh$, whose output is bounded between $-1$ and
+$1$, \glspl{relu} allow activations to grow as large as training
+requires. Normalization before an activation function is usually used
+to prevent the neuron from saturating, as would be the case with
+$\tanh$. Even though \glspl{relu} do not suffer from saturation, the
+authors found that \gls{lrn} reduces the top-1 error rate by 1.4\%
+\cite{krizhevsky2012}. Overlapping pooling, in contrast to regular
+pooling, uses pooling windows which overlap their neighbors instead
+of partitioning the feature map into disjoint windows. The authors
+observed that models with overlapping pooling are slightly more
+resistant to overfitting, and the technique reduces the top-1 error
+rate by a further 0.4\% \cite{krizhevsky2012}. In aggregate, these
+improvements result in a top-5 error rate of 16.4\%, far ahead of the
+next-best entry.
+
+These results demonstrated that \glspl{cnn} can extract highly
+relevant feature representations from images. While AlexNet was only
+concerned with the classification of images, it did not take long for
+researchers to apply \glspl{cnn} to the problem of object
+detection. Object detection networks from 2014 onward follow either a
+\emph{one-stage} or a \emph{two-stage} detection approach. The
+following sections go into detail about each model category.
+
+\subsection{Two-Stage Detectors}
+\label{ssec:theory-two-stage}
+
+As their name implies, two-stage detectors consist of two stages
+which together form a complete object detection pipeline. Commonly,
+the first stage extracts \glspl{roi} which might contain relevant
+objects to detect. The second stage operates on the extracted
+\glspl{roi} and returns a vector of class probabilities. Since the
+computation in the second stage is performed for every \gls{roi},
+two-stage detectors are often not as efficient as one-stage
+detectors.
+
+\subsubsection{R-\gls{cnn}}
+\label{sssec:theory-rcnn}
+
+\textcite{girshick2014} were the first to propose using feature
+representations of \glspl{cnn} for object detection. Their approach
+consists of generating around 2000 region proposals and passing these
+on to a \gls{cnn} for feature extraction. The fixed-length feature
+vector is used as input for a linear \gls{svm} which classifies the
+region. They name their method R-\gls{cnn}, where the R stands for
+region.
+
+R-\gls{cnn} uses selective search to generate region proposals
+\cite{uijlings2013}. The authors use selective search's \emph{fast
+mode} to generate the 2000 proposals and warp each proposal
+(i.e. without retaining its aspect ratio) into the image dimensions
+required by the \gls{cnn}. The \gls{cnn}, which matches the
+architecture of AlexNet \cite{krizhevsky2012}, generates a
+$4096$-dimensional feature vector, and each feature vector is scored
+by a linear \gls{svm} for each class. Scored regions are then
+filtered per class: a region is rejected if another region of the
+same class has a higher score and overlaps it with an \gls{iou}
+greater than a threshold. For training the linear \gls{svm}
+classifiers, only the ground-truth boxes serve as positive examples,
+while proposals whose \gls{iou} overlap with all ground-truth
+instances falls below $0.3$ serve as negative examples.
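+
+Both the filtering of overlapping regions and the labeling of
+training examples rely on \gls{iou} as the measure of overlap. For
+two regions $A$ and $B$, it is defined as
+\begin{equation}
+  \mathrm{IoU}(A, B) = \frac{|A \cap B|}{|A \cup B|},
+\end{equation}
+where $|\cdot|$ denotes the area of a region. The measure is $0$ for
+disjoint regions, $1$ for identical regions and grows with the degree
+of overlap.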
+
+While the approach of generating region proposals is not new, using a
+\gls{cnn} purely for feature extraction is. Unfortunately, R-\gls{cnn}
+is far from being able to operate in real time. The authors report
+that it takes \qty{13}{\s\per image} on a \gls{gpu} and
+\qty{53}{\s\per image} on a \gls{cpu} to generate the region proposals
+and feature vectors. In some sense, these processing times are a step
+backward from the \glspl{dpm} introduced in
+section~\ref{sssec:obj-dpm}. However, the authors showed that
+\glspl{cnn} can function perfectly well as feature extractors, even if
+their processing performance is not yet up to par with traditional
+methods. Furthermore, R-\gls{cnn} far surpasses \glspl{dpm} on the
+\gls{voc} 2007 challenge with a \gls{map} of 58.5\%
+\cite{girshick2014} versus 33.7\% (\gls{dpm}-v5
+\cite{girshick,felzenszwalb2010}). This was enough to spark renewed
+interest in \glspl{cnn} and, with better availability of large data
+sets and \gls{gpu} processing capabilities, opened the way for further
+research in that direction.
+
+\subsubsection{SPP-net}
+\label{sssec:theory-spp-net}
+
+A year after the publication of R-\gls{cnn}, \textcite{he2015}
+introduce the concept of \gls{spp} to allow \glspl{cnn} to accept
+input images of arbitrary size instead of requiring a fixed size. They
+name their method \gls{spp}-net; it outputs a fixed-length feature
+vector regardless of the dimensions of the input image.
+
+\gls{spp} layers operate in-between the convolutional and
+fully-connected layers of a \gls{cnn}. Since the fully-connected
+layers require fixed-size inputs but the convolutional layers do not,
+\gls{spp} layers aggregate the information from the convolutional
+layers and pass the resulting fixed-size outputs to the
+fully-connected layers. With this approach, the full image passes
+through the convolutional layers only once, and the \gls{spp} layer
+pools the features for each region from the resulting feature
+maps. This avoids the redundant computations for each \gls{roi}
+present in R-\gls{cnn} and provides a speedup of 24--102 times while
+achieving an even better \gls{map} of 59.2\% on the \gls{voc} 2007
+data set.
+
+\subsubsection{Fast R-\gls{cnn}}
+\label{sssec:theory-fast-rcnn}
+
+Fast R-\gls{cnn} was proposed by \textcite{girshick2015a} to fix the
+three main problems of R-\gls{cnn} and \gls{spp}-net. The first
+problem is that the training for both models is
+multi-stage. R-\gls{cnn} first fine-tunes the convolutional network
+which is responsible for feature extraction and then trains
+\glspl{svm} to classify the feature vectors. The third stage consists
+of training the bounding box regressors. The second problem is the
+training time, which is on the order of multiple days for deep
+convolutional networks. The third problem is the processing time per
+image, which is (depending on the convolutional network) upwards of
+\qty{13}{\s\per image}.
+
+Fast R-\gls{cnn} deals with these problems by using an architecture
+which takes in an image together with a set of object proposals and
+processes them in a single pass. The outputs of the network are the
+class an object proposal belongs to and four scalar values encoding
+the bounding box of the object. Unfortunately, this approach still
+requires a separate object proposal generator such as selective
+search \cite{uijlings2013}.
+
+\subsubsection{Faster R-\gls{cnn}}
+\label{sssec:theory-faster-rcnn}
+
+Faster R-\gls{cnn} \cite{ren2015,ren2017} is, as the name implies, yet
+another improvement building on R-\gls{cnn}, \gls{spp}-net and Fast
+R-\gls{cnn}. Since the bottleneck in performance of previous
+approaches has been the object proposal generator, the authors of
+Faster R-\gls{cnn} introduce a \gls{rpn} to predict bounding boxes and
+objectness in one step. As with previous networks, the proposals are
+then passed to the detection network.
+
+\glspl{rpn} work by reusing the convolutional features already
+present in Fast R-\gls{cnn} and adding additional layers on top which
+regress bounding boxes and objectness scores at each location.
+Instead of relying on a pyramid structure as in \gls{spp}-net (see
+section~\ref{sssec:theory-spp-net}), \glspl{rpn} use \emph{anchor
+boxes} as a basis for the bounding box regressor. These anchor boxes
+are predefined for various scales and aspect ratios and serve as
+starting points for the regressor to properly fit a bounding box
+around an object.
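+
+Concretely, \textcite{ren2017} parameterize each predicted box with
+center $(x, y)$, width $w$ and height $h$ relative to an anchor box
+$(x_a, y_a, w_a, h_a)$ as
+\begin{equation}
+  t_x = \frac{x - x_a}{w_a}, \quad
+  t_y = \frac{y - y_a}{h_a}, \quad
+  t_w = \log\frac{w}{w_a}, \quad
+  t_h = \log\frac{h}{h_a},
+\end{equation}
+so the regressor only has to learn small offsets
+$(t_x, t_y, t_w, t_h)$ from a nearby anchor instead of absolute image
+coordinates.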
+
+The \gls{rpn} makes object proposal generation inexpensive and
+possible on \glspl{gpu}. The whole network operates at almost
+real-time speed, processing \qty{5}{images\per\s} while maintaining a
+state-of-the-art \gls{map} of 73.2\% on \gls{voc} 2007. If the
+detection network is switched from VGGNet to ZF-Net \cite{zeiler2013},
+Faster R-\gls{cnn} is able to achieve \qty{17}{images\per\s}, albeit
+at a lower \gls{map} of 59.9\%.
+
+\subsection{One-Stage Detectors}
+\label{ssec:theory-one-stage}
 
 \section{Classification}
 \label{sec:background-classification}