Finish object detection, add LeNet-5

Tobias Eidelpes 2023-11-04 17:56:05 +01:00
parent 121fc046ff
commit e30879f9e2
3 changed files with 449 additions and 75 deletions


@@ -218,6 +218,19 @@
keywords = {Explosions,Image databases,Image retrieval,Information retrieval,Internet,Large-scale systems,Multimedia databases,Ontologies,Robustness,Spine}
}
@inproceedings{erhan2014,
title = {Scalable {{Object Detection Using Deep Neural Networks}}},
booktitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Erhan, Dumitru and Szegedy, Christian and Toshev, Alexander and Anguelov, Dragomir},
date = {2014-06},
pages = {2155--2162},
issn = {1063-6919},
doi = {10.1109/CVPR.2014.276},
abstract = {Deep convolutional neural networks have recently achieved state-of-the-art performance on a number of image recognition benchmarks, including the ImageNet Large-Scale Visual Recognition Challenge (ILSVRC-2012). The winning model on the localization sub-task was a network that predicts a single bounding box and a confidence score for each object category in the image. Such a model captures the whole-image context around the objects but cannot handle multiple instances of the same object in the image without naively replicating the number of outputs for each instance. In this work, we propose a saliency-inspired neural network model for detection, which predicts a set of class-agnostic bounding boxes along with a single score for each box, corresponding to its likelihood of containing any object of interest. The model naturally handles a variable number of instances for each class and allows for cross-class generalization at the highest levels of the network. We are able to obtain competitive recognition performance on VOC2007 and ILSVRC2012, while using only the top few predicted locations in each image and a small number of neural network evaluations.},
eventtitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/QK2PPIT2/Erhan et al. - 2014 - Scalable Object Detection Using Deep Neural Networ.pdf;/home/zenon/Zotero/storage/KRHQRR7X/6909673.html}
}
@article{everingham2010,
title = {The {{Pascal Visual Object Classes}} ({{VOC}}) {{Challenge}}},
author = {Everingham, Mark and Van Gool, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew},
@@ -229,7 +242,6 @@
pages = {303--338},
issn = {1573-1405},
doi = {10.1007/s11263-009-0275-4},
urldate = {2023-09-07},
abstract = {The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection.},
langid = {english},
keywords = {Benchmark,Database,Object detection,Object recognition},
@@ -274,7 +286,6 @@
pages = {1627--1645},
issn = {1939-3539},
doi = {10.1109/TPAMI.2009.167},
urldate = {2023-10-26},
abstract = {We describe an object detection system based on mixtures of multiscale deformable part models. Our system is able to represent highly variable object classes and achieves state-of-the-art results in the PASCAL object detection challenges. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL data sets. Our system relies on new methods for discriminative training with partially labeled data. We combine a margin-sensitive approach for data-mining hard negative examples with a formalism we call latent SVM. A latent SVM is a reformulation of MISVM in terms of latent variables. A latent SVM is semiconvex, and the training problem becomes convex once latent information is specified for the positive examples. This leads to an iterative training algorithm that alternates between fixing latent values for positive examples and optimizing the latent SVM objective function.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/P5378A3K/Felzenszwalb et al. - 2010 - Object Detection with Discriminatively Trained Par.pdf;/home/zenon/Zotero/storage/HYLEIZJU/5255236.html}
@@ -307,7 +318,6 @@
pages = {322--333},
issn = {2168-2887},
doi = {10.1109/TSSC.1969.300225},
urldate = {2023-09-27},
abstract = {A new type of visual feature extracting network has been synthesized, and the response of the network has been simulated on a digital computer. This research has been done as a first step towards the realization of a recognizer of handwritten characters. The design of the network was suggested by biological systems, especially, the visual systems of cat and monkey. The network is composed of analog threshold elements connected in layers. Each analog threshold element receives inputs from a large number of elements in the neighbouring layers and performs its own special functions. It takes care of one restricted part of the photoreceptor layer, on which an input pattern is presented, and it responds to one particular feature of the input pattern, such as brightness contrast, a dot in the pattern, a line segment of a particular orientation, or an end of the line. This means that the network performs parallel processing of the information. With the propagation of the information through the layered network, the input pattern is successively decomposed into dots, groups of line segments of the same orientation, and the ends of these line segments.},
eventtitle = {{{IEEE Transactions}} on {{Systems Science}} and {{Cybernetics}}},
file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
@@ -322,7 +332,6 @@
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2103.14259},
urldate = {2023-09-28},
abstract = {Recent advances in label assignment in object detection mainly seek to independently define positive/negative training samples for each ground-truth (gt) object. In this paper, we innovatively revisit the label assignment from a global perspective and propose to formulate the assigning procedure as an Optimal Transport (OT) problem -- a well-studied topic in Optimization Theory. Concretely, we define the unit transportation cost between each demander (anchor) and supplier (gt) pair as the weighted summation of their classification and regression losses. After formulation, finding the best assignment solution is converted to solve the optimal transport plan at minimal transportation costs, which can be solved via Sinkhorn-Knopp Iteration. On COCO, a single FCOS-ResNet-50 detector equipped with Optimal Transport Assignment (OTA) can reach 40.7\% mAP under 1X scheduler, outperforming all other existing assigning methods. Extensive experiments conducted on COCO and CrowdHuman further validate the effectiveness of our proposed OTA, especially its superiority in crowd scenarios. The code is available at https://github.com/Megvii-BaseDetection/OTA.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
@@ -338,7 +347,6 @@
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2107.08430},
urldate = {2023-09-28},
abstract = {In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3\% AP on COCO, surpassing NanoDet by 1.8\% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3\% AP on COCO, outperforming the current best practice by 3.0\% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0\% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8\% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. Source code is at https://github.com/Megvii-BaseDetection/YOLOX.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
@@ -348,7 +356,7 @@
@online{girshick,
title = {Discriminatively {{Trained Deformable Part Models}} ({{Release}} 5)},
author = {Girshick, Ross B. and Felzenszwalb, Pedro F. and McAllester, David},
url = {https://web.archive.org/web/20231026094412/https://www.rossgirshick.info/latent/},
url = {https://www.rossgirshick.info/latent/},
urldate = {2023-10-26},
file = {/home/zenon/Zotero/storage/HQTS6PW6/latent.html}
}
@@ -361,7 +369,6 @@
pages = {580--587},
issn = {1063-6919},
doi = {10.1109/CVPR.2014.81},
urldate = {2023-10-22},
abstract = {Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30\% relative to the previous best result on VOC 2012 achieving a mAP of 53.3\%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also present experiments that provide insight into what the network learns, revealing a rich hierarchy of image features. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.},
eventtitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/EL92YEYD/Girshick et al. - 2014 - Rich Feature Hierarchies for Accurate Object Detec.pdf;/home/zenon/Zotero/storage/TX9APXST/6909475.html}
@@ -389,7 +396,6 @@
pages = {1440--1448},
issn = {2380-7504},
doi = {10.1109/ICCV.2015.169},
urldate = {2023-10-22},
abstract = {This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.},
eventtitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/I4Q5NJCT/Girshick - 2015 - Fast R-CNN.pdf;/home/zenon/Zotero/storage/VQZF2I7Z/7410526.html}
@@ -405,7 +411,6 @@
pages = {142--158},
issn = {1939-3539},
doi = {10.1109/TPAMI.2015.2437384},
urldate = {2023-10-22},
abstract = {Object detection performance, as measured on the canonical PASCAL VOC Challenge datasets, plateaued in the final years of the competition. The best-performing methods were complex ensemble systems that typically combined multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 50 percent relative to the previous best result on VOC 2012-achieving a mAP of 62.4 percent. Our approach combines two ideas: (1) one can apply high-capacity convolutional networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data are scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, boosts performance significantly. Since we combine region proposals with CNNs, we call the resulting model an R-CNN or Region-based Convolutional Network. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/MQPF5MGW/Girshick et al. - 2016 - Region-Based Convolutional Networks for Accurate O.pdf;/home/zenon/Zotero/storage/EKC4WHDQ/7112511.html}
@@ -435,7 +440,6 @@
pages = {1904--1916},
issn = {1939-3539},
doi = {10.1109/TPAMI.2015.2389824},
urldate = {2023-10-26},
abstract = {Existing deep convolutional neural networks (CNNs) require a fixed-size (e.g., 224\textbackslash times 224) input image. This requirement is “artificial” and may reduce the recognition accuracy for the images or sub-images of an arbitrary size/scale. In this work, we equip the networks with another pooling strategy, “spatial pyramid pooling”, to eliminate the above requirement. The new network structure, called SPP-net, can generate a fixed-length representation regardless of image size/scale. Pyramid pooling is also robust to object deformations. With these advantages, SPP-net should in general improve all CNN-based image classification methods. On the ImageNet 2012 dataset, we demonstrate that SPP-net boosts the accuracy of a variety of CNN architectures despite their different designs. On the Pascal VOC 2007 and Caltech101 datasets, SPP-net achieves state-of-the-art classification results using a single full-image representation and no fine-tuning. The power of SPP-net is also significant in object detection. Using SPP-net, we compute the feature maps from the entire image only once, and then pool features in arbitrary regions (sub-images) to generate fixed-length representations for training the detectors. This method avoids repeatedly computing the convolutional features. In processing test images, our method is 24-102 \textbackslash times faster than the R-CNN method, while achieving better or comparable accuracy on Pascal VOC 2007. In ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2014, our methods rank \#2 in object detection and \#3 in image classification among all 38 teams. This manuscript also introduces the improvement made for this competition.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/4ZANQDJR/He et al. - 2015 - Spatial Pyramid Pooling in Deep Convolutional Netw.pdf;/home/zenon/Zotero/storage/MYNCND4W/7005506.html}
@@ -462,7 +466,6 @@
pages = {2980--2988},
issn = {2380-7504},
doi = {10.1109/ICCV.2017.322},
urldate = {2023-10-22},
abstract = {We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without tricks, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code will be made available.},
eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/Z6CBZ8AI/He et al. - 2017 - Mask R-CNN.pdf;/home/zenon/Zotero/storage/GW42F6UG/8237584.html}
@@ -479,7 +482,6 @@
pages = {359--366},
issn = {0893-6080},
doi = {10.1016/0893-6080(89)90020-8},
urldate = {2023-09-27},
abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. In this sense, multilayer feedforward networks are a class of universal approximators.},
keywords = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation},
file = {/home/zenon/Zotero/storage/FN7FDBHL/0893608089900208.html}
@@ -534,7 +536,6 @@
pages = {1200977},
issn = {2624-8212},
doi = {10.3389/frai.2023.1200977},
urldate = {2023-08-25},
abstract = {Machine learning tasks often require a significant amount of training data for the resultant network to perform suitably for a given problem in any domain. In agriculture, dataset sizes are further limited by phenotypical differences between two plants of the same genotype, often as a result of differing growing conditions. Synthetically-augmented datasets have shown promise in improving existing models when real data is not available. In this paper, we employ a contrastive unpaired translation (CUT) generative adversarial network (GAN) and simple image processing techniques to translate indoor plant images to appear as field images. While we train our network to translate an image containing only a single plant, we show that our method is easily extendable to produce multiple-plant field images. Furthermore, we use our synthetic multi-plant images to train several YoloV5 nano object detection models to perform the task of plant detection and measure the accuracy of the model on real field data images. Including training data generated by the CUT-GAN leads to better plant detection performance compared to a network trained solely on real data.},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/home/zenon/Zotero/storage/Y5MUHPDE/Krosney et al. - 2023 - Inside Out Transforming Images of Lab-Grown Plant.pdf;/home/zenon/Zotero/storage/8NB5H9E8/2211.html}
@@ -558,6 +559,52 @@
file = {/home/zenon/Zotero/storage/R6SKDLQU/Kuznetsova et al. - 2020 - The Open Images Dataset V4 Unified Image Classifi.pdf}
}
@article{lecun1989,
title = {Backpropagation {{Applied}} to {{Handwritten Zip Code Recognition}}},
author = {LeCun, Y. and Boser, B. and Denker, J. S. and Henderson, D. and Howard, R. E. and Hubbard, W. and Jackel, L. D.},
date = {1989-12-01},
journaltitle = {Neural Computation},
shortjournal = {Neural Computation},
volume = {1},
number = {4},
pages = {541--551},
issn = {0899-7667},
doi = {10.1162/neco.1989.1.4.541},
abstract = {The ability of learning networks to generalize can be greatly enhanced by providing constraints from the task domain. This paper demonstrates how such constraints can be integrated into a backpropagation network through the architecture of the network. This approach has been successfully applied to the recognition of handwritten zip code digits provided by the U.S. Postal Service. A single network learns the entire recognition operation, going from the normalized image of the character to the final classification.},
file = {/home/zenon/Zotero/storage/Y8AWACVG/LeCun et al. - 1989 - Backpropagation Applied to Handwritten Zip Code Re.pdf;/home/zenon/Zotero/storage/R7RK6LZ6/Backpropagation-Applied-to-Handwritten-Zip-Code.html}
}
@article{lecun1998,
title = {Gradient-Based Learning Applied to Document Recognition},
author = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
date = {1998-11},
journaltitle = {Proceedings of the IEEE},
volume = {86},
number = {11},
pages = {2278--2324},
issn = {1558-2256},
doi = {10.1109/5.726791},
abstract = {Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day.},
eventtitle = {Proceedings of the {{IEEE}}},
file = {/home/zenon/Zotero/storage/9X5LATEB/Lecun et al. - 1998 - Gradient-based learning applied to document recogn.pdf;/home/zenon/Zotero/storage/DY64NJW5/726791.html}
}
@article{li2022,
title = {A {{Survey}} of {{Convolutional Neural Networks}}: {{Analysis}}, {{Applications}}, and {{Prospects}}},
shorttitle = {A {{Survey}} of {{Convolutional Neural Networks}}},
author = {Li, Zewen and Liu, Fan and Yang, Wenjie and Peng, Shouheng and Zhou, Jun},
date = {2022-12},
journaltitle = {IEEE Transactions on Neural Networks and Learning Systems},
volume = {33},
number = {12},
pages = {6999--7019},
issn = {2162-2388},
doi = {10.1109/TNNLS.2021.3084827},
abstract = {A convolutional neural network (CNN) is one of the most significant networks in the deep learning field. Since CNN made impressive achievements in many areas, including but not limited to computer vision and natural language processing, it attracted much attention from both industry and academia in the past few years. The existing reviews mainly focus on CNNs applications in different scenarios without considering CNN from a general perspective, and some novel ideas proposed recently are not covered. In this review, we aim to provide some novel ideas and prospects in this fast-growing field. Besides, not only 2-D convolution but also 1-D and multidimensional ones are involved. First, this review introduces the history of CNN. Second, we provide an overview of various convolutions. Third, some classic and advanced CNN models are introduced; especially those key points making them reach state-of-the-art results. Fourth, through experimental analysis, we draw some conclusions and provide several rules of thumb for functions and hyperparameter selection. Fifth, the applications of 1-D, 2-D, and multidimensional convolution are covered. Finally, some open issues and promising directions for CNN are discussed as guidelines for future work.},
eventtitle = {{{IEEE Transactions}} on {{Neural Networks}} and {{Learning Systems}}},
file = {/home/zenon/Zotero/storage/U7JKC8DW/Li et al. - 2022 - A Survey of Convolutional Neural Networks Analysi.pdf;/home/zenon/Zotero/storage/99TTKB2L/9451544.html}
}
@online{lin2015,
title = {Microsoft {{COCO}}: {{Common Objects}} in {{Context}}},
shorttitle = {Microsoft {{COCO}}},
@@ -571,19 +618,17 @@
file = {/home/zenon/Zotero/storage/ZMCI6A8T/Lin et al. - 2015 - Microsoft COCO Common Objects in Context.pdf}
}
@online{lin2017,
@inproceedings{lin2017,
title = {Feature {{Pyramid Networks}} for {{Object Detection}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
date = {2017-04-19},
eprint = {1612.03144},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1612.03144},
urldate = {2023-09-28},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But recent deep learning object detectors have avoided pyramid representations, in part because they are compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using FPN in a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/8BBA7R4F/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/KUPLTHRQ/1612.html}
date = {2017-07},
pages = {936--944},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.106},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/P54JRJGY/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/A8YVPLFS/8099589.html}
}
@inproceedings{lin2017a,
@@ -594,7 +639,6 @@
pages = {936--944},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.106},
urldate = {2023-10-22},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/ZBT2Z36R/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/N9EQUFC2/8099589.html}
@@ -608,27 +652,54 @@
pages = {2999--3007},
issn = {2380-7504},
doi = {10.1109/ICCV.2017.324},
urldate = {2023-10-22},
abstract = {The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors.},
eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/LL8HFKFW/Lin et al. - 2017 - Focal Loss for Dense Object Detection.pdf;/home/zenon/Zotero/storage/982Z922B/8237586.html}
}
@incollection{liu2016,
@inproceedings{lin2017c,
title = {Feature {{Pyramid Networks}} for {{Object Detection}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
date = {2017-07},
pages = {936--944},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.106},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/RNMZUZMQ/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/5JNA38YH/8099589.html}
}
@inproceedings{liu2015,
title = {Very Deep Convolutional Neural Network Based Image Classification Using Small Training Sample Size},
booktitle = {2015 3rd {{IAPR Asian Conference}} on {{Pattern Recognition}} ({{ACPR}})},
author = {Liu, Shuying and Deng, Weihong},
date = {2015-11},
pages = {730--734},
issn = {2327-0985},
doi = {10.1109/ACPR.2015.7486599},
abstract = {Since Krizhevsky won the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012 competition with the brilliant deep convolutional neural networks (D-CNNs), researchers have designed lots of D-CNNs. However, almost all the existing very deep convolutional neural networks are trained on the giant ImageNet datasets. Small datasets like CIFAR-10 has rarely taken advantage of the power of depth since deep models are easy to overfit. In this paper, we proposed a modified VGG-16 network and used this model to fit CIFAR-10. By adding stronger regularizer and using Batch Normalization, we achieved 8.45\% error rate on CIFAR-10 without severe overfitting. Our results show that the very deep CNN can be used to fit small datasets with simple and proper modifications and don't need to re-design specific small networks. We believe that if a model is strong enough to fit a large dataset, it can also fit a small one.},
eventtitle = {2015 3rd {{IAPR Asian Conference}} on {{Pattern Recognition}} ({{ACPR}})},
file = {/home/zenon/Zotero/storage/H9B6RK53/Liu and Deng - 2015 - Very deep convolutional neural network based image.pdf;/home/zenon/Zotero/storage/BIPI3CNN/7486599.html}
}
@inproceedings{liu2016,
title = {{{SSD}}: {{Single Shot MultiBox Detector}}},
shorttitle = {{{SSD}}},
booktitle = {Computer {{Vision}} -- {{ECCV}} 2016},
author = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.},
editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
date = {2016},
volume = {9905},
series = {Lecture {{Notes}} in {{Computer Science}}},
eprint = {1512.02325},
eprinttype = {arxiv},
eprintclass = {cs},
pages = {21--37},
publisher = {{Springer International Publishing}},
location = {{Cham}},
doi = {10.1007/978-3-319-46448-0_2},
urldate = {2023-08-24},
abstract = {We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. Our SSD model is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stage and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, MS COCO, and ILSVRC datasets confirm that SSD has comparable accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. Compared to other single stage methods, SSD has much better accuracy, even with a smaller input image size. For \$300\textbackslash times 300\$ input, SSD achieves 72.1\% mAP on VOC2007 test at 58 FPS on a Nvidia Titan X and for \$500\textbackslash times 500\$ input, SSD achieves 75.1\% mAP, outperforming a comparable state of the art Faster R-CNN model. Code is available at https://github.com/weiliu89/caffe/tree/ssd .},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/JQWR9QIY/Liu et al. - 2016 - SSD Single Shot MultiBox Detector.pdf;/home/zenon/Zotero/storage/Y8UXAEEU/1512.html}
abstract = {We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. SSD is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stages and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, COCO, and ILSVRC datasets confirm that SSD has competitive accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. For \$\$300 \textbackslash times 300\$\$300×300input, SSD achieves 74.3~\% mAP on VOC2007 test at 59~FPS on a Nvidia Titan X and for \$\$512 \textbackslash times 512\$\$512×512input, SSD achieves 76.9~\% mAP, outperforming a comparable state of the art Faster R-CNN model. Compared to other single stage methods, SSD has much better accuracy even with a smaller input image size. Code is available at https://github.com/weiliu89/caffe/tree/ssd.},
isbn = {978-3-319-46448-0},
langid = {english},
keywords = {Convolutional neural network,Real-time object detection},
file = {/home/zenon/Zotero/storage/LUL6FCIQ/Liu et al. - 2016 - SSD Single Shot MultiBox Detector.pdf}
} }
@article{lopez-garcia2022, @article{lopez-garcia2022,
@ -646,6 +717,20 @@
file = {/home/zenon/Zotero/storage/MJSM2BFH/López-García et al. - 2022 - Machine Learning-Based Processing of Multispectral.pdf} file = {/home/zenon/Zotero/storage/MJSM2BFH/López-García et al. - 2022 - Machine Learning-Based Processing of Multispectral.pdf}
} }
@inproceedings{lowe1999,
title = {Object {{Recognition}} from {{Local Scale-Invariant Features}}},
booktitle = {Proceedings of the {{International Conference}} on {{Computer Vision-Volume}} 2 - {{Volume}} 2},
author = {Lowe, David G.},
date = {1999-09-20},
series = {{{ICCV}} '99},
pages = {1150},
publisher = {{IEEE Computer Society}},
location = {{USA}},
abstract = {An object recognition system has been developed that uses a new class of local image features. The features are invariant to image scaling, translation, and rotation, and partially invariant to illumination changes and affine or 3D projection.These features share similar properties with neurons in inferior temporal cortex that are used for object recognition in primate vision. Features are efficiently detected through a staged filtering approach that identifies stable points in scale space. Image keys are created that allow for local geometric deformations by representing blurred image gradients in multiple orientation planes and at multiple scales.The keys are used as input to a nearest-neighbor indexing method that identifies candidate object matches. Final verification of each match is achieved by finding a low-residual least-squares solution for the unknown model parameters. Experimental results show that robust object recognition can be achieved in cluttered partially-occluded images with a computation time of under 2 seconds.},
isbn = {978-0-7695-0164-2},
file = {/home/zenon/Zotero/storage/XTECRTI7/Lowe - 1999 - Object Recognition from Local Scale-Invariant Feat.pdf}
}
@article{mateo-aroca2019,
title = {Remote {{Image Capture System}} to {{Improve Aerial Supervision}} for {{Precision Irrigation}} in {{Agriculture}}},
author = {Mateo-Aroca, Antonio and García-Mateos, Ginés and Ruiz-Canales, Antonio and Molina-García-Pardo, José María and Molina-Martínez, José Miguel},
@ -672,7 +757,6 @@
pages = {115--133},
issn = {1522-9602},
doi = {10.1007/BF02478259},
urldate = {2023-09-22},
abstract = {Because of the “all-or-none” character of nervous activity, neural events and the relations among them can be treated by means of propositional logic. It is found that the behavior of every net can be described in these terms, with the addition of more complicated logical means for nets containing circles; and that for any logical expression satisfying certain conditions, one can find a net behaving in the fashion it describes. It is shown that many particular choices among possible neurophysiological assumptions are equivalent, in the sense that for every net behaving under one assumption, there exists another net which behaves under the other and gives the same results, although perhaps not in the same time. Various applications of the calculus are discussed.},
langid = {english},
keywords = {Excitatory Synapse,Inhibitory Synapse,Nervous Activity,Spatial Summation,Temporal Summation}
@ -701,7 +785,6 @@
date = {2017-09-22},
publisher = {{The MIT Press}},
doi = {10.7551/mitpress/11301.001.0001},
urldate = {2023-09-27},
abstract = {The first systematic study of parallelism in computation by two pioneers in the field.Reissue of the 1988 Expanded Edition with a new foreword by Léon BottouIn},
isbn = {978-0-262-34393-0},
langid = {english},
@ -785,7 +868,6 @@
pages = {779--788},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.91},
urldate = {2023-10-22},
abstract = {We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/YMA63KNY/Redmon et al. - 2016 - You Only Look Once Unified, Real-Time Object Dete.pdf;/home/zenon/Zotero/storage/DJ3JER52/7780460.html}
@ -816,7 +898,6 @@
pages = {1137--1149},
issn = {1939-3539},
doi = {10.1109/TPAMI.2016.2577031},
urldate = {2023-10-22},
abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet [1] and Fast R-CNN [2] have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network(RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features-using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model [3], our detection system has a frame rate of 5 fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/NBA8U8VS/Ren et al. - 2017 - Faster R-CNN Towards Real-Time Object Detection w.pdf;/home/zenon/Zotero/storage/FJKQTY4F/7485869.html}
@ -836,7 +917,6 @@
pages = {970},
issn = {2223-7747},
doi = {10.3390/plants11070970},
urldate = {2023-08-25},
abstract = {Plant stress is one of the most significant factors affecting plant fitness and, consequently, food production. However, plant stress may also be profitable since it behaves hormetically; at low doses, it stimulates positive traits in crops, such as the synthesis of specialized metabolites and additional stress tolerance. The controlled exposure of crops to low doses of stressors is therefore called hormesis management, and it is a promising method to increase crop productivity and quality. Nevertheless, hormesis management has severe limitations derived from the complexity of plant physiological responses to stress. Many technological advances assist plant stress science in overcoming such limitations, which results in extensive datasets originating from the multiple layers of the plant defensive response. For that reason, artificial intelligence tools, particularly Machine Learning (ML) and Deep Learning (DL), have become crucial for processing and interpreting data to accurately model plant stress responses such as genomic variation, gene and protein expression, and metabolite biosynthesis. In this review, we discuss the most recent ML and DL applications in plant stress science, focusing on their potential for improving the development of hormesis management protocols.},
pmcid = {PMC9003083},
file = {/home/zenon/Zotero/storage/56I7ELHW/Rico-Chávez et al. - 2022 - Machine Learning for Plant Stress Modeling A Pers.pdf}
@ -877,7 +957,6 @@
publisher = {{Nature Publishing Group}},
issn = {1476-4687},
doi = {10.1038/323533a0},
urldate = {2023-09-29},
abstract = {We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal hidden units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure1.},
issue = {6088},
langid = {english},
@ -893,7 +972,6 @@
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1409.0575},
urldate = {2023-10-22},
abstract = {The ImageNet Large Scale Visual Recognition Challenge is a benchmark in object category classification and detection on hundreds of object categories and millions of images. The challenge has been run annually from 2010 to present, attracting participation from more than fifty institutions. This paper describes the creation of this benchmark dataset and the advances in object recognition that have been possible as a result. We discuss the challenges of collecting large-scale ground truth annotation, highlight key breakthroughs in categorical object recognition, provide a detailed analysis of the current state of the field of large-scale image classification and object detection, and compare the state-of-the-art computer vision accuracy with human accuracy. We conclude with lessons learned in the five years of the challenge, and propose future directions and improvements.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,I.4.8,I.5.2},
@ -910,7 +988,6 @@
pages = {210--229},
issn = {0018-8646},
doi = {10.1147/rd.33.0210},
urldate = {2023-10-01},
abstract = {Two machine-learning procedures have been investigated in some detail using the game of checkers. Enough work has been done to verify the fact that a computer can be programmed so that it will learn to play a better game of checkers than can be played by the person who wrote the program. Furthermore, it can learn to do this in a remarkably short period of time (8 or 10 hours of machine-playing time) when given only the rules of the game, a sense of direction, and a redundant and incomplete list of parameters which are thought to have something to do with the game, but whose correct signs and relative weights are unknown and unspecified. The principles of machine learning verified by these experiments are, of course, applicable to many other situations.},
eventtitle = {{{IBM Journal}} of {{Research}} and {{Development}}},
file = {/home/zenon/Zotero/storage/9YJSG7IJ/Samuel - 1959 - Some Studies in Machine Learning Using the Game of.pdf;/home/zenon/Zotero/storage/6XF4QCUQ/5392560.html}
@ -924,7 +1001,6 @@
pages = {1043--1066},
publisher = {{CRC Press}},
doi = {10.1201/9781410615862-66},
urldate = {2023-09-17},
abstract = {We begin with our definition of a prototype and then discuss prototypes as design artifacts, introducing four dimensions for analyzing them. We then discuss the role of prototyping within the design process, in particular the concept of a design space, and how it is expanded and contracted by generating and selecting design ideas. The next three sections describe specific prototyping approaches: Rapid prototyping, both off-line and on-line, for early stages of design, iterative prototyping, which uses on-line development tools, and evolutionary prototyping, which must be based on a sound software architecture.},
isbn = {978-0-429-16397-5},
langid = {english}
@ -948,6 +1024,19 @@
file = {/home/zenon/Zotero/storage/QC22JBMX/Selvaraju et al. - 2020 - Grad-CAM Visual Explanations from Deep Networks v.pdf}
}
@inproceedings{shrivastava2016,
title = {Training {{Region-Based Object Detectors}} with {{Online Hard Example Mining}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Shrivastava, Abhinav and Gupta, Abhinav and Girshick, Ross},
date = {2016-06},
pages = {761--769},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.89},
abstract = {The field of object detection has made significant advances riding on the wave of region-based ConvNets, but their training procedure still includes many heuristics and hyperparameters that are costly to tune. We present a simple yet surprisingly effective online hard example mining (OHEM) algorithm for training region-based ConvNet detectors. Our motivation is the same as it has always been - detection datasets contain an overwhelming number of easy examples and a small number of hard examples. Automatic selection of these hard examples can make training more effective and efficient. OHEM is a simple and intuitive algorithm that eliminates several heuristics and hyperparameters in common use. But more importantly, it yields consistent and significant boosts in detection performance on benchmarks like PASCAL VOC 2007 and 2012. Its effectiveness increases as datasets become larger and more difficult, as demonstrated by the results on the MS COCO dataset. Moreover, combined with complementary advances in the field, OHEM leads to state-of-the-art results of 78.9\% and 76.3\% mAP on PASCAL VOC 2007 and 2012 respectively.},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/HB7BQR72/Shrivastava et al. - 2016 - Training Region-Based Object Detectors with Online.pdf;/home/zenon/Zotero/storage/PEAFAEE9/7780458.html}
}
@inproceedings{simard2003,
title = {Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis},
booktitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.},
@ -955,7 +1044,6 @@
date = {2003-08},
pages = {958--963},
doi = {10.1109/ICDAR.2003.1227801},
urldate = {2023-10-01},
eventtitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.},
file = {/home/zenon/Zotero/storage/S6SE8F56/Simard et al. - 2003 - Best practices for convolutional neural networks a.pdf;/home/zenon/Zotero/storage/FQHDISEK/1227801.html}
}
@ -987,7 +1075,6 @@
pages = {154--171},
issn = {1573-1405},
doi = {10.1007/s11263-013-0620-5},
urldate = {2023-10-22},
abstract = {This paper addresses the problem of generating possible object locations for use in object recognition. We introduce selective search which combines the strength of both an exhaustive search and segmentation. Like segmentation, we use the image structure to guide our sampling process. Like exhaustive search, we aim to capture all possible object locations. Instead of a single technique to generate possible object locations, we diversify our search and use a variety of complementary image partitionings to deal with as many image conditions as possible. Our selective search results in a small set of data-driven, class-independent, high quality locations, yielding 99~\% recall and a Mean Average Best Overlap of 0.879 at 10,097 locations. The reduced number of locations compared to an exhaustive search enables the use of stronger machine learning techniques and stronger appearance models for object recognition. In this paper we show that our selective search enables the use of the powerful Bag-of-Words model for recognition. The selective search software is made publicly available (Software: http://disi.unitn.it/\textasciitilde uijlings/SelectiveSearch.html).},
langid = {english},
keywords = {Appearance Model,Colour Space,Exhaustive Search,Object Location,Object Recognition},
@ -1071,7 +1158,6 @@
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1311.2901},
urldate = {2023-10-27},
abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we address both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. We also perform an ablation study to discover the performance contribution from different model layers. This enables us to find model architectures that outperform Krizhevsky \textbackslash etal on the ImageNet classification benchmark. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
@ -1100,7 +1186,6 @@
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1911.08287},
urldate = {2023-09-28},
abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while \$\textbackslash ell\_n\$-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression. In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, \textbackslash ie, overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster RCNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
@ -1160,6 +1245,18 @@
file = {/home/zenon/Zotero/storage/CLHDBTJ2/qWPwnQEACAAJ.html}
}
@article{zou2023,
title = {Object {{Detection}} in 20 {{Years}}: {{A Survey}}},
shorttitle = {Object {{Detection}} in 20 {{Years}}},

Binary file not shown.


@ -111,6 +111,13 @@
\newacronym[plural=ROIs,longplural=Regions of Interest]{roi}{ROI}{Region of Interest}
\newacronym{spp}{SPP}{Spatial Pyramid Pooling}
\newacronym{rpn}{RPN}{Region Proposal Network}
\newacronym{fpn}{FPN}{Feature Pyramid Network}
\newacronym{yolo}{YOLO}{You Only Look Once}
\newacronym{ssd}{SSD}{Single Shot MultiBox Detector}
\newacronym{ann}{ANN}{Artificial Neural Network}
\newacronym{cuda}{CUDA}{Compute Unified Device Architecture}
\newacronym{rbf}{RBF}{Radial Basis Function}
\newacronym{mnist}{MNIST}{Modified National Institute of Standards and Technology}
\begin{document}
@ -863,20 +870,6 @@ gradient descent \cite{cauchy1847}.
\section{Object Detection}
\label{sec:background-detection}
Give a definition of object detection and contrast it with instance
segmentation/other detection tasks. Briefly mention how object
detection was done before deep neural networks (feature-based methods
(HOG, SIFT) and sliding window methods (Viola-Jones)). Go over the
different approaches to object detection, namely region-based methods
(Mask R-CNN and Faster R-CNN) and single-shot detection. Illustrate
the approach region-based methods take and discuss problems arising
from said approach (e.g. Dual-Priorities, multiple image passes and
slow selective search algorithms for region proposals). Contrast the
previous region-based methods with newer single-shot detectors such as
YOLO and SSDnet.
Estimated 8 pages for this section.
From facial detection to fully automated driving—object detection
provides the basis for a wide variety of tasks within the computer
vision world. While most implementations in the 1990s and early 2000s
@ -923,15 +916,15 @@ achieves comparable results to the state of the art in 2001.
The \gls{hog}~\cite{dalal2005} is a feature descriptor used in
computer vision and image processing to detect objects in images. It
captures shape information, similar to other descriptors such as
\gls{sift} \cite{lowe1999}. The idea is to use the distribution of
local intensity gradients or edge directions to describe an object. To
this end, the authors divide the image into a grid of cells and
calculate a histogram of edge orientations within each
cell. Additionally, each histogram is normalized by taking a larger
region and adjusting the local histograms based on the larger region's
intensity levels. The resulting blocks of normalized gradients are
evenly spaced out across the image with some overlap. These patches
are then passed as a feature vector to a classifier.
\textcite{dalal2005} successfully use the \gls{hog} with a linear
\gls{svm} for classification to detect humans in images. They work
@ -1135,14 +1128,151 @@ The \gls{rpn} makes object proposal generation inexpensive and
possible on \glspl{gpu}. The whole network operates at near real-time
speed, processing \qty{5}{images\per\s} while maintaining a high
state-of-the-art \gls{map} of 73.2\% (\gls{voc} 2007). If the detection
network is switched from VGGNet \cite{liu2015} to ZF-Net
\cite{zeiler2013}, Faster R-\gls{cnn} is able to achieve
\qty{17}{images\per\s}, albeit at a lower \gls{map} of 59.9\%.
\subsubsection{Feature Pyramid Network}
\label{sssec:theory-fpn}
\glspl{fpn} were first introduced by \textcite{lin2017} to use the
hierarchical pyramid structure inherent in \glspl{cnn} to compute
feature maps on different scales. Previously, detectors used only the
features of the topmost (coarse) layers because it was computationally
too expensive to use the lower (fine-grained) layers. By leveraging
feature maps at different scales, \glspl{fpn} are better able to
detect small objects because predictions are made independently at all
levels. \glspl{fpn} are an important building block of many
state-of-the-art object detectors.
An \gls{fpn} first computes the feature pyramid bottom-up with a
scaling step of 2. The lower levels capture less semantic information
than the higher levels, but include more spatial information due to
their higher resolution. In a second step, the \gls{fpn} upsamples the
higher levels such that the dimensions of two consecutive layers
match. Each lateral (lower-level) feature map is first passed through
a $1\times 1$ convolutional layer to reduce its channel dimensions and
is then merged with the upsampled layer above it via element-wise
addition; a subsequent $3\times 3$ convolution smooths out potential
artifacts introduced during the upsampling step. The result of that
operation constitutes the new \emph{top layer} and the process
continues with the layer below it until the finest-resolution feature
map is generated. In this way, the features of the different layers at
different scales are fused to obtain feature maps with high semantic
as well as high spatial information.
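The merging step can be illustrated with a short sketch. The following
is a minimal PyTorch example of a single top-down merge; the channel
width of 256 and the nearest-neighbour upsampling follow the paper,
but the class and variable names are chosen here purely for
illustration and this is not the reference implementation of
\textcite{lin2017}.

\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopDownMerge(nn.Module):
    """One FPN merging step: upsample the coarser map, add the
    lateral (finer) map and smooth the result."""

    def __init__(self, lateral_channels, out_channels=256):
        super().__init__()
        # 1x1 convolution reduces the channels of the bottom-up map.
        self.lateral = nn.Conv2d(lateral_channels, out_channels,
                                 kernel_size=1)
        # 3x3 convolution smooths aliasing artifacts after upsampling.
        self.smooth = nn.Conv2d(out_channels, out_channels,
                                kernel_size=3, padding=1)

    def forward(self, top, lateral):
        # Upsample the coarser map by a factor of 2 so that both
        # feature maps have the same spatial dimensions.
        top = F.interpolate(top, scale_factor=2, mode="nearest")
        merged = top + self.lateral(lateral)
        return self.smooth(merged)

# Toy usage: merge a coarse 256-channel map with a finer 512-channel map.
coarse = torch.randn(1, 256, 16, 16)
finer = torch.randn(1, 512, 32, 32)
out = TopDownMerge(512)(coarse, finer)
print(out.shape)  # torch.Size([1, 256, 32, 32])
\end{verbatim}

Repeating this step down the pyramid yields one merged feature map per
level, each of which can then be used for prediction.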
\textcite{lin2017} report a \gls{map}@0.5 of 59.1\% on \gls{coco}
using a Faster R-\gls{cnn} structure with a ResNet-101 backbone. Their
submission does not include any specific improvements such as online
hard example mining \cite{shrivastava2016} or data augmentation.
\subsection{One-Stage Detectors}
\label{ssec:theory-one-stage}
One-stage detectors, in contrast to two-stage detectors, combine the
proposal generation and detection tasks into one neural network such
that all objects can be detected in a single pass. Since proposal
generation is a costly operation and usually the bottleneck of
two-stage detectors, one-stage detectors are significantly faster
overall. Their speed allows them to be deployed to low-resource
devices such as mobile phones while still providing real-time object
detection. Unfortunately, their detection accuracy trailed that of
two-stage approaches for years, especially for small and densely
packed objects.
\subsubsection{You Only Look Once}
\label{sssec:theory-yolo}
\gls{yolo}, the first one-stage detector, was introduced by
\textcite{redmon2016}. It divides each image into a grid of regions
and predicts bounding boxes and class probabilities for all regions
simultaneously. This allows it to be extremely fast at up to
\qty{155}{fps} with a \gls{map} of 52.7\% on \gls{voc} 2007. Its
accuracy was not state of the art at the time because the architecture
trades localization accuracy for speed, especially for small
objects. These issues have been addressed gradually in later versions
of \gls{yolo} as well as in
other one-stage detectors such as \gls{ssd}. Since a later version of
\gls{yolo} is used in this work, we refer to
section~\ref{sec:methods-detection} for a thorough account of its
architecture.
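To make the grid formulation concrete: each of the $S \times S$ cells
predicts $B$ bounding boxes (four coordinates plus one confidence
score each) together with $C$ class probabilities, so the output is a
tensor of size
\[
S \times S \times (B \cdot 5 + C),
\]
which, for the \gls{voc} configuration reported by
\textcite{redmon2016} ($S = 7$, $B = 2$, $C = 20$), amounts to
$7 \times 7 \times 30$.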
\subsubsection{Single Shot MultiBox Detector}
\label{sssec:theory-ssd}
\gls{ssd} was proposed by \textcite{liu2016} and functions similarly
to \gls{yolo} in that it does not need an extra proposal generation
step, but instead detects and classifies objects in one go. The aim of
one-stage detectors is to be considerably faster and at least as
accurate as two-stage detectors. While \gls{yolo} paved the way for
one-stage detectors, its detection accuracy is significantly lower
than that of state-of-the-art two-stage approaches such as Faster
R-\gls{cnn}. \gls{ssd} combines detections generated at multiple
scales with an end-to-end architecture to achieve high accuracy as
well as high speed.
\gls{ssd} is based on a standard \gls{cnn} such as VGG16
\cite{liu2015} and adds additional feature layers to the network. The
\gls{cnn}, which the detector uses to extract features, has its
fully-connected classification layers removed such that the output of
the \gls{cnn} is a scaled-down feature representation of the input
image. The extra layers are intended to capture features at different
scales and are compared during training to a range of default anchor
boxes. This idea comes from MultiBox \cite{erhan2014}, but is
implemented in \gls{ssd} with a slight twist: when matching default
boxes to the ground truth, boxes with a Jaccard overlap (\gls{iou}) of
less than $0.5$ are not counted as positive matches. In one-stage
detector terms, the feature extractor is the \emph{backbone} whereas
the extra layers constitute the \emph{head} of the network. The
feature maps produced at the different scales capture regions of
different sizes. Making use of these additional feature maps is what
sets \gls{ssd} apart from \gls{yolo} and results in \gls{ssd} being
able to detect smaller and denser objects as well.
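To make the matching rule concrete, the following NumPy sketch
computes the Jaccard overlap between default boxes and ground-truth
boxes (given as corner coordinates) and treats a default box as a
positive match only if its best overlap reaches $0.5$. It is a
simplified stand-in, not the original Caffe implementation of
\textcite{liu2016}; the full strategy additionally assigns every
ground-truth box to its single best default box regardless of the
threshold.

\begin{verbatim}
import numpy as np

def jaccard(defaults, gts):
    """Pairwise IoU between (N, 4) default boxes and (M, 4)
    ground-truth boxes given as (x1, y1, x2, y2) corners."""
    x1 = np.maximum(defaults[:, None, 0], gts[None, :, 0])
    y1 = np.maximum(defaults[:, None, 1], gts[None, :, 1])
    x2 = np.minimum(defaults[:, None, 2], gts[None, :, 2])
    y2 = np.minimum(defaults[:, None, 3], gts[None, :, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_d = ((defaults[:, 2] - defaults[:, 0])
              * (defaults[:, 3] - defaults[:, 1]))
    area_g = (gts[:, 2] - gts[:, 0]) * (gts[:, 3] - gts[:, 1])
    return inter / (area_d[:, None] + area_g[None, :] - inter)

def match(defaults, gts, threshold=0.5):
    """Index of the matched ground truth per default box, -1 if the
    best overlap stays below the threshold (treated as background)."""
    overlaps = jaccard(defaults, gts)   # shape (N, M)
    best_gt = overlaps.argmax(axis=1)
    best_iou = overlaps.max(axis=1)
    return np.where(best_iou >= threshold, best_gt, -1)

defaults = np.array([[0, 0, 10, 10], [20, 20, 40, 40]], dtype=float)
gts = np.array([[1, 1, 9, 9]], dtype=float)
print(match(defaults, gts))  # [ 0 -1]
\end{verbatim}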
The authors report results on \gls{voc} 2007 for their \gls{ssd}300
and \gls{ssd}512 model variants, where the number refers to the size
of the input images. \gls{ssd}300 outperforms Fast R-\gls{cnn} by 1.1
percentage points (\gls{map} of 68\% vs 66.9\%). \gls{ssd}512
outperforms Faster R-\gls{cnn} by 1.7 percentage points of
\gls{map}. If trained on the \gls{voc} 2007, 2012 and \gls{coco} train
sets, \gls{ssd}512 achieves a \gls{map} of 81.5\% on the \gls{voc}
2007 test set. \gls{ssd} runs at \qty{46}{fps} which, although lower
than Fast \gls{yolo}'s \qty{155}{fps}, is still real time. Furthermore,
\gls{ssd}'s \gls{map} is almost 22 percentage points higher than that
of Fast \gls{yolo}.
\subsubsection{RetinaNet}
\label{sssec:theory-retinanet}
One-stage detectors before 2017 always trailed the accuracy of top
two-stage detectors on common and difficult benchmark data sets such
as \gls{coco}. \textcite{lin2017b} investigated the cause of the lower
accuracy and identified the severe class imbalance between foreground
and background examples as the main culprit. They introduce a novel
loss function called \emph{Focal Loss}
which replaces the standard cross-entropy loss. Focal loss
down-weights the importance of easy negative examples during training
and instead focuses on instances which are harder but provide more
information.
Focal loss is based on cross-entropy loss but includes a scaling
factor which decreases while the classification confidence
increases. In other words, if the confidence that an object belongs to
a particular class is already high, focal loss outputs a small value
such that the weight updates during backpropagation are only
marginally affected by the current example. The model can thus focus
on examples for which it is harder to achieve a good confidence score.
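Formally, with $p_t$ denoting the model's estimated probability for
the ground-truth class, \textcite{lin2017b} define the focal loss as
\[
\mathrm{FL}(p_t) = -\alpha_t \, (1 - p_t)^{\gamma} \log(p_t),
\]
where the modulating factor $(1 - p_t)^{\gamma}$ shrinks the loss
contribution of well-classified examples and $\alpha_t$ is an optional
class-balancing weight. The authors report $\gamma = 2$ and
$\alpha = 0.25$ as their default setting; choosing $\gamma = 0$
recovers the ($\alpha$-balanced) cross-entropy loss.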
\textcite{lin2017b} implement their focal loss with a simple one-stage
detector called \emph{RetinaNet}. It makes use of previous advances in
object detection and classification by including an \gls{fpn} on top
of a ResNet \cite{he2016} backbone and by using anchors for the
different levels in the feature pyramid. Attached to the backbone are
two subnetworks which classify anchor boxes and regress them to the
ground truth boxes. The RetinaNet-101-500 variant (with an input size
of \qty{500}{px}) achieves a \gls{map} of 34.4\% at a speed of around
\qty{11}{fps} on the \gls{coco} data set.
\section{Image Classification}
\label{sec:background-classification}
Give a definition of image classification and briefly mention the way
@ -1160,6 +1290,153 @@ connections.
Estimated 8 pages for this section.
Image classification, in contrast to object detection, is a slightly
easier task because there is no requirement to localize objects in the
image. Instead, image classification always operates on the image as a
whole rather than on individual parts of it. As the previous section
has demonstrated, object detection methods often rely on advances in
image classification to accurately detect objects. Once objects have
been localized, we usually want to know what kind of objects they are,
and that is where image classification methods become useful.
This section goes into detail about various image classification
methods. We first give a short summary on how image classification was
commonly done before \glspl{cnn} became the de facto
standard. Afterwards, we will introduce common and influential
approaches leveraging \glspl{cnn} and discuss problems and solutions
for training large networks.
\subsection{Traditional Methods}
\label{ssec:class-traditional}
Similarly to early object detection algorithms, traditional methods
rely on manual feature extraction and subsequent classification with
classical algorithms. Passing raw images to these algorithms is often
not feasible due to the sheer amount of information contained in a
single image. Furthermore, a raw image has a signal-to-noise ratio
that is too low for such algorithms to learn useful properties of the
image. Instead, humans, aided by image processing methods, have to
select a lower-dimensional representation of the input image and then
pass this representation to a classifier. This process of manually
reducing the dimensions and complexity of an image to its
\emph{relevant} parts is termed \emph{feature engineering}.
Manual feature engineering requires selecting an appropriate
representation for the task at hand. For example, if the task is to
classify images which show an object with a special texture, a feature
engineer will likely select an image representation which clearly
pulls the texture into the foreground. In other words, engineers help
the classifier by preprocessing the image such that the most
discriminative features are easily visible. The method with which such
an image representation is created is called a \emph{feature
descriptor}. In line with the different ways objects can present
themselves in images, many feature descriptors have been
proposed. Most of the feature descriptors used in object detection are
also used in image
classification (see \gls{hog} and \gls{sift} from
section~\ref{sssec:obj-hog}) because their representational power is
useful in both domains.
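As a minimal sketch of such a pipeline, the following example extracts
\gls{hog} descriptors with scikit-image and feeds them to a linear
\gls{svm} from scikit-learn on a small digit data set; the parameter
values are illustrative defaults rather than the settings of
\textcite{dalal2005}.

\begin{verbatim}
import numpy as np
from skimage.feature import hog
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Small 8x8 digit images serve as a stand-in for a real data set.
digits = datasets.load_digits()

# Manual feature engineering: turn every image into a HOG descriptor.
features = np.array([
    hog(img, orientations=9, pixels_per_cell=(4, 4),
        cells_per_block=(1, 1))
    for img in digits.images
])

X_train, X_test, y_train, y_test = train_test_split(
    features, digits.target, test_size=0.25, random_state=0)

# The engineered feature vectors are passed to a classical classifier.
clf = LinearSVC(dual=False).fit(X_train, y_train)
print(f"test accuracy: {clf.score(X_test, y_test):.2f}")
\end{verbatim}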
\subsection{Deep Learning Based Methods}
\label{ssec:class-dl}
Manual feature engineering is a double-edged sword. Although it
affords a high degree of control, it also requires the engineer to
select a meaningful representation for training the downstream
classifier. Humans often make unconscious assumptions about the
problem to be solved, the available data and how best to extract
features. These assumptions can have a detrimental effect on
classification accuracy later on because the best-performing feature
descriptor may lie outside of the engineer's purview. Therefore, instead
of manually preparing feature vectors for the classifier, researchers
turned to allowing an \gls{ann} to recognize and extract the most
relevant aspects of an image on its own, without human
intervention. Attention is thus mostly given to the structure of the
\gls{ann} and less to the preparation of inputs.
The idea of automatic generation of feature maps via \glspl{ann} gave
rise to \glspl{cnn}. Early \glspl{cnn} \cite{lecun1989} were rarely
adopted for practical applications because they require much more
data during training than traditional methods and also more processing
power during inference. Passing $224\times 224$ pixel images to a
\gls{cnn}, as is common today, was simply not feasible if one wanted a
reasonable inference time. With the development of \glspl{gpu} and
supporting software such as the \gls{cuda} toolkit, it was possible to
perform many computations in parallel. The architecture of \glspl{cnn}
lends itself well to parallel processing and thus \glspl{cnn} slowly
but surely overtook other image classification methods.
\subsubsection{LeNet-5}
\label{sssec:theory-lenet-5}
LeNet-5, developed and described by \textcite{lecun1998}, laid the
foundation of \glspl{cnn} as we still use them today. The basic
structure of convolutional layers with pooling layers in-between and
one or more fully-connected layers at the end has been iterated on
many times since then. \textcite{lecun1989} introduced the first
version of LeNet when describing their system for automatic
handwritten zip code recognition. They applied backpropagation with
\gls{sgd} and used the scaled hyperbolic tangent as the activation
function. The error function with which the weights are updated is
\gls{mse}.
The architecture of LeNet-5 is composed of two convolutional layers,
two pooling layers and a dense block of three fully-connected
layers. The input image is a grayscale image of 32 by 32 pixels. The
first convolutional layer generates six feature maps, each with a
size of 28 by 28 pixels. Each feature map is fed to a pooling layer
which downsamples it by a factor of two. By averaging each two by two
area of a feature map, the network encodes the position of a feature
relative to other features rather than its absolute position. To make
up for the loss in spatial resolution, the following convolutional
layer increases the number of feature maps to 16, which aims to
increase the richness of the learned representations. Another pooling
layer follows which reduces the size of each of the 16 feature maps to
five by five pixels. A dense block of three fully-connected layers of
120, 84 and 10 neurons respectively serves as the actual classifier in
the network. The last layer uses Euclidean \gls{rbf} units to compute
the class an image belongs to (the digits 0--9).
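A compact PyTorch sketch of this architecture is given below. It
follows the layer sizes described above but replaces the Euclidean
\gls{rbf} output layer with a plain linear layer, so it should be read
as an approximation of the original network rather than a faithful
reimplementation.

\begin{verbatim}
import torch
import torch.nn as nn

class LeNet5(nn.Module):
    """Approximation of LeNet-5: two convolutional layers, two
    average pooling layers and a dense block of 120, 84 and 10
    neurons."""

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),   # 1x32x32 -> 6x28x28
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),      # 6x28x28 -> 6x14x14
            nn.Conv2d(6, 16, kernel_size=5),  # 6x14x14 -> 16x10x10
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),      # 16x10x10 -> 16x5x5
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(16 * 5 * 5, 120),
            nn.Tanh(),
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),       # replaces the RBF output
        )

    def forward(self, x):
        return self.classifier(self.features(x))

# A batch with one grayscale 32x32 image yields ten class scores.
print(LeNet5()(torch.randn(1, 1, 32, 32)).shape)  # torch.Size([1, 10])
\end{verbatim}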
The performance of LeNet-5 was measured on the \gls{mnist} database
which consists of 70,000 labeled images of handwritten digits. The
error rate on the test set is 0.95\%. This result is impressive
considering that \gls{cnn}-based character recognition was still in
its infancy. However, standard machine learning methods of the time,
such as manual feature engineering and \glspl{svm}, achieved a similar
error rate, even though they are much more memory-intensive. LeNet-5
was conceived to take advantage of the (then) large \gls{mnist}
database. Since there were not many data sets available at the time,
especially with more samples than in the \gls{mnist} database,
\glspl{cnn} were not widely used even after their viability had been
demonstrated by \textcite{lecun1998}. Only in 2012 did
\textcite{krizhevsky2012} reintroduce \glspl{cnn} (see
section~\ref{ssec:theory-dl-based}), and since then most
state-of-the-art image classification methods have used them.
\subsubsection{ZFNet}
\label{sssec:theory-zfnet}
\subsubsection{GoogLeNet}
\label{sssec:theory-googlenet}
\subsubsection{VGGNet}
\label{sssec:theory-vggnet}
\subsubsection{ResNet}
\label{sssec:theory-resnet}
\subsubsection{Inception v4}
\label{sssec:theory-inception-v4}
\subsubsection{DenseNet}
\label{sssec:theory-densenet}
\subsubsection{MobileNet v3}
\label{sssec:theory-mobilenet-v3}
\section{Transfer Learning}
\label{sec:background-transfer-learning}