% master-thesis/thesis/references.bib

@article{an2019,
title = {Identification and {{Classification}} of {{Maize Drought Stress Using Deep Convolutional Neural Network}}},
author = {An, Jiangyong and Li, Wanyi and Li, Maosong and Cui, Sanrong and Yue, Huanran},
date = {2019-02},
journaltitle = {Symmetry},
volume = {11},
number = {2},
pages = {256},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {2073-8994},
doi = {10.3390/sym11020256},
keywords = {deep convolutional neural network,drought classification,drought identification,drought stress,maize,phenotype,traditional machine learning},
file = {/home/zenon/Zotero/storage/GUCGV95A/An et al. - 2019 - Identification and Classification of Maize Drought.pdf}
}
@article{ariss2022,
title = {{{ResNet-based Parkinson}}'s {{Disease Classification}}},
author = {Ariss, Omar El and Hu, Kaoning},
date = {2022},
journaltitle = {IEEE Transactions on Artificial Intelligence},
pages = {1--11},
issn = {2691-4581},
doi = {10.1109/TAI.2022.3193651},
eventtitle = {{{IEEE Transactions}} on {{Artificial Intelligence}}},
keywords = {Convolutional Neural Networks,deep learning,Deep learning,diagnosis,Diseases,Feature extraction,frequency features,heat map,Heating systems,Parkinson's disease,Parkinson's Disease,Recording,Residual neural networks,ResNet,speech recording,transfer learning}
}
@article{atanasov2021,
title = {Predicting {{Soil Moisture Based}} on the {{Color}} of the {{Leaves Using Data Mining}} and {{Machine Learning Techniques}}},
author = {Atanasov, S. S.},
date = {2021-01},
journaltitle = {IOP Conference Series: Materials Science and Engineering},
shortjournal = {IOP Conf. Ser.: Mater. Sci. Eng.},
volume = {1031},
number = {1},
pages = {012076},
publisher = {{IOP Publishing}},
issn = {1757-899X},
doi = {10.1088/1757-899X/1031/1/012076},
file = {/home/zenon/Zotero/storage/TIZ9KQTP/Atanasov - 2021 - Predicting Soil Moisture Based on the Color of the.pdf}
}
@article{aversano2022,
title = {Water Stress Classification Using {{Convolutional Deep Neural Networks}}},
author = {Aversano, Lerina and Bernardi, Mario Luca and Cimitile, Marta},
date = {2022-03-28},
journaltitle = {JUCS - Journal of Universal Computer Science},
volume = {28},
number = {3},
pages = {311--328},
publisher = {{Journal of Universal Computer Science}},
issn = {0948-6968},
doi = {10.3897/jucs.80733},
abstract = {In agriculture, given the global water scarcity, optimizing the irrigation system has become a key requisite of any semi-automatic irrigation scheduling system. Using efficient assessment methods for crop water stress allows reduced water consumption as well as improved quality and quantity of the production. The adoption of Neural Network can support the automatic in situ continuous monitoring and irrigation through the real-time classification of the plant water stress. This study proposes an end-to-end automatic irrigation system based on the adoption of Deep Neural Networks for the multinomial classification of tomato plants' water stress based on thermal and optical aerial images. This paper proposes a novel approach that covers three important aspects: (i) joint usage of optical and thermal camera, captured by un-manned aerial vehicles (UAVs); (ii) strategies of image segmentation in both thermal imaging used to obtain images that can remove noise and parts not useful for classifying water stress; (iii) the adoption of deep pre-trained neural ensembles to perform effective classification of field water stress. Firstly, we used a multi-channel approach based on both thermal and optical images gathered by a drone to obtain a more robust and broad image extraction. Moreover, looking at the image processing, a segmentation and background removal step is performed to improve the image quality. Then, the proposed VGG-based architecture is designed as a combination of two different VGG instances (one for each channel). To validate the proposed approach a large real dataset is built. It is composed of 6000 images covering all the lifecycle of the tomato crops captured with a drone thermal and optical photocamera. Specifically, our approach, looking mainly at leaves and fruits status and patterns, is designed to be applied after the plants have been transplanted and have reached, at least, early growth stage (covering vegetative, flowering, fruit-formation and mature fruiting stages).},
issue = {3},
langid = {english},
file = {/home/zenon/Zotero/storage/BMNA2R55/Aversano et al. - 2022 - Water stress classification using Convolutional De.pdf}
}
@article{awad2019,
title = {Toward {{Precision}} in {{Crop Yield Estimation Using Remote Sensing}} and {{Optimization Techniques}}},
author = {Awad, Mohamad M.},
date = {2019-03},
journaltitle = {Agriculture},
volume = {9},
number = {3},
pages = {54},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {2077-0472},
doi = {10.3390/agriculture9030054},
keywords = {crop yield,environment,evapotranspiration,image processing,remote sensing},
file = {/home/zenon/Zotero/storage/C65MLVQW/Awad - 2019 - Toward Precision in Crop Yield Estimation Using Re.pdf}
}
@inproceedings{azimi2020,
title = {Water {{Stress Identification}} in {{Chickpea Plant Shoot Images Using Deep Learning}}},
booktitle = {2020 {{IEEE}} 17th {{India Council International Conference}} ({{INDICON}})},
author = {Azimi, Shiva and Kaur, Taranjit and Gandhi, Tapan K.},
date = {2020-12},
pages = {1--7},
issn = {2325-9418},
doi = {10.1109/INDICON49873.2020.9342388},
eventtitle = {2020 {{IEEE}} 17th {{India Council International Conference}} ({{INDICON}})},
keywords = {computer vision,deep learning,Deep learning,Nitrogen,plant phenotyping,Proteins,Real-time systems,Stress,Support vector machines,Tools,water stress}
}
@article{azimi2021,
title = {Intelligent {{Monitoring}} of {{Stress Induced}} by {{Water Deficiency}} in {{Plants Using Deep Learning}}},
author = {Azimi, Shiva and Wadhawan, Rohan and Gandhi, Tapan K.},
date = {2021},
journaltitle = {IEEE Transactions on Instrumentation and Measurement},
volume = {70},
pages = {1--13},
issn = {1557-9662},
doi = {10.1109/TIM.2021.3111994},
eventtitle = {{{IEEE Transactions}} on {{Instrumentation}} and {{Measurement}}},
keywords = {Computer vision,convolutional neural network (CNN),Convolutional neural networks,Crops,deep learning (DL),Long short term memory,long short-term memory (LSTM),monitoring,neural network,Pipelines,plant phenotyping,spatiotemporal analysis,Stress,Visualization,water stress},
file = {/home/zenon/Zotero/storage/RSNWFVIZ/Azimi et al. - 2021 - Intelligent Monitoring of Stress Induced by Water .pdf}
}
@article{bengio1994,
title = {Learning Long-Term Dependencies with Gradient Descent Is Difficult},
author = {Bengio, Y. and Simard, P. and Frasconi, P.},
date = {1994-03},
journaltitle = {IEEE Transactions on Neural Networks},
volume = {5},
number = {2},
pages = {157--166},
issn = {1941-0093},
doi = {10.1109/72.279181},
abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
eventtitle = {{{IEEE Transactions}} on {{Neural Networks}}},
file = {/home/zenon/Zotero/storage/VNSQAD8H/Bengio et al. - 1994 - Learning long-term dependencies with gradient desc.pdf;/home/zenon/Zotero/storage/5BQESSD9/279181.html}
}
@article{benos2021,
title = {Machine {{Learning}} in {{Agriculture}}: {{A Comprehensive Updated Review}}},
shorttitle = {Machine {{Learning}} in {{Agriculture}}},
author = {Benos, Lefteris and Tagarakis, Aristotelis C. and Dolias, Georgios and Berruto, Remigio and Kateris, Dimitrios and Bochtis, Dionysis},
date = {2021-01},
journaltitle = {Sensors},
volume = {21},
number = {11},
pages = {3758},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {1424-8220},
doi = {10.3390/s21113758},
keywords = {artificial intelligence,crop management,livestock management,machine learning,precision agriculture,precision livestock farming,soil management,water management},
file = {/home/zenon/Zotero/storage/ILXR97E5/Benos et al. - 2021 - Machine Learning in Agriculture A Comprehensive U.pdf}
}
@inproceedings{bergstra2011,
title = {Algorithms for {{Hyper-Parameter Optimization}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Bergstra, James and Bardenet, Rémi and Bengio, Yoshua and Kégl, Balázs},
date = {2011},
volume = {24},
publisher = {{Curran Associates, Inc.}},
url = {https://proceedings.neurips.cc/paper_files/paper/2011/hash/86e8f7ab32cfd12577bc2619bc635690-Abstract.html},
urldate = {2023-11-16},
abstract = {Several recent advances to the state of the art in image classification benchmarks have come from better configurations of existing techniques rather than novel approaches to feature learning. Traditionally, hyper-parameter optimization has been the job of humans because they can be very efficient in regimes where only a few trials are possible. Presently, computer clusters and GPU processors make it possible to run more trials and we show that algorithmic approaches can find better results. We present hyper-parameter optimization results on tasks of training neural networks and deep belief networks (DBNs). We optimize hyper-parameters using random search and two new greedy sequential methods based on the expected improvement criterion. Random search has been shown to be sufficiently efficient for learning neural networks for several datasets, but we show it is unreliable for training DBNs. The sequential algorithms are applied to the most difficult DBN learning problems from [Larochelle et al., 2007] and find significantly better results than the best previously reported. This work contributes novel techniques for making response surface models P (y|x) in which many elements of hyper-parameter assignment (x) are known to be irrelevant given particular values of other elements.},
file = {/home/zenon/Zotero/storage/VYNIDUVW/Bergstra et al. - 2011 - Algorithms for Hyper-Parameter Optimization.pdf}
}
@article{bergstra2012,
title = {Random {{Search}} for {{Hyper-Parameter Optimization}}},
author = {Bergstra, James and Bengio, Yoshua},
date = {2012-02-01},
journaltitle = {The Journal of Machine Learning Research},
shortjournal = {J. Mach. Learn. Res.},
volume = {13},
pages = {281--305},
issn = {1532-4435},
keywords = {deep learning,global optimization,model selection,neural networks,response surface modeling},
file = {/home/zenon/Zotero/storage/DURIB5S9/Bergstra and Bengio - 2012 - Random search for hyper-parameter optimization.pdf}
}
@article{beyer2002,
title = {Evolution Strategies -- {{A}} Comprehensive Introduction},
author = {Beyer, Hans-Georg and Schwefel, Hans-Paul},
date = {2002-03-01},
journaltitle = {Natural Computing},
shortjournal = {Natural Computing},
volume = {1},
number = {1},
pages = {3--52},
issn = {1572-9796},
doi = {10.1023/A:1015059928466},
abstract = {This article gives a comprehensive introduction into one of the main branches of evolutionary computation -- the evolution strategies (ES) -- the history of which dates back to the 1960s in Germany. Starting from a survey of history, the philosophical background is explained in order to make understandable why ES are realized in the way they are. Basic ES algorithms and design principles for variation and selection operators as well as theoretical issues are presented, and future branches of ES research are discussed.},
langid = {english},
keywords = {computational intelligence,Darwinian evolution,design principles for genetic operators,evolution strategies,evolutionary computation,optimization},
file = {/home/zenon/Zotero/storage/98W3Q3UZ/Beyer and Schwefel - 2002 - Evolution strategies A comprehensive introductio.pdf}
}
@article{bischl2023,
title = {Hyperparameter Optimization: {{Foundations}}, Algorithms, Best Practices, and Open Challenges},
shorttitle = {Hyperparameter Optimization},
author = {Bischl, Bernd and Binder, Martin and Lang, Michel and Pielok, Tobias and Richter, Jakob and Coors, Stefan and Thomas, Janek and Ullmann, Theresa and Becker, Marc and Boulesteix, Anne-Laure and Deng, Difan and Lindauer, Marius},
date = {2023},
journaltitle = {WIREs Data Mining and Knowledge Discovery},
volume = {13},
number = {2},
pages = {e1484},
issn = {1942-4795},
doi = {10.1002/widm.1484},
abstract = {Most machine learning algorithms are configured by a set of hyperparameters whose values must be carefully chosen and which often considerably impact performance. To avoid a time-consuming and irreproducible manual process of trial-and-error to find well-performing hyperparameter configurations, various automatic hyperparameter optimization (HPO) methods—for example, based on resampling error estimation for supervised machine learning—can be employed. After introducing HPO from a general perspective, this paper reviews important HPO methods, from simple techniques such as grid or random search to more advanced methods like evolution strategies, Bayesian optimization, Hyperband, and racing. This work gives practical recommendations regarding important choices to be made when conducting HPO, including the HPO algorithms themselves, performance evaluation, how to combine HPO with machine learning pipelines, runtime improvements, and parallelization. This article is categorized under: Algorithmic Development {$>$} Statistics Technologies {$>$} Machine Learning Technologies {$>$} Prediction},
langid = {english},
keywords = {automl,hyperparameter optimization,machine learning,model selection,tuning},
file = {/home/zenon/Zotero/storage/A9KM545A/Bischl et al. - 2023 - Hyperparameter optimization Foundations, algorith.pdf;/home/zenon/Zotero/storage/HXJ3BRVQ/widm.html}
}
@book{bishop2006,
title = {Pattern {{Recognition}} and {{Machine Learning}}},
author = {Bishop, Christopher M.},
date = {2006-08-17},
eprint = {qWPwnQEACAAJ},
eprinttype = {googlebooks},
publisher = {{Springer}},
abstract = {This is the first textbook on pattern recognition to present the Bayesian viewpoint. The book presents approximate inference algorithms that permit fast approximate answers in situations where exact answers are not feasible. It uses graphical models to describe probability distributions when no other books apply graphical models to machine learning. No previous knowledge of pattern recognition or machine learning concepts is assumed. Familiarity with multivariate calculus and basic linear algebra is required, and some experience in the use of probabilities would be helpful though not essential as the book includes a self-contained introduction to basic probability theory.},
isbn = {978-0-387-31073-2},
langid = {english},
pagetotal = {738},
keywords = {Computers / Computer Graphics,Computers / Computer Vision \& Pattern Recognition,Computers / Intelligence (AI) \& Semantics,Computers / Optical Data Processing,Computers / Software Development \& Engineering / General,Mathematics / Probability \& Statistics / General},
file = {/home/zenon/Zotero/storage/VTDMDZPT/Bishop - 2006 - Pattern Recognition and Machine Learning.pdf}
}
@online{bochkovskiy2020,
title = {{{YOLOv4}}: {{Optimal Speed}} and {{Accuracy}} of {{Object Detection}}},
shorttitle = {{{YOLOv4}}},
author = {Bochkovskiy, Alexey and Wang, Chien-Yao and Liao, Hong-Yuan Mark},
date = {2020-04-22},
eprint = {2004.10934},
eprinttype = {arxiv},
eprintclass = {cs, eess},
doi = {10.48550/arXiv.2004.10934},
abstract = {There are a huge number of features which are said to improve Convolutional Neural Network (CNN) accuracy. Practical testing of combinations of such features on large datasets, and theoretical justification of the result, is required. Some features operate on certain models exclusively and for certain problems exclusively, or only for small-scale datasets; while some features, such as batch-normalization and residual-connections, are applicable to the majority of models, tasks, and datasets. We assume that such universal features include Weighted-Residual-Connections (WRC), Cross-Stage-Partial-connections (CSP), Cross mini-Batch Normalization (CmBN), Self-adversarial-training (SAT) and Mish-activation. We use new features: WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, CmBN, DropBlock regularization, and CIoU loss, and combine some of them to achieve state-of-the-art results: 43.5\% AP (65.7\% AP50) for the MS COCO dataset at a realtime speed of \textasciitilde 65 FPS on Tesla V100. Source code is at https://github.com/AlexeyAB/darknet},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Electrical Engineering and Systems Science - Image and Video Processing},
file = {/home/zenon/Zotero/storage/QHHW6JLR/Bochkovskiy et al. - 2020 - YOLOv4 Optimal Speed and Accuracy of Object Detec.pdf;/home/zenon/Zotero/storage/FPLD4HZH/2004.html}
}
@online{brown2020,
title = {Language {{Models Are Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
eprint = {2005.14165},
eprinttype = {arxiv},
doi = {10.48550/arXiv.2005.14165},
issue = {arXiv:2005.14165},
keywords = {Computer Science - Computation and Language},
file = {/home/zenon/Zotero/storage/56LE395G/Brown et al. - 2020 - Language Models Are Few-Shot Learners.pdf}
}
@article{cauchy1847,
title = {Méthode générale pour la résolution des systèmes d'équations simultanées},
author = {Cauchy, Augustin-Louis},
date = {1847-10-18},
journaltitle = {Comptes rendus hebdomadaires des séances de l'Académie des sciences},
volume = {25},
pages = {399--402}
}
@online{chan2020,
title = {Healthy and {{Wilted Houseplant Images}}},
author = {Chan, Russell},
date = {2020-01},
url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images},
urldate = {2023-12-08},
abstract = {A collection of 904 houseplant images, classified as either healthy or wilted},
langid = {english},
file = {/home/zenon/Zotero/storage/KDVV3SVG/healthy-and-wilted-houseplant-images.html}
}
@article{chandel2021,
title = {Identifying {{Crop Water Stress Using Deep Learning Models}}},
author = {Chandel, Narendra Singh and Chakraborty, Subir Kumar and Rajwade, Yogesh Anand and Dubey, Kumkum and Tiwari, Mukesh K. and Jat, Dilip},
date = {2021-05-01},
journaltitle = {Neural Computing and Applications},
shortjournal = {Neural Comput \& Applic},
volume = {33},
number = {10},
pages = {5353--5367},
issn = {1433-3058},
doi = {10.1007/s00521-020-05325-4},
keywords = {Confusion matrix,Crop phenotyping,DCNN,Digital agriculture,Machine learning}
}
@inproceedings{dalal2005,
title = {Histograms of Oriented Gradients for Human Detection},
booktitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
author = {Dalal, N. and Triggs, B.},
date = {2005-06},
volume = {1},
pages = {886--893},
issn = {1063-6919},
doi = {10.1109/CVPR.2005.177},
abstract = {We study the question of feature sets for robust visual object recognition; adopting linear SVM based human detection as a test case. After reviewing existing edge and gradient based descriptors, we show experimentally that grids of histograms of oriented gradient (HOG) descriptors significantly outperform existing feature sets for human detection. We study the influence of each stage of the computation on performance, concluding that fine-scale gradients, fine orientation binning, relatively coarse spatial binning, and high-quality local contrast normalization in overlapping descriptor blocks are all important for good results. The new approach gives near-perfect separation on the original MIT pedestrian database, so we introduce a more challenging dataset containing over 1800 annotated human images with a large range of pose variations and backgrounds.},
eventtitle = {2005 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}}'05)},
keywords = {High performance computing,Histograms,Humans,Image databases,Image edge detection,Object detection,Object recognition,Robustness,Support vector machines,Testing},
file = {/home/zenon/Zotero/storage/EJFMAW4Z/Dalal and Triggs - 2005 - Histograms of oriented gradients for human detecti.pdf;/home/zenon/Zotero/storage/G6CK9G7D/1467360.html}
}
@article{davis1992,
title = {Operational Prototyping: A New Development Approach},
shorttitle = {Operational Prototyping},
author = {Davis, A.M.},
date = {1992-09},
journaltitle = {IEEE Software},
volume = {9},
number = {5},
pages = {70--78},
issn = {1937-4194},
doi = {10.1109/52.156899},
abstract = {The two traditional types of software prototyping methods, throwaway prototyping and evolutionary prototyping, are compared, and prototyping's relation to conventional software development is discussed. Operational prototyping, a method that combines throwaway and evolutionary prototyping techniques by layering a rapid prototype over a solid evolutionary base, is described. Operational prototyping's implications for configuration management, quality assurance, and general project management are reviewed. The application of operational prototyping to a prototype ocean surveillance terminal is presented.},
eventtitle = {{{IEEE Software}}},
keywords = {Application software,Oceans,Programming,Project management,Prototypes,Quality assurance,Quality management,Software prototyping,Solids,Surveillance},
file = {/home/zenon/Zotero/storage/7NBJW3VE/Davis - 1992 - Operational prototyping a new development approac.pdf;/home/zenon/Zotero/storage/N96N3CIA/156899.html}
}
@inproceedings{deng2009,
title = {{{ImageNet}}: {{A Large-Scale Hierarchical Image Database}}},
shorttitle = {{{ImageNet}}},
booktitle = {2009 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
date = {2009-06},
pages = {248--255},
issn = {1063-6919},
doi = {10.1109/CVPR.2009.5206848},
eventtitle = {2009 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
keywords = {Explosions,Image databases,Image retrieval,Information retrieval,Internet,Large-scale systems,Multimedia databases,Ontologies,Robustness,Spine}
}
@inproceedings{ding2021,
title = {{{RepVGG}}: {{Making VGG-style ConvNets Great Again}}},
shorttitle = {{{RepVGG}}},
booktitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Ding, Xiaohan and Zhang, Xiangyu and Ma, Ningning and Han, Jungong and Ding, Guiguang and Sun, Jian},
date = {2021-06},
pages = {13728--13737},
issn = {2575-7075},
doi = {10.1109/CVPR46437.2021.01352},
abstract = {We present a simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3 × 3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. On ImageNet, RepVGG reaches over 80\% top-1 accuracy, which is the first time for a plain model, to the best of our knowledge. On NVIDIA 1080Ti GPU, RepVGG models run 83\% faster than ResNet-50 or 101\% faster than ResNet-101 with higher accuracy and show favorable accuracy-speed trade-off compared to the state-of-the-art models like EfficientNet and RegNet. The code and trained models are available at https://github.com/megvii-model/RepVGG.},
eventtitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/PIN4B83K/Ding et al. - 2021 - RepVGG Making VGG-style ConvNets Great Again.pdf;/home/zenon/Zotero/storage/IXAPBC22/9577516.html}
}
@inproceedings{erhan2014,
title = {Scalable {{Object Detection Using Deep Neural Networks}}},
booktitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Erhan, Dumitru and Szegedy, Christian and Toshev, Alexander and Anguelov, Dragomir},
date = {2014-06},
pages = {2155--2162},
issn = {1063-6919},
doi = {10.1109/CVPR.2014.276},
abstract = {Deep convolutional neural networks have recently achieved state-of-the-art performance on a number of image recognition benchmarks, including the ImageNet Large-Scale Visual Recognition Challenge (ILSVRC-2012). The winning model on the localization sub-task was a network that predicts a single bounding box and a confidence score for each object category in the image. Such a model captures the whole-image context around the objects but cannot handle multiple instances of the same object in the image without naively replicating the number of outputs for each instance. In this work, we propose a saliency-inspired neural network model for detection, which predicts a set of class-agnostic bounding boxes along with a single score for each box, corresponding to its likelihood of containing any object of interest. The model naturally handles a variable number of instances for each class and allows for cross-class generalization at the highest levels of the network. We are able to obtain competitive recognition performance on VOC2007 and ILSVRC2012, while using only the top few predicted locations in each image and a small number of neural network evaluations.},
eventtitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/QK2PPIT2/Erhan et al. - 2014 - Scalable Object Detection Using Deep Neural Networ.pdf;/home/zenon/Zotero/storage/KRHQRR7X/6909673.html}
}
@article{everingham2010,
title = {The {{Pascal Visual Object Classes}} ({{VOC}}) {{Challenge}}},
author = {Everingham, Mark and Van Gool, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew},
date = {2010-06-01},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
volume = {88},
number = {2},
pages = {303--338},
issn = {1573-1405},
doi = {10.1007/s11263-009-0275-4},
abstract = {The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection.},
langid = {english},
keywords = {Benchmark,Database,Object detection,Object recognition},
file = {/home/zenon/Zotero/storage/FCRT6NYG/Everingham et al. - 2010 - The Pascal Visual Object Classes (VOC) Challenge.pdf}
}
@inproceedings{felzenszwalb2008,
title = {A Discriminatively Trained, Multiscale, Deformable Part Model},
booktitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Felzenszwalb, Pedro and McAllester, David and Ramanan, Deva},
date = {2008-06},
pages = {1--8},
issn = {1063-6919},
doi = {10.1109/CVPR.2008.4587597},
abstract = {This paper describes a discriminatively trained, multiscale, deformable part model for object detection. Our system achieves a two-fold improvement in average precision over the best performance in the 2006 PASCAL person detection challenge. It also outperforms the best results in the 2007 challenge in ten out of twenty categories. The system relies heavily on deformable parts. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL challenge. Our system also relies heavily on new methods for discriminative training. We combine a margin-sensitive approach for data mining hard negative examples with a formalism we call latent SVM. A latent SVM, like a hidden CRF, leads to a non-convex training problem. However, a latent SVM is semi-convex and the training problem becomes convex once latent information is specified for the positive examples. We believe that our training methods will eventually make possible the effective use of more latent information such as hierarchical (grammar) models and models involving latent three dimensional pose.},
eventtitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
keywords = {Computer vision,Costs,Data mining,Deformable models,Filters,Histograms,Object detection,Spatial resolution,Support vector machine classification,Support vector machines},
file = {/home/zenon/Zotero/storage/Q4LTEZL7/4587597.html}
}
@inproceedings{felzenszwalb2008a,
title = {A Discriminatively Trained, Multiscale, Deformable Part Model},
booktitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Felzenszwalb, Pedro and McAllester, David and Ramanan, Deva},
date = {2008-06},
pages = {1--8},
issn = {1063-6919},
doi = {10.1109/CVPR.2008.4587597},
abstract = {This paper describes a discriminatively trained, multiscale, deformable part model for object detection. Our system achieves a two-fold improvement in average precision over the best performance in the 2006 PASCAL person detection challenge. It also outperforms the best results in the 2007 challenge in ten out of twenty categories. The system relies heavily on deformable parts. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL challenge. Our system also relies heavily on new methods for discriminative training. We combine a margin-sensitive approach for data mining hard negative examples with a formalism we call latent SVM. A latent SVM, like a hidden CRF, leads to a non-convex training problem. However, a latent SVM is semi-convex and the training problem becomes convex once latent information is specified for the positive examples. We believe that our training methods will eventually make possible the effective use of more latent information such as hierarchical (grammar) models and models involving latent three dimensional pose.},
eventtitle = {2008 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
keywords = {Computer vision,Costs,Data mining,Deformable models,Filters,Histograms,Object detection,Spatial resolution,Support vector machine classification,Support vector machines},
file = {/home/zenon/Zotero/storage/5NMZ5V8B/Felzenszwalb et al. - 2008 - A discriminatively trained, multiscale, deformable.pdf;/home/zenon/Zotero/storage/3P3CRTV7/4587597.html}
}
@article{felzenszwalb2010,
title = {Object {{Detection}} with {{Discriminatively Trained Part-Based Models}}},
author = {Felzenszwalb, Pedro F. and Girshick, Ross B. and McAllester, David and Ramanan, Deva},
date = {2010-09},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {32},
number = {9},
pages = {1627--1645},
issn = {1939-3539},
doi = {10.1109/TPAMI.2009.167},
abstract = {We describe an object detection system based on mixtures of multiscale deformable part models. Our system is able to represent highly variable object classes and achieves state-of-the-art results in the PASCAL object detection challenges. While deformable part models have become quite popular, their value had not been demonstrated on difficult benchmarks such as the PASCAL data sets. Our system relies on new methods for discriminative training with partially labeled data. We combine a margin-sensitive approach for data-mining hard negative examples with a formalism we call latent SVM. A latent SVM is a reformulation of MISVM in terms of latent variables. A latent SVM is semiconvex, and the training problem becomes convex once latent information is specified for the positive examples. This leads to an iterative training algorithm that alternates between fixing latent values for positive examples and optimizing the latent SVM objective function.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/P5378A3K/Felzenszwalb et al. - 2010 - Object Detection with Discriminatively Trained Par.pdf;/home/zenon/Zotero/storage/HYLEIZJU/5255236.html}
}
@inproceedings{freund1995,
title = {A Desicion-Theoretic Generalization of on-Line Learning and an Application to Boosting},
booktitle = {Computational {{Learning Theory}}},
author = {Freund, Yoav and Schapire, Robert E.},
editor = {Vitányi, Paul},
date = {1995},
series = {Lecture {{Notes}} in {{Computer Science}}},
pages = {23--37},
publisher = {{Springer}},
location = {{Berlin, Heidelberg}},
doi = {10.1007/3-540-59119-2_166},
abstract = {We consider the problem of dynamically apportioning resources among a set of options in a worst-case on-line framework. The model we study can be interpreted as a broad, abstract extension of the well-studied on-line prediction model to a general decision-theoretic setting. We show that the multiplicative weight-update rule of Littlestone and Warmuth [10] can be adapted to this model, yielding bounds that are slightly weaker in some cases, but applicable to a considerably more general class of learning problems. We show how the resulting learning algorithm can be applied to a variety of problems, including gambling, multiple-outcome prediction, repeated games and prediction of points in $\mathbb{R}^n$. We also show how the weight-update rule can be used to derive a new boosting algorithm which does not require prior knowledge about the performance of the weak learning algorithm.},
isbn = {978-3-540-49195-8},
langid = {english},
keywords = {Algorithm AdaBoost,Cumulative Loss,Final Hypothesis,Loss Function,Weak Hypothesis}
}
@article{fukushima1969,
title = {Visual {{Feature Extraction}} by a {{Multilayered Network}} of {{Analog Threshold Elements}}},
author = {Fukushima, Kunihiko},
date = {1969-10},
journaltitle = {IEEE Transactions on Systems Science and Cybernetics},
volume = {5},
number = {4},
pages = {322--333},
issn = {2168-2887},
doi = {10.1109/TSSC.1969.300225},
abstract = {A new type of visual feature extracting network has been synthesized, and the response of the network has been simulated on a digital computer. This research has been done as a first step towards the realization of a recognizer of handwritten characters. The design of the network was suggested by biological systems, especially, the visual systems of cat and monkey. The network is composed of analog threshold elements connected in layers. Each analog threshold element receives inputs from a large number of elements in the neighbouring layers and performs its own special functions. It takes care of one restricted part of the photoreceptor layer, on which an input pattern is presented, and it responds to one particular feature of the input pattern, such as brightness contrast, a dot in the pattern, a line segment of a particular orientation, or an end of the line. This means that the network performs parallel processing of the information. With the propagation of the information through the layered network, the input pattern is successively decomposed into dots, groups of line segments of the same orientation, and the ends of these line segments.},
eventtitle = {{{IEEE Transactions}} on {{Systems Science}} and {{Cybernetics}}},
file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
}
@online{ge2021,
title = {{{OTA}}: {{Optimal Transport Assignment}} for {{Object Detection}}},
shorttitle = {{{OTA}}},
author = {Ge, Zheng and Liu, Songtao and Li, Zeming and Yoshie, Osamu and Sun, Jian},
date = {2021-03-26},
eprint = {2103.14259},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2103.14259},
abstract = {Recent advances in label assignment in object detection mainly seek to independently define positive/negative training samples for each ground-truth (gt) object. In this paper, we innovatively revisit the label assignment from a global perspective and propose to formulate the assigning procedure as an Optimal Transport (OT) problem -- a well-studied topic in Optimization Theory. Concretely, we define the unit transportation cost between each demander (anchor) and supplier (gt) pair as the weighted summation of their classification and regression losses. After formulation, finding the best assignment solution is converted to solve the optimal transport plan at minimal transportation costs, which can be solved via Sinkhorn-Knopp Iteration. On COCO, a single FCOS-ResNet-50 detector equipped with Optimal Transport Assignment (OTA) can reach 40.7\% mAP under 1X scheduler, outperforming all other existing assigning methods. Extensive experiments conducted on COCO and CrowdHuman further validate the effectiveness of our proposed OTA, especially its superiority in crowd scenarios. The code is available at https://github.com/Megvii-BaseDetection/OTA.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/LE4HHL9X/Ge et al. - 2021 - OTA Optimal Transport Assignment for Object Detec.pdf;/home/zenon/Zotero/storage/MCF86ZKV/2103.html}
}
@online{ge2021a,
title = {{{YOLOX}}: {{Exceeding YOLO Series}} in 2021},
shorttitle = {{{YOLOX}}},
author = {Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
date = {2021-08-05},
eprint = {2107.08430},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2107.08430},
abstract = {In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3\% AP on COCO, surpassing NanoDet by 1.8\% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3\% AP on COCO, outperforming the current best practice by 3.0\% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0\% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8\% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. Source code is at https://github.com/Megvii-BaseDetection/YOLOX.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/B9KGZ7N2/Ge et al. - 2021 - YOLOX Exceeding YOLO Series in 2021.pdf;/home/zenon/Zotero/storage/XQTJLGLZ/2107.html}
}
@online{gevorgyan2022,
title = {{{SIoU Loss}}: {{More Powerful Learning}} for {{Bounding Box Regression}}},
shorttitle = {{{SIoU Loss}}},
author = {Gevorgyan, Zhora},
date = {2022-05-25},
eprint = {2205.12740},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2205.12740},
abstract = {The effectiveness of Object Detection, one of the central problems in computer vision tasks, highly depends on the definition of the loss function - a measure of how accurately your ML model can predict the expected outcome. Conventional object detection loss functions depend on aggregation of metrics of bounding box regression such as the distance, overlap area and aspect ratio of the predicted and ground truth boxes (i.e. GIoU, CIoU, ICIoU etc). However, none of the methods proposed and used to date considers the direction of the mismatch between the desired ground box and the predicted, "experimental" box. This shortage results in slower and less effective convergence as the predicted box can "wander around" during the training process and eventually end up producing a worse model. In this paper a new loss function SIoU was suggested, where penalty metrics were redefined considering the angle of the vector between the desired regression. Applied to conventional Neural Networks and datasets it is shown that SIoU improves both the speed of training and the accuracy of the inference. The effectiveness of the proposed loss function was revealed in a number of simulations and tests.},
pubstate = {preprint},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,I.2,I.4},
file = {/home/zenon/Zotero/storage/4T3GEYK9/Gevorgyan - 2022 - SIoU Loss More Powerful Learning for Bounding Box.pdf;/home/zenon/Zotero/storage/T7BRMTH2/2205.html}
}
@online{girshick,
title = {Discriminatively {{Trained Deformable Part Models}} ({{Release}} 5)},
author = {Girshick, Ross B. and Felzenszwalb, Pedro F. and McAllester, David},
url = {https://www.rossgirshick.info/latent/},
urldate = {2023-10-26},
file = {/home/zenon/Zotero/storage/HQTS6PW6/latent.html}
}
@inproceedings{girshick2014,
title = {Rich {{Feature Hierarchies}} for {{Accurate Object Detection}} and {{Semantic Segmentation}}},
booktitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
date = {2014-06},
pages = {580--587},
issn = {1063-6919},
doi = {10.1109/CVPR.2014.81},
abstract = {Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30\% relative to the previous best result on VOC 2012 achieving a mAP of 53.3\%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also present experiments that provide insight into what the network learns, revealing a rich hierarchy of image features. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.},
eventtitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/EL92YEYD/Girshick et al. - 2014 - Rich Feature Hierarchies for Accurate Object Detec.pdf;/home/zenon/Zotero/storage/TX9APXST/6909475.html}
}
@inproceedings{girshick2015,
title = {Deformable Part Models Are Convolutional Neural Networks},
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Girshick, Ross and Iandola, Forrest and Darrell, Trevor and Malik, Jitendra},
date = {2015-06},
pages = {437--446},
issn = {1063-6919},
doi = {10.1109/CVPR.2015.7298641},
abstract = {Deformable part models (DPMs) and convolutional neural networks (CNNs) are two widely used tools for visual recognition. They are typically viewed as distinct approaches: DPMs are graphical models (Markov random fields), while CNNs are “black-box” non-linear classifiers. In this paper, we show that a DPM can be formulated as a CNN, thus providing a synthesis of the two ideas. Our construction involves unrolling the DPM inference algorithm and mapping each step to an equivalent CNN layer. From this perspective, it is natural to replace the standard image features used in DPMs with a learned feature extractor. We call the resulting model a DeepPyramid DPM and experimentally validate it on PASCAL VOC object detection. We find that DeepPyramid DPMs significantly outperform DPMs based on histograms of oriented gradients features (HOG) and slightly outperforms a comparable version of the recently introduced R-CNN detection system, while running significantly faster.},
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
keywords = {Convolution,Detectors,Feature extraction,Geometry,Inference algorithms,Object detection,Transforms},
file = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html}
}
@inproceedings{girshick2015a,
title = {Fast {{R-CNN}}},
booktitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
author = {Girshick, Ross},
date = {2015-12},
pages = {1440--1448},
issn = {2380-7504},
doi = {10.1109/ICCV.2015.169},
abstract = {This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.},
eventtitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/I4Q5NJCT/Girshick - 2015 - Fast R-CNN.pdf;/home/zenon/Zotero/storage/VQZF2I7Z/7410526.html}
}
@article{girshick2016,
title = {Region-{{Based Convolutional Networks}} for {{Accurate Object Detection}} and {{Segmentation}}},
author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
date = {2016-01},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {38},
number = {1},
pages = {142--158},
issn = {1939-3539},
doi = {10.1109/TPAMI.2015.2437384},
abstract = {Object detection performance, as measured on the canonical PASCAL VOC Challenge datasets, plateaued in the final years of the competition. The best-performing methods were complex ensemble systems that typically combined multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 50 percent relative to the previous best result on VOC 2012-achieving a mAP of 62.4 percent. Our approach combines two ideas: (1) one can apply high-capacity convolutional networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data are scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, boosts performance significantly. Since we combine region proposals with CNNs, we call the resulting model an R-CNN or Region-based Convolutional Network. Source code for the complete system is available at http://www.cs.berkeley.edu/ rbg/rcnn.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/MQPF5MGW/Girshick et al. - 2016 - Region-Based Convolutional Networks for Accurate O.pdf;/home/zenon/Zotero/storage/EKC4WHDQ/7112511.html}
}
@inproceedings{glorot2010,
title = {Understanding the Difficulty of Training Deep Feedforward Neural Networks},
booktitle = {Proceedings of the {{Thirteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
author = {Glorot, Xavier and Bengio, Yoshua},
date = {2010-03-31},
pages = {249--256},
publisher = {{JMLR Workshop and Conference Proceedings}},
issn = {1938-7228},
url = {https://proceedings.mlr.press/v9/glorot10a.html},
urldate = {2023-11-08},
abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.},
eventtitle = {Proceedings of the {{Thirteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
langid = {english},
file = {/home/zenon/Zotero/storage/Q2UQI5CZ/Glorot and Bengio - 2010 - Understanding the difficulty of training deep feed.pdf}
}
@book{goodfellow2016,
title = {Deep {{Learning}}},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
date = {2016-11-10},
eprint = {omivDQAAQBAJ},
eprinttype = {googlebooks},
publisher = {{MIT Press}},
abstract = {An introduction to a broad range of topics in deep learning, covering mathematical and conceptual background, deep learning techniques used in industry, and research perspectives.“Written by three experts in the field, Deep Learning is the only comprehensive book on the subject.”—Elon Musk, cochair of OpenAI; cofounder and CEO of Tesla and SpaceXDeep learning is a form of machine learning that enables computers to learn from experience and understand the world in terms of a hierarchy of concepts. Because the computer gathers knowledge from experience, there is no need for a human computer operator to formally specify all the knowledge that the computer needs. The hierarchy of concepts allows the computer to learn complicated concepts by building them out of simpler ones; a graph of these hierarchies would be many layers deep. This book introduces a broad range of topics in deep learning. The text offers mathematical and conceptual background, covering relevant concepts in linear algebra, probability theory and information theory, numerical computation, and machine learning. It describes deep learning techniques used by practitioners in industry, including deep feedforward networks, regularization, optimization algorithms, convolutional networks, sequence modeling, and practical methodology; and it surveys such applications as natural language processing, speech recognition, computer vision, online recommendation systems, bioinformatics, and videogames. Finally, the book offers research perspectives, covering such theoretical topics as linear factor models, autoencoders, representation learning, structured probabilistic models, Monte Carlo methods, the partition function, approximate inference, and deep generative models. Deep Learning can be used by undergraduate or graduate students planning careers in either industry or research, and by software engineers who want to begin using deep learning in their products or platforms. A website offers supplementary material for both readers and instructors.},
isbn = {978-0-262-33737-3},
langid = {english},
pagetotal = {801},
keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science}
}
@article{he2015,
title = {Spatial {{Pyramid Pooling}} in {{Deep Convolutional Networks}} for {{Visual Recognition}}},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
date = {2015-09},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {37},
number = {9},
pages = {1904--1916},
issn = {1939-3539},
doi = {10.1109/TPAMI.2015.2389824},
abstract = {Existing deep convolutional neural networks (CNNs) require a fixed-size (e.g., 224 × 224) input image. This requirement is “artificial” and may reduce the recognition accuracy for the images or sub-images of an arbitrary size/scale. In this work, we equip the networks with another pooling strategy, “spatial pyramid pooling”, to eliminate the above requirement. The new network structure, called SPP-net, can generate a fixed-length representation regardless of image size/scale. Pyramid pooling is also robust to object deformations. With these advantages, SPP-net should in general improve all CNN-based image classification methods. On the ImageNet 2012 dataset, we demonstrate that SPP-net boosts the accuracy of a variety of CNN architectures despite their different designs. On the Pascal VOC 2007 and Caltech101 datasets, SPP-net achieves state-of-the-art classification results using a single full-image representation and no fine-tuning. The power of SPP-net is also significant in object detection. Using SPP-net, we compute the feature maps from the entire image only once, and then pool features in arbitrary regions (sub-images) to generate fixed-length representations for training the detectors. This method avoids repeatedly computing the convolutional features. In processing test images, our method is 24--102× faster than the R-CNN method, while achieving better or comparable accuracy on Pascal VOC 2007. In ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2014, our methods rank \#2 in object detection and \#3 in image classification among all 38 teams. This manuscript also introduces the improvement made for this competition.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/4ZANQDJR/He et al. - 2015 - Spatial Pyramid Pooling in Deep Convolutional Netw.pdf;/home/zenon/Zotero/storage/MYNCND4W/7005506.html}
}
@inproceedings{he2016,
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
date = {2016-06},
pages = {770--778},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.90},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
keywords = {Complexity theory,Degradation,Image recognition,Image segmentation,Neural networks,Training,Visualization},
file = {/home/zenon/Zotero/storage/JDX3S8QK/He et al. - 2016 - Deep Residual Learning for Image Recognition.pdf}
}
@inproceedings{he2017,
title = {Mask {{R-CNN}}},
booktitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
author = {He, Kaiming and Gkioxari, Georgia and Dollár, Piotr and Girshick, Ross},
date = {2017-10},
pages = {2980--2988},
issn = {2380-7504},
doi = {10.1109/ICCV.2017.322},
abstract = {We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without tricks, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code will be made available.},
eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/Z6CBZ8AI/He et al. - 2017 - Mask R-CNN.pdf;/home/zenon/Zotero/storage/GW42F6UG/8237584.html}
}
@article{hornik1989,
title = {Multilayer Feedforward Networks Are Universal Approximators},
author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
date = {1989-01-01},
journaltitle = {Neural Networks},
shortjournal = {Neural Networks},
volume = {2},
number = {5},
pages = {359--366},
issn = {0893-6080},
doi = {10.1016/0893-6080(89)90020-8},
abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. In this sense, multilayer feedforward networks are a class of universal approximators.},
keywords = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation},
file = {/home/zenon/Zotero/storage/FN7FDBHL/0893608089900208.html}
}
@online{howard2017,
title = {{{MobileNets}}: {{Efficient Convolutional Neural Networks}} for {{Mobile Vision Applications}}},
shorttitle = {{{MobileNets}}},
author = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
date = {2017-04-16},
eprint = {1704.04861},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1704.04861},
abstract = {We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/3LNIUGZQ/Howard et al. - 2017 - MobileNets Efficient Convolutional Neural Network.pdf;/home/zenon/Zotero/storage/4TT8Z6JP/1704.html}
}
@inproceedings{howard2019,
title = {Searching for {{MobileNetV3}}},
booktitle = {2019 {{IEEE}}/{{CVF International Conference}} on {{Computer Vision}} ({{ICCV}})},
author = {Howard, Andrew and Sandler, Mark and Chen, Bo and Wang, Weijun and Chen, Liang-Chieh and Tan, Mingxing and Chu, Grace and Vasudevan, Vijay and Zhu, Yukun and Pang, Ruoming and Adam, Hartwig and Le, Quoc},
date = {2019-10},
pages = {1314--1324},
issn = {2380-7504},
doi = {10.1109/ICCV.2019.00140},
abstract = {We present the next generation of MobileNets based on a combination of complementary search techniques as well as a novel architecture design. MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. This paper starts the exploration of how automated search algorithms and network design can work together to harness complementary approaches improving the overall state of the art. Through this process we create two new MobileNet models for release: MobileNetV3-Large and MobileNetV3-Small which are targeted for high and low resource use cases. These models are then adapted and applied to the tasks of object detection and semantic segmentation. For the task of semantic segmentation (or any dense pixel prediction), we propose a new efficient segmentation decoder Lite Reduced Atrous Spatial Pyramid Pooling (LR-ASPP). We achieve new state of the art results for mobile classification, detection and segmentation. MobileNetV3-Large is 3.2\% more accurate on ImageNet classification while reducing latency by 20\% compared to MobileNetV2. MobileNetV3-Small is 6.6\% more accurate compared to a MobileNetV2 model with comparable latency. MobileNetV3-Large detection is over 25\% faster at roughly the same accuracy as MobileNetV2 on COCO detection. MobileNetV3-Large LR-ASPP is 34\% faster than MobileNetV2 R-ASPP at similar accuracy for Cityscapes segmentation.},
eventtitle = {2019 {{IEEE}}/{{CVF International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/6369RRPZ/Howard et al. - 2019 - Searching for MobileNetV3.pdf;/home/zenon/Zotero/storage/FBTXJ8V7/9008835.html}
}
@inproceedings{huang2017,
title = {Densely {{Connected Convolutional Networks}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q.},
date = {2017-07},
pages = {2261--2269},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.243},
abstract = {Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close to the input and those close to the output. In this paper, we embrace this observation and introduce the Dense Convolutional Network (DenseNet), which connects each layer to every other layer in a feed-forward fashion. Whereas traditional convolutional networks with L layers have L connections-one between each layer and its subsequent layer-our network has L(L+1)/2 direct connections. For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters. We evaluate our proposed architecture on four highly competitive object recognition benchmark tasks (CIFAR-10, CIFAR-100, SVHN, and ImageNet). DenseNets obtain significant improvements over the state-of-the-art on most of them, whilst requiring less memory and computation to achieve high performance. Code and pre-trained models are available at https://github.com/liuzhuang13/DenseNet.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/G2G38ZEF/Huang et al. - 2017 - Densely Connected Convolutional Networks.pdf;/home/zenon/Zotero/storage/3Q7NRNV6/8099726.html}
}
@inproceedings{ioffe2015,
title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}},
shorttitle = {Batch {{Normalization}}},
booktitle = {Proceedings of the 32nd {{International Conference}} on {{Machine Learning}}},
author = {Ioffe, Sergey and Szegedy, Christian},
date = {2015-06-01},
pages = {448--456},
publisher = {{PMLR}},
issn = {1938-7228},
url = {https://proceedings.mlr.press/v37/ioffe15.html},
urldate = {2023-11-09},
  abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82\% top-5 test error, exceeding the accuracy of human raters.},
eventtitle = {International {{Conference}} on {{Machine Learning}}},
langid = {english},
file = {/home/zenon/Zotero/storage/7SMC2DMX/Ioffe and Szegedy - 2015 - Batch Normalization Accelerating Deep Network Tra.pdf}
}
@software{jocher2020,
title = {{{YOLOv5}} by {{Ultralytics}}},
author = {Jocher, Glenn},
date = {2020-05},
doi = {10.5281/zenodo.3908559},
abstract = {YOLOv5 🚀 in PyTorch {$>$} ONNX {$>$} CoreML {$>$} TFLite},
version = {7.0}
}
@software{jocher2022,
title = {Ultralytics/{{Yolov5}}: {{V7}}.0 - {{YOLOv5 SOTA Realtime Instance Segmentation}}},
shorttitle = {Ultralytics/{{Yolov5}}},
author = {Jocher, Glenn and Chaurasia, Ayush and Stoken, Alex and Borovec, Jirka and {NanoCode012} and Kwon, Yonghye and Michael, Kalen and {TaoXie} and Fang, Jiacong and {imyhxy} and {Lorna} and Yifu, Zeng and Wong, Colin and V, Abhiram and Montes, Diego and Wang, Zhiqiang and Fati, Cristi and Nadar, Jebastin and {Laughing} and {UnglvKitDe} and Sonck, Victor and {tkianai} and {yxNONG} and Skalski, Piotr and Hogan, Adam and Nair, Dhruv and Strobel, Max and Jain, Mrinal},
date = {2022-11-22},
doi = {10.5281/zenodo.7347926},
organization = {{Zenodo}}
}
@online{kingma2017,
title = {Adam: {{A Method}} for {{Stochastic Optimization}}},
shorttitle = {Adam},
author = {Kingma, Diederik P. and Ba, Jimmy},
date = {2017-01-29},
eprint = {1412.6980},
eprinttype = {arxiv},
doi = {10.48550/arXiv.1412.6980},
issue = {arXiv:1412.6980},
keywords = {Computer Science - Machine Learning},
file = {/home/zenon/Zotero/storage/DQAJEA4B/Kingma and Ba - 2017 - Adam A Method for Stochastic Optimization.pdf}
}
@dataset{krasin2017,
title = {{{OpenImages}}: {{A}} Public Dataset for Large-Scale Multi-Label and Multi-Class Image Classification},
author = {Krasin, Ivan and Duerig, Tom and Alldrin, Neil and Ferrari, Vittorio and Abu-El-Haija, Sami and Kuznetsova, Alina and Rom, Hassan and Uijlings, Jasper and Popov, Stefan and Kamali, Shahab and Malloci, Matteo and Pont-Tuset, Jordi and Veit, Andreas and Belongie, Serge and Gomes, Victor and Gupta, Abhinav and Sun, Chen and Chechik, Gal and Cai, David and Feng, Zheyun and Narayanan, Dhyanesh and Murphy, Kevin},
date = {2017},
url = {https://storage.googleapis.com/openimages/web/index.html},
urldate = {2023-12-06}
}
@inproceedings{krizhevsky2012,
title = {{{ImageNet Classification}} with {{Deep Convolutional Neural Networks}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
date = {2012},
volume = {25},
publisher = {{Curran Associates, Inc.}},
url = {https://papers.nips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html},
urldate = {2023-10-22},
  abstract = {We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\% and 18.9\% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.},
file = {/home/zenon/Zotero/storage/ANJ8P844/Krizhevsky et al. - 2012 - ImageNet Classification with Deep Convolutional Ne.pdf}
}
@article{krosney2023,
title = {Inside {{Out}}: {{Transforming Images}} of {{Lab-Grown Plants}} for {{Machine Learning Applications}} in {{Agriculture}}},
shorttitle = {Inside {{Out}}},
author = {Krosney, A. E. and Sotoodeh, P. and Henry, C. J. and Beck, M. A. and Bidinosti, C. P.},
date = {2023-07-06},
journaltitle = {Frontiers in Artificial Intelligence},
shortjournal = {Front. Artif. Intell.},
volume = {6},
eprint = {2211.02972},
eprinttype = {arxiv},
eprintclass = {cs},
pages = {1200977},
issn = {2624-8212},
doi = {10.3389/frai.2023.1200977},
abstract = {Machine learning tasks often require a significant amount of training data for the resultant network to perform suitably for a given problem in any domain. In agriculture, dataset sizes are further limited by phenotypical differences between two plants of the same genotype, often as a result of differing growing conditions. Synthetically-augmented datasets have shown promise in improving existing models when real data is not available. In this paper, we employ a contrastive unpaired translation (CUT) generative adversarial network (GAN) and simple image processing techniques to translate indoor plant images to appear as field images. While we train our network to translate an image containing only a single plant, we show that our method is easily extendable to produce multiple-plant field images. Furthermore, we use our synthetic multi-plant images to train several YoloV5 nano object detection models to perform the task of plant detection and measure the accuracy of the model on real field data images. Including training data generated by the CUT-GAN leads to better plant detection performance compared to a network trained solely on real data.},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/home/zenon/Zotero/storage/Y5MUHPDE/Krosney et al. - 2023 - Inside Out Transforming Images of Lab-Grown Plant.pdf;/home/zenon/Zotero/storage/8NB5H9E8/2211.html}
}
@article{kuznetsova2020,
title = {The {{Open Images Dataset V4}}: {{Unified Image Classification}}, {{Object Detection}}, and {{Visual Relationship Detection}} at {{Scale}}},
shorttitle = {The {{Open Images Dataset V4}}},
author = {Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and Duerig, Tom and Ferrari, Vittorio},
date = {2020-07},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
volume = {128},
number = {7},
eprint = {1811.00982},
eprinttype = {arxiv},
pages = {1956--1981},
issn = {0920-5691, 1573-1405},
doi = {10.1007/s11263-020-01316-z},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/R6SKDLQU/Kuznetsova et al. - 2020 - The Open Images Dataset V4 Unified Image Classifi.pdf}
}
@article{lecun1989,
title = {Backpropagation {{Applied}} to {{Handwritten Zip Code Recognition}}},
author = {LeCun, Y. and Boser, B. and Denker, J. S. and Henderson, D. and Howard, R. E. and Hubbard, W. and Jackel, L. D.},
date = {1989-12-01},
journaltitle = {Neural Computation},
shortjournal = {Neural Computation},
volume = {1},
number = {4},
pages = {541--551},
issn = {0899-7667},
doi = {10.1162/neco.1989.1.4.541},
abstract = {The ability of learning networks to generalize can be greatly enhanced by providing constraints from the task domain. This paper demonstrates how such constraints can be integrated into a backpropagation network through the architecture of the network. This approach has been successfully applied to the recognition of handwritten zip code digits provided by the U.S. Postal Service. A single network learns the entire recognition operation, going from the normalized image of the character to the final classification.},
file = {/home/zenon/Zotero/storage/Y8AWACVG/LeCun et al. - 1989 - Backpropagation Applied to Handwritten Zip Code Re.pdf;/home/zenon/Zotero/storage/R7RK6LZ6/Backpropagation-Applied-to-Handwritten-Zip-Code.html}
}
@article{lecun1998,
title = {Gradient-Based Learning Applied to Document Recognition},
author = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
date = {1998-11},
journaltitle = {Proceedings of the IEEE},
volume = {86},
number = {11},
pages = {2278--2324},
issn = {1558-2256},
doi = {10.1109/5.726791},
abstract = {Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day.},
eventtitle = {Proceedings of the {{IEEE}}},
file = {/home/zenon/Zotero/storage/9X5LATEB/Lecun et al. - 1998 - Gradient-based learning applied to document recogn.pdf;/home/zenon/Zotero/storage/DY64NJW5/726791.html}
}
@article{li2022,
title = {A {{Survey}} of {{Convolutional Neural Networks}}: {{Analysis}}, {{Applications}}, and {{Prospects}}},
shorttitle = {A {{Survey}} of {{Convolutional Neural Networks}}},
author = {Li, Zewen and Liu, Fan and Yang, Wenjie and Peng, Shouheng and Zhou, Jun},
date = {2022-12},
journaltitle = {IEEE Transactions on Neural Networks and Learning Systems},
volume = {33},
number = {12},
pages = {6999--7019},
issn = {2162-2388},
doi = {10.1109/TNNLS.2021.3084827},
  abstract = {A convolutional neural network (CNN) is one of the most significant networks in the deep learning field. Since CNN made impressive achievements in many areas, including but not limited to computer vision and natural language processing, it attracted much attention from both industry and academia in the past few years. The existing reviews mainly focus on CNN's applications in different scenarios without considering CNN from a general perspective, and some novel ideas proposed recently are not covered. In this review, we aim to provide some novel ideas and prospects in this fast-growing field. Besides, not only 2-D convolution but also 1-D and multidimensional ones are involved. First, this review introduces the history of CNN. Second, we provide an overview of various convolutions. Third, some classic and advanced CNN models are introduced; especially those key points making them reach state-of-the-art results. Fourth, through experimental analysis, we draw some conclusions and provide several rules of thumb for functions and hyperparameter selection. Fifth, the applications of 1-D, 2-D, and multidimensional convolution are covered. Finally, some open issues and promising directions for CNN are discussed as guidelines for future work.},
eventtitle = {{{IEEE Transactions}} on {{Neural Networks}} and {{Learning Systems}}},
file = {/home/zenon/Zotero/storage/U7JKC8DW/Li et al. - 2022 - A Survey of Convolutional Neural Networks Analysi.pdf;/home/zenon/Zotero/storage/99TTKB2L/9451544.html}
}
@online{li2022a,
title = {{{YOLOv6}}: {{A Single-Stage Object Detection Framework}} for {{Industrial Applications}}},
shorttitle = {{{YOLOv6}}},
author = {Li, Chuyi and Li, Lulu and Jiang, Hongliang and Weng, Kaiheng and Geng, Yifei and Li, Liang and Ke, Zaidan and Li, Qingyuan and Cheng, Meng and Nie, Weiqiang and Li, Yiduo and Zhang, Bo and Liang, Yufei and Zhou, Linyuan and Xu, Xiaoming and Chu, Xiangxiang and Wei, Xiaoming and Wei, Xiaolin},
date = {2022-09-07},
eprint = {2209.02976},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2209.02976},
abstract = {For years, the YOLO series has been the de facto industry-level standard for efficient object detection. The YOLO community has prospered overwhelmingly to enrich its use in a multitude of hardware platforms and abundant scenarios. In this technical report, we strive to push its limits to the next level, stepping forward with an unwavering mindset for industry application. Considering the diverse requirements for speed and accuracy in the real environment, we extensively examine the up-to-date object detection advancements either from industry or academia. Specifically, we heavily assimilate ideas from recent network design, training strategies, testing techniques, quantization, and optimization methods. On top of this, we integrate our thoughts and practice to build a suite of deployment-ready networks at various scales to accommodate diversified use cases. With the generous permission of YOLO authors, we name it YOLOv6. We also express our warm welcome to users and contributors for further enhancement. For a glimpse of performance, our YOLOv6-N hits 35.9\% AP on the COCO dataset at a throughput of 1234 FPS on an NVIDIA Tesla T4 GPU. YOLOv6-S strikes 43.5\% AP at 495 FPS, outperforming other mainstream detectors at the same scale\textasciitilde (YOLOv5-S, YOLOX-S, and PPYOLOE-S). Our quantized version of YOLOv6-S even brings a new state-of-the-art 43.3\% AP at 869 FPS. Furthermore, YOLOv6-M/L also achieves better accuracy performance (i.e., 49.5\%/52.3\%) than other detectors with a similar inference speed. We carefully conducted experiments to validate the effectiveness of each component. Our code is made available at https://github.com/meituan/YOLOv6.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/QGN6UE37/Li et al. - 2022 - YOLOv6 A Single-Stage Object Detection Framework .pdf;/home/zenon/Zotero/storage/W78N7C5U/2209.html}
}
@online{lin2015,
title = {Microsoft {{COCO}}: {{Common Objects}} in {{Context}}},
shorttitle = {Microsoft {{COCO}}},
author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Bourdev, Lubomir and Girshick, Ross and Hays, James and Perona, Pietro and Ramanan, Deva and Zitnick, C. Lawrence and Dollár, Piotr},
date = {2015-02-20},
eprint = {1405.0312},
eprinttype = {arxiv},
doi = {10.48550/arXiv.1405.0312},
issue = {arXiv:1405.0312},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/ZMCI6A8T/Lin et al. - 2015 - Microsoft COCO Common Objects in Context.pdf}
}
@inproceedings{lin2017,
title = {Feature {{Pyramid Networks}} for {{Object Detection}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
date = {2017-07},
pages = {936--944},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.106},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/P54JRJGY/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/A8YVPLFS/8099589.html}
}
@inproceedings{lin2017b,
title = {Focal {{Loss}} for {{Dense Object Detection}}},
booktitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
author = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Dollár, Piotr},
date = {2017-10},
pages = {2999--3007},
issn = {2380-7504},
doi = {10.1109/ICCV.2017.324},
abstract = {The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors.},
eventtitle = {2017 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
file = {/home/zenon/Zotero/storage/LL8HFKFW/Lin et al. - 2017 - Focal Loss for Dense Object Detection.pdf;/home/zenon/Zotero/storage/982Z922B/8237586.html}
}
@inproceedings{liu2015,
title = {Very Deep Convolutional Neural Network Based Image Classification Using Small Training Sample Size},
booktitle = {2015 3rd {{IAPR Asian Conference}} on {{Pattern Recognition}} ({{ACPR}})},
author = {Liu, Shuying and Deng, Weihong},
date = {2015-11},
pages = {730--734},
issn = {2327-0985},
doi = {10.1109/ACPR.2015.7486599},
abstract = {Since Krizhevsky won the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012 competition with the brilliant deep convolutional neural networks (D-CNNs), researchers have designed lots of D-CNNs. However, almost all the existing very deep convolutional neural networks are trained on the giant ImageNet datasets. Small datasets like CIFAR-10 has rarely taken advantage of the power of depth since deep models are easy to overfit. In this paper, we proposed a modified VGG-16 network and used this model to fit CIFAR-10. By adding stronger regularizer and using Batch Normalization, we achieved 8.45\% error rate on CIFAR-10 without severe overfitting. Our results show that the very deep CNN can be used to fit small datasets with simple and proper modifications and don't need to re-design specific small networks. We believe that if a model is strong enough to fit a large dataset, it can also fit a small one.},
eventtitle = {2015 3rd {{IAPR Asian Conference}} on {{Pattern Recognition}} ({{ACPR}})},
file = {/home/zenon/Zotero/storage/H9B6RK53/Liu and Deng - 2015 - Very deep convolutional neural network based image.pdf;/home/zenon/Zotero/storage/BIPI3CNN/7486599.html}
}
@inproceedings{liu2016,
title = {{{SSD}}: {{Single Shot MultiBox Detector}}},
shorttitle = {{{SSD}}},
booktitle = {Computer {{Vision}} {{ECCV}} 2016},
author = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.},
editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
date = {2016},
series = {Lecture {{Notes}} in {{Computer Science}}},
pages = {21--37},
publisher = {{Springer International Publishing}},
location = {{Cham}},
doi = {10.1007/978-3-319-46448-0_2},
  abstract = {We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. SSD is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stages and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, COCO, and ILSVRC datasets confirm that SSD has competitive accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. For 300×300 input, SSD achieves 74.3\% mAP on VOC2007 test at 59 FPS on a Nvidia Titan X and for 512×512 input, SSD achieves 76.9\% mAP, outperforming a comparable state of the art Faster R-CNN model. Compared to other single stage methods, SSD has much better accuracy even with a smaller input image size. Code is available at https://github.com/weiliu89/caffe/tree/ssd.},
isbn = {978-3-319-46448-0},
langid = {english},
keywords = {Convolutional neural network,Real-time object detection},
file = {/home/zenon/Zotero/storage/LUL6FCIQ/Liu et al. - 2016 - SSD Single Shot MultiBox Detector.pdf}
}
@inproceedings{liu2018,
title = {Path {{Aggregation Network}} for {{Instance Segmentation}}},
booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Liu, Shu and Qi, Lu and Qin, Haifang and Shi, Jianping and Jia, Jiaya},
date = {2018-06},
pages = {8759--8768},
issn = {2575-7075},
doi = {10.1109/CVPR.2018.00913},
abstract = {The way that information propagates in neural networks is of great importance. In this paper, we propose Path Aggregation Network (PANet) aiming at boosting information flow in proposal-based instance segmentation framework. Specifically, we enhance the entire feature hierarchy with accurate localization signals in lower layers by bottom-up path augmentation, which shortens the information path between lower layers and topmost feature. We present adaptive feature pooling, which links feature grid and all feature levels to make useful information in each level propagate directly to following proposal subnetworks. A complementary branch capturing different views for each proposal is created to further improve mask prediction. These improvements are simple to implement, with subtle extra computational overhead. Yet they are useful and make our PANet reach the 1st place in the COCO 2017 Challenge Instance Segmentation task and the 2nd place in Object Detection task without large-batch training. PANet is also state-of-the-art on MVD and Cityscapes.},
eventtitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/XCG9L37N/Liu et al. - 2018 - Path Aggregation Network for Instance Segmentation.pdf;/home/zenon/Zotero/storage/ZAJWVMP2/8579011.html}
}
@article{lopez-garcia2022,
title = {Machine {{Learning-Based Processing}} of {{Multispectral}} and {{RGB UAV Imagery}} for the {{Multitemporal Monitoring}} of {{Vineyard Water Status}}},
author = {López-García, Patricia and Intrigliolo, Diego and Moreno, Miguel A. and Martínez-Moreno, Alejandro and Ortega, José Fernando and Pérez-Álvarez, Eva Pilar and Ballesteros, Rocío},
date = {2022-09},
journaltitle = {Agronomy},
volume = {12},
number = {9},
pages = {2122},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {2073-4395},
doi = {10.3390/agronomy12092122},
keywords = {ANN,machine learning,multispectral images,RGB images,UAV,vineyard,water stress},
file = {/home/zenon/Zotero/storage/MJSM2BFH/López-García et al. - 2022 - Machine Learning-Based Processing of Multispectral.pdf}
}
@inproceedings{lowe1999,
title = {Object {{Recognition}} from {{Local Scale-Invariant Features}}},
booktitle = {Proceedings of the {{International Conference}} on {{Computer Vision-Volume}} 2 - {{Volume}} 2},
author = {Lowe, David G.},
date = {1999-09-20},
series = {{{ICCV}} '99},
pages = {1150},
publisher = {{IEEE Computer Society}},
location = {{USA}},
  abstract = {An object recognition system has been developed that uses a new class of local image features. The features are invariant to image scaling, translation, and rotation, and partially invariant to illumination changes and affine or 3D projection. These features share similar properties with neurons in inferior temporal cortex that are used for object recognition in primate vision. Features are efficiently detected through a staged filtering approach that identifies stable points in scale space. Image keys are created that allow for local geometric deformations by representing blurred image gradients in multiple orientation planes and at multiple scales. The keys are used as input to a nearest-neighbor indexing method that identifies candidate object matches. Final verification of each match is achieved by finding a low-residual least-squares solution for the unknown model parameters. Experimental results show that robust object recognition can be achieved in cluttered partially-occluded images with a computation time of under 2 seconds.},
isbn = {978-0-7695-0164-2},
file = {/home/zenon/Zotero/storage/XTECRTI7/Lowe - 1999 - Object Recognition from Local Scale-Invariant Feat.pdf}
}
@article{luo2016,
title = {A Review of Automatic Selection Methods for Machine Learning Algorithms and Hyper-Parameter Values},
author = {Luo, Gang},
date = {2016-05-23},
journaltitle = {Network Modeling Analysis in Health Informatics and Bioinformatics},
shortjournal = {Netw Model Anal Health Inform Bioinforma},
volume = {5},
number = {1},
pages = {18},
issn = {2192-6670},
doi = {10.1007/s13721-016-0125-6},
  abstract = {Machine learning studies automatic algorithms that improve themselves through experience. It is widely used for analyzing and extracting value from large biomedical data sets, or “big biomedical data,” advancing biomedical research, and improving healthcare. Before a machine learning model is trained, the user of a machine learning software tool typically must manually select a machine learning algorithm and set one or more model parameters termed hyper-parameters. The algorithm and hyper-parameter values used can greatly impact the resulting model's performance, but their selection requires special expertise as well as many labor-intensive manual iterations. To make machine learning accessible to layman users with limited computing expertise, computer science researchers have proposed various automatic selection methods for algorithms and/or hyper-parameter values for a given supervised machine learning problem. This paper reviews these methods, identifies several of their limitations in the big biomedical data environment, and provides preliminary thoughts on how to address these limitations. These findings establish a foundation for future research on automatically selecting algorithms and hyper-parameter values for analyzing big biomedical data.},
langid = {english},
keywords = {Automatic algorithm selection,Automatic hyper-parameter value selection,Big biomedical data,Machine learning},
file = {/home/zenon/Zotero/storage/RJ2UYUQX/Luo - 2016 - A review of automatic selection methods for machin.pdf}
}
@article{mateo-aroca2019,
title = {Remote {{Image Capture System}} to {{Improve Aerial Supervision}} for {{Precision Irrigation}} in {{Agriculture}}},
author = {Mateo-Aroca, Antonio and García-Mateos, Ginés and Ruiz-Canales, Antonio and Molina-García-Pardo, José María and Molina-Martínez, José Miguel},
date = {2019-02},
journaltitle = {Water},
volume = {11},
number = {2},
pages = {255},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {2073-4441},
doi = {10.3390/w11020255},
keywords = {image capture system,irrigation management,lettuce,wireless,ZigBee and XBee},
file = {/home/zenon/Zotero/storage/3JZLQNJT/Mateo-Aroca et al. - 2019 - Remote Image Capture System to Improve Aerial Supe.pdf}
}
@article{mcculloch1943,
title = {A Logical Calculus of the Ideas Immanent in Nervous Activity},
author = {McCulloch, Warren S. and Pitts, Walter},
date = {1943-12-01},
journaltitle = {The bulletin of mathematical biophysics},
shortjournal = {Bulletin of Mathematical Biophysics},
volume = {5},
number = {4},
pages = {115--133},
issn = {1522-9602},
doi = {10.1007/BF02478259},
abstract = {Because of the “all-or-none” character of nervous activity, neural events and the relations among them can be treated by means of propositional logic. It is found that the behavior of every net can be described in these terms, with the addition of more complicated logical means for nets containing circles; and that for any logical expression satisfying certain conditions, one can find a net behaving in the fashion it describes. It is shown that many particular choices among possible neurophysiological assumptions are equivalent, in the sense that for every net behaving under one assumption, there exists another net which behaves under the other and gives the same results, although perhaps not in the same time. Various applications of the calculus are discussed.},
langid = {english},
keywords = {Excitatory Synapse,Inhibitory Synapse,Nervous Activity,Spatial Summation,Temporal Summation}
}
@article{mcenroe2022,
title = {A {{Survey}} on the {{Convergence}} of {{Edge Computing}} and {{AI}} for {{UAVs}}: {{Opportunities}} and {{Challenges}}},
shorttitle = {A {{Survey}} on the {{Convergence}} of {{Edge Computing}} and {{AI}} for {{UAVs}}},
author = {McEnroe, Patrick and Wang, Shen and Liyanage, Madhusanka},
date = {2022-09},
journaltitle = {IEEE Internet of Things Journal},
volume = {9},
number = {17},
pages = {15435--15459},
issn = {2327-4662},
doi = {10.1109/JIOT.2022.3176400},
eventtitle = {{{IEEE Internet}} of {{Things Journal}}},
keywords = {Artificial intelligence,Artificial intelligence (AI),Autonomous aerial vehicles,Cloud computing,edge AI,edge computing,Edge computing,edge intelligence,Internet of Things,Internet of Things (IoT),MEC,Servers,Task analysis,unmanned aerial vehicle (UAV)},
file = {/home/zenon/Zotero/storage/3ECY7VJ5/McEnroe et al. - 2022 - A Survey on the Convergence of Edge Computing and .pdf}
}
@book{minsky2017,
title = {Perceptrons: {{An Introduction}} to {{Computational Geometry}}},
shorttitle = {Perceptrons},
author = {Minsky, Marvin and Papert, Seymour A.},
date = {2017-09-22},
publisher = {{The MIT Press}},
doi = {10.7551/mitpress/11301.001.0001},
  abstract = {The first systematic study of parallelism in computation by two pioneers in the field. Reissue of the 1988 Expanded Edition with a new foreword by Léon Bottou.},
isbn = {978-0-262-34393-0},
langid = {english},
file = {/home/zenon/Zotero/storage/XZVYT2SM/PerceptronsAn-Introduction-to-Computational.html}
}
@online{misra2020,
title = {Mish: {{A Self Regularized Non-Monotonic Activation Function}}},
shorttitle = {Mish},
author = {Misra, Diganta},
date = {2020-08-13},
eprint = {1908.08681},
eprinttype = {arxiv},
eprintclass = {cs, stat},
doi = {10.48550/arXiv.1908.08681},
  abstract = {We propose \textit{Mish}, a novel self-regularized non-monotonic activation function which can be mathematically defined as: $f(x)=x\tanh(softplus(x))$. As activation functions play a crucial role in the performance and training dynamics in neural networks, we validated experimentally on several well-known benchmarks against the best combinations of architectures and activation functions. We also observe that data augmentation techniques have a favorable effect on benchmarks like ImageNet-1k and MS-COCO across multiple architectures. For example, Mish outperformed Leaky ReLU on YOLOv4 with a CSP-DarkNet-53 backbone on average precision ($AP_{50}^{val}$) by 2.1\% in MS-COCO object detection and ReLU on ResNet-50 on ImageNet-1k in Top-1 accuracy by $\approx$1\% while keeping all other network parameters and hyperparameters constant. Furthermore, we explore the mathematical formulation of Mish in relation with the Swish family of functions and propose an intuitive understanding on how the first derivative behavior may be acting as a regularizer helping the optimization of deep neural networks. Code is publicly available at https://github.com/digantamisra98/Mish.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
file = {/home/zenon/Zotero/storage/GWSBGS5H/Misra - 2020 - Mish A Self Regularized Non-Monotonic Activation .pdf;/home/zenon/Zotero/storage/Z4N3DK8U/1908.html}
}
@book{mitchell1997a,
title = {Machine {{Learning}}},
author = {Mitchell, Thomas M.},
date = {1997-02},
edition = {1},
publisher = {{McGraw-Hill, Inc.}},
location = {{USA}},
abstract = {This exciting addition to the McGraw-Hill Series in Computer Science focuses on the concepts and techniques that contribute to the rapidly changing field of machine learning--including probability and statistics, artificial intelligence, and neural networks--unifying them all in a logical and coherent manner. Machine Learning serves as a useful reference tool for software developers and researchers, as well as an outstanding text for college students. Table of contents Chapter 1. Introduction Chapter 2. Concept Learning and the General-to-Specific Ordering Chapter 3. Decision Tree Learning Chapter 4. Artificial Neural Networks Chapter 5. Evaluating Hypotheses Chapter 6. Bayesian Learning Chapter 7. Computational Learning Theory Chapter 8. Instance-Based Learning Chapter 9. Inductive Logic Programming Chapter 10. Analytical Learning Chapter 11. Combining Inductive and Analytical Learning Chapter 12. Reinforcement Learning.},
isbn = {978-0-07-042807-2},
pagetotal = {432}
}
@book{murphy2012,
title = {Machine {{Learning}}: {{A Probabilistic Perspective}}},
shorttitle = {Machine {{Learning}}},
author = {Murphy, Kevin P.},
date = {2012-08-24},
eprint = {NZP6AQAAQBAJ},
eprinttype = {googlebooks},
publisher = {{MIT Press}},
  abstract = {A comprehensive introduction to machine learning that uses probabilistic models and inference as a unifying approach. Today's Web-enabled deluge of electronic data calls for automated methods of data analysis. Machine learning provides these, developing methods that can automatically detect patterns in data and then use the uncovered patterns to predict future data. This textbook offers a comprehensive and self-contained introduction to the field of machine learning, based on a unified, probabilistic approach. The coverage combines breadth and depth, offering necessary background material on such topics as probability, optimization, and linear algebra as well as discussion of recent developments in the field, including conditional random fields, L1 regularization, and deep learning. The book is written in an informal, accessible style, complete with pseudo-code for the most important algorithms. All topics are copiously illustrated with color images and worked examples drawn from such application domains as biology, text processing, computer vision, and robotics. Rather than providing a cookbook of different heuristic methods, the book stresses a principled model-based approach, often using the language of graphical models to specify models in a concise and intuitive way. Almost all the models described have been implemented in a MATLAB software package—PMTK (probabilistic modeling toolkit)—that is freely available online. The book is suitable for upper-level undergraduates with an introductory-level college math background and beginning graduate students.},
isbn = {978-0-262-01802-9},
langid = {english},
pagetotal = {1102},
keywords = {Computers / Artificial Intelligence / General},
file = {/home/zenon/Zotero/storage/T2BMVXG9/Murphy - 2012 - Machine Learning A Probabilistic Perspective.pdf}
}
@article{nadafzadeh2019,
title = {Design and {{Fabrication}} of an {{Intelligent Control System}} for {{Determination}} of {{Watering Time}} for {{Turfgrass Plant Using Computer Vision System}} and {{Artificial Neural Network}}},
author = {Nadafzadeh, Maryam and Abdanan Mehdizadeh, Saman},
date = {2019-10-01},
journaltitle = {Precision Agriculture},
shortjournal = {Precision Agric},
volume = {20},
number = {5},
pages = {857--879},
issn = {1573-1618},
doi = {10.1007/s11119-018-9618-x},
keywords = {Artificial neural network,Digital image processing,Drought stress,Genetic algorithm,Intelligent irrigation control}
}
@article{pan2010,
title = {A {{Survey}} on {{Transfer Learning}}},
author = {Pan, Sinno Jialin and Yang, Qiang},
date = {2010-10},
journaltitle = {IEEE Transactions on Knowledge and Data Engineering},
volume = {22},
number = {10},
pages = {1345--1359},
issn = {1558-2191},
doi = {10.1109/TKDE.2009.191},
abstract = {A major assumption in many machine learning and data mining algorithms is that the training and future data must be in the same feature space and have the same distribution. However, in many real-world applications, this assumption may not hold. For example, we sometimes have a classification task in one domain of interest, but we only have sufficient training data in another domain of interest, where the latter data may be in a different feature space or follow a different data distribution. In such cases, knowledge transfer, if done successfully, would greatly improve the performance of learning by avoiding much expensive data-labeling efforts. In recent years, transfer learning has emerged as a new learning framework to address this problem. This survey focuses on categorizing and reviewing the current progress on transfer learning for classification, regression, and clustering problems. In this survey, we discuss the relationship between transfer learning and other related machine learning techniques such as domain adaptation, multitask learning and sample selection bias, as well as covariate shift. We also explore some potential future issues in transfer learning research.},
eventtitle = {{{IEEE Transactions}} on {{Knowledge}} and {{Data Engineering}}},
file = {/home/zenon/Zotero/storage/6A5WF47N/Pan and Yang - 2010 - A Survey on Transfer Learning.pdf;/home/zenon/Zotero/storage/ICD8848I/5288526.html}
}
@article{pinto2009,
title = {A {{High-Throughput Screening Approach}} to {{Discovering Good Forms}} of {{Biologically Inspired Visual Representation}}},
author = {Pinto, Nicolas and Doukhan, David and DiCarlo, James J. and Cox, David D.},
date = {2009-11-26},
journaltitle = {PLOS Computational Biology},
shortjournal = {PLOS Computational Biology},
volume = {5},
number = {11},
pages = {e1000579},
publisher = {{Public Library of Science}},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1000579},
abstract = {While many models of biological object recognition share a common set of “broad-stroke” properties, the performance of any one model depends strongly on the choice of parameters in a particular instantiation of that model—e.g., the number of units per layer, the size of pooling kernels, exponents in normalization operations, etc. Since the number of such parameters (explicit or implicit) is typically large and the computational cost of evaluating one particular parameter set is high, the space of possible model instantiations goes largely unexplored. Thus, when a model fails to approach the abilities of biological visual systems, we are left uncertain whether this failure is because we are missing a fundamental idea or because the correct “parts” have not been tuned correctly, assembled at sufficient scale, or provided with enough training. Here, we present a high-throughput approach to the exploration of such parameter sets, leveraging recent advances in stream processing hardware (high-end NVIDIA graphic cards and the PlayStation 3's IBM Cell Processor). In analogy to high-throughput screening approaches in molecular biology and genetics, we explored thousands of potential network architectures and parameter instantiations, screening those that show promising object recognition performance for further analysis. We show that this approach can yield significant, reproducible gains in performance across an array of basic object recognition tasks, consistently outperforming a variety of state-of-the-art purpose-built vision systems from the literature. As the scale of available computational power continues to expand, we argue that this approach has the potential to greatly accelerate progress in both artificial vision and our understanding of the computational underpinning of biological vision.},
langid = {english},
keywords = {Arithmetic,Boats,Computer object recognition,Face recognition,High throughput screening,Linear filters,Visual object recognition,Visual system},
file = {/home/zenon/Zotero/storage/NE9BEY8F/Pinto et al. - 2009 - A High-Throughput Screening Approach to Discoverin.pdf}
}
@article{ramos-giraldo2020,
title = {Drought {{Stress Detection Using Low-Cost Computer Vision Systems}} and {{Machine Learning Techniques}}},
author = {Ramos-Giraldo, Paula and Reberg-Horton, Chris and Locke, Anna M. and Mirsky, Steven and Lobaton, Edgar},
date = {2020-05},
journaltitle = {IT Professional},
volume = {22},
number = {3},
pages = {27--29},
issn = {1941-045X},
doi = {10.1109/MITP.2020.2986103},
eventtitle = {{{IT Professional}}},
keywords = {Agriculture,Climate change,Computer vision,Loss measurement,Machine learning,Stress measurement}
}
@inproceedings{ramos-giraldo2020a,
title = {Low-{{Cost Smart Camera System}} for {{Water Stress Detection}} in {{Crops}}},
booktitle = {2020 {{IEEE SENSORS}}},
author = {Ramos-Giraldo, Paula and Reberg-Horton, S. Chris and Mirsky, Steven and Lobaton, Edgar and Locke, Anna M. and Henriquez, Esleyther and Zuniga, Ane and Minin, Artem},
date = {2020-10},
pages = {1--4},
issn = {2168-9229},
doi = {10.1109/SENSORS47125.2020.9278744},
eventtitle = {2020 {{IEEE SENSORS}}},
keywords = {Agriculture,Cameras,Computational modeling,computer vision,edge and cloud computing,IoT,machine learning,Sensor systems,Sensors,smart farming,Stress,Temperature sensors}
}
@inproceedings{redmon2016,
title = {You {{Only Look Once}}: {{Unified}}, {{Real-Time Object Detection}}},
shorttitle = {You {{Only Look Once}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali},
date = {2016-06},
pages = {779--788},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.91},
abstract = {We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/YMA63KNY/Redmon et al. - 2016 - You Only Look Once Unified, Real-Time Object Dete.pdf;/home/zenon/Zotero/storage/DJ3JER52/7780460.html}
}
@inproceedings{redmon2017,
title = {{{YOLO9000}}: {{Better}}, {{Faster}}, {{Stronger}}},
shorttitle = {{{YOLO9000}}},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2017-07},
pages = {6517--6525},
issn = {1063-6919},
doi = {10.1109/CVPR.2017.690},
  abstract = {We introduce YOLO9000, a state-of-the-art, real-time object detection system that can detect over 9000 object categories. First we propose various improvements to the YOLO detection method, both novel and drawn from prior work. The improved model, YOLOv2, is state-of-the-art on standard detection tasks like PASCAL VOC and COCO. Using a novel, multi-scale training method the same YOLOv2 model can run at varying sizes, offering an easy tradeoff between speed and accuracy. At 67 FPS, YOLOv2 gets 76.8 mAP on VOC 2007. At 40 FPS, YOLOv2 gets 78.6 mAP, outperforming state-of-the-art methods like Faster RCNN with ResNet and SSD while still running significantly faster. Finally we propose a method to jointly train on object detection and classification. Using this method we train YOLO9000 simultaneously on the COCO detection dataset and the ImageNet classification dataset. Our joint training allows YOLO9000 to predict detections for object classes that don't have labelled detection data. We validate our approach on the ImageNet detection task. YOLO9000 gets 19.7 mAP on the ImageNet detection validation set despite only having detection data for 44 of the 200 classes. On the 156 classes not in COCO, YOLO9000 gets 16.0 mAP. YOLO9000 predicts detections for more than 9000 different object categories, all in real-time.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/MUTDKJCG/Redmon and Farhadi - 2017 - YOLO9000 Better, Faster, Stronger.pdf;/home/zenon/Zotero/storage/D5XNDDQC/8100173.html}
}
@online{redmon2018,
title = {{{YOLOv3}}: {{An Incremental Improvement}}},
shorttitle = {{{YOLOv3}}},
author = {Redmon, Joseph and Farhadi, Ali},
date = {2018-04-08},
eprint = {1804.02767},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1804.02767},
abstract = {We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster. As always, all the code is online at https://pjreddie.com/yolo/},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/3EJMSL2T/Redmon and Farhadi - 2018 - YOLOv3 An Incremental Improvement.pdf;/home/zenon/Zotero/storage/BJVAIALX/1804.html}
}
@inproceedings{ren2015,
title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}},
shorttitle = {Faster {{R-CNN}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
date = {2015},
volume = {28},
publisher = {{Curran Associates, Inc.}},
url = {https://proceedings.neurips.cc/paper/2015/hash/14bfa6bb14875e45bba028a21ed38046-Abstract.html},
urldate = {2023-10-27},
abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully-convolutional network that simultaneously predicts object bounds and objectness scores at each position. RPNs are trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. With a simple alternating optimization, RPN and Fast R-CNN can be trained to share convolutional features. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007 (73.2\% mAP) and 2012 (70.4\% mAP) using 300 proposals per image. Code is available at https://github.com/ShaoqingRen/faster\_rcnn.},
file = {/home/zenon/Zotero/storage/4XB3KRE8/Ren et al. - 2015 - Faster R-CNN Towards Real-Time Object Detection w.pdf}
}
@article{ren2017,
title = {Faster {{R-CNN}}: {{Towards Real-Time Object Detection}} with {{Region Proposal Networks}}},
shorttitle = {Faster {{R-CNN}}},
author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
date = {2017-06},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {39},
number = {6},
pages = {1137--1149},
issn = {1939-3539},
doi = {10.1109/TPAMI.2016.2577031},
abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet [1] and Fast R-CNN [2] have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network(RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features-using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model [3], our detection system has a frame rate of 5 fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.},
eventtitle = {{{IEEE Transactions}} on {{Pattern Analysis}} and {{Machine Intelligence}}},
file = {/home/zenon/Zotero/storage/NBA8U8VS/Ren et al. - 2017 - Faster R-CNN Towards Real-Time Object Detection w.pdf;/home/zenon/Zotero/storage/FJKQTY4F/7485869.html}
}
@inproceedings{rezatofighi2019,
title = {Generalized {{Intersection Over Union}}: {{A Metric}} and a {{Loss}} for {{Bounding Box Regression}}},
shorttitle = {Generalized {{Intersection Over Union}}},
booktitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Rezatofighi, Hamid and Tsoi, Nathan and Gwak, JunYoung and Sadeghian, Amir and Reid, Ian and Savarese, Silvio},
date = {2019-06},
pages = {658--666},
issn = {2575-7075},
doi = {10.1109/CVPR.2019.00075},
  abstract = {Intersection over Union (IoU) is the most popular evaluation metric used in the object detection benchmarks. However, there is a gap between optimizing the commonly used distance losses for regressing the parameters of a bounding box and maximizing this metric value. The optimal objective for a metric is the metric itself. In the case of axis-aligned 2D bounding boxes, it can be shown that IoU can be directly used as a regression loss. However, IoU has a plateau making it infeasible to optimize in the case of non-overlapping bounding boxes. In this paper, we address this weakness by introducing a generalized version of IoU as both a new loss and a new metric. By incorporating this generalized IoU (GIoU) as a loss into the state-of-the-art object detection frameworks, we show a consistent improvement on their performance using both the standard, IoU based, and new, GIoU based, performance measures on popular object detection benchmarks such as PASCAL VOC and MS COCO.},
eventtitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/9AYQXVU3/Rezatofighi et al. - 2019 - Generalized Intersection Over Union A Metric and .pdf;/home/zenon/Zotero/storage/Q9STKHTG/8953982.html}
}
@article{rico-chavez2022,
title = {Machine {{Learning}} for {{Plant Stress Modeling}}: {{A Perspective}} towards {{Hormesis Management}}},
shorttitle = {Machine {{Learning}} for {{Plant Stress Modeling}}},
author = {Rico-Chávez, Amanda Kim and Franco, Jesus Alejandro and Fernandez-Jaramillo, Arturo Alfonso and Contreras-Medina, Luis Miguel and Guevara-González, Ramón Gerardo and Hernandez-Escobedo, Quetzalcoatl},
date = {2022-04-02},
journaltitle = {Plants},
shortjournal = {Plants (Basel)},
volume = {11},
number = {7},
eprint = {35406950},
eprinttype = {pmid},
pages = {970},
issn = {2223-7747},
doi = {10.3390/plants11070970},
abstract = {Plant stress is one of the most significant factors affecting plant fitness and, consequently, food production. However, plant stress may also be profitable since it behaves hormetically; at low doses, it stimulates positive traits in crops, such as the synthesis of specialized metabolites and additional stress tolerance. The controlled exposure of crops to low doses of stressors is therefore called hormesis management, and it is a promising method to increase crop productivity and quality. Nevertheless, hormesis management has severe limitations derived from the complexity of plant physiological responses to stress. Many technological advances assist plant stress science in overcoming such limitations, which results in extensive datasets originating from the multiple layers of the plant defensive response. For that reason, artificial intelligence tools, particularly Machine Learning (ML) and Deep Learning (DL), have become crucial for processing and interpreting data to accurately model plant stress responses such as genomic variation, gene and protein expression, and metabolite biosynthesis. In this review, we discuss the most recent ML and DL applications in plant stress science, focusing on their potential for improving the development of hormesis management protocols.},
pmcid = {PMC9003083},
file = {/home/zenon/Zotero/storage/56I7ELHW/Rico-Chávez et al. - 2022 - Machine Learning for Plant Stress Modeling A Pers.pdf}
}
@report{rosenblatt1957,
type = {Technical Report},
title = {The Perceptron: {{A}} Perceiving and Recognizing Automaton},
author = {Rosenblatt, Frank},
date = {1957-01},
number = {85-460-1},
institution = {{Cornell Aeronautical Laboratory}},
location = {{Ithaca, NY}},
file = {/home/zenon/Zotero/storage/FA8NA2T6/Rosenblatt - 1957 - The perceptron A perceiving and recognizing autom.pdf}
}
@book{rosenblatt1962,
title = {Principles of {{Neurodynamics}}: {{Perceptrons}} and the {{Theory}} of {{Brain Mechanisms}}},
shorttitle = {Principles of {{Neurodynamics}}},
author = {Rosenblatt, Frank},
date = {1962},
eprint = {7FhRAAAAMAAJ},
eprinttype = {googlebooks},
publisher = {{Spartan Books}},
  abstract = {Part I attempts to review the background, basic sources of data, concepts, and methodology to be employed in the study of perceptrons. In Chapter 2, a brief review of the main alternative approaches to the development of brain models is presented. Chapter 3 considers the physiological and psychological criteria for a suitable model, and attempts to evaluate the empirical evidence which is available on several important issues. In Chapter 4, basic definitions and some of the notation to be used in later sections are presented. Parts II and III are devoted to a summary of the established theoretical results obtained to date. Part II (Chapters 5 through 14) deals with the theory of three-layer series-coupled perceptrons, on which most work has been done to date. Part III (Chapters 15 through 20) deals with the theory of multi-layer and cross-coupled perceptrons. Part IV is concerned with more speculative models and problems for future analysis. Of necessity, the final chapters become increasingly heuristic in character, as the theory of perceptrons is not yet complete, and new possibilities are continually coming to light. (Author).},
langid = {english},
pagetotal = {648}
}
@article{rumelhart1986,
title = {Learning Representations by Back-Propagating Errors},
author = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
date = {1986-10},
journaltitle = {Nature},
volume = {323},
number = {6088},
pages = {533--536},
publisher = {{Nature Publishing Group}},
issn = {1476-4687},
doi = {10.1038/323533a0},
  abstract = {We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal hidden units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure.},
issue = {6088},
langid = {english},
keywords = {Humanities and Social Sciences,multidisciplinary,Science},
file = {/home/zenon/Zotero/storage/G59XYHFP/Rumelhart et al. - 1986 - Learning representations by back-propagating error.pdf}
}
@online{russakovsky2015,
title = {{{ImageNet Large Scale Visual Recognition Challenge}}},
author = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and Berg, Alexander C. and Fei-Fei, Li},
date = {2015-01-29},
eprint = {1409.0575},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1409.0575},
abstract = {The ImageNet Large Scale Visual Recognition Challenge is a benchmark in object category classification and detection on hundreds of object categories and millions of images. The challenge has been run annually from 2010 to present, attracting participation from more than fifty institutions. This paper describes the creation of this benchmark dataset and the advances in object recognition that have been possible as a result. We discuss the challenges of collecting large-scale ground truth annotation, highlight key breakthroughs in categorical object recognition, provide a detailed analysis of the current state of the field of large-scale image classification and object detection, and compare the state-of-the-art computer vision accuracy with human accuracy. We conclude with lessons learned in the five years of the challenge, and propose future directions and improvements.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,I.4.8,I.5.2},
file = {/home/zenon/Zotero/storage/MF8K4TPL/Russakovsky et al. - 2015 - ImageNet Large Scale Visual Recognition Challenge.pdf;/home/zenon/Zotero/storage/EZS75GZV/1409.html}
}
@article{samuel1959,
title = {Some {{Studies}} in {{Machine Learning Using}} the {{Game}} of {{Checkers}}},
author = {Samuel, A. L.},
date = {1959-07},
journaltitle = {IBM Journal of Research and Development},
volume = {3},
number = {3},
pages = {210--229},
issn = {0018-8646},
doi = {10.1147/rd.33.0210},
abstract = {Two machine-learning procedures have been investigated in some detail using the game of checkers. Enough work has been done to verify the fact that a computer can be programmed so that it will learn to play a better game of checkers than can be played by the person who wrote the program. Furthermore, it can learn to do this in a remarkably short period of time (8 or 10 hours of machine-playing time) when given only the rules of the game, a sense of direction, and a redundant and incomplete list of parameters which are thought to have something to do with the game, but whose correct signs and relative weights are unknown and unspecified. The principles of machine learning verified by these experiments are, of course, applicable to many other situations.},
eventtitle = {{{IBM Journal}} of {{Research}} and {{Development}}},
file = {/home/zenon/Zotero/storage/9YJSG7IJ/Samuel - 1959 - Some Studies in Machine Learning Using the Game of.pdf;/home/zenon/Zotero/storage/6XF4QCUQ/5392560.html}
}
@inproceedings{sandler2018,
title = {{{MobileNetV2}}: {{Inverted Residuals}} and {{Linear Bottlenecks}}},
shorttitle = {{{MobileNetV2}}},
booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
date = {2018-06},
pages = {4510--4520},
issn = {2575-7075},
doi = {10.1109/CVPR.2018.00474},
  abstract = {In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3. MobileNetV2 is based on an inverted residual structure where the shortcut connections are between the thin bottleneck layers. The intermediate expansion layer uses lightweight depthwise convolutions to filter features as a source of non-linearity. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on ImageNet [1] classification, COCO object detection [2], VOC image segmentation [3]. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as actual latency, and the number of parameters.},
eventtitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
file = {/home/zenon/Zotero/storage/EA4GZCUU/Sandler et al. - 2018 - MobileNetV2 Inverted Residuals and Linear Bottlen.pdf;/home/zenon/Zotero/storage/XLW9SUZ9/8578572.html}
}
@inproceedings{sears2007,
title = {Prototyping {{Tools}} and {{Techniques}}},
booktitle = {The {{Human-Computer Interaction Handbook}}},
  editor = {Sears, Andrew and Jacko, Julie A.},
date = {2007-09-19},
pages = {1043--1066},
publisher = {{CRC Press}},
doi = {10.1201/9781410615862-66},
abstract = {We begin with our definition of a prototype and then discuss prototypes as design artifacts, introducing four dimensions for analyzing them. We then discuss the role of prototyping within the design process, in particular the concept of a design space, and how it is expanded and contracted by generating and selecting design ideas. The next three sections describe specific prototyping approaches: Rapid prototyping, both off-line and on-line, for early stages of design, iterative prototyping, which uses on-line development tools, and evolutionary prototyping, which must be based on a sound software architecture.},
isbn = {978-0-429-16397-5},
langid = {english}
}
@article{selvaraju2020,
title = {Grad-{{CAM}}: {{Visual Explanations}} from {{Deep Networks}} via {{Gradient-based Localization}}},
shorttitle = {Grad-{{CAM}}},
author = {Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},
date = {2020-02},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
volume = {128},
number = {2},
eprint = {1610.02391},
eprinttype = {arxiv},
pages = {336--359},
issn = {0920-5691, 1573-1405},
doi = {10.1007/s11263-019-01228-7},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/home/zenon/Zotero/storage/QC22JBMX/Selvaraju et al. - 2020 - Grad-CAM Visual Explanations from Deep Networks v.pdf}
}
@inproceedings{shen2016,
title = {Weighted Residuals for Very Deep Networks},
booktitle = {2016 3rd {{International Conference}} on {{Systems}} and {{Informatics}} ({{ICSAI}})},
author = {Shen, Falong and Gan, Rui and Zeng, Gang},
date = {2016-11},
pages = {936--941},
doi = {10.1109/ICSAI.2016.7811085},
abstract = {Deep residual networks have recently shown appealing performance on many challenging computer vision tasks. However, the original residual structure still has some defects making it difficult to converge on very deep networks. In this paper, we introduce a weighted residual network to address the incompatibility between ReLU and element-wise addition and the deep network initialization problem. The weighted residual network is able to learn to combine residuals from different layers effectively and efficiently. The proposed models enjoy a consistent improvement over accuracy and convergence with increasing depths from 100+ layers to 1000+ layers. Besides, the weighted residual networks have little more computation and GPU memory burden than the original residual networks. The networks are optimized by projected stochastic gradient descent. Experiments on CIFAR-10 have shown that our algorithm has a faster convergence speed than the original residual networks and reaches a high accuracy at 95.3\% with a 1192-layer model. Experiments on CIFAR-100 and ImageNet-1k have also verified the effectiveness of our proposed design.},
eventtitle = {2016 3rd {{International Conference}} on {{Systems}} and {{Informatics}} ({{ICSAI}})},
file = {/home/zenon/Zotero/storage/8NK8ULV5/Shen et al. - 2016 - Weighted residuals for very deep networks.pdf;/home/zenon/Zotero/storage/ZPRJ5TZ2/7811085.html}
}
@inproceedings{shrivastava2016,
title = {Training {{Region-Based Object Detectors}} with {{Online Hard Example Mining}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Shrivastava, Abhinav and Gupta, Abhinav and Girshick, Ross},
date = {2016-06},
pages = {761--769},
issn = {1063-6919},
doi = {10.1109/CVPR.2016.89},
abstract = {The field of object detection has made significant advances riding on the wave of region-based ConvNets, but their training procedure still includes many heuristics and hyperparameters that are costly to tune. We present a simple yet surprisingly effective online hard example mining (OHEM) algorithm for training region-based ConvNet detectors. Our motivation is the same as it has always been - detection datasets contain an overwhelming number of easy examples and a small number of hard examples. Automatic selection of these hard examples can make training more effective and efficient. OHEM is a simple and intuitive algorithm that eliminates several heuristics and hyperparameters in common use. But more importantly, it yields consistent and significant boosts in detection performance on benchmarks like PASCAL VOC 2007 and 2012. Its effectiveness increases as datasets become larger and more difficult, as demonstrated by the results on the MS COCO dataset. Moreover, combined with complementary advances in the field, OHEM leads to state-of-the-art results of 78.9\% and 76.3\% mAP on PASCAL VOC 2007 and 2012 respectively.},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/HB7BQR72/Shrivastava et al. - 2016 - Training Region-Based Object Detectors with Online.pdf;/home/zenon/Zotero/storage/PEAFAEE9/7780458.html}
}
@inproceedings{simard2003,
title = {Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis},
booktitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.},
author = {Simard, P.Y. and Steinkraus, D. and Platt, J.C.},
date = {2003-08},
pages = {958--963},
doi = {10.1109/ICDAR.2003.1227801},
eventtitle = {Seventh {{International Conference}} on {{Document Analysis}} and {{Recognition}}, 2003. {{Proceedings}}.},
file = {/home/zenon/Zotero/storage/S6SE8F56/Simard et al. - 2003 - Best practices for convolutional neural networks a.pdf;/home/zenon/Zotero/storage/FQHDISEK/1227801.html}
}
@online{simonyan2015,
title = {Very {{Deep Convolutional Networks}} for {{Large-Scale Image Recognition}}},
author = {Simonyan, Karen and Zisserman, Andrew},
date = {2015-04-10},
eprint = {1409.1556},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1409.1556},
abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/CW73C5N3/Simonyan and Zisserman - 2015 - Very Deep Convolutional Networks for Large-Scale I.pdf;/home/zenon/Zotero/storage/LQIW4V7G/1409.html}
}
@article{su2020,
title = {Machine {{Learning-Based Crop Drought Mapping System}} by {{UAV Remote Sensing RGB Imagery}}},
author = {Su, Jinya and Coombes, Matthew and Liu, Cunjia and Zhu, Yongchao and Song, Xingyang and Fang, Shibo and Guo, Lei and Chen, Wen-Hua},
date = {2020-01},
journaltitle = {Unmanned Systems},
shortjournal = {Un. Sys.},
volume = {08},
number = {01},
pages = {71--83},
publisher = {{World Scientific Publishing Co.}},
issn = {2301-3850},
keywords = {Area-wise classification,Support Vector Machine (SVM),Unmanned Aerial Vehicle (UAV),wheat drought mapping},
file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
}
@online{sussillo2015,
title = {Random {{Walk Initialization}} for {{Training Very Deep Feedforward Networks}}},
author = {Sussillo, David and Abbott, L. F.},
date = {2015-02-27},
eprint = {1412.6558},
eprinttype = {arxiv},
eprintclass = {cs, stat},
doi = {10.48550/arXiv.1412.6558},
abstract = {Training very deep networks is an important open problem in machine learning. One of many difficulties is that the norm of the back-propagated error gradient can grow or decay exponentially. Here we show that training very deep feed-forward networks (FFNs) is not as difficult as previously thought. Unlike when back-propagation is applied to a recurrent network, application to an FFN amounts to multiplying the error gradient by a different random matrix at each layer. We show that the successive application of correctly scaled random matrices to an initial vector results in a random walk of the log of the norm of the resulting vectors, and we compute the scaling that makes this walk unbiased. The variance of the random walk grows only linearly with network depth and is inversely proportional to the size of each layer. Practically, this implies a gradient whose log-norm scales with the square root of the network depth and shows that the vanishing gradient problem can be mitigated by increasing the width of the layers. Mathematical analyses and experimental results using stochastic gradient descent to optimize tasks related to the MNIST and TIMIT datasets are provided to support these claims. Equations for the optimal matrix scaling are provided for the linear and ReLU cases.},
pubstate = {preprint},
keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
file = {/home/zenon/Zotero/storage/XSY4PH2B/Sussillo and Abbott - 2015 - Random Walk Initialization for Training Very Deep .pdf;/home/zenon/Zotero/storage/39NYWJNU/1412.html}
}
@inproceedings{szegedy2015,
title = {Going Deeper with Convolutions},
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
date = {2015-06},
pages = {1--9},
issn = {1063-6919},
doi = {10.1109/CVPR.2015.7298594},
abstract = {We propose a deep convolutional neural network architecture codenamed Inception that achieves the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. By a carefully crafted design, we increased the depth and width of the network while keeping the computational budget constant. To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. One particular incarnation used in our submission for ILSVRC14 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection.},
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/VL2YIAAN/Szegedy et al. - 2015 - Going deeper with convolutions.pdf;/home/zenon/Zotero/storage/GWTG8T26/7298594.html}
}
@article{szegedy2017,
title = {Inception-v4, {{Inception-ResNet}} and the {{Impact}} of {{Residual Connections}} on {{Learning}}},
author = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke, Vincent and Alemi, Alexander},
date = {2017-02-12},
journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
volume = {31},
number = {1},
pages = {4278--4284},
issn = {2374-3468},
doi = {10.1609/aaai.v31i1.11231},
abstract = {Very deep convolutional networks have been central to the largest advances in image recognition performance in recent years. One example is the Inception architecture that has been shown to achieve very good performance at relatively low computational cost. Recently, the introduction of residual connections in conjunction with a more traditional architecture has yielded state-of-the-art performance in the 2015 ILSVRC challenge; its performance was similar to the latest generation Inception-v3 network. This raises the question: Are there any benefits to combining Inception architectures with residual connections? Here we give clear empirical evidence that training with residual connections accelerates the training of Inception networks significantly. There is also some evidence of residual Inception networks outperforming similarly expensive Inception networks without residual connections by a thin margin. We also present several new streamlined architectures for both residual and non-residual Inception networks. These variations improve the single-frame recognition performance on the ILSVRC 2012 classification task significantly. We further demonstrate how proper activation scaling stabilizes the training of very wide residual Inception networks. With an ensemble of three residual and one Inception-v4 networks, we achieve 3.08\% top-5 error on the test set of the ImageNet classification (CLS) challenge.},
issue = {1},
langid = {english},
keywords = {Inception},
file = {/home/zenon/Zotero/storage/JQVR2G3M/Szegedy et al. - 2017 - Inception-v4, Inception-ResNet and the Impact of R.pdf}
}
@online{terven2023,
title = {A {{Comprehensive Review}} of {{YOLO}}: {{From YOLOv1}} and {{Beyond}}},
shorttitle = {A {{Comprehensive Review}} of {{YOLO}}},
author = {Terven, Juan and Cordova-Esparza, Diana},
date = {2023-10-07},
eprint = {2304.00501},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2304.00501},
abstract = {YOLO has become a central real-time object detection system for robotics, driverless cars, and video monitoring applications. We present a comprehensive analysis of YOLO's evolution, examining the innovations and contributions in each iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with Transformers. We start by describing the standard metrics and postprocessing; then, we discuss the major changes in network architecture and training tricks for each model. Finally, we summarize the essential lessons from YOLO's development and provide a perspective on its future, highlighting potential research directions to enhance real-time object detection systems.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition,I.2.10},
file = {/home/zenon/Zotero/storage/QT68D7SY/Terven and Cordova-Esparza - 2023 - A Comprehensive Review of YOLO From YOLOv1 and Be.pdf;/home/zenon/Zotero/storage/DJ5QRQMW/2304.html}
}
@inproceedings{turner2021,
title = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}: {{Analysis}} of the {{Black-Box Optimization Challenge}} 2020},
shorttitle = {Bayesian {{Optimization}} Is {{Superior}} to {{Random Search}} for {{Machine Learning Hyperparameter Tuning}}},
booktitle = {Proceedings of the {{NeurIPS}} 2020 {{Competition}} and {{Demonstration Track}}},
author = {Turner, Ryan and Eriksson, David and McCourt, Michael and Kiili, Juha and Laaksonen, Eero and Xu, Zhen and Guyon, Isabelle},
date = {2021-08-07},
pages = {3--26},
publisher = {{PMLR}},
issn = {2640-3498},
url = {https://proceedings.mlr.press/v133/turner21a.html},
urldate = {2023-11-16},
  abstract = {This paper presents the results and insights from the black-box optimization (BBO) challenge at NeurIPS 2020 which ran from July–October, 2020. The challenge emphasized the importance of evaluating derivative-free optimizers for tuning the hyperparameters of machine learning models. This was the first black-box optimization challenge with a machine learning emphasis. It was based on tuning (validation set) performance of standard machine learning models on real datasets. This competition has widespread impact as black-box optimization (e.g., Bayesian optimization) is relevant for hyperparameter tuning in almost every machine learning project as well as many applications outside of machine learning. The final leaderboard was determined using the optimization performance on held-out (hidden) objective functions, where the optimizers ran without human intervention. Baselines were set using the default settings of several open source black-box optimization packages as well as random search.},
eventtitle = {{{NeurIPS}} 2020 {{Competition}} and {{Demonstration Track}}},
langid = {english},
file = {/home/zenon/Zotero/storage/SADGV3GS/Turner et al. - 2021 - Bayesian Optimization is Superior to Random Search.pdf}
}
@article{uijlings2013,
title = {Selective {{Search}} for {{Object Recognition}}},
author = {Uijlings, J. R. R. and family=Sande, given=K. E. A., prefix=van de, useprefix=true and Gevers, T. and Smeulders, A. W. M.},
date = {2013-09-01},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
volume = {104},
number = {2},
pages = {154--171},
issn = {1573-1405},
doi = {10.1007/s11263-013-0620-5},
abstract = {This paper addresses the problem of generating possible object locations for use in object recognition. We introduce selective search which combines the strength of both an exhaustive search and segmentation. Like segmentation, we use the image structure to guide our sampling process. Like exhaustive search, we aim to capture all possible object locations. Instead of a single technique to generate possible object locations, we diversify our search and use a variety of complementary image partitionings to deal with as many image conditions as possible. Our selective search results in a small set of data-driven, class-independent, high quality locations, yielding 99~\% recall and a Mean Average Best Overlap of 0.879 at 10,097 locations. The reduced number of locations compared to an exhaustive search enables the use of stronger machine learning techniques and stronger appearance models for object recognition. In this paper we show that our selective search enables the use of the powerful Bag-of-Words model for recognition. The selective search software is made publicly available (Software: http://disi.unitn.it/\textasciitilde uijlings/SelectiveSearch.html).},
langid = {english},
keywords = {Appearance Model,Colour Space,Exhaustive Search,Object Location,Object Recognition},
file = {/home/zenon/Zotero/storage/P39PKRXR/Uijlings et al. - 2013 - Selective Search for Object Recognition.pdf}
}
@inproceedings{venal2019,
title = {Plant {{Stress Classification}} for {{Smart Agriculture}} Utilizing {{Convolutional Neural Network}} - {{Support Vector Machine}}},
booktitle = {2019 {{International Conference}} on {{ICT}} for {{Smart Society}} ({{ICISS}})},
author = {Venal, Maria Cecilia A. and Fajardo, Arnel C. and Hernandez, Alexander A.},
date = {2019-11},
volume = {7},
pages = {1--5},
issn = {2640-0545},
doi = {10.1109/ICISS48059.2019.8969799},
  abstract = {Plant stresses are considerably increasing due to changing environmental conditions. This study aims to classify plant stresses using a hybrid convolutional neural network and support vector machine. This study used soybean leaf images with identified plant stresses in model training, testing, and validation activities. The results show that the hybrid model achieves an overall accuracy of 98.02\%. This study found that the model is suitable for plant stress classification. This work contributes by providing a hybrid model that can potentially perform in a smart agriculture environment. This study presents some studies to extend their contribution.},
eventtitle = {2019 {{International Conference}} on {{ICT}} for {{Smart Society}} ({{ICISS}})},
file = {/home/zenon/Zotero/storage/DPWWLU5X/Venal et al. - 2019 - Plant Stress Classification for Smart Agriculture .pdf;/home/zenon/Zotero/storage/TI5Q683Q/8969799.html}
}
@inproceedings{viola2001,
title = {Rapid Object Detection Using a Boosted Cascade of Simple Features},
booktitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
author = {Viola, P. and Jones, M.},
date = {2001-12},
volume = {1},
pages = {I-I},
issn = {1063-6919},
doi = {10.1109/CVPR.2001.990517},
abstract = {This paper describes a machine learning approach for visual object detection which is capable of processing images extremely rapidly and achieving high detection rates. This work is distinguished by three key contributions. The first is the introduction of a new image representation called the "integral image" which allows the features used by our detector to be computed very quickly. The second is a learning algorithm, based on AdaBoost, which selects a small number of critical visual features from a larger set and yields extremely efficient classifiers. The third contribution is a method for combining increasingly more complex classifiers in a "cascade" which allows background regions of the image to be quickly discarded while spending more computation on promising object-like regions. The cascade can be viewed as an object specific focus-of-attention mechanism which unlike previous approaches provides statistical guarantees that discarded regions are unlikely to contain the object of interest. In the domain of face detection the system yields detection rates comparable to the best previous systems. Used in real-time applications, the detector runs at 15 frames per second without resorting to image differencing or skin color detection.},
eventtitle = {Proceedings of the 2001 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}. {{CVPR}} 2001},
keywords = {Detectors,Face detection,Filters,Focusing,Image representation,Machine learning,Object detection,Pixel,Robustness,Skin},
file = {/home/zenon/Zotero/storage/7EMWJGGB/Viola and Jones - 2001 - Rapid object detection using a boosted cascade of .pdf;/home/zenon/Zotero/storage/PT4TV455/990517.html}
}
@inproceedings{viola2001a,
title = {Robust Real-Time Face Detection},
booktitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
author = {Viola, P. and Jones, M.},
date = {2001-07},
volume = {2},
pages = {747--747},
doi = {10.1109/ICCV.2001.937709},
eventtitle = {Proceedings {{Eighth IEEE International Conference}} on {{Computer Vision}}. {{ICCV}} 2001},
keywords = {Boosting,Color,Detectors,Face detection,Information resources,Laboratories,Object detection,Pixel,Robustness,Video sequences},
file = {/home/zenon/Zotero/storage/MX2PJDWC/Viola and Jones - 2001 - Robust real-time face detection.pdf;/home/zenon/Zotero/storage/NCMDRQ53/937709.html}
}
@article{virnodkar2020,
title = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}: {{A Critical Review}}},
shorttitle = {Remote {{Sensing}} and {{Machine Learning}} for {{Crop Water Stress Determination}} in {{Various Crops}}},
author = {Virnodkar, Shyamal S. and Pachghare, Vinod K. and Patil, V. C. and Jha, Sunil Kumar},
date = {2020-10-01},
journaltitle = {Precision Agriculture},
shortjournal = {Precision Agric},
volume = {21},
number = {5},
pages = {1121--1155},
issn = {1573-1618},
doi = {10.1007/s11119-020-09711-9},
keywords = {Crop water stress,Crops,Machine learning,Remote sensing}
}
@article{wakamori2020,
title = {Multimodal {{Neural Network}} with {{Clustering-Based Drop}} for {{Estimating Plant Water Stress}}},
author = {Wakamori, Kazumasa and Mizuno, Ryosuke and Nakanishi, Gota and Mineno, Hiroshi},
date = {2020-01-01},
journaltitle = {Computers and Electronics in Agriculture},
shortjournal = {Computers and Electronics in Agriculture},
volume = {168},
pages = {105118},
issn = {0168-1699},
doi = {10.1016/j.compag.2019.105118},
keywords = {Image processing,Multimodal deep learning,Plant water stress,Time-series modeling}
}
@online{wang2022,
title = {{{YOLOv7}}: {{Trainable}} Bag-of-Freebies Sets New State-of-the-Art for Real-Time Object Detectors},
shorttitle = {{{YOLOv7}}},
author = {Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark},
date = {2022-07-06},
eprint = {2207.02696},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2207.02696},
abstract = {YOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8\% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9\% AP) outperforms both transformer-based detector SWIN-L Cascade-Mask R-CNN (9.2 FPS A100, 53.9\% AP) by 509\% in speed and 2\% in accuracy, and convolutional-based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6 FPS A100, 55.2\% AP) by 551\% in speed and 0.7\% AP in accuracy, as well as YOLOv7 outperforms: YOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable DETR, DINO-5scale-R50, ViT-Adapter-B and many other object detectors in speed and accuracy. Moreover, we train YOLOv7 only on MS COCO dataset from scratch without using any other datasets or pre-trained weights. Source code is released in https://github.com/WongKinYiu/yolov7.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/TF6HV2VR/Wang et al. - 2022 - YOLOv7 Trainable bag-of-freebies sets new state-o.pdf;/home/zenon/Zotero/storage/4H69GZYY/2207.html}
}
@online{wang2022a,
title = {Designing {{Network Design Strategies Through Gradient Path Analysis}}},
author = {Wang, Chien-Yao and Liao, Hong-Yuan Mark and Yeh, I.-Hau},
date = {2022-11-09},
eprint = {2211.04800},
eprinttype = {arxiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2211.04800},
abstract = {Designing a high-efficiency and high-quality expressive network architecture has always been the most important research topic in the field of deep learning. Most of today's network design strategies focus on how to integrate features extracted from different layers, and how to design computing units to effectively extract these features, thereby enhancing the expressiveness of the network. This paper proposes a new network design strategy, i.e., to design the network architecture based on gradient path analysis. On the whole, most of today's mainstream network design strategies are based on feed forward path, that is, the network architecture is designed based on the data path. In this paper, we hope to enhance the expressive ability of the trained model by improving the network learning ability. Due to the mechanism driving the network parameter learning is the backward propagation algorithm, we design network design strategies based on back propagation path. We propose the gradient path design strategies for the layer-level, the stage-level, and the network-level, and the design strategies are proved to be superior and feasible from theoretical analysis and experiments.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/TFTTKYRE/Wang et al. - 2022 - Designing Network Design Strategies Through Gradie.pdf;/home/zenon/Zotero/storage/5I43K6ZQ/2211.html}
}
@inproceedings{woo2018,
title = {{{CBAM}}: {{Convolutional Block Attention Module}}},
shorttitle = {{{CBAM}}},
booktitle = {Computer {{Vision}} {{ECCV}} 2018},
author = {Woo, Sanghyun and Park, Jongchan and Lee, Joon-Young and Kweon, In So},
editor = {Ferrari, Vittorio and Hebert, Martial and Sminchisescu, Cristian and Weiss, Yair},
date = {2018},
series = {Lecture {{Notes}} in {{Computer Science}}},
pages = {3--19},
publisher = {{Springer International Publishing}},
location = {{Cham}},
doi = {10.1007/978-3-030-01234-2_1},
abstract = {We propose Convolutional Block Attention Module (CBAM), a simple yet effective attention module for feed-forward convolutional neural networks. Given an intermediate feature map, our module sequentially infers attention maps along two separate dimensions, channel and spatial, then the attention maps are multiplied to the input feature map for adaptive feature refinement. Because CBAM is a lightweight and general module, it can be integrated into any CNN architectures seamlessly with negligible overheads and is end-to-end trainable along with base CNNs. We validate our CBAM through extensive experiments on ImageNet-1K, MS~COCO detection, and VOC~2007 detection datasets. Our experiments show consistent improvements in classification and detection performances with various models, demonstrating the wide applicability of CBAM. The code and models will be publicly available.},
isbn = {978-3-030-01234-2},
langid = {english},
keywords = {Attention mechanism,Gated convolution,Object recognition},
file = {/home/zenon/Zotero/storage/3LZKK3GI/Woo et al. - 2018 - CBAM Convolutional Block Attention Module.pdf}
}
@article{yang2020,
title = {On Hyperparameter Optimization of Machine Learning Algorithms: {{Theory}} and Practice},
shorttitle = {On Hyperparameter Optimization of Machine Learning Algorithms},
author = {Yang, Li and Shami, Abdallah},
date = {2020-11-20},
journaltitle = {Neurocomputing},
shortjournal = {Neurocomputing},
volume = {415},
pages = {295--316},
issn = {0925-2312},
doi = {10.1016/j.neucom.2020.07.061},
  abstract = {Machine learning algorithms have been used widely in various applications and areas. To fit a machine learning model into different problems, its hyper-parameters must be tuned. Selecting the best hyper-parameter configuration for machine learning models has a direct impact on the model's performance. It often requires deep knowledge of machine learning algorithms and appropriate hyper-parameter optimization techniques. Although several automatic optimization techniques exist, they have different strengths and drawbacks when applied to different types of problems. In this paper, optimizing the hyper-parameters of common machine learning models is studied. We introduce several state-of-the-art optimization techniques and discuss how to apply them to machine learning algorithms. Many available libraries and frameworks developed for hyper-parameter optimization problems are provided, and some open challenges of hyper-parameter optimization research are also discussed in this paper. Moreover, experiments are conducted on benchmark datasets to compare the performance of different optimization methods and provide practical examples of hyper-parameter optimization. This survey paper will help industrial users, data analysts, and researchers to better develop machine learning models by identifying the proper hyper-parameter configurations effectively.},
keywords = {Bayesian optimization,Genetic algorithm,Grid search,Hyper-parameter optimization,Machine learning,Particle swarm optimization},
file = {/home/zenon/Zotero/storage/L5YW8KY9/Yang and Shami - 2020 - On hyperparameter optimization of machine learning.pdf;/home/zenon/Zotero/storage/YU3W3Z8L/S0925231220311693.html}
}
@inproceedings{zeiler2014,
title = {Visualizing and {{Understanding Convolutional Networks}}},
booktitle = {Computer {{Vision}} {{ECCV}} 2014},
author = {Zeiler, Matthew D. and Fergus, Rob},
editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne},
date = {2014},
series = {Lecture {{Notes}} in {{Computer Science}}},
pages = {818--833},
publisher = {{Springer International Publishing}},
location = {{Cham}},
doi = {10.1007/978-3-319-10590-1_53},
abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark Krizhevsky et al. [18]. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we explore both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. Used in a diagnostic role, these visualizations allow us to find model architectures that outperform Krizhevsky et al on the ImageNet classification benchmark. We also perform an ablation study to discover the performance contribution from different model layers. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.},
isbn = {978-3-319-10590-1},
langid = {english},
keywords = {Convolutional Neural Network,Input Image,Pixel Space,Stochastic Gradient Descent,Training Image},
file = {/home/zenon/Zotero/storage/5LSEHN2T/Zeiler and Fergus - 2014 - Visualizing and Understanding Convolutional Networ.pdf}
}
@inproceedings{zhang2021,
title = {{{VarifocalNet}}: {{An IoU-aware Dense Object Detector}}},
shorttitle = {{{VarifocalNet}}},
booktitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Zhang, Haoyang and Wang, Ying and Dayoub, Feras and Sünderhauf, Niko},
date = {2021-06},
pages = {8510--8519},
issn = {2575-7075},
doi = {10.1109/CVPR46437.2021.00841},
  abstract = {Accurately ranking the vast number of candidate detections is crucial for dense object detectors to achieve high performance. Prior work uses the classification score or a combination of classification and predicted localization scores to rank candidates. However, neither option results in a reliable ranking, thus degrading detection performance. In this paper, we propose to learn an IoU-Aware Classification Score (IACS) as a joint representation of object presence confidence and localization accuracy. We show that dense object detectors can achieve a more accurate ranking of candidate detections based on the IACS. We design a new loss function, named Varifocal Loss, to train a dense object detector to predict the IACS, and propose a new star-shaped bounding box feature representation for IACS prediction and bounding box refinement. Combining these two new components and a bounding box refinement branch, we build an IoU-aware dense object detector based on the FCOS+ATSS architecture, that we call VarifocalNet or VFNet for short. Extensive experiments on MS COCO show that our VFNet consistently surpasses the strong baseline by 2.0 AP with different backbones. Our best model VFNet-X-1200 with Res2Net-101-DCN achieves a single-model single-scale AP of 55.1 on COCO test-dev, which is state-of-the-art among various object detectors. Code is available at: https://github.com/hyz-xmaster/VarifocalNet.},
eventtitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
file = {/home/zenon/Zotero/storage/MNHCWJDV/Zhang et al. - 2021 - VarifocalNet An IoU-aware Dense Object Detector.pdf;/home/zenon/Zotero/storage/T89I8NJC/9578034.html}
}
@article{zheng2020,
title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
shorttitle = {Distance-{{IoU Loss}}},
author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
date = {2020-04-03},
journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
volume = {34},
number = {07},
pages = {12993--13000},
issn = {2374-3468},
doi = {10.1609/aaai.v34i07.6999},
  abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while ℓn-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression. In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, i.e., overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster R-CNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.},
issue = {07},
langid = {english},
file = {/home/zenon/Zotero/storage/5IUI6GPY/Zheng et al. - 2020 - Distance-IoU Loss Faster and Better Learning for .pdf}
}
@article{zhong2022,
title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}},
author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei},
date = {2022-09},
journaltitle = {Agriculture},
volume = {12},
number = {9},
pages = {1360},
publisher = {{Multidisciplinary Digital Publishing Institute}},
issn = {2077-0472},
doi = {10.3390/agriculture12091360},
keywords = {cassava diseases,convolutional neural network,focal angular margin penalty softmax loss (FAMP-Softmax),intelligent agricultural engineering,transformer-embedded ResNet (T-RNet),unbalanced image samples},
file = {/home/zenon/Zotero/storage/P7652AHL/Zhong et al. - 2022 - Classification of Cassava Leaf Disease Based on a .pdf}
}
@online{zhou2015,
title = {Learning {{Deep Features}} for {{Discriminative Localization}}},
author = {Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and Oliva, Aude and Torralba, Antonio},
date = {2015-12-13},
eprint = {1512.04150},
eprinttype = {arxiv},
doi = {10.48550/arXiv.1512.04150},
issue = {arXiv:1512.04150},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/VMLHUG7J/Zhou et al. - 2015 - Learning Deep Features for Discriminative Localiza.pdf}
}
@article{zhuang2017,
title = {Early {{Detection}} of {{Water Stress}} in {{Maize Based}} on {{Digital Images}}},
author = {Zhuang, Shuo and Wang, Ping and Jiang, Boran and Li, Maosong and Gong, Zhihong},
date = {2017-08-01},
journaltitle = {Computers and Electronics in Agriculture},
shortjournal = {Computers and Electronics in Agriculture},
volume = {140},
pages = {461--468},
issn = {0168-1699},
doi = {10.1016/j.compag.2017.06.022},
keywords = {Early maize,Feature extraction,Gradient boosting decision tree,Image segmentation,Water stress}
}
@article{zhuang2021,
title = {A {{Comprehensive Survey}} on {{Transfer Learning}}},
author = {Zhuang, Fuzhen and Qi, Zhiyuan and Duan, Keyu and Xi, Dongbo and Zhu, Yongchun and Zhu, Hengshu and Xiong, Hui and He, Qing},
date = {2021-01},
journaltitle = {Proceedings of the IEEE},
volume = {109},
number = {1},
pages = {43--76},
issn = {1558-2256},
doi = {10.1109/JPROC.2020.3004555},
abstract = {Transfer learning aims at improving the performance of target learners on target domains by transferring the knowledge contained in different but related source domains. In this way, the dependence on a large number of target-domain data can be reduced for constructing target learners. Due to the wide application prospects, transfer learning has become a popular and promising area in machine learning. Although there are already some valuable and impressive surveys on transfer learning, these surveys introduce approaches in a relatively isolated way and lack the recent advances in transfer learning. Due to the rapid expansion of the transfer learning area, it is both necessary and challenging to comprehensively review the relevant studies. This survey attempts to connect and systematize the existing transfer learning research studies, as well as to summarize and interpret the mechanisms and the strategies of transfer learning in a comprehensive way, which may help readers have a better understanding of the current research status and ideas. Unlike previous surveys, this survey article reviews more than 40 representative transfer learning approaches, especially homogeneous transfer learning approaches, from the perspectives of data and model. The applications of transfer learning are also briefly introduced. In order to show the performance of different transfer learning models, over 20 representative transfer learning models are used for experiments. The models are performed on three different data sets, that is, Amazon Reviews, Reuters-21578, and Office-31, and the experimental results demonstrate the importance of selecting appropriate transfer learning models for different applications in practice.},
eventtitle = {Proceedings of the {{IEEE}}},
file = {/home/zenon/Zotero/storage/8L5G3WFK/Zhuang et al. - 2021 - A Comprehensive Survey on Transfer Learning.pdf;/home/zenon/Zotero/storage/CQS64R8H/9134370.html}
}
@book{zotero-143,
  title = {Pattern {{Recognition}} and {{Machine Learning}}},
  author = {Bishop, Christopher M.},
  date = {2006},
  publisher = {{Springer}},
  url = {https://www.google.at/books/edition/Pattern_Recognition_and_Machine_Learning/qWPwnQEACAAJ?hl=de&bshm=rimc/1},
  urldate = {2023-10-01},
  file = {/home/zenon/Zotero/storage/CLHDBTJ2/qWPwnQEACAAJ.html}
}
@online{zotero-368,
title = {Dataset {{Search}}},
url = {https://datasetsearch.research.google.com/search?src=2&query=Healthy%20and%20Wilted%20Houseplant%20Images&docid=L2cvMTFzc3JqZDhrNA%3D%3D},
urldate = {2023-12-08},
file = {/home/zenon/Zotero/storage/48CAYZMW/search.html}
}
@online{zotero-372,
title = {Healthy and {{Wilted Houseplant Images}}},
url = {https://www.kaggle.com/datasets/russellchan/healthy-and-wilted-houseplant-images},
urldate = {2023-12-08},
abstract = {A collection of 904 houseplant images, classified as either healthy or wilted},
langid = {english},
file = {/home/zenon/Zotero/storage/2EDXR4MQ/datasets.html}
}
@software{zotero-374,
title = {Open {{Neural Network Exchange}}},
url = {https://github.com/onnx},
urldate = {2023-12-08},
  abstract = {ONNX is an open ecosystem for interoperable AI models. It's a community project: we welcome your contributions!},
file = {/home/zenon/Zotero/storage/GZ35DHBG/onnx.html}
}
@article{zou2023,
title = {Object {{Detection}} in 20 {{Years}}: {{A Survey}}},
shorttitle = {Object {{Detection}} in 20 {{Years}}},
author = {Zou, Zhengxia and Chen, Keyan and Shi, Zhenwei and Guo, Yuhong and Ye, Jieping},
date = {2023-03},
journaltitle = {Proceedings of the IEEE},
volume = {111},
number = {3},
pages = {257--276},
issn = {1558-2256},
doi = {10.1109/JPROC.2023.3238524},
  abstract = {Object detection, as one of the most fundamental and challenging problems in computer vision, has received great attention in recent years. Over the past two decades, we have seen a rapid technological evolution of object detection and its profound impact on the entire computer vision field. If we consider today's object detection technique as a revolution driven by deep learning, then, back in the 1990s, we would see the ingenious thinking and long-term perspective design of early computer vision. This article extensively reviews this fast-moving research field in the light of technical evolution, spanning over a quarter-century's time (from the 1990s to 2022). A number of topics have been covered in this article, including the milestone detectors in history, detection datasets, metrics, fundamental building blocks of the detection system, speedup techniques, and recent state-of-the-art detection methods.},
eventtitle = {Proceedings of the {{IEEE}}},
keywords = {Computer vision,Convolutional neural networks,convolutional neural networks (CNNs),deep learning,Deep learning,Detectors,Feature extraction,object detection,Object detection,technical evolution},
file = {/home/zenon/Zotero/storage/TFBCMNKC/Zou et al. - 2023 - Object Detection in 20 Years A Survey.pdf;/home/zenon/Zotero/storage/A5ENIFX3/10028728.html}
}