diff --git a/thesis/references.bib b/thesis/references.bib index a848e10..02cc869 100644 --- a/thesis/references.bib +++ b/thesis/references.bib @@ -212,10 +212,12 @@ date = {2020-04-22}, eprint = {2004.10934}, eprinttype = {arxiv}, + eprintclass = {cs, eess}, doi = {10.48550/arXiv.2004.10934}, - issue = {arXiv:2004.10934}, + abstract = {There are a huge number of features which are said to improve Convolutional Neural Network (CNN) accuracy. Practical testing of combinations of such features on large datasets, and theoretical justification of the result, is required. Some features operate on certain models exclusively and for certain problems exclusively, or only for small-scale datasets; while some features, such as batch-normalization and residual-connections, are applicable to the majority of models, tasks, and datasets. We assume that such universal features include Weighted-Residual-Connections (WRC), Cross-Stage-Partial-connections (CSP), Cross mini-Batch Normalization (CmBN), Self-adversarial-training (SAT) and Mish-activation. We use new features: WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, CmBN, DropBlock regularization, and CIoU loss, and combine some of them to achieve state-of-the-art results: 43.5\% AP (65.7\% AP50) for the MS COCO dataset at a realtime speed of \textasciitilde 65 FPS on Tesla V100. Source code is at https://github.com/AlexeyAB/darknet}, + pubstate = {preprint}, keywords = {Computer Science - Computer Vision and Pattern Recognition,Electrical Engineering and Systems Science - Image and Video Processing}, - file = {/home/zenon/Zotero/storage/RELLHNCA/Bochkovskiy et al. - 2020 - YOLOv4 Optimal Speed and Accuracy of Object Detec.pdf} + file = {/home/zenon/Zotero/storage/QHHW6JLR/Bochkovskiy et al. - 2020 - YOLOv4 Optimal Speed and Accuracy of Object Detec.pdf;/home/zenon/Zotero/storage/FPLD4HZH/2004.html} } @online{brown2020, @@ -298,6 +300,20 @@ keywords = {Explosions,Image databases,Image retrieval,Information retrieval,Internet,Large-scale systems,Multimedia databases,Ontologies,Robustness,Spine} } +@inproceedings{ding2021, + title = {{{RepVGG}}: {{Making VGG-style ConvNets Great Again}}}, + shorttitle = {{{RepVGG}}}, + booktitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + author = {Ding, Xiaohan and Zhang, Xiangyu and Ma, Ningning and Han, Jungong and Ding, Guiguang and Sun, Jian}, + date = {2021-06}, + pages = {13728--13737}, + issn = {2575-7075}, + doi = {10.1109/CVPR46437.2021.01352}, + abstract = {We present a simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3 × 3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. On ImageNet, RepVGG reaches over 80\% top-1 accuracy, which is the first time for a plain model, to the best of our knowledge. On NVIDIA 1080Ti GPU, RepVGG models run 83\% faster than ResNet-50 or 101\% faster than ResNet-101 with higher accuracy and show favorable accuracy-speed trade-off compared to the state-of-the-art models like EfficientNet and RegNet. 
The code and trained models are available at https://github.com/megvii-model/RepVGG.}, + eventtitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, + file = {/home/zenon/Zotero/storage/PIN4B83K/Ding et al. - 2021 - RepVGG Making VGG-style ConvNets Great Again.pdf;/home/zenon/Zotero/storage/IXAPBC22/9577516.html} +} + @inproceedings{erhan2014, title = {Scalable {{Object Detection Using Deep Neural Networks}}}, booktitle = {2014 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}}, @@ -433,6 +449,21 @@ file = {/home/zenon/Zotero/storage/B9KGZ7N2/Ge et al. - 2021 - YOLOX Exceeding YOLO Series in 2021.pdf;/home/zenon/Zotero/storage/XQTJLGLZ/2107.html} } +@online{gevorgyan2022, + title = {{{SIoU Loss}}: {{More Powerful Learning}} for {{Bounding Box Regression}}}, + shorttitle = {{{SIoU Loss}}}, + author = {Gevorgyan, Zhora}, + date = {2022-05-25}, + eprint = {2205.12740}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2205.12740}, + abstract = {The effectiveness of Object Detection, one of the central problems in computer vision tasks, highly depends on the definition of the loss function - a measure of how accurately your ML model can predict the expected outcome. Conventional object detection loss functions depend on aggregation of metrics of bounding box regression such as the distance, overlap area and aspect ratio of the predicted and ground truth boxes (i.e. GIoU, CIoU, ICIoU etc). However, none of the methods proposed and used to date considers the direction of the mismatch between the desired ground box and the predicted, "experimental" box. This shortage results in slower and less effective convergence as the predicted box can "wander around" during the training process and eventually end up producing a worse model. In this paper a new loss function SIoU was suggested, where penalty metrics were redefined considering the angle of the vector between the desired regression. Applied to conventional Neural Networks and datasets it is shown that SIoU improves both the speed of training and the accuracy of the inference. The effectiveness of the proposed loss function was revealed in a number of simulations and tests.}, + pubstate = {preprint}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,I.2,I.4}, + file = {/home/zenon/Zotero/storage/4T3GEYK9/Gevorgyan - 2022 - SIoU Loss More Powerful Learning for Bounding Box.pdf;/home/zenon/Zotero/storage/T7BRMTH2/2205.html} +} + @online{girshick, title = {Discriminatively {{Trained Deformable Part Models}} ({{Release}} 5)}, author = {Girshick, Ross B. and Felzenszwalb, Pedro F. and McAllester, David}, @@ -641,6 +672,15 @@ file = {/home/zenon/Zotero/storage/7SMC2DMX/Ioffe and Szegedy - 2015 - Batch Normalization Accelerating Deep Network Tra.pdf} } +@software{jocher2020, + title = {{{YOLOv5}} by {{Ultralytics}}}, + author = {Jocher, Glenn}, + date = {2020-05}, + doi = {10.5281/zenodo.3908559}, + abstract = {YOLOv5 🚀 in PyTorch {$>$} ONNX {$>$} CoreML {$>$} TFLite}, + version = {7.0} +} + @software{jocher2022, title = {Ultralytics/{{Yolov5}}: {{V7}}.0 - {{YOLOv5 SOTA Realtime Instance Segmentation}}}, shorttitle = {Ultralytics/{{Yolov5}}}, @@ -759,6 +799,21 @@ file = {/home/zenon/Zotero/storage/U7JKC8DW/Li et al. 
- 2022 - A Survey of Convolutional Neural Networks Analysi.pdf;/home/zenon/Zotero/storage/99TTKB2L/9451544.html} } +@online{li2022a, + title = {{{YOLOv6}}: {{A Single-Stage Object Detection Framework}} for {{Industrial Applications}}}, + shorttitle = {{{YOLOv6}}}, + author = {Li, Chuyi and Li, Lulu and Jiang, Hongliang and Weng, Kaiheng and Geng, Yifei and Li, Liang and Ke, Zaidan and Li, Qingyuan and Cheng, Meng and Nie, Weiqiang and Li, Yiduo and Zhang, Bo and Liang, Yufei and Zhou, Linyuan and Xu, Xiaoming and Chu, Xiangxiang and Wei, Xiaoming and Wei, Xiaolin}, + date = {2022-09-07}, + eprint = {2209.02976}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2209.02976}, + abstract = {For years, the YOLO series has been the de facto industry-level standard for efficient object detection. The YOLO community has prospered overwhelmingly to enrich its use in a multitude of hardware platforms and abundant scenarios. In this technical report, we strive to push its limits to the next level, stepping forward with an unwavering mindset for industry application. Considering the diverse requirements for speed and accuracy in the real environment, we extensively examine the up-to-date object detection advancements either from industry or academia. Specifically, we heavily assimilate ideas from recent network design, training strategies, testing techniques, quantization, and optimization methods. On top of this, we integrate our thoughts and practice to build a suite of deployment-ready networks at various scales to accommodate diversified use cases. With the generous permission of YOLO authors, we name it YOLOv6. We also express our warm welcome to users and contributors for further enhancement. For a glimpse of performance, our YOLOv6-N hits 35.9\% AP on the COCO dataset at a throughput of 1234 FPS on an NVIDIA Tesla T4 GPU. YOLOv6-S strikes 43.5\% AP at 495 FPS, outperforming other mainstream detectors at the same scale\textasciitilde (YOLOv5-S, YOLOX-S, and PPYOLOE-S). Our quantized version of YOLOv6-S even brings a new state-of-the-art 43.3\% AP at 869 FPS. Furthermore, YOLOv6-M/L also achieves better accuracy performance (i.e., 49.5\%/52.3\%) than other detectors with a similar inference speed. We carefully conducted experiments to validate the effectiveness of each component. Our code is made available at https://github.com/meituan/YOLOv6.}, + pubstate = {preprint}, + keywords = {Computer Science - Computer Vision and Pattern Recognition}, + file = {/home/zenon/Zotero/storage/QGN6UE37/Li et al. - 2022 - YOLOv6 A Single-Stage Object Detection Framework .pdf;/home/zenon/Zotero/storage/W78N7C5U/2209.html} +} + @online{lin2015, title = {Microsoft {{COCO}}: {{Common Objects}} in {{Context}}}, shorttitle = {Microsoft {{COCO}}}, @@ -830,6 +885,19 @@ file = {/home/zenon/Zotero/storage/LUL6FCIQ/Liu et al. - 2016 - SSD Single Shot MultiBox Detector.pdf} } +@inproceedings{liu2018, + title = {Path {{Aggregation Network}} for {{Instance Segmentation}}}, + booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}}, + author = {Liu, Shu and Qi, Lu and Qin, Haifang and Shi, Jianping and Jia, Jiaya}, + date = {2018-06}, + pages = {8759--8768}, + issn = {2575-7075}, + doi = {10.1109/CVPR.2018.00913}, + abstract = {The way that information propagates in neural networks is of great importance. In this paper, we propose Path Aggregation Network (PANet) aiming at boosting information flow in proposal-based instance segmentation framework. 
Specifically, we enhance the entire feature hierarchy with accurate localization signals in lower layers by bottom-up path augmentation, which shortens the information path between lower layers and topmost feature. We present adaptive feature pooling, which links feature grid and all feature levels to make useful information in each level propagate directly to following proposal subnetworks. A complementary branch capturing different views for each proposal is created to further improve mask prediction. These improvements are simple to implement, with subtle extra computational overhead. Yet they are useful and make our PANet reach the 1st place in the COCO 2017 Challenge Instance Segmentation task and the 2nd place in Object Detection task without large-batch training. PANet is also state-of-the-art on MVD and Cityscapes.},
+  eventtitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
+  file = {/home/zenon/Zotero/storage/XCG9L37N/Liu et al. - 2018 - Path Aggregation Network for Instance Segmentation.pdf;/home/zenon/Zotero/storage/ZAJWVMP2/8579011.html}
+}
+
 @article{lopez-garcia2022,
   title = {Machine {{Learning-Based Processing}} of {{Multispectral}} and {{RGB UAV Imagery}} for the {{Multitemporal Monitoring}} of {{Vineyard Water Status}}},
   author = {López-García, Patricia and Intrigliolo, Diego and Moreno, Miguel A. and Martínez-Moreno, Alejandro and Ortega, José Fernando and Pérez-Álvarez, Eva Pilar and Ballesteros, Rocío},
@@ -936,6 +1004,21 @@
   file = {/home/zenon/Zotero/storage/XZVYT2SM/PerceptronsAn-Introduction-to-Computational.html}
 }
 
+@online{misra2020,
+  title = {Mish: {{A Self Regularized Non-Monotonic Activation Function}}},
+  shorttitle = {Mish},
+  author = {Misra, Diganta},
+  date = {2020-08-13},
+  eprint = {1908.08681},
+  eprinttype = {arxiv},
+  eprintclass = {cs, stat},
+  doi = {10.48550/arXiv.1908.08681},
+  abstract = {We propose Mish, a novel self-regularized non-monotonic activation function which can be mathematically defined as: f(x) = x tanh(softplus(x)). As activation functions play a crucial role in the performance and training dynamics in neural networks, we validated experimentally on several well-known benchmarks against the best combinations of architectures and activation functions. We also observe that data augmentation techniques have a favorable effect on benchmarks like ImageNet-1k and MS-COCO across multiple architectures. For example, Mish outperformed Leaky ReLU on YOLOv4 with a CSP-DarkNet-53 backbone on average precision (AP50 val) by 2.1\% in MS-COCO object detection and ReLU on ResNet-50 on ImageNet-1k in Top-1 accuracy by approximately 1\% while keeping all other network parameters and hyperparameters constant. Furthermore, we explore the mathematical formulation of Mish in relation with the Swish family of functions and propose an intuitive understanding on how the first derivative behavior may be acting as a regularizer helping the optimization of deep neural networks.
Code is publicly available at https://github.com/digantamisra98/Mish.},
+  pubstate = {preprint},
+  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
+  file = {/home/zenon/Zotero/storage/GWSBGS5H/Misra - 2020 - Mish A Self Regularized Non-Monotonic Activation .pdf;/home/zenon/Zotero/storage/Z4N3DK8U/1908.html}
+}
+
 @book{mitchell1997a,
   title = {Machine {{Learning}}},
   author = {Mitchell, Thomas M.},
@@ -1110,6 +1193,20 @@
   file = {/home/zenon/Zotero/storage/NBA8U8VS/Ren et al. - 2017 - Faster R-CNN Towards Real-Time Object Detection w.pdf;/home/zenon/Zotero/storage/FJKQTY4F/7485869.html}
 }
 
+@inproceedings{rezatofighi2019,
+  title = {Generalized {{Intersection Over Union}}: {{A Metric}} and a {{Loss}} for {{Bounding Box Regression}}},
+  shorttitle = {Generalized {{Intersection Over Union}}},
+  booktitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  author = {Rezatofighi, Hamid and Tsoi, Nathan and Gwak, JunYoung and Sadeghian, Amir and Reid, Ian and Savarese, Silvio},
+  date = {2019-06},
+  pages = {658--666},
+  issn = {2575-7075},
+  doi = {10.1109/CVPR.2019.00075},
+  abstract = {Intersection over Union (IoU) is the most popular evaluation metric used in the object detection benchmarks. However, there is a gap between optimizing the commonly used distance losses for regressing the parameters of a bounding box and maximizing this metric value. The optimal objective for a metric is the metric itself. In the case of axis-aligned 2D bounding boxes, it can be shown that IoU can be directly used as a regression loss. However, IoU has a plateau making it infeasible to optimize in the case of non-overlapping bounding boxes. In this paper, we address this weakness by introducing a generalized version of IoU as both a new loss and a new metric. By incorporating this generalized IoU (GIoU) as a loss into the state-of-the-art object detection frameworks, we show a consistent improvement on their performance using both the standard, IoU based, and new, GIoU based, performance measures on popular object detection benchmarks such as PASCAL VOC and MS COCO.},
+  eventtitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  file = {/home/zenon/Zotero/storage/9AYQXVU3/Rezatofighi et al. - 2019 - Generalized Intersection Over Union A Metric and .pdf;/home/zenon/Zotero/storage/Q9STKHTG/8953982.html}
+}
+
 @article{rico-chavez2022,
   title = {Machine {{Learning}} for {{Plant Stress Modeling}}: {{A Perspective}} towards {{Hormesis Management}}},
   shorttitle = {Machine {{Learning}} for {{Plant Stress Modeling}}},
@@ -1245,6 +1342,18 @@
   file = {/home/zenon/Zotero/storage/QC22JBMX/Selvaraju et al. - 2020 - Grad-CAM Visual Explanations from Deep Networks v.pdf}
 }
 
+@inproceedings{shen2016,
+  title = {Weighted Residuals for Very Deep Networks},
+  booktitle = {2016 3rd {{International Conference}} on {{Systems}} and {{Informatics}} ({{ICSAI}})},
+  author = {Shen, Falong and Gan, Rui and Zeng, Gang},
+  date = {2016-11},
+  pages = {936--941},
+  doi = {10.1109/ICSAI.2016.7811085},
+  abstract = {Deep residual networks have recently shown appealing performance on many challenging computer vision tasks. However, the original residual structure still has some defects making it difficult to converge on very deep networks.
In this paper, we introduce a weighted residual network to address the incompatibility between ReLU and element-wise addition and the deep network initialization problem. The weighted residual network is able to learn to combine residuals from different layers effectively and efficiently. The proposed models enjoy a consistent improvement over accuracy and convergence with increasing depths from 100+ layers to 1000+ layers. Besides, the weighted residual networks have little more computation and GPU memory burden than the original residual networks. The networks are optimized by projected stochastic gradient descent. Experiments on CIFAR-10 have shown that our algorithm has a faster convergence speed than the original residual networks and reaches a high accuracy at 95.3\% with a 1192-layer model. Experiments on CIFAR-100 and ImageNet-1k have also verified the effectiveness of our proposed design.}, + eventtitle = {2016 3rd {{International Conference}} on {{Systems}} and {{Informatics}} ({{ICSAI}})}, + file = {/home/zenon/Zotero/storage/8NK8ULV5/Shen et al. - 2016 - Weighted residuals for very deep networks.pdf;/home/zenon/Zotero/storage/ZPRJ5TZ2/7811085.html} +} + @inproceedings{shrivastava2016, title = {Training {{Region-Based Object Detectors}} with {{Online Hard Example Mining}}}, booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, @@ -1475,6 +1584,54 @@ file = {/home/zenon/Zotero/storage/G27M4VFA/Wang et al. - 2022 - YOLOv7 Trainable Bag-of-Freebies Sets New State-o.pdf} } +@online{wang2022a, + title = {Designing {{Network Design Strategies Through Gradient Path Analysis}}}, + author = {Wang, Chien-Yao and Liao, Hong-Yuan Mark and Yeh, I.-Hau}, + date = {2022-11-09}, + eprint = {2211.04800}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2211.04800}, + abstract = {Designing a high-efficiency and high-quality expressive network architecture has always been the most important research topic in the field of deep learning. Most of today's network design strategies focus on how to integrate features extracted from different layers, and how to design computing units to effectively extract these features, thereby enhancing the expressiveness of the network. This paper proposes a new network design strategy, i.e., to design the network architecture based on gradient path analysis. On the whole, most of today's mainstream network design strategies are based on feed forward path, that is, the network architecture is designed based on the data path. In this paper, we hope to enhance the expressive ability of the trained model by improving the network learning ability. Due to the mechanism driving the network parameter learning is the backward propagation algorithm, we design network design strategies based on back propagation path. We propose the gradient path design strategies for the layer-level, the stage-level, and the network-level, and the design strategies are proved to be superior and feasible from theoretical analysis and experiments.}, + pubstate = {preprint}, + keywords = {Computer Science - Computer Vision and Pattern Recognition}, + file = {/home/zenon/Zotero/storage/TFTTKYRE/Wang et al. 
- 2022 - Designing Network Design Strategies Through Gradie.pdf;/home/zenon/Zotero/storage/5I43K6ZQ/2211.html} +} + +@online{wang2022b, + title = {{{YOLOv7}}: {{Trainable}} Bag-of-Freebies Sets New State-of-the-Art for Real-Time Object Detectors}, + shorttitle = {{{YOLOv7}}}, + author = {Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark}, + date = {2022-07-06}, + eprint = {2207.02696}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2207.02696}, + abstract = {YOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8\% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9\% AP) outperforms both transformer-based detector SWIN-L Cascade-Mask R-CNN (9.2 FPS A100, 53.9\% AP) by 509\% in speed and 2\% in accuracy, and convolutional-based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6 FPS A100, 55.2\% AP) by 551\% in speed and 0.7\% AP in accuracy, as well as YOLOv7 outperforms: YOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable DETR, DINO-5scale-R50, ViT-Adapter-B and many other object detectors in speed and accuracy. Moreover, we train YOLOv7 only on MS COCO dataset from scratch without using any other datasets or pre-trained weights. Source code is released in https://github.com/WongKinYiu/yolov7.}, + pubstate = {preprint}, + keywords = {Computer Science - Computer Vision and Pattern Recognition}, + file = {/home/zenon/Zotero/storage/TF6HV2VR/Wang et al. - 2022 - YOLOv7 Trainable bag-of-freebies sets new state-o.pdf;/home/zenon/Zotero/storage/4H69GZYY/2207.html} +} + +@inproceedings{woo2018, + title = {{{CBAM}}: {{Convolutional Block Attention Module}}}, + shorttitle = {{{CBAM}}}, + booktitle = {Computer {{Vision}} – {{ECCV}} 2018}, + author = {Woo, Sanghyun and Park, Jongchan and Lee, Joon-Young and Kweon, In So}, + editor = {Ferrari, Vittorio and Hebert, Martial and Sminchisescu, Cristian and Weiss, Yair}, + date = {2018}, + series = {Lecture {{Notes}} in {{Computer Science}}}, + pages = {3--19}, + publisher = {{Springer International Publishing}}, + location = {{Cham}}, + doi = {10.1007/978-3-030-01234-2_1}, + abstract = {We propose Convolutional Block Attention Module (CBAM), a simple yet effective attention module for feed-forward convolutional neural networks. Given an intermediate feature map, our module sequentially infers attention maps along two separate dimensions, channel and spatial, then the attention maps are multiplied to the input feature map for adaptive feature refinement. Because CBAM is a lightweight and general module, it can be integrated into any CNN architectures seamlessly with negligible overheads and is end-to-end trainable along with base CNNs. We validate our CBAM through extensive experiments on ImageNet-1K, MS~COCO detection, and VOC~2007 detection datasets. Our experiments show consistent improvements in classification and detection performances with various models, demonstrating the wide applicability of CBAM. The code and models will be publicly available.}, + isbn = {978-3-030-01234-2}, + langid = {english}, + keywords = {Attention mechanism,Gated convolution,Object recognition}, + file = {/home/zenon/Zotero/storage/3LZKK3GI/Woo et al. 
- 2018 - CBAM Convolutional Block Attention Module.pdf}
+}
+
 @article{yang2020,
   title = {On hyperparameter optimization of machine learning algorithms: {{Theory}} and practice},
   shorttitle = {On hyperparameter optimization of machine learning algorithms},
@@ -1509,19 +1666,35 @@
   file = {/home/zenon/Zotero/storage/5LSEHN2T/Zeiler and Fergus - 2014 - Visualizing and Understanding Convolutional Networ.pdf}
 }
 
-@online{zheng2019a,
+@inproceedings{zhang2021,
+  title = {{{VarifocalNet}}: {{An IoU-aware Dense Object Detector}}},
+  shorttitle = {{{VarifocalNet}}},
+  booktitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  author = {Zhang, Haoyang and Wang, Ying and Dayoub, Feras and Sünderhauf, Niko},
+  date = {2021-06},
+  pages = {8510--8519},
+  issn = {2575-7075},
+  doi = {10.1109/CVPR46437.2021.00841},
+  abstract = {Accurately ranking the vast number of candidate detections is crucial for dense object detectors to achieve high performance. Prior work uses the classification score or a combination of classification and predicted localization scores to rank candidates. However, neither option results in a reliable ranking, thus degrading detection performance. In this paper, we propose to learn an IoU-aware Classification Score (IACS) as a joint representation of object presence confidence and localization accuracy. We show that dense object detectors can achieve a more accurate ranking of candidate detections based on the IACS. We design a new loss function, named Varifocal Loss, to train a dense object detector to predict the IACS, and propose a new star-shaped bounding box feature representation for IACS prediction and bounding box refinement. Combining these two new components and a bounding box refinement branch, we build an IoU-aware dense object detector based on the FCOS+ATSS architecture, that we call VarifocalNet or VFNet for short. Extensive experiments on MS COCO show that our VFNet consistently surpasses the strong baseline by 2.0 AP with different backbones. Our best model VFNet-X-1200 with Res2Net-101-DCN achieves a single-model single-scale AP of 55.1 on COCO test-dev, which is state-of-the-art among various object detectors. Code is available at: https://github.com/hyz-xmaster/VarifocalNet.},
+  eventtitle = {2021 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  file = {/home/zenon/Zotero/storage/MNHCWJDV/Zhang et al. - 2021 - VarifocalNet An IoU-aware Dense Object Detector.pdf;/home/zenon/Zotero/storage/T89I8NJC/9578034.html}
+}
+
+@article{zheng2020,
   title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
   shorttitle = {Distance-{{IoU Loss}}},
   author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
-  date = {2019-11-19},
-  eprint = {1911.08287},
-  eprinttype = {arxiv},
-  eprintclass = {cs},
-  doi = {10.48550/arXiv.1911.08287},
-  abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while \$\textbackslash ell\_n\$-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression.
In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, \textbackslash ie, overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster RCNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.}, - pubstate = {preprint}, - keywords = {Computer Science - Computer Vision and Pattern Recognition}, - file = {/home/zenon/Zotero/storage/4666A7J8/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf;/home/zenon/Zotero/storage/V85B5F3R/1911.html} + date = {2020-04-03}, + journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, + volume = {34}, + number = {07}, + pages = {12993--13000}, + issn = {2374-3468}, + doi = {10.1609/aaai.v34i07.6999}, + abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while ℓn-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression. In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, i.e., overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster R-CNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.}, + issue = {07}, + langid = {english}, + file = {/home/zenon/Zotero/storage/5IUI6GPY/Zheng et al. 
- 2020 - Distance-IoU Loss Faster and Better Learning for .pdf}
}

@article{zhong2022,
{$<$}/details{$>$}}}}}}}}}},
+  langid = {english},
+  file = {/home/zenon/Zotero/storage/IWYM45AP/7347926.html}
+}
+
 @article{zou2023,
   title = {Object {{Detection}} in 20 {{Years}}: {{A Survey}}},
   shorttitle = {Object {{Detection}} in 20 {{Years}}},
diff --git a/thesis/thesis.pdf b/thesis/thesis.pdf
index 0c03b74..2c171e9 100644
Binary files a/thesis/thesis.pdf and b/thesis/thesis.pdf differ
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index 7154090..9b8b18c 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -132,6 +132,14 @@ Challenge}
 \newacronym{bn}{BN}{Batch Normalization}
 \newacronym{uav}{UAV}{Unmanned Aerial Vehicle}
 \newacronym{csi}{CSI}{Camera Serial Interface}
+\newacronym{nms}{NMS}{Non-Maximum Suppression}
+\newacronym{sam}{SAM}{Spatial Attention Module}
+\newacronym{panet}{PANet}{Path Aggregation Network}
+\newacronym{ciou}{CIoU}{Complete Intersection over Union}
+\newacronym{siou}{SIoU}{Scylla Intersection over Union}
+\newacronym{giou}{GIoU}{Generalized Intersection over Union}
+\newacronym{elan}{ELAN}{Efficient Layer Aggregation Network}
+\newacronym{eelan}{E-ELAN}{Extended Efficient Layer Aggregation Network}

 \begin{document}

@@ -2084,20 +2092,18 @@ models.
 \section{Selected Methods}
 \label{sec:selected-methods}

-Estimated 7 pages for this section.
+In the following sections we describe in detail the two architectures
+selected for our prototype. The object detector we
+chose---\gls{yolo}v7---is part of a larger family of models which all
+function similarly but have undergone substantial changes from
+version to version. To put the chosen model into context, we trace
+the improvements to the \gls{yolo} family from version one to version
+seven. For the classification stage, we opted for a ResNet
+architecture, which is likewise described in detail.

 \subsection{You Only Look Once}
 \label{sec:methods-detection}

-Describe the inner workings of the YOLOv7 model structure and contrast
-it with previous versions as well as other object detectors. What has
-changed and how did these improvements manifest themselves? Reference
-the original paper~\cite{wang2022} and papers of previous versions of
-the same model (YOLOv5~\cite{jocher2022},
-YOLOv4~\cite{bochkovskiy2020}).
-
-Estimated 2 pages for this section.
-
 The \gls{yolo} family of object detection models started in 2015 when
 \cite{redmon2016} published the first version. Since then there have
 been up to 16 updated versions depending on how one counts. The
@@ -2205,16 +2211,130 @@ the \gls{voc} 2007 data set compared to 63.4\% of the previous
 at \qty{40}{fps} (\gls{map} 78.6\%) and up to \qty{91}{fps} (\gls{map}
 69\%).

+\subsubsection{\gls{yolo}v3}
+\label{sssec:yolov3}
+
+\gls{yolo}v3 \cite{redmon2018} provided further updates to the
+\gls{yolo}v2 model. To remain competitive with the deeper network
+structures of state-of-the-art models at the time, the authors
+introduce a deeper feature extractor called Darknet-53. It makes use
+of the residual connections popularized by ResNet \cite{he2016} (see
+section~\ref{sssec:theory-resnet}). Darknet-53 is more accurate than
+Darknet-19 and on par with ResNet-101, while processing more images
+per second (\qty{78}{fps} versus \qty{53}{fps}). The activation
+function throughout the network is still leaky \gls{relu}, as in
+earlier versions.
+
+\gls{yolo}v3 uses multi-scale predictions to achieve better detection
+performance across object sizes. Inspired by \glspl{fpn} (see
+section~\ref{sssec:theory-fpn}), \gls{yolo}v3 takes predictions at
+different scales from the feature extractor and combines them to form
+a final prediction. Combining the features from multiple scales is
+often done in the \emph{neck} of the object detection architecture.
+
+Around the time of the publication of \gls{yolo}v3, researchers
+started to use the terms \emph{backbone}, \emph{neck}, and
+\emph{head} to describe the architecture of object detection
+models. The feature extractor (Darknet-53 in this case) is the
+\emph{backbone} and provides the feature maps, which are aggregated
+in the \emph{neck} and passed to the \emph{head}, which outputs the
+final predictions. In some cases there are additional postprocessing
+steps in the head, such as \gls{nms}, to eliminate duplicate or
+suboptimal detections.
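+
+As a concrete illustration of this postprocessing step, the following
+listing sketches a minimal, class-agnostic greedy \gls{nms} in
+Python. It is a simplified sketch for illustration only; the function
+names and the default threshold are ours and are not taken from any
+of the cited implementations.
+
+\begin{verbatim}
+import numpy as np
+
+def iou(box, boxes):
+    # IoU of one [x1, y1, x2, y2] box with each row of `boxes`.
+    x1 = np.maximum(box[0], boxes[:, 0])
+    y1 = np.maximum(box[1], boxes[:, 1])
+    x2 = np.minimum(box[2], boxes[:, 2])
+    y2 = np.minimum(box[3], boxes[:, 3])
+    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
+    area = (box[2] - box[0]) * (box[3] - box[1])
+    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    return inter / (area + areas - inter)
+
+def nms(boxes, scores, iou_threshold=0.45):
+    # Greedily keep the highest-scoring box, then drop all remaining
+    # boxes that overlap it by more than the threshold.
+    order = np.argsort(scores)[::-1]
+    keep = []
+    while order.size > 0:
+        best = order[0]
+        keep.append(best)
+        rest = order[1:]
+        order = rest[iou(boxes[best], boxes[rest]) <= iou_threshold]
+    return keep
+\end{verbatim}
+
+Detectors typically apply this procedure per class; highly optimized
+\gls{gpu} variants exist, but the greedy principle is the same.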
+
+While \gls{yolo}v2 had problems detecting small objects, \gls{yolo}v3
+performs much better on them (\gls{ap} of 18.3\% versus 5\% on
+\gls{coco}). The authors note, however, that the new model sometimes
+performs comparatively worse on larger objects. The reasons for this
+behavior are unknown. Additionally, \gls{yolo}v3 still lags behind
+other detectors when it comes to accurately localizing objects. The
+\gls{coco} evaluation metric changed from the earlier
+\gls{ap}$_{0.5}$ to the \gls{map} averaged over \gls{iou} thresholds
+from $0.5$ to $0.95$, which penalizes detectors that do not achieve
+near-perfect localization. This change highlights \gls{yolo}v3's
+weakness in that area.
+
+\subsubsection{\gls{yolo}v4}
+\label{sssec:yolov4}
+
+Keeping in line with the aim of carefully balancing accuracy and
+speed of detection, \textcite{bochkovskiy2020} publish the fourth
+version of \gls{yolo}. The authors investigate what they term
+\emph{bag of freebies}---methods which increase training cost while
+improving accuracy, without sacrificing inference speed. A prominent
+example of such methods is data augmentation (see
+section~\ref{sec:methods-augmentation}). Specifically, the authors
+propose mosaic augmentation, which lowers the need for large
+mini-batch sizes. They also adopt new features such as weighted
+residual connections \cite{shen2016}, a modified \gls{sam}
+\cite{woo2018}, a modified \gls{panet} \cite{liu2018} for the neck,
+\gls{ciou} loss \cite{zheng2020} for the detector, and the Mish
+activation function \cite{misra2020}.
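+
+For reference, the \gls{ciou} loss extends the plain \gls{iou} term
+with a center-distance penalty and an aspect-ratio consistency term
+(notation as in \cite{zheng2020}):
+
+\begin{equation}
+  \mathcal{L}_{\mathrm{CIoU}}
+  = 1 - \mathrm{IoU}
+  + \frac{\rho^2\bigl(\mathbf{b}, \mathbf{b}^{gt}\bigr)}{c^2}
+  + \alpha v,
+  \qquad
+  v = \frac{4}{\pi^2}
+  \left(\arctan\frac{w^{gt}}{h^{gt}} - \arctan\frac{w}{h}\right)^2,
+\end{equation}
+
+where $\mathbf{b}$ and $\mathbf{b}^{gt}$ are the centers of the
+predicted and ground-truth boxes, $\rho$ is the Euclidean distance,
+$c$ is the diagonal length of the smallest box enclosing both boxes,
+and $\alpha = v / \bigl((1 - \mathrm{IoU}) + v\bigr)$ weights the
+aspect-ratio term.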
+
+Taken together, these improvements yield a \gls{map} of 43.5\% on the
+\gls{coco} test set while maintaining a speed of above \qty{30}{fps}
+on modern \glspl{gpu}. \gls{yolo}v4 was the first version whose
+results across all object scales (S, M, L) were better than those of
+almost all other detectors at the time, without sacrificing speed.
+
+\subsubsection{\gls{yolo}v5}
+\label{sssec:yolov5}
+
+The author of \gls{yolo}v5 \cite{jocher2020} ported the code of
+\gls{yolo}v4 from the Darknet framework to PyTorch, which facilitates
+better interoperability with other Python utilities. New in this
+version is a pre-training routine called AutoAnchor, which adapts the
+anchor boxes to the data set at hand. This version also implements a
+genetic algorithm for hyperparameter optimization (see
+section~\ref{ssec:hypopt-evo}), which we use in our work as well.
+
+Version 5 comes in multiple architectures of varying complexity. The
+smallest---and therefore fastest---version is called \gls{yolo}v5n,
+where the \emph{n} stands for \emph{nano}. Additional versions with
+increasing parameter counts are \gls{yolo}v5s (small), \gls{yolo}v5m
+(medium), \gls{yolo}v5l (large), and \gls{yolo}v5x (extra large). The
+smaller models are intended for resource-constrained environments
+such as edge devices, but come at a cost in accuracy. Conversely, the
+larger models are for tasks where high accuracy is paramount and
+sufficient computational resources are available. The \gls{yolo}v5x
+model achieves a \gls{map} of 50.7\% on the \gls{coco} test data set.
+
+\subsubsection{\gls{yolo}v6}
+\label{sssec:yolov6}
+
+The authors of \gls{yolo}v6 \cite{li2022a} use a new backbone based
+on RepVGG \cite{ding2021} which they call EfficientRep. They also use
+separate losses for classification (Varifocal loss \cite{zhang2021})
+and bounding box regression (\gls{siou} \cite{gevorgyan2022} or
+\gls{giou} \cite{rezatofighi2019}). \gls{yolo}v6 is available in
+eight scaled versions, of which the largest achieves a \gls{map} of
+57.2\% on the \gls{coco} test set.
+
+\subsubsection{\gls{yolo}v7}
+\label{sssec:yolov7}
+
+At the time we implemented our own plant detector, \gls{yolo}v7
+\cite{wang2022b} was the newest version within the \gls{yolo}
+family. Similarly to \gls{yolo}v4, it introduces further trainable
+bag-of-freebies methods which do not impact inference time. The
+improvements include the use of \glspl{eelan} (based on \glspl{elan}
+\cite{wang2022a}), joint depth and width model scaling, module-level
+re-parameterization, and an auxiliary head---similar to GoogLeNet
+(see section~\ref{sssec:theory-googlenet})---which assists during
+training. The model does not use a pretrained backbone; instead, it
+is trained from scratch on the \gls{coco} data set. These changes
+result in much smaller model sizes compared to \gls{yolo}v4 and a
+\gls{map} of 56.8\% at a detection speed of over \qty{30}{fps}.
+
+We use \gls{yolo}v7 for the plant detection stage because it was the
+fastest and most accurate object detector at the time of
+implementation.
+
 \subsection{ResNet}
 \label{sec:methods-classification}

-Introduce the approach of the \emph{ResNet} networks which implement
-residual connections to allow deeper layers. Describe the inner
-workings of the ResNet model structure. Reference the original
-paper~\cite{he2016}.
-
-Estimated 2 pages for this section.
-
 Early research \cite{bengio1994,glorot2010} already demonstrated that
 the vanishing/exploding gradient problem with standard gradient
 descent and random initialization adversely affects convergence during
@@ -3099,8 +3219,8 @@ Estimated 1 page for this section
 \listoftables % Starred version, i.e., \listoftables*, removes the toc entry.

 % Use an optional list of algorithms.
-\listofalgorithms
-\addcontentsline{toc}{chapter}{List of Algorithms}
+% \listofalgorithms
+% \addcontentsline{toc}{chapter}{List of Algorithms}

 % Add an index.
 \printindex
@@ -3117,18 +3237,4 @@ Estimated 1 page for this section
 %%% mode: latex
 %%% TeX-master: "thesis"
 %%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
-%%% TeX-master: t
 %%% End: