Add softmax and section about loss functions

This commit is contained in:
Tobias Eidelpes 2023-09-28 17:54:45 +02:00
parent 419d8da5d6
commit 2ccc90a90b
3 changed files with 160 additions and 4 deletions

View File

@ -273,6 +273,38 @@
file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html} file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
} }
@online{ge2021,
title = {{{OTA}}: {{Optimal Transport Assignment}} for {{Object Detection}}},
shorttitle = {{{OTA}}},
author = {Ge, Zheng and Liu, Songtao and Li, Zeming and Yoshie, Osamu and Sun, Jian},
date = {2021-03-26},
eprint = {2103.14259},
eprinttype = {arxiv},
eprintclass = {cs.CV},
doi = {10.48550/arXiv.2103.14259},
urldate = {2023-09-28},
abstract = {Recent advances in label assignment in object detection mainly seek to independently define positive/negative training samples for each ground-truth (gt) object. In this paper, we innovatively revisit the label assignment from a global perspective and propose to formulate the assigning procedure as an Optimal Transport (OT) problem -- a well-studied topic in Optimization Theory. Concretely, we define the unit transportation cost between each demander (anchor) and supplier (gt) pair as the weighted summation of their classification and regression losses. After formulation, finding the best assignment solution is converted to solve the optimal transport plan at minimal transportation costs, which can be solved via Sinkhorn-Knopp Iteration. On COCO, a single FCOS-ResNet-50 detector equipped with Optimal Transport Assignment (OTA) can reach 40.7\% mAP under 1X scheduler, outperforming all other existing assigning methods. Extensive experiments conducted on COCO and CrowdHuman further validate the effectiveness of our proposed OTA, especially its superiority in crowd scenarios. The code is available at https://github.com/Megvii-BaseDetection/OTA.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/LE4HHL9X/Ge et al. - 2021 - OTA Optimal Transport Assignment for Object Detec.pdf;/home/zenon/Zotero/storage/MCF86ZKV/2103.html}
}
@online{ge2021a,
title = {{{YOLOX}}: {{Exceeding YOLO Series}} in 2021},
shorttitle = {{{YOLOX}}},
author = {Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
date = {2021-08-05},
eprint = {2107.08430},
eprinttype = {arxiv},
eprintclass = {cs.CV},
doi = {10.48550/arXiv.2107.08430},
urldate = {2023-09-28},
abstract = {In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3\% AP on COCO, surpassing NanoDet by 1.8\% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3\% AP on COCO, outperforming the current best practice by 3.0\% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0\% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8\% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. Source code is at https://github.com/Megvii-BaseDetection/YOLOX.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/B9KGZ7N2/Ge et al. - 2021 - YOLOX Exceeding YOLO Series in 2021.pdf;/home/zenon/Zotero/storage/XQTJLGLZ/2107.html}
}
@inproceedings{girshick2015, @inproceedings{girshick2015,
title = {Deformable Part Models Are Convolutional Neural Networks}, title = {Deformable Part Models Are Convolutional Neural Networks},
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})}, booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@ -404,6 +436,21 @@
file = {/home/zenon/Zotero/storage/ZMCI6A8T/Lin et al. - 2015 - Microsoft COCO Common Objects in Context.pdf} file = {/home/zenon/Zotero/storage/ZMCI6A8T/Lin et al. - 2015 - Microsoft COCO Common Objects in Context.pdf}
} }
@online{lin2017,
title = {Feature {{Pyramid Networks}} for {{Object Detection}}},
author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
date = {2017-04-19},
eprint = {1612.03144},
eprinttype = {arxiv},
eprintclass = {cs.CV},
doi = {10.48550/arXiv.1612.03144},
urldate = {2023-09-28},
abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But recent deep learning object detectors have avoided pyramid representations, in part because they are compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using FPN in a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/8BBA7R4F/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/KUPLTHRQ/1612.html}
}
@incollection{liu2016, @incollection{liu2016,
title = {{{SSD}}: {{Single Shot MultiBox Detector}}}, title = {{{SSD}}: {{Single Shot MultiBox Detector}}},
shorttitle = {{{SSD}}}, shorttitle = {{{SSD}}},
@ -739,6 +786,22 @@
file = {/home/zenon/Zotero/storage/A7KFIFE2/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf} file = {/home/zenon/Zotero/storage/A7KFIFE2/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf}
} }
@online{zheng2019a,
title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
shorttitle = {Distance-{{IoU Loss}}},
author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
date = {2019-11-19},
eprint = {1911.08287},
eprinttype = {arxiv},
eprintclass = {cs.CV},
doi = {10.48550/arXiv.1911.08287},
urldate = {2023-09-28},
abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while $\ell_n$-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression. In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, i.e., overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster RCNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.},
pubstate = {preprint},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/home/zenon/Zotero/storage/4666A7J8/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf;/home/zenon/Zotero/storage/V85B5F3R/1911.html}
}
@article{zhong2022, @article{zhong2022,
title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}}, title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}},
author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei}, author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei},

Binary file not shown.

View File

@ -103,6 +103,7 @@
\newacronym{relu}{ReLU}{Rectified Linear Unit} \newacronym{relu}{ReLU}{Rectified Linear Unit}
\newacronym{elu}{ELU}{Exponential Linear Unit} \newacronym{elu}{ELU}{Exponential Linear Unit}
\newacronym{silu}{SiLU}{Sigmoid Linear Unit} \newacronym{silu}{SiLU}{Sigmoid Linear Unit}
\newacronym{mse}{MSE}{mean squared error}
\begin{document} \begin{document}
@ -611,7 +612,10 @@ function \cite{minsky2017}. Non-linear functions, however, are a
requirement for neural networks to become \emph{universal requirement for neural networks to become \emph{universal
approximators} \cite{hornik1989}. We will introduce several activation approximators} \cite{hornik1989}. We will introduce several activation
functions which are used in the field of machine learning in the functions which are used in the field of machine learning in the
following sections. following sections. There exist many more than can be discussed within
the scope of this thesis. However, the selection should, in the
author's opinion, give an overview of the most widely used and
influential ones.
\subsubsection{Identity} \subsubsection{Identity}
\label{sssec:theory-identity} \label{sssec:theory-identity}
@ -738,9 +742,100 @@ function has become an essential tool for deep learning practitioners
and has contributed to the success of many state-of-the-art models in and has contributed to the success of many state-of-the-art models in
computer vision, natural language processing, and other domains. computer vision, natural language processing, and other domains.
\subsubsection{Softmax}
\label{sssec:theory-softmax}
The softmax activation function is often used as the last activation
function of a neural network to normalize the output of a network to a
probability distribution over predicted output classes. It takes a
vector of numbers, known as logits, and scales them into
probabilities. The output of the softmax function is a vector with
probabilities of each possible outcome, and the probabilities in the
vector sum to one for all possible outcomes or classes. In
mathematical terms, the function is defined as
\begin{equation}
\label{eq:softmax}
\sigma(\vec{z})_{i} = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}}\ \mathrm{for}\ i = 1,\dots,K\ \mathrm{and}\ \vec{z} = (z_1,\dots,z_K)\in\mathbb{R}^K
\end{equation}
where the standard exponential function is applied to each value in
the vector $\vec{z}$ and the result is normalized with the sum of the
exponentials.
\subsection{Loss Function} \subsection{Loss Function}
\label{ssec:theory-loss-function} \label{ssec:theory-loss-function}
Loss functions play a fundamental role in machine learning, as they
are used to evaluate the performance of a model and guide its
training. The choice of loss function can significantly impact the
accuracy and generalization of the model. There are various types of
loss functions, each with its strengths and weaknesses, and the
appropriate choice depends on the specific problem being addressed.
Following the definition of a learning program in
section~\ref{sec:theory-ml}, loss functions constitute the performance
measure $P$ against which the results of the learning program are
measured. Only by minimizing the error obtained from the loss function
and updating the weights within the network is it possible to gain
experience $E$ at carrying out a task $T$. How the weights are updated
depends on the algorithm which is used during the \emph{backward pass}
to minimize the error. This type of procedure is referred to as
\emph{backpropagation} (see
section~\ref{ssec:theory-backpropagation}).
One common type of loss function is the \gls{mse} which is widely used
in regression problems. The \gls{mse} is a popular choice because it
is easy to compute, differentiable everywhere and---for linear
models---admits a closed-form solution, making it efficient to
optimize. It does have some limitations, however. For instance, it
is sensitive to outliers, and it may not be appropriate for problems
with non-normal distributions. \gls{mse} measures the average squared
difference between predicted and actual values. It is calculated with
\begin{equation}
\label{eq:mse}
\mathrm{MSE_{test}} = \frac{1}{m}\sum_i(\hat{y}^{(\mathrm{test})} - y^{(\mathrm{test})})_i^2
\end{equation}
where $m$ is the number of samples in the test set,
$\hat{y}^{(\mathrm{test})}$ contains the predictions of the model on
the test set and $y^{(\mathrm{test})}$ refers to the target labels
\cite{goodfellow2016}. It follows that, if
$\hat{y}^{(\mathrm{test})} = y^{(\mathrm{test})}$, the error is $0$
and the model has produced a perfect prediction.
We cannot, however, take the results of the error on the test set to
update the weights during training because the test set must always
contain only samples which the model has not seen before. If the model
is trained to minimize the \gls{mse} on the test set and then
evaluated against the same set, the results will reflect how well the
model fits the test set rather than how well it generalizes. The goal,
therefore, is to minimize the error on the training set and to compare
the results against an evaluation on the test set. If the model
achieves very low error rates on the training set but not on the test
set, it is likely that the model is suffering from
\emph{overfitting}. Conversely, if the model does not achieve low
error rates on the training set, it is likely that the model is
suffering from \emph{underfitting}.
Another popular loss function is the cross-entropy loss, which is
commonly used in classification problems. Cross-entropy loss measures
the difference between the predicted class probabilities and the
actual labels. It is a good choice for classification problems because
it heavily penalizes confident but wrong predictions and applies to
binary as well as multi-class problems when combined with a sigmoid or
softmax output, respectively. However, cross-entropy loss has its own
limitations. For instance, it does not account for class imbalance on
its own---weighted variants are needed for that---and it can be
sensitive to mislabeled samples.
In recent years, there has been an increasing interest in using
alternative loss functions that can better handle complex
problems. For example, the Huber loss is a modification of the
\gls{mse} loss that is more robust to outliers because it is quadratic
for small errors and linear for large ones. The closely related Smooth
L1 loss is likewise less sensitive to outliers and can handle
non-normal distributions. These alternative loss functions have been
shown to be effective in various applications such as image
classification, object detection, and speech recognition.
\subsection{Backpropagation} \subsection{Backpropagation}
\label{ssec:theory-backpropagation} \label{ssec:theory-backpropagation}
@ -772,9 +867,6 @@ gives an introduction to object detection, explains common problems
researchers have faced and how they have been solved, and discusses researchers have faced and how they have been solved, and discusses
the two main approaches to object detection via deep learning. the two main approaches to object detection via deep learning.
\subsection{Definition}
\label{ssec:obj-definition}
\subsection{Traditional Methods} \subsection{Traditional Methods}
\label{ssec:obj-traditional} \label{ssec:obj-traditional}
@ -2001,4 +2093,5 @@ Estimated 1 page for this section
%%% TeX-master: t %%% TeX-master: t
%%% TeX-master: t %%% TeX-master: t
%%% TeX-master: t %%% TeX-master: t
%%% TeX-master: t
%%% End: %%% End: