Add softmax and section about loss functions

2023-09-28 17:54:45 +02:00 · 2023-09-28 17:54:45 +02:00 · 2ccc90a90b
commit 2ccc90a90b
parent 419d8da5d6
3 changed files with 160 additions and 4 deletions
--- a/thesis/references.bib
+++ b/thesis/references.bib
@ -273,6 +273,38 @@
  file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
 }
@online{ge2021,
  title = {{{OTA}}: {{Optimal Transport Assignment}} for {{Object Detection}}},
  shorttitle = {{{OTA}}},
  author = {Ge, Zheng and Liu, Songtao and Li, Zeming and Yoshie, Osamu and Sun, Jian},
  date = {2021-03-26},
  eprint = {2103.14259},
  eprinttype = {arxiv},
  eprintclass = {cs},
  doi = {10.48550/arXiv.2103.14259},
  urldate = {2023-09-28},
  abstract = {Recent advances in label assignment in object detection mainly seek to independently define positive/negative training samples for each ground-truth (gt) object. In this paper, we innovatively revisit the label assignment from a global perspective and propose to formulate the assigning procedure as an Optimal Transport (OT) problem -- a well-studied topic in Optimization Theory. Concretely, we define the unit transportation cost between each demander (anchor) and supplier (gt) pair as the weighted summation of their classification and regression losses. After formulation, finding the best assignment solution is converted to solve the optimal transport plan at minimal transportation costs, which can be solved via Sinkhorn-Knopp Iteration. On COCO, a single FCOS-ResNet-50 detector equipped with Optimal Transport Assignment (OTA) can reach 40.7\% mAP under 1X scheduler, outperforming all other existing assigning methods. Extensive experiments conducted on COCO and CrowdHuman further validate the effectiveness of our proposed OTA, especially its superiority in crowd scenarios. The code is available at https://github.com/Megvii-BaseDetection/OTA.},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  file = {/home/zenon/Zotero/storage/LE4HHL9X/Ge et al. - 2021 - OTA Optimal Transport Assignment for Object Detec.pdf;/home/zenon/Zotero/storage/MCF86ZKV/2103.html}
 }
@online{ge2021a,
  title = {{{YOLOX}}: {{Exceeding YOLO Series}} in 2021},
  shorttitle = {{{YOLOX}}},
  author = {Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
  date = {2021-08-05},
  eprint = {2107.08430},
  eprinttype = {arxiv},
  eprintclass = {cs},
  doi = {10.48550/arXiv.2107.08430},
  urldate = {2023-09-28},
  abstract = {In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3\% AP on COCO, surpassing NanoDet by 1.8\% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3\% AP on COCO, outperforming the current best practice by 3.0\% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0\% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8\% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. Source code is at https://github.com/Megvii-BaseDetection/YOLOX.},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  file = {/home/zenon/Zotero/storage/B9KGZ7N2/Ge et al. - 2021 - YOLOX Exceeding YOLO Series in 2021.pdf;/home/zenon/Zotero/storage/XQTJLGLZ/2107.html}
 }
@inproceedings{girshick2015,
  title = {Deformable Part Models Are Convolutional Neural Networks},
  booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@ -404,6 +436,21 @@
  file = {/home/zenon/Zotero/storage/ZMCI6A8T/Lin et al. - 2015 - Microsoft COCO Common Objects in Context.pdf}
 }
@online{lin2017,
  title = {Feature {{Pyramid Networks}} for {{Object Detection}}},
  author = {Lin, Tsung-Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  date = {2017-04-19},
  eprint = {1612.03144},
  eprinttype = {arxiv},
  eprintclass = {cs},
  doi = {10.48550/arXiv.1612.03144},
  urldate = {2023-09-28},
  abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But recent deep learning object detectors have avoided pyramid representations, in part because they are compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using FPN in a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  file = {/home/zenon/Zotero/storage/8BBA7R4F/Lin et al. - 2017 - Feature Pyramid Networks for Object Detection.pdf;/home/zenon/Zotero/storage/KUPLTHRQ/1612.html}
 }
@incollection{liu2016,
  title = {{{SSD}}: {{Single Shot MultiBox Detector}}},
  shorttitle = {{{SSD}}},
@ -739,6 +786,22 @@
  file = {/home/zenon/Zotero/storage/A7KFIFE2/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf}
 }
@online{zheng2019a,
  title = {Distance-{{IoU Loss}}: {{Faster}} and {{Better Learning}} for {{Bounding Box Regression}}},
  shorttitle = {Distance-{{IoU Loss}}},
  author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
  date = {2019-11-19},
  eprint = {1911.08287},
  eprinttype = {arxiv},
  eprintclass = {cs},
  doi = {10.48550/arXiv.1911.08287},
  urldate = {2023-09-28},
  abstract = {Bounding box regression is the crucial step in object detection. In existing methods, while $\ell_n$-norm loss is widely adopted for bounding box regression, it is not tailored to the evaluation metric, i.e., Intersection over Union (IoU). Recently, IoU loss and generalized IoU (GIoU) loss have been proposed to benefit the IoU metric, but still suffer from the problems of slow convergence and inaccurate regression. In this paper, we propose a Distance-IoU (DIoU) loss by incorporating the normalized distance between the predicted box and the target box, which converges much faster in training than IoU and GIoU losses. Furthermore, this paper summarizes three geometric factors in bounding box regression, i.e., overlap area, central point distance and aspect ratio, based on which a Complete IoU (CIoU) loss is proposed, thereby leading to faster convergence and better performance. By incorporating DIoU and CIoU losses into state-of-the-art object detection algorithms, e.g., YOLO v3, SSD and Faster RCNN, we achieve notable performance gains in terms of not only IoU metric but also GIoU metric. Moreover, DIoU can be easily adopted into non-maximum suppression (NMS) to act as the criterion, further boosting performance improvement. The source code and trained models are available at https://github.com/Zzh-tju/DIoU.},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  file = {/home/zenon/Zotero/storage/4666A7J8/Zheng et al. - 2019 - Distance-IoU Loss Faster and Better Learning for .pdf;/home/zenon/Zotero/storage/V85B5F3R/1911.html}
 }
@article{zhong2022,
  title = {Classification of {{Cassava Leaf Disease Based}} on a {{Non-Balanced Dataset Using Transformer-Embedded ResNet}}},
  author = {Zhong, Yiwei and Huang, Baojin and Tang, Chaowei},
--- a/thesis/thesis.pdf
+++ b/thesis/thesis.pdf
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@ -103,6 +103,7 @@
 \newacronym{relu}{ReLU}{Rectified Linear Unit}
 \newacronym{elu}{ELU}{Exponential Linear Unit}
 \newacronym{silu}{SiLU}{Sigmoid Linear Unit}
 \newacronym{mse}{MSE}{mean squared error}
 \begin{document}
@ -611,7 +612,10 @@ function \cite{minsky2017}. Non-linear functions, however, are a
 requirement for neural networks to become \emph{universal
 approximators} \cite{hornik1989}. We will introduce several activation
 functions which are used in the field of machine learning in the
-following sections.
+following sections. Many more activation functions exist than can be
 discussed within the scope of this thesis. However, the selection
 should, in the author's opinion, give an overview of the most widely
 used and influential ones.
 \subsubsection{Identity}
 \label{sssec:theory-identity}
@ -738,9 +742,100 @@ function has become an essential tool for deep learning practitioners
 and has contributed to the success of many state-of-the-art models in
 computer vision, natural language processing, and other domains.
 \subsubsection{Softmax}
 \label{sssec:theory-softmax}
 The softmax activation function is often used as the last activation
 function of a neural network to normalize the output of a network to a
 probability distribution over predicted output classes. It takes a
 vector of numbers, known as logits, and scales them into
 probabilities. The output of the softmax function is a vector with
 probabilities of each possible outcome, and the probabilities in the
 vector sum to one for all possible outcomes or classes. In
 mathematical terms, the function is defined as
 \begin{equation}
  \label{eq:softmax}
  \sigma(\vec{z})_{i} = \frac{e^{z_i}}{\sum_{j=1}^{K}e^{z_j}}\ \mathrm{for}\ i = 1,\dots,K\ \mathrm{and}\ \vec{z} = (z_1,\dots,z_K)\in\mathbb{R}^K
 \end{equation}
 where the standard exponential function is applied to each value in
 the vector $\vec{z}$ and the result is normalized with the sum of the
 exponentials.
 \subsection{Loss Function}
 \label{ssec:theory-loss-function}
 Loss functions play a fundamental role in machine learning, as they
 are used to evaluate the performance of a model and guide its
 training. The choice of loss function can significantly impact the
 accuracy and generalization of the model. There are various types of
 loss functions, each with its strengths and weaknesses, and the
 appropriate choice depends on the specific problem being addressed.
 According to the definition of a learning program in
 section~\ref{sec:theory-ml}, loss functions constitute the performance
 measure $P$ against which the results of the learning program are
 measured. Only by minimizing the error obtained from the loss function
 and updating the weights within the network is it possible to gain
 experience $E$ at carrying out a task $T$. How the weights are updated
 depends on the algorithm which is used during the \emph{backward pass}
 to minimize the error. This type of procedure is referred to as
 \emph{backpropagation} (see
 section~\ref{ssec:theory-backpropagation}).
 One common type of loss function is the \gls{mse} which is widely used
 in regression problems. The \gls{mse} is a popular choice because it
 is easy to compute and has a closed-form solution, making it efficient
 to optimize. It does have some limitations, however. For instance, it
 is sensitive to outliers, and it may not be appropriate for problems
 with non-normal distributions. \gls{mse} measures the average squared
 difference between predicted and actual values. It is calculated with
 \begin{equation}
  \label{eq:mse}
  \mathrm{MSE_{test}} = \frac{1}{m}\sum_i(\hat{y}^{(\mathrm{test})} - y^{(\mathrm{test})})_i^2
 \end{equation}
 where $m$ is the number of samples in the test set,
 $\hat{y}^{(\mathrm{test})}$ contains the predictions of the model on
 the test set and $y^{(\mathrm{test})}$ refers to the target labels
 \cite{goodfellow2016}. It follows that, if
 $\hat{y}^{(\mathrm{test})} = y^{(\mathrm{test})}$, the error is $0$
 and the model has produced a perfect prediction.
 We cannot, however, take the results of the error on the test set to
 update the weights during training because the test set must always
 contain only samples which the model has not seen before. If the model
 is trained to minimize the \gls{mse} on the test set and then
 evaluated against the same set, the results will show how well the model
 fits the test set and not how well it generalizes. The goal,
 therefore, is to minimize the error on the training set and to compare
 the results against an evaluation on the test set. If the model
 achieves very low error rates on the training set but not on the test
 set, it is likely that the model is suffering from
 \emph{overfitting}. Conversely, if the model does not achieve low
 error rates on the training set, it is likely that the model is
 suffering from \emph{underfitting}.
 Another popular loss function is the cross-entropy loss, which is
 commonly used in classification problems. Cross-entropy loss measures
 the difference between predicted probabilities and actual labels. It
 is a good choice for classification problems because it takes into
 account the class imbalance issue and it is less sensitive to
 outliers. However, cross-entropy loss has its own limitations. For
 instance, its binary form is limited to problems with two classes and
 must be generalized to categorical cross-entropy for more than two
 classes, and it can be sensitive to the choice of the softmax
 function.
 In recent years, there has been an increasing interest in using
 alternative loss functions that can better handle complex
 problems. For example, the Huber loss is a modification of the \gls{mse}
 loss that is more robust to outliers. The Smooth L1 loss is another
 alternative that is less sensitive to outliers and can handle
 non-normal distributions. These alternative loss functions have been
 shown to be effective in various applications such as image
 classification, object detection, and speech recognition.
 \subsection{Backpropagation}
 \label{ssec:theory-backpropagation}
@ -772,9 +867,6 @@ gives an introduction to object detection, explains common problems
 researchers have faced and how they have been solved, and discusses
 the two main approaches to object detection via deep learning.
 \subsection{Definition}
 \label{ssec:obj-definition}
 \subsection{Traditional Methods}
 \label{ssec:obj-traditional}
@ -2001,4 +2093,5 @@ Estimated 1 page for this section
 %%% TeX-master: t
 %%% TeX-master: t
 %%% TeX-master: t
 %%% TeX-master: t
 %%% End: