Add section on activation functions

Tobias Eidelpes 2023-09-27 19:28:03 +02:00
parent 99489d8d20
commit 419d8da5d6
3 changed files with 202 additions and 0 deletions


@@ -257,6 +257,22 @@
keywords = {Algorithm AdaBoost,Cumulative Loss,Final Hypothesis,Loss Function,Weak Hypothesis}
}
@article{fukushima1969,
title = {Visual {{Feature Extraction}} by a {{Multilayered Network}} of {{Analog Threshold Elements}}},
author = {Fukushima, Kunihiko},
date = {1969-10},
journaltitle = {IEEE Transactions on Systems Science and Cybernetics},
volume = {5},
number = {4},
pages = {322--333},
issn = {2168-2887},
doi = {10.1109/TSSC.1969.300225},
urldate = {2023-09-27},
abstract = {A new type of visual feature extracting network has been synthesized, and the response of the network has been simulated on a digital computer. This research has been done as a first step towards the realization of a recognizer of handwritten characters. The design of the network was suggested by biological systems, especially, the visual systems of cat and monkey. The network is composed of analog threshold elements connected in layers. Each analog threshold element receives inputs from a large number of elements in the neighbouring layers and performs its own special functions. It takes care of one restricted part of the photoreceptor layer, on which an input pattern is presented, and it responds to one particular feature of the input pattern, such as brightness contrast, a dot in the pattern, a line segment of a particular orientation, or an end of the line. This means that the network performs parallel processing of the information. With the propagation of the information through the layered network, the input pattern is successively decomposed into dots, groups of line segments of the same orientation, and the ends of these line segments.},
eventtitle = {{{IEEE Transactions}} on {{Systems Science}} and {{Cybernetics}}},
file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
}
@inproceedings{girshick2015,
title = {Deformable Part Models Are Convolutional Neural Networks},
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@@ -271,6 +287,20 @@
file = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html}
}
@book{goodfellow2016,
title = {Deep {{Learning}}},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
date = {2016-11-10},
eprint = {omivDQAAQBAJ},
eprinttype = {googlebooks},
publisher = {{MIT Press}},
abstract = {An introduction to a broad range of topics in deep learning, covering mathematical and conceptual background, deep learning techniques used in industry, and research perspectives. “Written by three experts in the field, Deep Learning is the only comprehensive book on the subject.”—Elon Musk, cochair of OpenAI; cofounder and CEO of Tesla and SpaceX. Deep learning is a form of machine learning that enables computers to learn from experience and understand the world in terms of a hierarchy of concepts. Because the computer gathers knowledge from experience, there is no need for a human computer operator to formally specify all the knowledge that the computer needs. The hierarchy of concepts allows the computer to learn complicated concepts by building them out of simpler ones; a graph of these hierarchies would be many layers deep. This book introduces a broad range of topics in deep learning. The text offers mathematical and conceptual background, covering relevant concepts in linear algebra, probability theory and information theory, numerical computation, and machine learning. It describes deep learning techniques used by practitioners in industry, including deep feedforward networks, regularization, optimization algorithms, convolutional networks, sequence modeling, and practical methodology; and it surveys such applications as natural language processing, speech recognition, computer vision, online recommendation systems, bioinformatics, and videogames. Finally, the book offers research perspectives, covering such theoretical topics as linear factor models, autoencoders, representation learning, structured probabilistic models, Monte Carlo methods, the partition function, approximate inference, and deep generative models. Deep Learning can be used by undergraduate or graduate students planning careers in either industry or research, and by software engineers who want to begin using deep learning in their products or platforms. A website offers supplementary material for both readers and instructors.},
isbn = {978-0-262-33737-3},
langid = {english},
pagetotal = {801},
keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science}
}
@inproceedings{he2016,
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@@ -284,6 +314,23 @@
file = {/home/zenon/Zotero/storage/JDX3S8QK/He et al. - 2016 - Deep Residual Learning for Image Recognition.pdf}
}
@article{hornik1989,
title = {Multilayer Feedforward Networks Are Universal Approximators},
author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
date = {1989-01-01},
journaltitle = {Neural Networks},
shortjournal = {Neural Networks},
volume = {2},
number = {5},
pages = {359--366},
issn = {0893-6080},
doi = {10.1016/0893-6080(89)90020-8},
urldate = {2023-09-27},
abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. In this sense, multilayer feedforward networks are a class of universal approximators.},
keywords = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation},
file = {/home/zenon/Zotero/storage/FN7FDBHL/0893608089900208.html}
}
@software{jocher2022,
title = {Ultralytics/{{Yolov5}}: {{V7}}.0 - {{YOLOv5 SOTA Realtime Instance Segmentation}}},
shorttitle = {Ultralytics/{{Yolov5}}},
@@ -437,6 +484,20 @@
file = {/home/zenon/Zotero/storage/3ECY7VJ5/McEnroe et al. - 2022 - A Survey on the Convergence of Edge Computing and .pdf}
}
@book{minsky2017,
title = {Perceptrons: {{An Introduction}} to {{Computational Geometry}}},
shorttitle = {Perceptrons},
author = {Minsky, Marvin and Papert, Seymour A.},
date = {2017-09-22},
publisher = {{The MIT Press}},
doi = {10.7551/mitpress/11301.001.0001},
urldate = {2023-09-27},
abstract = {The first systematic study of parallelism in computation by two pioneers in the field. Reissue of the 1988 Expanded Edition with a new foreword by Léon Bottou.},
isbn = {978-0-262-34393-0},
langid = {english},
file = {/home/zenon/Zotero/storage/XZVYT2SM/PerceptronsAn-Introduction-to-Computational.html}
}
@book{mitchell1997a,
title = {Machine {{Learning}}},
author = {Mitchell, Thomas M.},

Binary file not shown.


@@ -100,6 +100,9 @@
\newacronym{ai}{AI}{Artificial Intelligence}
\newacronym{mfcc}{MFCC}{Mel-frequency Cepstral Coefficient}
\newacronym{mlp}{MLP}{Multilayer Perceptron}
\newacronym{relu}{ReLU}{Rectified Linear Unit}
\newacronym{elu}{ELU}{Exponential Linear Unit}
\newacronym{silu}{SiLU}{Sigmoid Linear Unit}
\begin{document}
@@ -596,7 +599,144 @@ Figure~\ref{fig:neural-network}, represents a simpler structure.
\subsection{Activation Functions}
\label{ssec:theory-activation-functions}
Activation functions are the functions \emph{inside} each neuron which
receive inputs and produce an output value. Their defining
characteristic is that they require a certain amount of
\emph{excitation} from the inputs before they produce an output,
hence the name \emph{activation function}. Activation functions are
either linear or non-linear. Linear activation functions are limited
in what they can express: a network that uses only linear activations
computes a linear mapping of its inputs and therefore cannot
represent functions such as XOR, a limitation famously shown for the
single-layer perceptron \cite{minsky2017}. Non-linear activation
functions, in contrast, are a requirement for neural networks to
become \emph{universal approximators} \cite{hornik1989}. The
following sections introduce several activation functions that are
commonly used in machine learning.
\subsubsection{Identity}
\label{sssec:theory-identity}
The simplest activation function is the identity function. It is defined as
\begin{equation}
\label{eq:identity}
g(x) = x.
\end{equation}
If all layers in an artificial neural network use the identity
activation function, the network collapses to a single-layer
structure, because the composition of affine transformations is
itself an affine transformation. The identity function is often used
for layers which do not need an activation function per se, but
require one to remain consistent with the rest of the network
structure.
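To make this collapse concrete, write $W_1, W_2$ for the weight
matrices and $b_1, b_2$ for the bias vectors of two consecutive
layers with identity activations. Their composition is
\begin{equation}
\label{eq:identity-collapse}
W_2 (W_1 x + b_1) + b_2 = (W_2 W_1)\, x + (W_2 b_1 + b_2),
\end{equation}
which is again a single affine transformation of the input $x$.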
\subsubsection{Heaviside Step}
\label{sssec:theory-heaviside}
The Heaviside step function, also known as the unit step function, is
a mathematical function that is commonly used in control theory and
signal processing to represent a signal that switches on at a
specified time and stays on. The function is named after Oliver
Heaviside, who introduced it in the late 19th century. It is defined
as
\begin{equation}
\label{eq:heaviside}
H(x) =
\begin{cases}
1, & x \geq 0 \\
0, & x < 0
\end{cases}.
\end{equation}
In engineering applications, the Heaviside step function is used to
describe signals whose values change abruptly at specified values of
time $t$. We have already mentioned the Heaviside step function in
section~\ref{ssec:theory-nn} when introducing the perceptron. A
network using it can only separate linearly separable classes and is,
therefore, not suitable for modelling complex relationships within
the data. A major downside of the Heaviside step function is that it
is not differentiable at $x = 0$ and has a derivative of zero
everywhere else. These properties make it unsuitable for use with
gradient descent during \todo[noline]{link to backpropagation section}
backpropagation.
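In terms of gradients, this means that
\begin{equation}
\label{eq:heaviside-derivative}
\frac{\mathrm{d}H}{\mathrm{d}x}(x) = 0 \quad \text{for } x \neq 0,
\end{equation}
so any error signal propagated backwards through a Heaviside unit is
multiplied by zero and no useful weight update reaches the preceding
layers.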
\subsubsection{Sigmoid}
\label{sssec:theory-sigmoid}
The sigmoid activation function is one of the most widely used
functions for introducing non-linearity into the outputs of a
neuron. It is a special case of the logistic function, and the two
terms are often used synonymously in machine learning. It is defined
as
\begin{equation}
\label{eq:sigmoid}
\sigma(x) = \frac{1}{1 + e^{-x}}.
\end{equation}
It has a characteristic S-shaped curve, mapping each input value to a
number between $0$ and $1$, regardless of input size. This
\emph{squashing} property is particularly desirable for binary
classification problems because the outputs can be interpreted as
probabilities. In addition to the squashing property, the sigmoid is
a saturating function: large positive inputs map to values close to
$1$ and large negative inputs to values close to $0$. Saturated
neurons make it difficult for a learning algorithm to update the
weights, because in the saturated regions the gradient is close to
zero and the outputs carry little information about the inputs. In
contrast to the Heaviside step function
(section~\ref{sssec:theory-heaviside}), the sigmoid is differentiable
everywhere, which allows it to be used with gradient descent
optimization algorithms. \todo[noline]{link to gradient descent and
vanishing gradient sections} Unfortunately, the sigmoid function
suffers from the vanishing gradient problem, which makes it
unsuitable for training deep neural networks.
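The saturation behavior is visible directly in the derivative of the
sigmoid,
\begin{equation}
\label{eq:sigmoid-derivative}
\sigma'(x) = \sigma(x)\,\bigl(1 - \sigma(x)\bigr),
\end{equation}
which reaches its maximum of $1/4$ at $x = 0$ and approaches zero as
$|x|$ grows. Since backpropagation multiplies many such factors
together, gradients in the early layers of a deep sigmoid network
shrink towards zero, which is precisely the vanishing gradient
problem mentioned above.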
\subsubsection{Rectified Linear Unit}
\label{sssec:theory-relu}
The \gls{relu} function is defined as
\begin{equation}
\label{eq:relu}
f(x) = \max(0, x) =
\begin{cases}
x, & x > 0 \\
0, & x \leq 0
\end{cases}
\end{equation}
which means that it returns the input value if it is positive and
zero otherwise. It was first introduced by \textcite{fukushima1969}
in a modified form to construct a visual feature extractor. The
\gls{relu} function is nearly linear, and it thus preserves many of
the properties that make linear models easy to optimize with
gradient-based methods \cite{goodfellow2016}. In contrast to the
sigmoid activation function, the \gls{relu} function does not
saturate for positive inputs and thus mitigates the vanishing
gradient problem \todo{link to vanishing gradient problem}, which
makes it suitable for training deep neural networks. Furthermore, the
\gls{relu} function is cheaper to compute than the sigmoid function,
which allows networks to be trained more quickly. Even though it is
not differentiable at $0$, it is differentiable everywhere else and
works well with gradient descent during optimization.
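Its derivative is particularly simple:
\begin{equation}
\label{eq:relu-derivative}
f'(x) =
\begin{cases}
1, & x > 0 \\
0, & x < 0
\end{cases},
\end{equation}
so the gradient passes through unchanged for active units and is
blocked entirely for inactive ones. The latter case is the root of
the problem discussed next.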
The \gls{relu} function suffers from the dying \gls{relu} problem,
which can cause some neurons to become permanently inactive. This
typically happens when a large gradient update pushes a neuron's
weights into a region where its pre-activation is negative for all
inputs; from then on the neuron outputs zero and, because its
gradient is also zero, it receives no further updates. If many
neurons are pushed into this state, the model's capability of
learning new patterns is diminished. There are two common ways to
address this problem. One is to make sure that the learning rate is
not set too high, which reduces the problem but does not fully remove
it. Another is to use one of the several variants of the \gls{relu}
function, such as leaky \gls{relu}, \gls{elu}, and \gls{silu},
defined below.
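Commonly used definitions of these variants are
\begin{align}
\label{eq:leaky-relu}
f_{\text{leaky}}(x) &=
\begin{cases}
x, & x > 0 \\
\alpha x, & x \leq 0
\end{cases} \\
\label{eq:elu}
f_{\text{ELU}}(x) &=
\begin{cases}
x, & x > 0 \\
\alpha \left( e^{x} - 1 \right), & x \leq 0
\end{cases} \\
\label{eq:silu}
f_{\text{SiLU}}(x) &= x \, \sigma(x),
\end{align}
where $\alpha$ is a small positive constant (often around $0.01$ for
leaky \gls{relu} and $1$ for \gls{elu}) and $\sigma$ is the sigmoid
function from equation~\ref{eq:sigmoid}. None of the three has an
identically zero gradient for negative inputs, which is what
mitigates the dying \gls{relu} problem.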
In recent years, the \gls{relu} function has become the most popular
activation function for deep neural networks and is recommended as
the default choice in modern architectures
\cite{goodfellow2016}. Despite its limitations, it is an essential
tool for deep learning practitioners and has contributed to the
success of many state-of-the-art models in computer vision, natural
language processing, and other domains.
\subsection{Loss Function}
\label{ssec:theory-loss-function}
@@ -1860,4 +2000,5 @@ Estimated 1 page for this section
%%% TeX-master: t
%%% End: