Add section on activation functions

Tobias Eidelpes 2023-09-27 19:28:03 +02:00
parent 99489d8d20
commit 419d8da5d6
3 changed files with 202 additions and 0 deletions


@@ -257,6 +257,22 @@
keywords = {Algorithm AdaBoost,Cumulative Loss,Final Hypothesis,Loss Function,Weak Hypothesis}
}
@article{fukushima1969,
title = {Visual {{Feature Extraction}} by a {{Multilayered Network}} of {{Analog Threshold Elements}}},
author = {Fukushima, Kunihiko},
date = {1969-10},
journaltitle = {IEEE Transactions on Systems Science and Cybernetics},
volume = {5},
number = {4},
pages = {322--333},
issn = {2168-2887},
doi = {10.1109/TSSC.1969.300225},
urldate = {2023-09-27},
abstract = {A new type of visual feature extracting network has been synthesized, and the response of the network has been simulated on a digital computer. This research has been done as a first step towards the realization of a recognizer of handwritten characters. The design of the network was suggested by biological systems, especially, the visual systems of cat and monkey. The network is composed of analog threshold elements connected in layers. Each analog threshold element receives inputs from a large number of elements in the neighbouring layers and performs its own special functions. It takes care of one restricted part of the photoreceptor layer, on which an input pattern is presented, and it responds to one particular feature of the input pattern, such as brightness contrast, a dot in the pattern, a line segment of a particular orientation, or an end of the line. This means that the network performs parallel processing of the information. With the propagation of the information through the layered network, the input pattern is successively decomposed into dots, groups of line segments of the same orientation, and the ends of these line segments.},
eventtitle = {{{IEEE Transactions}} on {{Systems Science}} and {{Cybernetics}}},
file = {/home/zenon/Zotero/storage/U5UBHUUH/Visual_Feature_Extraction_by_a_Multilayered_Network_of_Analog_Threshold_Elements.pdf;/home/zenon/Zotero/storage/YIJ8SNLD/Fukushima - 1969 - Visual Feature Extraction by a Multilayered Networ.pdf;/home/zenon/Zotero/storage/HLJYDE2X/4082265.html}
}
@inproceedings{girshick2015,
title = {Deformable Part Models Are Convolutional Neural Networks},
booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@@ -271,6 +287,20 @@
file = {/home/zenon/Zotero/storage/M8INWK6B/Girshick et al. - 2015 - Deformable part models are convolutional neural ne.pdf;/home/zenon/Zotero/storage/MHWCXFBZ/7298641.html}
}
@book{goodfellow2016,
title = {Deep {{Learning}}},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
date = {2016-11-10},
eprint = {omivDQAAQBAJ},
eprinttype = {googlebooks},
publisher = {{MIT Press}},
abstract = {An introduction to a broad range of topics in deep learning, covering mathematical and conceptual background, deep learning techniques used in industry, and research perspectives. “Written by three experts in the field, Deep Learning is the only comprehensive book on the subject.”—Elon Musk, cochair of OpenAI; cofounder and CEO of Tesla and SpaceX. Deep learning is a form of machine learning that enables computers to learn from experience and understand the world in terms of a hierarchy of concepts. Because the computer gathers knowledge from experience, there is no need for a human computer operator to formally specify all the knowledge that the computer needs. The hierarchy of concepts allows the computer to learn complicated concepts by building them out of simpler ones; a graph of these hierarchies would be many layers deep. This book introduces a broad range of topics in deep learning. The text offers mathematical and conceptual background, covering relevant concepts in linear algebra, probability theory and information theory, numerical computation, and machine learning. It describes deep learning techniques used by practitioners in industry, including deep feedforward networks, regularization, optimization algorithms, convolutional networks, sequence modeling, and practical methodology; and it surveys such applications as natural language processing, speech recognition, computer vision, online recommendation systems, bioinformatics, and videogames. Finally, the book offers research perspectives, covering such theoretical topics as linear factor models, autoencoders, representation learning, structured probabilistic models, Monte Carlo methods, the partition function, approximate inference, and deep generative models. Deep Learning can be used by undergraduate or graduate students planning careers in either industry or research, and by software engineers who want to begin using deep learning in their products or platforms. A website offers supplementary material for both readers and instructors.},
isbn = {978-0-262-33737-3},
langid = {english},
pagetotal = {801},
keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science}
}
@inproceedings{he2016,
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
booktitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@@ -284,6 +314,23 @@
file = {/home/zenon/Zotero/storage/JDX3S8QK/He et al. - 2016 - Deep Residual Learning for Image Recognition.pdf}
}
@article{hornik1989,
title = {Multilayer Feedforward Networks Are Universal Approximators},
author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
date = {1989-01-01},
journaltitle = {Neural Networks},
shortjournal = {Neural Networks},
volume = {2},
number = {5},
pages = {359--366},
issn = {0893-6080},
doi = {10.1016/0893-6080(89)90020-8},
urldate = {2023-09-27},
abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. In this sense, multilayer feedforward networks are a class of universal approximators.},
keywords = {Back-propagation networks,Feedforward networks,Mapping networks,Network representation capability,Sigma-Pi networks,Squashing functions,Stone-Weierstrass Theorem,Universal approximation},
file = {/home/zenon/Zotero/storage/FN7FDBHL/0893608089900208.html}
}
@software{jocher2022,
title = {Ultralytics/{{Yolov5}}: {{V7}}.0 - {{YOLOv5 SOTA Realtime Instance Segmentation}}},
shorttitle = {Ultralytics/{{Yolov5}}},
@@ -437,6 +484,20 @@
file = {/home/zenon/Zotero/storage/3ECY7VJ5/McEnroe et al. - 2022 - A Survey on the Convergence of Edge Computing and .pdf}
}
@book{minsky2017,
title = {Perceptrons: {{An Introduction}} to {{Computational Geometry}}},
shorttitle = {Perceptrons},
author = {Minsky, Marvin and Papert, Seymour A.},
date = {2017-09-22},
publisher = {{The MIT Press}},
doi = {10.7551/mitpress/11301.001.0001},
urldate = {2023-09-27},
abstract = {The first systematic study of parallelism in computation by two pioneers in the field. Reissue of the 1988 Expanded Edition with a new foreword by Léon Bottou.},
isbn = {978-0-262-34393-0},
langid = {english},
file = {/home/zenon/Zotero/storage/XZVYT2SM/PerceptronsAn-Introduction-to-Computational.html}
}
@book{mitchell1997a,
title = {Machine {{Learning}}},
author = {Mitchell, Thomas M.},

Binary file not shown.


@@ -100,6 +100,9 @@
\newacronym{ai}{AI}{Artificial Intelligence}
\newacronym{mfcc}{MFCC}{Mel-frequency Cepstral Coefficient}
\newacronym{mlp}{MLP}{Multilayer Perceptron}
\newacronym{relu}{ReLU}{Rectified Linear Unit}
\newacronym{elu}{ELU}{Exponential Linear Unit}
\newacronym{silu}{SiLU}{Sigmoid Linear Unit}
\begin{document}
@@ -596,7 +599,144 @@ Figure~\ref{fig:neural-network}, represents a simpler structure.
\subsection{Activation Functions}
\label{ssec:theory-activation-functions}
Activation functions are the functions \emph{inside} each neuron which
receive inputs and produce an output value. Their defining
characteristic is that they require a certain amount of
\emph{excitation} from the inputs before they produce an output,
hence the name \emph{activation function}. Activation functions are
either linear or non-linear. Linear activation functions are limited
in what they can express: a network that uses only linear activations
computes a linear mapping of its inputs and therefore cannot
represent functions such as XOR, a limitation famously shown for the
single-layer perceptron \cite{minsky2017}. Non-linear activation
functions, in contrast, are a requirement for neural networks to
become \emph{universal approximators} \cite{hornik1989}. The
following sections introduce several activation functions that are
commonly used in machine learning.
\subsubsection{Identity}
\label{sssec:theory-identity}
The simplest activation function is the identity function. It is defined as
\begin{equation}
\label{eq:identity}
g(x) = x.
\end{equation}
If all layers in an artificial neural network use the identity
activation function, the network collapses to a single-layer
structure, because the composition of affine transformations is
itself an affine transformation. The identity function is often used
for layers which do not need an activation function per se, but
require one to remain consistent with the rest of the network
structure.
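To make this collapse concrete, write $W_1, W_2$ for the weight
matrices and $b_1, b_2$ for the bias vectors of two consecutive
layers with identity activations. Their composition is
\begin{equation}
\label{eq:identity-collapse}
W_2 (W_1 x + b_1) + b_2 = (W_2 W_1)\, x + (W_2 b_1 + b_2),
\end{equation}
which is again a single affine transformation of the input $x$.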
\subsubsection{Heaviside Step}
\label{sssec:theory-heaviside}
The Heaviside step function, also known as the unit step function, is
a mathematical function that is commonly used in control theory and
signal processing to represent a signal that switches on at a
specified time and stays on. The function is named after Oliver
Heaviside, who introduced it in the late 19th century. It is defined
as
\begin{equation}
\label{eq:heaviside}
H(x) =
\begin{cases}
1, & x \geq 0 \\
0, & x < 0
\end{cases}.
\end{equation}
In engineering applications, the Heaviside step function is used to
describe signals whose values change abruptly at specified values of
time $t$. We have already mentioned the Heaviside step function in
section~\ref{ssec:theory-nn} when introducing the perceptron. A
network using it can only separate linearly separable classes and is,
therefore, not suitable for modelling complex relationships within
the data. A major downside of the Heaviside step function is that it
is not differentiable at $x = 0$ and has a derivative of zero
everywhere else. These properties make it unsuitable for use with
gradient descent during \todo[noline]{link to backpropagation section}
backpropagation.
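In terms of gradients, this means that
\begin{equation}
\label{eq:heaviside-derivative}
\frac{\mathrm{d}H}{\mathrm{d}x}(x) = 0 \quad \text{for } x \neq 0,
\end{equation}
so any error signal propagated backwards through a Heaviside unit is
multiplied by zero and no useful weight update reaches the preceding
layers.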
\subsubsection{Sigmoid}
\label{sssec:theory-sigmoid}
The sigmoid activation function is one of the most widely used
functions for introducing non-linearity into the outputs of a
neuron. It is a special case of the logistic function, and the two
terms are often used synonymously in machine learning. It is defined
as
\begin{equation}
\label{eq:sigmoid}
\sigma(x) = \frac{1}{1 + e^{-x}}.
\end{equation}
It has a characteristic S-shaped curve, mapping each input value to a
number between $0$ and $1$, regardless of input size. This
\emph{squashing} property is particularly desirable for binary
classification problems because the outputs can be interpreted as
probabilities. In addition to the squashing property, the sigmoid is
a saturating function: large positive inputs map to values close to
$1$ and large negative inputs to values close to $0$. Saturated
neurons make it difficult for a learning algorithm to update the
weights, because in the saturated regions the gradient is close to
zero and the outputs carry little information about the inputs. In
contrast to the Heaviside step function
(section~\ref{sssec:theory-heaviside}), the sigmoid is differentiable
everywhere, which allows it to be used with gradient descent
optimization algorithms. \todo[noline]{link to gradient descent and
vanishing gradient sections} Unfortunately, the sigmoid function
suffers from the vanishing gradient problem, which makes it
unsuitable for training deep neural networks.
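The saturation behavior is visible directly in the derivative of the
sigmoid,
\begin{equation}
\label{eq:sigmoid-derivative}
\sigma'(x) = \sigma(x)\,\bigl(1 - \sigma(x)\bigr),
\end{equation}
which reaches its maximum of $1/4$ at $x = 0$ and approaches zero as
$|x|$ grows. Since backpropagation multiplies many such factors
together, gradients in the early layers of a deep sigmoid network
shrink towards zero, which is precisely the vanishing gradient
problem mentioned above.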
\subsubsection{Rectified Linear Unit}
\label{sssec:theory-relu}
The \gls{relu} function is defined as
\begin{equation}
\label{eq:relu}
f(x) = \max(0, x) =
\begin{cases}
x, & x > 0 \\
0, & x \leq 0
\end{cases}
\end{equation}
which means that it returns the input value if it is positive and
zero otherwise. It was first introduced by \textcite{fukushima1969}
in a modified form to construct a visual feature extractor. The
\gls{relu} function is nearly linear, and it thus preserves many of
the properties that make linear models easy to optimize with
gradient-based methods \cite{goodfellow2016}. In contrast to the
sigmoid activation function, the \gls{relu} function does not
saturate for positive inputs and thus mitigates the vanishing
gradient problem \todo{link to vanishing gradient problem}, which
makes it suitable for training deep neural networks. Furthermore, the
\gls{relu} function is cheaper to compute than the sigmoid function,
which allows networks to be trained more quickly. Even though it is
not differentiable at $0$, it is differentiable everywhere else and
works well with gradient descent during optimization.
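Its derivative is particularly simple:
\begin{equation}
\label{eq:relu-derivative}
f'(x) =
\begin{cases}
1, & x > 0 \\
0, & x < 0
\end{cases},
\end{equation}
so the gradient passes through unchanged for active units and is
blocked entirely for inactive ones. The latter case is the root of
the problem discussed next.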
The \gls{relu} function suffers from the dying \gls{relu} problem,
which can cause some neurons to become permanently inactive. This
typically happens when a large gradient update pushes a neuron's
weights into a region where its pre-activation is negative for all
inputs; from then on the neuron outputs zero and, because its
gradient is also zero, it receives no further updates. If many
neurons are pushed into this state, the model's capability of
learning new patterns is diminished. There are two common ways to
address this problem. One is to make sure that the learning rate is
not set too high, which reduces the problem but does not fully remove
it. Another is to use one of the several variants of the \gls{relu}
function, such as leaky \gls{relu}, \gls{elu}, and \gls{silu},
defined below.
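Commonly used definitions of these variants are
\begin{align}
\label{eq:leaky-relu}
f_{\text{leaky}}(x) &=
\begin{cases}
x, & x > 0 \\
\alpha x, & x \leq 0
\end{cases} \\
\label{eq:elu}
f_{\text{ELU}}(x) &=
\begin{cases}
x, & x > 0 \\
\alpha \left( e^{x} - 1 \right), & x \leq 0
\end{cases} \\
\label{eq:silu}
f_{\text{SiLU}}(x) &= x \, \sigma(x),
\end{align}
where $\alpha$ is a small positive constant (often around $0.01$ for
leaky \gls{relu} and $1$ for \gls{elu}) and $\sigma$ is the sigmoid
function from equation~\ref{eq:sigmoid}. None of the three has an
identically zero gradient for negative inputs, which is what
mitigates the dying \gls{relu} problem.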
In recent years, the \gls{relu} function has become the most popular
activation function for deep neural networks and is recommended as
the default choice in modern architectures
\cite{goodfellow2016}. Despite its limitations, it is an essential
tool for deep learning practitioners and has contributed to the
success of many state-of-the-art models in computer vision, natural
language processing, and other domains.
\subsection{Loss Function}
\label{ssec:theory-loss-function}
@@ -1860,4 +2000,5 @@ Estimated 1 page for this section
%%% TeX-master: t
%%% End: