diff --git a/thesis/graphics/residual-connection/res.pdf b/thesis/graphics/residual-connection/res.pdf
new file mode 100644
index 0000000..644915a
Binary files /dev/null and b/thesis/graphics/residual-connection/res.pdf differ
diff --git a/thesis/graphics/residual-connection/res.tex b/thesis/graphics/residual-connection/res.tex
new file mode 100644
index 0000000..5f1d5c5
--- /dev/null
+++ b/thesis/graphics/residual-connection/res.tex
@@ -0,0 +1,43 @@
+\documentclass{standalone}
+\usepackage{tikz}
+
+\usetikzlibrary{graphs,quotes}
+\tikzstyle{block} = [draw, rectangle]
+\tikzstyle{sum} = [draw, fill=blue!20, circle, node distance=1cm]
+\tikzstyle{input} = [coordinate]
+\tikzstyle{output} = [coordinate]
+\tikzstyle{pinstyle} = [pin edge={to-,thin,black}]
+
+\begin{document}
+% \tikz\graph[grow down=3em]
+% {
+%   x [as=$\mathbf{x}$]
+%   ->[thick] wl1 [block,as=weight layer]
+%   ->[thick,"relu"] wl2 [block,as=weight layer]
+%   ->[thick] plus [as=$\bigoplus$]
+%   ->[thick,"relu"]
+%   empty [as=];
+%   x ->[thick,bend left=90,distance=5em,"$\mathbf{x}$"] plus;
+% };
+
+\begin{tikzpicture}
+  \node (start) at (0,0) {};
+  \node[draw] (wl1) at (0,-1) {weight layer};
+  \node[draw] (wl2) at (0,-2) {weight layer};
+  \node (plus) at (0,-3) {$\bigoplus$};
+  \node (end) at (0,-3.75) {};
+
+  \node (fx) at (-1.5, -1.5) {$\mathcal{F}(\mathbf{x})$};
+  \node (fxx) at (-1.5, -3) {$\mathcal{F}(\mathbf{x}) + \mathbf{x}$};
+
+  \draw[->,thick] (start) to node[near start,left] {$\mathbf{x}$} (wl1);
+  \draw[->,thick] (wl1) to node[right] {relu} (wl2);
+  \draw[->,thick] (wl2) to (plus);
+  \draw[->,thick] (plus) to node[right] {relu} (end);
+  \draw[->,thick] (0,-0.35) to[bend left=90,distance=5em] node[right,align=center] {$\mathbf{x}$\\identity} (plus);
+\end{tikzpicture}
+\end{document}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index cb766e8..9e74689 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -1810,6 +1810,41 @@ paper~\cite{he2016}.
 
 Estimated 2 pages for this section.
 
+Early research \cite{bengio1994,glorot2010} demonstrated that the
+vanishing/exploding gradient problem adversely affects convergence
+during training with standard gradient descent and random
+initialization, resulting in worse performance than the same
+architecture could otherwise achieve. When a neural network is
+trained with gradient descent, the error gradient is propagated from
+the later layers back through the network to the early layers by
+repeated application of the chain rule (backpropagation).
+Unfortunately, with some activation functions (notably $\tanh$), the
+gradient can become very small and decreases exponentially the
+further it is propagated through the network. As a result, the early
+layers receive hardly any weight updates, which can stall the
+learning process entirely.
+
+There are multiple ways to mitigate the vanishing gradient problem,
+such as carefully chosen weight initialization schemes
+\cite{glorot2010,sussillo2015} and batch normalization layers
+\cite{ioffe2015}. The most effective solution to date, however, is
+the use of \emph{residual connections}, proposed by \textcite{he2016}.
+Instead of connecting the layers only sequentially, each to its
+predecessor and successor, the authors add the input of a block of
+layers to that block's output before the final activation function.
+This is achieved through the aforementioned residual or skip
+connections (see figure~\ref{fig:residual-connection}).
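+
+The following simplified sketch illustrates both points. Consider a
+plain chain of layers, where $\mathbf{x}_i$ denotes the output of
+layer $i$, $L$ the index of the last layer, and $\mathcal{L}$ the
+training loss. Backpropagation computes the gradient at an early
+layer $l$ as a product of the local derivatives of all later layers,
+\[
+  \frac{\partial \mathcal{L}}{\partial \mathbf{x}_l}
+  = \frac{\partial \mathcal{L}}{\partial \mathbf{x}_L}
+    \prod_{i=l}^{L-1} \frac{\partial \mathbf{x}_{i+1}}{\partial \mathbf{x}_i}.
+\]
+With saturating activations such as $\tanh$, whose derivative is at
+most one, the norm of each factor is typically below one, so the
+product can decay exponentially with depth. A residual block instead
+computes
+\[
+  \mathbf{y} = \mathcal{F}(\mathbf{x}) + \mathbf{x},
+  \qquad
+  \frac{\partial \mathbf{y}}{\partial \mathbf{x}}
+  = \frac{\partial \mathcal{F}(\mathbf{x})}{\partial \mathbf{x}}
+    + \mathbf{I},
+\]
+where $\mathbf{I}$ is the identity. Each such factor therefore
+contains an identity term, and the gradient retains a direct path
+back to the early layers regardless of the depth of the
+network~\cite{he2016}.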
+
+\begin{figure}
+  \centering
+  \includegraphics[width=0.35\textwidth]{graphics/residual-connection/res.pdf}
+  \caption[Residual connection]{Residual connections: information from
+    previous layers flows into subsequent layers before the activation
+    function is applied. The symbol $\bigoplus$ represents simple
+    element-wise addition. Figure redrawn from \textcite{he2016}.}
+  \label{fig:residual-connection}
+\end{figure}
+
+
 \subsection{Data Augmentation}
 \label{sec:methods-augmentation}