diff --git a/thesis/graphics/residual-connection/res.pdf b/thesis/graphics/residual-connection/res.pdf
new file mode 100644
index 0000000..644915a
Binary files /dev/null and b/thesis/graphics/residual-connection/res.pdf differ
diff --git a/thesis/graphics/residual-connection/res.tex b/thesis/graphics/residual-connection/res.tex
new file mode 100644
index 0000000..5f1d5c5
--- /dev/null
+++ b/thesis/graphics/residual-connection/res.tex
@@ -0,0 +1,43 @@
+\documentclass{standalone}
+\usepackage{tikz}
+
+\usetikzlibrary{graphs,quotes}
+\tikzstyle{block} = [draw, rectangle]
+\tikzstyle{sum} = [draw, fill=blue!20, circle, node distance=1cm]
+\tikzstyle{input} = [coordinate]
+\tikzstyle{output} = [coordinate]
+\tikzstyle{pinstyle} = [pin edge={to-,thin,black}]
+
+\begin{document}
+% \tikz\graph[grow down=3em]
+% {
+%   x [as=$\mathbf{x}$]
+%   ->[thick] wl1 [block,as=weight layer]
+%   ->[thick,"relu"] wl2 [block,as=weight layer]
+%   ->[thick] plus [as=$\bigoplus$]
+%   ->[thick,"relu"]
+%   empty [as=];
+%   x ->[thick,bend left=90,distance=5em,"$\mathbf{x}$"] plus;
+% };
+
+\begin{tikzpicture}
+  \node (start) at (0,0) {};
+  \node[draw] (wl1) at (0,-1) {weight layer};
+  \node[draw] (wl2) at (0,-2) {weight layer};
+  \node (plus) at (0,-3) {$\bigoplus$};
+  \node (end) at (0,-3.75) {};
+
+  \node (fx) at (-1.5, -1.5) {$\mathcal{F}(\mathbf{x})$};
+  \node (fxx) at (-1.5, -3) {$\mathcal{F}(\mathbf{x}) + \mathbf{x}$};
+
+  \draw[->,thick] (start) to node[near start,left] {$\mathbf{x}$} (wl1);
+  \draw[->,thick] (wl1) to node[right] {relu} (wl2);
+  \draw[->,thick] (wl2) to (plus);
+  \draw[->,thick] (plus) to node[right] {relu} (end);
+  \draw[->,thick] (0,-0.35) to[bend left=90,distance=5em] node[right,align=center] {$\mathbf{x}$\\identity} (plus);
+\end{tikzpicture}
+\end{document}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index cb766e8..9e74689 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -1810,6 +1810,41 @@ paper~\cite{he2016}.
 
 Estimated 2 pages for this section.
 
+Early research \cite{bengio1994,glorot2010} demonstrated that the
+vanishing/exploding gradient problem adversely affects convergence
+during training with standard gradient descent and random
+initialization, resulting in worse performance than the same
+architecture could otherwise achieve. When a neural network is
+trained with gradient descent, the error gradient is propagated from
+the later layers back through the network to the early layers by
+repeated application of the chain rule (backpropagation).
+Unfortunately, with some activation functions (notably $\tanh$), the
+gradient can become very small and decreases exponentially the
+further it is propagated through the network. As a result, the early
+layers receive hardly any weight updates, which can stall the
+learning process entirely.
+
+There are multiple ways to mitigate the vanishing gradient problem,
+such as carefully chosen weight initialization schemes
+\cite{glorot2010,sussillo2015} and batch normalization layers
+\cite{ioffe2015}. The most effective solution to date, however, is
+the use of \emph{residual connections}, proposed by \textcite{he2016}.
+Instead of connecting the layers only sequentially, each to its
+predecessor and successor, the authors add the input of a block of
+layers to that block's output before the final activation function.
+This is achieved through the aforementioned residual or skip
+connections (see figure~\ref{fig:residual-connection}).
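+
+The following simplified sketch illustrates both points. Consider a
+plain chain of layers, where $\mathbf{x}_i$ denotes the output of
+layer $i$, $L$ the index of the last layer, and $\mathcal{L}$ the
+training loss. Backpropagation computes the gradient at an early
+layer $l$ as a product of the local derivatives of all later layers,
+\[
+  \frac{\partial \mathcal{L}}{\partial \mathbf{x}_l}
+  = \frac{\partial \mathcal{L}}{\partial \mathbf{x}_L}
+    \prod_{i=l}^{L-1} \frac{\partial \mathbf{x}_{i+1}}{\partial \mathbf{x}_i}.
+\]
+With saturating activations such as $\tanh$, whose derivative is at
+most one, the norm of each factor is typically below one, so the
+product can decay exponentially with depth. A residual block instead
+computes
+\[
+  \mathbf{y} = \mathcal{F}(\mathbf{x}) + \mathbf{x},
+  \qquad
+  \frac{\partial \mathbf{y}}{\partial \mathbf{x}}
+  = \frac{\partial \mathcal{F}(\mathbf{x})}{\partial \mathbf{x}}
+    + \mathbf{I},
+\]
+where $\mathbf{I}$ is the identity. Each such factor therefore
+contains an identity term, and the gradient retains a direct path
+back to the early layers regardless of the depth of the
+network~\cite{he2016}.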
+
+\begin{figure}
+  \centering
+  \includegraphics[width=0.35\textwidth]{graphics/residual-connection/res.pdf}
+  \caption[Residual connection]{Residual connections: information from
+    previous layers flows into subsequent layers before the activation
+    function is applied. The symbol $\bigoplus$ represents simple
+    element-wise addition. Figure redrawn from \textcite{he2016}.}
+  \label{fig:residual-connection}
+\end{figure}
+
+
 \subsection{Data Augmentation}
 \label{sec:methods-augmentation}