Replace neural network graphic with own
parent 7f4f05a5d1
commit a8f7b1b6e3
@@ -81,6 +81,21 @@
   file = {/home/zenon/Zotero/storage/RSNWFVIZ/Azimi et al. - 2021 - Intelligent Monitoring of Stress Induced by Water .pdf}
 }

+@article{bengio1994,
+  title = {Learning Long-Term Dependencies with Gradient Descent Is Difficult},
+  author = {Bengio, Y. and Simard, P. and Frasconi, P.},
+  date = {1994-03},
+  journaltitle = {IEEE Transactions on Neural Networks},
+  volume = {5},
+  number = {2},
+  pages = {157--166},
+  issn = {1941-0093},
+  doi = {10.1109/72.279181},
+  abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
+  eventtitle = {{{IEEE Transactions}} on {{Neural Networks}}},
+  file = {/home/zenon/Zotero/storage/VNSQAD8H/Bengio et al. - 1994 - Learning long-term dependencies with gradient desc.pdf;/home/zenon/Zotero/storage/5BQESSD9/279181.html}
+}
+
 @article{benos2021,
   title = {Machine {{Learning}} in {{Agriculture}}: {{A Comprehensive Updated Review}}},
   shorttitle = {Machine {{Learning}} in {{Agriculture}}},
@@ -416,6 +431,22 @@
   file = {/home/zenon/Zotero/storage/MQPF5MGW/Girshick et al. - 2016 - Region-Based Convolutional Networks for Accurate O.pdf;/home/zenon/Zotero/storage/EKC4WHDQ/7112511.html}
 }

+@inproceedings{glorot2010,
+  title = {Understanding the Difficulty of Training Deep Feedforward Neural Networks},
+  booktitle = {Proceedings of the {{Thirteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
+  author = {Glorot, Xavier and Bengio, Yoshua},
+  date = {2010-03-31},
+  pages = {249--256},
+  publisher = {{JMLR Workshop and Conference Proceedings}},
+  issn = {1938-7228},
+  url = {https://proceedings.mlr.press/v9/glorot10a.html},
+  urldate = {2023-11-08},
+  abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.},
+  eventtitle = {Proceedings of the {{Thirteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
+  langid = {english},
+  file = {/home/zenon/Zotero/storage/Q2UQI5CZ/Glorot and Bengio - 2010 - Understanding the difficulty of training deep feed.pdf}
+}
+
 @book{goodfellow2016,
   title = {Deep {{Learning}}},
   author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
@@ -487,6 +518,36 @@
   file = {/home/zenon/Zotero/storage/FN7FDBHL/0893608089900208.html}
 }

+@inproceedings{huang2017,
+  title = {Densely {{Connected Convolutional Networks}}},
+  booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  author = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q.},
+  date = {2017-07},
+  pages = {2261--2269},
+  issn = {1063-6919},
+  doi = {10.1109/CVPR.2017.243},
+  abstract = {Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close to the input and those close to the output. In this paper, we embrace this observation and introduce the Dense Convolutional Network (DenseNet), which connects each layer to every other layer in a feed-forward fashion. Whereas traditional convolutional networks with L layers have L connections-one between each layer and its subsequent layer-our network has L(L+1)/2 direct connections. For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters. We evaluate our proposed architecture on four highly competitive object recognition benchmark tasks (CIFAR-10, CIFAR-100, SVHN, and ImageNet). DenseNets obtain significant improvements over the state-of-the-art on most of them, whilst requiring less memory and computation to achieve high performance. Code and pre-trained models are available at https://github.com/liuzhuang13/DenseNet.},
+  eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  file = {/home/zenon/Zotero/storage/G2G38ZEF/Huang et al. - 2017 - Densely Connected Convolutional Networks.pdf;/home/zenon/Zotero/storage/3Q7NRNV6/8099726.html}
+}
+
+@inproceedings{ioffe2015,
+  title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}},
+  shorttitle = {Batch {{Normalization}}},
+  booktitle = {Proceedings of the 32nd {{International Conference}} on {{Machine Learning}}},
+  author = {Ioffe, Sergey and Szegedy, Christian},
+  date = {2015-06-01},
+  pages = {448--456},
+  publisher = {{PMLR}},
+  issn = {1938-7228},
+  url = {https://proceedings.mlr.press/v37/ioffe15.html},
+  urldate = {2023-11-09},
+  abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer’s inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82\% top-5 test error, exceeding the accuracy of human raters.},
+  eventtitle = {International {{Conference}} on {{Machine Learning}}},
+  langid = {english},
+  file = {/home/zenon/Zotero/storage/7SMC2DMX/Ioffe and Szegedy - 2015 - Batch Normalization Accelerating Deep Network Tra.pdf}
+}
+
 @software{jocher2022,
   title = {Ultralytics/{{Yolov5}}: {{V7}}.0 - {{YOLOv5 SOTA Realtime Instance Segmentation}}},
   shorttitle = {Ultralytics/{{Yolov5}}},
@@ -1078,6 +1139,20 @@
   file = {/home/zenon/Zotero/storage/KUHDEQJF/Su et al. - 2020 - Machine Learning-Based Crop Drought Mapping System.pdf}
 }

+@online{sussillo2015,
+  title = {Random {{Walk Initialization}} for {{Training Very Deep Feedforward Networks}}},
+  author = {Sussillo, David and Abbott, L. F.},
+  date = {2015-02-27},
+  eprint = {1412.6558},
+  eprinttype = {arxiv},
+  eprintclass = {cs, stat},
+  doi = {10.48550/arXiv.1412.6558},
+  abstract = {Training very deep networks is an important open problem in machine learning. One of many difficulties is that the norm of the back-propagated error gradient can grow or decay exponentially. Here we show that training very deep feed-forward networks (FFNs) is not as difficult as previously thought. Unlike when back-propagation is applied to a recurrent network, application to an FFN amounts to multiplying the error gradient by a different random matrix at each layer. We show that the successive application of correctly scaled random matrices to an initial vector results in a random walk of the log of the norm of the resulting vectors, and we compute the scaling that makes this walk unbiased. The variance of the random walk grows only linearly with network depth and is inversely proportional to the size of each layer. Practically, this implies a gradient whose log-norm scales with the square root of the network depth and shows that the vanishing gradient problem can be mitigated by increasing the width of the layers. Mathematical analyses and experimental results using stochastic gradient descent to optimize tasks related to the MNIST and TIMIT datasets are provided to support these claims. Equations for the optimal matrix scaling are provided for the linear and ReLU cases.},
+  pubstate = {preprint},
+  keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
+  file = {/home/zenon/Zotero/storage/XSY4PH2B/Sussillo and Abbott - 2015 - Random Walk Initialization for Training Very Deep .pdf;/home/zenon/Zotero/storage/39NYWJNU/1412.html}
+}
+
 @inproceedings{szegedy2015,
   title = {Going Deeper with Convolutions},
   booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
@@ -1091,6 +1166,23 @@
   file = {/home/zenon/Zotero/storage/VL2YIAAN/Szegedy et al. - 2015 - Going deeper with convolutions.pdf;/home/zenon/Zotero/storage/GWTG8T26/7298594.html}
 }

+@article{szegedy2017,
+  title = {Inception-v4, {{Inception-ResNet}} and the {{Impact}} of {{Residual Connections}} on {{Learning}}},
+  author = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke, Vincent and Alemi, Alexander},
+  date = {2017-02-12},
+  journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
+  volume = {31},
+  number = {1},
+  pages = {4278--4284},
+  issn = {2374-3468},
+  doi = {10.1609/aaai.v31i1.11231},
+  abstract = {Very deep convolutional networks have been central to the largest advances in image recognition performance in recent years. One example is the Inception architecture that has been shown to achieve very good performance at relatively low computational cost. Recently, the introduction of residual connections in conjunction with a more traditional architecture has yielded state-of-the-art performance in the 2015 ILSVRC challenge; its performance was similar to the latest generation Inception-v3 network. This raises the question: Are there any benefits to combining Inception architectures with residual connections? Here we give clear empirical evidence that training with residual connections accelerates the training of Inception networks significantly. There is also some evidence of residual Inception networks outperforming similarly expensive Inception networks without residual connections by a thin margin. We also present several new streamlined architectures for both residual and non-residual Inception networks. These variations improve the single-frame recognition performance on the ILSVRC 2012 classification task significantly. We further demonstrate how proper activation scaling stabilizes the training of very wide residual Inception networks. With an ensemble of three residual and one Inception-v4 networks, we achieve 3.08\% top-5 error on the test set of the ImageNet classification (CLS) challenge.},
+  issue = {1},
+  langid = {english},
+  keywords = {Inception},
+  file = {/home/zenon/Zotero/storage/JQVR2G3M/Szegedy et al. - 2017 - Inception-v4, Inception-ResNet and the Impact of R.pdf}
+}
+
 @article{uijlings2013,
   title = {Selective {{Search}} for {{Object Recognition}}},
   author = {Uijlings, J. R. R. and family=Sande, given=K. E. A., prefix=van de, useprefix=true and Gevers, T. and Smeulders, A. W. M.},
Binary file not shown.
@@ -552,13 +552,11 @@ three-layer fully-connected artificial neural network.

 \begin{figure}
 \centering
-\def\svgwidth{\columnwidth}
-\scalebox{0.75}{\input{graphics/neural-network.pdf_tex}}
+\includegraphics[width=0.5\textwidth]{graphics/neural-network/neural-network.pdf}
 \caption[Structure of an artificial neural network]{Structure of an
 artificial neural network. Information travels from left to right
-through the network using the neurons and the connections between
-them. Attribution en:User:Cburnett, CC BY-SA 3.0 via Wikimedia
-Commons.}
+through the network using neurons and the connections between
+them.}
 \label{fig:neural-network}
 \end{figure}
