Add Safety & Robustness section

This commit is contained in:
Tobias Eidelpes 2021-12-16 16:27:20 +01:00
parent e56343388d
commit dea54366bc
2 changed files with 102 additions and 0 deletions

@@ -102,4 +102,37 @@
file = {/home/zenon/Zotero/storage/EQDGFNC4/Suh et al. - 2021 - Trustworthiness in Mobile Cyber-Physical Systems.pdf;/home/zenon/Zotero/storage/798R34VM/1676.html}
}
@online{tsiprasRobustnessMayBe2019,
title = {Robustness {{May Be}} at {{Odds}} with {{Accuracy}}},
author = {Tsipras, Dimitris and Santurkar, Shibani and Engstrom, Logan and Turner, Alexander and Madry, Aleksander},
date = {2019-09-09},
eprint = {1805.12152},
eprinttype = {arxiv},
primaryclass = {cs, stat},
url = {http://arxiv.org/abs/1805.12152},
urldate = {2021-12-16},
abstract = {We show that there may exist an inherent tension between the goal of adversarial robustness and that of standard generalization. Specifically, training robust models may not only be more resource-consuming, but also lead to a reduction of standard accuracy. We demonstrate that this trade-off between the standard accuracy of a model and its robustness to adversarial perturbations provably exists in a fairly simple and natural setting. These findings also corroborate a similar phenomenon observed empirically in more complex settings. Further, we argue that this phenomenon is a consequence of robust classifiers learning fundamentally different feature representations than standard classifiers. These differences, in particular, seem to result in unexpected benefits: the representations learned by robust models tend to align better with salient data characteristics and human perception.},
archiveprefix = {arXiv},
version = {5},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
file = {/home/zenon/Zotero/storage/VWTMWIEK/Tsipras et al. - 2019 - Robustness May Be at Odds with Accuracy.pdf;/home/zenon/Zotero/storage/DG7EDYAM/1805.html}
}
@article{xuAdversarialAttacksDefenses2020,
title = {Adversarial {{Attacks}} and {{Defenses}} in {{Images}}, {{Graphs}} and {{Text}}: {{A Review}}},
shorttitle = {Adversarial {{Attacks}} and {{Defenses}} in {{Images}}, {{Graphs}} and {{Text}}},
author = {Xu, Han and Ma, Yao and Liu, Hao-Chen and Deb, Debayan and Liu, Hui and Tang, Ji-Liang and Jain, Anil K.},
date = {2020-04-01},
journaltitle = {International Journal of Automation and Computing},
shortjournal = {Int. J. Autom. Comput.},
volume = {17},
number = {2},
pages = {151--178},
issn = {1751-8520},
doi = {10.1007/s11633-019-1211-x},
abstract = {Deep neural networks (DNN) have achieved unprecedented success in numerous machine learning tasks in various domains. However, the existence of adversarial examples raises our concerns in adopting deep learning to safety-critical applications. As a result, we have witnessed increasing interests in studying attack and defense mechanisms for DNN models on different data types, such as images, graphs and text. Thus, it is necessary to provide a systematic and comprehensive overview of the main threats of attacks and the success of corresponding countermeasures. In this survey, we review the state of the art algorithms for generating adversarial examples and the countermeasures against adversarial examples, for three most popular data types, including images, graphs and text.},
langid = {english},
file = {/home/zenon/Zotero/storage/LWZNKZLR/Xu et al. - 2020 - Adversarial Attacks and Defenses in Images, Graphs.pdf}
}

@@ -208,6 +208,75 @@ to accurate performance metrics.
\section{Computational Aspects of Trustworthy AI}
\label{sec:taxonomy}
While there have been rapid advances in the quality of machine learning models
and neural networks, scholars, the public and policymakers are increasingly
recognizing the dangers of artificial intelligence. Concerns ranging from
privacy and the deanonymization of individual data points to discrimination
through learned biases and environmental impacts have prompted a new area of
research focused on altering models to alleviate these concerns. A recent
survey by \textcite{liuTrustworthyAIComputational2021} summarizes the state of
the art in trustworthy AI research. The authors collect research from a
computational perspective and divide it into six categories: safety and
robustness, non-discrimination and fairness, explainability, privacy,
accountability and auditability, and environmental well-being. The following
sections summarize the computational methods for each category.
\subsection{Safety and Robustness}
Machine learning models should produce robust results even in the face of
adversarial attacks or naturally occurring noise in the training data. It has
been shown that even small perturbations of the inputs can disproportionately
affect a model's predictions \cite{madryDeepLearningModels2019}. In
order to build models which retain their accuracy and general performance even
under less than ideal circumstances, it is necessary to study different forms of
attacks and how to defend against them. Safe and robust models lead to increases
in trustworthiness because beneficiaries can more easily depend on their
results (reflective trust).
\subsubsection{Threat models} describe the method by which an attacker manages
to degrade the performance of a particular machine learning algorithm.
\emph{Poisoning attacks} allow an attacker to intentionally introduce bad
samples into the training set, which results in wrong predictions by the model.
While many models are trained once beforehand, others are continuously updated
with data the model receives from its beneficiaries. One such example is
Netflix's movie recommendation system, which learns which types of movies
individual users are interested in. A malicious user could therefore attack the
recommendation engine by supplying misleading inputs. \emph{Evasion attacks},
in contrast, consist of alterations made to the samples presented to an already
trained model in such a way that these alterations, while generally invisible
to the human eye, mislead the algorithm.
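To make the poisoning scenario concrete, the following sketch flips the labels
of a small fraction of a toy training set and compares the resulting
classifiers. The dataset, the logistic regression classifier and the fraction
of flipped labels are illustrative assumptions rather than details taken from
the cited literature.
\begin{verbatim}
# Hypothetical label-flipping poisoning attack on a toy classifier.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Model trained on the clean training set.
clean_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# The attacker flips the labels of 10 percent of the training samples.
rng = np.random.default_rng(0)
flip = rng.choice(len(y_train), size=len(y_train) // 10, replace=False)
y_poisoned = y_train.copy()
y_poisoned[flip] = 1 - y_poisoned[flip]

poisoned_model = LogisticRegression(max_iter=1000).fit(X_train, y_poisoned)

print("clean accuracy:   ", clean_model.score(X_test, y_test))
print("poisoned accuracy:", poisoned_model.score(X_test, y_test))
\end{verbatim}
How much the poisoned model's test accuracy degrades depends on the fraction of
flipped labels and on the classifier, but the mechanism is the same one an
attacker would exploit against a continuously updated system.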
\emph{White-box attacks} allow an attacker to see all parameters and functions
of a model. \emph{Black-box attacks}, on the other hand, only allow the
attacker to feed inputs to the model and observe its outputs. The former type
of attack is generally easier to carry out.
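One common instantiation of a white-box evasion attack is the fast gradient
sign method (FGSM), which perturbs an input along the sign of the gradient of
the loss with respect to that input. The sketch below assumes a differentiable
PyTorch classifier \texttt{model} and is meant only as an illustration, not as
the method of any particular cited work.
\begin{verbatim}
import torch.nn.functional as F

def fgsm_attack(model, x, y, eps):
    """White-box evasion (FGSM): one gradient-sign step on the input."""
    x_adv = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x_adv), y)
    loss.backward()
    # Move each input feature in the direction that increases the loss.
    return (x_adv + eps * x_adv.grad.sign()).detach()
\end{verbatim}
A black-box attacker, lacking access to the gradients, would instead have to
estimate such a perturbation from input-output queries alone.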
\emph{Targeted attacks} are aimed at specific classes of a machine learning
classifier. Suppose, for example, that a model is trained to recognize faces.
In a targeted attack, an attacker would try to feed inputs to the model such
that just one person is consistently misclassified. This type of attack is in
contrast to \emph{non-targeted attacks}, which seek to undermine the model's
performance in general. Targeted attacks are usually much harder to detect
because the predictions remain correct overall and are wrong only for a tiny
subset.
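In the white-box setting the difference between the two goals amounts to little
more than a sign change in the objective: instead of increasing the loss of the
true label, a targeted attack decreases the loss of an attacker-chosen label. A
hypothetical variant of the FGSM sketch above:
\begin{verbatim}
import torch.nn.functional as F

def targeted_fgsm_attack(model, x, target, eps):
    """Targeted evasion: nudge the input towards an attacker-chosen class."""
    x_adv = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x_adv), target)
    loss.backward()
    # Descend rather than ascend: lower the loss of the target class.
    return (x_adv - eps * x_adv.grad.sign()).detach()
\end{verbatim}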
\subsubsection{Defenses against adversarial attacks} are specific to the domain
in which a model operates. \textcite{xuAdversarialAttacksDefenses2020} describe
different attacks and defenses for image, graph and text data in deep neural
networks. Defending against adversarial attacks often has negative impacts on
training time and accuracy \cite{tsiprasRobustnessMayBe2019}. Balancing these
trade-offs is therefore critical for real-world applications.
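A widely studied defense is adversarial training, in which training batches are
replaced or augmented with adversarially perturbed versions of themselves. The
schematic step below reuses the hypothetical \texttt{fgsm\_attack} sketch from
above; it mainly illustrates the additional computation incurred per update,
while the accuracy trade-off noted by \textcite{tsiprasRobustnessMayBe2019}
stems from fitting the model to the perturbed inputs instead of the clean ones.
\begin{verbatim}
import torch.nn.functional as F

def adversarial_training_step(model, optimizer, x, y, eps):
    """One training step on adversarially perturbed inputs."""
    x_adv = fgsm_attack(model, x, y, eps)  # extra forward/backward per batch
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x_adv), y)
    loss.backward()
    optimizer.step()
    return loss.item()
\end{verbatim}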
\subsection{Non-discrimination and Fairness}
\subsection{Explainability}
\subsection{Privacy}
\subsection{Accountability and Auditability}
\subsection{Environmental Well-Being}
\section{Social Computing}
\label{sec:social-computing}