diff --git a/trustworthy-ai.bib b/trustworthy-ai.bib
index 007f660..c7ce200 100644
--- a/trustworthy-ai.bib
+++ b/trustworthy-ai.bib
@@ -152,6 +152,23 @@
   file = {/home/zenon/Zotero/storage/FZVU8FXW/Mehrabi et al. - 2021 - A Survey on Bias and Fairness in Machine Learning.pdf}
 }
 
+@inproceedings{ribeiroWhyShouldTrust2016,
+  title = {Why {{Should I Trust You}}?: {{Explaining}} the {{Predictions}} of {{Any Classifier}}},
+  shorttitle = {"{{Why Should I Trust You}}?"},
+  booktitle = {Proceedings of the 22nd {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}}},
+  author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos},
+  date = {2016-08-13},
+  series = {{{KDD}} '16},
+  pages = {1135--1144},
+  publisher = {{Association for Computing Machinery}},
+  location = {{New York, NY, USA}},
+  doi = {10.1145/2939672.2939778},
+  abstract = {Despite widespread adoption, machine learning models remain mostly black boxes. Understanding the reasons behind predictions is, however, quite important in assessing trust, which is fundamental if one plans to take action based on a prediction, or when choosing whether to deploy a new model. Such understanding also provides insights into the model, which can be used to transform an untrustworthy model or prediction into a trustworthy one. In this work, we propose LIME, a novel explanation technique that explains the predictions of any classifier in an interpretable and faithful manner, by learning an interpretable model locally around the prediction. We also propose a method to explain models by presenting representative individual predictions and their explanations in a non-redundant way, framing the task as a submodular optimization problem. We demonstrate the flexibility of these methods by explaining different models for text (e.g. random forests) and image classification (e.g. neural networks). We show the utility of explanations via novel experiments, both simulated and with human subjects, on various scenarios that require trust: deciding if one should trust a prediction, choosing between models, improving an untrustworthy classifier, and identifying why a classifier should not be trusted.},
+  isbn = {978-1-4503-4232-2},
+  keywords = {black box classifier,explaining machine learning,interpretability,interpretable machine learning},
+  file = {/home/zenon/Zotero/storage/5F5BJIAT/Ribeiro et al. - 2016 - Why Should I Trust You Explaining the Predicti.pdf}
+}
+
 @online{roseFaceDetectionCamerasGlitches2010,
   title = {Face-{{Detection Cameras}}: {{Glitches Spur Charges}} of {{Racism}}},
   author = {Rose, Adam},
diff --git a/trustworthy-ai.tex b/trustworthy-ai.tex
index b74d5d4..1411229 100644
--- a/trustworthy-ai.tex
+++ b/trustworthy-ai.tex
@@ -331,7 +331,43 @@
 ever-increasing model complexity, made possible by massive deep neural networks
 (DNNs) and other similarly complex architectures. Due to their size models are
 treated as black-boxes with no apparent way to know how a particular prediction
 came to be. This lack of explainability disallows humans to trust artificial
-intelligence systems especially in critical areas such as medicine.
+intelligence systems, especially in critical areas such as medicine. To counter
+this trend towards increasingly opaque artificial intelligence systems, a new
+research field called \emph{eXplainable Artificial Intelligence} (XAI) has
+emerged.
+
+Scholars distinguish between two related but distinct terms:
+\emph{explainability} and \emph{interpretability}. Interpretable systems allow
+humans to \emph{look inside} the model and anticipate which predictions it will
+make. This is only possible if most or all parameters of the model are visible
+to an observer and changes to those parameters lead to predictable changes in
+the outputs. Explainability, on the other hand, applies to black-box systems
+such as deep neural networks, where the system explains its predictions after
+the fact.
+
+The definition of interpretability already suggests one way to obtain
+explainable models: if a model is constructed such that its parameters are
+visible and a decision can be traced from the inputs to the outcome, the model
+is inherently explainable. Examples are decision trees, linear regression
+models, rule-based models, and Bayesian networks. This approach is not feasible
+for neural networks, so \emph{model-agnostic explanations} have to be found.
+\textsc{LIME} \cite{ribeiroWhyShouldTrust2016} is a tool for producing such
+model-agnostic explanations. \textsc{LIME} works \enquote{…by learning an
+interpretable model locally around the prediction}
+\cite[p.~1]{ribeiroWhyShouldTrust2016}. An advantage of this approach is that
+\textsc{LIME} can be applied to any model, regardless of how it is constructed.
+Because model\nobreakdash-agnostic explanations are so flexible, they can even
+be used for models that are already interpretable, such as random forest
+classifiers.
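+
+As a brief sketch of how such an explanation can be obtained in practice, the
+following snippet uses the open-source \texttt{lime} Python package together
+with a scikit-learn random forest; the Iris data set and the chosen classifier
+merely serve as placeholders:
+
+\begin{verbatim}
+# Sketch: explain one prediction of a random forest with LIME.
+# Assumes the `lime` and `scikit-learn` packages are installed;
+# the Iris data set is only a placeholder example.
+from sklearn.datasets import load_iris
+from sklearn.ensemble import RandomForestClassifier
+from lime.lime_tabular import LimeTabularExplainer
+
+data = load_iris()
+clf = RandomForestClassifier(random_state=0).fit(data.data, data.target)
+
+explainer = LimeTabularExplainer(
+    data.data,
+    feature_names=data.feature_names,
+    class_names=data.target_names,
+    mode="classification",
+)
+
+# LIME perturbs the instance, queries the black-box model, and fits a
+# sparse linear model locally around that prediction.
+explanation = explainer.explain_instance(
+    data.data[0], clf.predict_proba, num_features=4
+)
+print(explanation.as_list())  # (feature, weight) pairs of the local model
+\end{verbatim}
+The returned feature--weight pairs indicate how strongly each feature pushes
+the locally fitted surrogate model towards or away from the explained class.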
+
+Deep neural networks can also be explained using either a \emph{gradient-based}
+or a \emph{perturbation-based} explanation algorithm. Gradient-based algorithms
+evaluate how much the outputs change when the inputs are modified: a large
+gradient with respect to a set of inputs means that those inputs have a large
+effect on the outputs, whereas a small gradient indicates that changes to those
+inputs barely affect the outputs. Perturbation-based explanations instead
+search for the perturbations of the inputs that alter the model's predictions
+the most. \textsc{LIME} is an example of a perturbation-based explanation
+algorithm.
 
 \subsection{Privacy}