\documentclass[11pt]{article}
\usepackage{amsmath}
\usepackage{amssymb}
\begin{document}


\begin{center}
{\bf \Large{Large deviations}}
\bigskip

{Francis~Comets
\\
Universit{\'e} Paris Diderot, France\\
{\tt http://www.proba.jussieu.fr/\~{}comets/}
}
\end{center}


Large deviations is concerned with the study of rare events and of
small probabilities. Let $X_i, 1 \leq i \leq n,$ be independent identically
distributed (i.i.d.) real random
variables with expectation $m$, and $\bar X_n=(X_1+\ldots X_n)/n$ their
empirical mean. The law of large numbers shows that, for any Borel
$A \subset {\mathbb R}$ not containing $m$ in its closure,
$P(\bar X_n \in A) \to 0$ as $n \to \infty$, but does not tell us how fast
the probability vanishes. Large deviations state it is exponential in $n$, and
give us the rate of decay.
{\bf Cram\'er's theorem} states that,
\begin{equation*}
P(\bar X_n \in A) = \exp -n\big(\inf\{I(x); x \in A\} + o(1)\big)
\end{equation*}
as $n \to \infty$, for all interval $A$.
The rate function $I$ can be computed as the Legendre conjugate of
the logarithmic moment generating function of $X$,
\begin{equation*}
I(x)=\sup\{ \lambda x - \ln E \exp( \lambda X_1); \lambda \in {\mathbb R}\},
\end{equation*}
and is called the Cram\'er transform of the common law of the $X_i$'s.
The natural assumption is the finiteness of the moment generating function
in a neighborhood of the origin, i.e., the property of
exponential tails. The function $I: {\mathbb R} \to [0, +\infty]$
is convex with $I(m)=0$.

\noindent
$\bullet$ In the Gaussian case $X_i \sim {\cal N}(m, \sigma^2)$, we find
$I(x)=(x-m)^2/(2\sigma^2)$;\\
$\bullet$ In the Bernoulli case
$P(X_i=1)=p=1-P(X_i=0)$, we find the entropy function
$I(x)=x \ln(x/p)+(1-x) \ln(1-x)/(1-p)$ for $ x\in [0,1]$, and $I(x)=+\infty$ otherwise.
\medskip

To emphasize the importance of rare events, let us mention a consequence,
the {\bf Erd\"os-R\'enyi law}: consider an infinite sequence
$X_i, i \geq 1$, of
Bernoulli i.i.d. variables with parameter $p$, and define $R_n$ the
length of the longest consecutive run, contained within the first $n$ tosses,
in which the fraction of 1's is at least $a$ ($a >p$).
Erd\"os and R\'enyi proved that, almost surely as $n \to \infty$,
$$
R_n/ \ln n \longrightarrow I(a)^{-1} ,
$$
with the function $I$ from the Bernoulli case above. Though it may look
paradoxical, large deviations
are at the core of this event of full probability. This result is the basis of
bioinformatics applications like sequence matching,
and of statistical tests for sequence randomness.
\bigskip

The theory does not only apply to independent variables,
%but also to a variety of frameworks
%(general state space, Markov or Gaussian processes, large deviations from
%ergodic theorems\ldots).
but allows for many variations, including weakly dependent variables in a
general state space, Markov or Gaussian processes, large deviations from
ergodic theorems, non-asymptotic bounds, asymptotic expansions (Edgeworth
expansions), \ldots.
\bigskip

Here is the {\bf formal definition}.
Given a Polish space (i.e., a separable complete metric space) $\mathcal X$,
let $\{ \mathbb{P}_n\}$ be a sequence of
Borel probability measures on $\mathcal X$, let ${a_n}$ be a positive sequence
tending to infinity, and finally let $I:{\mathcal X}
\to [0,+\infty]$ be a lower continuous functional on $X$ which level sets$\{x: I(x)\leq a\}$ are compact for all $a <\infty$. We say that
the sequence $\{ \mathbb{P}_n\}$ satisfies a large deviation principle with speed ${a_n}$ and rate $I$, if for each measurable set $E \subset X$
$$
-\inf_{x \in E^\circ} I(x) \le \varliminf_n a_n^{-1}
\ln\mathbb{P}_n(E) \le \varlimsup_n a_n^{-1}
\ln\mathbb{P}_n(E) \le -\inf_{x \in \bar{E}} I(x)
$$
where $\bar{E}$ and $E^\circ$ denote respectively the closure and interior of
$ E.$ The rate function can be obtained as
\begin{equation*}
I(x)= - \lim_{\delta \searrow 0} \lim_{n \to \infty} a_n^{-1}
\ln\mathbb{P}_n( B(x, \delta)),
\end{equation*}
with $B(x, \delta)$ the ball of center $x$ and radius $\delta$. Large deviation theory allows for an abstract version
of Laplace method for estimating integrals:
Varadhan's lemma states that, for any continuous
function $F: X \to {\mathbb R}$ with
$$
\lim_{M \to \infty} \limsup_{n \to \infty} a_n^{-1} \ln \int_{F(x)\geq M} e^{a_nF(x)} dP_n(x) = - \infty
$$
(a bounded $F$ is fine),
we have
$$
\lim_{n \to \infty} a_n^{-1} \ln \int_X e^{a_nF(x)} dP_n(x) = \sup_x \{F(x)-I(x)\},
$$
and the sequence of probability measures $ e^{a_nF} dP_n/ \int e^{a_nF} dP_n$
concentrates on the set of maximizers.

\bigskip

{\bf Sanov's theorem} and sampling with replacement: let $\mu$ be a probability
measure on a set $\Sigma$ that we assume finite for simplicity,
with $\mu(y)>0$ for all $y \in \Sigma$.
Let $Y_i, i \geq 1$, an i.i.d. sequence with law $\mu$,
and $N_n$ the score vector of the $n$-sample,
$$
N_n (y) = \sum_{i=1}^n {\bf 1}_{y}(Y_i).
$$
By the law of large numbers, $N_n/n \to \mu$ a.s. From the multinomial distribution, one can check that,
for all $\nu$ such that $n \nu$ is a possible score vector for the $n$-sample,
$$
(n+1)^{-|\Sigma|} e^{-n H(\nu | \mu)} \leq
P(n^{-1} N_n = \nu ) \leq e^{-n H(\nu | \mu)} ,
$$
where $H(\nu | \mu)=\sum_{y \in \Sigma} \nu(y) \ln \frac{\nu(y)}{\mu(y)}$
is the relative entropy of $\nu$ with respect to $\mu$.
The large deviations theorem holds for the empirical distribution
of a general $n$-sample, with speed $n$ and rate $I(\nu)=H(\nu | \mu)$
given by the natural generalization of the above formula.
This result, due to Sanov,
has many consequences in information theory and statistical
mechanics \cite{DZ, dH}, and for exponential families in statistics.
Applications in statistics also include point estimation (by giving the
exponential rate of convergence of $M$-estimators) and for
hypothesis testing (Bahadur efficiency) \cite{Kes}, and concentration
inequalities \cite{DZ}.

\bigskip
Consider now a {\bf Markov chain} $(Y_n, n \geq 0)$. For simplicity we assume that it is irreducible with a finite state space
$\Sigma$. We denote by $Q=(Q(i,j); i,j \in \Sigma)$ the transition matrix, and for any $V: \Sigma \to {\mathbb R}$,
$Q_V(i,j)=Q(i,j)e^{V(j)}$. By Perron-Frobenius theorem, $n^{-1} \ln \sum_j Q_V^n(i,j) \to \ln \lambda_V(Q)$
as $n \to \infty$, with $\lambda_V(Q)$ the principal eigenvalue of the positive matrix $Q_V$. By the ergodic theorem,
$N_n/n$ converges to the (unique) invariant law for $Q$.
The law of the empirical distribution $N_n/n$ satisfes a large deviation principle with speed $n$ and rate $I_Q$
given by
$$
I_Q(\nu)= \sup_V \{ \sum_j V(j) \nu(j) - \ln \lambda_V(Q)\}
$$
for any law $\nu$ on $\Sigma$.

\bigskip

 Consider next a {\bf Markov process}  $(Y_t, t \in {\mathbb R}^+)$. We assume it is  irreducible on the finite state space $\Sigma$, and denote by $a(i,j)$ the transition rate from $i$ to $j$ ($a(i,j)\geq 0$ for $i \neq j$, $\sum_j a(i,j)=0$).  
Then, similarly to the time-discrete case, the law of the empirical distribution $(t^{-1} \int_0^t {\bf 1}_j(Y_s)ds; j \in \Sigma)$   satisfes a large deviation principle with speed $t$ and rate $I_a$ given by
$$
I_a(\nu)= \sup_V \{ \sum_j V(j) \nu(j) - \lambda_V(a)\},
$$
with $ \lambda_V(a)$ the principal eigenvalue of the matrix $(a(i,j)+\delta(i,j)V(j))_{i,j}$.
Now, assume in addition that the process is reversible with respect to a probability measure $\pi$, i.e. $\pi(i) a(i,j)=
\pi(j) a(j,i)$ for all $i,j$. Then, $\pi$ is the invariant measure for the process and $\pi(i)>0$ for all $i \in \Sigma$.
Using the variational formula for eigenvalues of symmetric operators, Donsker and Varadhan found that the rate function takes a simple form in the reversible case,
$$
I_a(\nu)= \frac{1}{2} \sum_{i,j} \pi(i) a(i,j) \left( \sqrt{ \frac{\nu(i)}{\pi(i)}} -\sqrt{ \frac{\nu(j)}{\pi(j)}} \right)^2 ,
$$
that is the value of the Dirichlet form of the reversible process on the square root of the density of $\nu$
with respect to the invariant measure.

\bigskip

The {\bf Freidlin-Wentzell theory} deals with diffusion processes with small noise,
$$
d X^{\epsilon}_t = b(X^{\epsilon}_t) dt + \sqrt{\epsilon}\; \sigma(X^{\epsilon}_t) dB_t\;
, \qquad X^{\epsilon}_0=y.
$$
The coefficients $b, \sigma$ are uniformly lipshitz functions,
and $B$ is a standard
Brownian motion. The sequence $X^{\epsilon}$ can be viewed as $\epsilon \searrow 0$
as a small random perturbation of the ordinary differential equation (ODE)
$$
d x_t = b(x_t ) dt\;,\qquad x_0=y.
$$
Indeed, $X^{\epsilon} \to x$ in the supremum norm on bounded time-intervals.
Freidlin and Wentzell have shown that, on a finite time interval $[0,T]$,
the sequence $X^{\epsilon}$ with values in the path space obeys the LDP with speed $\epsilon^{-1}$ and rate function
$$
I_{0,T}(\phi)= \frac{1}{2}\int_0^T \sigma(\phi(t))^{-2}
\Big(\dot{\phi(t)}-b(\phi(t))\Big)^2 dt
$$
if $\phi$ is absolutely continuous with square-integrable derivative and
$\phi(0)=y$; $I(\phi)= \infty$ otherwise. (To fit in the above formal
definition, take a sequence $\epsilon= \epsilon_n \searrow 0$, and
for $\mathbb{P}_n$ the law of $X^{\epsilon_n}$.) Note that $I(\phi)=0$ if and only if $\phi$ is a solution of the
above ODE. To follow closely any other path $\phi$ during a finite time $T$ is an event of probability $\exp \{-\epsilon^{-1}
I_{0,T}(\phi)\}$ at leading order, and therefore is very rare for small $\epsilon$.

A simple case is $\sigma=1$
and $b(x)=- V'(x)$ with a smooth $V$; we view $V(x)$ as the height of $x$, and
a key role is played by the local minima of $V$.
%, say $(z_i; i =\ldots,-1,0,1,2,\ldots)$.
With an overwhelming probability as $\epsilon \searrow 0$, the picture will be as follows in a generic situation. The process $X^\epsilon$
will stay close to the solution of the ODE starting from $y$, and will eventually come near the local minimum (say $z_0$)
which attracts $y$, and stay around for times of order $\exp o(\epsilon^{-1})$. But, by ergodicity, it will leave
the neighborhood of $z_0$ at some time, and, even more, it will visit all points: it is then important how these large deviations occur.
Let $D$ be domain of attraction of $z_0$,
$h$ be its depth (i.e., the height difference between $z_0$ and the lowest point on the boundary of $D$, that we call the lowest pass).
Up to times of order $\exp (\epsilon^{-1} h)$, the process remains in the part of $D$ of relative height smaller than $h$,
and will occupy this region with density proportional to $\exp -(\epsilon^{-1} V(\cdot))$. At some random time $\tau$ of order
$\exp (\epsilon^{-1} h)$, it will leave $D$ through the lowest path, and fall down towards a new local minimum,
following roughly a path of the ODE. The piece of path just before leaving $D$ is the time-reversed of an ODE path.
The ratio of $\tau$ to its expected value converges to an exponential law.


The { Freidlin-Wentzell theory} has applications in physics (metastability
phenomena, analysis of rare events)
and engineering (tracking loops, statistical
analysis of signals, stabilization of systems and
algorithms)
\cite{FV, Azencott, DZ, OV}.

\bigskip

{\it Acknowledgement:}
This article is based on an article from Lovric, Miodrag (2011), International
Encyclopedia of Statistical Science. Heidelberg: Springer Science$+$Business Media, LLC

\bigskip

\begin{thebibliography}{99}

\bibitem{Azencott}
Azencott, R.
{\it Grandes d\'eviations et applications.} (French) Eighth Saint Flour Probability Summer School---1978, 1--176,
Lecture Notes in Math. 774, Springer, Berlin, 1980

\bibitem{DZ}
Dembo, Amir; Zeitouni, Ofer:
{\it Large deviations techniques and applications}.
Springer, New York, 1998.
%%%%%%%%%%%%%%%%
\bibitem{DeuschelStroock}
Deuschel, J.-D., Stroock, D.
{\it Large deviations.} Academic Press, Inc., Boston, MA, 1989
%%%%%%%%%
\bibitem{FengKurtz}
Feng, J., Kurtz, T.
\textit{Large deviations for stochastic processes.} American Mathematical Society, Providence, RI, 2006

%%%
\bibitem{dH}
den Hollander, Frank:
{\it Large deviations.} American Mathematical Society, Providence, RI, 2000.
%%%%%%%%%%%%%%%%
\bibitem{FV}
Freidlin, M. I.; Wentzell, A. D.:
{\it Random perturbations of dynamical systems.}
Springer-Verlag, New York, 1998.
%%%%%%%%%%%%%
\bibitem{Kes}
Kester, A.:
{\it Some large deviation results in statistics.}
CWI Tract, 18. Centrum voor Wiskunde en Informatica, Amsterdam, 1985.
%%%%%%%%%%%%%
\bibitem{KipnisLandim}
Kipnis, C., Landim, C.:
{\it Scaling limits of interacting particle systems.}
Springer-Verlag, Berlin, 1999
%%%%%%%%%
\bibitem{OV}
Olivieri, Enzo; Vares, Maria Eul\'alia:
{\it Large deviations and metastability. }
Cambridge University Press, 2005.
%%%%%%%%%%
\bibitem{Varadhan}
Varadhan, S. R. S.:
Large deviations. \textit{Ann. Probab.} \textbf{ 36} (2008), 397--419
\end{thebibliography}

\end{document}