\documentstyle[12pt,epsfig]{article} \textwidth6.25in \textheight8.5in \oddsidemargin.25in \topmargin0in %\renewcommand{\baselinestretch}{2.0} \def\be{\begin{equation}} \def\ee{\end{equation}} \def\la{\langle} \def\ra{\rangle} \def\uom{\underline{\omega}} \def\IP{\hbox{\rm I\kern -1.6pt{\rm P}}} \def\IC{{\hbox{\rm C\kern-.58em{\raise.53ex\hbox{$\scriptscriptstyle|$}} \kern-.55em{\raise.53ex\hbox{$\scriptscriptstyle|$}} }}} \def\IQ{{\hbox{\rm Q\kern-.64em{\raise.53ex\hbox{$\scriptscriptstyle|$}} \kern-.55em{\raise.53ex\hbox{$\scriptscriptstyle|$}} }}} \def\IN{\hbox{I\kern-.2em\hbox{N}}} \def\IR{\hbox{\rm I\kern-.2em\hbox{\rm R}}} \def\ZZ{\hbox{{\rm Z}\kern-.3em{\rm Z}}} %\def\IT{\hbox{\rm T\kern-.38em{\raise.415ex\hbox{$\scriptstyle|$}} }} \def\IT{{\rm Tor}} \def\I1{\hbox{{1}\kern-.28em\hbox{I}}} \newtheorem{theorem}{Theorem}[section] %\newtheorem{theorem}{Theorem} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{sublemma}[theorem]{Sublemma} \newtheorem{proposition}[theorem]{Proposition} \newtheorem{corollary}[theorem]{Corollary} \newtheorem{remark}[theorem]{Remark} \begin{document} \begin{center} {\Huge Dynamical Systems, MA 760} \end{center} %\renewcommand{\theequation}{\arabic{section}.\arabic{equation}} \section{Measure Theory (reminder)} \noindent{\sc 1.1 Definition}. A $\sigma${\bf -algebra} $\cal B$ for a set $X$ is a collection of subsets of $X$ such that \begin{itemize} \item[(i)] $\emptyset,X\in{\cal B}$ \item[(ii)] $\{B_i\}_{i=1}^{\infty}\in{\cal B}\ \ \Longrightarrow\ \ \cup_{i=1}^{\infty}B_i\in{\cal B}$ \item[(iii)] $B\in{\cal B}\ \ \Longrightarrow\ \ B^c\in{\cal B}$ \end{itemize} A pair $(X,{\cal B})$ is called a {\bf measurable space}. Sets $B\in{\cal B}$ are said to be {\bf measurable}.\\ \noindent{\sc 1.2 Definition}. 
A {\bf measure} $m$ on $(X,\cal B)$ is a function $m:{\cal B}\to\IR\cup\{+\infty\}$ such that \begin{itemize} \item[(i)] $m(B)\geq 0$ for all $B\in{\cal B}$ \item[(ii)] $m(\emptyset)=0$ \item[(iii)] $\{B_i\}_{i=1}^{\infty}\in{\cal B}$ and $B_i\cap B_j= \emptyset$ for $i\neq j$ $\ \ \Longrightarrow\ \ m\left (\cup_{i=1}^{\infty}B_i\right ) =\sum_{i=1}^{\infty}m(B_i)$ \end{itemize} The property (iii) is called $\sigma${\bf -additivity} or {\bf countable additivity}. We use the obvious convention: if $m(B_i)=\infty$ for some $i$, then $\sum m(B_i)=\infty$. $\Box$\\ \noindent{\sc 1.3 Exercise}. Show that if $m(X)<\infty$, then the clause (ii) of Definition~1.2 follows from (i) and (iii). Construct an example of a function $m:{\cal B}\to\IR\cup\{+\infty\}$ that satisfies (i) and (iii) but not (ii).\\ \noindent{\sc 1.4 Remark}. We say that a $\sigma$-algebra $\cal B$ is closed under countable (and hence, also finite) unions. It is easy to show that $\cal B$ is also closed under countable (and finite) intersections, i.e. $\{B_i\}_{i=1}^{\infty}\in{\cal B}\ \ \Longrightarrow\ \ \cap_{i=1}^{\infty}B_i\in{\cal B}$ [to prove this, just use the formula $\cap_{i=1}^{\infty}B_i=\left ( \cup_{i=1}^{\infty} B_i^c\right )^c\;$]. Also, $\cal B$ is closed under differences and symmetric differences, i.e. $A,B\in{\cal B}\ \ \Longrightarrow\ \ A\setminus B\in{\cal B}$ and $A\Delta B\in\cal B$, where $A\Delta B = (A\setminus B)\cup (B\setminus A)$ [the proof is simple].\\ \noindent{\sc 1.5 Remark}. Measures have the following simple properties: $A\subset B\ \ \Longrightarrow\ \ m(B\setminus A) =m(B)-m(A)$ and $m(A)\leq m(B)$. In particular, $m(B)\leq m(X)$ for all $B\in\cal B$. If $m(X)<\infty$, then $m$ is said to be {\bf finite} (otherwise, it is called {\bf infinite}). If $m(X)=1$, then $m$ is called a {\bf probability measure} or just a {\bf probability}. In this course, we will only deal with probability measures.\\ \noindent{\sc 1.6 Remark}. 
For any set $X$, there are two trivial $\sigma$-algebras. One is {\em minimal}, it consists of the sets $X$ and $\emptyset$ only. The other is {\em maximal}, it contains all the subsets of $X$. The latter one is denoted by $2^X$. (Note: if $X$ is a finite set of $n$ elements, then the maximal $\sigma$-algebra contains exactly $2^n$ sets.)\\ \noindent{\sc 1.7 Examples of measures:}\\ \noindent (a) Let $X=(a,b)\subset\IR$, $\cal B$ the Borel $\sigma$-algebra of $X$, and $m$ the Lebesgue measure on $X$. When $a=-\infty$ or $b=\infty$ (or both), then $m$ is infinite, otherwise $m$ is finite.\\ \noindent (b) Let $X=(a,b)\subset\IR$, $\cal B$ the Borel $\sigma$-algebra of $X$, again $m$ the Lebesgue measure on $X$, and $f:X\to\IR$ is an integrable nonnegative function. Then $$ \mu(B)=\int_B f(x)\, dm(x)\ \ \ \ \ \ \ {\rm for}\ \ B\in{\cal B} $$ defines a measure $\mu$ on $X$. The function $f(x)$ is called the {\bf density} of the measure $\mu$. \\ \noindent (c) Let $(X,{\cal B})$ be an arbitrary measurable space and $x\in X$ a selected point. The measure $\delta_x$ defined by $$ \delta_x(B)=\left\{\begin{array}{ll} 1 & {\rm if}\ \ x\in B\\ 0 & {\rm otherwise}\end{array}\right . $$ is called a {\bf delta-measure} or a {\bf Dirac measure} (concentrated at $x$).\\ \noindent (d) Let $X$ be a finite or countable set, say, $X=\{1,2,\ldots\}$. Then any measure $m$ on $(X,2^X)$ is determined by the numbers $p_i=m(\{i\})$, $i\in X$, because $$ m(B)=\sum_{i\in B}p_i\ \ \ \ \ \ \ {\rm for}\ {\rm any}\ \ B\subset X $$ \noindent{\sc 1.8 Convention}. Whenever $X$ is a finite or countable set, then we always consider the $\sigma$-algebra $2^X$. If $X\subset\IR$, then we consider the Borel $\sigma$-algebra (unless otherwise stated).\\ \noindent{\sc 1.9 Remark}. $\sigma$-algebras are not necessarily closed under uncountable unions or intersections. 
If they were, then the Borel $\sigma$-algebra for $\IR$ would contain {\em all} the subsets of $\IR$, which we know is not the case.\\ \noindent{\sc 1.10 Exercise}. Let $X$ be a finite set, say, $X=\{1,2,\ldots,n\}$. Describe all probability measures on $X$. Hint: use Example 1.7(d) and recall the notion of a {\em simplex} from geometry.\\ \noindent{\sc 1.11 Remark}. Let $m$ be a measure on $(X,{\cal B})$ and $c\geq 0$. Then $cm$ is a measure defined by $(cm)(B)=c\cdot m(B)$ for all $B\in{\cal B}$. Let $m_1$ and $m_2$ be two measures on $(X,{\cal B})$. Then $m_1+m_2$ is a measure defined by $(m_1+m_2)(B)=m_1(B)+m_2(B)$ for all $B\in{\cal B}$. Hence, we can add measures and multiply them by nonnegative constants.\\ \noindent{\sc 1.12 Lemma}. If $m_1$ and $m_2$ are two probability measures on $(X,{\cal B})$, then $pm_1+(1-p)m_2$ is a probability measure for every $0\leq p\leq 1$. Hence, the set of all probability measures on $(X,{\cal B})$ is {\em convex}.\\ \noindent{\sc 1.13 Definition}. If $m$ is a finite measure on $(X,{\cal B})$ with $m(X)>0$, then the measure $m_1=cm$, where $c=1/m(X)$, is a probability measure. The multiplication of $m$ by $1/m(X)$ is called the {\bf normalization}, and $m_1$ is called the {\bf normalized measure}.\\ Note: $\sigma$-algebras are usually quite complicated and contain many ``weird'' sets. Fortunately, it is often enough to deal with certain ``nice'' sets that ``represent'' the entire $\sigma$-algebra. \\ \noindent{\sc 1.14 Definition}. 
An {\bf algebra} $\cal A$ for a set $X$ is a collection of subsets of $X$ such that \begin{itemize} \item[(i)] $\emptyset,X\in{\cal A}$ \item[(ii)] $\{A_i\}_{i=1}^{n}\in{\cal A}\ \ \Longrightarrow\ \ \cup_{i=1}^{n}A_i\in{\cal A}$ \item[(iii)] $A\in{\cal A}\ \ \Longrightarrow\ \ A^c\in{\cal A}$ \end{itemize} [Note the difference from Definition 1.1: now only finite unions are required to belong in $\cal A$, not countable.]\\ \medskip \noindent{\sc 1.15 Examples of algebras}:\\ \noindent (i) Let $X=[a,b]\subset\IR$. Finite unions of subintervals\footnote{This includes open, closed, and semi-open intervals, like $(c,d)$, $[c,d]$, $(c,d]$ and $[c,d)$.} of $X$ make an algebra. \\ \noindent (ii) Let $X=\IR$. Finite unions of intervals (including infinite intervals like $(a,\infty)$ and $(-\infty,b)$) make an algebra.\\ \noindent{\sc 1.16 Lemma}. The intersection of any family of $\sigma$-algebras of a set $X$ is always a $\sigma$-algebra of $X$ (the family itself may be finite, countable or uncountable). The same property holds for algebras.\\ \noindent{\sc 1.17 Definition}. Let $\cal J$ be any collection of subsets of $X$. The intersection of all $\sigma$-algebras (algebras) containing $\cal J$ is the {\em minimal} $\sigma$-algebra (resp., algebra) containing $\cal J$. It is called the $\sigma$-algebra (resp., algebra) {\bf generated by} $\cal J$ and denoted by ${\cal B}({\cal J})$ (resp., by ${\cal A}({\cal J})$).\\ A simple but useful fact: if $\cal J$ is finite (countable), then ${\cal A}({\cal J})$ is also finite (countable).\\ \noindent{\sc 1.18 Definition}. Let $X$ be a topological space. Then the $\sigma$-algebra generated by the collection of all open sets is called the {\bf Borel} $\sigma$-algebra of $X$. Sets in this $\sigma$-algebra are called {\bf Borel sets}. Any measure defined on the Borel $\sigma$-algebra of $X$ is called a {\bf Borel measure}.\\ The following theorem is particularly helpful in many proofs:\\ \noindent{\sc 1.19 Approximation Theorem}. 
Let $m$ be a probability measure on $(X,{\cal B})$ and let $\cal A$ be an algebra which generates $\cal B$, i.e. such that ${\cal B}({\cal A}) ={\cal B}$. Then for any $B\in\cal B$ and any $\varepsilon>0$ there is an $A\in\cal A$ such that $m(A\Delta B)<\varepsilon$. That is, the sets of the $\sigma$-algebra $\cal B$ can be approximated arbitrarily well by sets of the algebra $\cal A$. \\ For constructing measures, the next theorem can be very useful. \\ \noindent{\sc 1.20 Definition}. Let $\cal A$ be an algebra of $X$. A nonnegative function $m_0:{\cal A}\to\IR$ is said to be $\sigma${\bf -additive} (or {\bf countably additive}) if for any sequence $\{A_i\}_{i=1}^{\infty}$ of disjoint sets $A_i\in\cal A$ such that $\cup_{i=1}^{\infty}A_i\in\cal A$ we have $m_0\left (\cup_{i=1}^{\infty}A_i\right) =\sum_{i=1}^{\infty}m_0(A_i)$. Note: we have to assume that $\cup_{i=1}^{\infty}A_i\in\cal A$, since this does not automatically hold for algebras.\\ \noindent{\sc 1.21 Extension Theorem}. Let $\cal A$ be an algebra of $X$ and $m_0:{\cal A}\to\IR$ a $\sigma$-additive nonnegative function. Then there is a unique finite measure $m$ on $(X,{\cal B}({\cal A}))$ that coincides with $m_0$ on $\cal A$. (We say that $m$ {\bf extends} $m_0$ from $\cal A$ to ${\cal B}({\cal A})$.) Therefore, to construct a measure on a $\sigma$-algebra, it is sufficient to construct a $\sigma$-additive function on an algebra that generates the $\sigma$-algebra. \\ \noindent{\sc 1.22 Theorem}. Let $(X,{\cal B})$ be a measurable space and $\cal J$ a collection of subsets of $X$ that is closed under finite intersections and generates $\cal B$, i.e. such that ${\cal B} ({\cal J})={\cal B}$. Suppose two finite measures, $\mu_1$ and $\mu_2$, agree on $\cal J$, i.e. $\mu_1(A)=\mu_2(A)$ for all $A\in {\cal J}$, and $\mu_1(X)=\mu_2(X)$. Then $\mu_1=\mu_2$. \\ \noindent{\sc 1.23 Corollary}. If two finite Borel measures, $\mu_1$ and $\mu_2$, on $X=(a,b)\subset\IR$ agree on subintervals of $X$, then $\mu_1=\mu_2$. 
It is enough to require the agreement for all open intervals or for all closed intervals only.\\ \noindent{\sc 1.24 Corollary}. Let $X$ be a topological space and $\cal B$ its Borel $\sigma$-algebra. If two measures agree on the open sets, then they are equal. \\ Theorems 1.19, 1.21, and 1.22 are given without proofs here. Some were proved in Real Analysis. In any case, their proofs are beyond the scope of this course. \newpage \section{Simplest Examples and Basic Definitions} \noindent{\sc 2.1 Circle Rotation}. Let $X$ be a circle and $T:X\to X$ a transformation specified by rotating the circle $X$ through some angle. The circle $X$ can be coordinatized by the (polar) angle $\theta \in [0,2\pi)$ and the map $T$ specified by $T(\theta)=\theta+\theta_0$ (mod $2\pi$), where $\theta_0$ is the angle of rotation. Alternatively, we can use a complex variable $z$ and define $X=\{|z|=1\}$ and $T(z)=e^{i\theta_0}z$. However, we prefer to have a circle of unit length and use the coordinate $x$ on $X$ such that $0\leq x<1$. Equivalently, $X$ can be thought of as a closed unit interval $[0,1]$ with the endpoints 0 and 1 identified. Then we set $T(x)=x+a$ (mod 1), where the constant $a$ plays the role of the angle of rotation.\\ \noindent{\sc 2.2 Doubling map}. Let $X=[0,1)$ and $T:X\to X$ be a function defined by $T(x)=2x$ (mod 1). Again, one can think of $X$ as the unit circle and $x$ the angle measure, then $T$ doubles angles. For this reason $T$ is also called the {\bf angle doubling map}. \\ \begin{figure}[h] \centering \epsfig{figure=ds-1.eps}\caption{The circle rotation (a) and the doubling map (b).} \end{figure} \noindent{\sc 2.3 Definition}. In dynamical systems, we deal with {\bf iterates} of a given map $T:X\to X$, i.e. 
with the sequence of maps $T^n:X\to X$, $n\geq 1$, defined by $$ T^n=\underbrace{T\circ T\circ \cdots \circ T}_n $$ For any point $x_0\in X$, the sequence $$ x_0,\ x_1=T(x_0),\ x_2=T^2(x_0),\ldots,\ x_n=T^n(x_0),\ldots $$ is called the {\bf trajectory} of $x_0$ (or, sometimes, the {\bf orbit} of $x_0$). We are usually interested in the overall behavior of the sequence $\{x_n\}$ rather than in individual points $x_1,x_2$, etc. The variable $n$ is called {\bf time}\footnote{Note: $n$ only takes integral values. For this reason it is also called {\bf discrete time}.}. We think of $x_0$ as the {\bf initial point} and of $x_n=T^n(x_0)$ as its {\bf image at time} $n$. \\ \noindent{\sc 2.4 Remarks}. For the circle rotation, $T^n(x)=x+na$ (mod 1), so $T^n$ is the rotation through the angle $na$. For the doubling map, $T^n(x)=2^nx$ (mod 1), i.e. the graph of $T^n$ consists of $2^n$ branches, each has slope $2^n$.\\ \noindent{\sc 2.5 Question}. Suppose $A\subset X$ is some subset of interest. We want to see whether a trajectory $\{x_n\}$ hits $A$ at time $n$, i.e. whether $x_n\in A$. This happens whenever $T^n(x_0)\in A$, i.e. whenever $x_0\in (T^n)^{-1}(A)$; here $(T^n)^{-1}$ (also denoted by $T^{-n}$) is the inverse map.\\ \noindent{\sc 2.6 Definition}. For any $n\geq 1$ and any subset $A\subset X$ the set $$ T^{-n}(A)=\{y\in X:\, T^n(y)\in A\} $$ is the {\bf preimage} of $A$ under $T^n$. For any point $x\in X$ the set $T^{-n}(x)=\{y\in X:\, T^n(y)=x\}$ is the {\bf full preimage} of $x$ under $T^n$. Any particular point $y\in T^{-n}(x)$ is called {\bf a preimage} of $x$ under $T^{n}$. Note: $T^{-n}$ is not necessarily a pointwise map on $X$, it takes points to sets (and sets to sets).\\ \noindent{\sc 2.7 Remarks}. For the circle rotation, $T^{-n}(x)=x-na$ (mod 1) is the rotation through the angle $-na$. For the doubling map, $T^{-n}(x)$ is a set consisting of $2^n$ points $\{(x+i)/2^n\}$, $0\leq i \leq 2^n-1$.\\ \noindent{\sc 2.8 Exercise}. 
Verify by direct inspection the following simple properties of $T^{-n}$: \begin{enumerate} \item[(a)] $T^{-m}(T^{-n}(A))=T^{-(m+n)}(A)$ for all $m,n\geq 1$;\item[(b)] if $A\cap B=\emptyset$, then $T^{-n}(A)\cap T^{-n}(B)=\emptyset$;\item[(c)] for any $A\subset X$ we have $T^{-n}(A^c)=(T^{-n}(A))^c$;\item[(d)] for any $A,B\subset X$ we have $T^{-n}(A\cup B)=T^{-n}(A)\cup T^{-n}(B)$;\item[(e)] for any $A,B\subset X$ we have $T^{-n}(A\cap B)=T^{-n}(A)\cap T^{-n}(B)$;\item[(f)] for any $A,B\subset X$ we have $T^{-n}(A\setminus B)=T^{-n}(A)\setminus T^{-n}(B)$; \end{enumerate} Therefore, $T^{-n}$ neatly preserves all the set-theoretic operations. The properties (d) and (e) can be easily extended to countable unions and intersections. A curious remark: most of the above properties fail (!) for $T^n$. Give some counterexamples.\\ Next, if $A\subset X$ is a ``nice'' (say, Borel) set, then we want $T^{-n}(A)$ to be ``nice'' as well.\\ \noindent{\sc 2.9 Definition}. Let $X$ be a set with a $\sigma$-algebra $\cal B$. A transformation $T:\, X\to X$ is called {\bf measurable} if $T^{-1}(B)\in\cal B$ for every $B\in\cal B$. Note: this implies $T^{-n}(B)\in\cal B$ for all $B\in\cal B$ and $n\geq 1$. We do not require (and do not need) that $T^n(B)\in\cal B$ for $n\geq 1$. \\ \noindent{\sc 2.10 Lemma}. Let $(X,{\cal B})$ be a measurable space and $T:X\to X$ a transformation. Fix $n\geq 1$. Then the collection of sets $\{T^{-n}(B):\, B\in{\cal B}\}$ is a $\sigma$-algebra. It is denoted by $T^{-n}({\cal B})$. Also, the collection of sets $\{B\subset X:\, T^{-n}(B)\in{\cal B}\}$ is a $\sigma$-algebra.\\ {\em Proof}: This easily follows from 2.8. $\Box$\\ \noindent{\sc 2.11 Theorem}. Let $(X,{\cal B})$ be a measurable space and $\cal J$ a collection of subsets of $X$ that generates $\cal B$, i.e. ${\cal B}({\cal J})={\cal B}$. Then a transformation $T:X\to X$ is measurable iff $T^{-1}(A)\in\cal B$ for all $A\in\cal J$. 
\\ {\em Proof}: The collection of subsets ${\cal B}_1= \{B\subset X:\, T^{-1}(B)\in{\cal B}\}$ is a $\sigma$-algebra (by 2.10), and it is assumed that ${\cal J}\subset{\cal B}_1$. Hence, ${\cal B}\subset{\cal B}_1$. $\Box$\\ \noindent{\sc 2.12 Corollary}. If $X$ is a topological space and $\cal B$ its Borel $\sigma$-algebra, then $T:X\to X$ is measurable iff $T^{-1}(B)\in\cal B$ for every open set $B$. Note: every continuous transformation is measurable. \\ \noindent{\sc 2.13 Corollary}. If $X=(a,b)\subset\IR$ and $\cal B$ is the Borel $\sigma$-algebra of $X$, then $T:X\to X$ is measurable iff $T^{-1}(B)\in\cal B$ for every interval $B\subset X$. \\ \noindent{\sc 2.14 Examples}. We can now easily check that the circle rotation and doubling map are measurable. Indeed, the preimage of any interval is an interval for the circle rotation and a union of two intervals for the doubling map.\\ \noindent{\sc 2.15 Question 2.5 continued}. We now want to know ``how many'' points $x_0\in X$ hit $A$ at time $n$. Since usually the number of those points is infinite, then we translate this question into the language of measures. Suppose $\mu$ is a fixed probability measure on $X$. Then we want to compute $$ \mu(\{x\in X:\, T^n(x)\in A\})=\mu(T^{-n}(A)) $$ The value of $\mu(T^{-n}(A))$ is the ``fraction'' of points that hit $A$ at time $n$, or the ``chance'' that a randomly selected point $x$ hits $A$ at time $n$. \\ \noindent{\sc 2.16 Lemma}. Let $\mu$ be a probability measure on $(X,{\cal B})$ and $T:X\to X$ a measurable transformation. Then, for every $n\geq 1$, the function $\mu_n:{\cal B}\to\IR$ given by $$ \mu_n(B)=\mu(T^{-n}(B))\ \ \ \ \ \ \ \ \forall B\in{\cal B} $$ is a probability measure on $(X, {\cal B})$.\\ {\em Proof}: This easily follows from 2.8. $\Box$\\ \noindent Note: one probability measure $\mu$ determines another probability measure, $\mu_n$, for every $n\geq 1$.\\ \noindent{\sc 2.17 Definition}. Let $(X,{\cal B})$ be a measurable space. 
Denote by ${\cal M}={\cal M}(X)$ the set of all probability measures on $X$. Then a measurable map $T:X\to X$ induces a map $T: {\cal M} \to {\cal M}$. For every $\mu\in{\cal M}$ the measure $T\mu$ is defined by $$ (T\mu)(B)=\mu(T^{-1}(B))\ \ \ \ \ \ \ \ \ \forall B\in{\cal B} $$ \noindent{\sc 2.18 Remark}. In the notation of Lemma 2.16, we have $\mu_1=T\mu$, hence $\mu_n=T^n\mu$ for all $n\geq 1$. So, the iteration of $T$ on $X$ corresponds to the iteration of $T$ on $\cal M$. Note also that $T$ is a {\em linear} map on $\cal M$ in the sense that $T(p\mu_1+(1-p)\mu_2)=pT(\mu_1)+(1-p)T(\mu_2)$ for every $\mu_1,\mu_2\in {\cal M}$ and $0\leq p\leq 1$.\\ It would be very convenient to have $\mu_n=\mu$ for all $n\geq 1$ in Lemma~2.16, so that one measure $\mu$ would describe all the iterates of $T$. This requires $T\mu=\mu$.\\ \noindent{\sc 2.19 Definition}. A probability measure $\mu$ is said to be {\bf invariant under} $T$, or $T${\bf -invariant}, if $T\mu=\mu$. Equivalently, $\mu(T^{-1}(B))=\mu(B)$ for all $B\in{\cal B}$.\\ We also say that $T$ {\bf preserves} the measure $\mu$. A map $T:X\to X$ that preserves a measure $\mu$ is called a {\bf measure-preserving map}. This is our notion of a dynamical system.\\ \noindent{\sc 2.20 Theorem}. Let $(X,{\cal B})$ be a measurable space and $\cal J$ a collection of subsets of $X$ that is closed under finite intersections and generates $\cal B$, i.e. ${\cal B}({\cal J})={\cal B}$. Let $T:X\to X$ be a measurable transformation. Then a probability measure $\mu$ is $T$-invariant iff $\mu(T^{-1}(A))=\mu(A)$ for all $A\in\cal J$. \\ {\em Proof}: We need to show that the measures $T\mu$ and $\mu$ are equal. Since $(T\mu)(X)=\mu(T^{-1}(X))=\mu(X)$, this easily follows from 1.22. $\Box$\\ \noindent{\sc 2.21 Examples}. Let $m$ be the Lebesgue measure on the unit interval $X=[0,1)$. Then $m$ is invariant under the circle rotation and under the doubling map. Indeed, for any interval $A=(c,d)\subset X$ its preimage under the circle rotation is another interval with the same length. 
For the doubling map, $T^{-1}A$ is a union of two intervals, one is $(c/2,d/2)$ and the other $((c+1)/2, (d+1)/2)$. Their total length is $d-c$, which is $m(A)$. We are done. The invariance of the Lebesgue measure $m$ can be interpreted as follows: for any Borel set $B\subset X$ the chance that a randomly selected point in $X$ hits $B$ at time $n$ equals $m(B)$ (and this chance does not depend on $n$).\\ \noindent{\sc 2.22 Definition}. If $T(x)=x$, then $x$ is called a {\bf fixed point} for the map $T$. If $T^n(x)=x$ for some $n\geq 1$, then $x$ is called a {\bf periodic point} for the map $T$, and $n$ is its period. The smallest such $n$ is said to be the {\bf minimal period} of $x$. Note: if $x$ is a periodic point with a minimal period $n\geq 2$, then the map $T$ cyclically permutes $n$ points $x_0=x$, $x_1=T(x),\ldots, x_{n-1}=T^{n-1}(x)$. That is, $T(x_i)=x_{i+1}$ and $T(x_{n-1})=x_0$.\\ \noindent{\sc 2.23 Remark}. If $x$ is a fixed point for the map $T$, then the delta-measure $\delta_x$ is invariant under $T$. If $x$ is a periodic point with a minimal period $n\geq 2$, then the measure $(\delta_{x_0}+\delta_{x_1}+\cdots+\delta_{x_{n-1}})/n$ is $T$-invariant (we use the notation of 1.7(c) and 2.22). Check these two facts by direct inspection.\\ \noindent{\sc 2.24 Exercise}. A point $x$ is called an {\bf atom} for a measure $\mu$ if $\mu(\{x\})>0$. Show that if $x$ is an atom for a $T$-invariant measure, then $x$ is a periodic point. \\ \noindent{\sc 2.25 More examples}. Usually, a transformation $T:X\to X$ has many invariant measures. If $T:X\to X$ is the identity, i.e. $T(x)=x$ for all $x\in X$, then every probability measure on $X$ is invariant. For the doubling map, the delta measure $\delta_0$ concentrated at zero is invariant, since $T(0)=0$. The measure $\mu =0.5\,\delta_{1/3}+0.5\,\delta_{2/3}$ is also invariant (guess, why). 
From the physics point of view, though, the most interesting and important invariant measures are those which are absolutely continuous with respect to the Lebesgue measure. \\ \noindent{\sc 2.26 Exercises} (some are rather challenging): \begin{enumerate} \item[(a)] Let $X=\IN$ (the set of natural numbers) and $T:X\to X$ defined by $T(x)=x+1$. Show that $T$ has no invariant measures. \item[(b)] Let $X=\IR$ and $T:X\to X$ defined by $T(x)=x+a$ with a constant $a\neq 0$. Show that $T$ has no invariant measures\footnote{Definition 2.19 can be extended to nonprobability measures: a finite or infinite measure $\mu$ is $T$-invariant if $\mu(T^{-1}(B)) =\mu(B)$ for all $B\in\cal B$. Under this extension, the Lebesgue measure on $\IR$ is invariant. In this course, though, we only deal with probability invariant measures.}. \item[(c)] Let $X=(0,1)$ and $T:X\to X$ defined by $T(x)=x^2$. Show that $T$ has no invariant measures. \item[(d)] Let $X=[0,1]$ and $T:X\to X$ defined by $T(x)=x/2$ for $x>0$ and by $T(0)=1$. Show that $T$ has no invariant measures. \item[(e)] Let $X=[0,1]$ and $T:X\to X$ defined by $T(x)=x^2$. Find all $T$-invariant measures. \item[(f)] Let $X=\{1,2,\ldots,m\}$ be a finite set, and $T:X\to X$ a permutation (i.e., a bijection of $X$). Describe all $T$-invariant measures. \end{enumerate} \noindent{\sc 2.27 Example}. The doubling map can be slightly generalized as follows. For an integer $k\geq 2$ we define $T(x)=kx$ (mod 1) on the set $X=[0,1)$. This map has many properties similar to those of the doubling map. In particular, the Lebesgue measure $m$ is $T$-invariant. Consider, more specifically, the map $T$ with $k=10$. For $x\in X$ let $x=0.i_0i_1i_2\ldots$ be the infinite decimal fraction representing $x$. Then $10x=i_0.i_1i_2\ldots$, hence $T(x)=0.i_1i_2i_3\ldots$. It is just as easy to see that $T^n(x)=0.i_ni_{n+1}i_{n+2}\ldots$ for all $n\geq 1$. Why is this interesting? Consider the set $A_r=[r/10,(r+1)/10)$ for some $r=0,1,\ldots,9$. 
The inclusion $x\in A_r$ means that the decimal representation $x=0.i_0i_1i_2\ldots$ starts with $r$, i.e. $i_0=r$. Therefore, $T^n(x) \in A_r$ means that the $n$-th digit in the decimal representation of $x$ is $r$, i.e. $i_n=r$. Let $A_r(n)$ be the set of points $x\in[0,1)$ whose decimal representation has $r$ at the $n$-th place ($n\geq 0$). Note that $A_r(0)=A_r$ and $A_r(n)=T^{-n}(A_r)$ for all $n\geq 0$. Since the Lebesgue measure $m$ is $T$-invariant, we have $m(A_r(n))=m(T^{-n}(A_r))=m(A_r)=0.1$. This means that, for a randomly selected point $x\in X$, the chance that the $n$-th digit in its decimal representation is $r$ equals 0.1 (for any $r$ and any $n$). We will see more of this map later.\\ \noindent{\sc 2.28 The doubling map revisited}. The above discussion of the map $T(x)=10x$ (mod 1) applies to the doubling map, provided one uses the binary number system. In the binary system, every point $x\in [0,1)$ has an infinite representation $x=(0.i_0i_1i_2\ldots)_2$ where $i_n$, $n\geq 0$, are binary digits, i.e. zeroes and ones. Then $2x=(i_0.i_1i_2\ldots)_2$, hence $T(x)=(0.i_1i_2i_3\ldots)_2$. It is again easy to see that $T^n(x)=(0.i_ni_{n+1}i_{n+2}\ldots)_2$ for all $n\geq 1$. \\ \noindent{\sc 2.29 Remark}. We note how $T$ acts on the sequence of digits $i_0i_1i_2\ldots$ in both examples 2.27 and 2.28: the first (leftmost) digit is dropped and the rest of the sequence is moved (shifted) to the left, so that the second digit becomes the first, etc. We will see more of shift maps in Section~10. \newpage \section{More of Measure Theory} In this section, $X$ is a compact metric space (or, at least, a compact metrisable topological space) and $\cal B$ its Borel $\sigma$-algebra. \\ \noindent{\sc 3.1 Standard notation and facts}. The space of continuous functions $f:\, X\to\IR$ is denoted by $C(X)$. (Very rarely, we will need to consider complex-valued continuous functions $f:X\to\IC$, then we shall indicate necessary changes.) 
The space $C(X)$ is a vector space (usually, infinite-dimensional). It has a norm $||f||=\sup_x|f(x)|$ that makes it a metric space with distance between $f,g\in C(X)$ given by $||f-g||$. \\ Probability measures $\mu\in{\cal M}(X)$ can be identified with special maps $J_{\mu}:\, C(X)\to\IR$ defined by $$ J_{\mu}(f)=\int_X f\, d\mu $$ \\ \noindent{\sc 3.2 Fact}. For each probability measure $\mu\in{\cal M}(X)$ the map $J_{\mu}:C(X)\to\IR$ has three characteristic properties: \begin{enumerate} \item[(J1)] It is a linear and continuous map. \item[(J2)] It is positive, i.e. $J_{\mu}(f)\geq 0$ if $f(x)\geq 0\ \ \ \forall x\in X$. \item[(J3)] It preserves unity, i.e. $J_{\mu}(\I1)=1$, where $\I1(x)=1\ \ \ \forall x\in X$. \end{enumerate} {\em Proof} goes by a direct inspection.\\ \noindent{\sc 3.3 Fact}. If $J_{\mu_1}(f)=J_{\mu_2}(f)$ for all $f\in C(X)$, then $\mu_1=\mu_2$.\\ {\em Proof}. See Walters, pp. 147--148.\\ \noindent{\sc 3.4 Fact (Riesz Representation Theorem)}. If $J:\, C(X)\to\IR$ is a map with properties (J1)-(J3), then there is a measure $\mu\in{\cal M}(X)$ such that $J=J_{\mu}$.\\ {\em Proof} was given in Real Analysis.\\ The representation of measures by integrals of continuous functions allows us to define a very useful topology on $\cal M$, called the weak* topology.\\ \noindent{\sc 3.5 ``Definition''}. The {\bf weak* topology} on $\cal M$ is defined so that, as $n\to\infty$ $$ \mu_n\to\mu\ \ \ \ \ \Longleftrightarrow\ \ \ \ \ \int_Xf\, d\mu_n\to\int_Xf\, d\mu \ \ \ \forall f\in C(X) $$ The convergence of measures in the weak* topology is called the {\bf weak convergence}. As we see, it is equivalent to the convergence of integrals of continuous functions. This is not a formal definition (see one below), but it is what one remembers and uses in practice.\\ \noindent{\sc 3.6 Definition}. The weak* topology can be defined formally. 
For any $\mu_0\in{\cal M}(X)$, any finite collection of functions $f_1,\ldots,f_k \in C(X)$ and $\varepsilon>0$ the set $$ V_{\mu_0}(f_1,\ldots,f_k;\varepsilon) =\left\{\mu\in{\cal M}(X):\, \left |\int_Xf_i\, d\mu-\int_Xf_i\, d\mu_0\right | <\varepsilon,\ 1\leq i\leq k\right\} $$ is open in the weak* topology. These sets make a basis of the weak* topology. An exercise: check that 3.6 implies 3.5, indeed.\\ \noindent{\sc 3.8 Remark}. The weak* topology is metrisable. A metric on ${\cal M}(X)$ that gives the weak* topology can be defined as follows. Let $\{f_n\}_{n=1}^{\infty}$ be a countable dense subset\footnote{A countable dense subset of $C(X)$ exists whenever $X$ is a metrisable compact Hausdorff space.} of $C(X)$. Then for every $\mu,\nu\in {\cal M}(X)$ we set $$ D(\mu,\nu)=\sum_{n=1}^{\infty} \frac{|\int f_n\, d\mu-\int f_n\, d\nu|}{2^n\, ||f_n||} $$ Unfortunately, this metric depends on the choice of $\{f_n\}$, and there is no standard metric on ${\cal M}(X)$ that gives the weak* topology.\\ {\em Proof}. See Walters, pp. 148--149.\\ \noindent{\sc 3.9 Remark}. There is a standard metric on ${\cal M}(X)$, defined by $$ D_{\rm var}(\mu,\nu)=\,{\rm total}\ {\rm variation}\ {\rm of}\ \mu-\nu $$ but it does not give the weak* topology. It is, in a sense, {\em too strong}.\\ \noindent{\sc 3.10 Example}. Let $m$ be the Lebesgue measure on $X=[0,1]$. For $N\geq 1$, let $x_i=i/N$ for $1\leq i\leq N$. Consider the measure $$ \mu^{(N)}=(\delta_{x_1}+\cdots+\delta_{x_N})/N $$ This is a uniform measure on the finite set $\{x_i\}$, $1\leq i\leq N$. Each point $x_i$ is an atom for $\mu^{(N)}$. Note that for $f\in C(X)$ $$ \int_X f\, d\mu^{(N)} = \frac 1N (f(x_1)+\cdots +f(x_N)) $$ We know from Calculus I that, as $N\to\infty$, $$ \int_X f\, d\mu^{(N)} \to \int_X f\, dm\ \ \ \ \ \forall f\in C(X) $$ Hence $\mu^{(N)}\to m$, as $N\to\infty$, in the weak* topology.\\ \noindent{\sc 3.11 Remarks}. 
\\ (i) The map $X\to{\cal M}(X)$ given by $x\mapsto \delta_x$ is continuous in the weak* topology.\\ (ii) The convergence $\mu_n\to\mu$ in the weak* topology is equivalent to $$ \limsup_n \mu_n(F)\leq \mu(F) $$ for every closed set $F\subset X$.\\ (iii) The convergence $\mu_n\to\mu$ in the weak* topology is equivalent to $$ \liminf_n \mu_n(U)\geq \mu(U) $$ for every open set $U\subset X$.\\ (iv) The convergence $\mu_n\to\mu$ in the weak* topology is equivalent to $$ \lim_n \mu_n(A) = \mu(A) $$ for every set $A\subset X$ such that $\mu(\partial A)=0$.\\ {\em Proof}. See Walters, pp. 149--150 and references therein.\\ \noindent{\sc 3.12 Theorem (Alaoglu)}. The set ${\cal M}(X)$ is compact in the weak* topology. In particular, every sequence of measures $\mu_n$ has a (weakly) convergent subsequence.\\ {\em Proof}. See Walters, p. 150.\\ \noindent{\sc 3.13 Definition}. Let $N\geq 1$ and $\{x_i\}$, $1\leq i\leq N$, a finite collection of points in $X$ (not necessarily distinct). We call $$ \mu^{(N)}=(\delta_{x_1}+\cdots+\delta_{x_N})/N $$ the {\bf uniform atomic measure} supported on the points $x_1,\ldots,x_N$. We have seen in Example 3.10 that the Lebesgue measure $m$ on $X=[0,1]$ can be approximated by uniform atomic measures.\\ \noindent{\sc 3.14 Theorem}. Any measure $\mu\in{\cal M}(X)$ can be approximated by uniform atomic measures, i.e. for every $\mu\in{\cal M}(X)$ there is a sequence of such measures $\mu^{(N)}$ that converges to $\mu$ in the weak* topology. (Proof is left as an exercise.)\\ \noindent{\sc 3.15 A physical/philosophical essay about measures}. It would be helpful if we could visualize a measure. One is used to thinking of the measure $\mu(A)$ of a set $A$ as the ``size'' of $A$. This is suitable for measure theory, but not for dynamical systems. Why not? Because here a transformation $T$ acts on both $X$ and ${\cal M}(X)$, so one has to deal with a sequence of measures $\mu_n=T^n\mu$. 
Thus, $\mu_n(A)$ changes with $n$, while the set $A$ does not (so its ``size'' should not change either). There is a way to visualize measures that works for dynamical systems. Let $\mu$ be a probability measure and $\mu^{(N)}$ a uniform atomic measure approximating $\mu$, which is supported on some points $x_1,\ldots,x_N$. If $N$ is large enough, then for all practical purposes (and physical applications) the measures $\mu$ and $\mu^{(N)}$ are indistinguishable. We can say that the points $x_1,\ldots,x_N$ {\em represent} our measure $\mu$, i.e. we can ``visualize'' $\mu$ by looking at the set of points $x_1,\ldots,x_N$. Now, how do we visualize the measures $\mu_n=T^n\mu$? For each $n\geq 1$ the measure $\mu_n$ can be usually approximated by $\mu_n^{(N)}=T^n\mu^{(N)}$. The measure $\mu_n^{(N)}$ is another uniform atomic measure, which is supported on the points $\{T^n(x_i)\}$, $1\leq i\leq N$. Hence, we can ``visualize'' $\mu_n$ by looking at the set of points $\{T^n(x_i)\}$, the images of the original points $\{x_i\}$. Now $\mu_n^{(N)}(A)$ can change with $n$ depending on the balance of ``incoming'' and ``outgoing'' representative points that move in and out under $T$. \begin{figure}[h] \centering \epsfig{figure=ds-2.eps}\caption{Three points leave the set $A$ and three new points come in.} \end{figure} Now, what does a $T$-invariant measure $\mu$ ``look like''? If $\mu$ is invariant, then $T\mu=\mu$, and then usually $T\mu^{(N)}\approx \mu^{(N)}$. This means that (loosely speaking) for a generic measurable set $A$ we have $\#\{i:\, x_i\in A\}\approx \#\{i:\, T(x_i)\in A\}$. So, the number of points that leave the set $A$ (go out) is about the same as the number of points that enter the set $A$ (come in), see Fig.~2. Suppose you first look at the set of points $\{x_i\}$ representing the measure $\mu$ (this is how you ``visualize'' $\mu$). Now you apply $T$ (turn the ``switch'' on), so that each point $x_i$ instantaneously jumps to $T(x_i)$. 
You then look again at the newly obtained set of points $\{T(x_i)\}$. If $\mu$ is invariant, then this set should look exactly the same as the old set $\{x_i\}$, i.e. you should not notice any difference in the general appearance of your set of points {\em before} you apply $T$ and {\em after} you apply $T$ (even though each individual point ``jumps'' somewhere). The preservation of that ``general picture'' (or general structure) of the set of representative points by $T$ is exactly how physicists ``see'' the invariance of the measure $\mu$. \newpage \section{Measure-preserving Transformations} Here we begin a systematic study of measurable maps and their invariant measures. In this section, $(X,{\cal B})$ is a measurable space and $T:\, X\to X$ is a measurable transformation. We denote by ${\cal M}={\cal M}(X)$ the set of all probability measures on $X$ and by ${\cal M}_{\rm inv}= {\cal M}_{\rm inv}(X,T)$ the set of all $T$-invariant probability measures on $X$.\\ \noindent{\sc 4.1 Remark}. ${\cal M}_{\rm inv}\subset {\cal M}$, and ${\cal M}_{\rm inv}$ is a convex set (i.e., if $\mu_1,\mu_2\in {\cal M}_{\rm inv}$ then $p\mu_1+(1-p)\mu_2\in {\cal M}_{\rm inv}$ for all $0\leq p\leq 1$). The set ${\cal M}_{\rm inv}$ may be empty, see Exercises 2.26 (a), (b), (c), and (d).\\ \noindent{\sc 4.2 Theorem (Bogolyubov-Krylov)}. If $X$ is a compact metrisable topological space and $T:\, X\to X$ a continuous transformation, then ${\cal M}_{\rm inv} \neq\emptyset$, i.e. $T$ has at least one invariant measure. \\ {\em Proof}. See Walters, pp. 151--152, and Pollicott, pp. 8--9.\\ \noindent{\sc 4.3 Remarks}. Several interesting facts are involved in the proof of Theorem~4.2: \begin{enumerate} \item[(a)] The map $T:{\cal M}\to {\cal M}$ is continuous in the weak* topology. \item[(b)] For any measure $\mu\in{\cal M}$ every accumulation point of the sequence $(\mu+T\mu+\cdots+T^{n-1}\mu)/n$ is a $T$-invariant measure. 
This is a very helpful method for constructing invariant measures, even in a broader context than that of Theorem~4.2. \item[(c)] A general fixed point theorem by Schauder (or Tychonoff--Schauder) says that a continuous transformation of a compact convex set always has a fixed point. This provides an alternative proof of Theorem~4.2 -- based on the continuity of $T:{\cal M}\to {\cal M}$ and the Schauder theorem. \end{enumerate} \noindent{\sc 4.4 Remarks}. The assumption on the compactness of $X$ in Theorem~4.2 cannot be dropped, see Examples~2.26(bc). The assumption on the continuity of $T$ in Theorem~4.2 cannot be dropped either, see Examples~2.26(d). \\ \noindent We now return to general measurable maps on measurable spaces. Our next step is to learn how to use functions $f:X\to\IR$ (or, more generally, $f:X\to\IC$) in the study of invariant measures. \\ \noindent{\sc 4.5 Notation}. We denote by $L^0(X)$ the set of all measurable functions $f:X\to\IR$. Given a measure $\mu\in {\cal M}$, for any $p>0$ we denote $$ L^p_{\mu}(X)=\left \{f\in L^0(X):\, \int_X |f|^p\, d\mu < \infty\right \} $$ This is a vector space, and for $p\geq 1$ it carries the norm $||f||_p=\left (\int_X|f|^p\, d\mu\right )^{1/p}$. In addition, the space $L^2_{\mu}(X)$ has a scalar product\footnote{If $f$ and $g$ are complex-valued functions, then we define $\langle f,g\rangle = \int f\bar{g}\, d\mu$.} defined by $$ \langle f,g\rangle = \int_X fg\, d\mu $$ We denote by $L^{\infty}(X)$ the set of all bounded functions on $X$. It is a linear space with norm $||f||_{\infty}=\sup_x |f(x)|$.\\ \noindent{\sc 4.6 Lemma (Characterizing invariant measures)}. A measure $\mu\in{\cal M}$ is $T$-invariant if and only if $$ \int_X f\circ T\, d\mu = \int_X f\, d\mu \ \ \ \ \ \ \forall f\in L^0(X) $$ (if one integral is infinite or does not exist, then the other has the same property).\\ {\em Proof}. See Walters, p.\ 25, and Pollicott, p.\ 6.\\ The lemma 4.6 is a particular case of a more general statement:\\ \noindent{\sc 4.7 Lemma}. 
For any measure $\mu\in{\cal M}$ its image $\mu_1=T\mu$ is characterized by $$ \int_X f\circ T\, d\mu = \int_X f\, d\mu_1 \ \ \ \ \ \ \forall f\in L^0(X) $$ (again, if one integral is infinite or does not exist, then the other has the same property).\\ {\em Proof}. See Walters, p.\ 25. Also, Pollicott's argument on p.\ 6 applies with obvious changes.\\ \noindent {\sc 4.8 Remark}. The above lemma is, in fact, a generalized change of variable formula. If $x\in X$ and $y=T(x)$, then $\int f(y)\, d\mu_1(y)=\int f(Tx)\, d\mu(x)$. \\ \noindent {\sc 4.9 Remark}. If $X$ is a compact metrisable topological space and $T:\, X\to X$ a continuous map (as in Section~3), then Lemma~4.6 can be slightly improved: $\mu\in{\cal M}$ is $T$-invariant if and only if $\int f\circ T\, d\mu = \int f\, d\mu$ for all $f\in C(X)$.\\ {\em Proof}. See Walters, p. 151.\\ \noindent{\sc 4.10 Definition}. A measurable transformation $T:\, X\to X$ induces a map $U_T:\, L^0(X)\to L^0(X)$ defined by $$ (U_Tf)(x) := (f\circ T)(x)=f(T(x)) $$ \noindent{\sc 4.11 Simple properties}. \begin{enumerate} \item[(a)] For every $n\geq 1$, we have $U_{T^n}=(U_T)^n$. \item[(b)] The map $U_T$ is linear. \item[(c)] The map $U_T$ takes $L^{\infty}(X)$ into itself and does not increase the norm $||\cdot ||_{\infty}$, i.e. $||U_Tf||_{\infty}\leq ||f||_{\infty}$. Moreover, if $T$ is {\em onto}, then $||U_Tf||_{\infty} = ||f||_{\infty}$. \item[(d)] For any $T$-invariant measure $\mu$, the map $U_T$ takes $L^p_{\mu}(X)$ into itself and preserves the norm $||\cdot ||_p$, i.e. $||U_Tf||_p=||f||_p$. It also preserves the scalar product in $L^2_{\mu}(X)$, i.e.\ $ \langle U_Tf,U_Tg\rangle = \langle f,g\rangle$, i.e. $U_T$ is an isometry (and a {\bf unitary} operator whenever $T$ is invertible with a measurable inverse). This explains the notation $U_T$. \end{enumerate} \noindent{\sc 4.12 Remark}. 
Given a function $f:X\to\IR$ and a point $x\in X$, the sequence $$ (U_T^nf)(x)=f(T^nx),\ \ \ \ \ \ n\geq 0 $$ consists of the values of the function $f$ at times $n\geq 0$ along the orbit of the point $x$. If $f$ is a physical parameter (such as temperature), then this sequence consists of its measurements taken at successive time moments. It is also called the {\bf time series}. That is what physicists (and other scientists) observe in experiments, so the behavior of the time series is just as important as that of the orbit $\{T^n(x)\}$. \newpage \section{More Examples} \noindent{\sc 5.1 Baker's transformation}. Let $$ X=\{(x,y):\ 0\leq x<1,\ 0\leq y<1\} $$ be a unit square on the $xy$ plane. The baker's map $T:\, X\to X$ is defined by $$ T(x,y)=\left \{ \begin{array}{ll} (2x,y/2) & {\rm if}\ \ 0\leq x<1/2 \\ (2x-1,(y+1)/2) & {\rm if}\ \ 1/2\leq x<1 \end{array}\right . $$ \begin{figure}[h] \centering \epsfig{figure=ds-3.eps}\caption{The baker's map constructed in two steps.} \end{figure} \noindent The action of $T$ is shown on Fig.~3. At step 1, the square $X$ is transformed into a rectangle by the linear map $(x,y)\mapsto (2x,y/2)$ (so $X$ is stretched in the $x$ direction and squeezed in the $y$ direction). At step 2, the rectangle is cut in half and its right half is placed atop its left half. This process resembles the way a baker kneads dough, hence the name {\bf baker's map}. \\ \noindent{\sc 5.2 Remarks}. The baker's transformation is a bijection of the unit square $X$. Its inverse, $T^{-1}:X\to X$, satisfies similar equations: $$ T^{-1}(x,y)=\left \{ \begin{array}{ll} (x/2,2y) & {\rm if}\ \ 0\leq y<1/2 \\ ((x+1)/2,2y-1) & {\rm if}\ \ 1/2\leq y<1 \end{array}\right . $$ Note that $T$ is discontinuous on the line $x=1/2$ but continuous elsewhere. Such maps are said to be {\em piecewise continuous}. Moreover, $T$ is {\em piecewise smooth} (and even {\em piecewise linear}). 
The map $T^{-1}$ has the same properties, except it is discontinuous on another line, $y=1/2$.\\ \noindent{\sc 5.3 Proposition}. Let $\cal B$ be the Borel $\sigma$-algebra and $m$ the Lebesgue measure on the unit square\footnote{Let us adopt a convention: whenever $X\subset\IR^d$, $d\geq 2$, then $\cal B$ will denote the Borel $\sigma$-algebra and $m$ the Lebesgue measure on $X$.} $X$. Then the baker's map $T:X\to X$ is measurable and preserves $m$.\\ {\em Proof}. By 2.11 and 2.20, it is enough to find a collection of subsets ${\cal J}\subset{\cal B}$ such that ${\cal B}({\cal J})={\cal B}$ and $T^{-1}(A)\in{\cal B}$ and $m(T^{-1}(A))=m(A)$ for all $A\in{\cal J}$. Let $\cal J$ consist of subrectangles $A\subset X$ that do not intersect the line $y=1/2$ (in this case $T^{-1}(A)$ will be a subrectangle as well). This will do it. $\Box$\\ \noindent{\sc 5.4 Remark}. In the proof of Proposition 5.3, it is enough to restrict $\cal J$ to rectangles of diameter $<\varepsilon$, where $\varepsilon$ is an arbitrarily small positive number. Taking the limit as $\varepsilon\to 0$, the condition $m(T^{-1}(A))=m(A)$ will reduce to \be |\, {\rm det}\, DT|=1 \label{DT=1} \ee where $DT$ is the matrix of partial derivatives of $T$. In our case $$ DT=\left ( \begin{array}{rr} 2 & 0 \\ 0 & 1/2 \end{array}\right ) $$ Hence, $|\,$det$\, DT|=1$, indeed. \\ \noindent{\sc 5.5 ``Digital'' representation of the baker's map}. Let $(x,y)\in X$ be an arbitrary point. Let $x=0.i_0i_1i_2\ldots$ be an infinite representation of $x$ in the binary system, cf.\ 2.28. Let $y=0.j_1j_2j_3\ldots$ be the binary representation of $y$. Then the image $(x',y')=T(x,y)$ has representation $$ x'=0.i_1i_2i_3\ldots \ \ \ \ \ \ {\rm and}\ \ \ \ \ \ y'=0.i_0j_1j_2\ldots $$ Note that it is remarkably simple: the first digit of $x$ becomes the first digit of $y$, and the rest of the digits shift accordingly. This suggests the following trick. 
Let us reverse the sequence $j_1j_2j_3\ldots$ and append it to the sequence $i_0i_1i_2\ldots$ on the left: \be \ldots j_3j_2j_1i_0i_1i_2\ldots \label{iii} \ee thus obtaining {\em one} sequence of binary digits (0's and 1's), which is infinite in both directions (we call that a {\bf double infinite} sequence). This sequence represents a pair of real numbers $x$ and $y$, i.e. a point in the square $X$. For convenience, let us denote $\omega_k=i_k$ for $k=0,1,2,\ldots$ and $\omega_{-k}=j_k$ for $k=1,2,\ldots$. Then the double infinite sequence (\ref{iii}) will look like \be \uom= (\ldots \omega_{-3}\omega_{-2}\omega_{-1} \omega_0\omega_1\omega_2\ldots) \label{ooo} \ee The point $\omega_0$ is the first digit of $x$ here. How does $T$ act on double infinite sequences? If a point $(x,y)\in X$ is represented by a sequence (\ref{ooo}), then its image $(x',y')=T(x,y)$ is represented by another sequence $$ \uom'= (\ldots \omega_{-3}'\omega_{-2}'\omega_{-1}' \omega_0'\omega_1'\omega_2'\ldots) $$ that is obtained by the rule $\omega_k'=\omega_{k+1}$ for all $k\in\ZZ$. Hence, $T$ corresponds to the {\bf left shift} on representing sequences. \\ \noindent{\sc 5.6 Exercises}. Let $X$ be a unit square and $T:X\to X$ a diffeomorphism (a bijection of $X$ such that both $T$ and $T^{-1}$ are smooth everywhere). \begin{itemize} \item[(a)] Show that if $|\,{\rm det}\, DT|\equiv 1$, then $T$ preserves the Lebesgue measure $m$. \item[(b)] Let $\mu$ be a probability measure on $X$ with density $f(x,y)$, cf.\ 1.7 (b), which means that for any Borel set $A\subset X$ $$ \mu(A)=\int_A f(x,y)\, dx\, dy = \int_A f\, dm $$ Then the measure $\mu_1=T\mu$ has density $f_1(x,y)$ given by $$ f_1(x,y)= \frac{f(x_1,y_1)}{|\,{\rm det}\, DT(x_1,y_1)|} $$ where $(x_1,y_1)=T^{-1}(x,y)$. 
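\item[(c)] A probability measure $\mu$ on $X$ with density $f(x,y)$ is $T$-invariant iff \be f(x,y)= \frac{f(x_1,y_1)}{|\,{\rm det}\, DT(x_1,y_1)|} \label{ff2} \ee for $m$-almost all $(x,y)\in X$, here again $(x_1,y_1)=T^{-1}(x,y)$. Precisely, the set of points $(x,y)\in X$ where (\ref{ff2}) fails must have zero Lebesgue measure. \end{itemize} \noindent{\sc 5.7 Remark}. So, we have a criterion of the invariance of the Lebesgue measure that is based on the identity (\ref{DT=1}). It can be modified and applied to the doubling map of the unit interval $T(x)=2x$ (mod 1), cf.\ 2.2. Let $x\in X=[0,1)$ and $T^{-1}(x)=\{y_1,y_2\}$ be the full preimage of $x$ (say, $y_1=x/2$ and $y_2=(x+1)/2$). Then the $T$-invariance of the Lebesgue measure is ``equivalent'' to $$ \sum_{i=1}^2 \frac{1}{|T'(y_i)|}=1 $$ This is obviously true since $T'(y)\equiv 2$.\\
% NOTE(review): in 5.8--5.9 the text between "0=a_0<" and ">1/2" was lost in the source; items (a)-(c) and the tent map formula are reconstructed by analogy with 5.6 and 5.7 -- verify against the original notes.
\noindent{\sc 5.8 Exercises}. Let $X=[0,1]$ be the unit interval (which may be open, closed or semiopen). Let $T: X\to X$ be a {\bf piecewise monotonic map}, i.e.\ there are points $0=a_0<a_1<\cdots<a_k=1$ such that $T$ is strictly monotonic and differentiable on each interval $(a_{i-1},a_i)$, $1\leq i\leq k$. \begin{itemize} \item[(a)] Show that the Lebesgue measure $m$ is $T$-invariant iff $$ \sum_{y\in T^{-1}(x)} \frac{1}{|T'(y)|}=1 $$ for $m$-almost all $x\in X$. \item[(b)] Let $\mu$ be a probability measure on $X$ with density $f(x)$. Show that the measure $\mu_1=T\mu$ has density $f_1(x)$ given by $$ f_1(x)=\sum_{y\in T^{-1}(x)} \frac{f(y)}{|T'(y)|} $$ \item[(c)] Show that a probability measure $\mu$ on $X$ with density $f(x)$ is $T$-invariant iff \be f(x)=\sum_{y\in T^{-1}(x)} \frac{f(y)}{|T'(y)|} \label{ff1} \ee for $m$-almost all $x\in X$. \end{itemize} \noindent{\sc 5.9 Example: the tent map}. Let $X=[0,1]$ and $T:X\to X$ be defined by $$ T(x)=\left \{ \begin{array}{ll} 2x & {\rm if}\ \ x\leq 1/2 \\ 2-2x & {\rm if}\ \ x>1/2\end{array}\right . $$ This is the {\bf tent map}, see Fig.~4a. Note that $T$ is continuous and its graph is a ``tent'' with a sharp tip. Since $T$ is a ``two-to-one'' map and $|T'(x)|\equiv 2$, it preserves the Lebesgue measure by 5.8(a).\\ \begin{figure}[h] \centering \epsfig{figure=ds-4.eps}\caption{The tent map (a), the quadratic map (b) and the Gauss map (c).} \end{figure} \noindent{\sc 5.10 Example: a quadratic map}. Let $X=[0,1]$ and $T:X\to X$ be defined by $$ T(x)=4x(1-x) $$ Note that $T$ is continuous and even smooth, and its graph is a ``tent'' with a curved top, see Fig.~4b. Note that $T$ is a ``two-to-one'' map and $|T'(x)|=|4-8x|$. It does not preserve the Lebesgue measure by 5.8(a).\\ \noindent{\sc 5.11 Exercise}. 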
Show that the quadratic map in 5.10 preserves the measure $\mu$ with density $$ f(x)=\frac{1}{\pi\,\sqrt{x(1-x)}} $$ Here $1/\pi$ is just the normalizing factor introduced to make $\mu$ a probability measure.\\ \noindent{\sc 5.12 Example: the Gauss map}. Let $X=[0,1]$ and $T:X\to X$ be defined by $T(x)=1/x$ (mod 1) for $x\neq 0$ and $T(0)=0$. That is, $T(x)$ is the fractional part of $1/x$. The graph of this function has infinitely many branches, see Fig.~4c. Its discontinuity points are $x=1/n$, $n\in\IN$, and $x=0$. Every point $x<1$ has infinitely many preimages.\\ \noindent{\sc 5.13 Exercise}. Show that the Gauss map in 5.12 preserves the measure $\mu$ with density $$ f(x)=\frac{1}{\ln 2}\cdot\frac{1}{1+x} $$ Here $1/\ln 2$ is just the normalizing factor introduced to make $\mu$ a probability measure. This density was found by C.~F.~Gauss in the nineteenth century.\\ \noindent{\sc 5.14 Remark}. The density of a $T$-invariant measure is a solution of the functional equation (\ref{ff1}). Generally, it is very hard (if at all possible) to solve it. The solutions given in Exercises~5.11 and 5.13 for two particular maps have been found basically ``by accident'' or ``by trial and error''. There are no general algorithms to solve functional equations.\\ \noindent{\sc 5.15 Definition}. A map $T:I\to I$ of an interval $I\subset\IR$ is said to have an {\bf absolutely continuous invariant measure} ({\bf a.c.i.m.}\ for short) if there is a $T$-invariant measure on $I$ with a density.\\ We have seen that the doubling map, the tent map, the quadratic map, and the Gauss map all have a.c.i.m.\\ \noindent{\sc 5.16 Remark}. The existence or uniqueness of an a.c.i.m.\ is not guaranteed. 
For all of the above interval maps the a.c.i.m.\ is indeed unique, and we will prove that in Section~8.\\ So far, we have seen maps of various kinds: some were one-to-one (e.g., circle rotations and the baker's transformation) and some others were two-to-one (the doubling map and the tent map) or even infinitely-many-to-one (the Gauss map). If a map $T:X\to X$ is a bijection, one can use $T^{-1}$ as well.\\ \noindent{\sc 5.17 Definition}. A bijective map $T:X\to X$ is called an {\bf automorphism} if both $T$ and $T^{-1}$ are measurable maps. (Examples: circle rotations and the baker's map.) On the contrary, if $T$ is not an automorphism, then $T$ is called an {\bf endomorphism}.\\ \noindent{\sc 5.18 Exercise}. If an automorphism $T$ preserves a measure $\mu$, then its inverse $T^{-1}$ also preserves $\mu$. \newpage \section{Recurrence} Here we turn back to the general properties of measure-preserving transformations. In this section, $T:X\to X$ always denotes a measurable map with a $T$-invariant measure $\mu$. \\ \noindent{\sc 6.1 Definition}. Let $A\subset X$ be a measurable set and $x\in A$. Denote by $$ \tau_A(x)=\min\{n\in\IN:\ T^n(x)\in A\} $$ the {\bf first return time} when the point $x$ comes back to the set $A$. If such a time does not exist (i.e., if $x$ never returns to $A$) then we set $\tau_A(x)=\infty$. \\ \noindent{\sc 6.2 Theorem (Poincar\'{e} Recurrence Theorem)}. If $\mu(A)>0$, then almost every point $x\in A$ does return to $A$, i.e. $$ \mu(\{x\in A:\ \tau_A(x)=\infty\})=0 $$ {\em Proof}. See Walters, p. 26 or Pollicott, p. 9. \\ \noindent{\sc 6.3 Remark}. Poincar\'{e} Theorem 6.2 is false if the measure $\mu$ is infinite. For example, let $X=\IR$, $\mu$ be the Lebesgue measure and $T(x)=x+1$. Then the set $A=(0,1)$ has no returning points at all.\\
% NOTE(review): the text between "a sequence $0<" and ">0$." (end of 6.4, all of 6.5, start of 6.6) was lost in the source; the wording below is reconstructed from the way $\tilde{A}$ and 6.4 are used afterwards -- verify against the original notes.
\noindent{\sc 6.4 Corollary}. If $\mu(A)>0$, then almost every point $x\in A$ returns to $A$ infinitely many times, i.e. for almost every $x\in A$ there is a sequence $0<n_1<n_2<\cdots$ of natural numbers such that $T^{n_i}(x)\in A$ for each $i$.\\ \noindent{\sc 6.5 Notation}. Denote by $\tilde{A}$ the set of points $x\in A$ that return to $A$ infinitely many times.\\ \noindent{\sc 6.6 Definition}. Let $A\subset X$ and $\mu(A)>0$. 
The map $T_A:A\to A$ defined by $$ T_A(x)=T^{\tau_A(x)}(x) $$ is called the {\bf first return map} (or {\bf Poincar\'{e} map}) on $A$. It is actually defined on the set $\{x\in A:\ \tau_A(x)<\infty\}$, but that set coincides with $A$ up to a set of measure zero. It is more convenient to restrict $T_A$ to the set $\tilde{A}$. Note that $T_A(\tilde{A})\subset\tilde{A}$, i.e. $\tilde{A}$ is invariant under $T_A$. We now can consider $T_A:\tilde{A}\to\tilde{A}$ as a new transformation, induced by $T$ and $A$.\\ Note that $\mu(\tilde{A})=\mu(A)$ and $\mu(A\Delta \tilde{A})=0$ by 6.4, so one commonly identifies $A$ with $\tilde{A}$ by neglecting a set of zero measure. Still, for the sake of clarity we will keep working with $\tilde{A}$.\\ \noindent{\sc 6.7 Definition}. Let ${\cal B}_{\tilde{A}}$ be the $\sigma$-algebra induced on $\tilde{A}$, i.e. $$ {\cal B}_{\tilde{A}}=\{B\in{\cal B}:\ B\subset\tilde{A}\} $$ Define a probability measure $\mu_{\tilde{A}}$ on $(\tilde{A},{\cal B}_{\tilde{A}})$ by $$ \mu_{\tilde{A}}(B)=\mu(B)/\mu(\tilde{A}) \ \ \ \ \ {\rm for}\ {\rm all}\ \ B\in {\cal B}_{\tilde{A}} $$ The measure $\mu_{\tilde{A}}$ is called the {\bf conditional measure}.\\ \noindent{\sc 6.8 Theorem}. Let $A\subset X$ and $\mu(A)>0$. Then the first return map $T_A:\tilde{A}\to \tilde{A}$ preserves the conditional measure $\mu_{\tilde{A}}$.\\ {\em Proof}. Let $B\in{\cal B}_{\tilde{A}}$. Define a sequence of sets $B_n\subset \tilde{A}$ by $$ B_n=\{x\in \tilde{A}:\ \tau_A(x)=n\ \&\ T^n(x)\in B\} $$ Then $T_A^{-1}(B)=\cup_{n\geq 1} B_n$. Next, the sets $B_n$ are pairwise disjoint, because $\tau_A$ takes different values on different $B_n$'s. Lastly, we need to show that $\mu(B) = \sum_n\mu(B_n)$. For each $n\geq 1$, let $$ C_n=\{x\in T^{-n}(B):\ T^ix\notin A\ \ \forall i=0,1,\ldots,n-1\} $$ Verify by direct inspection that $T^{-1}(C_n)=C_{n+1}\cup B_{n+1}$ and $C_{n+1}\cap B_{n+1}=\emptyset$. 
Therefore, $$ \mu(C_n)=\mu(C_{n+1})+\mu(B_{n+1}) $$ Also, $T^{-1}(B)=C_1\cup B_1$ and $C_1\cap B_1=\emptyset$, hence $\mu(B)=\mu(C_1)+\mu(B_1)$. Adding all these equations for measures together proves the theorem. Right? Not quite. We also need to show that $\lim_{n\to\infty}\mu(C_n)=0$. Do this as an exercise. Hint: show that the sets $C_n$ are pairwise disjoint. $\Box$ \\ \noindent{\sc 6.9 Exercise}. Let $f:X\to\IR^+$ be a {\em positive} measurable function, i.e. $f(x)>0$ for all (or almost all) $x\in X$. Show that $$ \sum_{n=0}^{\infty} f(T^n(x))=\infty $$ for almost every point $x\in X$. Find an example of a strictly positive function $f>0$ so that the above series converges for some points $x\in X$. (Hint: you can use the map 2.26e).\\ The following construction is in a sense opposite to that of the first return map. Before, we shrank the space $X$ to a smaller set $A$. Now we will construct a ``larger'' space to replace $X$. \\ \begin{figure}[h] \centering \epsfig{figure=ds-5.eps}\caption{The tower map $S$. A point $x$ with $\tau(x)=4$ is shown.} \end{figure} \noindent{\sc 6.10 Definition}. Let $T:X\to X$ be a transformation preserving a measure $\mu$ and $\tau:X\to\IN$ a measurable positive integral-valued function on $X$ such that $$ D=\int_X\tau\, d\mu<\infty $$ Let $$ Y=\{(x,k):\ x\in X,\ k=1,2,\ldots,\tau(x)\} $$ This is called a {\bf tower}. It is naturally partitioned into levels (``floors'') $Y=\cup_{n\geq 1}Y_n$ where $$ Y_n=\{(x,k)\in Y:\ k=n\} $$ Let $\varphi:Y\to X$ be a natural projection $\varphi(x,n)=x$. Define a map $S:Y\to Y$ by $$ S(x,k)=\left \{ \begin{array}{cc} (x,k+1) & {\rm if}\ \ k<\tau(x) \\ (T(x),1) & {\rm if}\ \ k=\tau(x) \end{array} \right . $$ Note that $S$ moves each point straight up the tower until it reaches the top level (``the ceiling''), then $S$ takes it down to the bottom level $Y_1$ and at that time applies the ``old'' map $T$, see Fig.~5. The function $\tau(x)$ is called the {\bf ceiling function}. 
Define a measure $\mu'$ on each level $Y_n$, $n\geq 1$, by $$ \mu'(B)=\mu(\varphi(B))\ \ \ \ B\subset Y_n $$ (this also defines a $\sigma$-algebra on $Y_n$). Then we obtain a measure $\mu'$ on $Y$. It is finite with $\mu'(Y)=D$, so the measure $\tilde{\mu}=\mu'/D$ is a probability measure on $Y$. The map $S$ preserves the measure $\tilde{\mu}$, which can be verified directly. \newpage \section{Ergodicity} \noindent{\sc 7.1 Definition (not a good one!)}. A set $B\subset X$ is said to be $T$-invariant if $T(B)\subset B$.\\ \noindent{\sc 7.2 Remark}. The above definition is standard in some mathematical courses, but not so convenient for dynamical systems. First of all, $T(B)$ may not be measurable, even if $B$ is. Second, when $B$ is invariant, then $B^c=X\setminus B$ may not be. These considerations motivate the following:\\ \noindent{\sc 7.3 Definition}. A measurable set $B\subset X$ is {\bf (fully) $T$-invariant} if $T^{-1}(B)=B$. \\ Note that now when $B$ is invariant, then so is $B^c$, i.e.\ $T^{-1}(B^c)=B^c$. The following exercise shows, though, that Definition 7.1 is essentially equivalent to 7.3:\\ \noindent{\sc 7.4 Exercise}. Let $T(B)\subset B$ for a set $B\in{\cal B}$. Consider the ``forward limit'' $B_+=\cap_{n\geq 0}T^{n}(B)$ and the ``backward limit'' $B_-=\cup_{n\geq 0}T^{-n}(B)$. \begin{itemize} \item[(a)] Show that the set $B_-$ is measurable and (fully) invariant, i.e. $T^{-1}(B_-)=B_-$. Is $B_+$ also fully invariant? (Prove or give a counterexample.) \item[(b)] Show that if $\mu$ is a $T$-invariant measure, then $\mu(B\Delta B_-)=0$. Also, if $T^n(B)\in{\cal B}$ for all $n\geq 1$, then prove $\mu(B\Delta B_+)=0$. \end{itemize} This shows that for any invariant set $B$ in the sense of 7.1 there is a fully invariant set $B_-$ that coincides with $B$ up to a null set. \\ In dynamical systems, we work with measures. Sets of zero measure are treated as negligible and often ignored.\\ \noindent{\sc 7.5 Definitions}. 
Let $\mu\in{\cal M}$ be a probability measure. We call $A$ a {\bf null set} if $\mu(A)=0$. We say that $A$ and $B$ {\bf coincide (mod 0)} if $\mu(A\Delta B)=0$. In this case we write $A=B$ (mod 0).\\ \noindent{\sc 7.6 Remark}. The relation $A=B$ (mod 0) is an equivalence relation. In particular, to check that $A=B$ (mod 0) and $B=C$ (mod 0) implies $A=C$ (mod 0), one can use a simple formula $A\Delta C\subset (A\Delta B)\cup (B\Delta C)$.\\ \noindent{\sc 7.7 Definition}. Let $\mu$ be an invariant measure. We say that a set $B\subset X$ is {\bf invariant (mod 0)} if $B=T^{-1}(B)$ (mod 0), i.e. $\mu(B\Delta T^{-1}B)=0$.\\ Note: in this case $B=T^{-n}B$ (mod 0) for all $n\geq 1$. This easily follows from 7.6.\\ \noindent{\sc 7.8 Definition}. Let $\mu$ be a probability measure. If $f,g:X\to\IR$ are two measurable functions, then we say that $f$ and $g$ are $\mu${\bf -equivalent}, and write $f=g$ (mod 0), if $\mu\{x:\ f(x)\neq g(x)\}=0$.\\ \noindent{\sc 7.9 Exercise}. Let $B\in{\cal B}$. Consider the set $$ B_{\infty}=\cap_{n=0}^{\infty}\cup_{m=n}^{\infty}T^{-m}(B) $$ consisting of points whose orbits visit $B$ infinitely many times. \begin{itemize} \item[(a)] Show that the set $B_{\infty}$ is measurable and (fully) invariant, i.e. $T^{-1}(B_{\infty})=B_{\infty}$. \item[(b)] Let $\mu$ be a $T$-invariant measure. Suppose that $B=T^{-1}(B)$ (mod 0), i.e. $B$ is invariant (mod 0). Prove that $B=B_{\infty}$ (mod 0). \end{itemize} This shows that for any (mod 0) invariant set $B$ there is a fully invariant set $B_{\infty}$ that coincides (mod 0) with $B$. \\ \noindent{\sc 7.10 Examples}. \begin{itemize} \item[(a)] If $X$ is a finite set and $T:X\to X$ is a permutation (see 2.26f), then every set whose elements form a cycle is fully invariant. \item[(b)] If $X=[0,1]$ and $T(x)=x^2$, see 2.26e, then the sets $B_1=\{0\}$, $B_2=(0,1)$ and $B_3=\{1\}$ are fully invariant. 
\end{itemize} \begin{figure}[h] \centering \epsfig{figure=ds-6.eps}\caption{The decomposition of $X$ into two invariant subsets.} \end{figure} \noindent{\sc 7.11 Remark}. Let $B\subset X$ be a (fully) invariant set. Then $T(B)\subset B$ and $T(B^c)\subset B^c$. So, the orbits that originate in $B$ never visit $B^c$ and the orbits originating in $B^c$ never visit $B$. The space $X$ naturally decomposes into two ``noninteracting'' parts: $X_1=B$ and $X_2=B^c$. We can restrict $T$ to $X_1$ and $X_2$ and consider two measurable maps $T_i=T:X_i\to X_i$ with $i=1,2$, separately. Whenever $\mu_1$ is a $T_1$-invariant measure on $X_1$ and $\mu_2$ is a $T_2$-invariant measure on $X_2$, then any weighted sum $p\mu_1+ (1-p)\mu_2$ for $0\leq p\leq 1$ will be a $T$-invariant measure on $X$. Also, any $T$-invariant measure $\mu$ on $X$ such that $\mu(B)>0$ and $\mu(B^c)>0$ is a weighted sum of the conditional measures $\mu_B$ and $\mu_{B^c}$ on $B$ and $B^c$, which are invariant under $T_1$ and $T_2$, respectively. In this case one can reduce the study of the map $T:X\to X$ to the study of two ``smaller'' maps $T_1:X_1\to X_1$ and $T_2:X_2\to X_2$.\\ \noindent{\sc 7.12 Definition}. A $T$-invariant measure $\mu$ is said to be {\bf ergodic} if there is no $T$-invariant subset $B\subset X$ such that $\mu(B)>0$ and $\mu(B^c)>0$. (Equivalently: there is no $T$-invariant set $B$ such that $0<\mu(B)<1$.)\\ We also say that $T$ is {\bf ergodic} (with respect to the measure $\mu$). The set of ergodic measures is denoted by ${\cal M}_{\rm erg} \subset {\cal M}_{\rm inv}$.\\ \noindent{\sc 7.13 Examples}. \begin{itemize} \item[(a)] If $X$ is a finite set and $T:X\to X$ is a permutation (see 2.26f), then a $T$-invariant measure is ergodic iff it is concentrated on one cycle. \item[(b)] If $X=[0,1]$ and $T(x)=x^2$, see 2.26e, then the only ergodic measures are $\delta_0$ and $\delta_1$. Note: the $T$-invariant set $(0,1)$ cannot carry any $T$-invariant measure, see 2.26c. 
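\end{itemize} We see that ergodic measures are, in a sense, ``extreme'' invariant measures.\\
% NOTE(review): the text between "for some $0<" and ">0$ and $\nu(B_{\infty}^c)<0$" (end of the definition of extremal points, statement of 7.14 and most of its proof) was lost in the source; it is reconstructed below from the surviving end of the proof and from the proof of 7.18 -- verify against the original notes.
Recall that the collection of $T$-invariant measures, ${\cal M}_{\rm inv}$ is a convex subset of ${\cal M}(X)$, see 4.1. For any convex set $R$ a point $x\in R$ is called an {\bf extremal point} of $R$ if there is no pair of points $x_1,x_2\in R$, $x_1\neq x_2$, such that $x=px_1+(1-p)x_2$ for some $0<p<1$.\\ \noindent{\sc 7.14 Proposition}. An invariant measure $\mu\in{\cal M}_{\rm inv}$ is ergodic if and only if it is an extremal point of ${\cal M}_{\rm inv}$.\\ {\em Proof}. If $\mu$ is not ergodic, then there is a $T$-invariant set $B$ with $0<\mu(B)<1$, and by 7.11 we have $\mu=p\mu_B+(1-p)\mu_{B^c}$ with $p=\mu(B)$, where the conditional measures $\mu_B\neq\mu_{B^c}$ are $T$-invariant. Hence $\mu$ is not an extremal point. Conversely, let $\mu=p\mu_1+(1-p)\mu_2$ for some $\mu_1,\mu_2\in{\cal M}_{\rm inv}$, $\mu_1\neq\mu_2$, and $0<p<1$. Consider the signed measure $\nu=\mu_1-\mu_2$. It is $T$-invariant, $\nu\neq 0$ and $\nu(X)=0$. By the Hahn decomposition theorem, there is a set $B\in{\cal B}$ such that $\nu$ is nonnegative on all measurable subsets of $B$ and nonpositive on all measurable subsets of $B^c$, so that $\nu(B)>0$ and $\nu(B^c)<0$. The invariance of $\nu$ implies that $B=T^{-1}(B)$ up to a null set of the total variation $|\nu|$, which is a finite $T$-invariant measure. Arguing as in 7.9(b), the fully invariant set $B_{\infty}$ coincides with $B$ up to a $|\nu|$-null set, hence $\nu(B_{\infty})>0$ and $\nu(B_{\infty}^c)<0$. Thus, $\mu(B_{\infty})>0$ and $\mu(B_{\infty}^c)>0$, so the measure $\mu$ is not ergodic. $\Box$ \\ We see that a nonergodic measure $\mu$ can be represented by a weighted sum of two other invariant measures. If those measures are not ergodic either, this decomposition can be continued, and ultimately we can represent $\mu$ by ergodic measures only:\\ \noindent{\sc 7.15 Theorem (Ergodic Decomposition)}. Let $X$ be a topological space. Given any invariant measure $\mu\in{\cal M}_{\rm inv}$ there exists a probability measure $\rho_{\mu}$ on the space ${\cal M}_{\rm inv}$ such that \\ {\rm (a)} $\rho_{\mu}({\cal M}_{\rm erg})=1$;\\ {\rm (b)} for any $f\in L^1_{\mu}(X)$ $$ \int_X f\, d\mu = \int_{{\cal M}_{\rm erg}} \left (\int_X f\, d\nu \right )\, d\rho_{\mu}(\nu) $$ (i.e. the invariant measure $\mu$ is an affine combination, weighted by $\rho_{\mu}$, of ergodic measures $\nu\in{\cal M}_{\rm erg}$).\\ We assume this statement without proof. A proof is outlined in Pollicott, pp. 18--19. The ergodic decomposition theorem 7.15 is conceptually important, but practically it is hardly useful (see Remark~7.21, however). \\ \noindent{\sc 7.16 Corollary}. Let $X$ be a compact metrisable topological space and $T:X\to X$ a continuous transformation. Then ${\cal M}_{\rm inv}$ is closed and ${\cal M}_{\rm erg}\neq\emptyset$, i.e. there exists at least one ergodic measure. \\ \noindent{\sc 7.17 Definition}. 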
Two measures $\mu_1,\mu_2 \in{\cal M}(X)$ are said to be {\bf mutually singular} (denoted $\mu_1\perp \mu_2$) if there is a set $B\in{\cal B}$ such that $\mu_1(B)=1$ and $\mu_2(B)=0$.\\ \noindent{\sc 7.18 Proposition}. If $\mu_1,\mu_2\in{\cal M}_{\rm erg}$ are two ergodic measures, then they either coincide ($\mu_1=\mu_2$) or are mutually singular ($\mu_1\perp\mu_2$).\\ {\em Proof}. Consider the signed measure $\nu=\mu_1-\mu_2$. Arguing as in the proof of Proposition~7.14 gives $\mu_1(B_{\infty})>0$ and $\mu_2(B_{\infty}^c)>0$. Now use the ergodicity of $\mu_1$ and $\mu_2$. \\ \noindent{\sc 7.19 Proposition}. Let $\mu\in{\cal M}_{\rm inv}$ be an invariant measure. Suppose the measure $\rho_{\mu}$ in 7.15 is concentrated on a finite or countable set of ergodic measures $\nu_1,\nu_2,\ldots$. Since all of them are mutually singular by 7.18, there is a partition $X=\cup_{n\geq 1}X_n$, $X_i\cap X_j=\emptyset$ for $i\neq j$, such that for each $n\geq 1$ the measure $\nu_n$ is concentrated on $X_n$, i.e.\ $\nu_n(X_n)=1$. In this case $\nu_n=\mu_{X_n}$ is the conditional measure induced by $\mu$ on $X_n$. \\ \noindent{\sc 7.20 Remark}. The above proposition can be generalized to arbitrary invariant measures $\mu$. That is, the space $X$ can be decomposed into smaller $T$-invariant subsets on each of which the conditional measure induced by $\mu$ is ergodic. However, when those subsets have zero $\mu$-measure, the definition of conditional measures is quite involved and we do not give it here, see Pollicott, p.\ 18, and Walters, p.\ 9. \\ \noindent{\sc 7.21 Remark}. Let $A\subset X$ be a subset such that $\nu(A)=1$ for every ergodic measure $\nu$. Then $\mu(A)=1$ for every invariant measure $\mu$. Indeed, we can set $f=\chi_A$, the indicator of the set $A$, and apply Theorem~7.15.\\ Invariance of sets (defined by 7.3) is just as important as invariance of functions:\\ \noindent{\sc 7.22 Definition}. A function $f\in L^0(X)$ is {\bf invariant} if $U_Tf=f$, i.e. 
$f(T(x))=f(x)$ for all $x\in X$. In this case $f$ is constant on every orbit $\{T^nx\}$. Given an invariant measure $\mu$, a function $f$ is {\bf almost everywhere invariant} if $f(T(x))=f(x)$ for a.e.\ $x\in X$. This precisely means $\mu(\{x:\ f(T(x))\neq f(x)\})=0$.\\ Note: if the function $f$ is invariant a.e., then $f$ is constant on the orbit of almost every point $x\in X$.\\ \noindent{\sc 7.23 Lemma (Characterizing ergodic measures)}. An invariant measure $\mu$ is ergodic iff any invariant (alternatively: any almost everywhere invariant) function $f$ is constant a.e. Note: it is enough to restrict this criterion to functions $f\in L^1_{\mu}(X)$ or $f\in L^2_{\mu}(X)$ or even $f\in L^{\infty}(X)$. \\ {\em Proof}. See Walters, p.\ 28, or Pollicott, p.\ 10.\\ \noindent{\sc 7.24 Exercise}. Let $\mu$ be an ergodic measure and $\mu(A)>0$. Show that $\mu(A_{\infty})=1$, i.e. almost every point $x\in X$ visits the set $A$ infinitely many times. \\ \noindent{\sc 7.25 Corollary}. Let $X$ be a topological space with a countable basis. Let $\mu$ be an ergodic measure such that $\mu(U)>0$ for every nonempty open set $U$ (this is quite common in physics). Then the orbit of almost every point $x\in X$ visits every open set infinitely many times. In particular, the orbit of almost every point is dense in $X$. \\ The following two exercises are very similar. Do only one of them (your choice):\\ \noindent{\sc 7.26 Exercise}. Let a map $T:X\to X$ be ergodic with respect to a measure $\mu$. Let $\mu(A)>0$. Show that the first return map $T_A:\tilde{A}\to\tilde{A}$ constructed in 6.6 is ergodic with respect to the conditional measure $\mu_{\tilde{A}}$ (defined in 6.7).\\ \noindent{\sc 7.27 Exercise}. Let a map $T:X\to X$ be ergodic with respect to a measure $\mu$. Let $\tau:X\to\IN$ be a function with a finite integral. 
Show that the map $S$ on the tower $Y$ defined in 6.10 is ergodic with respect to the measure $\tilde{\mu}$ (also defined in 6.10).\\ An important concept in dynamical systems is that of isomorphism.\\ \noindent{\sc 7.28 Definition}. For $i=1,2$, let $T_i:X_i\to X_i$ be a transformation preserving a probability measure $\mu_i$. We say that $T_1$ and $T_2$ are {\bf isomorphic} if for each $i=1,2$ there is a $T_i$-invariant set $B_i\subset X_i$ of full measure, i.e. $T_i(B_i)\subset B_i$ and $\mu_i(B_i)=1$, and a bijection $\varphi:B_1\to B_2$ such that \begin{itemize} \item[(i)] $\varphi$ preserves measures, i.e. for every measurable set $A\subset B_1$ the set $\varphi(A)\subset B_2$ is measurable and $\mu_1(A)=\mu_2(\varphi(A))$ (and vice versa); \item[(ii)] $\varphi$ preserves dynamics, i.e. $\varphi\circ T_1=T_2\circ\varphi$ on $B_1$. \end{itemize} We call $\varphi$ an {\bf isomorphism} and write $T_1\simeq T_2$. An isomorphism means that two dynamical systems, $(X_1,T_1,\mu_1)$ and $(X_2,T_2,\mu_2)$ are equivalent, up to sets of zero measure (which we neglect).\\ \noindent{\sc 7.29 Remarks}. Isomorphism is an equivalence relation. Also, if $T_1\simeq T_2$, then $T_1^n\simeq T_2^n$ for all $n\geq 1$.\\ \noindent{\sc 7.30 Exercise}. Assume that $T_1\simeq T_2$. Prove that $T_1$ is ergodic if and only if so is $T_2$. \newpage \section{Examples of Ergodic Maps} Here we show that many of the maps we have discussed so far are ergodic. Let us start with the circle rotation $T(x)=x+a$ (mod 1), where $x\in X=[0,1)$. \\ \noindent{\sc 8.1 Claim}. If $a\in\IQ$, i.e.\ $a$ is a rational number, then every point $x\in X$ is periodic. \\ {\em Proof}. Let $a=p/q$. Then $T^n(x)=x+np/q$ (mod 1) for all $n\geq 1$, hence $T^q(x)=x$. $\Box$ Note that if $p$ and $q$ are relatively prime, then $q$ is the minimal period of every point $x\in X$.\\ \noindent{\sc 8.2 Claim}. If $a$ is irrational, then for every $x\in X$ the trajectory $\{T^nx\}$, $n\geq 0$, is dense in $X$.\\ {\em Proof}. 
First, if $a$ is irrational, then no point $x$ can be periodic. Indeed, if $x$ is periodic with period $n$, then $x=T^n(x)=x+na$ (mod 1), hence $na\in\ZZ$, so $a$ is a rational number. Therefore, all points $\{T^nx\}$, $n\geq 1$, are distinct. Hence for any $\varepsilon >0$ there are two of those points, say $T^m(x)$ and $T^{m+k}(x)$, which are $\varepsilon$-close to each other. Put $y=T^m(x)$, then $T^k(y)=T^{m+k}(x)$, so dist$(y,T^ky)\leq\varepsilon$. Since $T$ preserves distances, we have $$ {\rm dist}(y,T^ky)=\, {\rm dist}(T^ky,T^{2k}y)=\, {\rm dist}(T^{2k}y,T^{3k}y)=\cdots $$ Since $T$ also preserves orientation, then the sequence $y,T^ky, T^{2k}y,T^{3k}y,\ldots$ moves in one direction on the circle, and eventually will go around it and make an $\varepsilon$-dense subset of $X$. Since $\varepsilon>0$ is arbitrary, we get our claim. $\Box$ \\ \noindent{\sc 8.3 Claim}. If $a$ is rational, then $T$ is not ergodic with respect to the Lebesgue measure $m$ on $X$.\\ {\em Proof}. Let $a=p/q$. For any $0<\varepsilon<1/q$ the set $$ B_{\varepsilon}=\cup_{i=0}^{q-1} \left ( i/q,i/q+\varepsilon\right ) $$ is invariant. Its measure is $m(B_{\varepsilon})=\varepsilon q$, so $0<m(B_{\varepsilon})<1$. $\Box$ \\ \noindent{\sc 8.4 Claim}. If $a$ is irrational, then $T$ is ergodic with respect to the Lebesgue measure $m$ on $X$.\\ {\em Proof}. We will need one fact from real analysis. Let $B\subset\IR$ be a Borel set. A point $x\in \IR$ is called a {\bf Lebesgue density point} (or just a {\bf density point}) of the set $B$ if $$ \lim_{\varepsilon\to 0} \frac{m(B\cap [x-\varepsilon,x+\varepsilon])}{2\varepsilon}=1 $$ The fact is that $m$-almost every point $x\in B$ is a density point. Now, assume that $T$ is not ergodic and $B\subset X$ is a $T$-invariant set, such that $0<m(B)<1$. Let $x$ be a density point of $B$ and $y$ a density point of $B^c$ (such points exist because $m(B)>0$ and $m(B^c)>0$). Find such a small $\varepsilon>0$ that $m(B\cap [x-\varepsilon,x+\varepsilon])>1.9\,\varepsilon$ and $m(B^c\cap [y-\varepsilon,y+\varepsilon])>1.9\,\varepsilon$. 
Now, since the orbit of $x$ is dense in $X$, there is an $n\geq 1$ such that dist$(T^nx,y)<0.1\,\varepsilon$. Because the set $B$ is $T$-invariant, we have $m(B\cap [T^nx-\varepsilon,T^nx+\varepsilon])>1.9\,\varepsilon$. This easily leads to a contradiction. $\Box$ \\ \begin{figure}[h] \centering \epsfig{figure=ds-8.eps}\caption{The sequence $y,T^ky,T^{2k}y,\ldots$ on the circle (a), and a set $B$ with its preimage $T^{-1}B$, the latter consisting of two identical parts (b).} \end{figure} Next we turn to the doubling map $T(x)=2x$ (mod 1), where $x\in X=[0,1)$. \\ \noindent{\sc 8.5 Claim}. The doubling map is ergodic with respect to the Lebesgue measure $m$.\\ {\em Proof}. Let $B\subset X$ be a Borel set. Note that $T^{-1}B$ consists of two identical copies of $B$, each is twice as small as the original $B$, one lies on the interval $[0,1/2)$ and the other on the interval $[1/2,1)$. Hence, $$ m(T^{-1}B\cap [0,1/2))=m(T^{-1}B\cap [1/2,1))=m(B)/2 $$ Applying the same argument to $T^{-n}B$ we get the following. Let $J_{i,n}=[i/2^n,(i+1)/2^n)$ for $i=0,1,\ldots,2^n-1$ (we call these {\bf binary intervals}). Then \be m(T^{-n}B\cap J_{i,n})=m(B)/2^n=m(B)\, m(J_{i,n}) \label{TnB} \ee for all $i=0,1,\ldots,2^n-1$. It is easy to see that the collection of all binary intervals $\{J_{i,n}\}$, $n\geq 1$, $0\leq i\leq 2^n-1$, generates the Borel $\sigma$-algebra on $X$. Finite unions of binary intervals make an algebra. Let $A\subset X$ be a Borel set. By the approximation theorem 1.19 for any $\varepsilon>0$ the set $A$ can be approximated by a finite union of binary intervals, i.e.\ there is $A_0=\cup_{j}J_{i_j,n}$ so that $m(A\Delta A_0) <\varepsilon$. Here $n$ can be made the same for all $j$. Certainly, $n$ depends on $\varepsilon$, so let us call it $n_{\varepsilon}$. Now, (\ref{TnB}) implies $$ m(T^{-n}B\cap A_0)=m(B)\, m(A_0) $$ for all $n\geq n_{\varepsilon}$. 
Since $A_0$ approximates $A$, it is easy to derive that $$ |m(T^{-n}B\cap A)-m(B)\, m(A)|<2\varepsilon $$ for all $n\geq n_{\varepsilon}$. Hence, \be m(A\cap T^{-n}B) \to m(A)\, m(B) \ \ \ \ \ \ {\rm as}\ \ n\to\infty \label{ABmix1} \ee Note: this is true for {\em any} (!) Borel sets $A,B\subset X$. Now, if $B$ is a $T$-invariant set, then $T^{-n}B=B$ and (setting $A=B$) we get $m(B)=[m(B)]^2$. This is only possible if $m(B)=0$ or $m(B)=1$. $\Box$ \\ \noindent{\sc 8.6 Remark}. The same argument, without changes, applies to the tent map. Hence, the tent map is ergodic with respect to the Lebesgue measure.\\ Next, we take the quadratic map $T(x)=4x(1-x)$ for $x\in [0,1]$. This one has an absolutely continuous invariant measure (a.c.i.m.) with density given in 5.11. It is hard, in this case, to show the ergodicity directly, but there is a helpful trick. This map is shown on Fig.~4b and looks very much like the tent map shown on Fig.~4a (topologically, they are equivalent). Maybe they are isomorphic? \\ \noindent{\sc 8.7 Claim}. The tent map and the quadratic map are isomorphic. \\ {\em Proof}. Let $T_1(x)=2x$ for $x\leq 1/2$ and $T_1(x)=2-2x$ for $x>1/2$ be the tent map. Let $T_2(y)=4y(1-y)$ be the quadratic map. The isomorphism is established by the function $$ y=\varphi(x)=\frac{1-\cos \pi x}{2} $$ It is a bijection of the unit interval $[0,1]$ onto itself. We need to verify the preservation of measures 7.28(i) and dynamics 7.28(ii). We first check 7.28(i). Let $x\in (0,1)$ and $y=\varphi(x)$. Take a small interval $(x,x+dx)$ and let $y+dy= \varphi(x+dx)$. The preservation of measures means $$ dx = f(y)\, dy + o(dy) $$ where $f(y)$ is the density function given in 5.11. Dividing by $dx$ and taking the limit $dx\to 0$ gives $$ f(\varphi(x))\, \varphi'(x) = 1 $$ So, we need to verify this identity, which can be done by direct substitution. Next, we show how to check 7.28(ii). Let $x<1/2$. 
Then $$ \varphi(T_1(x))=\varphi(2x)=\frac{1-\cos 2\pi x}{2}=\sin^2\pi x $$ On the other hand, $$ T_2(\varphi(x))=4\varphi(x)(1-\varphi(x))= 4\times \frac{1-\cos\pi x}{2}\times \frac{1+\cos\pi x}{2}=\sin^2\pi x $$ so we get $\varphi\circ T_1=T_2\circ \varphi$. The case $x>1/2$ is similar. $\Box$ \\ \noindent{\sc 8.8 Corollary}. The quadratic map $T(x)=4x(1-x)$ is ergodic with respect to the a.c.i.m.\ with density given in 5.11.\\ Lastly, we turn to the baker's transformation of the unit square. \\ \noindent{\sc 8.9 Claim}. The baker's map is ergodic with respect to the Lebesgue measure.\\ {\em Proof}. After you grasp the proof of 8.5, this should be pretty clear. Our main tool will be binary rectangles, rather than binary intervals. A {\bf binary rectangle} is $$ R_{i,j,m,n}=\{(x,y):\ i/2^m\leq x<(i+1)/2^m,\ j/2^n\leq y<(j+1)/2^n \} $$ It is easy to see that the collection of all binary rectangles generates the Borel $\sigma$-algebra on $X$. Also, finite unions of binary rectangles make an algebra. Consider an arbitrary binary rectangle $R_{i,j,m,n}$. Note that $T(R_{i,j,m,n})=R_{s,t,m-1,n+1}$ for some $s,t$. Therefore, $T^m(R_{i,j,m,n})=R_{u,v,0,n+m}$ for some $u,v$. This last set is a rectangle, which stretches across $X$ all the way in the $x$ direction (from $x=0$ to $x=1$). Similarly, $T^{-n}(R_{i,j,m,n})=R_{e,f,m+n,0}$ is a rectangle stretching across $X$ all the way in the $y$ direction (from $y=0$ to $y=1$). Now, let $A$ and $B$ be two Borel subsets of $X$. By the approximation theorem 1.19, for any $\varepsilon>0$ there are sets $A_0$ and $B_0$, each being a finite union of some binary rectangles, such that $m(A\Delta A_0)<\varepsilon$ and $m(B\Delta B_0)<\varepsilon$. Let $A_0=\cup_{p,q}R_{i_p,j_q,m,n}$, where $m$ and $n$ can be made the same for all $(p,q)$. Then $T^m(A_0)$ is a union of binary rectangles stretched across $X$ all the way in the $x$ direction. The same is obviously true for $T^k(A_0)$ whenever $k\geq m$. 
Now let $B_0=\cup_{r,s}R_{i_r,j_s,m,n}$, where $m$ and $n$ can be made the same for all $(r,s)$ (and the same as above). Then $T^{-n}(B_0)$ is a union of binary rectangles stretched across $X$ all the way in the $y$ direction. The same is obviously true for $T^{-\ell}(B_0)$ whenever $\ell\geq n$. The above observations imply $$ m(T^kA_0\cap T^{-\ell}B_0)=m(T^kA_0)\, m(T^{-\ell}B_0) $$ for all $k\geq m$ and $\ell\geq n$. Since the Lebesgue measure $m$ is $T$-invariant and $T$ is an automorphism, $$ m(A_0\cap T^{-N}B_0)=m(A_0)\, m(B_0) $$ for all $N\geq m+n$. Next, because $A_0$ approximates $A$ and $B_0$ approximates $B$, it is easy to derive that $$ |m(A\cap T^{-N}B)-m(A)\, m(B)|<4\varepsilon $$ for all $N\geq m+n$. Hence, \be m(A\cap T^{-N}B) \to m(A)\, m(B) \ \ \ \ \ \ {\rm as}\ \ N\to\infty \label{ABmix2} \ee Note: this is true for {\em any} (!) Borel sets $A,B\subset X$. Now, if $B$ is a $T$-invariant set, then $T^{-N}B=B$ and (setting $A=B$) we get $m(B)=[m(B)]^2$. This is only possible if $m(B)=0$ or $m(B)=1$. $\Box$ \\ Finally, we discuss the uniqueness of the absolutely continuous invariant measures (a.c.i.m.'s) constructed in Section~5.\\ \noindent{\sc 8.10 Theorem}. Let $T:X\to X$ be a map of an interval $X\subset\IR$ that has an ergodic a.c.i.m.\ $\mu$ with a positive density $f(x)>0$. Then that a.c.i.m.\ is unique.\\ {\em Proof}. The assumption $f(x)>0$ implies that $\mu$ is equivalent to the Lebesgue measure $m$, i.e. $\mu(B)=0$ iff $m(B)=0$. Since $\mu$ is ergodic, then for each $T$-invariant set $B$ we have $m(B)=0$ or $m(B^c)=0$. If there were another a.c.i.m.\ $\nu$ with density $g(x)$, then for any $T$-invariant set $B$ we would have either $\nu(B)=\int_B g\, dm=0$ or $\nu(B^c)=\int_{B^c} g\, dm=0$. Hence, the measure with density $g$ would be ergodic, too. On the other hand, distinct ergodic measures are mutually singular by 7.18, a contradiction. 
$\Box$ \\ \begin{figure}[h] \centering \epsfig{figure=ds-9.eps}\caption{The function in Exercise 8.13.} \end{figure} \noindent{\sc 8.11 Remark}. The last theorem (with the same proof) extends to maps on a unit square or, more generally, any domain of $\IR^d$ for any $d\geq 1$.\\ \noindent{\sc 8.12 Corollary}. The doubling map, the tent map, the quadratic map and the baker's transformation have unique a.c.i.m.'s.\\ \noindent{\sc 8.13 Exercise}. Let $T:[0,1)\to [0,1)$ be defined by $$ T(x)=\left\{\begin{array}{ll} 2x & {\rm if}\ \ x<1/4\\ 2x-1/2 & {\rm if}\ \ 1/4\leq x<3/4 \\ 2x-1 & {\rm if} \ \ 3/40$. Assume, additionally, that $$ 0f_-(x)\}$. If $\mu(A)>0$, then apply Ergodic Theorem to the restriction of $T$ to $A$ preserving the conditional measure $\mu_A$. This yields $\int_A f_+ \, d\mu=\int_A f\, d\mu = \int_A f_-\, d\mu$, a contradiction. Similarly one can show that the set $\{x:\, f_+(x)0\}$ and $A=\cup_{N\geq 1}A_N$. Then \be \int_{A_N} f\, d\mu \geq 0 \ \ \ \ {\rm and}\ \ \ \ \ \int_{A} f\, d\mu \geq 0 \label{maxert} \ee {\em Proof}. See Walters, pp. 37-38. We just sketch the proof here. First we prove that $\int_{A_N} f\, d\mu \geq 0$. For $0\leq n\leq N$ we have $F_N\geq S_n$, hence $F_N\circ T\geq S_n\circ T$, and hence $F_N\circ T + f\geq S_{n+1}$. Therefore, for all $x\in A_N$ we have $$ F_N(T(x))+f(x)\geq \max_{1\leq n\leq N}\{S_n(x)\}=F_N(x) $$ (since $S_0(x)=0$ and $F_N(x)>0$). Thus, $f(x)\geq F_N(x)-F_N(T(x))$ for $x\in A_N$. Note also that $F_N(y)=0$ and $F_N(T(y))\geq 0$ for all $y\in A_N^c$. Hence, $$ \int_{A_N}f\, d\mu \geq \int_{A_N}F_N\, d\mu-\int_{A_N} F_N\circ T\,d\mu \geq \int_XF_N\, d\mu-\int_XF_N\circ T\,d\mu = 0 $$ the last equation is due to the invariance of $\mu$. Next, since $F_N\leq F_{N+1}$, then $A_N\subset A_{N+1}$ for all $N$. Now $$ \int_{A} f\, d\mu =\lim_{N\to\infty}\int_{A_N} f\, d\mu \geq 0 \ \ \ \ \ \ \ \ \ \ \ \Box $$ Step 2 consists in using Lemma 9.8 to prove the clause (a) of Theorem 9.2. See Walters, p. 38. 
We sketch the argument here. Let $$ \bar{f}(x)=\limsup_{n\to +\infty}\frac 1n\, S_n(x) $$ and $$ \underline{f}(x)=\liminf_{n\to +\infty}\frac 1n\, S_n(x) $$ It is enough to show that $\bar{f}=\underline{f}$ a.e. If this is not the case, then there are real numbers $\alpha>\beta$ such that the set $$ E=E_{\alpha,\beta}=\{x:\ \bar{f}(x)>\alpha\ \ {\rm and} \ \ \underline{f}<\beta\} $$ has positive measure, i.e. $\mu(E)>0$. Note that the functions $\bar{f}$ and $\underline{f}$ are invariant, hence $E$ is a fully $T$-invariant set. Consider a function $g=(f-\alpha)\,\chi_E$, where $\chi_E$ is the indicator of $E$. Then for all $x\in E$ we have $$ \sup_{N\geq 1} \Big ( g(x)+g(Tx)+\cdots +g(T^{N-1}x)\Big ) > 0 $$ and $g\equiv 0$ on $E^c$. Applying (\ref{maxert}) to the function $g$ we get $\int_E g\, d\mu \geq 0$, hence \be \int_E f\, d\mu \geq \alpha\, \mu(E) \label{Ea} \ee Similarly, we can show that \be \int_E f\, d\mu \leq \beta\, \mu(E) \label{Eb} \ee (by applying the previous argument to the function $-f$). But then (\ref{Ea}) and (\ref{Eb}) imply $\mu(E)=0$. \\ Step 3 consists of deriving the clauses (b), (c) and (d) from (a). See Walters, p. 39. The clause (b) is trivial. The clause (d) follows by 7.23. The integrability of $f_+$ in the clause (c) follows from Fatou's lemma, which in fact gives \be \| f_+ \|_1 \leq \| f \|_1 \label{f+f} \ee It remains to prove the integral identity in (c). Our argument is different from that of Walters. First, observe that $\int_X S_n/n\, d\mu=\int_Xf\, d\mu$ for all $n$, because $\mu$ is invariant. If $f$ is bounded, then $\|S_n/n\|_{\infty}\leq \|f\|_{\infty}$, hence the sequence $S_n/n$ is uniformly bounded. In this case the integral identity in (c) follows from the dominated convergence theorem. For an arbitrary $f\in L^1_{\mu}(X)$, we take $\varepsilon>0$ and approximate $f$ with a bounded function $\phi$ so that $\|f-\phi\|_1<\varepsilon$. Since $\phi$ is bounded, we have $\int \phi\, d\mu=\int \phi_+\, d\mu$. 
Lastly, $$ \|f_+-\phi_+\|_1= \|(f-\phi)_+\|_1\leq \|f-\phi\|_1<\varepsilon $$ where we applied (\ref{f+f}) to the function $f-\phi$. This implies $|\int f\, d\mu-\int f_+\, d\mu|<2\varepsilon$. Ergodic Theorem 9.2 is now proved. $\Box$\\ \noindent{\sc 9.9 Corollary ($L^p$ Ergodic Theorem of Von Neumann)}. Let $1\leq p<\infty$. If $f\in L^p_{\mu}(X)$, then $\|S_n/n-f_+\|_p\to 0$ as $n\to\infty$.\\ {\em Proof}. See Walters, p.\ 36. By the way, it is quite similar to our proof of the clause (c) of Theorem 9.2. $\Box$ \\ \noindent{\sc 9.10 Exercise}. Let $A\subset X$ and $\mu(A)>0$. Prove that $r_A(x)>0$ (defined in 9.5) for almost every point $x\in A$. [Hint: consider the set $B=\{x\in A:\ r_A(x)=0\}$.]\\ \noindent{\sc 9.11 Example}. There is an interesting application of the ergodic theorem to number theory. Recall Example 2.27, which involves the map $T(x)=10x$ (mod 1) on the unit interval $X=[0,1)$. This map preserves the Lebesgue measure $m$, as shown in 2.27. One can also show that $m$ is ergodic by using the same argument as in 8.5, we omit details. Consider the set $A_r=[r/10,(r+1)/10)$ for some $r=0,1,\ldots,9$. For $x\in X$, the inclusion $T^n(x)\in A_r$ means that the $n$-th digit in the decimal representation of $x$ is $r$, see 2.27. For $n\geq 1$, let $K_r(n,x)$ be the number of occurrences of the digit $r$ among the first $n$ digits of the decimal representation of $x$. This is exactly $\#\{0\leq i\leq n-1:\ T^i(x)\in A_r\}$. Corollary 9.6 now implies that $$ \lim_{n\to\infty} K_r(n,x)/n = m(A_r)=0.1 $$ for almost every $x\in X$. The above fact is known in number theory. A number $x\in [0,1)$ is called {\bf normal} if for every $r=0,1,\ldots,9$ the asymptotic frequency of occurrences of the digit $r$ in the decimal representation of $x$ is exactly 0.1. 
The fact, which we just proved, that almost every point $x\in X$ is normal is known as {\bf Borel Theorem on Normal Numbers}.\\ Next we derive some further consequences of Ergodic Theorem 9.2 along the lines of 9.5 and 9.6. We need to assume that $X$ is a compact metrisable topological space, but the map $T$ does not have to be continuous. For $x\in X$, consider the sequence of uniform atomic measures (recall Definition 3.13) \be \mu^{(n)}_x=(\delta_x+\delta_{Tx}+\cdots +\delta_{T^{n-1}x})/n \label{munx} \ee As $n\to\infty$, the measure $\mu_x^{(n)}$ may converge to a probability measure $\mu\in{\cal M}(X)$ in the weak* topology.\\ \noindent{\sc 9.12 Definition}. A point $x\in X$ is said to be $\mu${\bf -generic} for a measure $\mu\in {\cal M}(X)$ if the sequence $\mu^{(n)}_x$ defined by (\ref{munx}) weakly converges to $\mu$ as $n\to\infty$. Equivalently, $$ \lim_{n\to\infty} \frac 1n\sum_{i=0}^{n-1} F(T^i(x)) = \int_X F\, d\mu \ \ \ \ \ \ \ \forall F\in C(X) $$ i.e. the {\em time averages} are equal to the {\em space averages} for all continuous functions.\\ If a point $x$ is $\mu$-generic, then the trajectory $\{T^nx\}$ of $x$ is distributed in the space $X$ according to the measure $\mu$ (one can say that $x$ is ``attracted'' by the measure $\mu$).\\ \noindent{\sc 9.13 Definition}. Let $T:X\to X$ be a map and $\mu\in{\cal M}(X)$. The set $$ B_{\mu}=\{x:\ x\ {\rm is}\ \mu{\rm-generic}\} $$ is called the {\bf basin of attraction} of the measure $\mu$. \\ Note: if $\mu\neq \nu$ are two distinct measures, then $B_{\mu}\cap B_{\nu}=\emptyset$.\\ \noindent{\sc 9.14 Proposition}. If $\mu$ is an ergodic measure, then $\mu$-almost every point $x\in X$ is $\mu$-generic, i.e. $\mu(B_{\mu})=1$.\\ {\em Proof}. Let ${\cal J}$ be a countable basis in the topology on $X$ and ${\cal A}({\cal J})=\{A_k\}$ a countable algebra of $X$ generated by $\cal J$, see 1.17. For every $k$ we have $\mu^{(n)}_x(A_k)\to \mu(A_k)$ as $n\to\infty$ a.e.\ by 9.6, i.e. 
we have this convergence for all $x\in X_k$ with $\mu(X_k)=1$. Let $X_{\infty}=\cap_k X_k$. Obviously, $\mu(X_{\infty})=1$. Since every open set $U\subset X$ is a union of some disjoint elements of ${\cal A}({\cal J})$, one can easily derive that $\liminf_n\mu^{(n)}_x(U)\geq\mu(U)$ for every $x\in X_{\infty}$. Therefore, $\mu^{(n)}_x$ weakly converges to $\mu$ by 3.11(iii). $\Box$\\ \noindent{\sc 9.15 Applications in Physics}. Suppose a map $T:X\to X$ models a physical process. In this case, usually, $X$ is a compact topological space with some natural coordinates on it (examples: a compact domain in $\IR^d$, a sphere, a torus, etc.). The coordinates allow us to define the Lebesgue measure $m$ on $X$. It measures area or volume in $X$, depending on the dimension of $X$. Let $m$ be normalized so that $m(X)=1$. A typical physical experiment (or a numerical test done with the aid of a computer) consists of choosing a point $x\in X$ {\em at random} and experimentally following (or numerically generating) its trajectory $x,T(x),\ldots,T^{n-1}(x)$ until some large time $n$. The points $\{x,T(x),\ldots,T^{n-1}(x)\}$ represent the measure $\mu^{(n)}_x$ defined by (\ref{munx}). Proposition 9.14 shows that if $\mu$ is an ergodic $T$-invariant measure and the point $x\in X$ is typical with respect to $\mu$, then the measure $\mu^{(n)}_x$ weakly converges to $\mu$ as $n\to\infty$, i.e.\ the measure $\mu$ describes the distribution of typical orbits in the space $X$. However, in practice one may NOT want (or may NOT be able) to choose a point $x$ typical with respect to some ergodic measure $\mu$. Why should those points be physically interesting? Physicists may not even have any ergodic measure at hand! What they want is to choose a point $x$ typical with respect to the Lebesgue measure $m$ on $X$. It is a fundamental principle in statistical physics that only such points are physically relevant (or experimentally observable). 
Such points are also easy to generate by computer programs (using so-called random number generators). This motivates the following definition: \\ \noindent{\sc 9.16 Definition}. Let $X$ be a compact space with natural coordinates and a (normalized) Lebesgue measure $m$. A $T$-invariant measure $\mu$ is said to be {\bf physically observable} if $m(B_{\mu})>0$. Such measures are also referred to as {\bf Sinai-Ruelle-Bowen (SRB) measures} in the modern theory of dynamical systems. \\ A measure $\mu$ is physically observable if there is a chance to ``observe'' $\mu$ by following a trajectory chosen at random with respect to the Lebesgue measure $m$, i.e.\ observe $\mu$ in a physical experiment or by a computer simulation.\\ \noindent{\sc 9.17 Proposition}. Let $T:X\to X$ be the irrational circle rotation, or the doubling map, or the tent map, or the quadratic map, or the baker's transformation, and $\mu$ the absolutely continuous invariant measure on $X$. Then $\mu$ is physically observable. Moreover, $m(B_{\mu})=1$, i.e. the a.c.i.m.\ $\mu$ is the {\em only} physically observable measure. \\ {\em Proof}. Since $\mu$ is ergodic, then $\mu(B_{\mu})=1$ by 9.14. Also, $\mu$ is equivalent to the Lebesgue measure $m$, hence $m(B_{\mu})=1$. $\Box$ \\ \noindent{\sc 9.18 Exercise}. Let $T(x)=x^2$ for $x\in X=[0,1]$. Which invariant measures are physically observable? Recall Exercise 2.26(e). \\ \noindent{\sc 9.19 Exercise (optional; it is rather tricky)}. Let $T(x)=x+a$ (mod 1) for $x\in X=[0,1)$ be a circle rotation with a rational $a\in \IQ$. Are there any physically observable measures? \\ \newpage \section{Symbolic Dynamics} Recall that in Example 2.28 we represented every point $x\in X=[0,1)$ by a sequence of binary digits $i_0i_1i_2\ldots$ and the doubling map $T(x)=2x$ (mod 1) then corresponded to the left shift of the sequence (with the removal of the first digit $i_0$). 
In Example 5.5 we represented every point $(x,y)\in X=[0,1)\times [0,1)$ of the unit square by a double infinite sequence of binary digits $\{\omega_n\}_{n=-\infty}^{\infty}$ and then the baker's map $T:\, X\to X$ corresponded to the left shift of the entire sequence. Here we generalize these examples.\\ \noindent{\sc 10.1 Definitions}. Let $S$ be a finite set of $r\geq 2$ elements. We call $S$ an {\bf alphabet} and elements $s\in S$ {\bf letters}. We label the letters by $1,2,\ldots,r$, i.e. we assume $S=\{1,\ldots,r\}$. Let $\Omega_+=\Omega_{+,r}=S^{\ZZ_+}$ denote the space of infinite sequences of letters. So a point $\uom\in\Omega_+$ is a sequence $\uom=\{\omega_n\}_{n=0}^{\infty}$ with $\omega_n\in S$ for each $n\geq 0$. Also, let $\Omega=\Omega_r=S^{\ZZ}$ be the space of double infinite sequences of elements of $S$, i.e. $\Omega$ consists of sequences $\uom=\{\omega_n\}_{n=-\infty}^{\infty}$ with $\omega_n\in S$ for each $n\in\ZZ$. We call $\Omega_+$ and $\Omega$ {\bf symbolic spaces}. (For brevity, we suppress the index $r$.) We equip the finite set $S$ with the discrete topology (i.e., all its subsets are open). We consider the {\bf product topology} on $\Omega_+$ and $\Omega$. The product topology is generated by cylinders, see below. Cylinders make a countable basis in the product topology and play the same important role as intervals in $\IR$. Let $0\leq m\leq n<\infty$ and $\omega_i'\in S$ for $m\leq i\leq n$. A {\bf cylinder} is the set $$ C_{m,n}(\omega_m',\ldots,\omega_n') =\{\uom\in\Omega_+:\ \omega_i=\omega_i' \ \ {\rm for}\ {\rm all}\ m\leq i\leq n\} $$ In other words, the cylinder $C_{m,n}(\omega_m',\ldots,\omega_n')$ consists of all sequences whose ``coordinates'' from $m$ to $n$ are fixed (equal to the given letters $\omega_m',\ldots,\omega_n'$). Similarly, a cylinder $C_{m,n}\subset \Omega$ can be defined for any $-\inftyi_0\} $$ Clearly, $\Omega_+'$ is countable set, so $\mu(\Omega_+')=0$. 
Now, for each $x\in [0,1)\setminus B'$ the binary representation $x=0.i_0i_1i_2\ldots$ constructed in 2.28 is unique. We define a sequence $\uom=\phi(x)\in\Omega_+$ by $\omega_n=i_n+1$ for all $n\geq 0$. This defines a bijection between $[0,1)\setminus B'$ and $\Omega_+\setminus\Omega_+'$. One can check by direct inspection that $\phi$ preserves the measure and dynamics, i.e. it is an isomorphism. $\Box$ \\ \noindent{\sc 10.12 Proposition}. The baker's map defined by 5.1 with the Lebesgue measure $m$ is isomorphic to the (two-sided) Bernoulli shift $B_2(1/2,1/2)$.\\ {\em Proof}. Similar to the previous one. \\ \noindent{\sc 10.13 Exercise}. Let $C$ and $C'$ be two cylinders and $\mu$ a Bernoulli measure. Show that there is an $n_0\geq 0$ such that $$ \mu(C\cap \sigma^{-n}(C'))=\mu(C)\, \mu(C') $$ for all $n\geq n_0$. Note: this applies to both $\Omega$ and $\Omega_+$.\\ \noindent{\sc 10.14 Theorem}. Every Bernoulli shift is ergodic. \\ {\em Proof}. Let $A$ and $B$ be two Borel subsets of $\Omega$ (or $\Omega_+$). By the approximation theorem 1.19, for any $\varepsilon>0$ there are sets $A_0$ and $B_0$, each being a finite disjoint union of some cylinders, such that $\mu(A\Delta A_0)<\varepsilon$ and $\mu(B\Delta B_0)<\varepsilon$. The result of Exercise 10.13 implies that there is an $n_0\geq 0$ such that $$ \mu(A_0\cap \sigma^{-n}(B_0))=\mu(A_0)\, \mu(B_0) $$ for all $n\geq n_0$. Since $A_0$ approximates $A$ and $B_0$ approximates $B$, it is easy to derive that $$ |\mu (A\cap \sigma^{-n}(B))-\mu(A)\, \mu(B)|<4\varepsilon $$ for all $n\geq n_0$. Hence, \be \mu(A\cap \sigma^{-n}(B)) \to \mu(A)\, \mu(B) \ \ \ \ \ \ {\rm as}\ \ n\to\infty \label{ABmix3} \ee Now, if $B$ is a $\sigma$-invariant set, then $\sigma^{-n}(B)=B$ and (setting $A=B$) we get $\mu(B)=[\mu(B)]^2$. This is only possible if $\mu(B)=0$ or $\mu(B)=1$. 
$\Box$\\ Whenever we establish an isomorphism between a given dynamical system $(X,T,\mu)$ and a symbolic system $(\Omega,\sigma,\nu)$ or $(\Omega_+,\sigma,\nu)$ with some $\sigma$-invariant measure $\nu$, we call this a {\bf symbolic representation} of $(X,T,\mu)$. We now outline a standard method of constructing symbolic representations. \\ \noindent{\sc 10.15 Definition}. Let $X=X_1\cup \cdots \cup X_r$ be a finite partition of $X$ into disjoint parts, $X_i\cap X_j=\emptyset$ for $i\neq j$. Let $T:X\to X$ be a map. For every point $x\in X$ its {\bf itinerary} is a sequence defined by $$ \{\omega_n\}_{n=0}^{\infty}:\ \ \ \ \ \ T^n(x)\in X_{\omega_n}\ \ \ \forall n\geq 0 $$ If the map $T$ is a bijection, i.e. $T^{-1}:X\to X$ is also defined, then the {\bf full itinerary} of a point $x\in X$ is a double infinite sequence defined by $$ \{\omega_n\}_{n=-\infty}^{\infty}:\ \ \ \ \ \ T^n(x)\in X_{\omega_n}\ \ \ \forall n\in\ZZ $$ \\ \noindent{\sc 10.16 Definition}. A partition $X=X_1\cup \cdots \cup X_r$ is called a {\bf generating partition} if distinct points have distinct itineraries. Equivalently, for any $x\neq y$ there is an $n$ such that $T^n(x)\in X_i$ and $T^n(y)\in X_j$ with some $i\neq j$.\\ \noindent{\sc 10.17 Construction of a symbolic representation}. Let $T:X\to X$ be a map and $X=X_1\cup \cdots \cup X_r$ a generating partition. Let $\phi: X\to \Omega_+$ (or $\phi:X\to\Omega$, if $T$ is an automorphism) be the map that takes every point $x\in X$ to its itinerary constructed in 10.15. This map is injective for any generating partition. Let $\Omega_X=\phi(X)$ be the image of $X$. Then $\Omega_X$ is $\sigma$-invariant, i.e. $\sigma(\Omega_X)\subset\Omega_X$. Moreover, $\phi\circ T=\sigma\circ\phi$. If $T$ has an invariant measure $\mu$ on $X$, one can define a measure $\nu$ on $\Omega_X$ by $\nu(B)=\mu(\phi^{-1}(B))$. Then the dynamical systems $(X,T,\mu)$ and $(\Omega_X,\sigma,\nu)$ will be isomorphic. 
This is a general principle for the construction of a symbolic representation.\\ \noindent{\sc 10.18 Remark}. In the above symbolic representation of $T:X\to X$, any cylinder $C_{m,n}(\omega_{m},\ldots,\omega_n) \subset \Omega_X$ corresponds to the set $$ X_{m,n}(\omega_{m},\ldots,\omega_n) =\cap_{k=m}^{n}T^{-k}X_{\omega_k} $$ that is, $\phi^{-1}(C_{m,n}(\omega_{m},\ldots,\omega_n)) =X_{m,n}(\omega_{m},\ldots,\omega_n)$.\\ \noindent{\sc 10.19 Remarks}. The symbolic representation of the doubling map corresponds to the partition $X_1=[0,0.5)$ and $X_2=[0.5,1)$. The symbolic representation of the baker's map corresponds to the partition of the square $X$ by the line $x=0.5$. \newpage \section{Mixing} In this section, we again assume that $T:X\to X$ is a transformation preserving a measure $\mu$. \\ \noindent{\sc 11.1 Proposition}. $T$ is ergodic iff for every two functions $f,g\in L_{\mu}^2(X)$ \be \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1}\int_X f(T^ix)g(x)\, d\mu = \int_X f\, d\mu\cdot \int_X g\,d\mu \label{fgerg} \ee {\em Proof}. If $T$ is not ergodic, then there is an invariant set $A$ with $0<\mu(A)<1$. The choice of $f=g=\chi_A$ shows that (\ref{fgerg}) fails. Let $T$ be ergodic. By Ergodic Theorem, $$ \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1} f(T^ix) = \int_X f\, d\mu \ \ \ \ \ {\rm a.e.} $$ Multiplying by $g(x)$ and integrating over $X$ should give the result. The exact argument is similar to the proof of (c) in Ergodic Theorem. First, for bounded functions $f$ and $g$ we can use the dominated convergence theorem. Then, for arbitrary functions $f,g\in L_{\mu}^2(X)$, we approximate them by bounded functions (in the $L^2$ metric) and use standard integral estimates (including the Schwarz inequality) to obtain the result. The (boring) details are left out. $\Box$ \\ \noindent{\sc 11.2 Remarks}. 
\begin{itemize} \item[(a)] We assume in 11.1 that $f,g\in L^2_{\mu}(X)$ (not just $f,g\in L^1_{\mu}(X)$) in order to ensure the existence of the integrals on the left hand side of (\ref{fgerg}). \item[(b)] Physicists use a more convenient notation: $\int_X f\, d\mu = \la f\ra_{\mu}$ (or just $\la f\ra$). Then (\ref{fgerg}) can be rewritten as $$ \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1}\la (f\circ T^i)\, g\ra = \la f \ra \, \la g\ra $$ \end{itemize} \noindent{\sc 11.3 Proposition}. $T$ is ergodic iff for every two measurable sets $A,B\subset X$ \be \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1} \mu(T^{-i}A\cap B)= \mu(A)\, \mu(B) \label{ABerg} \ee {\em Proof}. If $T$ is not ergodic, let $A$ be an invariant set with $0<\mu(A)<1$ and choose $B=A$, then (\ref{ABerg}) fails. If $T$ is ergodic, apply Proposition 11.1 with $f=\chi_A$ and $g=\chi_B$. $\Box$ \\ \noindent{\sc 11.4 Remark}. In the proofs of Claims 8.5 and 8.9 and Theorem~10.14, we have obtained a stronger convergence than (\ref{ABerg}): $$ \lim_{n\to\infty} \mu(T^{-n}A\cap B)=\mu(A)\, \mu(B) $$ for all pairs of measurable sets $A,B\subset X$. This motivates the following definition:\\ \noindent{\sc 11.5 Definition}. We say that a map $T:X\to X$ is {\bf mixing} (with respect to $\mu$) if for all measurable subsets $A,B\subset X$ \be \lim_{n\to\infty} \mu(T^{-n}A\cap B)=\mu(A)\, \mu(B) \label{ABmix} \ee We also say that the measure $\mu$ is mixing if this holds. \\ \noindent{\sc 11.6 Corollary}. Mixing transformations are ergodic. \\ \noindent{\sc 11.7 Exercise}. Assume that two maps $T_1:X_1\to X_1$ and $T_2:X_2\to X_2$ are isomorphic, i.e.\ $T_1\simeq T_2$. Prove that $T_1$ is mixing if and only if $T_2$ is.\\ \noindent{\sc 11.8 Proposition}. The doubling map, the tent map, the quadratic map and the baker's map are mixing with respect to their absolutely continuous invariant measures. \\ {\em Proof}. 
This follows from our proofs of Claims 8.5 and 8.9 (formulas (\ref{ABmix1}) and (\ref{ABmix2})), Remark 8.6 and Exercise 11.7. $\Box$ \\ \noindent{\sc 11.9 Proposition}. Every Bernoulli shift is mixing. \\ {\em Proof}. See the proof of Theorem 10.14 (formula (\ref{ABmix3})). $\Box$ \\ \noindent{\sc 11.10 Claim}. Circle rotations are never mixing (with respect to the Lebesgue measure). \\ {\em Proof}. Let $T(x)=x+a$ (mod 1) be a circle rotation. If $a\in\ZZ$, then $T$ is an identity and the claim is trivial. Otherwise $T^{-1}(x)\neq x$. Take a small interval (an arc) $A'=(x-\varepsilon,x+\varepsilon)$ such that $T^{-1}(A')\cap A'=\emptyset$. Let $A=(x-\varepsilon/2,x+\varepsilon/2)$ and $B=A$. If (\ref{ABmix}) holds, then $T^{-n}A\cap A\neq\emptyset$ for all large enough $n$. On the other hand, if $T^{-n}A\cap A\neq\emptyset$ for some $n$, then $T^{-(n+1)}A\subset T^{-1}A'$ is disjoint from $A$, a contradiction. $\Box$ \\ Note: if an automorphism $T$ is mixing, then so is $T^{-1}$. Hence, the mixing of an automorphism $T$ can be defined by $$ \lim_{n\to\infty} \mu(T^{n}A\cap B)=\mu(A)\, \mu(B) $$ This is so because $\mu(T^{n}A\cap B)=\mu(A\cap T^{-n}B)$.\\ \noindent{\sc 11.11 Proposition}. $T$ is mixing iff for every two functions $f,g\in L_{\mu}^2(X)$ \be \lim_{n\to\infty} \la (f\circ T^n)\, g\ra = \la f\ra \, \la g\ra \label{fgmix} \ee {\em Proof}. If $T$ is not mixing, we take two sets $A$ and $B$ on which (\ref{ABmix}) fails and set $f=\chi_A$ and $g=\chi_B$, then (\ref{fgmix}) will fail as well. Let $T$ be mixing. Then (\ref{fgmix}) follows from (\ref{ABmix}) directly, for the characteristic functions $f=\chi_A$ and $g=\chi_B$ of any pair of sets $A,B\subset X$. By taking linear combinations of characteristic functions, we obtain (\ref{fgmix}) for any pair of simple functions. 
Then we approximate arbitrary functions $f,g\in L^2_{\mu}(X)$ by simple functions (in the $L^2$ metric) and use standard integral estimates (including the Schwarz inequality) to obtain the result. Once again, the boring details are left out. $\Box$ \\ \noindent{\sc 11.12 Remark}. Because $\mu(A)=\mu(T^{-n}A)$, the characteristic equation of the mixing property (\ref{ABmix}) can be rewritten as \be \lim_{n\to\infty} \Big | \,\mu(T^{-n}A\cap B)- \mu(T^{-n}A)\, \mu(B) \Big |= 0 \label{ABmixn} \ee In probability theory, the events $A$ and $B$ that satisfy $P(A\cap B)=P(A)\, P(B)$ are called {\bf independent}. The mixing condition (\ref{ABmixn}) in fact says that $T^{-n}A$ and $B$ become {\em asymptotically independent} as $n\to\infty$. In other words, the events $x\in B$ and $x\in T^{-n}A$ are almost independent for large $n$. Note that $x\in T^{-n}A$ is equivalent to $T^n(x)\in A$. Now we can say that the event $x\in B$ (a condition on the initial point $x$) becomes independent of the event $T^n(x)\in A$ (a condition on its image at time $n$) as $n\to\infty$. Or shortly, the {\bf distant future will become independent of the present} as time goes on. Alternatively, thinking of $T^n(x)$ as a {\em present} point and $x$ as initial, or {\em past} point, we can say that the {\bf present becomes independent of the remote past}. Another way to look at mixing is this. Given a set $A\subset X$, the condition (\ref{ABmixn}) must hold for every set $B$, as $n\to\infty$. Then $T^{-n}A$ must be a rather weird set that tends to overlap with every subset $B\subset X$ as $n\to\infty$. One may expect that $T^{-n}A$ ``spreads out'' and behaves like an octopus that penetrates with its tentacles every tiny corner of the space $X$, see Fig.~10. \begin{figure}[h] \centering \epsfig{figure=ds-10.eps}\caption{A set $A$ and its preimage $T^{-n}A$ under a mixing map.} \end{figure} \noindent{\sc 11.13 Remark}. 
In physics and ergodic theory, the quantity $$ C(f,g)=\la f\, g\ra - \la f\ra\, \la g\ra $$ is called the {\bf correlation}\footnote{In probability theory, the quantity $C(f,g)$ is called the {\bf covariance}, while the ratio $\rho_{f,g}=C(f,g)/\sqrt{C(f,f)\,C(g,g)}$ is called the correlation. We will use here the physics/ergodic terminology and call $C(f,g)$ the correlation.} between $f$ and $g$. The quantity $$ C_n(f,g)=\la (f\circ T^n)\, g\ra - \la f\ra\, \la g\ra $$ is called the correlation between $f$ and $g$ at time $n$. Now the mixing property (\ref{fgmix}) is equivalent to the convergence of correlations to zero, as time goes on. In this case, physicists say that the {\bf correlations decay}. For physical theories, it is very important to know just how fast the correlations decay for particular functions $f$ and $g$. The rate (or speed) of the decay of correlations characterizes various physics models.\\ \noindent{\sc 11.14 Definition}. We say that a map $T$ is {\bf weakly mixing} (with respect to $\mu$) if for all measurable subsets $A,B\subset X$ $$ \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1} |\mu(T^{-i}A\cap B)-\mu(A)\, \mu(B)|=0 \label{ABmixw} $$ We can also say that the measure $\mu$ is weakly mixing if this holds. \\ \noindent{\sc 11.15 Proposition}. If $T$ is mixing, then it is weakly mixing. If $T$ is weakly mixing, then it is ergodic.\\ Note: the circle rotation $T(x)=x+a$ (mod 1) is not even weakly mixing.\\ \noindent{\sc 11.16 Definition}. Let $k\geq 2$. We say that a map $T:X\to X$ is $k${\bf-mixing}, or {\bf mixing of multiplicity} $k$ (with respect to $\mu$), if for any measurable subsets $A_1,A_2,\ldots,A_k\subset X$ and $0\leq n_11$, the line $L_1$ is expanded (``stretched out'') by a factor of $\lambda_1$ under $T_A$. On the other hand, $|\lambda_2|<1$, so the other line $L_2$ is compressed (contracted) by a factor of $|\lambda_2|$ under $T_A$ (and it is flipped over, because $\lambda_2<0$). 
Locally, near the fixed point $\bf 0$, the action of $T_A$ is shown in Fig.~12; it looks like a ``saddle''. The orbit of any point near $\bf 0$ lies on a hyperbola (or a pair of hyperbolas). In differential equations such fixed points are referred to as hyperbolic.\\ \begin{figure}[h] \centering \epsfig{figure=ds-12.eps}\caption{A fixed point of a hyperbolic toral automorphism.} \end{figure} \noindent{\sc 12.15 Definition}. A linear toral automorphism $T_A$ is {\bf hyperbolic} if the eigenvalues of $A$ are real numbers different from $\pm 1$. \\ \noindent{\sc 12.16 Exercise}. Show that for any hyperbolic toral automorphism both eigenvalues are irrational, the absolute value of one of them is greater than 1, and that of the other eigenvalue is less than 1. \\ Let $\lambda=\min\{|\lambda_1|,|\lambda_2|\}$. Note that $\lambda^{-1}=\max\{|\lambda_1|,|\lambda_2|\}$. Note also that the inverse matrix $A^{-1}$ has eigenvalues $\lambda_1^{-1}$ and $\lambda_2^{-1}$ and the same eigenvectors as $A$ does. So, $A^{-1}$ contracts $L_1$ by a factor of $\lambda$ and expands $L_2$ by a factor of $\lambda^{-1}$.\\ \noindent{\sc 12.17 Definition}. The line $L_1$ spanned by the eigenvector $v_1$ corresponding to the larger (in absolute value) eigenvalue of $A$ is called the {\bf unstable manifold}. It is expanded (``stretched out'') under $T_A$. The line $L_2$ spanned by the eigenvector $v_2$ corresponding to the other, smaller eigenvalue of $A$ is called the {\bf stable manifold}. Note that both lines extend infinitely long, they wrap around the torus infinitely many times. \\ \noindent{\sc 12.18 Exercise}. Show that for any hyperbolic toral automorphism the lines $L_1$ and $L_2$ are dense on the torus ${\rm Tor}^2$. Hint: verify that the equation of the line $L_i$ for $i=1,2$ is $y=\gamma_i x$ where $\gamma_i=(\lambda_i-a)/b$ is an irrational number by 12.16 (assume that $b\neq 0$ for simplicity).
For any real number $\alpha$ the points $(n\alpha,n\alpha\gamma_i)$ (mod 1) for $n=0,1,2,\ldots$ belong to $L_i$. Now use Theorem~12.5 to show that for some $\alpha\neq 0$ these points make a dense set. (Note: your $\alpha$ should be chosen carefully, so that (c) will be satisfied!) \\ Note: with a little extra effort one can show that for any $\varepsilon>0$ there is a $d>0$ such that every segment of length $d$ on the line $L_1$ intersects every disk of radius $\varepsilon$ on the torus ${\rm Tor}^2$.\\ \noindent{\sc 12.19 Rectangular partitions}. Further analysis of the map $T_A$ involves symbolic dynamics. According to 10.17, one needs to start with a generating partition. Here we partition the torus ${\rm Tor}^2$ into rectangles with sides parallel to the stable and unstable lines. Figure~13(a) shows the partition of the torus into three rectangles for Example~12.14. The sides of the rectangles are made by pieces of the lines $L_1$ and $L_2$. Fig.~13(b) shows the images of those three rectangles under $T_A$, respectively. Note that each rectangle is stretched by $T_A$ in the direction of $L_1$ (the unstable direction) and compressed in the direction of $L_2$ (the stable direction), but it retains its rectangular shape.\\ \begin{figure}[h] \centering \epsfig{figure=ds-13.eps}\caption{A Markov partition of ${\rm Tor}^2$ for Example 12.14.} \end{figure} \noindent{\sc 12.20 Proper intersection}. Denote the rectangles by $R_1$ (white), $R_2$ (light grey) and $R_3$ (dark grey). Let's closely examine the intersections $T_A(R_i)\cap R_j$ for each pair $i,j$. If it is not empty, then it is a subrectangle in $R_j$, which stretches completely across $R_j$ in the unstable direction. Also, it is a subrectangle in $T_A(R_i)$, which stretches completely across $T_A(R_i)$ in the stable direction. In other words, $T_A(R_i)$ intersects $R_j$ {\bf properly} (transversely), as illustrated in Fig.~14.
\\ \begin{figure}[h] \centering \epsfig{figure=ds-14.eps}\caption{A proper intersection (a) of $T_A(R_i)$ (grey) and $R_j$ (white). The figures (b), (c), and (d) illustrate improper intersections.} \end{figure} \noindent{\sc 12.21 Definition}. A partition of ${\rm Tor}^2$ into rectangles $\{R_i\}_{i=1}^r$ with sides parallel to $L_1$ and $L_2$ is called a {\bf Markov partition} if all intersections $T_A(R_i)\cap R_j$, $1\leq i,j\leq r$, with nonempty interior are connected and proper.\\ The partition shown in Fig.~13(a) is then a Markov partition.\\ \noindent{\sc 12.22 Lemma}. Any Markov partition is generating.\\ {\em Proof}. If not, then some distinct points $x\neq y$ have the same itinerary, i.e. $x,y\in\cap_{k=-\infty}^{\infty}T_A^{-k}R_{i_k}$ for some sequence $\{i_k\}_{k=-\infty}^{\infty}$. However, the diameter of the set $\cap_{k=-n}^{n}T_A^{-k}R_{i_k}$ is $O(\lambda^n)$, which converges to zero as $n\to\infty$, a contradiction. $\Box$ Note: it is essential for this proof that the intersections $T_A(R_i)\cap R_j$ are connected; without this assumption Lemma 12.22 may fail. \\ Recall that a generating partition ${\rm Tor}^2=R_1\cup\cdots\cup R_r$ into $r$ disjoint subsets gives rise to a symbolic representation of an automorphism $T_A:{\rm Tor}^2\to {\rm Tor}^2$ by a shift $\sigma:\Omega_r\to\Omega_r$ on a symbolic space with $r$ symbols as defined by 10.15--10.17. By 10.18, every cylinder $C_{m,n}(i_{m},\ldots,i_{n})$ corresponds to the intersection $$ R_{m,n}(i_{m},\ldots,i_{n})= \cap_{k=m}^{n}T_A^{-k}(R_{i_k}) $$ that is, $\phi^{-1}(C_{m,n}(i_{m},\ldots,i_{n})) =R_{m,n}(i_{m},\ldots,i_{n})$. \\ We now study the Lebesgue measure $m$ and the induced measure $\mu$ on $\Omega_r$.\\ For each rectangle $R_i$, denote by $s_i$ and $u_i$ its sides parallel to the stable direction ($L_2$) and the unstable direction ($L_1$), respectively. Then $m(R_i)=s_iu_i$.
Due to the properness of intersections, if $T_AR_i\cap R_j$ has nonempty interior, then $m(T_AR_i\cap R_j)=\lambda s_iu_j$, where $\lambda=\min\{|\lambda_1|,|\lambda_2|\}$. \\ \noindent{\sc 12.23 Lemma}. (a) For any integers $m0$ for all $1\leq i,j\leq r$.\\ {\em Proof}. For large $s>1$, the set $T_A^s(R_i)$ is a very long narrow rectangle, one long side of which lies on the line $L_1$. Then by Exercise 12.18 (and the remark after it), $T_A^s(R_i)$ intersects every rectangle $R_j$ of the Markov partition. Now 12.29 completes the proof. $\Box$ \\ \noindent{\sc 12.32 Theorem (Limit Theorem for Markov Chains)}. If $\Pi^s$ has all positive entries for some $s>0$, then for all $1\leq i,j\leq r$ we have $\pi_{ij}^{(t)}\to p_j$ as $t\to\infty$.\\ {\em Proof}. We outline the argument. Fix $1\leq i,j\leq r$ and let $n,t\geq 1$. Since $\Pi^{n+t}=\Pi^n\Pi^t$, $$ \pi_{ij}^{(n+t)}=\sum_{k=1}^r\pi_{ik}^{(n)}\pi_{kj}^{(t)} $$ Let $\delta_n=\min_k\pi_{ik}^{(n)}\geq 0$. Note that $\max_k\pi_{ik}^{(n)}\leq 1-\delta_n$. Now let $$ m_t=\min_k\pi_{kj}^{(t)} \ \ \ \ \ \ {\rm and}\ \ \ \ \ \ M_t=\max_k\pi_{kj}^{(t)} $$ The following estimate is rather elementary: \be (1-\delta_n)m_t+\delta_nM_t \leq \pi_{ij}^{(n+t)}\leq \delta_nm_t+(1-\delta_n)M_t \label{Doeblin} \ee Hence, $M_{t+n}\leq M_t$ and $m_{t+n}\geq m_t$. Next we show that $M_t-m_t\to 0$ as $t\to\infty$. Let $t=ms+n$ with $0\leq n0$, we have $M_{ms}-m_{ms}1$, the vector $A^n {\bf u}$ grows exponentially fast as $n\to\infty$ and shrinks exponentially fast as $n\to -\infty$. If $|\lambda_j|<1$, then it is vice versa. If $|\lambda_j|=1$, then there is no exponential growth or contraction, but there might be a slow (subexponential) growth or contraction of the vector $A^n {\bf u}$, see an example in 13.5. \\ \noindent{\sc 13.2 Exercise}. A complete proof of the equation (\ref{logalpha}) might be quite lengthy and tedious. But you can at least verify it in the simple case dim$\,E_j=2$. There are two principal subcases here. 
If $\lambda_j$ is a real root of multiplicity 2, then $A$ restricted to $E_j$ is given by a Jordan matrix $J=\left (\begin{array}{cc}\lambda_j & 1 \\ 0 & \lambda_j \end{array}\right )$ in some basis. Verify that $J^n=\left (\begin{array}{cc}\lambda_j^n & n\lambda_j^{n-1} \\ 0 & \lambda_j^n \end{array}\right )$ for all $n\in\ZZ$ and then derive (\ref{logalpha}). If $\lambda_j=a+bi$ is a complex root and $b\neq 0$, then the corresponding Jordan canonical block is $J=\left (\begin{array}{rc}a & b \\ -b & a \end{array}\right )$. Verify that $J^n=|\lambda_j|^n\left (\begin{array}{rc} \cos n\varphi & \sin n\varphi \\ -\sin n\varphi & \cos n\varphi \end{array}\right )$ for some $\varphi\in [0,2\pi)$ and all $n\in\ZZ$, and then derive (\ref{logalpha}). For an extra credit, try to prove (\ref{logalpha}) in any dimension. \\ \noindent{\sc 13.3 Definition}. The numbers $\chi_j=\ln |\lambda_{j}|$ that appear in (\ref{logalpha}) are called the {\bf characteristic exponents} or the {\bf Lyapunov exponents} of the matrix $A$. \\ Note: some distinct eigenvalues $\lambda_i\neq\lambda_j$ may correspond to the same Lyapunov exponent, this happens whenever $|\lambda_i|=|\lambda_j|$. In this case each nonzero vector ${\bf u}\in E_{i} \oplus E_j$ satisfies (\ref{logalpha}). \\ \noindent{\sc 13.4 Proposition (Lyapunov decomposition)}. Every nonsingular real matrix $A$ has distinct Lyapunov exponents $\chi_1>\chi_2>\cdots >\chi_m$ (with $m\leq s$) and there is a decomposition $\IR^{d} = {\cal E}_{1}\oplus\cdots\oplus {\cal E}_{m}$ such that $A({\cal E}_j)={\cal E}_j$ and \be \lim_{n \to\pm\infty } \frac{1}{n} \ln \| A^n {\bf u} \| = \chi_{j} \label{lambda} \ \ \ \ \ \ \forall {\bf u}\in {\cal E}_j, \ \ {\bf u}\neq {\bf 0} \ee The number ${\rm dim}\,{\cal E}_j$ is called the {\bf multiplicity} of the Lyapunov exponent $\chi_j$. We also call ${\cal E}_j$ the {\bf characteristic spaces} for $A$. \\ \noindent{\sc 13.5 Remark}. 
We have $$ \sum_j\chi_j\cdot{\rm dim}\,{\cal E}_j =\ln \left |\, {\rm det}\, A \right | $$ because det$\,A$ equals the product of all the eigenvalues of $A$ (counting multiplicity). \\ \noindent{\sc 13.6 Examples}. For the matrices $A_1= \left (\begin{array}{cc} 0 & 1 \\ 1 & 0 \end{array} \right )$ and $A_2= \left (\begin{array}{cc} 1 & 1 \\ 0 & 1 \end{array} \right )$ (these are taken from Examples 12.11 and 12.12, respectively) all Lyapunov exponents are zero. The same is true for the matrix $A_3= \left (\begin{array}{rr} 0 & 1 \\ -1& 0 \end{array} \right )$, whose eigenvalues are $\pm i$. Note that $A_2^n$ does expand and contract vectors, but very slowly (at most linearly in $n$). On the other hand, for any matrix $A$ defining a hyperbolic toral automorphism $T_A$ (see 12.15) one Lyapunov exponent is positive, $\chi_1=\ln\lambda^{-1}>0$, and the other is negative, $\chi_2=\ln\lambda<0$. Note that $\chi_1+\chi_2=0$, because det$\,A=\pm 1$.\\ \noindent{\sc 13.7 Definition}. A matrix $A$ and the corresponding linear map $A:\IR^d\to\IR^d$ are called {\bf hyperbolic} if none of the eigenvalues of $A$ (real or complex) lie on the unit circle $|z|=1$.\\ Equivalently, we have the following principle:\\ \begin{center} \begin{tabular}{||c||} \hline\hline \\ $\ \ $ $A$ is hyperbolic iff all the Lyapunov exponents of $A$ are different from zero$\ \ $\\ \\ \hline \hline \end{tabular}\vspace*{0.2cm} \end{center} \noindent{\sc 13.8 Definition}. The $A$-invariant subspaces $$ {\cal E}^s=\oplus_{\chi_j<0}{\cal E}_j, \ \ \ \ \ \ {\cal E}^u=\oplus_{\chi_j>0}{\cal E}_j \ \ \ \ {\rm and}\ \ \ \ {\cal E}^c={\cal E}_j|_{\chi_j=0} $$ are called {\bf stable}, {\bf unstable}, and {\bf neutral} (or {\bf central}) subspaces, respectively. Note that $$ \IR^d={\cal E}^s\oplus {\cal E}^u \oplus {\cal E}^c $$ If the matrix $A$ is hyperbolic, then ${\cal E}^c=\{{\bf 0}\}$, hence ${\cal E}^c$ can be omitted from the above decomposition.
\\ The equation (\ref{logalpha}) easily implies $$ \limsup_{n\to\pm\infty}\frac{1}{n} \ln \| A^n {\bf u} \| <0 \ \ \ \ \ \ \forall {\bf u}\in {\cal E}^s,\ \ {\bf u}\neq {\bf 0} $$ and $$ \liminf_{n\to\pm\infty}\frac{1}{n} \ln \| A^n {\bf u} \| >0 \ \ \ \ \ \ \forall {\bf u}\in {\cal E}^u,\ \ {\bf u}\neq {\bf 0} $$ Let the matrix $A$ have at least one nonzero Lyapunov exponent $\chi_i\neq 0$. Denote $\chi=\min\{|\chi_i|:\, \chi_i\neq 0\}$ and $\lambda=e^{-\chi}$. Note that $\chi>0$ and $\lambda<1$.\\ \noindent{\sc 13.9 Proposition}. For any $\varepsilon>0$ there is a $K>0$ such that for all $n\geq 0$ $$ \| A^n {\bf u} \| \leq K(\lambda+\varepsilon)^n \|{\bf u}\| \ \ \ {\rm and}\ \ \ \| A^{-n} {\bf u} \| \geq K^{-1}(\lambda+\varepsilon)^{-n} \|{\bf u}\| \ \ \ \ \forall {\bf u}\in {\cal E}^s $$ and $$ \| A^n {\bf u} \| \geq K^{-1}(\lambda+\varepsilon)^{-n} \|{\bf u}\| \ \ \ {\rm and}\ \ \ \| A^{-n} {\bf u} \| \leq K(\lambda+\varepsilon)^{n} \|{\bf u}\| \ \ \ \ \forall {\bf u}\in {\cal E}^u $$ {\em Proof}. It is enough to prove the above bounds for unit vectors only. For every unit vector $\bf u$ there is a $K= K(\varepsilon,{\bf u})$ such that all these bounds hold, but $K$ may depend on $\bf u$. Then we pick an orthonormal basis $e_1,\ldots,e_k$ in ${\cal E}^s$ (resp., ${\cal E}^u$), and ensure the above bounds with the same constant $K(\varepsilon)$ for all the vectors $e_1,\ldots,e_k$. Then we use the triangle inequality to derive the proposition for all unit vectors $\bf u$ in ${\cal E}^s$ and ${\cal E}^u$. $\Box$ \\ Thus, vectors ${\bf u}\in{\cal E}^u$ grow exponentially fast under $A^n$ as $n\to\infty$ and shrink exponentially fast as $n\to -\infty$. For vectors ${\bf u}\in{\cal E}^s$, it is exactly the opposite. Now what happens to other vectors ${\bf u}\in\IR^d$?\\ \noindent{\sc 13.10 Corollary}.
For any vector ${\bf u}\notin{\cal E}^u\cup {\cal E}^s$ and any $\varepsilon>0$ there is a $K>0$ such that for all $n\in\ZZ$ $$ \| A^n {\bf u} \| \geq K(\lambda+\varepsilon)^{-|n|} \|{\bf u}\| $$ that is, the vector $\bf u$ grows under $A^n$ exponentially fast in both time directions: as $n\to +\infty$ and as $n\to -\infty$.\\ Next, we extend the above results to nonlinear maps. \\ \noindent{\sc 13.11 Definition}. Let $U\subset\IR^d$ be an open set and $T:U\to\IR^d$ a smooth one-to-one map with a fixed point $x$, i.e.\ $T(x)=x$. Then the matrix $A=D_xT$ acts on tangent vectors ${\bf u}\in{\cal T}_x \IR^d$, and the tangent space ${\cal T}_x \IR^d$ can be naturally identified with $\IR^d$. Note that $D_xT^n=(D_xT)^n=A^n$ by the chain rule. Assume that det$\,A\neq 0$. The Lyapunov exponents of the matrix $A$ are called the {\bf Lyapunov exponents} of the map $T$ at the point $x$. The corresponding subspaces ${\cal E}^s,{\cal E}^u, {\cal E}^c\subset {\cal T}_x\IR^d$ are called the {\bf stable}, {\bf unstable}, and {\bf neutral} (or {\bf central}) subspaces, respectively, for the map $T$ at the point $x$. \\ Note: the subspaces ${\cal E}^s,{\cal E}^u,{\cal E}^c$ are invariant under $D_xT$ but not necessarily under the map $T$ itself. On the other hand, $D_xT$ is a linear approximation to the map $T$ at the point $x$. This allows us to obtain the following theorem, whose proof we omit. \\ \noindent{\sc 13.12 Theorem (Hadamard-Perron)}. There exist two submanifolds $W^s\subset U$ and $W^u\subset U$ such that \begin{itemize} \item[(a)] $W^s\cap W^u=\{x\}$; \item[(b)] the spaces ${\cal E}^s$ and ${\cal E}^u$ are tangent to $W^s$ and $W^u$, respectively, at the point $x$; \item[(c)] $T(W^s)\subset W^s$ and $T^{-1}(W^u) \subset W^u$; \item[(d)] $T^n(y)\to x$ for every $y\in W^s$ and $T^{-n}(y)\to x$ for every $y\in W^u$, as $n\to\infty$. 
\end{itemize} We omit the proof, but remark that the manifold $W^u$ is constructed as a limit of $(T^n{\cal E}^u)\cap V(x)$, as $n\to\infty$, where $V(x)$ is a sufficiently small neighborhood of $x$. The existence of this limit is proved by the contraction mapping principle. Similarly, $W^s$ is constructed as a limit of $(T^{-n}{\cal E}^s)\cap V(x)$, as $n\to\infty$. \\ Since $A=D_xT$ is a linear part of the map $T$ at $x$, it is easy to obtain the following corollary to 13.9 and 13.12: \\ \noindent{\sc 13.13 Corollary}. For any $\varepsilon>0$ there is a neighborhood $V(x)$ of the point $x$ and a $K>0$ such that for all $n\geq 0$ $$ {\rm dist}(T^ny,x) \leq K(\lambda+\varepsilon)^n \cdot{\rm dist}(y,x) \ \ \ \ \forall y \in W^s\cap V(x) $$ and $$ {\rm dist}(T^{-n}y,x) \leq K(\lambda+\varepsilon)^n \cdot{\rm dist}(y,x) \ \ \ \ \forall y \in W^u\cap V(x) $$ Locally, near the point $x$, the map $T$ acts as shown on Fig.~15. \\ \begin{figure}[h] \centering \epsfig{figure=ds-15.eps}\caption{The action of $T$ near a hyperbolic fixed point $x$.} \end{figure} \noindent{\sc 13.14 Definition}. $W^s$ and $W^u$ are called the {\bf stable} and {\bf unstable manifolds}, respectively, for the map $T$ at the point $x$. The map $T$ is called {\bf hyperbolic} at a fixed point $x$ (and then $x$ is called a {\bf hyperbolic fixed point} for $T$) if dim$\,{\cal E}^c=0$. In this case dim$\, W^s+\,$dim$\, W^u=d$. \\ \noindent \noindent{\sc 13.15 Definition}. A hyperbolic point $x$ is called a {\bf source} (a {\bf repeller}) if dim$\,{\cal E}^s=0$ (hence ${\cal E}^u$ coincides with ${\cal T}_x\IR^d$). It is called a {\bf sink} (an {\bf attractor}) if dim$\,{\cal E}^u=0$ (hence ${\cal E}^s$ coincides with ${\cal T}_x\IR^d$). It is called a {\bf saddle} (a truly hyperbolic point) if both $\,{\cal E}^s$ and $\,{\cal E}^u$ are not trivial. \\ \noindent{\sc 13.16 Remark}. Let $x$ be a saddle point and $y\neq x$ another point very close to $x$. 
If $y\in W^u$, then the trajectory $T^ny$ moves away from $x$ exponentially fast for $n>0$, at least until $T^ny$ leaves a certain neighborhood of $x$. If $y\in W^s$, then the trajectory $T^ny$ moves away from $x$ exponentially fast for $n<0$. Now, if $y\notin W^u\cup W^s$, then the trajectory $T^ny$ moves away from $x$ exponentially fast for both $n>0$ and $n<0$. This fact is known as the {\bf separation principle}: nearby trajectories tend to separate exponentially fast, either in the future or in the past or (in most cases) both.\\ All the above definitions and results extend to any diffeomorphism $T:U\to T(U)\subset M$ on an open subset $U\subset M$ of a Riemannian manifold $M$, rather than $U\subset \IR^d$. A Riemannian structure in $M$ is necessary for the norm $\|\cdot\|$ to be well defined on $M$. Henceforth we assume that $T$ is defined on an open subset of a Riemannian manifold $M$. \\ \noindent{\sc 13.17 Remark}. All the above definitions and results easily apply to a periodic point $x\in U$ rather than a fixed point. If $T^p(x)=x$, we can just consider $T^p$ instead of $T$.\\ \noindent Note: $D_xT^n=D_{T^{n-1}x}T\cdots D_{Tx}T\cdot D_{x}T$ by the chain rule. \\ Next, we turn to nonperiodic points. This is the most interesting and important part of the story.\\ \noindent{\sc 13.18 Definition}. Let the map $T^n$ be differentiable at a point $x\in M$ for all $n\in\ZZ$. Assume that there are numbers $\chi_1>\cdots >\chi_m$ and the tangent space ${\cal T}_xM$ is a direct sum of subspaces ${\cal E}_{1}\oplus\cdots\oplus {\cal E}_{m}$ such that if ${\bf 0} \neq {\bf u} \in {\cal E}_{i},$ then \be \lim_{n \to\pm\infty } \frac{1}{n} \ln \| (D_xT^{n})\, {\bf u} \| = \chi_{i} \label{chi1} \ee Then the values $ \chi_{i} $ are called the {\bf Lyapunov exponents} of the map $T$ at the point $x$. The number dim$\,{\cal E}_i$ is called the {\bf multiplicity} of the Lyapunov exponent $\chi_i$. The spaces ${\cal E}_j$ are called {\bf characteristic subspaces} at $x$. 
The subspaces $$ {\cal E}^s=\oplus_{\chi_i<0}{\cal E}_i, \ \ \ \ \ \ {\cal E}^u=\oplus_{\chi_i>0}{\cal E}_i \ \ \ \ {\rm and}\ \ \ \ {\cal E}^c={\cal E}_j|_{\chi_j=0} $$ are called {\bf stable}, {\bf unstable}, and {\bf neutral} (or {\bf central}) subspaces of ${\cal T}_x M$, respectively. \\ We note that the existence of the Lyapunov exponents $\chi_i$ and the subspaces ${\cal E}_i$ is not guaranteed for any point $x\in U$, as an example will show soon. We say that a point $x$ {\bf has all Lyapunov exponents} if $\chi_i$ and ${\cal E}_i$ exist. \\ \noindent{\sc 13.19 Remark}. If a point $x\in U$ has all Lyapunov exponents, then so do points $T^n(x)$ for all $n\in \ZZ$. Moreover, the points $T^n(x)$ have the same Lyapunov exponents (with the same multiplicity) as $x$ does, and the characteristic subspaces are invariant along the trajectory of $x$: $$ (D_xT^n)({\cal E}_i(x))={\cal E}_i(T^nx) $$ for all $n\in \ZZ$ and each $i$.\\ In particular, observe that the Lyapunov exponents $\chi_i$ and their multiplicities dim$\,{\cal E}_i$ are $T$-invariant functions.\\ \noindent{\sc 13.20 Example}. Let $T_A:{\rm Tor}^2\to {\rm Tor}^2$ be a hyperbolic toral automorphism. Then all Lyapunov exponents exist everywhere on ${\rm Tor}^2$, and they are $\chi_1=\ln \lambda^{-1}>0$ and $\chi_2=\ln \lambda<0$. The corresponding subspaces ${\cal E}_1$ and ${\cal E}_2$ are parallel to the lines $L_1$ and $L_2$, respectively.\\ \noindent{\sc 13.21 Example}. Let $T:X\to X$ be the baker's transformation of the unit square $X$. Let $X'\subset X$ be the set of points where $T^n$ is differentiable for all $n\in \ZZ$. Then for every $x\in X'$ all Lyapunov exponents exist, and they are $\chi_1=\ln 2>0$ and $\chi_2=-\ln 2<0$. The corresponding subspaces ${\cal E}_1$ and ${\cal E}_2$ are parallel to the $x$ axis and $y$ axis, respectively. Note that $T^{\pm n}$ fails to be differentiable on the lines $x=k/2^n$ and $y=m/2^n$, with $k,m=0,1,\ldots,2^n$. Hence $m(X')=1$, i.e. 
all Lyapunov exponents exist almost everywhere. \\ \begin{figure}[h] \centering \epsfig{figure=ds-16.eps}\caption{The map in Example 13.22.} \end{figure} \noindent{\sc 13.22 Example}. Let $T: X \to X$ be a diffeomorphism of the unit circle $X=S^1$ given by $T(x) = x + \frac{1}{3 \pi } \sin 2 \pi x,$ where $0\leq x<1$ is the cyclic coordinate on $X$. We have two fixed points here, $x_0=0$ and $x_1=1/2$. Lyapunov exponents exist at both fixed points: $\chi(x_0)=\ln |T'(x_0)|=\ln (5/3)>0$ and $\chi(x_1)=\ln |T'(x_1)|=\ln (1/3)<0$. Since $\chi(x_0)>0$, the point $x_0$ is unstable (a {\bf repeller}). Likewise, $x_1$ is a stable point (an {\bf attractor}). For any point $x\in (0,1/2)$ we have $T^n(x)\to x_1$ and $T^{-n}(x)\to x_0$ as $n\to\infty$. Hence, by the chain rule, for any nonzero tangent vector ${\bf u}\in {\cal T}_x(X)$ we have $$ \lim_{n \to -\infty } \frac{1}{n} \ln \| (T^{n})'_{x} {\bf u} \| = \ln (5/3) \neq \ln (1/3) = \lim_{n \to \infty} \frac{1}{n} \ln \| (T^{n})'_{x} {\bf u} \| $$ This shows that the limit in (\ref{chi1}) does not exist. The same conclusion holds for any $x\in (1/2,1)$.\\ \noindent{\sc 13.23 Remark}. In the last example, Lyapunov exponents only exist at two fixed points, $x_0$ and $x_1$, and nowhere else on $X$. Hence, Lyapunov exponents seem to be very scarce on $X$ in any ``reasonable'' sense: topologically, with respect to the usual Lebesgue measure, by simple count. But none of these considerations are relevant in the dynamical sense, where only invariant measures count. The Lebesgue measure is not invariant under $T$, so it does not characterize this map. It is not hard to see that any invariant measure in this example is $p\delta_{x_0}+ (1-p)\delta_{x_1}$ with some $0\leq p\leq 1$, compare this to Exercise 2.26(e).
Hence, with respect to any invariant measure, Lyapunov exponents do exist almost everywhere!\\ It is remarkable that the above fact is very general, and this is the content of the Oseledec\footnote{The name is pronounced {\it Oseledets}.} multiplicative ergodic theorem, which we call shortly Oseledec's theorem.\\ \noindent{\sc 13.24 Theorem (Oseledec)}. Assume that $M$ is a compact manifold and $T: M\to M$ is a $C^1$ diffeomorphism preserving a Borel probability measure $\mu$. Then there exists a $T$-invariant set $N\subset M$, $\mu (N)=1$, such that for every point $x \in N$ all Lyapunov exponents exist. \\ First of all, it is enough to prove the theorem for ergodic measures $\mu$, because then we can apply Ergodic Decomposition Theorem 7.15 to any nonergodic invariant measure (see Remark 7.21). So, we assume from now on that $\mu$ is ergodic.\\ We will only prove this theorem for surfaces, i.e.\ for dim$\, M=2$. The proof will be done in two major steps. The first one consists in proving the following theorem. \\ \noindent{\sc 13.25 Theorem (Upper Lyapunov Exponent)}. Under the above assumptions, there is a $\chi_+\in\IR$ such that $$ \lim_{n\to +\infty}\frac{1}{n}\ln\|D_xT^n\|=\chi_+ $$ for almost every point $x\in M$. Here $\|D_xT^n\|= \sup_{\|{\bf u}\|=1}\|D_xT^n{\bf u}\|$ is the norm of the matrix $D_xT^n$. \\ {\em Proof}. By the chain rule, $$ \|D_xT^{n+m}\| \leq \|D_xT^n\|\, \|D_{T^nx}T^m\| $$ Let $F_n(x)=\ln \|D_xT^n\|$, then \be F_{n+m}(x)\leq F_n(x)+F_m(T^nx) \label{subadd} \ee This condition is referred to as the {\bf subadditivity} of the sequence of functions $\{F_n\}$. Now 13.25 follows from the next general statement:\\ \noindent{\sc 13.26 Theorem (Subadditive Ergodic Theorem)}. Let $T:X\to X$ be a transformation preserving an ergodic measure $\mu$, and $\{F_n\}\in L^1_{\mu}(X)$, $n\geq 1$, a sequence of integrable functions on $X$ such that (\ref{subadd}) holds for almost every $x\in X$ and all $n,m\geq 1$. 
Then there is a $\chi\in\IR\cup\{-\infty\}$ such that $$ \lim_{n\to +\infty}\frac{1}{n}F_n(x)=\chi $$ for almost every point $x\in X$.\\ \noindent{\sc 13.27 Remarks}. In Theorem 13.25, $F_1(x)=\ln \|D_xT\|$ is a continuous function on $M$, because $T$ is $C^1$. Hence there is an upper bound \be F_{\max}:=\sup_{x\in M} F_1(x) < \infty \label{Fbound} \ee By iterating the subadditivity condition (\ref{subadd}) we obtain for all $x\in M$ \be \frac 1n F_n(x)\leq \frac 1n \sum_{i=0}^{n-1} F_1(T^ix) \leq F_{\max} \label{Fbound1} \ee Also, by the chain rule $(D_{T^nx}T^{-n})(D_xT^n)=I$ (the identity matrix), hence \be 1=\|I\| \leq \| D_xT^n\| \, \|D_{T^nx}T^{-n}\| \label{chain} \ee therefore \be \frac 1n F_n(x)=\frac 1n \ln \| D_xT^n\| \geq -\frac 1n \ln \|D_{T^nx}T^{-n}\| \geq - \ln \max_{x\in M} \|D_xT^{-1}\| =: F_{\min} > -\infty \label{Fbound2} \ee As a result, $\chi_+$ in 13.25 is finite and $\chi_+\in [ F_{\min}, F_{\max}]$. \\ The proof of Subadditive Ergodic Theorem is given in Pollicott, pp.\ 37--40. We sketch the principal steps here, assuming the bounds (\ref{Fbound}) and (\ref{Fbound2}) for simplicity. \\ \noindent{\sc 13.28 Lemma (on subadditive sequences)}. If $\{a_n\}$, $n\geq 1$, is a sequence of real numbers and $a_{n+m}\leq a_n+a_m$ for all $n,m\geq 1$, then $a_n/n$ converges as $n\to\infty$ and $$ \lim_{n\to\infty} a_n/n = \inf_{n\geq 1} \{a_n/n\} $$ The proof of Lemma 13.28 is elementary and we leave it as an exercise. We now continue the proof of Subadditive Ergodic Theorem 13.26. The subadditivity condition (\ref{subadd}) and the invariance of $\mu$ imply $\int F_{n+m}\, d\mu \leq \int F_n\, d\mu +\int F_m\, d\mu$. 
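Hence, by 13.28, there is a limit $$ \lim_{n\to\infty} \frac 1n \int_X F_n\, d\mu = \chi:= \inf\left\{\frac 1n \int_X F_n\, d\mu\right \} $$ Next, let $$ F_+(x)=\limsup_{n\to\infty} \frac 1n F_n(x) \ \ \ \ {\rm and}\ \ \ \ F_-(x)=\liminf_{n\to\infty} \frac 1n F_n(x) $$ By subadditivity, $$ \frac 1n F_n(x)\leq \frac 1n F_1(x) + \frac 1n F_{n-1}(Tx) $$ Letting $n\to\infty$ gives $F_+(x) \leq F_+(Tx)$ as well as $F_-(x) \leq F_-(Tx)$.\\ \noindent{\sc 13.29 Lemma}. If an integrable function $F:X\to \IR$ has the property $F(x)\leq F(Tx)$ for almost every $x\in X$, then $F(x)=F(Tx)$ almost everywhere, i.e. $F$ is $T$-invariant.\\ {\em Proof}. Let $\Delta(x)=F(Tx)-F(x)$. Then $\Delta(x)\geq 0$ a.e.\ and $\int\Delta\, d\mu=0$, hence $\Delta(x)=0$ a.e.\\ Thus, the functions $F_-(x)$ and $F_+(x)$ are $T$-invariant and therefore constant almost everywhere (by the ergodicity of $\mu$). Denote their values by $\bar{F}_-$ and $\bar{F}_+$, respectively. We now prove that $\bar{F}_-=\bar{F}_+$. Fix a small $\varepsilon>0$ and define $$ n_-(x)=\min \left \{ n\geq 1: \frac 1n F_n(x) <\bar{F}_-+\varepsilon \right \} $$ Choose $n_{\varepsilon}\geq 1$ so large that $\mu\{x:\, n_-(x)>n_{\varepsilon}\}<\varepsilon$. Denote $A_{\varepsilon} =\{x:\, n_-(x)>n_{\varepsilon}\}$. Define $$ n(x)=\left \{ \begin{array}{cc} n_-(x) & {\rm if}\ \ n_-(x)\leq n_{\varepsilon} \\ 1 & {\rm otherwise} \end{array} \right . $$ Fix a point $x\in X$. For $N\geq 1$ let $$ k_N=\#\{0\leq i\leq N-1:\ T^ix\in A_{\varepsilon}\} $$ Define a sequence $0=n_0<n_1<n_2<\cdots$ inductively by $n_{j+1}=n_j+n(T^{n_j}x)$, and let $r=r(N)$ be the largest index such that $n_r\leq N$; note that $N-n_r<n_{\varepsilon}$. By iterating the subadditivity condition (\ref{subadd}) we get $$ F_N(x)\leq \sum_{j=0}^{r-1} F_{n_{j+1}-n_j}(T^{n_j}x) + F_{N-n_r}(T^{n_r}x) $$ (the last term is omitted if $n_r=N$). If $T^{n_j}x\notin A_{\varepsilon}$, then $F_{n_{j+1}-n_j}(T^{n_j}x) < (n_{j+1}-n_j)(\bar{F}_-+\varepsilon)$ by the definition of $n_-$. If $T^{n_j}x\in A_{\varepsilon}$, then $n_{j+1}-n_j=1$ and $F_1(T^{n_j}x)\leq F_{\max}$; there are at most $k_N$ such indices $j<r$. Lastly, $F_{N-n_r}(T^{n_r}x)\leq (N-n_r)F_{\max}$ by (\ref{Fbound1}). Combining these estimates gives $$ F_N(x)\leq N(\bar{F}_-+\varepsilon) + (k_N+n_{\varepsilon})\, C $$ where $C=|F_{\max}|+|\bar{F}_-|+\varepsilon$. By Ergodic Theorem 9.2, $k_N/N\to\mu(A_{\varepsilon})<\varepsilon$ as $N\to\infty$ for almost every $x$. Dividing by $N$ and letting $N\to\infty$ we obtain $$ \bar{F}_+\leq \bar{F}_-+\varepsilon+\varepsilon C $$ Since $\varepsilon>0$ is arbitrary, $\bar{F}_+=\bar{F}_-$. Thus, for a.e.\ $x\in M$ $$ \lim_{n\to\infty}\frac 1n F_n(x)=\bar{F}_+=\bar{F}_- $$ Finally, by (\ref{Fbound1}) and (\ref{Fbound2}), we have $F_{\min} \leq \frac 1n F_n(x) \leq F_{\max}$, and then by Dominated Convergence Theorem $\bar{F}_+=\bar{F}_- = \chi$. This proves Theorem~13.26, and thus 13.25. $\Box$ \\ This completes the first major step in the proof of 13.24. 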
We turn to the second one starting with some extensions of Theorem 13.25.\\ \noindent{\sc 13.30 Corollary (Upper Lyapunov Exponents with Shift)}. \begin{itemize} \item[(a)] We have $$ \lim_{n\to +\infty} \frac 1n \ln \| D_{T^{-n}x}T^n \| = \chi_+ $$ for almost every point $x\in M$. \item[(b)] There is a $\chi_-\in\IR$ such that $$ \lim_{n\to +\infty} \frac 1n \ln \| D_{x}T^{-n} \| = \lim_{n\to +\infty} \frac 1n \ln \| D_{T^{n}x}T^{-n} \| = \chi_- $$ for almost every $x\in M$. \item[(c)] Lastly, $\chi_+ + \chi_- \geq 0$. \end{itemize} {\em Proof}. Consider the function $G_n(x)=\ln\|D_{T^{-n}x}T^n\|$. By the chain rule $G_{n+m}(x)\leq G_n(x)+G_m(T^{-n}x)$, so that $\{G_n(x)\}$ is a subadditive sequence of functions in the sense of (\ref{subadd}), but now for the map $T^{-1}$, which preserves the same measure $\mu$. By Theorem 13.26 there is a $\chi'$ such that $\lim_{n\to\infty}\frac 1n G_n(x)=\chi'$ almost everywhere. Using the notation of the proof of 13.25 we observe that $G_n(x)=F_n(T^{-n}x)$, hence $\int F_n\, d\mu=\int G_n\, d\mu$ for every $n\geq 1$. Therefore, $$ \chi_+=\inf\left\{\frac 1n \int_X F_n\, d\mu\right \} =\inf\left\{\frac 1n \int_X G_n\, d\mu\right \}=\chi' $$ so the claim (a) is proved. To prove (b), we just apply 13.25 and 13.30(a) to the map $T^{-1}$. Lastly, according to the chain rule, cf.\ (\ref{chain}), $$ \frac 1n \ln \| D_{x}T^{n} \| + \frac 1n \ln \| D_{T^nx}T^{-n} \| \geq \frac 1n \ln \| D_{x}T^{0} \| = 0 $$ Taking the limit as $n\to +\infty$ yields (c). $\Box$\\ \noindent{\sc 13.31 Corollary}. For almost every $x\in M$ there is a sequence $n_k\to\infty$ as $k\to\infty$ such that $$ \lim_{k\to\infty} \frac{1}{2n_k} \ln \| D_{T^{-n_k}x} T^{2n_k} \| = \chi_+ $$ A similar statement also holds for $\chi_-$.\\ {\em Proof}. 
By Theorem 13.25, for every $k\geq 1$ and almost every $y\in M$, there is an $m_k(y)$ such that for all $m\geq m_k(y)$ we have $$ \left | \frac 1m \ln \|D_yT^m\| - \chi_+ \right | < \frac 1k $$ Clearly, there is an $M_k\geq 1$ such that $\mu\{y:\, m_k(y)<M_k\}>0$. Fix any such $M_k$ and let $A_k=\{y:\, m_k(y)<M_k\}$. By the ergodicity of $\mu$, for almost every $x\in M$ there is an $n_k>M_k$, $n_k\geq k$, such that $T^{-n_k}x\in A_k$. This implies $$ \left | \frac{1}{2n_k} \ln \|D_{T^{-n_k}x}T^{2n_k}\| - \chi_+ \right | < \frac 1k $$ This proves Corollary 13.31. $\Box$\\ We now outline the proof of Oseledec's theorem 13.24 referring to Pollicott, pp.\ 31--36, for some more details.\\ \noindent{\sc 13.32 Lemma (from Linear Algebra)}. If $B$ is a $d\times d$ matrix, then there exists a symmetric positive semidefinite matrix $A=A^T$ such that \begin{itemize} \item[(a)] $A^2=B^TB$; \item[(b)] $\|A{\bf u}\|=\|B{\bf u}\|$ and $\la A{\bf u},A{\bf v}\ra = \la B{\bf u},B{\bf v}\ra$ for all vectors ${\bf u},{\bf v}\in \IR^d$. \item[(c)] There are orthogonal unit vectors $e_1,\ldots,e_d$ such that $Be_1,\ldots,Be_d$ are also orthogonal vectors. \end{itemize} If $B$ is not singular, then $A$ is positive definite.\\ {\em Proof}. Since $B^TB$ is a symmetric positive semidefinite matrix, then it has real eigenvalues $\beta_i\geq 0$ and corresponding orthogonal unit eigenvectors $e_1,\ldots,e_d$. Therefore, $B^TB=QDQ^T$, where $D=\,{\rm diag} (\beta_1, \ldots, \beta_d)$ and $Q$ is an orthogonal matrix (whose columns are the vectors $e_1,\ldots,e_d$). Let $D^{1/2}=\,{\rm diag} (\beta_1^{1/2}, \ldots, \beta_d^{1/2})$, then $$ B^TB=QD^{1/2}D^{1/2}Q^T=QD^{1/2}Q^TQD^{1/2}Q^T $$ and we can set $A=QD^{1/2}Q^T$. The claim (a) and the fact $A^T=A$ are obvious. To prove (b), just note that $\la B{\bf u},B{\bf v}\ra= \la B^TB{\bf u},{\bf v}\ra= \la A^2{\bf u},{\bf v}\ra= \la A{\bf u},A{\bf v}\ra$. Now, observe that $\{e_1,\ldots,e_d\}$ are eigenvectors of $A$ with corresponding eigenvalues $\lambda_1=\beta_1^{1/2}, \ldots, \lambda_d=\beta_d^{1/2}$. 
Hence, by (b) we have $\la Be_i,B e_j\ra = \la Ae_i,Ae_j\ra = \lambda_i\lambda_j\la e_i,e_j \ra$. If $i\neq j$, then $Be_i$ and $Be_j$ must be orthogonal\footnote{In linear algebra, the vectors $e_1,\ldots,e_d$ are called the {\bf singular vectors} and $\lambda_1,\ldots,\lambda_d$ the {\bf singular values} of the matrix $B$. This is the content of Singular Value Decomposition (SVD) theorem. It is easy to see that if $B$ is invertible, then the singular values of $B^{-1}$ are $\lambda_1^{-1},\ldots,\lambda_d^{-1}$ and its singular vectors are the normalized vectors $Be_1,\ldots,Be_d$.}, which proves (c). Setting $i=j$, we obtain another useful identity: $\|Be_i\|=\lambda_i$. $\Box$ \\ Now, fix a point $x\in M$. For each $n\in \ZZ$ denote $B_n(x)=D_xT^n$ and let $A_n(x)$ be the corresponding symmetric positive definite matrix defined in 13.32. Let $\lambda_1^{(n)}(x)\geq\lambda_2^{(n)}(x)$ be the eigenvalues of $A_n(x)$ and $E_1^{(n)}(x)$, $E_2^{(n)}(x)$ the corresponding eigenspaces spanned by unit vectors $e_1^{(n)}(x)$ and $e_2^{(n)}(x)$, respectively. Note that $\lambda_i^{(n)}(x)>0$ and $E_1^{(n)}(x)$ is orthogonal to $E_2^{(n)}(x)$, i.e.\ $e_1^{(n)}\perp e_2^{(n)}$. We can choose $e_j^{(0)}\in E_j^{(0)}$ arbitrarily and then adjust the orientation of the other vectors $e_j^{(n)}$, $n\neq 0$, so that $\la e_j^{(n)},e_j^{(n-1)}\ra\geq 0$ for all $n\in\ZZ$ and for each $j=1,2$. Observe that $\|B_n(x)\|=\|A_n(x)\| =\lambda_1^{(n)}(x)$. By 13.25, for almost every $x\in M$ there exists a limit $$ \chi_1:=\lim_{n\to \infty}\frac 1n \ln\|B_n(x)\| =\lim_{n\to \infty}\frac 1n \ln \lambda_1^{(n)}(x) $$ Also, by Corollary 13.30(b), there exists a limit $$ \chi_2:=-\lim_{n\to \infty}\frac 1n \ln\|B_{-n}(x)\| =-\lim_{n\to \infty}\frac 1n \ln \lambda_1^{(-n)}(x) $$ Note that $\chi_1=\chi_+$ and $\chi_2=-\chi_-$, hence by 13.30(c) we have $\chi_1\geq \chi_2$. These are the characteristic Lyapunov exponents in the statement of Oseledec's theorem 13.24. 
If $\chi_1>\chi_2$, then we need also to construct the characteristic subspaces ${\cal E}_1$ and ${\cal E}_2$, which we do next (assuming $\chi_1>\chi_2$).\\ \noindent{\bf Claim 1}. The following limits exist: \be e_j^{\pm} = \lim_{n\to \infty} e_j^{(\pm n)} \ \ \ \ \ \ \ {\rm for}\ \ j=1,2 \label{e1e2} \ee This follows from certain elementary geometric estimates, we refer to Pollicott, pp.\ 33-34. It is shown there that the convergence in (\ref{e1e2}) is exponentially fast, i.e. \be \|e_j^{(\pm n)}-e_j^{\pm}\|\leq Ce^{-\delta n} \label{ee} \ee for some constants $C,\delta>0$ and all $n\geq 0$, $j=1,2$. Now we define $$ {\cal E}_1= \,{\rm span} (e_2^{-}) \ \ \ \ \ \ {\rm and}\ \ \ \ \ \ {\cal E}_2= \,{\rm span} (e_2^{+}) $$ We note that while the Lyapunov exponents $\chi_j$, $j=1,2$, are defined by the dominant eigenvalues $\lambda_1^{(\pm n)}$, the characteristic subspaces ${\cal E}_j$ are spanned by the limit eigenvectors $\lim e_2^{(\pm n)}$ corresponding to the other (smaller) eigenvalues $\lambda_2^{(\pm n)}$, which is an interesting fact. \\ Fig.~17 illustrates our construction. There we assumed that $\chi_1>0$ and $\chi_2<0$, so that we would have exponential growth and contraction of vectors. \\ \begin{figure}[h] \centering \epsfig{figure=ds-19.eps}\caption{Two orthonormal bases at a point $x$, one for $D_xT^n$ and the other for $D_xT^{-n}$. The contracting vectors $e_2^{(\pm n)}$ are shown bold -- they give rise to ${\cal E}_1$ and ${\cal E}_2$.} \end{figure} \noindent {\bf Claim 2}. The vector $e_1^+$ grows under $D_xT^n$ exponentially with the rate $\chi_1$: $$ \lim_{n\to\infty} \frac 1n \ln \|D_xT^n(e_1^+)\|=\chi_1 $$ Similarly, $\lim_{n\to\infty} \frac 1n \ln \|D_xT^{-n}(e_1^-)\|= -\chi_2$. This follows from certain elementary geometric estimates, we refer to Pollicott, p.\ 35. \\ \noindent {\bf Claim 3}. The families of subspaces ${\cal E}_1(x)$ and ${\cal E}_2(x)$ are $D_xT$-invariant. 
Equivalently, $D_xT (e_2^{\pm}(x)) = \beta^{\pm}(x)\, e_2^{\pm}(Tx)$ for some scalar functions $\beta^{\pm}(x)$. Indeed, consider an arbitrary nonzero vector ${\bf u}$ and let ${\bf u}=c_1^+e_1^++c_2^+e_2^+$ be its decomposition in the basis $\{e_1^+,e_2^+\}$, and ${\bf u}=c_1^{(n)}e_1^{(n)}+ c_2^{(n)}e_2^{(n)}$ be its decomposition in the basis $\{e_1^{(n)},e_2^{(n)}\}$ for all $n\geq 0$. Of course, $c_j^{(n)}\to c_j^+$ as $n\to\infty$, for $j=1,2$, according to Claim~1. If $c_1^+\neq 0$, then $|c_1^{(n)}| \geq |c_1^+|/2>0$ for all large enough $n$, hence $D_xT^n({\bf u})$ will grow exponentially at the rate $\chi_1$ as $n\to\infty$ by Claim~2, precisely \be c_1^+\neq 0 \ \ \Longrightarrow\ \ \lim_{n\to\infty} \frac 1n \ln \|D_xT^n({\bf u})\|=\chi_1 \label{c1n0} \ee Now, when $c_1^+=0$, then $|c_1^{(n)}|<\,{\rm const}\, e^{-\delta n}$ by (\ref{ee}). By a little more detailed estimation, one obtains \be c_1^+ = 0 \ \ \Longrightarrow\ \ \limsup_{n\to\infty} \frac 1n \ln \|D_xT^n({\bf u})\| \leq \max\{\chi_1 - \delta,\, \chi_2\} \label{c1=0} \ee Since the limits in (\ref{c1n0}) and (\ref{c1=0}) are different, Claim~3 is proved. Note: the families of subspaces spanned by the limit dominant eigenvectors, $e_1^+(x)$ and $e_1^-(x)$, are generally not (!) invariant under $D_xT$. \\ \noindent{\bf Claim 4}. The spaces ${\cal E}_1$ and ${\cal E}_2$ are distinct, i.e. the vectors $e_2^+$ and $e_2^-$ are not parallel. Indeed, suppose ${\cal E}_1={\cal E}_2$. Let ${\bf v}\in {\cal T}_{T^{-n}x}M$ be an arbitrary unit vector at the point $T^{-n}x$. Let ${\bf v} = c_1e_1^{(n)} (T^{-n}x) + c_2e_2^{(n)} (T^{-n}x)$ be its decomposition in the basis $\{e_1^{(n)}(T^{-n}x),e_2^{(n)}(T^{-n}x)\}$. Then $$ {\bf u}:=D_{T^{-n}x}T^n({\bf v}) =\frac{c_1}{\lambda_1^{(-n)}(x)}e_1^{(-n)}(x) +\frac{c_2}{\lambda_2^{(-n)}(x)}e_2^{(-n)}(x) $$ Recall that $\frac 1n \ln \lambda_1^{(-n)}(x)\to -\chi_2$ and $\frac 1n \ln \lambda_2^{(-n)}(x)\to -\chi_1$ as $n\to\infty$. 
Our assumption ${\cal E}_1={\cal E}_2$ and (\ref{ee}) imply ${\bf u}=c_1'e_1^{(n)}+c_2'e_2^{(n)}$ with some $c_1',c_2'$ satisfying $$ \limsup_{n\to\infty}\ \sup_{\bf v} \frac 1n \ln|c_1'| \leq \max\{\chi_2,\chi_1-\delta\} $$ and $$ \limsup_{n\to\infty}\ \sup_{\bf v} \frac 1n \ln|c_2'| \leq \chi_1 $$ Since $e_1^{(n)}$ and $e_2^{(n)}$ are eigenvectors for $A_n(x)$, we see that $$ D_xT^n{\bf u}=D_{T^{-n}x}T^{2n}({\bf v}) =c_1'\lambda_1^{(n)}e_1^{(-n)}(T^nx) +c_2'\lambda_2^{(n)}e_2^{(-n)}(T^nx) $$ Therefore, $$ \limsup_{n\to\infty} \frac{1}{2n} \ln \|D_{T^{-n}x}T^{2n}\| \leq \max \left \{ \frac{\chi_1+\chi_2}{2},\chi_1-\frac{\delta}{2}\right \} < \chi_1 $$ which contradicts Corollary~13.31. Claim~4 is proved.\\ We are now ready to show that for every nonzero vector ${\bf u}\in{\cal E}_1$ \be \lim_{n\to\pm\infty} \frac 1n \ln \| D_xT^n({\bf u})\| = \chi_1 \label{Ochi+} \ee as required by the definition of Lyapunov exponents 13.18. For $n\to +\infty$, this follows from (\ref{c1n0}) (note: by Claim~4 the vector $\bf u$ cannot be parallel to $e_2^+$, hence it has a nonzero projection onto $e_1^+$). Now, recall that by Claim 3 we have $\|D_xT ({\bf u})\| = |\beta^-(x)|\, \|{\bf u}\|$ and $\|D_xT^{-1} ({\bf u})\| = |1/\beta^-(T^{-1}x)|\, \|{\bf u}\|$. Let $G(x)=\ln |\beta^-(x)|$, then $$ \lim_{n\to\infty} \frac 1n \ln \| D_xT^n({\bf u})\| = \lim_{n\to\infty} \frac 1n \sum_{i=0}^{n-1}G(T^ix)=G_+(x) $$ in the notation of Section~9, and we just have seen that $G_+(x)=\chi_1$ almost everywhere. On the other hand, $$ \lim_{n\to -\infty} \frac 1n \ln \| D_xT^n({\bf u})\| = \lim_{n\to -\infty} \frac{1}{|n|} \sum_{i=1}^{|n|}G(T^{-i}x)=G_-(x) $$ and we have $G_-(x)=G_+(x)$ almost everywhere by 9.4. This completes the proof of (\ref{Ochi+}). The corresponding limit for vectors ${\bf u}\in{\cal E}_2$ is computed similarly. Hence we proved Oseledec Theorem in the main case $\chi_1>\chi_2$. 
If $\chi_1=\chi_2$, then the spaces ${\cal E}_j$, $j=1,2$, need not be constructed, and the argument is much simpler, we refer to Pollicott, p.\ 36, and leave the details as an exercise. $\Box$ \\ Lyapunov exponents are the key tool in the study of smooth dynamics. Interestingly, they exist even for maps that are only differentiable almost everywhere, rather than everywhere. The following extension of Oseledec's theorem takes care of such maps:\\ \noindent{\sc 13.33 Theorem (Oseledec Theorem, Extended)}. Let $U\subset M$ be an open subset of a manifold $M$. Let $T:U\to M$ be a $C^1$ diffeomorphism of $U$ onto $T(U)$ preserving a probability measure $\mu$ on $M$. Assume that \be \int_M \ln^+\|D_xT\|\, d\mu <\infty \ \ \ \ {\rm and}\ \ \ \ \int_M \ln^+\|D_xT^{-1}\|\, d\mu <\infty \label{log+} \ee where $\ln^+a=\max\{0,\ln a\}$. Then there exists a $T$-invariant subset $N\subset M$, $\mu(N)=1$, such that for every point $x \in N$ all Lyapunov exponents exist. Note: the assumption (\ref{log+}) is necessary to ensure the integrability of functions $F_n(x)$ in Subadditive Ergodic Theorem 13.26.\\ Oseledec Theorem contains a clause that we have omitted so far. We state it now without proof. It deals with angles between the characteristic subspaces ${\cal E}_j$. Even though these subspaces are not necessarily orthogonal to each other, the angle between them cannot become too small, in the following sense.\\ \noindent{\sc 13.34 Addendum}. Let $x\in M$ be a point where all Lyapunov exponents $\chi_1>\cdots >\chi_m$ exist. For any subset $S\subset\{1,\ldots,m\}$ let $\gamma_S(x)$ denote the angle between the spaces $\oplus_{j\in S} {\cal E}_j$ and $\oplus_{j\notin S}{\cal E}_j$ in ${\cal T}_xM$. 
Then for almost every $x\in M$ $$ \lim_{n\to\pm\infty} \frac 1n \ln \gamma_{S}(T^nx) = 0 $$ i.e.\ the angles between the characteristic subspaces can only approach zero very slowly (more slowly than any exponential function).\\ \noindent{\sc 13.35 Theorem (Lyapunov Exponents versus Volume)}. Let $J_n(x)=|\,{\rm det}\, D_xT^n|$ be the Jacobian of the map $T^n$ at $x$ (this is the factor by which $T^n$ changes volume in an infinitesimal neighborhood of $x\in M$). Then for almost every $x$ $$ \lim_{n\to\pm\infty} \frac 1n \ln J_n(x) = \sum_{j=1}^m \chi_j\, {\rm dim}\, {\cal E}_j $$ i.e.\ the asymptotic rate of change of volume equals the sum of all Lyapunov exponents (counting multiplicity).\\ {\em Proof}. For simplicity, assume that dim$\, M=2$ and $\chi_1>\chi_2$. Pick any nonzero vectors $e_1\in{\cal E}_1$ and $e_2\in{\cal E}_2$. The area of the parallelogram spanned by $e_1,e_2$ equals $\|e_1\|\, \|e_2\|\, \sin\gamma(x)$, where $\gamma(x)$ is the angle between ${\cal E}_1$ and ${\cal E}_2$ in ${\cal T}_xM$. Therefore, $$ J_n(x)=\frac{\|D_xT^ne_1\|\,\|D_xT^ne_2\|\, \sin\gamma(T^nx)} {\|e_1\|\,\|e_2\|\, \sin\gamma(x)} $$ Taking the logarithm, dividing by $n$ and letting $n\to\pm\infty$ proves the theorem. Note that the term containing $\sin\gamma(T^nx)$ is eliminated by 13.34. In the general case, when dim$\, M>2$, some elementary but boring geometric estimates must be involved, so we do not elaborate. $\Box$ \\ \noindent{\sc 13.36 Corollary}. Assume that $\ln J_1(x)\in L^1_{\mu}(M)$. Then $$ \int_M \ln J_1(x)\, d\mu = \sum_{j=1}^m \chi_j\, {\rm dim}\, {\cal E}_j $$ i.e.\ the average one-step rate of change of volume equals the sum of all Lyapunov exponents (counting multiplicity).\\ {\em Proof}. By the chain rule, $$ J_n(x)=J_1(x)\cdots J_1(T^{n-1}x) $$ Taking the logarithm, dividing by $n$ and letting $n\to\pm\infty$ proves the corollary in view of 13.35 and Ergodic Theorem 9.2. \\ \noindent{\sc 13.37 Corollary}. 
Let the invariant measure $\mu$ be absolutely continuous with density $f(x)$ with respect to the Lebesgue measure (volume). Assume that $\ln f(x)\in L^1_{\mu}(M)$. Then $$ \sum_{j=1}^m \chi_j\, {\rm dim}\, {\cal E}_j = 0 $$ i.e.\ the sum of all Lyapunov exponents vanishes.\\ {\em Proof}. Note that $J_1(x)=f(x)/f(Tx)$, hence $$ \int_M \ln J_1(x)\, d\mu = \int_M \ln f(x)\, d\mu - \int_M \ln f(Tx)\, d\mu = 0 $$ by the invariance of $\mu$. $\Box$ \\ \newpage \section{Dynamical Systems with Continuous Time} So far we have studied measurable maps $T:X\to X$ and their iterations $T^n$, $n\in\ZZ$, interpreting $n$ as time. Our time variable $n$ was restricted to integers only. Naturally, in real life and in physical sciences, time is a continuous variable that can take all real values. \\ If we allow the time variable $t$ to take all real values, then for any initial point $x\in X$, its image $S^tx$ must be defined for all $t\in\IR$. Hence, we need to define a one-parameter family of transformations $S^t:X\to X$, where $t\in\IR$. Note that $S^{t+s}x=S^t(S^sx)$ for every $x\in X$, so that $S^{t+s}= S^t\circ S^s$. Hence the transformations $S^t$ make a {\em group} with respect to composition. In particular, $S^0=\,$id, the identity map.\\ There are two ways to look at such a system $S^t:X\to X$. One way is to fix some $t$'s and consider the corresponding maps $S^t$ on $X$, hence the entire system is viewed as an uncountable collection of maps of $X$ into itself, which are related to each other by the group rule $S^{t+s}=S^t\circ S^s$. The other way is to fix some $x$'s and consider the corresponding trajectories (orbits) $S^t(x)$, $-\infty<t<\infty$.\\
% NOTE(review): the text between the sentence above and item (b) of Theorem 14.15 (items 14.1--14.14 and the opening of 14.15) is missing from this copy of the source. The opening of 14.15 below, up to item (b), is a reconstruction based on item (c) and Remarks 14.16--14.17 -- verify against the original notes.
\noindent{\sc 14.15 Theorem}. Let $S^t_{\bf a}$ be the linear flow on the torus ${\rm Tor}^d$ defined by $S^t_{\bf a}(x)=x+t{\bf a}$ (mod 1), where ${\bf a}=(a_1,\ldots,a_d)\in\IR^d$. The following are equivalent: \begin{itemize} \item[(a)] the flow $S^t_{\bf a}$ is ergodic with respect to the Lebesgue measure; \item[(b)] for every point $x\in{\rm Tor}^d$ the orbit $\{S^t_{\bf a}x\}$, $t>0$, is dense in ${\rm Tor}^d$; \item[(c)] the numbers $a_1,\ldots,a_d$ are rationally independent, i.e. \be m_1a_1+m_2a_2+\cdots+m_da_d\neq 0 \label{linrel1} \ee for any integers $m_1,\ldots,m_d$ unless $m_1=\cdots=m_d=0$. \end{itemize} The proof of this theorem very much repeats that of 12.5. 
The differences are minor and can be easily worked out. \\ \noindent{\sc 14.16 Remark}. Let $d=2$ in the above theorem. Then either $S^t_{\bf a}$ is ergodic, or $a_1/a_2$ is a rational number. In the latter case there is a $T>0$ such that $S^T_{\bf a}(x)=x$ for every point $x\in{\rm Tor}^2$. Hence, we arrive at an alternative: on a 2-D torus all the orbits are either periodic or dense. \\ \noindent{\sc 14.17 Remark}. The flow $S^t_{\bf a}$ is never mixing. In fact, all maps $S^t_{\bf a}$, $t\in\IR$, are isometries, i.e.\ they preserve distances and angles on ${\rm Tor}^d$, so this flow is a very regular (as opposed to chaotic) dynamical system, despite its ergodicity for some ${\bf a}\in\IR^d$.\\ There is a simple criterion for the invariance of the Lebesgue measure under a smooth flow:\\ \noindent{\sc 14.18 Lemma}. Let $X\subset\IR^d$ be an open domain and $S^t$ a flow defined by a differential equation, see 14.2. Then $S^t$ preserves the Lebesgue measure $m$ on $X$ iff the divergence of the vector field ${\bf v}$ vanishes: $$ {\rm div}\,{\bf v}= \frac{\partial v_1}{\partial x_1}+\cdots + \frac{\partial v_d}{\partial x_d}=0 $$ at every point $x\in X$. (Compare this to 5.6(a).)\\ \begin{figure}[h] \centering \epsfig{figure=ds-18.eps}\caption{Change of volume by the flow $S^t$.} \end{figure} {\em Proof}. It is a well known fact in vector analysis that ${\rm div}\,{\bf v}$ is the rate of change of volume by the vector field $\bf v$. More precisely, let $x(s)$, $0\leq s\leq t$, be a segment of an orbit of the flow. Take an infinitesimal ball $B_0$ of volume $V_0$ around the point $x(0)$ and denote by $V_t$ the volume of the image $S^t(B_0)$, see Fig.~19. Then $$ \frac{V_t}{V_0}=\exp\left [\int_0^t{\rm div}\, {\bf v}(x(s))\, ds \right ] $$ This proves the lemma. $\Box$ \\ The following fact extends 14.18 to more general flows and measures.\\ \noindent{\sc 14.19 Theorem (Liouville)}. 
Let $X\subset\IR^d$ be an open domain and $S^t$ a flow defined by a differential equation, see 14.2. Then $S^t$ preserves a measure $\mu$ with a continuous density $f(x)$ on $X$ iff the divergence of the vector field $f{\bf v}$ vanishes: $$ {\rm div}(f{\bf v})= \frac{\partial (fv_1)}{\partial x_1}+\cdots + \frac{\partial (fv_d)}{\partial x_d}=0 $$ at every point $x\in X$. \\ {\em Proof}. In terms of the previous lemma, the invariance of the measure with density $f(x)$ means that $$ f(x(t))\, V_t = f(x(0))\, V_0 $$ hence $$ f(x(t))\, \exp\left [\int_0^t{\rm div}\, {\bf v}(x(s))\, ds \right ] = f(x(0)) $$ Differentiating with respect to $t$ and cancelling the exponential factor yields $$ \la {\rm grad}\, f, {\bf v} \ra + f\, {\rm div}\,{\bf v} = 0 $$ which is equivalent to ${\rm div}(f{\bf v})=0$. $\Box$ \\ \noindent{\sc 14.20 Example}. In classical mechanics, the motion of objects is governed by Newtonian equations constructed in the following way. Consider a system of $n$ objects (particles) with masses $m_1,\ldots,m_n$. Denote their coordinates by $q_1,\ldots,q_n$ and momenta by $p_1,\ldots,p_n$ (for simplicity, let all the particles move on a line, so that $q_i$'s and $p_i$'s are one-dimensional variables). The Newtonian equations are \be \dot{q}_i=p_i/m_i \ \ \ \ \ {\rm and} \ \ \ \ \ \dot{p}_i=F_i \label{pqDE} \ee for $1\leq i\leq n$, where $F_i$ is the force acting on the $i$-th particle. Classical forces (gravitational, etc.) only depend on the particle positions, not velocities, i.e. $F_i=F_i(q_1,\ldots,q_n)$. For example, the gravitational force is obtained by $$ F_i(q_1,\ldots,q_n)= -\frac{\partial}{\partial q_i}U(q_1,\ldots,q_n) $$ where $U(q_1,\ldots,q_n)$ is the potential energy of the system. The differential equations (\ref{pqDE}) define a flow on the $2n$-dimensional space with coordinates $(q_1,\ldots,q_n,p_1,\ldots,p_n)$, which is called the {\bf phase space} of the mechanical system.\\ \noindent{\sc 14.21 Remark}. 
The resulting flow on the phase space preserves the Lebesgue measure by Lemma 14.18 (indeed, all the partial derivatives involved in 14.18 vanish). This is a fundamental fact in classical mechanics. It is also called Liouville Theorem.\\ \noindent{\sc 14.22 Hamiltonian Systems}. More generally, the laws of classical mechanics can be expressed by a system of differential equations \be \dot{q}_i=\frac{\partial}{\partial p_i} {\cal H} \ \ \ \ \ \ \ \ \dot{p}_i= - \frac{\partial}{\partial q_i} {\cal H} \label{H} \ee where the function ${\cal H}(q_1,\ldots,q_n,p_1,\ldots,p_n)$ is the so called {\bf Hamiltonian}. In the above example with gravitational forces $$ {\cal H}(q_1,\ldots,q_n,p_1,\ldots,p_n) =\sum_{i=1}^n \frac{p_i^2}{2m_i} + U(q_1,\ldots,q_n) $$ This is a classical expression for the total energy of the system: it is the sum of the kinetic energy and potential energy. Lastly, note that a Hamiltonian flow always preserves the Hamiltonian itself, i.e. ${\cal H}$ is an invariant function under the flow. This follows from (\ref{H}) by $$ \frac{d}{dt}\, {\cal H}(q_1(t),\ldots,q_n(t), p_1(t),\ldots,p_n(t))= \sum_{i=1}^n\dot{q}_i\times \frac{\partial}{\partial q_i} {\cal H} +\sum_{i=1}^n\dot{p}_i\times \frac{\partial}{\partial p_i} {\cal H} =0 $$ In physics this fact is called the {\bf conservation of energy}. \end{document} \end \noindent{\sc 6.7 Theorem (Kac)}. Let $A\subset X$ and $\mu(A)>0$. Then ({\bf ergodicity?}) $$ \int_A \tau_A\, d\mu_A = 1 $$ Next, by iterating the subadditivity condition (\ref{subadd}) $n$ times \be \frac 1n F_n(x) \leq \frac 1n \sum_{i=0}^{n-1} F_1(T^ix) \label{FnF1} \ee By Ergodic Theorem 9.2, the right hand side converges to $\int F_1\, d\mu$, hence $\bar{F}\leq \int F_1\, d\mu$. In a similar manner, iterating (\ref{subadd}) $n$ steps at a time (see details in Pollicott, pp. 37--38), we obtain $$ \bar{F} \leq \frac 1n \int F_n\, d\mu \ \ \ \ \ \ \forall n\geq 1 $$ almost everywhere. Hence $\bar{F}\leq \chi$ a.e.