From 58d1bee5f41ef89a4a4fd471aed77ccd2bcfd971 Mon Sep 17 00:00:00 2001
From: Yingbo Ma
Date: Fri, 3 Apr 2020 01:16:11 -0400
Subject: [PATCH 1/4] Add IRK note (WIP)

---
 firk/Makefile      |   6 ++
 firk/README.md     |   3 +
 firk/main.tex      | 141 +++++++++++++++++++++++++++++++++++++++++++++
 firk/reference.bib |  10 ++++
 4 files changed, 160 insertions(+)
 create mode 100644 firk/Makefile
 create mode 100644 firk/README.md
 create mode 100644 firk/main.tex
 create mode 100644 firk/reference.bib

diff --git a/firk/Makefile b/firk/Makefile
new file mode 100644
index 0000000..fab1e0e
--- /dev/null
+++ b/firk/Makefile
@@ -0,0 +1,6 @@
+LL := latexmk
+DEP := $(wildcard *.tex)
+MAIN = main
+
+main: ${DEP}
+	${LL} -f -pdf ${MAIN} -auxdir=output -outdir=output
diff --git a/firk/README.md b/firk/README.md
new file mode 100644
index 0000000..b9ad996
--- /dev/null
+++ b/firk/README.md
@@ -0,0 +1,3 @@
+# Efficient Implicit Runge-Kutta Implementation
+
+Run `make` to build. The PDF is at `output/main.pdf`.
diff --git a/firk/main.tex b/firk/main.tex
new file mode 100644
index 0000000..98a7c2c
--- /dev/null
+++ b/firk/main.tex
@@ -0,0 +1,141 @@
+% main.tex
+\documentclass[a4paper,9pt]{article}
+\usepackage{amsmath,amssymb,amsthm,amsbsy,amsfonts}
+\usepackage{todonotes}
+\usepackage{systeme}
+\usepackage{physics}
+\usepackage{cleveref}
+\newcommand{\correspondsto}{\;\widehat{=}\;}
+\usepackage{bm}
+\usepackage{enumitem} % label enumerate
+\newtheorem{theorem}{Theorem}
+\theoremstyle{definition}
+\newtheorem{definition}{Definition}[section]
+\theoremstyle{remark}
+\newtheorem*{remark}{Remark}
+% change Q.E.D. symbol
+\renewcommand\qedsymbol{$\hfill \mbox{\raggedright \rule{0.1in}{0.2in}}$}
+
+\begin{document}
+
+\author{Yingbo Ma\\
+  \texttt{mayingbo5@gmail.com}}
+\title{Efficient Implicit Runge-Kutta Implementation}
+\date{April, 2020}
+
+\maketitle
+
+\section{Notation}
+\todo{Is this necessary?}
+
+\section{Runge-Kutta Methods}
+Runge-Kutta methods can numerically solve differential-algebraic equations (DAEs)
+that are written in the form of
+\begin{equation}
+  M \dv{u}{t} = f(u, t),\quad u(a)=u_a \in \mathbb{R}^m, \quad t\in [a, b].
+\end{equation}
+
+\begin{definition} \label{def:rk}
+  An \emph{$s$-stage Runge-Kutta method} has the coefficients $a_{ij}, b_i,$
+  and $c_j$ for $i=1,2,\dots,s$ and $j=1,2,\dots,s$. One can also denote the
+  coefficients simply by $\bm{A}, \bm{b},$ and $\bm{c}$. A step of the method is
+  \begin{equation} \label{eq:rk_sum}
+    u_{n+1} = u_n + \sum_{i=1}^s b_i k_i,
+  \end{equation}
+  where
+  \begin{align} \label{eq:rk_lin}
+    M z_i &= \sum_{j=1}^s a_{ij}k_j\qq{or} (I_s \otimes M) \bm{z} =
+    (\bm{A}\otimes I_m)\bm{k}, \quad g_i = u_n + z_i \\
+    k_i &= hf(g_i, t+c_ih).
+  \end{align}
+\end{definition}
+
+We observe that when $a_{ij} = 0$ for $i\ge j$, \cref{eq:rk_lin} can be computed
+without solving a system of algebraic equations, and we call methods with this
+property \emph{explicit}, and otherwise \emph{implicit}.
+
+We should note that solving for $z_i$ is much preferred over $g_i$, as it has a
+smaller magnitude. A method is stiffly accurate, i.e. the stability function
+$R(\infty) = 0$, when the matrix $\bm{A}$ has full rank and $a_{si} = b_i$.
+Hence, $\bm{b}^T\bm{A}^{-1}$ gives the last row of the identity matrix $I_s$. We
+can use this condition to further simplify \cref{eq:rk_sum} (by slight abuse of
+notation):
+\begin{equation} \label{eq:rk_sim}
+  u_{n+1} = u_n + \bm{b}^T\bm{k} = u_n +
+  \bm{b}^T\underbrace{\bm{A}^{-1}\bm{z}}_\text{\cref{eq:rk_lin}} = u_n +
+  \mqty(0 & \cdots & 0 & 1)\bm{z} = u_n + z_s.
+\end{equation}
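+
+As a quick sanity check (our own illustration, not an example taken from the
+references), take the $1$-stage backward Euler method, for which $\bm{A} = (1)$,
+$\bm{b} = (1)$, and $\bm{c} = (1)$. It is stiffly accurate since $a_{11} = b_1$
+and $\bm{A}$ has full rank, and $\bm{b}^T\bm{A}^{-1} = (1)$, so \cref{eq:rk_sim}
+reduces to
+\begin{equation}
+  M z_1 = h f(u_n + z_1, t + h), \qquad u_{n+1} = u_n + z_1,
+\end{equation}
+which is exactly the implicit Euler discretization of $M\dv{u}{t} = f(u, t)$.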
+
+\section{Solving Nonlinear Systems from Implicit Runge-Kutta Methods}
+We have to solve \cref{eq:rk_lin} for $\bm{z}$ when a Runge-Kutta method is
+implicit. More explicitly, we need to solve $G(\bm{z}) = \bm{0}$, where
+\begin{equation} \label{eq:nonlinear_rk}
+  G(\bm{z}) = (I_s \otimes M) \bm{z} - h(\bm{A}\otimes I_m) \tilde{\bm{f}}(\bm{z}) \qq{and}
+  \tilde{\bm{f}}(\bm{z})_i = f(u_n+z_i, t+c_i h).
+\end{equation}
+The purpose of introducing a computationally expensive nonlinear
+system solving step is to combat extreme stiffness. Hence, the Jacobian matrix
+arising from \cref{eq:rk_lin} is ill-conditioned. We must use Newton
+iteration to ensure stability, since fixed-point iteration only converges for
+contracting maps, which greatly limits the step size.
+
+Astute readers may notice that the Jacobian
+\begin{equation}
+  \tilde{\bm{J}}_{ij} = \pdv{\tilde{\bm{f}}_i}{\bm{z}_j} = \delta_{ij} \pdv{f(u_n + z_i, t+c_ih)}{u}
+\end{equation}
+requires us to compute the Jacobian of $f$ at $s$ many different points, which
+is very expensive. We can approximate it by
+\begin{equation}
+  \tilde{\bm{J}}_{ii} \approx J = \pdv{f(u_n, t)}{u}.
+\end{equation}
+Our simplified Newton iteration from \cref{eq:nonlinear_rk} is then
+\begin{align} \label{eq:newton_1}
+  (I_s \otimes M - h\bm{A}\otimes J) \Delta \bm{z}^k &= -G(\bm{z}^k) = -(I_s
+  \otimes M) \bm{z}^k + h(\bm{A}\otimes I_m) \tilde{\bm{f}}(\bm{z}^k) \\
+  \bm{z}^{k+1} &= \bm{z}^{k} + \Delta \bm{z}^k \label{eq:newton_2},
+\end{align}
+where $\bm{z}^k$ is the approximation to $\bm{z}$ at the $k$-th iteration.
+
+In his Radau IIA implementation, Hairer left-multiplies \cref{eq:newton_1} by
+$(hA)^{-1} \otimes I_m$ to exploit the structure of the iteration matrix
+\cite{hairer1999stiff}, so we have
+\begin{align}
+  ((hA)^{-1} \otimes I_m)(I_s \otimes M) &= (hA)^{-1} \otimes M \\
+  ((hA)^{-1} \otimes I_m)(h\bm{A}\otimes J) &= I_s\otimes J \\
+  ((hA)^{-1} \otimes I_m)G(\bm{z}^k) &= ((hA)^{-1} \otimes M) \bm{z}^k -
+  \tilde{\bm{f}}(\bm{z}^k),
+\end{align}
+and finally,
+\begin{equation} \label{eq:newton1}
+  ((hA)^{-1} \otimes M - I_s\otimes J) \Delta \bm{z}^k = -((hA)^{-1} \otimes M)
+  \bm{z}^k + \tilde{\bm{f}}(\bm{z}^k).
+\end{equation}
+Hairer also diagonalizes $A^{-1}$, i.e.
+\begin{equation}
+  V^{-1}A^{-1}V = \Lambda,
+\end{equation}
+to decouple the $sm \times sm$ system. To transforming \cref{eq:newton1} to
+the eigenbasis of $A^{-1}$, notice
+\begin{equation}
+  A^{-1}x = b \implies V^{-1}A^{-1}x = V^{-1}b \implies \Lambda V^{-1}x =
+  V^{-1}b.
+\end{equation}
+Similarly, we have
+\begin{align}
+  &(h^{-1} \Lambda \otimes M - I_s\otimes J) (V^{-1}\otimes I_m)\Delta\bm{z}^k\\
+  =& (V^{-1}\otimes I_m)(-((hA)^{-1} \otimes M)
+  \bm{z}^k + \tilde{\bm{f}}(\bm{z}^k)).
+\end{align}
+We can introduce the transformed variable $\bm{w} = (V^{-1}\otimes I_m) \bm{z}$
+to further reduce computation, so \cref{eq:newton1} and \cref{eq:newton2} are now
+\begin{align} \label{eq:newton2}
+  (h^{-1} \Lambda \otimes M - I_s\otimes J) \Delta\bm{w}^k
+  &= -(h^{-1} \Lambda \otimes M) \bm{w}^k +
+  (V^{-1}\otimes I_m)\tilde{\bm{f}}((V\otimes I_m)\bm{w}^k) \\
+  \bm{w}^{k+1} &= \bm{w}^{k} + \Delta \bm{w}^k.
+\end{align}
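+Since $\Lambda$ is diagonal, the matrix $h^{-1} \Lambda \otimes M - I_s \otimes J$
+is block diagonal, so (spelling out the consequence explicitly) the
+$sm \times sm$ solve splits into $s$ independent $m \times m$ solves, one per
+stage:
+\begin{equation}
+  (h^{-1} \lambda_i M - J) \Delta w_i^k = r_i^k, \quad i = 1, \dots, s,
+\end{equation}
+where $\lambda_i$ is the $i$-th eigenvalue of $A^{-1}$ and $r_i^k$ denotes the
+$i$-th $m$-block of the right-hand side above. For the $3$-stage Radau IIA
+method, $A^{-1}$ has one real eigenvalue and one complex conjugate pair, so one
+real and one complex $m \times m$ factorization suffice.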
+
+\bibliography{reference.bib}
+\bibliographystyle{siam}
+
+\end{document}
diff --git a/firk/reference.bib b/firk/reference.bib
new file mode 100644
index 0000000..7ec22f5
--- /dev/null
+++ b/firk/reference.bib
@@ -0,0 +1,10 @@
+@article{hairer1999stiff,
+  title={Stiff differential equations solved by Radau methods},
+  author={Hairer, Ernst and Wanner, Gerhard},
+  journal={Journal of Computational and Applied Mathematics},
+  volume={111},
+  number={1-2},
+  pages={93--111},
+  year={1999},
+  publisher={Elsevier}
+}

From 3c7a24cad7e6be0f57e7b5fd1e81650dab65c6f9 Mon Sep 17 00:00:00 2001
From: Yingbo Ma
Date: Fri, 3 Apr 2020 23:13:13 -0400
Subject: [PATCH 2/4] Add "Stopping Criteria"

---
 firk/main.tex      | 122 ++++++++++++++++++++++++++++++++++++++------
 firk/reference.bib |  10 ++++
 2 files changed, 114 insertions(+), 18 deletions(-)

diff --git a/firk/main.tex b/firk/main.tex
index 98a7c2c..81ba875 100644
--- a/firk/main.tex
+++ b/firk/main.tex
@@ -25,12 +25,12 @@
 
 \maketitle
 
-\section{Notation}
-\todo{Is this necessary?}
+% \section{Notation}
+% \todo[inline]{Is this necessary?}
 
 \section{Runge-Kutta Methods}
-Runge-Kutta methods can numerically solve differential-algebraic equations (DAEs)
-that are written in the form of
+Runge-Kutta methods can numerically solve differential-algebraic equations
+(DAEs) that are written in the form of
 \begin{equation}
   M \dv{u}{t} = f(u, t),\quad u(a)=u_a \in \mathbb{R}^m, \quad t\in [a, b].
 \end{equation}
@@ -50,16 +50,16 @@ \section{Runge-Kutta Methods}
 \end{align}
 \end{definition}
 
-We observe that when $a_{ij} = 0$ for $i\ge j$, \cref{eq:rk_lin} can be computed
-without solving a system of algebraic equations, and we call methods with this
-property \emph{explicit}, and otherwise \emph{implicit}.
+We observe that when $a_{ij} = 0$ for each $i\ge j$, \cref{eq:rk_lin} can be
+computed without solving a system of algebraic equations, and we call methods
+with this property \emph{explicit}, and otherwise \emph{implicit}.
 
 We should note that solving for $z_i$ is much preferred over $g_i$, as it has a
 smaller magnitude. A method is stiffly accurate, i.e. the stability function
 $R(\infty) = 0$, when the matrix $\bm{A}$ has full rank and $a_{si} = b_i$.
-Hence, $\bm{b}^T\bm{A}^{-1}$ gives the last row of the identity matrix $I_s$. We
-can use this condition to further simplify \cref{eq:rk_sum} (by slight abuse of
-notation):
+Hence, for such methods $\bm{b}^T\bm{A}^{-1}$ gives the last row of the identity
+matrix $I_s$. We can use this condition to further simplify \cref{eq:rk_sum} (by
+slight abuse of notation):
@@ -73,9 +73,9 @@ \section{Solving Nonlinear Systems from Implicit Runge-Kutta Methods}
   G(\bm{z}) = (I_s \otimes M) \bm{z} - h(\bm{A}\otimes I_m) \tilde{\bm{f}}(\bm{z}) \qq{and}
   \tilde{\bm{f}}(\bm{z})_i = f(u_n+z_i, t+c_i h).
 \end{equation}
-The purpose of introducing a computationally expensive nonlinear
-system solving step is to combat extreme stiffness. Hence, the Jacobian matrix
-arising from \cref{eq:rk_lin} is ill-conditioned. We must use Newton
+The purpose of introducing a computationally expensive nonlinear system solving
+step is to combat extreme stiffness. The Jacobian matrix arising from
+\cref{eq:rk_lin} is ill-conditioned due to stiffness. Thus, we must use Newton
 iteration to ensure stability, since fixed-point iteration only converges for
 contracting maps, which greatly limits the step size.
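+To make the step size limit concrete, here is a small illustration of our own
+(using the scalar Dahlquist test equation, not anything problem-specific): for
+$u' = \lambda u$, \cref{eq:rk_lin} becomes the affine fixed-point problem
+$\bm{z} = h\lambda \bm{A} \bm{z} + \text{const}$, and fixed-point iteration on
+it converges only when $\rho(h\lambda \bm{A}) = |h\lambda| \rho(\bm{A}) < 1$,
+i.e. $h \lesssim 1/|\lambda|$. For stiff problems $|\lambda|$ is huge, so
+fixed-point iteration forces exactly the tiny steps we are trying to avoid.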
@@ -83,8 +83,8 @@ \section{Solving Nonlinear Systems from Implicit Runge-Kutta Methods}
 \begin{equation}
   \tilde{\bm{J}}_{ij} = \pdv{\tilde{\bm{f}}_i}{\bm{z}_j} = \delta_{ij} \pdv{f(u_n + z_i, t+c_ih)}{u}
 \end{equation}
-requires us to compute the Jacobian of $f$ at $s$ many different points, which
-is very expensive. We can approximate it by
+requires us to compute the Jacobian of $f$ at $s$ many points, which can be
+very expensive. We can approximate it by
 \begin{equation}
   \tilde{\bm{J}}_{ii} \approx J = \pdv{f(u_n, t)}{u}.
 \end{equation}
@@ -96,9 +96,10 @@ \section{Solving Nonlinear Systems from Implicit Runge-Kutta Methods}
 \end{align}
 where $\bm{z}^k$ is the approximation to $\bm{z}$ at the $k$-th iteration.
 
-In his Radau IIA implementation, Hairer left-multiplies \cref{eq:newton_1} by
-$(hA)^{-1} \otimes I_m$ to exploit the structure of the iteration matrix
-\cite{hairer1999stiff}, so we have
+\subsection{Change of Basis}
+\todo[inline]{discuss SDIRK}In his Radau IIA implementation, Hairer
+left-multiplies \cref{eq:newton_1} by $(hA)^{-1} \otimes I_m$ to exploit the
+structure of the iteration matrix \cite{hairer1999stiff}, so we have
@@ -134,6 +135,91 @@ \subsection{Change of Basis}
   (V^{-1}\otimes I_m)\tilde{\bm{f}}((V\otimes I_m)\bm{w}^k) \\
   \bm{w}^{k+1} &= \bm{w}^{k} + \Delta \bm{w}^k.
 \end{align}
+People usually call the matrix $W=(h^{-1} \Lambda \otimes M - I_s\otimes J)$ the
+iteration matrix or the $W$-matrix.
+
+\subsection{Stopping Criteria}
+Note that throughout this subsection, $\norm{\cdot}$ denotes the norm that is
+used by the time-stepping error estimate. We are using this choice because we
+need to make sure that convergent results from a nonlinear solver does not
+introduce step rejections.
+
+There are two approaches to estimating the error of a nonlinear solver: by the
+displacement $\Delta \bm{z}^k$ or by the residual $G(\bm{z}^k)$. The residual
+behaves like the error scaled by the Lipschitz constant of $G$. Stiff equations
+have a large Lipschitz constant. However, this constant is not known a prior.
+This makes the residual test unreliable. Hence, we are going to focus on the
+analysis of the displacement.
+
+Simplified Newton iteration converges linearly, so we can model the convergence
+process as
+\begin{equation}
+  \norm{\Delta \bm{z}^{k+1}} \le \theta \norm{\Delta \bm{z}^{k}}.
+\end{equation}
+The convergence rate at the $k$-th iteration $\theta_k$ can be estimated by
+\begin{equation}
+  \theta_k = \frac{\norm{\Delta\bm{z}^{k}}}{\norm{\Delta\bm{z}^{k-1}}},\quad k\ge 1.
+\end{equation}
+Notice we have the relation
+\begin{equation}
+  \bm{z} - \bm{z}^{k+1} = \sum_{i=0}^\infty \Delta\bm{z}^{k+i+1}.
+\end{equation}
+If $\theta<1$, by the triangle inequality, we then have
+\begin{equation}
+  \norm{\bm{z}^{k+1} - \bm{z}^{k}} \le
+  \norm{\Delta\bm{z}^{k+1}}\sum_{i=0}^\infty \theta^i \le \theta
+  \norm{\Delta\bm{z}^{k}}\sum_{i=0}^\infty \theta^i = \frac{\theta}{1-\theta}
+  \norm{\Delta\bm{z}^{k}}.
+\end{equation}
+To ensure nonlinear solver error does not cause that step rejection, we need a
+safety factor $\kappa = 1/10$. Our first convergence criterion is
+\begin{equation}
+  \eta_k \norm{\Delta\bm{z}^k} \le \kappa, \qq{if}
+  k \ge 1 \text{ and } \theta_k < 1, ~ \eta_k=\frac{\theta_k}{1-\theta_k}.
+\end{equation}
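+In code, this test is only a few lines. The following Julia sketch is our own
+minimal rendition of the criterion; the names (\texttt{nlsolve\_converged},
+\texttt{ndz}, \dots) are made up for illustration and are not
+OrdinaryDiffEq.jl's actual internals:
+\begin{verbatim}
+# Displacement norms are measured in the stepper's error norm.
+function nlsolve_converged(ndz, ndz_prev, iter; kappa = 1 // 10)
+    iter <= 1 && return false   # theta_k needs two displacement norms
+    theta = ndz / ndz_prev      # convergence rate estimate theta_k
+    theta >= 1 && return false  # eta_k would be undefined or negative
+    eta = theta / (1 - theta)   # eta_k
+    return eta * ndz <= kappa   # eta_k * ||dz^k|| <= kappa
+end
+\end{verbatim}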
+One major drawback with this convergence criterion is that we can only check it
+after two iterations. To cover the case of convergence in the first iteration,
+we need to define $\eta_0$. It is reasonable to believe that the convergence
+rate remains relatively constant with the same $W$-matrix, so if $W$ is reused
+\todo[inline]{Add the reuse logic section} from the previous nonlinear solve,
+then we define
+\begin{equation}
+  \eta_0 = \eta_{\text{old}},
+\end{equation}
+where $\eta_{\text{old}}$ is the final $\eta_k$ from the previous nonlinear
+solve, otherwise we define
+\begin{equation}
+  \eta_0 = \max(\eta_{\text{old}}, ~\text{eps}(\text{relative
+  tolerance}))^{0.8}.
+\end{equation}
+In the first iteration, we also check
+\begin{equation}
+  \norm{\Delta\bm{z}^{1}} < 10^{-5}
+\end{equation}
+for convergence. \todo[inline]{OrdinaryDiffEq.jl also checks
+\texttt{iszero(ndz)}. It seems redundant in hindsight.}
+
+Moreover, we need to define the divergence criteria. It is obvious that we want
+to limit $\theta$ to be small. Therefore, the first criterion is
+\begin{equation}
+  \theta_k > 2.
+\end{equation}
+Also, the algorithm diverges if the max number of iterations is reached without
+convergence. A subtler criterion for divergence is: no convergence is predicted
+by extrapolating to $\norm{\Delta\bm{z}^{k_{\max}}}$, e.g.
+\begin{equation}
+  \frac{\theta_k^{k_{\max}-k}}{1-\theta_k} \norm{\Delta\bm{z}^k} > \kappa.
+\end{equation}
+\todo[inline]{OrdinaryDiffEq.jl doesn't actually check this condition anymore.}
+
+\subsection{$W$-matrix reuse}
+
+\section{Step size control}
+\subsection{Standard (Integral) control}
+\subsection{Predictive (modified PI) control}
+\subsection{Smooth Error Estimation}
+
+\nocite{hairer2010solving}
 
 \bibliography{reference.bib}
 \bibliographystyle{siam}
diff --git a/firk/reference.bib b/firk/reference.bib
index 7ec22f5..2bf0fdf 100644
--- a/firk/reference.bib
+++ b/firk/reference.bib
@@ -8,3 +8,13 @@ @article{hairer1999stiff
   year={1999},
   publisher={Elsevier}
 }
+
+@book{hairer2010solving,
+  title={Solving Ordinary Differential Equations II: Stiff and Differential-Algebraic Problems},
+  author={Hairer, Ernst and Wanner, Gerhard},
+  isbn={9783642052217},
+  series={Springer Series in Computational Mathematics},
+  url={https://books.google.com/books?id=g-jvCAAAQBAJ},
+  year={2010},
+  publisher={Springer Berlin Heidelberg}
+}

From 47212a1a9b6f8aec5089f7d25226113de29f1e45 Mon Sep 17 00:00:00 2001
From: Yingbo Ma
Date: Fri, 3 Apr 2020 23:31:45 -0400
Subject: [PATCH 3/4] Fix typos

---
 firk/main.tex | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/firk/main.tex b/firk/main.tex
index 81ba875..e068d9c 100644
--- a/firk/main.tex
+++ b/firk/main.tex
@@ -50,7 +50,7 @@ \section{Runge-Kutta Methods}
 \end{align}
 \end{definition}
 
-We observe that when $a_{ij} = 0$ for each $i\ge j$, \cref{eq:rk_lin} can be
+We observe that when $a_{ij} = 0$ for each $j\ge i$, \cref{eq:rk_lin} can be
 computed without solving a system of algebraic equations, and we call methods
 with this property \emph{explicit}, and otherwise \emph{implicit}.
 
@@ -115,8 +115,8 @@ \subsection{Change of Basis}
 \begin{equation}
   V^{-1}A^{-1}V = \Lambda,
 \end{equation}
-to decouple the $sm \times sm$ system. To transforming \cref{eq:newton1} to
-the eigenbasis of $A^{-1}$, notice
+to decouple the $sm \times sm$ system. To transform \cref{eq:newton1} to the
+eigenbasis of $A^{-1}$, notice
@@ -128,7 +128,7 @@ \subsection{Change of Basis}
 We can introduce the transformed variable $\bm{w} = (V^{-1}\otimes I_m) \bm{z}$
-to further reduce computation, so \cref{eq:newton1} and \cref{eq:newton2} are now
+to further reduce computation, so \cref{eq:newton_1} and \cref{eq:newton_2} are now
 \begin{align} \label{eq:newton2}
   (h^{-1} \Lambda \otimes M - I_s\otimes J) \Delta\bm{w}^k
   &= -(h^{-1} \Lambda \otimes M) \bm{w}^k +
@@ -140,16 +140,15 @@ \subsection{Change of Basis}
 
 \subsection{Stopping Criteria}
 Note that throughout this subsection, $\norm{\cdot}$ denotes the norm that is
-used by the time-stepping error estimate. We are using this choice because we
-need to make sure that convergent results from a nonlinear solver does not
-introduce step rejections.
+used by the time-stepping error estimate. By doing so, we can be confident that
+convergent results from a nonlinear solver do not introduce step rejections.
 
 There are two approaches to estimating the error of a nonlinear solver: by the
 displacement $\Delta \bm{z}^k$ or by the residual $G(\bm{z}^k)$. The residual
 behaves like the error scaled by the Lipschitz constant of $G$. Stiff equations
-have a large Lipschitz constant. However, this constant is not known a prior.
-This makes the residual test unreliable. Hence, we are going to focus on the
-analysis of the displacement.
+have a large Lipschitz constant; furthermore, this constant is not known a
+priori. This makes the residual test unreliable. Hence, we are going to focus on
+the analysis of the displacement.
@@ -171,16 +170,16 @@ \subsection{Stopping Criteria}
   \norm{\Delta\bm{z}^{k}}.
 \end{equation}
-To ensure nonlinear solver error does not cause that step rejection, we need a
+To ensure the nonlinear solver error does not cause step rejections, we need a
 safety factor $\kappa = 1/10$. Our first convergence criterion is
 \begin{equation}
   \eta_k \norm{\Delta\bm{z}^k} \le \kappa, \qq{if}
   k \ge 1 \text{ and } \theta_k < 1, ~ \eta_k=\frac{\theta_k}{1-\theta_k}.
 \end{equation}
-One major drawback with this convergence criterion is that we can only check it
-after two iterations. To cover the case of convergence in the first iteration,
-we need to define $\eta_0$. It is reasonable to believe that the convergence
-rate remains relatively constant with the same $W$-matrix, so if $W$ is reused
+One major drawback with this criterion is that we can only check it after one
+iteration. To cover the case of convergence in the first iteration, we need to
+define $\eta_0$. It is reasonable to believe that the convergence rate remains
+relatively constant with the same $W$-matrix, so if $W$ is reused
 \todo[inline]{Add the reuse logic section} from the previous nonlinear solve,
 then we define
@@ -206,18 +205,18 @@ \subsection{Stopping Criteria}
 \end{equation}
 Also, the algorithm diverges if the max number of iterations is reached without
 convergence. A subtler criterion for divergence is: no convergence is predicted
-by extrapolating to $\norm{\Delta\bm{z}^{k_{\max}}}$, e.g.
+by extrapolating to $\norm{\Delta\bm{z}^{k_{\max}}}$, i.e.
 \begin{equation}
   \frac{\theta_k^{k_{\max}-k}}{1-\theta_k} \norm{\Delta\bm{z}^k} > \kappa.
 \end{equation}
 \todo[inline]{OrdinaryDiffEq.jl doesn't actually check this condition anymore.}
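+(A justification, added for completeness since the bound is otherwise stated
+without derivation: extrapolating the linear convergence model predicts
+$\norm{\Delta\bm{z}^{k_{\max}}} \approx \theta_k^{k_{\max}-k}
+\norm{\Delta\bm{z}^k}$, and summing the remaining geometric tail as in the
+convergence analysis above bounds the remaining error by
+$\theta_k^{k_{\max}-k} \norm{\Delta\bm{z}^k} / (1-\theta_k)$. If even this
+prediction exceeds $\kappa$, the iteration cannot pass the convergence test
+within $k_{\max}$ iterations, so we may as well give up early.)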
 
-\subsection{$W$-matrix reuse}
+\subsection{$W$-matrix Reuse}
 
-\section{Step size control}
-\subsection{Standard (Integral) control}
-\subsection{Predictive (modified PI) control}
+\section{Step Size Control}
 \subsection{Smooth Error Estimation}
+\subsection{Standard (Integral) Control}
+\subsection{Predictive (Modified PI) Control}
 
 \nocite{hairer2010solving}

From 7431ae90c1c6bb64245990de7bd6735bbdf1e161 Mon Sep 17 00:00:00 2001
From: Yingbo Ma
Date: Sat, 4 Apr 2020 00:11:00 -0400
Subject: [PATCH 4/4] More typo fix

---
 firk/main.tex | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/firk/main.tex b/firk/main.tex
index e068d9c..c50d4fc 100644
--- a/firk/main.tex
+++ b/firk/main.tex
@@ -165,7 +165,7 @@ \subsection{Stopping Criteria}
 \end{equation}
 If $\theta<1$, by the triangle inequality, we then have
 \begin{equation}
-  \norm{\bm{z}^{k+1} - \bm{z}^{k}} \le
+  \norm{\bm{z}^{k+1} - \bm{z}} \le
   \norm{\Delta\bm{z}^{k+1}}\sum_{i=0}^\infty \theta^i \le \theta
   \norm{\Delta\bm{z}^{k}}\sum_{i=0}^\infty \theta^i = \frac{\theta}{1-\theta}
   \norm{\Delta\bm{z}^{k}}.
@@ -199,13 +199,14 @@ \subsection{Stopping Criteria}
 \texttt{iszero(ndz)}. It seems redundant in hindsight.}
 
 Moreover, we need to define the divergence criteria. It is obvious that we want
-to limit $\theta$ to be small. Therefore, the first criterion is
+to limit $\theta$ to be small. Therefore, the first divergence criterion is
 \begin{equation}
   \theta_k > 2.
 \end{equation}
-Also, the algorithm diverges if the max number of iterations is reached without
-convergence. A subtler criterion for divergence is: no convergence is predicted
-by extrapolating to $\norm{\Delta\bm{z}^{k_{\max}}}$, i.e.
+Also, the algorithm diverges if the max number of iterations $k_{\max}$ is
+reached without convergence. A subtler criterion for divergence is: no
+convergence is predicted by extrapolating to $\norm{\Delta\bm{z}^{k_{\max}}}$,
+i.e.