\documentclass[10pt,a4paper,landscape]{article}
\usepackage{multicol}
\usepackage{calc}
\usepackage{ifthen}
\usepackage[landscape]{geometry}
\usepackage{amsmath,amsthm,amsfonts,amssymb,mathtools}
\usepackage{color,graphicx}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{underscore}
\usepackage{todonotes}
\usepackage{algpseudocode}
\usepackage{algorithm}
% Cheatsheet style
\input{style.tex}
% Shorthands
\renewcommand{\bf}[1]{\ensuremath{\mathbf{#1}}}
\newcommand{\E}{\mathrm{E}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\balpha}{\boldsymbol\alpha}
\newcommand{\bbeta}{\boldsymbol\beta}
\newcommand{\bdelta}{\boldsymbol\delta}
\newcommand{\btheta}{\boldsymbol\theta}
\newcommand{\bPhi}{\boldsymbol\Phi}
\newcommand{\code}{\mathcal{C}}
\newcommand{\alphabet}{\mathcal{U}}
\pdfinfo{
/Title (Information Theory Cheat Sheet)
/Creator (TeX)
/Producer (pdfTeX 1.40.0)
/Author (Dennis Meier)
/Subject (Information Theory cheatsheet)
/Keywords (information theory, entropy, coding, channels)
}
% -----------------------------------------------------------------------
\begin{document}
\title{Information Theory Cheat Sheet}
\raggedright
\footnotesize
\sffamily
\begin{multicols*}{4}
% multicol parameters
% These lengths are set only within the two main columns
%\setlength{\columnseprule}{0.25pt}
\setlength{\premulticols}{1pt}
\setlength{\postmulticols}{1pt}
\setlength{\multicolsep}{1pt}
\setlength{\columnsep}{2pt}
\begin{center}
\Large{\underline{Information Theory Cheat Sheet}}
\end{center}
% ----------
\section{Entropy and co.}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{entropy-quantities-venn-diagram.png}
\end{colfig}
\subsection{Entropy}
$H(U) = \sum_u - P(u) \log_2 P(u) = E[- \log_2 P(U)]$
\subsubsection{Properties:}
\begin{itemize}
\item $H(U) \leq \log | \alphabet |$ with equality iff $U$ is uniform over $\alphabet$.
\item $H(U) \geq 0$ always.
\item $H(U) \geq H(f(U))$
\end{itemize}
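A minimal Python sketch (illustrative, not part of the original notes) of the definition and the $\log|\alphabet|$ bound:
\begin{lstlisting}[language=Python]
import numpy as np

def entropy(p):
    """Entropy in bits of a pmf given as an array of probabilities."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]                     # convention: 0 log 0 = 0
    return float(-np.sum(p * np.log2(p)))

print(entropy([0.5, 0.25, 0.125, 0.125]))  # 1.75
print(entropy(np.ones(4) / 4))             # 2.0 = log2(4), the maximum
\end{lstlisting}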
\subsection{Joint Entropy}
$H(X,Y) = H(X) + H(Y | X) = H(Y) + H(X | Y)$
\subsubsection{Properties:}
\begin{itemize}
\item Adding a variable never decreases entropy: $H(X, Y) \geq H(X)$ with equality iff $Y$ is a deterministic function of $X$.
\item $H(X,Y) \leq H(X) + H(Y)$ with equality iff $X$ and $Y$ are independent.
\item Chain rule: $H(X, Y) = H(X) + H(Y | X)$
\item General chain rule: $H(X_1, X_2, ..., X_n) = \sum_{i=1}^n H(X_i | X_1, ..., X_{i-1})$
\end{itemize}
\subsection{Conditional Entropy}
$H(U | V = v) = \sum_u - P(u | v) \log P(u | v)$
$H(U | V) = \sum_v P(v) H(U | V = v)$
\subsubsection{Properties:}
\begin{itemize}
\item Conditioning reduces entropy: $H(U | V) \leq H(U)$
\item $H(U | V) \leq H(U | f(V))$ since $U - V - f(V)$ form a Markov chain.
\item Chain rule: $H( U | V) = H(U,V) - H(V)$
\item If $\{X_i\}$ is a stationary stochastic process: $H(X_n | X^{n-1}) \leq H(X_i | X^{i-1})$ for $i \in \{1, ..., n\}$
\end{itemize}
\subsubsection{Fano's Inequality}
Suppose $U,V$ are random variables with the same alphabet. Let $p=Pr(U \neq V)$. Then $ H(U|V) \leq h_2(p) + p\log (|\mathcal{U}|-1)$ where $h_2(p) = p\log \frac{1}{p} + (1-p)\log\frac{1}{1-p}$
\subsection{Mutual Information}
\begin{align*}
I(U; V) &= H(U) + H(V) - H(U,V)\\
&= H(U) - H(U | V)\\
&= H(V) - H(V | U)\\
&= H(U,V) - H(U | V) - H (V | U)\\
&= D(p_{UV} || p_Up_V)
\end{align*}
\subsubsection{Properties:}
\begin{itemize}
\item Nonnegative: $I(U;V) \geq 0$ with equality iff $U, V$ are independent.
\item Chain rule: $I(X; Y, Z) = I(X;Z) + I(X; Y | Z)$
\item General chain rule for mutual Information: $I(X_1, ..., X_n; Y) = \sum_{i=1}^n I(X_i; Y | X_1, ..., X_{i-1})$
\item \textbf{Data processing} inequality: $I(X;Y) \geq I(X;Z)$ if $X - Y - Z$ form a Markov chain.
\item Symmetric: $I(X;Y) = I(Y;X)$.
\item $I(X;Y)$ is concave in $p(x)$ for fixed $p(y|x)$ and convex in $p(y|x)$ for fixed $p(x)$.
\end{itemize}
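A quick numeric check (illustrative sketch) of $I(U;V) = H(U) + H(V) - H(U,V)$ on a small joint pmf:
\begin{lstlisting}[language=Python]
import numpy as np

def H(p):
    p = np.asarray(p, dtype=float).ravel()
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))

pxy = np.array([[0.4, 0.1],          # joint pmf, rows = x, cols = y
                [0.1, 0.4]])
px, py = pxy.sum(axis=1), pxy.sum(axis=0)
I = H(px) + H(py) - H(pxy)           # I(X;Y); 0 iff pxy = outer(px, py)
print(I)                             # ~0.278 bits
\end{lstlisting}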
\subsection{Conditional Mutual Information}
\begin{align*}
I(X;Y|Z) &= I(X;Y,Z) - I(X;Z)\\
&= H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z)\\
&= H(X|Z) - H(X|Y,Z)\\
&= \sum_z p(z) I(X;Y|Z=z)\\
&= \sum_z \sum_{y} \sum_{x}
p(x,y,z) \log{ \left(\frac{p(x,y|z)}{p(x|z)\,p(y|z)}
\right) }
\end{align*}
\subsection{Relative Entropy / KL Divergence}
$$ D(p || q) = \sum_x p(x) \log \frac{p(x)}{q(x)}$$
\subsubsection{Properties}
\begin{itemize}
\item Information inequality / Gibbs' inequality: $D(p||q) \geq 0$ with equality iff $p(x) = q(x)$ for all $x$.
\item Additive for independent distributions: If $P_1, P_2$ are independent distributions, with the joint distribution $P(x,y) = P_1(x)P_2(y)$, and $Q, Q_1, Q_2$ likewise, then:
$$D_{\mathrm{KL}}(P \| Q) = D_{\mathrm{KL}}(P_1 \| Q_1) + D_{\mathrm{KL}}(P_2 \| Q_2)$$
\end{itemize}
\section{Entropy Rate \& Typical Sets}
Entropy rate of a stochastic process $\{X_i\}$ is
$$\mathcal{H}(X) = \lim_{n \rightarrow \infty} \frac{1}{n} H(X_1, X_2, ..., X_n)$$
If the process is stationary, the entropy rate exists and equals
$$\mathcal{H}(X) = \lim_{n \rightarrow \infty} H(X_n | X_{n-1}, X_{n-2}, ..., X_1)$$
\subsection{Asymptotic Equipartition Property}
Information Theory's analog to the law of large numbers: If $X_1, X_2, ...$ are i.i.d. $\sim p(x)$, then
$$ - \frac{1}{n} \log p(X_1, X_2, ..., X_n) \rightarrow H(X) \text{ in probability}$$
Or alternatively,
$$ p(X_1, X_2, ..., X_n) \approx 2^{-n H(X)}$$
\subsection{Typical Set}
The typical set $T_\epsilon$ is the set of sequences $(x_1, x_2, ..., x_n) \in \alphabet^n$ with empirical entropies $\epsilon$-close to the true entropy of $X$:
\begin{align*}
&p(x_1, x_2, ..., x_n) = 2^{-n(H(X) \pm \epsilon)}\\
\Leftrightarrow & - \frac{1}{n} \log p(x^n) = H(X) \pm \epsilon\\
\Leftrightarrow &\left | - \frac{1}{n} \log p(x^n) - H(X) \right | < \epsilon
\end{align*}
\subsubsection{Properties of the typical set:}
\begin{itemize}
\item Has probability nearly 1: $P(T_\epsilon) > 1 - \epsilon$
\item All elements in $T_\epsilon$ are nearly equiprobable (with probability as given above)
\item The number of elements in $T_\epsilon$ is nearly $2^{nH}$
\end{itemize}
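A small simulation (illustrative sketch) of the AEP: the normalized log-probability of an i.i.d. sample concentrates around $H(X)$, i.e. the sample lands in $T_\epsilon$:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
p = np.array([0.7, 0.2, 0.1])
H = float(-np.sum(p * np.log2(p)))           # ~1.157 bits

n, eps = 10_000, 0.05
x = rng.choice(len(p), size=n, p=p)          # i.i.d. draws
emp = -np.log2(p[x]).sum() / n               # -(1/n) log p(X_1,...,X_n)
print(H, emp, abs(emp - H) < eps)            # typically True for large n
\end{lstlisting}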
\section{Coding schemes}
\subsection{Tunstall Coding}
\begin{algorithmic}[1]
\State {$D$ := tree of $|\mathcal{U}|$ leaves, one for each letter in $\mathcal{U}$.}
\While{$|D| < C$}
\State {Convert most probable leaf to $|\mathcal{U}|$ sub-leaves.}
\EndWhile
\end{algorithmic}
\subsection{Huffman Coding}
\begin{algorithmic}[1]
\State {Leaf node for each symbol in $\mathcal{U}$, add it to the priority queue.}
\While {there is more than one node in the queue}:
\State {Remove the two nodes of highest priority (lowest probability) from the queue}
\State {Create a new internal node with these two nodes as children and with probability equal to the sum of the two nodes' probabilities.}
\State {Add the new node to the queue.}
\EndWhile
\State {The remaining node is the root node and the tree is complete.}
\end{algorithmic}
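A compact Python sketch of this procedure using a binary heap (illustrative, not from the notes):
\begin{lstlisting}[language=Python]
import heapq

def huffman(pmf):
    """Prefix-free code {symbol: bitstring} for pmf {symbol: prob}."""
    heap = [(p, i, {s: ""}) for i, (s, p) in enumerate(pmf.items())]
    heapq.heapify(heap)
    count = len(heap)
    while len(heap) > 1:
        p1, _, c1 = heapq.heappop(heap)       # two least probable nodes
        p2, _, c2 = heapq.heappop(heap)
        merged = {s: "0" + w for s, w in c1.items()}
        merged.update({s: "1" + w for s, w in c2.items()})
        heapq.heappush(heap, (p1 + p2, count, merged))
        count += 1                            # tie-breaker for the heap
    return heap[0][2]

print(huffman({"a": .5, "b": .25, "c": .125, "d": .125}))
# e.g. {'a': '0', 'b': '10', 'c': '110', 'd': '111'}; E[len] = H = 1.75
\end{lstlisting}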
\subsection{Lempel-Ziv coding}
\begin{algorithmic}[1]
\State {Set $\mathcal D = \alphabet$}
\Loop
\State Associate to each $w \in \mathcal D$ a $\lceil \log | \mathcal D | \rceil$-bit binary representation, based on dictionary order.
\State Parse the next word $w$ from the source sequence using $\mathcal D$
\State Set $\mathcal D \gets \mathcal D \setminus \{w\} \cup \{ w u: u \in \alphabet \}$
\EndLoop
\end{algorithmic}
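An illustrative Python sketch of this parsing rule (it assumes the sequence ends exactly on a word boundary; real coders flush the tail):
\begin{lstlisting}[language=Python]
from math import ceil, log2

def lz_encode(seq, alphabet):
    D = sorted(alphabet)                       # initial dictionary
    out, i = [], 0
    while i < len(seq):
        words = sorted(D)                      # dictionary order
        bits = max(1, ceil(log2(len(words))))  # ceil(log |D|) bits per word
        # longest dictionary word that is a prefix of the remaining sequence
        w = max((v for v in words if seq.startswith(v, i)), key=len)
        out.append(format(words.index(w), f"0{bits}b"))
        D.remove(w)                            # replace w by its extensions
        D.extend(w + u for u in alphabet)
        i += len(w)
    return "".join(out)

print(lz_encode("abababbba", "ab"))            # parses a|b|ab|abb|ba
\end{lstlisting}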
\section{Source Coding}
A source code $\code$ for a random variable $X$ is a mapping from the range of $X$ to $D^*$, the set of finite-length strings of symbols from a $D$-ary alphabet.
The expected length of a uniquely decodable $D$-ary source code $\code$ for a random variable $X$ with probability mass function $p(x)$ satisfies
$$L(\code) = \sum_{x \in \alphabet} p(x) l(x) \geq H_D(X)$$
with equality iff $D^{-l(x)} = p(x)$ for all $x$.
For an optimal prefix-free code the expected codeword length also satisfies $E[len(\code(U))] < H(U) + 1$.
Prefix-free codes can always be represented as leaves of a full tree.
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{code-classes.png}
\end{colfig}
\subsection{Kraft's inequality}
$$\sum_{u \in \alphabet} 2^{-len(\code(u))} \leq 1$$
$\code$ prefix free $\implies$ Kraft
$\code$ uniquely decodable $\implies$ Kraft
$l(u)$ fulfills Kraft $\implies$ $\exists \ \code$ prefix free with $len(\code(u)) = l(u)$
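A quick check of the inequality for a list of codeword lengths (illustrative sketch):
\begin{lstlisting}[language=Python]
def kraft_sum(lengths, D=2):
    """sum_u D^{-l(u)}; <= 1 iff a prefix-free code with these lengths exists."""
    return sum(D ** -l for l in lengths)

print(kraft_sum([1, 2, 3, 3]))   # 1.0  -> prefix-free code exists (0, 10, 110, 111)
print(kraft_sum([1, 1, 2]))      # 1.25 -> no uniquely decodable code possible
\end{lstlisting}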
\subsection{Dictionaries}
A dictionary $\mathcal{D}$ is valid if every source sequence has a prefix in $\mathcal{D}$. It is prefix-free if no dictionary word is a prefix of another.
If $W$ is the first word obtained when parsing an iid source $U_1,U_2,...$ with a valid prefix-free dictionary $\mathcal{D}$, then $$H(W) = E[length(W)]H(U)$$
$$\frac{\text{nb of bits}}{\text{letter}} \rightarrow \frac{\lceil\log |\mathcal{D}|\rceil}{E[length(W)]}$$
\subsection{Finite state machine}
A set of states $S$, a starting state, some state transfer function $g: S \times \alphabet \rightarrow S$ and some output function $f: S \times \alphabet \rightarrow \{0,1\}^*$
A legitimate FSM needs to be information lossless, i.e. $\forall s \in S$, $\forall u_1, ..., u_m \neq v_1, ..., v_m$: $g(s, \vec u) = g(s, \vec v) \implies f(s, \vec u) \neq f(s, \vec v)$
For an $L$-state finite state machine the compressibility is $\rho_M \geq 1/L$
The Lempel-Ziv procedure does at least as well as the best finite state machine for a given sequence: $\rho_{LZ}(\vec x) \leq \rho_{FSM}(\vec x)$.
\section{Channel Coding}
\subsection{Discrete Memoryless Channels}
memoryless: $P(y_i|x_i, x_1^{i-1}, y_1^{i-1}) = p(y_i|x_i)$
without feedback if $P(x_i|x_1^{i-1},y_1^{i-1}) = P(x_i|x_1^{i-1})$
memoryless without feedback: $$P(y_1^n | x_1^n) = \prod_{i=1}^n p(y_i|x_i)$$
$$I(X_1^n;Y_1^n) \leq \sum_{i=1}^n I(X_i;Y_i) \leq nC$$
\subsection{Channel Capacity}
Information channel capacity of a discrete memoryless channel:
$$C = \max_{p(x)} I(X; Y)$$
Feedback does not increase the capacity for a discrete memoryless channel. Tinkering with the channel by using functions on it does not increase the capacity, either. However, channels with memory can have higher capacity than without memory.
\subsubsection{Properties}
\begin{enumerate}
\item Nonnegative: $C \geq 0$ since $I(X; Y) \geq 0$
\item $C \leq \log | \mathcal{X} |$ since $C = \max I(X; Y) \leq \max H(X) = \log | \mathcal{X} |$; similarly $C \leq \log | \mathcal{Y} |$.
\end{enumerate}
The capacity subject to a cost constraint $\beta$ and a cost function $b(x)$ changes the definition to $C = \max_{p(x): E[b(X)] < \beta} I(X; Y)$
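One standard way to evaluate $C = \max_{p(x)} I(X;Y)$ numerically is the Blahut-Arimoto algorithm; a minimal sketch (not from the notes), sanity-checked on a BSC:
\begin{lstlisting}[language=Python]
import numpy as np

def capacity(W, iters=500):
    """Blahut-Arimoto estimate of C for channel matrix W[x, y] = p(y|x)."""
    nx = W.shape[0]
    p = np.full(nx, 1.0 / nx)                  # start from the uniform input
    for _ in range(iters):
        q = p[:, None] * W
        q /= q.sum(axis=0, keepdims=True)      # q(x|y), the "reverse" channel
        logp = np.sum(W * np.log(q + 1e-300), axis=1)
        p = np.exp(logp - logp.max())
        p /= p.sum()                           # updated input distribution
    pxy = p[:, None] * W                       # I(X;Y) in bits at the final p
    py = pxy.sum(axis=0)
    m = pxy > 0
    return float(np.sum(pxy[m] * np.log2(pxy[m] / (p[:, None] * py[None, :])[m])))

W = np.array([[0.9, 0.1], [0.1, 0.9]])         # BSC, crossover 0.1
print(capacity(W))                             # ~0.531 = 1 - h2(0.1)
\end{lstlisting}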
\subsection{Transmission}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{comm-setup.png}
\end{colfig}
Source produces a letter each $\tau_S$ sec and channel accepts input symbols every $\tau_C$ sec: $\tau_S L = \tau_C n$
Suppose $U_1U_2..$ is a stationary source with Entropy Rate $\mathcal{H}$, and we have the setup above, then $\frac{1}{L} H(U_1..U_L|V_1..V_L) \geq \mathcal{H} - \frac{n}{L}C = \tau_S\left(\frac{\mathcal{H}}{\tau_S}-\frac{C}{\tau_C}\right)$
No matter how encoder and decoder are designed:
$$ h_2(\bar{P_e}) + \bar{P_e}\log(|\mathcal{U}|-1) \geq \tau_S \left(\frac{\mathcal{H}}{\tau_S}-\frac{C}{\tau_C}\right)$$ with $\bar{P_e} = \frac{1}{L} \sum_{i=1}^L P(U_i \neq V_i)$
Bad news: if $\frac{\mathcal{H}}{\tau_S} > \frac{C}{\tau_C}$ then $\bar{P_e}$ cannot be made arbitrarily small.
Good news: if $\frac{\mathcal{H}}{\tau_S} < \frac{C}{\tau_C}$ we can achieve $\bar{P_e} \rightarrow 0$.
\subsection{Channel Coding Theorem}
$M$ is the number of different messages we can send over the channel.
$R = \frac{\log M}{n}$ is the rate of the channel code, i.e. how many message bits are carried per channel use.
For a discrete memoryless channel, all rates below capacity $C$ are achievable. Specifically, for every rate $R < C$ there exists a sequence of $(2^{nR}, n)$ codes with maximum probability of error $\rightarrow 0$.
Conversely, any sequence of $(2^{nR}, n)$ codes with a maximum error probability going to zero must have $R \leq C$.
\subsection{Source-channel theorem}
A stochastic process with entropy rate $H$ cannot be sent reliably over a discrete memoryless channel if $H > C$. Conversely, if the process satisfies the asymptotic equipartition property (AEP), the source can be transmitted reliably if $H < C$.
\subsection{Singleton bound}
If number of codewords $M > 2^k$ then $$d_{min} = \min_{1\leq i \leq M, 1 \leq j \leq M, i \neq j} d_H(x^{(i)}, x^{(j)}) \leq n-k$$
\section{Example Channels}
\subsection{Binary Symmetric channel}
$C = 1 - H(p)$ because:
\begin{align*}
I(X; Y) &= H(Y) - H(Y | X)\\
&= H(Y) - \sum p(x) H(Y| X = x)\\
&= H(Y) - \sum p(x) H(p)\\
&= H(Y) - H(p)\\
&\leq 1 - H(p).
\end{align*}
\begin{colfig}
\centering
\includegraphics[width=0.6\linewidth]{binary-symmetric-channel.png}
\end{colfig}
\subsection{Binary erasure channel}
$C = 1 - \alpha$
\begin{colfig}
\centering
\includegraphics[width=0.6\linewidth]{binary-erasure-channel.png}
\end{colfig}
\subsection{Arbitrary symmetric channel}
A channel is said to be \emph{weakly symmetric} if every row of the transition matrix $p(\cdot | x)$ is a permutation of every other row and all the column sums $\sum_x p(y|x)$ are equal. For weakly symmetric channels:
$C = \log | \mathcal{Y} | - H(\text{row of transition matrix})$
and this is achieved by a uniform distribution on the input alphabet.
\section{Differential Entropy}
\begin{align*}
h(X) & = -\int_\mathbb{X} f(x)\log f(x)\,dx \\
& = E[-\log f(X)]
\end{align*}
\begin{align*}
D(f||g) & = \int_\mathbb{X} f(x) \log \frac{f(x)}{g(x)} \, dx \\
& = E_f[\log \frac{f(X)}{g(X)}]
\end{align*}
\begin{align*}
I(X;Y) & = h(X) - h(X|Y)\\
& = h(Y) - h(Y|X)\\
& = \int_\mathbb{X} \int_\mathbb{Y} f(x,y) \log \frac{f(y|x)}{f(y)}\,dx\,dy \\
& = D(f_{XY} || f_Xf_Y)
\end{align*}
\subsubsection{Examples}
\begin{itemize}
\item For the uniform distribution $\mathcal{U}[0, a]$:
$h(X) = - \int_0^a \frac{1}{a} \log \frac{1}{a} dx = \log a$
\item For the normal:
$h(X) = \frac{1}{2} \log (2 \pi e \sigma^2)$
\end{itemize}
\subsubsection{Properties:}
\begin{itemize}
\item General chain rule: $h(X_1, \ldots, X_n) = \sum_{i=1}^{n} h(X_i|X_1, \ldots, X_{i-1}) \leq \sum h(X_i)$
\item Translation-Invariant: $h(X + c) = h(X)$
\item Scaling: $h(aX) = h(X) + \log|a|$
\item Conditioning reduces Entropy: $h(X|Y) \leq h(X)$ with equality iff $X$ and $Y$ are independent.
\item Can be negative, unlike discrete entropy.
\item Divergence and mutual information are non-negative.
\item Entropy of $X$ (on $\mathbb{R}$ with zero mean and variance $\sigma^2$) is upper bounded by: $h(X) \leq \frac{1}{2} \log ( 2\pi e \sigma^2)$, with equality if $X$ is a Gaussian.
\item Entropy of $X$ (on $[0, \infty)$ with mean $\lambda$) is upper bounded by $h(X) \leq \ln(\lambda) + 1$ (in nats) with equality if $X$ is exponentially distributed, $f(x) = \frac{1}{\lambda} e^{-x/\lambda}$
\item Entropy of $X$ (on $[0, a]$) is bounded by $h(X) \leq \log a$, with equality if $X$ is Uniform.
\end{itemize}
\subsubsection{Vector extensions}
\begin{itemize}
\item Entropy of $\boldsymbol X$ (on $\mathbb{R}^d$ with zero mean and $Cov(\boldsymbol X) = K$) is upper bounded by: $h(\boldsymbol X) \leq \frac{1}{2} \log \det (2\pi eK)$, with equality if $\boldsymbol X$ is jointly Gaussian.
\item $h(A\boldsymbol X) = h(\boldsymbol X) + \log|\det A|$
\item $h(\boldsymbol X + \boldsymbol c) = h(\boldsymbol X)$
\end{itemize}
\subsection{Typical Set}
Entropy typical set: $T^n(\epsilon) = \{\bar{x}: |-\frac{1}{n}\sum_{i=1}^n \log f(x_i) - h(X)| \leq \epsilon\}$
Strong typicality: $|\frac{1}{n} \sum_{i=1}^n \mathbb{I}\{x_i = x\} - p(x)| \leq \epsilon \, p(x) \quad \forall x$
\subsubsection{Properties}
\begin{itemize}
\item $P\left(\bar{x} \in T^n(\epsilon)\right) \rightarrow 1$ as $n \rightarrow \infty$.
\item $Vol(T^n(\epsilon)) = 2^{n(h(X)+\delta(\epsilon))}$, where $Vol(A) = \int_\mathbb{A} 1\, d\bar{x}$.
\end{itemize}
\vfill
\columnbreak
\section{Gaussian Channel}
$Y_i = X_i + Z_i$ with $Z_i \sim \mathcal{N}(0, \sigma^2)$ and cost constraint $\frac{1}{n}\sum_{i=1}^{n} x_i^2 \leq P$ gives Capacity:
$$C = \frac{1}{2} \log(1 + P/\sigma^2)$$
which is attained with $X \sim \mathcal{N}(0, P)$.
\subsection{Sphere packing}
Any received vector is normally distributed around the sent codeword and therefore within a sphere of radius $\sqrt{n (\sigma^2 + \epsilon)}$ around that codeword with high probability. $\rightarrow$ small sphere.
In general, received vectors can not exceed a magnitude of $\sqrt{n(P + \sigma^2)}$. $\rightarrow$ big sphere. By calculating how many small spheres can fit into a big sphere maximally, we can get an upper bound on how many codewords can be distinguished on the receiver side without overlap.
$$ M \approx \left(\frac{n(P + \sigma^2)}{n \sigma^2}\right)^{n/2} = 2^{\frac{n}{2} \log (1 + P/\sigma^2)}$$
This gives $R = \frac{\log M}{n} \leq C$
\subsection{Water-filling ($k$ parallel Gaussian Channels)}
Assume we have a set of $k$ gaussian channels (indexed by $j$) in parallel: $Y_j = X_j + Z_j$ with $Z_j \sim \mathcal{N}(0, \sigma^2_j)$. The noise is assumed independent from channel to channel. There is a common total power constraint: $E[\sum_j^k X_j^2] \leq P$. Then:
\begin{align*}
I(\text{every } X_j; \text{every } Y_j)
&\leq \sum_j h(Y_j) - h(Z_j)\\
&\leq \sum_j \frac{1}{2} \log(1 + P_j/\sigma^2_j)
\end{align*}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{water-filling.png}
\end{colfig}
With equality (i.e. capacity is achieved) if the $X_j$ are independent Gaussians with variance $P_j$, where the $P_j$ need to be found by water filling. This means finding a $\nu$ that gives the $P_j$s and fulfills the power requirements:
$$P_j = (\nu - \sigma^2_j)^+ \text{ and } \sum_j (\nu - \sigma^2_j)^+ = P$$
Where $(x)^+$ is the positive part of $x$, i.e. $0$ if $x < 0$ and $x$ otherwise.
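A small bisection on $\nu$ that realizes this allocation (illustrative sketch; \texttt{noise} holds the $\sigma_j^2$):
\begin{lstlisting}[language=Python]
import numpy as np

def water_fill(noise, P):
    """Find nu with sum_j (nu - sigma_j^2)^+ = P and the resulting P_j, C."""
    noise = np.asarray(noise, dtype=float)
    lo, hi = noise.min(), noise.max() + P
    for _ in range(100):                       # bisection: sum is monotone in nu
        nu = (lo + hi) / 2
        lo, hi = (lo, nu) if np.maximum(nu - noise, 0).sum() > P else (nu, hi)
    Pj = np.maximum(nu - noise, 0.0)
    C = 0.5 * np.log2(1 + Pj / noise).sum()
    return nu, Pj, C

print(water_fill([1.0, 4.0, 9.0], P=6.0))      # nu ~ 5.5, Pj ~ [4.5, 1.5, 0]
\end{lstlisting}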
\section{Codes}
The notation $(n,k,d)_q$ describes a block code over an alphabet $\Sigma$ of size $q$, with a block length $n$, message length $k$, and distance $d$.
If the block code is a linear block code, then the square brackets in the notation $[n,k,d]_q$ are used to represent that fact.
For binary codes with $q=2$, the index is sometimes dropped.
For maximum distance separable codes, the distance is always $d=n-k+1$.
\subsection{Linear Codes}
A code $\mathcal{C} \subseteq \mathbb{F}^n$ is linear if $\forall x,x' \in \mathcal{C}, a,a' \in \mathbb{F}$, $a \cdot x + a' \cdot x' \in \mathcal{C}$, $\cdot$ and $+$ in field $\mathbb{F}$
\textbf{Generator}: a $k \times n$ matrix $G$ over $\mathbb{F}$ s.t. $\mathcal{C} = \{[x_1..x_n] = [u_1..u_k]G: (u_1..u_k) \in \mathbb{F}^k\}$, i.e. any element of $\mathcal{C}$ is a linear combination of $G$'s rows.
$G = [I_k | - A^T]$
\textbf{Parity-check}: an $(n-k) \times n$ matrix $H$ s.t. $\mathcal{C} = \{\bar{x}: \bar{x}H^T = \bar{0}\}$. $H = [A | I_{n-k}]$
$$H G^T = G H^T = 0$$
We send a message $\vec x$ using a codeword $\vec c = \vec x G$ over the channel. The receiver sees $\vec r = \vec c + \vec e$, where the error pattern $\vec e$ marks the bits flipped by the channel.
Distance: $$d_{min}(\mathcal{C}) = w_{min}(\mathcal{C}) = \min_{\bar{x}\in\mathcal{C}, \bar{x}\neq 0} w_H (\bar{x})$$
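A tiny numpy sketch for the binary $[7,4]$ Hamming code in this systematic form (illustrative; over $\mathbb{F}_2$ the minus signs vanish):
\begin{lstlisting}[language=Python]
import numpy as np

A = np.array([[1, 1, 0, 1],                    # (n-k) x k
              [1, 0, 1, 1],
              [0, 1, 1, 1]])
G = np.hstack([np.eye(4, dtype=int), A.T])     # G = [I_k | A^T]
H = np.hstack([A, np.eye(3, dtype=int)])       # H = [A | I_{n-k}]
assert not ((G @ H.T) % 2).any()               # G H^T = 0

u = np.array([1, 0, 1, 1])                     # message
c = (u @ G) % 2                                # codeword x = u G
r = c.copy(); r[2] ^= 1                        # channel flips one bit
print(c, r, (r @ H.T) % 2)                     # nonzero syndrome flags the error
\end{lstlisting}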
\subsection{Binary Linear Codes}
A code $\mathcal{C}$ is a binary linear code if $x,y \in \mathcal{C}$ implies $x \oplus y \in \mathcal{C}$, where $\oplus$ is componentwise modulo-2 addition. The size of the codebook is always a power of two: $|\mathcal{C}| = 2^k$.
\subsubsection{Sphere packing bound}
Given $d$, we can not have too many codewords:
$$M \leq \frac{2^n}{\sum_{i=0}^{r = \lfloor \frac{d-1}{2} \rfloor} {n \choose i}} = \frac{|\text{all length-$n$ words}|}{|\text{words in a Hamming ball of radius } r|}$$
\subsubsection{Gilbert-Varshamov for binary codes}
The maximum possible size $M_{max}$ of a binary code of length $n$ and minimum Hamming distance $d$ is
$$M_{max} \geq \frac{2^n}{\sum_{i=0}^{d-1} {n \choose i}} = \frac{|\text{all length-$n$ words}|}{|\text{words in a Hamming ball of radius } d-1|}$$
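Both bounds evaluated for $n=7$, $d=3$ (illustrative sketch; the $[7,4]$ Hamming code attains the sphere-packing bound):
\begin{lstlisting}[language=Python]
from math import comb

def sphere_packing(n, d):                      # upper bound on M
    r = (d - 1) // 2
    return 2 ** n / sum(comb(n, i) for i in range(r + 1))

def gilbert_varshamov(n, d):                   # lower bound on M_max
    return 2 ** n / sum(comb(n, i) for i in range(d))

print(gilbert_varshamov(7, 3), sphere_packing(7, 3))   # ~4.4 <= M_max, M <= 16
\end{lstlisting}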
\subsection{Reed-Solomon Codes}
Linear code $\mathcal{C} \subset \mathbb{F}^n$ with $|\mathcal{C}| = |\mathbb{F}|^k$ given $n \leq |\mathbb{F}|$ and $k \leq n$:
\begin{itemize}
\item pick $n$ distinct elements $\alpha_1, ..., \alpha_n \in \mathbb{F}$:
$\mathcal{C} = \{(x_1,..,x_n) = \left( u(\alpha_1),..,u(\alpha_n)\right) : (u_0 ... u_{k-1}) \in \mathbb{F}^k\}$ with $u(D) = u_0 + u_1D + .. + u_{k-1}D^{k-1}$
\end{itemize}
Distance: $d_{min} \geq n-(k-1)$ (in fact with equality)
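A minimal evaluation encoder over the prime field $\mathbb{F}_7$ (illustrative sketch):
\begin{lstlisting}[language=Python]
q, n, k = 7, 6, 3                    # requires n <= q = |F| and k <= n
alphas = list(range(1, n + 1))       # n distinct evaluation points in F_7

def rs_encode(u):
    """u = (u_0, ..., u_{k-1}); codeword x_i = u(alpha_i) in F_q."""
    return [sum(c * pow(a, j, q) for j, c in enumerate(u)) % q for a in alphas]

c1, c2 = rs_encode([1, 2, 3]), rs_encode([1, 2, 4])
print(sum(a != b for a, b in zip(c1, c2)))   # 6 >= d_min = n - k + 1 = 4
\end{lstlisting}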
\subsection{Hamming Codes}
Hamming codes are $[2^m - 1,\, 2^m - m - 1,\, 3]$ codes (their extended versions are $[2^m,\, 2^m - m - 1,\, 4]$); the $m$ extra bits serve as parity checks.
\section{Polar Codes}
Channels are polarized in the sense that their mutual information is either close to 0 (completely noisy channels) or close to 1 (perfectly noiseless channels).
\subsection{Phase 1: Channel Combining}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{polar-basic-scheme.png}
\end{colfig}
Two separate channels are combined to create a new vector channel of size 2, which is $W_2: \{0, 1\}^2 \rightarrow \mathcal{Y}^2$. Since the linear transform between $(U_1, U_2)$ and $(X_1, X_2)$ is a one-to-one mapping, the following equality holds:
$$I(U_1, U_2; Y_1, Y_2) = I(X_1, X_2; Y_1, Y_2) = 2I(W)$$
The channel combining for general $N = 2^n$ is done recursively in the following way: $W_N: X_N \rightarrow Y_N$ is defined as:
\begin{multline*}
W_N(Y^N |U^N) = \\
W_{N/2}(Y^{N/2}|U_{odd}^N \oplus U_{even}^N) \cdot W_{N/2}(Y_{N/2+1}^{N}|U^N_{even})
\end{multline*}
where $U_{odd}^{N} = (u_1, u_3, ..., u_{N-1})$ and $U_{even}^{N} = (u_2, u_4, ..., u_N)$ are the odd- and even-indexed inputs from the lower layer. For 8 channels:
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{polar-eight-inputs.png}
\end{colfig}
\subsection{Phase 2: Channel Splitting}
In the basic case, $N = 2$, the chain rule of mutual information suggests that the channel $W_2$ can be split into two channels $W^+$ and $W^-$:
$$I(U_1, U_2; Y_1, Y_2) = I(U_1; Y_1, Y_2) + I(U_2; Y_1, Y_2, U_1)$$
$W^- :U_1 \rightarrow (Y_1, Y_2)$
$W^+ :U_2 \rightarrow (Y_1, Y_2, U_1)$
Then we have $2I(W) = I(W^-) + I(W^+)$ and $I(W^-) \leq I(W) \leq I(W^+)$, i.e. the channels are pushed towards the extremes 0 and 1 with respect to $I(W)$ while their sum is still $2I(W)$.
$(W,W,W,W) \rightarrow (W^{--},W^{-+},W^{+-},W^{++})$
$\underbrace{(W,..,W)}_{2^l} \rightarrow \underbrace{(W^{\overbrace{-..-}^l},..,W^{\overbrace{+..+}^l})}_{2^l}$
\textbf{Moderate channel}: there is a function $\mathcal{K}(\delta)$ with $\mathcal{K}(\delta) > 0$, for $0 < \delta \leq \frac{1}{2}$, $\mathcal{K}(0) = 0$, s.t. for any channel $p$,
$$ I(p) \in (\delta,1-\delta) \Rightarrow I(p^+) - I(p^-) = 2[I(p) - I(p^-)] \geq \mathcal{K}(\delta)$$
Fraction of good channels is $\approx I(p)$, bad channels is $\approx 1-I(p)$, moderate channels is $\approx 0$.
\textbf{Code construction}: can achieve any rate $R < I(p)$. Encoding complexity is $\frac{n}{2} \log_2 n$ XORs for a blocklength-$n$ code. Decoding complexity is $\mathcal{O}(n\log n)$.
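For the binary erasure channel the splitting step has the closed form $I(W^-) = I(W)^2$ and $I(W^+) = 2I(W) - I(W)^2$; a small sketch (illustrative) of the polarization this recursion produces:
\begin{lstlisting}[language=Python]
import numpy as np

I0, levels = 0.6, 10                 # I(W) for a BEC, n = 2^levels synthetic channels
I = np.array([I0])
for _ in range(levels):
    I = np.concatenate([I * I, 2 * I - I * I])   # (I(W^-), I(W^+)) per channel
print(np.mean(I > 0.99), np.mean(I < 0.01), I.mean())
# fractions of near-perfect / near-useless channels; the mean stays exactly I(W)
\end{lstlisting}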
\section{Utilities}
\subsection{Convexity}
$f$ is convex if: $f((1-t)x+ t y)\leq (1-t) f(x)+ t f(y)$ for all $t \in [0,1]$
Let $\{f_i\}$ be a set of convex functions and $c_i \geq 0 \ \forall i$:
Then $\sum_{i=1}^n c_i f_i(x_i)$ is convex.
$f(x) = \sup_{i \in I} f_i(x)$ is convex.
$g = h \circ f$ is convex if $f$ is convex and $h$ is non-decreasing and convex.
\subsection{Kuhn-Tucker}
Given $f(p_1,..,p_n)$, a concave, twice differentiable function defined on $\mathcal{P} = \{\boldsymbol p \in \mathbb{R}^n : p_i \geq 0, \sum p_i = 1\}$. A point $\boldsymbol p^* \in \mathcal{P}$ maximizes $f(\boldsymbol p)$ iff $\exists \lambda$ s.t.
\begin{align*}
\frac{\partial f(\boldsymbol p^*)}{\partial p_i} & \leq \lambda \quad \forall i \\
& = \lambda \quad \forall i \text{ s.t. } p_i^* > 0
\end{align*}
\subsection{Hamming distance \& weight}
Given $\bar{x}, \bar{y}, \bar{z} \in \{0,1\}^n$, the Hamming distance is:
$$d_H(\bar{x},\bar{y}) = \#\{i:y_i \neq x_i\}$$
Given $\bar x$, the Hamming weight is the number of 1s:
$$w_H(\bar{x}) = \#\{i: x_i \neq 0\}$$
\subsubsection{Properties:}
\begin{itemize}
\item Positive: $d_H(\bar{x},\bar{y}) \geq 0$ with equality iff $\bar{x} = \bar{y}$
\item Symmetric: $d_H(\bar{x},\bar{y}) = d_H(\bar{y},\bar{x})$
\item Triangle inequality: $d_H(\bar{x},\bar{z}) \leq d_H(\bar{x},\bar{y}) + d_H(\bar{y},\bar{z})$
\item Distance is the weight of the sum: $d_H(\bar{x},\bar{y}) = w_H(\bar x + \bar y)$
\item Weight is the distance to 0:
$w_H(\bar x) = d_H(\bar x, \vec 0)$
\item Triangle inequality for the weight:
$w_H(\bar x + \bar y) \leq w_H(\bar x) + w_H(\bar y)$
\end{itemize}
\subsection{Miscellaneous}
$\ln(x) \leq x - 1$
$x \leq \lceil x \rceil < x + 1$
$E[Z] = \sum_{n=0}^{\infty} Pr[Z > n]$ for $Z$ taking values in $\{0, 1, 2, ...\}$
Jensen's inequality for convex functions $f$: $ E[f(X)] \geq f(E[X])$ ($\leq$ for concave)
$a^b = 2^{ b \log a}$
Law of large numbers: for i.i.d. $X_i$, $\frac{1}{n} \sum_{i=1}^n X_i \rightarrow E[X]$ as $n \rightarrow \infty$ (almost surely).
If a $K$-ary tree has $\alpha$ intermediate nodes, the tree has $1+(K-1)\alpha$ leaves.
$\max(f(t) + g(t)) \leq \max(f(t)) + \max(g(t))$
$h_2 \left(\frac{1}{L}\sum_{i=1}^L p_i \right) \geq \frac{1}{L} \sum_{i=1}^L h_2(p_i)$
Normal distribution: $f(x \; | \; \mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi} } \; e^{ -\frac{(x-\mu)^2}{2\sigma^2} }$
Integration by parts: $\int u(x) v'(x) \, dx = u(x) v(x) - \int v(x) \, u'(x) dx$
Total probabilities: $p(x) = \sum_y p(x,y)$
Bayes: $P(X|Y) = \frac{P(X,Y)}{P(Y)} = \frac{P(Y|X)P(X)}{P(Y)}$
Variance: $Var[X] = E[(X-E[X])^2] = E[X^2] - E[X]^2$
Geometric progressions:
$$\sum_{k=0}^n r^k = \frac{1-r^{n+1}}{1-r},\, \forall r \in \mathbb{R}-\{1\}$$
$$\sum_{k=1}^\infty k(1-p)^{k-1} = \frac{1}{p^2}, \quad 0 < p < 1$$
Markov chain: $A-B-C-D$ then $p(a,b,c,d) = p(a)p(b|a)p(c|b)p(d|c)$, and e.g. $A$ and $C$ are conditionally independent given $B$ (similarly $A,D$ given $B$ or $C$, and $B,D$ given $C$).
Union Bound: ${\mathbb P}\biggl(\bigcup_{i} A_i\biggr) \le \sum_i {\mathbb P}(A_i)$
\end{multicols*}
\end{document}