\documentclass[10pt,a4paper,landscape]{article}
\usepackage{multicol}
\usepackage{calc}
\usepackage{ifthen}
\usepackage[landscape]{geometry}
\usepackage{amsmath,amsthm,amsfonts,amssymb,mathtools}
\usepackage{color,graphicx}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{underscore}
\usepackage{todonotes}
\usepackage{algpseudocode}
\usepackage{algorithm}
% Cheatsheet style
\input{style.tex}
% Shorthands
\renewcommand{\bf}[1]{\ensuremath{\mathbf{#1}}}
\newcommand{\E}{\mathrm{E}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\balpha}{\boldsymbol\alpha}
\newcommand{\bbeta}{\boldsymbol\beta}
\newcommand{\bdelta}{\boldsymbol\delta}
\newcommand{\btheta}{\boldsymbol\theta}
\newcommand{\bPhi}{\boldsymbol\Phi}
\newcommand{\code}{\mathcal{C}}
\newcommand{\alphabet}{\mathcal{U}}
\pdfinfo{
/Title (Information Theory Cheat Sheet)
/Creator (TeX)
/Producer (pdfTeX 1.40.0)
/Author (Dennis Meier)
/Subject (Information Theory cheatsheet)
/Keywords (information theory, entropy, coding, channels)
}
% -----------------------------------------------------------------------
\begin{document}
\title{Information Theory Cheat Sheet}
\raggedright
\footnotesize
\sffamily
\begin{multicols*}{4}
% multicol parameters
% These lengths are set only within the two main columns
%\setlength{\columnseprule}{0.25pt}
\setlength{\premulticols}{1pt}
\setlength{\postmulticols}{1pt}
\setlength{\multicolsep}{1pt}
\setlength{\columnsep}{2pt}
\begin{center}
\Large{\underline{Information Theory Cheat Sheet}}
\end{center}
% ----------
\section{Entropy and co.}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{entropy-quantities-venn-diagram.png}
\end{colfig}
\subsection{Entropy}
$H(U) = \sum_u - P(u) \log_2 P(u) = E[- \log_2 P(U)]$
\subsubsection{Properties:}
\begin{itemize}
\item $H(U) \leq \log | \alphabet |$ with equality iff $U$ is uniform over $\alphabet$.
\item $H(U) \geq 0$ always.
\item $H(U) \geq H(f(U))$
\end{itemize}
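A minimal Python sketch (illustrative, not part of the original notes) of the definition and the $\log|\alphabet|$ bound:
\begin{lstlisting}[language=Python]
import numpy as np

def entropy(p):
    """Entropy in bits of a pmf given as an array of probabilities."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]                     # convention: 0 log 0 = 0
    return float(-np.sum(p * np.log2(p)))

print(entropy([0.5, 0.25, 0.125, 0.125]))  # 1.75
print(entropy(np.ones(4) / 4))             # 2.0 = log2(4), the maximum
\end{lstlisting}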
\subsection{Joint Entropy}
$H(X,Y) = H(X) + H(Y | X) = H(Y) + H(X | Y)$
\subsubsection{Properties:}
\begin{itemize}
\item Adding a variable never decreases entropy: $H(X, Y) \geq H(X)$ with equality iff $Y$ is a deterministic function of $X$.
\item $H(X,Y) \leq H(X) + H(Y)$ with equality iff $X$ and $Y$ are independent.
\item Chain rule: $H(X, Y) = H(X) + H(Y | X)$
\item General chain rule: $H(X_1, X_2, ..., X_n) = \sum_{i=1}^n H(X_i | X_1, ..., X_{i-1})$
\end{itemize}
\subsection{Conditional Entropy}
$H(U | V = v) = \sum_u - P(u | v) \log P(u | v)$
$H(U | V) = \sum_v P(v) H(U | V = v)$
\subsubsection{Properties:}
\begin{itemize}
\item Conditioning reduces entropy: $H(U | V) \leq H(U)$
\item $H(U | V) \leq H(U | f(V))$ since $U - V - f(V)$ form a Markov chain.
\item Chain rule: $H( U | V) = H(U,V) - H(V)$
\item If $\{X_i\}$ is a stationary stochastic process: $H(X_n | X^{n-1}) \leq H(X_i | X^{i-1})$ for $i \in \{1, ..., n\}$
\end{itemize}
\subsubsection{Fano's Inequality}
Suppose $U,V$ are random variables with the same alphabet. Let $p=Pr(U \neq V)$. Then $ H(U|V) \leq h_2(p) + p\log (|\mathcal{U}|-1)$ where $h_2(p) = p\log \frac{1}{p} + (1-p)\log\frac{1}{1-p}$
\subsection{Mutual Information}
\begin{align*}
I(U; V) &= H(U) + H(V) - H(U,V)\\
&= H(U) - H(U | V)\\
&= H(V) - H(V | U)\\
&= H(U,V) - H(U | V) - H (V | U)\\
&= D(p_{UV} || p_Up_V)
\end{align*}
\subsubsection{Properties:}
\begin{itemize}
\item Nonnegative: $I(U;V) \geq 0$ with equality iff $U, V$ are independent.
\item Chain rule: $I(X; Y, Z) = I(X;Z) + I(X; Y | Z)$
\item General chain rule for mutual Information: $I(X_1, ..., X_n; Y) = \sum_{i=1}^n I(X_i; Y | X_1, ..., X_{i-1})$
\item \textbf{Data processing} inequality: $I(X;Y) \geq I(X;Z)$ if $X - Y - Z$ form a Markov chain.
\item Symmetric: $I(X;Y) = I(Y;X)$.
\item $I(X;Y)$ is concave in $p(x)$ for fixed $p(y|x)$ and convex in $p(y|x)$ for fixed $p(x)$.
\end{itemize}
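A quick numeric check (illustrative sketch) of $I(U;V) = H(U) + H(V) - H(U,V)$ on a small joint pmf:
\begin{lstlisting}[language=Python]
import numpy as np

def H(p):
    p = np.asarray(p, dtype=float).ravel()
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))

pxy = np.array([[0.4, 0.1],          # joint pmf, rows = x, cols = y
                [0.1, 0.4]])
px, py = pxy.sum(axis=1), pxy.sum(axis=0)
I = H(px) + H(py) - H(pxy)           # I(X;Y); 0 iff pxy = outer(px, py)
print(I)                             # ~0.278 bits
\end{lstlisting}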
\subsection{Conditional Mutual Information}
\begin{align*}
I(X;Y|Z) &= I(X;Y,Z) - I(X;Z)\\
&= H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z)\\
&= H(X|Z) - H(X|Y,Z)\\
&= \sum_z p(z) I(X;Y|Z=z)\\
&= \sum_z \sum_{y} \sum_{x}
p(x,y,z) \log{ \left(\frac{p(x,y|z)}{p(x|z)\,p(y|z)}
\right) }
\end{align*}
\subsection{Relative Entropy / KL Divergence}
$$ D(p || q) = \sum_x p(x) \log \frac{p(x)}{q(x)}$$
\subsubsection{Properties}
\begin{itemize}
\item Information inequality / Gibbs' inequality: $D(p||q) \geq 0$ with equality iff $p(x) = q(x)$ for all $x$.
\item Additive for independent distributions: If $P_1, P_2$ are independent distributions, with the joint distribution $P(x,y) = P_1(x)P_2(y)$, and $Q, Q_1, Q_2$ likewise, then:
$$D_{\mathrm{KL}}(P \| Q) = D_{\mathrm{KL}}(P_1 \| Q_1) + D_{\mathrm{KL}}(P_2 \| Q_2)$$
\end{itemize}
\section{Entropy Rate \& Typical Sets}
Entropy rate of a stochastic process $\{X_i\}$ is
$$\mathcal{H}(X) = \lim_{n \rightarrow \infty} \frac{1}{n} H(X_1, X_2, ..., X_n)$$
If the process is stationary, the entropy rate exists and equals
$$\mathcal{H}(X) = \lim_{n \rightarrow \infty} H(X_n | X_{n-1}, X_{n-2}, ..., X_1)$$
\subsection{Asymptotic Equipartition Property}
Information Theory's analog to the law of large numbers: If $X_1, X_2, ...$ are i.i.d. $\sim p(x)$, then
$$ - \frac{1}{n} \log p(X_1, X_2, ..., X_n) \rightarrow H(X) \text{ in probability}$$
Or alternatively,
$$ p(X_1, X_2, ..., X_n) \approx 2^{-n H(X)}$$
\subsection{Typical Set}
The typical set $T_\epsilon$ is the set of sequences $(x_1, x_2, ..., x_n) \in \alphabet^n$ with empirical entropies $\epsilon$-close to the true entropy of $X$:
\begin{align*}
&p(x_1, x_2, ..., x_n) = 2^{-n(H(X) \pm \epsilon)}\\
\Leftrightarrow & - \frac{1}{n} \log p(x^n) = H(X) \pm \epsilon\\
\Leftrightarrow &\left | - \frac{1}{n} \log p(x^n) - H(X) \right | < \epsilon
\end{align*}
\subsubsection{Properties of the typical set:}
\begin{itemize}
\item Has probability nearly 1: $P(T_\epsilon) > 1 - \epsilon$
\item All elements in $T_\epsilon$ are nearly equiprobable (with probability as given above)
\item The number of elements in $T_\epsilon$ is nearly $2^{nH}$
\end{itemize}
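A small simulation (illustrative sketch) of the AEP: the normalized log-probability of an i.i.d. sample concentrates around $H(X)$, i.e. the sample lands in $T_\epsilon$:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
p = np.array([0.7, 0.2, 0.1])
H = float(-np.sum(p * np.log2(p)))           # ~1.157 bits

n, eps = 10_000, 0.05
x = rng.choice(len(p), size=n, p=p)          # i.i.d. draws
emp = -np.log2(p[x]).sum() / n               # -(1/n) log p(X_1,...,X_n)
print(H, emp, abs(emp - H) < eps)            # typically True for large n
\end{lstlisting}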
\section{Coding schemes}
\subsection{Tunstall Coding}
\begin{algorithmic}[1]
\State {$D$ := tree of $|\mathcal{U}|$ leaves, one for each letter in $\mathcal{U}$.}
\While{$|D| < C$}
\State {Convert most probable leaf to $|\mathcal{U}|$ sub-leaves.}
\EndWhile
\end{algorithmic}
\subsection{Huffman Coding}
\begin{algorithmic}[1]
\State {Leaf node for each symbol in $\mathcal{U}$, add it to the priority queue.}
\While {there is more than one node in the queue}:
\State {Remove the two nodes of highest priority (lowest probability) from the queue}
\State {Create a new internal node with these two nodes as children and with probability equal to the sum of the two nodes' probabilities.}
\State {Add the new node to the queue.}
\EndWhile
\State {The remaining node is the root node and the tree is complete.}
\end{algorithmic}
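A compact Python sketch of this procedure using a binary heap (illustrative, not from the notes):
\begin{lstlisting}[language=Python]
import heapq

def huffman(pmf):
    """Prefix-free code {symbol: bitstring} for pmf {symbol: prob}."""
    heap = [(p, i, {s: ""}) for i, (s, p) in enumerate(pmf.items())]
    heapq.heapify(heap)
    count = len(heap)
    while len(heap) > 1:
        p1, _, c1 = heapq.heappop(heap)       # two least probable nodes
        p2, _, c2 = heapq.heappop(heap)
        merged = {s: "0" + w for s, w in c1.items()}
        merged.update({s: "1" + w for s, w in c2.items()})
        heapq.heappush(heap, (p1 + p2, count, merged))
        count += 1                            # tie-breaker for the heap
    return heap[0][2]

print(huffman({"a": .5, "b": .25, "c": .125, "d": .125}))
# e.g. {'a': '0', 'b': '10', 'c': '110', 'd': '111'}; E[len] = H = 1.75
\end{lstlisting}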
\subsection{Lempel-Ziv coding}
\begin{algorithmic}[1]
\State {Set $\mathcal D = \alphabet$}
\Loop
\State Associate to each $w \in \mathcal D$ a $\lceil \log | \mathcal D | \rceil$-bit binary representation, based on dictionary order.
\State Parse the next word $w$ from the source sequence using $\mathcal D$
\State Set $\mathcal D \gets \mathcal D \setminus \{w\} \cup \{ w u: u \in \alphabet \}$
\EndLoop
\end{algorithmic}
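An illustrative Python sketch of this parsing rule (it assumes the sequence ends exactly on a word boundary; real coders flush the tail):
\begin{lstlisting}[language=Python]
from math import ceil, log2

def lz_encode(seq, alphabet):
    D = sorted(alphabet)                       # initial dictionary
    out, i = [], 0
    while i < len(seq):
        words = sorted(D)                      # dictionary order
        bits = max(1, ceil(log2(len(words))))  # ceil(log |D|) bits per word
        # longest dictionary word that is a prefix of the remaining sequence
        w = max((v for v in words if seq.startswith(v, i)), key=len)
        out.append(format(words.index(w), f"0{bits}b"))
        D.remove(w)                            # replace w by its extensions
        D.extend(w + u for u in alphabet)
        i += len(w)
    return "".join(out)

print(lz_encode("abababbba", "ab"))            # parses a|b|ab|abb|ba
\end{lstlisting}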
\section{Source Coding}
A source code $\code$ for a random variable $X$ is a mapping from the range of $X$ to $D^*$, the set of finite-length strings of symbols from a $D$-ary alphabet.
The expected length of a uniquely decodable $D$-ary source code $\code$ for a random variable $X$ with probability mass function $p(x)$ satisfies
$$L(\code) = \sum_{x \in \alphabet} p(x) l(x) \geq H_D(X)$$
with equality iff $D^{-l(x)} = p(x)$ for all $x$.
For an optimal prefix-free code the expected codeword length also satisfies $E[len(\code(U))] < H(U) + 1$.
Prefix-free codes can always be represented as leaves of a full tree.
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{code-classes.png}
\end{colfig}
\subsection{Kraft's inequality}
$$\sum_{u \in \alphabet} 2^{-len(\code(u))} \leq 1$$
$\code$ prefix free $\implies$ Kraft
$\code$ uniquely decodable $\implies$ Kraft
$l(u)$ fulfills Kraft $\implies$ $\exists \ \code$ prefix free with $len(\code(u)) = l(u)$
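A quick check of the inequality for a list of codeword lengths (illustrative sketch):
\begin{lstlisting}[language=Python]
def kraft_sum(lengths, D=2):
    """sum_u D^{-l(u)}; <= 1 iff a prefix-free code with these lengths exists."""
    return sum(D ** -l for l in lengths)

print(kraft_sum([1, 2, 3, 3]))   # 1.0  -> prefix-free code exists (0, 10, 110, 111)
print(kraft_sum([1, 1, 2]))      # 1.25 -> no uniquely decodable code possible
\end{lstlisting}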
\subsection{Dictionaries}
A dictionary $\mathcal{D}$ is valid if every source sequence has a prefix in $\mathcal{D}$. It is prefix-free if no dictionary word is a prefix of another.
If $W$ is the first word obtained when parsing an iid source $U_1,U_2,...$ with a valid prefix-free dictionary $\mathcal{D}$, then $$H(W) = E[length(W)]H(U)$$
$$\frac{\text{nb of bits}}{\text{letter}} \rightarrow \frac{\lceil\log |\mathcal{D}|\rceil}{E[length(W)]}$$
\subsection{Finite state machine}
A set of states $S$, a starting state, some state transfer function $g: S \times \alphabet \rightarrow S$ and some output function $f: S \times \alphabet \rightarrow \{0,1\}^*$
A legitimate FSM needs to be information lossless, i.e. $\forall s \in S$, $\forall u_1, ..., u_m \neq v_1, ..., v_m$: $g(s, \vec u) = g(s, \vec v) \implies f(s, \vec u) \neq f(s, \vec v)$
For an $L$-state finite state machine the compressibility is $\rho_M \geq 1/L$
The Lempel-Ziv procedure does at least as well as the best finite state machine for a given sequence: $\rho_{LZ}(\vec x) \leq \rho_{FSM}(\vec x)$.
\section{Channel Coding}
\subsection{Discrete Memoryless Channels}
memoryless: $P(y_i|x_i, x_1^{i-1}, y_1^{i-1}) = p(y_i|x_i)$
without feedback if $P(x_i|x_1^{i-1},y_1^{i-1}) = P(x_i|x_1^{i-1})$
memoryless without feedback: $$P(y_1^n | x_1^n) = \prod_{i=1}^n p(y_i|x_i)$$
$$I(X_1^n;Y_1^n) \leq \sum_{i=1}^n I(X_i;Y_i) \leq nC$$
\subsection{Channel Capacity}
Information channel capacity of a discrete memoryless channel:
$$C = \max_{p(x)} I(X; Y)$$
Feedback does not increase the capacity for a discrete memoryless channel. Tinkering with the channel by using functions on it does not increase the capacity, either. However, channels with memory can have higher capacity than without memory.
\subsubsection{Properties}
\begin{enumerate}
\item Nonnegative: $C \geq 0$ since $I(X; Y) \geq 0$
\item $C \leq \log | \mathcal{X} |$ since $C = \max I(X; Y) \leq \max H(X) = \log | \mathcal{X} |$; similarly $C \leq \log | \mathcal{Y} |$.
\end{enumerate}
The capacity subject to a cost constraint $\beta$ and a cost function $b(x)$ changes the definition to $C = \max_{p(x): E[b(X)] < \beta} I(X; Y)$
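One standard way to evaluate $C = \max_{p(x)} I(X;Y)$ numerically is the Blahut-Arimoto algorithm; a minimal sketch (not from the notes), sanity-checked on a BSC:
\begin{lstlisting}[language=Python]
import numpy as np

def capacity(W, iters=500):
    """Blahut-Arimoto estimate of C for channel matrix W[x, y] = p(y|x)."""
    nx = W.shape[0]
    p = np.full(nx, 1.0 / nx)                  # start from the uniform input
    for _ in range(iters):
        q = p[:, None] * W
        q /= q.sum(axis=0, keepdims=True)      # q(x|y), the "reverse" channel
        logp = np.sum(W * np.log(q + 1e-300), axis=1)
        p = np.exp(logp - logp.max())
        p /= p.sum()                           # updated input distribution
    pxy = p[:, None] * W                       # I(X;Y) in bits at the final p
    py = pxy.sum(axis=0)
    m = pxy > 0
    return float(np.sum(pxy[m] * np.log2(pxy[m] / (p[:, None] * py[None, :])[m])))

W = np.array([[0.9, 0.1], [0.1, 0.9]])         # BSC, crossover 0.1
print(capacity(W))                             # ~0.531 = 1 - h2(0.1)
\end{lstlisting}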
\subsection{Transmission}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{comm-setup.png}
\end{colfig}
Source produces a letter each $\tau_S$ sec and channel accepts input symbols every $\tau_C$ sec: $\tau_S L = \tau_C n$
Suppose $U_1U_2..$ is a stationary source with Entropy Rate $\mathcal{H}$, and we have the setup above, then $\frac{1}{L} H(U_1..U_L|V_1..V_L) \geq \mathcal{H} - \frac{n}{L}C = \tau_S\left(\frac{\mathcal{H}}{\tau_S}-\frac{C}{\tau_C}\right)$
No matter how encoder and decoder are designed:
$$ h_2(\bar{P_e}) + \bar{P_e}\log(|\mathcal{U}|-1) \geq \tau_S \left(\frac{\mathcal{H}}{\tau_S}-\frac{C}{\tau_C}\right)$$ with $\bar{P_e} = \frac{1}{L} \sum_{i=1}^L P(U_i \neq V_i)$
Bad news: if $\frac{\mathcal{H}}{\tau_S} > \frac{C}{\tau_C}$ then $\bar{P_e}$ cannot be made arbitrarily small.
Good news: if $\frac{\mathcal{H}}{\tau_S} < \frac{C}{\tau_C}$ we can achieve $\bar{P_e} \rightarrow 0$.
\subsection{Channel Coding Theorem}
$M$ is the number of different messages we can send over the channel.
$R = \frac{\log M}{n}$ is the rate of the channel code, i.e. how many message bits are carried per channel use.
For a discrete memoryless channel, all rates below capacity $C$ are achievable. Specifically, for every rate $R < C$ there exists a sequence of $(2^{nR}, n)$ codes with maximum probability of error $\rightarrow 0$.
Conversely, any sequence of $(2^{nR}, n)$ codes with a maximum error probability going to zero must have $R \leq C$.
\subsection{Source-channel theorem}
A stochastic process with entropy rate $H$ cannot be sent reliably over a discrete memoryless channel if $H > C$. Conversely, if the process satisfies the asymptotic equipartition property (AEP), the source can be transmitted reliably if $H < C$.
\subsection{Singleton bound}
If number of codewords $M > 2^k$ then $$d_{min} = \min_{1\leq i \leq M, 1 \leq j \leq M, i \neq j} d_H(x^{(i)}, x^{(j)}) \leq n-k$$
\section{Example Channels}
\subsection{Binary Symmetric channel}
$C = 1 - H(p)$ because:
\begin{align*}
I(X; Y) &= H(Y) - H(Y | X)\\
&= H(Y) - \sum p(x) H(Y| X = x)\\
&= H(Y) - \sum p(x) H(p)\\
&= H(Y) - H(p)\\
&\leq 1 - H(p).
\end{align*}
\begin{colfig}
\centering
\includegraphics[width=0.6\linewidth]{binary-symmetric-channel.png}
\end{colfig}
\subsection{Binary erasure channel}
$C = 1 - \alpha$
\begin{colfig}
\centering
\includegraphics[width=0.6\linewidth]{binary-erasure-channel.png}
\end{colfig}
\subsection{Arbitrary symmetric channel}
A channel is said to be \emph{weakly symmetric} if every row of the transition matrix $p(\cdot | x)$ is a permutation of every other row and all the column sums $\sum_x p(y|x)$ are equal. For weakly symmetric channels:
$C = \log | \mathcal{Y} | - H(\text{row of transition matrix})$
and this is achieved by a uniform distribution on the input alphabet.
\section{Differential Entropy}
\begin{align*}
h(X) & = -\int_\mathbb{X} f(x)\log f(x)\,dx \\
& = E[-\log f(X)]
\end{align*}
\begin{align*}
D(f||g) & = \int_\mathbb{X} f(x) \log \frac{f(x)}{g(x)} \, dx \\
& = E_f[\log \frac{f(X)}{g(X)}]
\end{align*}
\begin{align*}
I(X;Y) & = h(X) - h(X|Y)\\
& = h(Y) - h(Y|X)\\
& = \int_\mathbb{X} \int_\mathbb{Y} f(x,y) \log \frac{f(y|x)}{f(y)}\,dx\,dy \\
& = D(f_{XY} || f_Xf_Y)
\end{align*}
\subsubsection{Examples}
\begin{itemize}
\item For the uniform distribution $\mathcal{U}[0, a]$:
$h(X) = - \int_0^a \frac{1}{a} \log \frac{1}{a} dx = \log a$
\item For the normal:
$h(X) = \frac{1}{2} \log (2 \pi e \sigma^2)$
\end{itemize}
\subsubsection{Properties:}
\begin{itemize}
\item General chain rule: $h(X_1, \ldots, X_n) = \sum_{i=1}^{n} h(X_i|X_1, \ldots, X_{i-1}) \leq \sum h(X_i)$
\item Translation-Invariant: $h(X + c) = h(X)$
\item Scaling: $h(aX) = h(X) + \log|a|$
\item Conditioning reduces Entropy: $h(X|Y) \leq h(X)$ with equality iff $X$ and $Y$ are independent.
\item Can be negative, unlike discrete entropy.
\item Divergence and mutual information are non-negative.
\item Entropy of $X$ (on $\mathbb{R}$ with zero mean and variance $\sigma^2$) is upper bounded by: $h(X) \leq \frac{1}{2} \log ( 2\pi e \sigma^2)$, with equality if $X$ is a Gaussian.
\item Entropy of $X$ (on $[0, \infty)$ with mean $\lambda$) is upper bounded by $h(X) \leq \ln(\lambda) + 1$ (in nats) with equality if $X$ is exponentially distributed, $f(x) = \frac{1}{\lambda} e^{-x/\lambda}$
\item Entropy of $X$ (on $[0, a]$) is bounded by $h(X) \leq \log a$, with equality if $X$ is Uniform.
\end{itemize}
\subsubsection{Vector extensions}
\begin{itemize}
\item Entropy of $\boldsymbol X$ (on $\mathbb{R}^d$ with zero mean and $Cov(\boldsymbol X) = K$) is upper bounded by: $h(\boldsymbol X) \leq \frac{1}{2} \log \det (2\pi eK)$, with equality if $\boldsymbol X$ is jointly Gaussian.
\item $h(A\boldsymbol X) = h(\boldsymbol X) + \log|\det A|$
\item $h(\boldsymbol X + \boldsymbol c) = h(\boldsymbol X)$
\end{itemize}
\subsection{Typical Set}
Entropy typical set: $T^n(\epsilon) = \{\bar{x}: |-\frac{1}{n}\sum_{i=1}^n \log f(x_i) - h(X)| \leq \epsilon\}$
Strong typicality: $|\frac{1}{n} \sum_{i=1}^n \mathbb{I}\{x_i = x\} - p(x)| \leq \epsilon \, p(x) \quad \forall x$
\subsubsection{Properties}
\begin{itemize}
\item $P\left(\bar{x} \in T^n(\epsilon)\right) \rightarrow 1$ as $n \rightarrow \infty$.
\item $Vol(T^n(\epsilon)) = 2^{n(h(X)+\delta(\epsilon))}$, where $Vol(A) = \int_\mathbb{A} 1\, d\bar{x}$.
\end{itemize}
\vfill
\columnbreak
\section{Gaussian Channel}
$Y_i = X_i + Z_i$ with $Z_i \sim \mathcal{N}(0, \sigma^2)$ and cost constraint $\frac{1}{n}\sum_{i=1}^{n} x_i^2 \leq P$ gives Capacity:
$$C = \frac{1}{2} \log(1 + P/\sigma^2)$$
which is attained with $X \sim \mathcal{N}(0, P)$.
\subsection{Sphere packing}
Any received vector is normally distributed around the sent codeword and therefore within a sphere of radius $\sqrt{n (\sigma^2 + \epsilon)}$ around that codeword with high probability. $\rightarrow$ small sphere.
In general, received vectors can not exceed a magnitude of $\sqrt{n(P + \sigma^2)}$. $\rightarrow$ big sphere. By calculating how many small spheres can fit into a big sphere maximally, we can get an upper bound on how many codewords can be distinguished on the receiver side without overlap.
$$ M \approx \left(\frac{n(P + \sigma^2)}{n \sigma^2}\right)^{n/2} = 2^{\frac{n}{2} \log (1 + P/\sigma^2)}$$
This gives $R = \frac{\log M}{n} \leq C$
\subsection{Water-filling ($k$ parallel Gaussian Channels)}
Assume we have a set of $k$ gaussian channels (indexed by $j$) in parallel: $Y_j = X_j + Z_j$ with $Z_j \sim \mathcal{N}(0, \sigma^2_j)$. The noise is assumed independent from channel to channel. There is a common total power constraint: $E[\sum_j^k X_j^2] \leq P$. Then:
\begin{align*}
I(\text{every } X_j; \text{every } Y_j)
&\leq \sum_j h(Y_j) - h(Z_j)\\
&\leq \sum_j \frac{1}{2} \log(1 + P_j/\sigma^2_j)
\end{align*}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{water-filling.png}
\end{colfig}
With equality (i.e. capacity is achieved) if the $X_j$ are independent Gaussians with variance $P_j$, where the $P_j$ need to be found by water filling. This means finding a $\nu$ that gives the $P_j$s and fulfills the power requirements:
$$P_j = (\nu - \sigma^2_j)^+ \text{ and } \sum_j (\nu - \sigma^2_j)^+ = P$$
Where $(x)^+$ is the positive part of $x$, i.e. $0$ if $x < 0$ and $x$ otherwise.
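A small bisection on $\nu$ that realizes this allocation (illustrative sketch; \texttt{noise} holds the $\sigma_j^2$):
\begin{lstlisting}[language=Python]
import numpy as np

def water_fill(noise, P):
    """Find nu with sum_j (nu - sigma_j^2)^+ = P and the resulting P_j, C."""
    noise = np.asarray(noise, dtype=float)
    lo, hi = noise.min(), noise.max() + P
    for _ in range(100):                       # bisection: sum is monotone in nu
        nu = (lo + hi) / 2
        lo, hi = (lo, nu) if np.maximum(nu - noise, 0).sum() > P else (nu, hi)
    Pj = np.maximum(nu - noise, 0.0)
    C = 0.5 * np.log2(1 + Pj / noise).sum()
    return nu, Pj, C

print(water_fill([1.0, 4.0, 9.0], P=6.0))      # nu ~ 5.5, Pj ~ [4.5, 1.5, 0]
\end{lstlisting}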
\section{Codes}
The notation $(n,k,d)_q$ describes a block code over an alphabet $\Sigma$ of size $q$, with a block length $n$, message length $k$, and distance $d$.
If the block code is a linear block code, then the square brackets in the notation $[n,k,d]_q$ are used to represent that fact.
For binary codes with $q=2$, the index is sometimes dropped.
For maximum distance separable codes, the distance is always $d=n-k+1$.
\subsection{Linear Codes}
A code $\mathcal{C} \subseteq \mathbb{F}^n$ is linear if $\forall x,x' \in \mathcal{C}, a,a' \in \mathbb{F}$, $a \cdot x + a' \cdot x' \in \mathcal{C}$, $\cdot$ and $+$ in field $\mathbb{F}$
\textbf{Generator}: a $k \times n$ matrix $G$ over $\mathbb{F}$ s.t. $\mathcal{C} = \{[x_1..x_n] = [u_1..u_k]G: (u_1..u_k) \in \mathbb{F}^k\}$, i.e. any element of $\mathcal{C}$ is a linear combination of $G$'s rows.
$G = [I_k | - A^T]$
\textbf{Parity-check}: an $(n-k) \times n$ matrix $H$ s.t. $\mathcal{C} = \{\bar{x}: \bar{x}H^T = \bar{0}\}$. $H = [A | I_{n-k}]$
$$H G^T = G H^T = 0$$
We send a message $\vec x$ using a codeword $\vec c = \vec x G$ over the channel. The receiver sees $\vec r = \vec c + \vec e$, where the error pattern $\vec e$ marks the bits flipped by the channel.
Distance: $$d_{min}(\mathcal{C}) = w_{min}(\mathcal{C}) = \min_{\bar{x}\in\mathcal{C}, \bar{x}\neq 0} w_H (\bar{x})$$
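A tiny numpy sketch for the binary $[7,4]$ Hamming code in this systematic form (illustrative; over $\mathbb{F}_2$ the minus signs vanish):
\begin{lstlisting}[language=Python]
import numpy as np

A = np.array([[1, 1, 0, 1],                    # (n-k) x k
              [1, 0, 1, 1],
              [0, 1, 1, 1]])
G = np.hstack([np.eye(4, dtype=int), A.T])     # G = [I_k | A^T]
H = np.hstack([A, np.eye(3, dtype=int)])       # H = [A | I_{n-k}]
assert not ((G @ H.T) % 2).any()               # G H^T = 0

u = np.array([1, 0, 1, 1])                     # message
c = (u @ G) % 2                                # codeword x = u G
r = c.copy(); r[2] ^= 1                        # channel flips one bit
print(c, r, (r @ H.T) % 2)                     # nonzero syndrome flags the error
\end{lstlisting}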
\subsection{Binary Linear Codes}
A code $\mathcal{C}$ is a binary linear code if $x,y \in \mathcal{C}$ implies $x \oplus y \in \mathcal{C}$, where $\oplus$ is componentwise modulo-2 addition. The size of the codebook is always a power of two: $|\mathcal{C}| = 2^k$.
\subsubsection{Sphere packing bound}
Given $d$, we can not have too many codewords:
$$M \leq \frac{2^n}{\sum_{i=0}^{r = \lfloor \frac{d-1}{2} \rfloor} {n \choose i}} = \frac{|\text{all length-$n$ words}|}{|\text{words in a Hamming ball of radius } r|}$$
\subsubsection{Gilbert-Varshamov for binary codes}
The maximum possible size $M_{max}$ of a binary code of length $n$ and minimum Hamming distance $d$ is
$$M_{max} \geq \frac{2^n}{\sum_{i=0}^{d-1} {n \choose i}} = \frac{|\text{all length-$n$ words}|}{|\text{words in a Hamming ball of radius } d-1|}$$
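Both bounds evaluated for $n=7$, $d=3$ (illustrative sketch; the $[7,4]$ Hamming code attains the sphere-packing bound):
\begin{lstlisting}[language=Python]
from math import comb

def sphere_packing(n, d):                      # upper bound on M
    r = (d - 1) // 2
    return 2 ** n / sum(comb(n, i) for i in range(r + 1))

def gilbert_varshamov(n, d):                   # lower bound on M_max
    return 2 ** n / sum(comb(n, i) for i in range(d))

print(gilbert_varshamov(7, 3), sphere_packing(7, 3))   # ~4.4 <= M_max, M <= 16
\end{lstlisting}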
\subsection{Reed-Solomon Codes}
Linear code $\mathcal{C} \subset \mathbb{F}^n$ with $|\mathcal{C}| = |\mathbb{F}|^k$ given $n \leq |\mathbb{F}|$ and $k \leq n$:
\begin{itemize}
\item pick $n$ distinct elements $\alpha_1, ..., \alpha_n \in \mathbb{F}$:
$\mathcal{C} = \{(x_1,..,x_n) = \left( u(\alpha_1),..,u(\alpha_n)\right) : (u_0 ... u_{k-1}) \in \mathbb{F}^k\}$ with $u(D) = u_0 + u_1D + .. + u_{k-1}D^{k-1}$
\end{itemize}
Distance: $d_{min} \geq n-(k-1)$ (in fact with equality)
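A minimal evaluation encoder over the prime field $\mathbb{F}_7$ (illustrative sketch):
\begin{lstlisting}[language=Python]
q, n, k = 7, 6, 3                    # requires n <= q = |F| and k <= n
alphas = list(range(1, n + 1))       # n distinct evaluation points in F_7

def rs_encode(u):
    """u = (u_0, ..., u_{k-1}); codeword x_i = u(alpha_i) in F_q."""
    return [sum(c * pow(a, j, q) for j, c in enumerate(u)) % q for a in alphas]

c1, c2 = rs_encode([1, 2, 3]), rs_encode([1, 2, 4])
print(sum(a != b for a, b in zip(c1, c2)))   # 6 >= d_min = n - k + 1 = 4
\end{lstlisting}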
\subsection{Hamming Codes}
Hamming codes are $[2^m - 1,\, 2^m - m - 1,\, 3]$ codes (their extended versions are $[2^m,\, 2^m - m - 1,\, 4]$); the $m$ extra bits serve as parity checks.
\section{Polar Codes}
Channels are polarized in the sense that their mutual information is either close to 0 (completely noisy channels) or close to 1 (perfectly noiseless channels).
\subsection{Phase 1: Channel Combining}
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{polar-basic-scheme.png}
\end{colfig}
Two separate channels are combined to create a new vector channel of size 2, which is $W_2: \{0, 1\}^2 \rightarrow \mathcal{Y}^2$. Since the linear transform between $(U_1, U_2)$ and $(X_1, X_2)$ is a one-to-one mapping, the following equality holds:
$$I(U_1, U_2; Y_1, Y_2) = I(X_1, X_2; Y_1, Y_2) = 2I(W)$$
The channel combining for general $N = 2^n$ is done recursively in the following way: $W_N: X_N \rightarrow Y_N$ is defined as:
\begin{multline*}
W_N(Y^N |U^N) = \\
W_{N/2}(Y^{N/2}|U_{odd}^N \oplus U_{even}^N) \cdot W_{N/2}(Y_{N/2+1}^{N}|U^N_{even})
\end{multline*}
where $U_{odd}^{N} = (u_1, u_3, ..., u_{N-1})$ and $U_{even}^{N} = (u_2, u_4, ..., u_N)$ are the odd- and even-indexed inputs from the lower layer. For 8 channels:
\begin{colfig}
\centering
\includegraphics[width=\linewidth]{polar-eight-inputs.png}
\end{colfig}
\subsection{Phase 2: Channel Splitting}
In the basic case, $N = 2$, the chain rule of mutual information suggests that the channel $W_2$ can be split into two channels $W^+$ and $W^-$:
$$I(U_1, U_2; Y_1, Y_2) = I(U_1; Y_1, Y_2) + I(U_2; Y_1, Y_2, U_1)$$
$W^- :U_1 \rightarrow (Y_1, Y_2)$
$W^+ :U_2 \rightarrow (Y_1, Y_2, U_1)$
Then we have $2I(W) = I(W^-) + I(W^+)$ and $I(W^-) \leq I(W) \leq I(W^+)$, i.e. the channels are pushed towards the extremes 0 and 1 with respect to $I(W)$ while their sum is still $2I(W)$.
$(W,W,W,W) \rightarrow (W^{--},W^{-+},W^{+-},W^{++})$
$\underbrace{(W,..,W)}_{2^l} \rightarrow \underbrace{(W^{\overbrace{-..-}^l},..,W^{\overbrace{+..+}^l})}_{2^l}$
\textbf{Moderate channel}: there is a function $\mathcal{K}(\delta)$ with $\mathcal{K}(\delta) > 0$, for $0 < \delta \leq \frac{1}{2}$, $\mathcal{K}(0) = 0$, s.t. for any channel $p$,
$$ I(p) \in (\delta,1-\delta) \Rightarrow I(p^+) - I(p^-) = 2[I(p) - I(p^-)] \geq \mathcal{K}(\delta)$$
Fraction of good channels is $\approx I(p)$, bad channels is $\approx 1-I(p)$, moderate channels is $\approx 0$.
\textbf{Code construction}: can achieve any rate $R < I(p)$. Encoding complexity is $\frac{n}{2} \log_2 n$ XORs for a blocklength-$n$ code. Decoding complexity is $\mathcal{O}(n\log n)$.
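For the binary erasure channel the splitting step has the closed form $I(W^-) = I(W)^2$ and $I(W^+) = 2I(W) - I(W)^2$; a small sketch (illustrative) of the polarization this recursion produces:
\begin{lstlisting}[language=Python]
import numpy as np

I0, levels = 0.6, 10                 # I(W) for a BEC, n = 2^levels synthetic channels
I = np.array([I0])
for _ in range(levels):
    I = np.concatenate([I * I, 2 * I - I * I])   # (I(W^-), I(W^+)) per channel
print(np.mean(I > 0.99), np.mean(I < 0.01), I.mean())
# fractions of near-perfect / near-useless channels; the mean stays exactly I(W)
\end{lstlisting}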
\section{Utilities}
\subsection{Convexity}
$f$ is convex if: $f((1-t)x+ t y)\leq (1-t) f(x)+ t f(y)$ for all $t \in [0,1]$
Let $\{f_i\}$ be a set of convex functions and $c_i \geq 0 \ \forall i$:
Then $\sum_{i=1}^n c_i f_i(x_i)$ is convex.
$f(x) = \sup_{i \in I} f_i(x)$ is convex.
$g = h \circ f$ is convex if $f$ is convex and $h$ is non-decreasing and convex.
\subsection{Kuhn-Tucker}
Given $f(p_1,..,p_n)$, a concave, twice differentiable function defined on $\mathcal{P} = \{\boldsymbol p \in \mathbb{R}^n : p_i \geq 0, \sum p_i = 1\}$. A point $\boldsymbol p^* \in \mathcal{P}$ maximizes $f(\boldsymbol p)$ iff $\exists \lambda$ s.t.
\begin{align*}
\frac{\partial f(\boldsymbol p^*)}{\partial p_i} & \leq \lambda \quad \forall i \\
& = \lambda \quad \forall i \text{ s.t. } p_i^* > 0
\end{align*}
\subsection{Hamming distance \& weight}
Given $\bar{x}, \bar{y}, \bar{z} \in \{0,1\}^n$, the Hamming distance is:
$$d_H(\bar{x},\bar{y}) = \#\{i:y_i \neq x_i\}$$
Given $\bar x$, the Hamming weight is the number of 1s:
$$w_H(\bar{x}) = \#\{i: x_i \neq 0\}$$
\subsubsection{Properties:}
\begin{itemize}
\item Positive: $d_H(\bar{x},\bar{y}) \geq 0$ with equality iff $\bar{x} = \bar{y}$
\item Symmetric: $d_H(\bar{x},\bar{y}) = d_H(\bar{y},\bar{x})$
\item Triangle inequality: $d_H(\bar{x},\bar{z}) \leq d_H(\bar{x},\bar{y}) + d_H(\bar{y},\bar{z})$
\item Distance is the weight of the sum: $d_H(\bar{x},\bar{y}) = w_H(\bar x + \bar y)$
\item Weight is the distance to 0:
$w_H(\bar x) = d_H(\bar x, \vec 0)$
\item Triangle inequality for the weight:
$w_H(\bar x + \bar y) \leq w_H(\bar x) + w_H(\bar y)$
\end{itemize}
\subsection{Miscellaneous}
$\ln(x) \leq x - 1$
$x \leq \lceil x \rceil < x + 1$
$E[Z] = \sum_{n=0}^{\infty} Pr[Z > n]$ for $Z$ taking values in $\{0, 1, 2, ...\}$
Jensen's inequality for convex functions $f$: $ E[f(X)] \geq f(E[X])$ ($\leq$ for concave)
$a^b = 2^{ b \log a}$
Law of large numbers: for i.i.d. $X_i$, $\frac{1}{n} \sum_{i=1}^n X_i \rightarrow E[X]$ as $n \rightarrow \infty$ (almost surely).
If a $K$-ary tree has $\alpha$ intermediate nodes, the tree has $1+(K-1)\alpha$ leaves.
$\max(f(t) + g(t)) \leq \max(f(t)) + \max(g(t))$
$h_2 \left(\frac{1}{L}\sum_{i=1}^L p_i \right) \geq \frac{1}{L} \sum_{i=1}^L h_2(p_i)$
Normal distribution: $f(x \; | \; \mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi} } \; e^{ -\frac{(x-\mu)^2}{2\sigma^2} }$
Integration by parts: $\int u(x) v'(x) \, dx = u(x) v(x) - \int v(x) \, u'(x) dx$
Total probabilities: $p(x) = \sum_y p(x,y)$
Bayes: $P(X|Y) = \frac{P(X,Y)}{P(Y)} = \frac{P(Y|X)P(X)}{P(Y)}$
Variance: $Var[X] = E[(X-E[X])^2] = E[X^2] - E[X]^2$
Geometric progressions:
$$\sum_{k=0}^n r^k = \frac{1-r^{n+1}}{1-r},\, \forall r \in \mathbb{R}-\{1\}$$
$$\sum_{k=1}^\infty k(1-p)^{k-1} = \frac{1}{p^2}, \quad 0 < p < 1$$
Markov chain: $A-B-C-D$ then $p(a,b,c,d) = p(a)p(b|a)p(c|b)p(d|c)$, and e.g. $A$ and $C$ are conditionally independent given $B$ (similarly $A,D$ given $B$ or $C$, and $B,D$ given $C$).
Union Bound: ${\mathbb P}\biggl(\bigcup_{i} A_i\biggr) \le \sum_i {\mathbb P}(A_i)$
\end{multicols*}
\end{document}