diff --git a/tutorial/15.quickstart.ipynb b/tutorial/15.quickstart.ipynb
index 011283a95..eaa1fdfc0 100644
--- a/tutorial/15.quickstart.ipynb
+++ b/tutorial/15.quickstart.ipynb
@@ -4,12 +4,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Quick start"
+ "# Quick start\n",
+ "(c) Deniz Yuret, 2019\n",
+ "\n",
+ "This notebook is for the impatient reader who wants to get a flavor of Julia/Knet, possibly to compare it with other deep learning frameworks. In 15 lines of code and 30 seconds of GPU time we define, train, and evaluate the LeNet convolutional neural network model from scratch without any predefined layers."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -18,31 +21,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Define convolutional layer:\n",
"struct Conv; w; b; f; end\n",
"(c::Conv)(x) = c.f.(pool(conv4(c.w, x) .+ c.b))\n",
- "Conv(w1,w2,cx,cy,f=relu) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f)"
+ "Conv(w1,w2,cx,cy,f=relu) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Define dense layer:\n",
"struct Dense; w; b; f; end\n",
"(d::Dense)(x) = d.f.(d.w * mat(x) .+ d.b)\n",
- "Dense(i::Int,o::Int,f=relu) = Dense(param(o,i), param0(o), f)"
+ "Dense(i::Int,o::Int,f=relu) = Dense(param(o,i), param0(o), f);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -54,33 +57,53 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading MNIST...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/mnist.jl:33\n"
+ ]
+ }
+ ],
"source": [
"# Load MNIST data\n",
"include(Knet.dir(\"data\",\"mnist.jl\"))\n",
- "dtrn, dtst = mnistdata()"
+ "dtrn, dtst = mnistdata();"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.40e-02 100.00%┣████████████████████████████████████████████████████████████┫ 6000/6000 [00:25/00:25, 238.78i/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.9921"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Train and test LeNet (about 30 secs on a gpu to reach 99% accuracy)\n",
"LeNet = Chain(Conv(5,5,1,20), Conv(5,5,20,50), Dense(800,500), Dense(500,10,identity))\n",
"progress!(adam(LeNet, repeat(dtrn,10)))\n",
"accuracy(LeNet, dtst)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/tutorial/20.mnist.ipynb b/tutorial/20.mnist.ipynb
index 22a519459..0e4ea7a60 100644
--- a/tutorial/20.mnist.ipynb
+++ b/tutorial/20.mnist.ipynb
@@ -9,27 +9,16 @@
},
"source": [
"# Load and minibatch MNIST data\n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "source": [
- "* Objective: Load the [MNIST](http://yann.lecun.com/exdb/mnist) dataset, convert into Julia arrays, split into minibatches using Knet's [minibatch](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.minibatch) function and [Data](https://github.com/denizyuret/Knet.jl/blob/master/src/data.jl) iterator type.\n",
+ "(c) Deniz Yuret, 2019\n",
+ "* Objective: Load the [MNIST](http://yann.lecun.com/exdb/mnist) dataset, convert it into Julia arrays, and split it into minibatches using Knet's [minibatch](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.minibatch) function and [Data](https://github.com/denizyuret/Knet.jl/blob/master/src/data.jl) iterator type.\n",
"* Prerequisites: [Julia arrays](https://docs.julialang.org/en/v1/manual/arrays)\n",
- "* New functions: [dir](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.dir), [minibatch, Data](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.minibatch), [mnist, mnistview](https://github.com/denizyuret/Knet.jl/blob/master/data/mnist.jl)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the next few notebooks, we build classification models for the MNIST handwritten digit recognition dataset. MNIST has 60000 training and 10000 test examples. Each input x consists of 784 pixels representing a 28x28 image. The corresponding output indicates the identity of the digit 0..9."
+ "* New functions: [dir](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.dir), [minibatch, Data](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.minibatch), [mnist, mnistview](https://github.com/denizyuret/Knet.jl/blob/master/data/mnist.jl)\n",
+ "\n",
+ "In the next few notebooks, we build classification models for the MNIST handwritten digit recognition dataset. MNIST has 60000 training and 10000 test examples. Each input x consists of 784 pixels representing a 28x28 image. The corresponding output indicates the identity of the digit 0..9.\n",
+ "\n",
+ "![](http://yann.lecun.com/exdb/lenet/gifs/asamples.gif \"MNIST\")\n",
+ "\n",
+ "[image source](http://yann.lecun.com/exdb/lenet)"
]
},
{
diff --git a/tutorial/25.iterators.ipynb b/tutorial/25.iterators.ipynb
index 8339f3f5a..d8b20563c 100644
--- a/tutorial/25.iterators.ipynb
+++ b/tutorial/25.iterators.ipynb
@@ -22,12 +22,19 @@
"[drop](https://docs.julialang.org/en/v1/base/iterators/#Base.Iterators.drop), \n",
"[cycle](https://docs.julialang.org/en/v1/base/iterators/#Base.Iterators.cycle), \n",
"[Stateful](https://docs.julialang.org/en/v1/base/iterators/#Base.Iterators.Stateful), \n",
- "[iterate](https://docs.julialang.org/en/v1/base/collections/#lib-collections-iteration-1)"
+ "[iterate](https://docs.julialang.org/en/v1/base/collections/#lib-collections-iteration-1)\n",
+ "\n",
+ "The `minibatch` function returns a `Knet.Data` object implemented as a Julia iterator that generates (x,y) minibatches. Iterators are lazy objects that only generate their next element when asked. This has the advantage of not wasting time and memory trying to create and store all the elements at once. We can even have infinite iterators! The training algorithms in Knet are also implemented as iterators so that:\n",
+ "1. We can monitor and report the training loss\n",
+ "2. We can take snapshots of the model during training\n",
+ "3. We can pause/terminate training when necessary\n",
+ "\n",
+ "Here are some things Julia can do with iterators:"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -35,7 +42,8 @@
},
"outputs": [],
"source": [
- "# Load packages, import symbols\n",
+ "# Set display width, load packages, import symbols\n",
+ "ENV[\"COLUMNS\"]=72\n",
"using Pkg; haskey(Pkg.installed(),\"Knet\") || Pkg.add(\"Knet\")\n",
"using Base.Iterators: take, drop, cycle, Stateful\n",
"using Knet"
@@ -43,9 +51,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading MNIST...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/mnist.jl:33\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Knet.Data{Tuple{Array{Float32,4},Array{UInt8,1}}}(Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], UInt8[0x07 0x02 … 0x05 0x06], 100, 10000, false, 9901, 1:10000, false, (28, 28, 1, 10000), (10000,), Array{Float32,4}, Array{UInt8,1})"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Load data\n",
"include(Knet.dir(\"data\",\"mnist.jl\"))\n",
@@ -53,23 +80,22 @@
"dtst = minibatch(xtst,ytst,100)"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `minibatch` function returns a `Knet.Data` object implemented as a Julia iterator that generates (x,y) minibatches. Iterators are lazy objects that only generate their next element when asked. This has the advantage of not wasting time and memory trying to create and store all the elements at once. We can even have infinite iterators! The training algorithms in Knet are also implemented as iterators so that:\n",
- "1. We can monitor and report the training loss\n",
- "2. We can take snapshots of the model during training\n",
- "3. We can pause/terminate training when necessary\n",
- "\n",
- "Here are some things Julia can do with iterators:"
- ]
- },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(\"28×28×1×100 Array{Float32,4}\", \"100-element Array{UInt8,1}\")"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can peek at the first element using first()\n",
"summary.(first(dtst))"
@@ -77,13 +103,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 100\n"
+ ]
+ }
+ ],
"source": [
"# Iterators can be used in for loops\n",
"# Let's count the elements in dtst:\n",
@@ -94,13 +128,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"100-element Array{Tuple{Array{Float32,4},Array{UInt8,1}},1}\""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Iterators can be converted to arrays using `collect` \n",
"# (don't do this unless necessary, it just wastes memory. Use a for loop instead)\n",
@@ -109,13 +154,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 500\n"
+ ]
+ }
+ ],
"source": [
"# We can generate an iterator for multiple epochs using `repeat`\n",
"# (an epoch is a single pass over the dataset)\n",
@@ -126,13 +179,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 20\n"
+ ]
+ }
+ ],
"source": [
"# We can generate partial epochs using `take` which takes the first n elements\n",
"n = 0\n",
@@ -142,13 +203,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 80\n"
+ ]
+ }
+ ],
"source": [
"# We can also generate partial epochs using `drop` which drops the first n elements\n",
"n = 0\n",
@@ -158,13 +227,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 32502\n"
+ ]
+ }
+ ],
"source": [
"# We can repeat forever using `cycle` (this is useful to train until convergence)\n",
"# You do not want to collect a cycle or run a for loop without break! \n",
@@ -178,13 +255,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[7, 2, 1, 10, 4]\n",
+ "[7, 2, 1, 10, 4]\n",
+ "[7, 2, 1, 10, 4]\n",
+ "[6, 10, 5, 4, 9]\n"
+ ]
+ }
+ ],
"source": [
"# We can make an iterator `Stateful` so it remembers where it left off.\n",
"# (by default iterators start from the beginning)\n",
@@ -198,13 +286,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[7, 2, 1, 10, 4]\n",
+ "[7, 2, 1, 10, 4]\n",
+ "[6, 9, 2, 2, 6]\n",
+ "[7, 4, 9, 9, 9]\n"
+ ]
+ }
+ ],
"source": [
"# We can shuffle instances at every epoch using the keyword argument `shuffle=true`\n",
"# (by default elements are generated in the same order)\n",
@@ -218,13 +317,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×100 LinearAlgebra.Adjoint{Float32,Array{Float32,1}}:\n",
+ " 7990.35 7842.33 8162.68 7692.77 … 8494.0 7361.33 8643.01"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can construct new iterators using [Generator expressions](https://docs.julialang.org/en/v1/manual/arrays/#Generator-Expressions-1)\n",
"# The following example constructs an iterator over the x norms in a dataset:\n",
@@ -234,13 +345,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n = 100\n"
+ ]
+ }
+ ],
"source": [
"# Every iterator implements the `iterate` function which returns\n",
"# the next element and state (or nothing if no elements left).\n",
@@ -256,13 +375,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×100 LinearAlgebra.Adjoint{Any,Array{Any,1}}:\n",
+ " 7990.35 7842.33 8162.68 7692.77 … 8494.0 7361.33 8643.01"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# You can define your own iterator by declaring a new type and overriding the `iterate` method.\n",
"# Here is another way to define an iterator over the x norms in a dataset:\n",
@@ -279,13 +410,6 @@
"\n",
"collect(Xnorm(dtst))'"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/tutorial/30.lin.ipynb b/tutorial/30.lin.ipynb
index c31cf82eb..16291cb03 100644
--- a/tutorial/30.lin.ipynb
+++ b/tutorial/30.lin.ipynb
@@ -9,36 +9,33 @@
},
"source": [
"# Linear models, loss functions, gradients, SGD\n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "source": [
+ "(c) Deniz Yuret, 2019\n",
"* Objectives: Define, train and visualize a simple model; understand gradients and SGD; learn to use the GPU.\n",
"* Prerequisites: [Callable objects](https://docs.julialang.org/en/v1/manual/methods/#Function-like-objects-1), [Generator expressions](https://docs.julialang.org/en/v1/manual/arrays/#Generator-Expressions-1), [MNIST](20.mnist.ipynb), [Iterators](25.iterators.ipynb)\n",
"* New functions: \n",
"[mnistdata](https://github.com/denizyuret/Knet.jl/blob/master/data/mnist.jl),\n",
- "[accuracy](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.accuracy), \n",
- "[zeroone](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.zeroone), \n",
- "[nll](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.nll), \n",
- "[Param, @diff, value, params, grad](http://denizyuret.github.io/Knet.jl/latest/reference.html#AutoGrad),\n",
- "[sgd](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.sgd),\n",
- "[progress, progress!](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.progress), \n",
- "[gpu](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.gpu), \n",
- "[KnetArray](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.KnetArray), \n",
- "[load](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.load), \n",
- "[save](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.save)"
+ "[accuracy](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.accuracy), \n",
+ "[zeroone](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.zeroone), \n",
+ "[nll](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.nll), \n",
+ "[Param, @diff, value, params, grad](http://denizyuret.github.io/Knet.jl/latest/reference/#AutoGrad),\n",
+ "[sgd](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.sgd),\n",
+ "[progress, progress!](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.progress), \n",
+ "[gpu](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.gpu), \n",
+ "[KnetArray](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.KnetArray), \n",
+ "[load](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.load), \n",
+ "[save](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.save)\n",
+ "\n",
+ "\n",
+ " ([image source](https://www.oreilly.com/library/view/tensorflow-for-deep/9781491980446/ch04.html))\n",
+ "\n",
+ "In Knet, a machine learning model is defined using plain Julia code. A typical model consists of a **prediction** and a **loss** function. The prediction function takes some input and returns the prediction of the model for that input. The loss function measures how bad the prediction is with respect to some desired output. We train a model by adjusting its parameters to reduce the loss.\n",
+ "\n",
+ "In this section we will implement a simple linear model to classify MNIST digits. The prediction function will return 10 scores, one for each of the possible labels 0..9, as a linear combination of the pixel values. The loss function will convert these scores to normalized probabilities and return the average -log probability of the correct answers. Minimizing this loss should maximize the scores assigned to correct answers by the model. We will make use of the loss gradient with respect to each parameter, which tells us the direction of the greatest loss increase. We will improve the model by moving the parameters in the opposite direction (using a GPU if available). We will visualize the model weights and performance over time. The final accuracy of about 92% is close to the limit of what we can achieve with this type of model. To improve further we must look beyond linear models."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -51,7 +48,7 @@
"using Pkg; for p in (\"Knet\",\"AutoGrad\",\"Plots\",\"Images\",\"ImageMagick\"); haskey(Pkg.installed(),p) || Pkg.add(p); end\n",
"using Statistics: mean\n",
"using Base.Iterators: flatten\n",
- "using Random: seed!\n",
+ "import Random # seed!\n",
"using Knet: Knet, AutoGrad, dir, Data, Param, @diff, value, params, grad, progress, progress!, gpu, KnetArray, load, save\n",
"# The following are defined for instruction even though they are provided in Knet\n",
"# using Knet: accuracy, zeroone, nll, sgd"
@@ -59,13 +56,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading MNIST...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/mnist.jl:33\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "600-element Data{Tuple{Array{Float32,2},Array{UInt8,1}}}\n",
+ "100-element Data{Tuple{Array{Float32,2},Array{UInt8,1}}}\n"
+ ]
+ }
+ ],
"source": [
"# Load data (mnistdata basically replicates mnist.ipynb)\n",
"include(Knet.dir(\"data\",\"mnist.jl\"))\n",
@@ -81,18 +95,29 @@
}
},
"source": [
- "## Define linear model"
+ "## Model definition"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Linear([-0.00577016 0.00740097 … 0.000858401 0.0100368; 0.00508172 0.00706544 … 0.000725378 -0.00708007; … ; -0.0163844 -0.0174381 … -0.0148394 -0.00358024; 0.00810655 0.00301301 … 0.011367 -0.00975392], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# In Julia we define a new datatype using `struct`:\n",
"struct Linear; w; b; end\n",
@@ -115,12 +140,12 @@
}
},
"source": [
- "## Prediction and accuracy"
+ "## Prediction"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -134,13 +159,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(\"784×100 Array{Float32,2}\", \"100-element Array{UInt8,1}\")"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"x,y = first(dtst) # The first minibatch from the test set\n",
"summary.((x,y))"
@@ -148,40 +184,84 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×100 LinearAlgebra.Adjoint{Int64,Array{Int64,1}}:\n",
+ " 7 2 1 10 4 1 4 9 5 9 … 1 3 6 9 3 1 4 1 7 6 9"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"Int.(y)' # correct answers are given as an array of integers (remember we use 10 for 0)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10×100 Array{Float64,2}:\n",
+ " -0.0573706 0.0836898 -0.0993739 … 0.0341014 -0.125484 \n",
+ " -0.0340025 0.0118058 0.0451012 -0.0658874 0.0276873 \n",
+ " 0.0293171 0.070559 -0.00201588 0.085163 0.0388435 \n",
+ " 0.068166 -0.0514477 -0.0772223 -0.0186688 0.0323772 \n",
+ " -0.00441849 -0.025335 -0.0217464 0.0426281 -0.000801449\n",
+ " 0.102021 0.0436141 0.0867583 … 0.0987738 0.0433309 \n",
+ " -0.0436877 0.0792506 -0.00330623 -0.0109909 -0.113095 \n",
+ " 0.167162 0.00859507 -0.0264033 0.0839035 0.17578 \n",
+ " 0.0891127 -0.0194897 0.0988181 -0.00689439 0.151461 \n",
+ " -0.0394966 -0.0226349 -0.129688 0.0667691 0.117331 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"ypred = model(x) # Predictions on the first minibatch: a 10x100 score matrix"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.15"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can calculate the accuracy of our model for the first minibatch\n",
"accuracy(model,x,y) = mean(y' .== map(i->i[1], findmax(Array(model(x)),dims=1)[2]))\n",
@@ -190,13 +270,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.09829999999999998"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can calculate the accuracy of our model for the whole test set\n",
"accuracy(model,data) = mean(accuracy(model,x,y) for (x,y) in data)\n",
@@ -205,14 +296,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9017000000000001"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# ZeroOne loss (or error) is defined as 1 - accuracy\n",
"zeroone(x...) = 1 - accuracy(x...)\n",
@@ -227,18 +329,29 @@
}
},
"source": [
- "## Negative log likelihood"
+ "## Loss function"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "nll (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# For classification we use negative log likelihood loss (aka cross entropy, softmax loss, NLL)\n",
"# This is the average -log probability assigned to correct answers by the model\n",
@@ -252,13 +365,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.2995940410919387"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# model(x) gives predictions, let model(x,y) give the loss\n",
"(m::Linear)(x, y) = nll(m(x), y)\n",
@@ -267,13 +391,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.2995940410919387"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can also use the Knet nll implementation for efficiency\n",
"(m::Linear)(x, y) = Knet.nll(m(x), y)\n",
@@ -282,7 +417,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -296,18 +431,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.300518331889146"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Here is per-instance average negative log likelihood for the whole test set\n",
"model(dtst)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Bonus question:** What is special about the loss value 2.3?"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -321,27 +474,161 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "Usage:\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "x = Param([1,2,3]) # user declares parameters with `Param`\n",
+ "x => P([1,2,3]) # `Param` is just a struct wrapping a value\n",
+ "value(x) => [1,2,3] # `value` returns the thing wrapped\n",
+ "sum(x .* x) => 14 # Params act like regular values\n",
+ "y = @diff sum(x .* x) # Except when we differentiate using `@diff`\n",
+ "y => T(14) # you get another struct\n",
+ "value(y) => 14 # which carries the same result\n",
+ "params(y) => [x] # and the Params that it depends on \n",
+ "grad(y,x) => [2,4,6] # and the gradients for all Params\n",
+ "\\end{verbatim}\n",
+ "\\texttt{Param(x)} returns a struct that acts like \\texttt{x} but marks it as a parameter you want to compute gradients with respect to.\n",
+ "\n",
+ "\\texttt{@diff expr} evaluates an expression and returns a struct that contains the result (which should be a scalar) and gradient information.\n",
+ "\n",
+ "\\texttt{grad(y, x)} returns the gradient of \\texttt{y} (output by @diff) with respect to any parameter \\texttt{x::Param}, or \\texttt{nothing} if the gradient is 0.\n",
+ "\n",
+ "\\texttt{value(x)} returns the value associated with \\texttt{x} if \\texttt{x} is a \\texttt{Param} or the output of \\texttt{@diff}, otherwise returns \\texttt{x}.\n",
+ "\n",
+ "\\texttt{params(x)} returns an iterator of Params found by a recursive search of object \\texttt{x}.\n",
+ "\n",
+ "Alternative usage:\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "x = [1 2 3]\n",
+ "f(x) = sum(x .* x)\n",
+ "f(x) => 14\n",
+ "grad(f)(x) => [2 4 6]\n",
+ "gradloss(f)(x) => ([2 4 6], 14)\n",
+ "\\end{verbatim}\n",
+ "Given a scalar valued function \\texttt{f}, \\texttt{grad(f,argnum=1)} returns another function \\texttt{g} which takes the same inputs as \\texttt{f} and returns the gradient of the output with respect to the argnum'th argument. \\texttt{gradloss} is similar except the resulting function also returns f's output.\n",
+ "\n"
+ ],
+ "text/markdown": [
+ "Usage:\n",
+ "\n",
+ "```\n",
+ "x = Param([1,2,3]) # user declares parameters with `Param`\n",
+ "x => P([1,2,3]) # `Param` is just a struct wrapping a value\n",
+ "value(x) => [1,2,3] # `value` returns the thing wrapped\n",
+ "sum(x .* x) => 14 # Params act like regular values\n",
+ "y = @diff sum(x .* x) # Except when we differentiate using `@diff`\n",
+ "y => T(14) # you get another struct\n",
+ "value(y) => 14 # which carries the same result\n",
+ "params(y) => [x] # and the Params that it depends on \n",
+ "grad(y,x) => [2,4,6] # and the gradients for all Params\n",
+ "```\n",
+ "\n",
+ "`Param(x)` returns a struct that acts like `x` but marks it as a parameter you want to compute gradients with respect to.\n",
+ "\n",
+ "`@diff expr` evaluates an expression and returns a struct that contains the result (which should be a scalar) and gradient information.\n",
+ "\n",
+ "`grad(y, x)` returns the gradient of `y` (output by @diff) with respect to any parameter `x::Param`, or `nothing` if the gradient is 0.\n",
+ "\n",
+ "`value(x)` returns the value associated with `x` if `x` is a `Param` or the output of `@diff`, otherwise returns `x`.\n",
+ "\n",
+ "`params(x)` returns an iterator of Params found by a recursive search of object `x`.\n",
+ "\n",
+ "Alternative usage:\n",
+ "\n",
+ "```\n",
+ "x = [1 2 3]\n",
+ "f(x) = sum(x .* x)\n",
+ "f(x) => 14\n",
+ "grad(f)(x) => [2 4 6]\n",
+ "gradloss(f)(x) => ([2 4 6], 14)\n",
+ "```\n",
+ "\n",
+ "Given a scalar valued function `f`, `grad(f,argnum=1)` returns another function `g` which takes the same inputs as `f` and returns the gradient of the output with respect to the argnum'th argument. `gradloss` is similar except the resulting function also returns f's output.\n"
+ ],
+ "text/plain": [
+ " Usage:\n",
+ "\n",
+ "\u001b[36m x = Param([1,2,3]) # user declares parameters with `Param`\u001b[39m\n",
+ "\u001b[36m x => P([1,2,3]) # `Param` is just a struct wrapping a value\u001b[39m\n",
+ "\u001b[36m value(x) => [1,2,3] # `value` returns the thing wrapped\u001b[39m\n",
+ "\u001b[36m sum(x .* x) => 14 # Params act like regular values\u001b[39m\n",
+ "\u001b[36m y = @diff sum(x .* x) # Except when we differentiate using `@diff`\u001b[39m\n",
+ "\u001b[36m y => T(14) # you get another struct\u001b[39m\n",
+ "\u001b[36m value(y) => 14 # which carries the same result\u001b[39m\n",
+ "\u001b[36m params(y) => [x] # and the Params that it depends on \u001b[39m\n",
+ "\u001b[36m grad(y,x) => [2,4,6] # and the gradients for all Params\u001b[39m\n",
+ "\n",
+ " \u001b[36mParam(x)\u001b[39m returns a struct that acts like \u001b[36mx\u001b[39m but marks it as a\n",
+ " parameter you want to compute gradients with respect to.\n",
+ "\n",
+ " \u001b[36m@diff expr\u001b[39m evaluates an expression and returns a struct that\n",
+ " contains the result (which should be a scalar) and gradient\n",
+ " information.\n",
+ "\n",
+ " \u001b[36mgrad(y, x)\u001b[39m returns the gradient of \u001b[36my\u001b[39m (output by @diff) with respect\n",
+ " to any parameter \u001b[36mx::Param\u001b[39m, or \u001b[36mnothing\u001b[39m if the gradient is 0.\n",
+ "\n",
+ " \u001b[36mvalue(x)\u001b[39m returns the value associated with \u001b[36mx\u001b[39m if \u001b[36mx\u001b[39m is a \u001b[36mParam\u001b[39m or the\n",
+ " output of \u001b[36m@diff\u001b[39m, otherwise returns \u001b[36mx\u001b[39m.\n",
+ "\n",
+ " \u001b[36mparams(x)\u001b[39m returns an iterator of Params found by a recursive search\n",
+ " of object \u001b[36mx\u001b[39m.\n",
+ "\n",
+ " Alternative usage:\n",
+ "\n",
+ "\u001b[36m x = [1 2 3]\u001b[39m\n",
+ "\u001b[36m f(x) = sum(x .* x)\u001b[39m\n",
+ "\u001b[36m f(x) => 14\u001b[39m\n",
+ "\u001b[36m grad(f)(x) => [2 4 6]\u001b[39m\n",
+ "\u001b[36m gradloss(f)(x) => ([2 4 6], 14)\u001b[39m\n",
+ "\n",
+ " Given a scalar valued function \u001b[36mf\u001b[39m, \u001b[36mgrad(f,argnum=1)\u001b[39m returns another\n",
+ " function \u001b[36mg\u001b[39m which takes the same inputs as \u001b[36mf\u001b[39m and returns the gradient\n",
+ " of the output with respect to the argnum'th argument. \u001b[36mgradloss\u001b[39m is\n",
+ " similar except the resulting function also returns f's output."
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"@doc AutoGrad"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Linear"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Redefine the constructor to use Param's so we can compute gradients\n",
"Linear(i::Int,o::Int,scale=0.01) = \n",
@@ -350,7 +637,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -364,13 +651,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Linear(P(Array{Float64,2}(10,784)), P(Array{Float64,1}(10)))"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Use a larger scale to get a large initial loss\n",
"model = Linear(784,10,1.0)"
@@ -378,13 +676,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "19.10423456298375"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We can still do predictions and calculate loss:\n",
"model(x,y)"
@@ -392,13 +701,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "T(19.10423456298375)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# And we can do the same loss calculation also computing gradients:\n",
"J = @diff model(x,y)"
@@ -406,13 +726,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "19.10423456298375"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# To get the actual loss value from J:\n",
"value(J)"
@@ -420,13 +751,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2-element Array{Param,1}:\n",
+ " P(Array{Float64,1}(10)) \n",
+ " P(Array{Float64,2}(10,784))"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# params(J) returns an iterator of Params J depends on (i.e. model.b, model.w):\n",
"params(J) |> collect"
@@ -434,13 +778,34 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10×784 Array{Float64,2}:\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# To get the gradient of a parameter from J:\n",
"∇w = grad(J,model.w)"
@@ -448,13 +813,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "∇b = grad(J, model.b) = [-0.139954, -0.064541, -0.109522, -0.1275, -0.059184, -0.0980703, -0.102617, 0.0133898, -0.104578, 0.792576]\n"
+ ]
+ }
+ ],
"source": [
"# Note that each gradient has the same size and shape as the corresponding parameter:\n",
"@show ∇b = grad(J,model.b);"
@@ -477,13 +850,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "value(model.b) = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "19.10423456298375"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Loss for the first minibatch with the original parameters\n",
"@show value(model.b)\n",
@@ -492,13 +883,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.1"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# To numerically check the gradient let's increase the last entry of b by +0.1.\n",
"model.b[10] = 0.1"
@@ -506,13 +908,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "value(model.b) = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "19.183620170313954"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We see that the loss moves by ≈ +0.79*0.1 as expected.\n",
"@show value(model.b)\n",
@@ -521,13 +941,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Reset the change.\n",
"model.b[10] = 0"
@@ -546,7 +977,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -569,40 +1000,73 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 31,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], [-0.139954, -0.064541, -0.109522, -0.1275, -0.059184, -0.0980703, -0.102617, 0.0133898, -0.104578, 0.792576])"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"∇w2,∇b2 = nllgrad(model,x,y)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 32,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "true"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"∇w2 ≈ ∇w"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 33,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "true"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"∇b2 ≈ ∇b"
]
@@ -620,13 +1084,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 34,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sgdupdate! (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Here is a single SGD update:\n",
"function sgdupdate!(func, args; lr=0.1)\n",
@@ -641,13 +1116,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 35,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sgd (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We define SGD for a dataset as an iterator so that:\n",
"# 1. We can monitor and report the training loss\n",
@@ -659,17 +1145,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 36,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "model(dtst) = 2.313187215962106\n",
+ "3.03e-01 100.00%┣███████████████████┫ 6000/6000 [00:10/00:10, 598.92i/s]\n",
+ "model(dtst) = 0.2806535683801265\n"
+ ]
+ }
+ ],
"source": [
"# Let's train a model for 10 epochs to compare training speed on cpu vs gpu.\n",
"# progress!(itr) displays a progress bar when wrapped around an iterator like this:\n",
- "# 2.94e-01 100.00%┣████████████████████┫ 6000/6000 [00:10/00:10, 592.96/s]\n",
+ "# 2.94e-01 100.00%┣████████████████████┫ 6000/6000 [00:10/00:10, 592.96/s] 2.31->0.28\n",
"model = Linear(784,10)\n",
"@show model(dtst)\n",
"progress!(sgd(model, repeat(dtrn,10)))\n",
@@ -689,16 +1185,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 37,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "model(dtst) = 2.3035536f0\n",
+ "3.03e-01 100.00%┣██████████████████┫ 6000/6000 [00:04/00:04, 1654.96i/s]\n",
+ "model(dtst) = 0.28049186f0\n"
+ ]
+ }
+ ],
"source": [
"# The training would go a lot faster on a GPU:\n",
- "# 2.94e-01 100.00%┣███████████████████┫ 6000/6000 [00:02/00:02, 2653.45/s]\n",
+ "# 2.94e-01 100.00%┣███████████████████┫ 6000/6000 [00:02/00:02, 2653.45/s] 2.31->0.28\n",
"# To work on a GPU, all we have to do is convert Arrays to KnetArrays:\n",
"if gpu() >= 0 # gpu() returns a device id >= 0 if there is a GPU, -1 otherwise\n",
" atype = KnetArray{Float32} # KnetArrays are stored and operated in the GPU\n",
@@ -728,13 +1234,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "trainresults (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"function trainresults(file, model)\n",
" if (print(\"Train from scratch? (~77s) \"); readline()[1]=='y')\n",
@@ -758,9 +1275,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 39,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? (~77s) stdin> n\n"
+ ]
+ }
+ ],
"source": [
"# 2.43e-01 100.00%┣████████████████▉┫ 60000/60000 [00:44/00:44, 1349.13/s]\n",
"lin = trainresults(\"lin113.jld2\",Linear(784,10));"
@@ -779,7 +1304,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -792,13 +1317,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 41,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAAGQCAIAAAD9V4nPAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deXxV1b338d/aZx6SnMyQAJF5dCpaFRVEWyxqQL2P1Vst1nqvr6rU2mvVvhzooKjgba/D01e92kZbWi+tPoJSWm0d0Cve9lqrosigogwhQAiZzsmZ93r+2CHGkCAIOSdkfd5/8Dpn7X3O/p3NTr5Ze+19ltJaCwAAprLyXQAAAPlEEAIAjEYQAgCMRhACAIxGEAIAjEYQAgCMRhACAIyWoyBsbm6ura0tKSmZM2dOc3Nzr+u8++67oVAoN/UAAODIURAuWrSopqamoaFhxIgRixcv3neF1tbWb3zjGx0dHbmpBwAAh8rNN8uMHz/+6aefnjBhwvr16+fOnbthw4buS7XWF1544aWXXnrRRRfxTTcAgFzKURCGw+HGxsZAIBCPxysrK9va2rovveeee3bv3v3v//7vSvVeT1NT09VXX51IJHq0+3y+xx9/PJvNKqUsq/fe7dZtOwKhcFlx+HB9Fjj2v9vRTzKZjMvlUkrluxCzpNNpj8eT7yqMc1C73eVyfe5fR+7P97KDpbV2fnS11tlstvuil1566dlnn/3LX/6yn5c/+eSTbW1tV155ZY92l8uVzWbb2tpcLldf44sbn3gsNfHUWV+edmifAD21tLT4/X6/35/vQsyyZ8+ewsJCfinn2O7du8vKyvizL8d27do1ZMiQA1z5UP53chSEVVVVW7duHTt2bH19fXV1dfdFL7zwwssvv+z1ep2nSqn//u//Pu2003q8Q01NzUUXXdTrm6dSKZfL1ddvZKWU2+3m9/Vh5/P5CMLcc3Y7QZhjzm4nCHPM2e052FCO/l9ra2vr6uq01nV1dXPnznUaV61aJSJ33nmn3ktEtNb7puCh0Eppm3FHAEDvchSECxYsWLNmzfDhw9euXXvbbbc5jTNnzszBppUoLQQhAKB3OTo1GolEVq5c2aNx3+ti+uXKHSXClagAgD6YcMqb6+sAAH0yIQg5MQoA6JMBQaiUcLEMAKAPgz8ItXO5DAAAvRn8QSiKc6MAgD6ZEIScGgUA9MmAIKRDCADomwlByLlRAECfDAhCpbihHgDQl8EfhB4lsUy+iwAADFSDPwjDXtWYsPNdBQBggBr8QVjgUY3xfBcBABiojAjC3QnGCAEAvRv8QRjySiyt4wwTAgB6M/iDUIkq9cnHUTqFAIBe5Gg+wnxSqsKvN7XLxEi+KwEAkR/84AfPPPNMvqsYuMaPH7906dJcbtGAIBQp98uHbZqJCQEMBJs3b77gggtqa2vzXchAtHHjxrvuuivHGzUgCJUq98lL7ZwaBTBQjBgx4vjjj893FQORZeVhwG7wjxGKSHlANrXluwgAwIBkQBAqVebVH7bRIwQA9GLwB6ESVRaQj6J83ygAoBeDPwhFxGtJkUcaOohCAEBPBgShEtEyqlB9yDAhAGAfBgShKBE9ukBt4sJRAMA+DAhCpUTrUYWyietlAAD7MCYIC9SH7fmuBAAw8Az+IFS+gB2PjS5U9AgB4NApNdi+pWvwB6G7rCqzu35UgfqQMUIA6MOXvvSlfJeQN4M/CD0V1Zld9UOCEktLezrf1QDAgPTCCy/ku4S8GfxB6C4flmncpkRGFqiP6BQCwD7OP/98ETnuuONERCm1ZMmSIUOGOI+XLl167LHHlpaW3nfffd1fsmfPnnnz5g0dOrSqquryyy/fs2eP015XVzd06NCysrIHHnigr5aBZvB/6bYVLhJRdrR1VGF4U7s+pmSwnd0GcKTbFZerXs3mZlsVAXn4NFePxuXLlyul3nrrLefp3/72t64O4pYtW956662XXnrpvPPOu/7667tecv3113u93k2bNonIddddd8MNNzz66KMicsMNN7zyyis+n+/aa6+97rrrem0ZaAZ/EIqIu7w607htdMEE7qkHMACFPfKN
cTn6Gz3o/uwN/eAHPygvL3ceX3311UqpM888Mx6Pd1/nj3/843vvvRcIBETkzjvvPOaYY5z26dOn33LLLfPmzfvTn/7UV8tAY0YQVlSnG7ePLJi4vpVTowAGnKBbzq8ZQANVXSkoIgUFBX2t1nX5qFIqm+3s0S5fvvzPf/7zY4899vDDD//lL3/ptWWgGUC7vv+4y6ozjfXcQQEA+5FOH8T1hLNnz7711lsTiUQ8Hr/11lvPOeccp33kyJEjR45csGDBG2+80VfLQGNGEFZUZ3bVjyoQ7qkHgF6dc845o0ePPvD177vvvng8ftRRR40aNSqVSnVdSnPTTTedcsopZ5xxxr333ttXy0BjxKlRT/mw9sZtIwvU1qjOanFxuQwAfNrKlSudB7rbnHW9PnYelJaWLlmyZN/3ueaaa6655pr9tww0xvQId2/3Wbrcr7bFODsKAPiEEUGovH4rEM62NI4qFC4cBQB0Z0QQSucdFPVMxgQA6MGYIKwYlt5VP4oLRwEAn2ZMEJZVZRq5cBQA0JMxQVgxLNO4jVsJAeBzG3wTMDlMCUJPuXMrIWOEANCL/UzDNOhnaDIlCF1lQ7MtjaWejK2lOZnvagBggNnPNEyDfoYmU4JQudyuSHl2d8NIOoUA8Gndp2HqMWtS90WOwTQBk8OIb5ZxuCuq0431owqrNrXrqWWD80w3gCOR3dHe9uxvcrMtK1hQ+JXLejR2n4apx6xJPWZoksE1AZPDpCAsH5ZprD+m5Iv/u0tfNDLf1QDAXsrldpdV5Whb/uD+V/jMWZMG0wRMDqOCsDq9fdO5R6tLXrTvPSnf1QDAXsoXCE+fm+8qOh3IrEmDZgImhyljhCLiqajONNZ/oUwlsrKRiQkB4NOcaZh6nTWp+wxNg2kCJodBQegur87s2qZEzhuhntlCEALAJ7qmYdp31qQeMzQNpgmYHAadGnVFyu14VCfjtSP8i97Ofu9og/4IAID965qGad9Zk3rM0DSYJmBymBQGSrnLqjKN28+qUmv26N2JfNcDABgATApCEXf5sHTjNp9LzhhqPbvNznc5AID8MywIK6ozu+pFpLZGrWCYEABgXBCWV2d214vIecOtP2+zU/QJAcB4ZgWhp7zK6RFWBGRCRL3cQKcQAExnVhC6y4dlGrc5j2tHWCu20CUEANOZFYRWuEhE2dFWEZlTo57ZTI8QAExnVhCKM0zYuE1EphQrS8k7e8hCADCaeUFYUZ3e2XV2lK+YAQDTGfTNMg7f2OMS77wWOvlsEakdYd32RvbW44z7awBAfj333HNNTU35rmIgamhoyP1GjQvC4LGntT71kB1rs0KFM4aq91v1jrgMCeS7LADGuOeee+69997t27fnu5ABat68eTneonFBqHwB38Sp8bdfDU07x2PJl6utpz6yr5lEpxBAjgwZMuQnP/lJvqvAJ0wMgODUMzveeMl5/J0p1uI13FkPAOYyMQj9E09I79ic2bNTRE6pUOOL5Nfvk4QAYCgTg1C53IFjT4u/+bLz9EdTXXe8SacQAAxlYhCKSPCEMztef955fHKFmhiRxzaShABgIkOD0Ddysk4l09s/cp7eeYJr4Vt2MpvfogAAeWBoEIpSgS+c0fHGi86zE8rU5GJ5lE4hAJjH1CAUCZ5wVsffXxTd+c0yd0x13f02nUIAMI65QegZMsIKFyU/fNd5OrVMHV0sv9xApxAAzGJuEErnDYUvdj398VTXwrfseCaPFQEAcs3wIJwZX7NaZ9LO0y+UqRPK1X1r6RQCgEGMDkJXUamnamTi3b92tfxsmnX/u9n/2cWUFABgCqODUEQKzvpq6x/qdDrlPB0WUr843f31VdnWVH7rAgDkiOlB6J8w1TtifNtzv+1qOW+EOnuY+tf/5vpRADCC6UEoIpF/uqbjf/+crv+wq+WnJ7neb9NcQQoAJiAIxQoVFs6e1/y7B7ruKfS55Pdnum79e3ZdC4OFADDI5S4Im5uba2trS0pK5syZ
09zc3GPps88+O2nSpEgkMmnSpD//+c85q8oROvkryuePvrayq2VskVp4guurL2S5mwIABrfcBeGiRYtqamoaGhpGjBixePHi7ots27700ksffPDBPXv2/PjHP77iiityVlUnpYov+nbbn36TbW3qartyvHVMiZr3cjbDKVIAGLxyF4TLli2bP3++z+ebP3/+U0891X1RJpNZsmTJmWeeGYvFfD5fJBLJWVVd3BXDwqee07Lsoe6Nj85wJbPyzy+RhQAwaLlztqX6+vqamhoRcfqF3Rd5vd5zzjknGo0WFhYqpV599dV9X7569eqvf/3rPRp9Pt9PfvKT9vZ2l8uVyRzqSUx94uzkz29qWv2se8opXY2/PFG+/prvq39J/+KkpEsd4hYGlba2tlQqlUwm812IWdra2kTE4/HkuxCztLW1eb1ey+Kiipxqa2sLBoMHuHIwGPzcPxe5C0KttVLKeZDN9nJzQjgcjkaj999//3e+853XX3+9x9Ly8vJp06b1aHS73R6Px+PxWJZ1GH41eDyuf/5e+2N3uEMFnnHHd7aJ/PZ0+/+87LruDf9DJ2ctsnAvz175LsQs7Pa86Po9k+9CzHJQh/qh/O/kLgirqqq2bt06duzY+vr66urq7os+/vjjn/3sZ/fee28oFLryyivvuuuufV8+bty4q6++utd3zmQyLpfrwP9w2J8xk33/+sPdj/yw9PLv+8Ye57QFRVbOlvOey1z3hvsXp7vIQkcymQwEAn6/P9+FmKWjo+NQ/vLF5xMIBILBIEGYY85uz8GGcvf/WltbW1dXp7Wuq6ubO3eu07hq1SoRqaqq+uUvf/nyyy9rrX/3u98df/zxOatqX96aCWVXLmj61T3JD9/pagy6ZcXZ7g/a9KWrsh1cRwoAg0jugnDBggVr1qwZPnz42rVrb7vtNqdx5syZIuL1epctW/Zv//ZvpaWlS5cufeSRR3JWVa+8IyeVzru56dGFqS0buxpDbvnLbHfILac8k9nUzv2FADBI5O7UaCQSWblyZY9Gvfce9hkzZrzxxhs5K+Yz+cYdX/LP32165Ael//pD74jxnY0u+cXprofX26c+k/nVGe5Z1ZwkBYAjHqe8++SffFLxJdfvfvgH7aue6vrSGRG5aoL1xFnub76SXfS2TccQAI50BOH++CefVHnDA4k1qxt/fku2bU9X+2lD1F/nuJ762K59LrMtRhoCwBGMIPwMruKK8vmLfaMm7/r3+Yl1f+9qHxZSq2vdpw+xvrAsc/+7Nn1DADhCEYQHwHIVfuWykstual76Hy1P/l872uI0uy25+VjrtTnuZZvtGSszG1oJQwA48hCEB8o37rjKmx8St3fH3Ve1/fm/dCrhtI8pVC+c4/6no6zTVmTueNOOcXMFABxRCMKDYAULIudfVfFvD2R2bN6x8MrYX58VOysiLiXXT7Fen+te36LHP5F5eL3Nd5MCwJGCIDxo7tIhJfO+X/ovP+z4x6qGO65of/53dqxNRI4qUL+d6XruK65nNttT/l/miY8YNwSAIwBB+Dl5h48tv+aesit/kNmzc8fCbzY9tjD18ToRmVys/nC2+/5TXAvftE9cnnniIztLHgLAAJa7G+oHJc+w0cVfva7o3G/E/vps06/vtoIFweNnBI6fcfawylnD3H/YYi962775f+3vTLaummAF2NkAMPDwu/kwsEKFBWd9teDMi5Ifvht/8+VdP/2Ou3RI8Pjps48+pbZ26Ks79OI19qI16asmWP863qoO8X00ADCAEISHj1K+MUf7xhwd+adrEu+/FX/zlfYXn1S+wJSJJ/x+wtSPjj/6/77vPeapzOlDrG9NtGZVK2axAICBgCDsB5blH/8F//gviNbp7R8l1r/R/tL/K9xy9w+rR/9o5JTVeuLdfx1/TTZ8+Vjr0jFqTCF5CAD5RBD2J6U81aM81aMKzrpIJ+PJj9elNq2d9v6yE7ZsSBdWvrdz/H0vj26tGHPylFEXjfNXBPJd
LQAYiSDMEeULdHYTRcTOprZ9WLZl40lb3t/z8XPuf2xd4x3WVDKqZHjN5ImjKmtGuCLl+a4XAExBEOaD5fKOGOcdMS4sUiqi06nSbR+/vXZT/cebX3vyjQnJLSFJuSqGhYcO81QMc5dXu8ur3eVVystc8ABw+BGE+ac83sKR404fOU5EUra8tF0/u6Fl44dbRzfXT2/fPnHjqpL2ertpu+UPuUor3aVD3SVDXKWVrki5q7jcXVypvL58fwIAOIIRhAOL15Kzh6mzhxXLWcXv7Dn6T9v0z7bab+zWJx+rzi1umeHbMSazy97TkPpoXbbllUxLY3bPTuX1u4pKXZFyV2GJK1LmKip1FZZYBcWuwhKrIKJc/BcDwP7wW3LgOrpEHV2ibjrGakvLyw32C9uLf/lRZFts/PQh1oyj1WmV6vhS5bbEjrZmW5uyrbuzrU3Z1qbU5g3Z9ma7vTnb2mRHW5U/aIUjrnCRFY64CiJWuMgKF1nBAldBxAoVWaFCK1igPN58f1YAyBuC8AhQ6JHaEVbtCBGRXXF5cbv9yg5dt8HeEtUnVajThhScVll44thRBZ5eXmtHW7PRFjvamm1vtttb7FhbZsfmbLTVjrXZ0Va7o93uaBcRK1hgBQusUIEVKLCCYRUIW4GQFQhbgZDyh6xAyPKHVCBoBQosf0AsV253AAD0I4LwCFMRkEtGW5eMFhHZk5TVO+1Xd+gf/sN+s0mPLFAnlatTKtUXy9XEiHIpERGnC7j/99SppJOIdke73RG14+12PKbj0UxjvZ2I6XjMTsTseIdOdNjxdjvRoVxu5QuK158MhFyBkPIFLJ9f+UOWP6g8PuX1WYGw8vqVx2MFwsrjFbfXCoSV26O8fuXzc7YWwIDCr6QjWInvk55ixpa39+i/7tKrtut719jbYvrYEjW1TJ1Qro4vVROKlLvv71dXXp/L63NFyg5wuzqVtJMdLTsbfEp77IxOxe1kQic67ESHTiXsjvZM0w6dSuh0yo7HdDolmZTdEdWZlE4n7URctG0FQsrtUR6/8vqU26MCIWW5lD+oLEv5Q8pyKV9AuVzKFxDLZTn/+oOixAqERUT5Q6JU52tdbuUNiIjlD4rFl8gDOGgE4SDhtmRqmZpapq6dJCLSnpZ/7NZv7NZ/2qoXvmlvjemJEXV8qTq2VB1boqYUq+JDuNTUCU4rI+5AwOc/+Js6tLbjMScXdTKhsxkdj2k7qxMdnf9mMzqV0Nms3RGVbCaTSkg2ayc7RIsdj4qIjsdEtJ1KSCajsxmdiouIHY+J1srtcS6jVf6gUi5xuSxfQETE7VEen4hYXp+4PSJi+YLicomIE7TO51Juj4gol7vrZhUrEBalnHew9l6gq7wB57UiYgVCnSs4K4vatx3AQEYQDk4FHpkxVM0Y2vmLOJaRd/bot5r0W0368Q/stc26yKumlMgxJWpysZoUURMiKpSzY0EpKxjup/fW6ZROp0TETsRE25LN2sm4iEhmb3sqKZm0iNjJDslmRcROxp0JlnUqYXckRcRJYucNnTFUEZFM2k4lO7eSijuvFRE7HpW9M23Z8b0r7w3mzk+8N547n3r9yvXJiK4KhFS3yFRev3Q7e6zc7u63kKaSyVZ/wBUI9fjgTl/5U5Sy9lmt9zWd1f1B1cfor9NZ73WRiFK9beVTWzzgznr3vyQObH3+2sBhQBAaIeSWkyvUyRWdvzK0yMft+t1m/W6zPLtV/8c79oZWXRlQEyMyMaLGF6nxETUxosqPwDv4lcfrXATbf1n7OehMWu8NURHRqYTOpj95Go9p/cmslTqZEDvT7bWfpLKIZNvbfT6vlUn12ITTV/70VrXdsU+jSKZpR+9FJjq0ne17kd3rIhGt47E+Fu2tLdEhfb7802vGoyIHMYFn9782PrcDyWnbtnfk9sS7cnudcxiHyDn1cujvk3t2okOddr5ceFUOtkUQmkiJjCxQIwuUM74oIraWj9r1ey16fYv8rVH/6n17fYvW
ImOL1NhCNa5IjS2UMUVqTKGKcKvFwVNuj3PStdMhhHRs925/UZHH09slwvhcDiSnGxsbS0tLrRxm4eEKsMMVqLln+YMNO3fmZlsEIURELCWjC9Xowk+iUUSakrKhRW9s1e+36ac+lg/a7A/atNeS0YVqTKGq9nrHRtS4Ej2qUKqDzCqFI5XlD372Sv6YFQznMgiRSwQh+lTqk2mValrlpyJuV1w+bNcftul3d8rLO9WvP8p+1C67E3pEWB0VlqMKVE1Y1YTlqLA6qkCGBjvv4gCAAYsgxMGpCEhFQJ1SoZpLU4GAy+/3ikgyKx+164+jsjmqN7frP22Vj6P25qjsTughATUiLDVhNTwkw0JqeFhGhFV1UJUdgQOQAAYlghCHgc8lEyJqQkR6XPKXtmVbTG+Nyeao3hKVd5v1n7bpLVHZFtPxrAwLqaqgDA+p6pBUBVV1UIYG1fCwVAaUl1NQAHKFIEQ/8ljOVTmy7zXx8YxsjentHbI1putj8kGbfrlBGjrsbTHZEdclPqkIqGEhqQyo6qBUBtTQoAwJqMqAVIVyeKcHAAPwGwX5EXDLuCI1rkj2zUgtsisuO+N6W0x2xfW2mLzfpl/ZITvi9s64bI9pERkSVEMCUu5XVSEp90u5Xw0JSEVAlfulMnBIXxcAwDQEIQYcJVIZkMqAOqZEer29OpaRhg69My6NCV0fk8aEXtusn6+XxoTdmJAdHTqelXK/KvNLZUAqAqrML2U+VRGQioCU+lSZv/NfrnQFIAQhjkQht4wpVGMKpa9vIUlmZXdCNyZkR1x2J/TuhOxO6P9tlN0J2Z2wm5KyO6GbElLql1KfKvV3paOU+Z2nUuJTpX4p8UmJT/mZbAMY1AhCDEI+l1SHVHXnN3/1Hpa2lqakNCX07oQ0JXVTUpoS0pjQ77dJU0KakvaepOxJ6j1JcSkp8akSX2culvikuNuDYq8q7vYAwBGHIIShLNU5uCgi+/9+y1hG9iR1U0KaU7InofckxcnITe3SnJTmpL0nKc0paUnqlpQU+yTiVcU+iXgl4lURr0R83R9IkVdFvFK09wGAvCMIgc8QckvIrYbvt3/p0CItSWlO6eaktOyNxpaUtKT0xtbOB60paUlJa0paU7o11RmNhU40eqTQq4q8UuSViFcVeqTQK4UeKfCqQo8U+6TAo9I2A5vAYUYQAoeNEin2SbFPSUFXw2doTkprSrelnWiUtrRuTUlbSppTektU2tLSlpL2tN2WlpaktKV1W6pQRAo86SKvKvJKgUcKPBL2qKK9kRl2S9gjxT4Ju1XYIyG3RHxS4FEhtwT5cQd6w08GkE+dwfmJz8jO3bt3+8NFCfG0pXVLUtrT0p6WaLozStvTeltMohlpSUp72o5mJJbuzNdYRhIZifgk7FYhj4TcUuyTkFsF3VLgkSKvBN0SdKuIV0JuCbil0KMKPBJwS9gjhR4VcAu3b2Kw4tAGjjA+l4Q9Uubv6nfKAc7hZ+vOsIxlJJaRlqREM7ojI9G0tKYkltG7E/rDNollpCMj7Wm7LSXxrBOlOp6VjowUeSXgkqBbRXwScEnALRGv8rsk6JYir/hcnd1Qn8tZU/ldEvGJs0KBR/lcUsi0GRh4CELAFJY66A5oDy0piWckntXNSUlkJZ6RlpSOZyWekdaUJLPSnNJbYpLMSltKOjJ20pbmpCSz0pGRtrROZqU9LUG3k5TKa4nT6fTv87TQozxWZ7gG3RJyK68lRV5xW1LkFa8lIbdy1gQOHUEI4EBFvBLxishBd0a768hIMistKZ2yJZr+5Gnalva9T1tTOm3LB22dcRvN2GlbWpKS0dKWkpQtsYx21izwiNuSiFe5lBR5xWNJ2CN+lwRcykncsEc8lhR6lEtJxNc5jmuJFHmVy5JCj7iUFHrFraTAo9yWFNBnNQ9BCCCngm4Junt0TOVzBKqjLS1ZW5pTOmtLW1pStsTSkshKPKtjaUnZ0p6WjC1taZ3V8lG72CItSclqaUvb
mb1L29OS0dKe1mlbomlx4rB7NNqZwpJg1m3ZhR5lKSnyihKJ+EREir1KRIq8YqnOVHb6r36XBNydwaxEIt1Ww0BDEAI4gjmDjvvEqnzuZBURJw7TtkQz2onJXbtb3eFiW1RbujNxnQFXEWlOaRH5qF20dMZqLGOn7M6+rPNWtkhrSotIS1K0dHZSnW6r0ysVkYhXlJKwW3mszhPCTuKKSIFHuVVno4g439vgxK0TtF0vd8Zlu16IA0QQAsCneKzOsKnYm6Y7s5nycrEsdSj52sVJWafb6gStiDQnRUTa0zqjO0O0K2ud7qwz1OrcqyrSebrYCVoRaUmJ1hLP6kT2kxeG3OJ1dZ4HFpGQR7zWJzEZdCufJbI3WbuC1jmN3HWW2HmTrk6t19V5/bDTLiJFXmWJHNFnlQlCAMip8CeBcTg7svuKZSSVdc4DaxFxzhV3xWRHRidtkb0Z7AStiLSmtC2fRKwT21qkJWV3X815cxFpSWkt0pXoztCsSGfvVvaeEHbGYkU6r3USka7Lnbpe4nRzRaTQKy4lR4l76NDDuD/6RBACwOAU2nv3Z5m/13ztl+FK5yIm2du7lb0nhLNa2lIiIklbOjJaROIZSWRFumVqNKNjGRGRzVHJagkV5GiGboIQAHDYBN1dY5l7g7Zg37UOKIMbGlKHraz9ylHeAgAwMBGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKP1EoTr1q078cQTlyxZIiI33nhjOByeOXPm1q1bc14bAAD9rpcg/Pa3v11ZWTl79uwtW7Y89NBDTz75pNfr/e53v5v74gAA6G/ufZv+9re//eIXvygrK1u8ePHs2bO/8pWvtLa2XnvttbkvDgCA/tZLj9DtdiulRGT16tUzZswQkUAgkEqlcl0aAAD9r5cgPOmkk1asWLFu3boXX3zxoosuSqfTS9TqzyoAABDiSURBVJcuPfbYY3NfHAAA/a2XIFy8ePHzzz8/adKkiy++uKKi4tprr33xxRfvvffe3BcHAEB/62WM8Jhjjtm6devOnTurqqpEZNGiRT//+c9dLlfOawMAoN/1EoQi4na7q6urncfFxcU5rAcAgJziPkIAgNG4jxAAYDTuIwQAGI37CAEARuM+QgCA0XJ3H2Fzc3NtbW1JScmcOXOam5t7LH366aenTJkSiUSmT5++cePGQ9wWAAAHqJcgdO4j3LZt2yOPPCIiixYtqq+vP/nkkw9xS4sWLaqpqWloaBgxYsTixYu7L9qyZctll132yCOPNDQ0zJkz54orrjjEbQEAcIB6n4/QuY/QGSksLi4+LHfTL1u2bP78+T6fb/78+U899VT3RZs2bbrkkktOOeWUQCBw+eWXb9iw4dA3BwDAgej9hvoVK1YsXrx43bp1tm1Pnjz5+9///rnnnnuIW6qvr6+pqRERp1/YfdEZZ5xxxhlniEg2m12wYMHFF1+878v//ve/73sLh9frve2226LRqMvlsm37ECvEQYlGo5lMJp1O57sQs0SjUcuyPB5PvgsxSzQa9fv9lsVM5jkVjUbb29sPcOVAIOB2955on6mXlz3xxBNf+9rXbrzxxrvvvlsptXLlyrlz5/7+97+/8MILP982HFprp4uptc5ms/uu8Pzzz990002zZs268847913q8/lKSkp6Vu92q24OpTwcLHZ7XrDb84Ldnhc52+e9BOFdd9110003LVy40Hl66qmn2ra9cOHCQwzCqqqqrVu3jh07tr6+vuv72xxa61tuuWX16tVLly4dN25cry8/+uijb7/99l4X2bbtcrlCodChlIeDlU6nA4GA3+/PdyFmSSQS4XCYHmGOxWKxcDhMjzDHQqFQOBzOwYZ6+X/duHHjaaed1r1lxowZhz5uV1tbW1dXp7Wuq6ubO3eu07hq1SoRee2115YtW/bM
M89UVVVFo9FoNHqI2wIA4AD1EoQ1NTVr167t3vLuu+86w3uHYsGCBWvWrBk+fPjatWtvu+02p3HmzJkismrVqg0bNhQXFxfsdYjbAgDgAPVyavSqq666/fbbKysrnQtkVq5c+eMf//iOO+44xC1FIpGVK1f2aNRai8itt9566623HuL7AwDwOfQShNddd10mk/nud787b948ESktLV2wYMF1112X89oAAOh3vQShZVnf+973brjhhsbGRhEpLy/nWikAwGDV510XSqmKiopclgIAQO4d0NXAy5cvp1MIABiUuC0GAGA0ghAAYDSCEABgNIIQAGC0zqtG169fv5+Vtm3blpNiAADItc4gnDhxYn7rAAAgLzqD0PmqMwAATMMYIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBouQ7C5ubm2trakpKSOXPmNDc377tCNpudMGFCjqsCABgr10G4aNGimpqahoaGESNGLF68uMfS+++/f9q0aRs2bMhxVQAAY7lzvL1ly5Y9/fTTPp9v/vz5c+fOvfvuu7svPeaYY0aPHl1bW7vvC999992FCxf2aHS73ddee20sFnO5XFrrfqwb+4jFYtlsNpPJ5LsQszhHu8fjyXchZonFYoFAwLIYS8qpWCwWjUYPcGW/3+92f85Ey3UQ1tfX19TUiIjTL+yxdObMmX29MB6P79q1q0ej1+vV3Rz2arEf7Pa8YLfnBbs9L3K2z3MdhFprpZTzIJvNHvgLTzzxxPvvv7+v93S5XKFQ6PCUiAOTyWQCgYDf7893IWZJJpMFBQX0CHOso6OjoKCAHmGORaPRgoKCHGwo1/+vVVVVW7duFZH6+vrq6uocbx0AgB5yHYS1tbV1dXVa67q6urlz5zqNq1atynEZAAA4ch2ECxYsWLNmzfDhw9euXXvbbbc5jfsZGgQAoF/leowwEomsXLmyR2OP4VBGpAEAOcPYLwDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGgEIQDAaAQhAMBoBCEAwGiDIQjb2tpisVi+qzBOc3NzIpHIdxXGaWpqSqfT+a7COLt27bJtO99VGGfnzp252dBgCMK77rrrsccey3cVxrn++uufffbZfFdhnEsuueTtt9/OdxXGOeus
s3bt2pXvKsySTqdPPPHE3GxrMAQhAACfW+6CsLm5uba2tqSkZM6cOc3NzQe1FACAfpK7IFy0aFFNTU1DQ8OIESMWL158UEsBAOgn7pxtadmyZU8//bTP55s/f/7cuXPvvvvuA18qIqtXr7788st7NLpcrpNOOum9997bsWPHf/7nf/bvB8Cnbd68+fnnn29tbc13IWZpampavnz5mjVr8l2IWeLx+G9+85uioqJ8F2KQbDZr2/aB/2KfNWvWyJEjP9+2lNb6873yYIXD4cbGxkAgEI/HKysr29raDnxpNBqdNm1aPB7v8Z6WZZ1yyinJZFIp5fV6+/0zoJtEIuFyuTweT74LMUtHR4fP53O5XPkuxCyxWCwYDCql8l2IWaLRaDgcPsCVv/3tb0+dOvXzbSh3PUKttXMYaa2z2exBLQ2Hw/wJDADoD7kbI6yqqtq6dauI1NfXV1dXH9RSAAD6Se6CsLa2tq6uTmtdV1c3d+5cp3HVqlX7WQoAQH/L3RhhS0vLpZde+vbbb3/hC19YsmSJM+ysVGcBvS4FAKC/5S4IAQAYgI74b5bhTvycefrpp6dMmRKJRKZPn75x40an8dRTT1V7fetb38pvhYNSr3uYw76/qX0IR3u/yWazEyZM6N7S6xHef4f9ER+E3ImfG1u2bLnsssseeeSRhoaGOXPmXHHFFSKitV6/fv22bdva29vb29vvu+++fJc52PS1hzns+1t7N7fffvvNN9/M0d5P7r///mnTpm3YsKF7Y69HeD8e9voIN27cuHXr1mmt161bN27cuHyXM2i99NJL//Iv/+I83rVrV2lpqda6oaEhHA5PnTo1HA7PnTt3586dea1xEOprD3PY58yaNWvOOuusdDrN0d5PXnzxxRUrVvQIo16P8P477I/4IAyFQh0dHVrrjo6OgoKCfJcz+GUymW9961vXXHON1vrNN9+cOXPmm2++2dTUNG/evEsuuSTf1Q02fe1hDvvcSCaTX/ziF9euXas52vtZjyDs9Qjvv8P+iL9YJhQKNTU1+f3+jo6O8vJyJibsV88///xNN900a9asO++80+3+1LcxNDQ0TJ48ec+ePfmqbdDrvoc57HPj7rvv3r59+4MPPtijnaP9sOu6g8DR6xHef4d97r5Zpp84d+KPHTuWO/H7ldb6lltuWb169dKlS8eNG+c0/uMf/0gkEtOmTRMRr9fr8/nyWuMg1Nce5rDPgWw2+9BDD73wwgvOU472XOr1CO+/w/6Iv1iGO/Fz47XXXlu2bNkzzzxTVVUVjUaj0aiIxGKxCy64YN26dalU6o477jj//PPzXeZg09ce5rDPgRdffHH48OFjxoxxnnK051KvR3g/HvaH8TRrXjQ3N59zzjnV1dW1tbUtLS35LmfQuvPOO/c9cmzb/tnPfjZ69OiysrJ58+a1trbmu8zBpq89zGGfA1/72td+9KMfdT3laO9XPcKo1yO8/w77I36MEACAQ3HEnxoFAOBQEIQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhAAAoxGEAACjEYQAAKMRhEDuqD703+bWr1/fT28ODBpH/HyEwJHlwQcfHDZsWL6rAPAJghDIqS996UsTJkzIdxUAPsGpUQCA0QhCYKBYv369UuqDDz44++yzi4qKjj322Mcff7xrqW3b99133+TJk8Ph8NSpU5988smuRVrrBx54YNKkSYWFhaeffvqrr77ataihoeHcc8+NRCJHHXXUb3/72672d955Z/bs2SUlJUVFRbNmzWIoESYjCIGc2rRp0/p9dF/h3HPPPeOMM5YsWTJjxozLLrvsD3/4g9P+05/+9Pbbb//GN77xxBNPzJo165JLLula9OCDDy5YsODqq69+9NFHy8vLzzjjjDVr1jiLvvnNb1544YUrVqyYPn36FVdcEY1GRSSbzZ599tmlpaUPPfTQww8/7PV6582bl8N9AAwwh3G2ewD7t/8fw3Xr1onIo48+
2rX+9ddfP336dK21bdulpaW/+tWvuhZ9//vfP/30051FQ4YMefzxx532bDY7e/bsJUuWOJu7//77nXYnAtetW6e13rp1q4i89957zqLGxsZf//rX/fzRgYGLHiGQU04U9dB9hdra2q7HF1xwwXvvvScijY2NTU1N5513Xteic845x1nU1NS0Y8eOWbNmOe2WZf3xj3+87LLLnKennnqq8yAUCnW9tqqq6oorrvjiF784d+7cxYsXx+Pxr3/96/3yaYEjAUEIDFyWZWUymb4WZbNZEXFWcLlcva7WPf+6v7auru6DDz748pe//Prrr0+aNOmmm246fFUDRxiCEBhYVqxY0fV4+fLlU6ZMEZHy8vLS0tI//vGPXYtWrlzpLKqsrCwuLn7hhRecdtu2TzjhhLvuums/m2hubr7qqqtKSkrmz5//xBNPPPHEEz//+c/75cMARwLuIwRy6vnnn9/3Es3zzz+/6/GNN964c+fOSZMmvfDCCw888MDTTz8tIkqpm2+++ZprrnEWvfLKK/fee+/y5cudRTfeeONVV121Y8eOMWPG/Nd//dc777zz2GOP7aeGwsLCZ555JhaLXXzxxYlEoq6uburUqYf/owJHitwPSwLG2v+PoXOxzOuvv37qqaeGw+Gjjz666xIYrXU2m/3pT386ceLEYDB43HHHPfnkk90X3XPPPWPGjAkGgyeeeOJzzz3XtbnuQ5Ldn/7P//zPtGnTQqFQcXHxBRdcsGXLln7/8MBApXTfP5wAcmn9+vUTJ07kRxLIMcYIAQBG+/94dZbOM91OcwAAAABJRU5ErkJggg=="
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Demonstrates underfitting: training loss not close to 0\n",
"# Also slight overfitting: test loss higher than train\n",
@@ -807,13 +1341,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 42,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# this is the error plot, we get to about 7.5% test error, i.e. 92.5% accuracy\n",
"plot([lin[4,:], lin[5,:]],ylim=(.0,.12),labels=[:trnerr :tsterr],xlabel=\"Epochs\",ylabel=\"Error\")"
@@ -832,13 +1375,59 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 43,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "28×280 Array{Gray{Float64},2} with eltype Gray{Float64}:\n",
+ " Gray{Float64}(0.490824) … Gray{Float64}(0.500489)\n",
+ " Gray{Float64}(0.509466) Gray{Float64}(0.500019)\n",
+ " Gray{Float64}(0.499039) Gray{Float64}(0.490492)\n",
+ " Gray{Float64}(0.508941) Gray{Float64}(0.476173)\n",
+ " Gray{Float64}(0.494655) Gray{Float64}(0.488138)\n",
+ " Gray{Float64}(0.508286) … Gray{Float64}(0.485166)\n",
+ " Gray{Float64}(0.50035) Gray{Float64}(0.464198)\n",
+ " Gray{Float64}(0.485799) Gray{Float64}(0.508036)\n",
+ " Gray{Float64}(0.498516) Gray{Float64}(0.475252)\n",
+ " Gray{Float64}(0.490375) Gray{Float64}(0.491196)\n",
+ " Gray{Float64}(0.493878) … Gray{Float64}(0.498795)\n",
+ " Gray{Float64}(0.510057) Gray{Float64}(0.515459)\n",
+ " Gray{Float64}(0.507569) Gray{Float64}(0.510868)\n",
+ " ⋮ ⋱ \n",
+ " Gray{Float64}(0.493676) Gray{Float64}(0.49869) \n",
+ " Gray{Float64}(0.510784) Gray{Float64}(0.54785) \n",
+ " Gray{Float64}(0.513567) Gray{Float64}(0.501581)\n",
+ " Gray{Float64}(0.505647) Gray{Float64}(0.490174)\n",
+ " Gray{Float64}(0.506392) … Gray{Float64}(0.509558)\n",
+ " Gray{Float64}(0.484479) Gray{Float64}(0.491085)\n",
+ " Gray{Float64}(0.509207) Gray{Float64}(0.498947)\n",
+ " Gray{Float64}(0.500104) Gray{Float64}(0.494736)\n",
+ " Gray{Float64}(0.514525) Gray{Float64}(0.521575)\n",
+ " Gray{Float64}(0.506117) … Gray{Float64}(0.514865)\n",
+ " Gray{Float64}(0.509232) Gray{Float64}(0.490231)\n",
+ " Gray{Float64}(0.495311) Gray{Float64}(0.487005)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "\"Epoch 99\""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"# Let us visualize the evolution of the weight matrix as images below\n",
"# Each row is turned into a 28x28 image with positive weights light and negative weights dark gray\n",
diff --git a/tutorial/40.mlp.ipynb b/tutorial/40.mlp.ipynb
index 5e5c09117..6df528f8c 100644
--- a/tutorial/40.mlp.ipynb
+++ b/tutorial/40.mlp.ipynb
@@ -9,31 +9,28 @@
},
"source": [
"# Multilayer Perceptron (MLP) \n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "source": [
+ "(c) Deniz Yuret, 2019\n",
"* Objectives: Stack multiple layers, demonstrate the power of nonlinear activation functions, use regularization and dropout to address overfitting.\n",
"* Prerequisites: [Linear models](30.lin.ipynb), [MNIST](20.mnist.ipynb)\n",
"* New functions: \n",
- "[relu](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.relu), \n",
- "[dropout](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.dropout), \n",
- "[param, param0](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.param), \n",
- "[xavier](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.xavier), \n",
- "[training](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.training), \n",
- "[gc](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.gc)"
+ "[relu](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.relu), \n",
+ "[dropout](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.dropout), \n",
+ "[param, param0](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.param), \n",
+ "[xavier](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.xavier), \n",
+ "[training](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.training), \n",
+ "[gc](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.gc)\n",
+ "\n",
+ "![image](https://github.com/denizyuret/Knet.jl/blob/master/docs/src/images/neural_net2.jpeg?raw=true)\n",
+ "([image source](http://cs231n.github.io/neural-networks-1))\n",
+ "\n",
+ "A multilayer perceptron, i.e. a fully connected feed-forward neural network, is basically a bunch of linear layers stuck together with non-linearities in between. In this notebook we will first chain together a couple of linear layers and see that this has no effect on performance (multiple linear layers still compute a linear function). However, with a one-line change (adding an elementwise nonlinear function between layers) the accuracy will go from 92% to 98%! It turns out the class of functions that can be represented by linear models is severely restricted and does not contain good MNIST classifiers. MLPs have no such restrictions: being **universal function approximators**, they can represent any classifier given large enough layers and learn the correct classifier with sufficient data. \n",
+ "\n",
+ "Simple models with low capacity, e.g. the linear model, have an **underfitting** problem: the error does not drop to 0 even on the training set. Models with high capacity, e.g. MLP, have an **overfitting** problem: even when the training set error goes to 0, the test set error remains high. We will implement two methods to fight overfitting: **regularization** and **dropout**."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -51,13 +48,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading MNIST...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/mnist.jl:33\n"
+ ]
+ }
+ ],
"source": [
"# Load data (see mnist.ipynb)\n",
"include(Knet.dir(\"data\",\"mnist.jl\")) # Load data\n",
@@ -66,13 +72,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "trainresults (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# For running experiments\n",
"function trainresults(file,model; o...)\n",
@@ -94,13 +111,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "atype (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Some utilities to make model definitions easier:\n",
"param(d...; init=xavier, atype=atype())=Param(atype(init(d...)))\n",
@@ -122,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -153,7 +181,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -169,14 +197,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(\"64×784 Param{KnetArray{Float32,2}}\", \"64-element Param{KnetArray{Float32,1}}\")\n",
+ "(\"10×64 Param{KnetArray{Float32,2}}\", \"10-element Param{KnetArray{Float32,1}}\")\n"
+ ]
+ }
+ ],
"source": [
"# Here is an example two layer model\n",
"model=Chain(Layer0(784,64), Layer0(64,10))\n",
@@ -185,30 +222,49 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "2.04e-01 100.00%┣████████████████┫ 60000/60000 [00:51/00:51, 1179.52i/s]\n",
+ "Float32[0.240742; 0.282535; 0.0691333; 0.0794]\n"
+ ]
+ }
+ ],
"source": [
"# Train the two layer model\n",
- "# 52s [0.240726; 0.281965; 0.0691833; 0.0794]\n",
+ "# 2.04e-01 100.00%┣████████████████┫ 60000/60000 [00:51/00:51, 1179.52i/s]\n",
+ "# [0.240742; 0.282535; 0.0691333; 0.0794]\n",
"mlp1 = trainresults(\"mlp113a.jld2\", model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(\"10×784 Param{KnetArray{Float32,2}}\", \"10-element Param{KnetArray{Float32,1}}\")\n"
+ ]
+ }
+ ],
"source": [
"# Here is a single layer (linear) model\n",
"model=Chain(Layer0(784,10))\n",
@@ -217,22 +273,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "2.30e-01 100.00%┣████████████████┫ 60000/60000 [00:40/00:40, 1502.26i/s]\n",
+ "Float32[0.242363; 0.267073; 0.0671333; 0.0746]\n"
+ ]
+ }
+ ],
"source": [
"# Train the single layer (linear) model\n",
- "# 43s [0.242353; 0.267041; 0.0669667; 0.0749]\n",
+ "# 2.30e-01 100.00%┣████████████████┫ 60000/60000 [00:40/00:40, 1502.26i/s]\n",
+ "# [0.242363; 0.267073; 0.0671333; 0.0746]\n",
"lin1 = trainresults(\"mlp113b.jld2\", model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -245,13 +312,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# multilinear converges to a similar solution, not identical because problem is non-convex\n",
"plot([lin1[1,:], lin1[2,:], mlp1[1,:], mlp1[2,:]], ylim=(0.0,0.4),\n",
@@ -260,13 +336,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# error results also close to the linear model\n",
"plot([lin1[3,:], lin1[4,:], mlp1[3,:], mlp1[4,:]], ylim=(0.0,0.1),\n",
@@ -304,7 +389,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -321,18 +406,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "2.83e-02 100.00%┣████████████████┫ 60000/60000 [00:52/00:52, 1150.34i/s]\n",
+ "Float32[0.00636037; 0.084129; 0.000516667; 0.0243]\n"
+ ]
+ }
+ ],
"source": [
"# We add a nonlinear activation function to all but the last layer\n",
+ "# 2.83e-02 100.00%┣████████████████┫ 60000/60000 [00:52/00:52, 1150.34i/s]\n",
+ "# [0.00636037; 0.084129; 0.000516667; 0.0243]\n",
"model = Chain(Layer1(784,64), Layer1(64,10,identity))\n",
- "# 54s [0.00612065; 0.0864965; 0.00055; 0.0244]\n",
"mlp2 = trainresults(\"mlp113c.jld2\", model);"
]
},
@@ -349,13 +445,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# MLP solves the underfitting problem! A more serious overfitting problem remains.\n",
"plot([lin1[1,:], lin1[2,:], mlp2[1,:], mlp2[2,:]], ylim=(0.0,0.4),\n",
@@ -364,13 +469,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Test error improves from 7.5% to 2.5%!\n",
"plot([lin1[3,:], lin1[4,:], mlp2[3,:], mlp2[4,:]], ylim=(0.0,0.1),\n",
@@ -390,7 +504,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -411,7 +525,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -432,45 +546,74 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "1.94e-01 100.00%┣████████████████┫ 60000/60000 [00:57/00:57, 1058.05i/s]\n",
+ "Float32[0.026522; 0.0802438; 0.00688333; 0.0231]\n"
+ ]
+ }
+ ],
"source": [
+ "# 1.94e-01 100.00%┣████████████████┫ 60000/60000 [00:57/00:57, 1058.05i/s]\n",
+ "# [0.026522; 0.0802438; 0.00688333; 0.0231]\n",
"model = Chain2(Layer1(784,64), Layer1(64,10,identity); λ1=4f-5)\n",
- "# 61s [0.0259648; 0.0722113; 0.00625; 0.0212]\n",
"mlp3 = trainresults(\"mlp113d.jld2\", model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# overfitting less, test loss improves from 0.0865 to 0.0722\n",
+ "# overfitting less, test loss improves from 0.0841 to 0.0802\n",
"plot([mlp2[1,:], mlp2[2,:], mlp3[1,:], mlp3[2,:]], ylim=(0.0,0.15),\n",
" labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel=\"Epochs\",ylabel=\"Loss\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# test error also improves: 0.0244 -> 0.0212\n",
+ "# test error also improves: 0.0243 -> 0.0231\n",
"plot([mlp2[3,:], mlp2[4,:], mlp3[3,:], mlp3[4,:]], ylim=(0.0,0.04),\n",
" labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel=\"Epochs\",ylabel=\"Error\")"
]
@@ -488,20 +631,53 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 33,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "\\begin{verbatim}\n",
+ "dropout(x, p; drop, seed)\n",
+ "\\end{verbatim}\n",
+ "Given an array \\texttt{x} and probability \\texttt{0<=p<=1} return an array \\texttt{y} in which each element is 0 with probability \\texttt{p} or \\texttt{x[i]/(1-p)} with probability \\texttt{1-p}. Just return \\texttt{x} if \\texttt{p==0}, or \\texttt{drop=false}. By default \\texttt{drop=true} in a \\texttt{@diff} context, \\texttt{drop=false} otherwise. Specify a non-zero \\texttt{seed::Number} to set the random number seed for reproducible results. See \\href{http://www.jmlr.org/papers/v15/srivastava14a.html}{(Srivastava et al. 2014)} for a reference.\n",
+ "\n"
+ ],
+ "text/markdown": [
+ "```\n",
+ "dropout(x, p; drop, seed)\n",
+ "```\n",
+ "\n",
+ "Given an array `x` and probability `0<=p<=1` return an array `y` in which each element is 0 with probability `p` or `x[i]/(1-p)` with probability `1-p`. Just return `x` if `p==0`, or `drop=false`. By default `drop=true` in a `@diff` context, `drop=false` otherwise. Specify a non-zero `seed::Number` to set the random number seed for reproducible results. See [(Srivastava et al. 2014)](http://www.jmlr.org/papers/v15/srivastava14a.html) for a reference.\n"
+ ],
+ "text/plain": [
+ "\u001b[36m dropout(x, p; drop, seed)\u001b[39m\n",
+ "\n",
+ " Given an array \u001b[36mx\u001b[39m and probability \u001b[36m0<=p<=1\u001b[39m return an array \u001b[36my\u001b[39m in which\n",
+ " each element is 0 with probability \u001b[36mp\u001b[39m or \u001b[36mx[i]/(1-p)\u001b[39m with probability\n",
+ " \u001b[36m1-p\u001b[39m. Just return \u001b[36mx\u001b[39m if \u001b[36mp==0\u001b[39m, or \u001b[36mdrop=false\u001b[39m. By default \u001b[36mdrop=true\u001b[39m in a\n",
+ " \u001b[36m@diff\u001b[39m context, \u001b[36mdrop=false\u001b[39m otherwise. Specify a non-zero \u001b[36mseed::Number\u001b[39m\n",
+ " to set the random number seed for reproducible results. See\n",
+ " (Srivastava et al. 2014)\n",
+ " (http://www.jmlr.org/papers/v15/srivastava14a.html) for a reference."
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"@doc dropout"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -517,71 +693,122 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "1.06e-01 100.00%┣████████████████┫ 60000/60000 [00:53/00:53, 1142.58i/s]\n",
+ "Float32[0.0144843; 0.067272; 0.00376667; 0.0189]\n"
+ ]
+ }
+ ],
"source": [
+ "# 1.06e-01 100.00%┣████████████████┫ 60000/60000 [00:53/00:53, 1142.58i/s]\n",
+ "# [0.0144843; 0.067272; 0.00376667; 0.0189]\n",
"model = Chain(Layer2(784,64,pdrop=0.2), Layer2(64,10,identity))\n",
- "# 55s [0.0134416; 0.0672397; 0.00371667; 0.0193]\n",
"mlp4 = trainresults(\"mlp113e.jld2\", model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# overfitting less, loss results improve 0.0865 -> 0.0672\n",
+ "# overfitting less, loss results improve 0.0841 -> 0.0673\n",
"plot([mlp2[1,:], mlp2[2,:], mlp4[1,:], mlp4[2,:]], ylim=(0.0,0.15),\n",
" labels=[:trnMLP :tstMLP :trnDrop :tstDrop],xlabel=\"Epochs\",ylabel=\"Loss\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# this time error also improves 0.0244 -> 0.0193\n",
+ "# this time error also improves 0.0243 -> 0.0189\n",
"plot([mlp2[3,:], mlp2[4,:], mlp4[3,:], mlp4[4,:]], ylim=(0.0,0.04),\n",
" labels=[:trnMLP :tstMLP :trnDrop :tstDrop],xlabel=\"Epochs\",ylabel=\"Error\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(mlperr = 0.0243f0, L1err = 0.0231f0, dropouterr = 0.0189f0)"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"(mlperr=minimum(mlp2[4,:]),L1err=minimum(mlp3[4,:]),dropouterr=minimum(mlp4[4,:]))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(mlploss = 0.084129006f0, L1loss = 0.08024382f0, dropoutloss = 0.06727201f0)"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"(mlploss=minimum(mlp2[2,:]),L1loss=minimum(mlp3[2,:]),dropoutloss=minimum(mlp4[2,:]))"
]
@@ -599,29 +826,49 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "5.80e-02 100.00%┣████████████████┫ 60000/60000 [00:54/00:54, 1101.25i/s]\n",
+ "Float32[0.0035497; 0.0479644; 0.0003; 0.0149]\n"
+ ]
+ }
+ ],
"source": [
"# The current trend is to use models with higher capacity tempered with dropout\n",
+ "# 5.80e-02 100.00%┣████████████████┫ 60000/60000 [00:54/00:54, 1101.25i/s]\n",
+ "# [0.0035497; 0.0479644; 0.0003; 0.0149]\n",
"model = Chain(Layer2(784,256,pdrop=0.2), Layer2(256,10,identity))\n",
- "# 56s [0.00393102; 0.0491462; 0.0004; 0.0154]\n",
"mlp = trainresults(\"mlp113f.jld2\", model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 31,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Both train and test loss is better with the larger model\n",
"plot([mlp4[1,:], mlp4[2,:], mlp[1,:], mlp[2,:]],ylim=(0,0.15),\n",
@@ -630,13 +877,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 32,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# We are down to 1.5% error.\n",
"plot([mlp4[3,:], mlp4[4,:], mlp[3,:], mlp[4,:]],ylim=(0,0.04),\n",
diff --git a/tutorial/50.cnn.ipynb b/tutorial/50.cnn.ipynb
index bc373421f..65892ed06 100644
--- a/tutorial/50.cnn.ipynb
+++ b/tutorial/50.cnn.ipynb
@@ -9,28 +9,25 @@
},
"source": [
"# Convolutional Neural Networks\n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "source": [
+ "(c) Deniz Yuret, 2019\n",
"* Objectives: See the effect of sparse and shared weights implemented by convolutional networks.\n",
"* Prerequisites: [MLP models](40.mlp.ipynb), [MNIST](20.mnist.ipynb)\n",
"* New functions:\n",
- "[conv4](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.conv4),\n",
- "[pool](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.pool),\n",
- "[mat](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.mat)"
+ "[conv4](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.conv4),\n",
+ "[pool](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.pool),\n",
+ "[mat](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.mat)\n",
+ "\n",
+ "![image](https://github.com/denizyuret/Knet.jl/blob/master/docs/src/images/le_net.png?raw=true)\n",
+ "([image source](http://www.dataiku.com/blog/2015/08/18/Deep_Learning.html))\n",
+ "\n",
+ "To improve the performance further, we can use a convolutional neural network (CNN). See the [course notes](http://cs231n.github.io/convolutional-networks/) by Andrej Karpathy for a good introduction to CNNs. We will implement the [LeNet](http://yann.lecun.com/exdb/lenet) model, which consists of two convolutional layers followed by two fully connected layers. We will describe and use the [conv4](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.conv4) and [pool](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.pool) functions provided by Knet for the implementation of convolutional nets.\n",
+ "\n",
+ "Even though MLPs and CNNs are both universal function approximators and both achieve 0 error on the training set, we will see that a CNN converges a lot faster and generalizes a lot better with less overfitting, achieving 99.5% test accuracy on MNIST. The sparse connectivity and shared weights of a CNN give it an inductive bias appropriate for image features, allowing it to learn better with less data."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -59,13 +56,118 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "\\begin{verbatim}\n",
+ "conv4(w, x; kwargs...)\n",
+ "\\end{verbatim}\n",
+ "Execute convolutions or cross-correlations using filters specified with \\texttt{w} over tensor \\texttt{x}.\n",
+ "\n",
+ "Currently KnetArray\\{Float32/64,4/5\\} and Array\\{Float32/64,4\\} are supported as \\texttt{w} and \\texttt{x}. If \\texttt{w} has dimensions \\texttt{(W1,W2,...,I,O)} and \\texttt{x} has dimensions \\texttt{(X1,X2,...,I,N)}, the result \\texttt{y} will have dimensions \\texttt{(Y1,Y2,...,O,N)} where\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "Yi=1+floor((Xi+2*padding[i]-Wi)/stride[i])\n",
+ "\\end{verbatim}\n",
+ "Here \\texttt{I} is the number of input channels, \\texttt{O} is the number of output channels, \\texttt{N} is the number of instances, and \\texttt{Wi,Xi,Yi} are spatial dimensions. \\texttt{padding} and \\texttt{stride} are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.\n",
+ "\n",
+ "\\section{Keywords}\n",
+ "\\begin{itemize}\n",
+ "\\item \\texttt{padding=0}: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{stride=1}: the number of elements to slide to reach the next filtering window.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{upscale=1}: upscale factor for each dimension.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{mode=0}: 0 for convolution and 1 for cross-correlation.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{alpha=1}: can be used to scale the result.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{handle}: handle to a previously created cuDNN context. Defaults to a Knet allocated handle.\n",
+ "\n",
+ "\\end{itemize}\n"
+ ],
+ "text/markdown": [
+ "```\n",
+ "conv4(w, x; kwargs...)\n",
+ "```\n",
+ "\n",
+ "Execute convolutions or cross-correlations using filters specified with `w` over tensor `x`.\n",
+ "\n",
+ "Currently KnetArray{Float32/64,4/5} and Array{Float32/64,4} are supported as `w` and `x`. If `w` has dimensions `(W1,W2,...,I,O)` and `x` has dimensions `(X1,X2,...,I,N)`, the result `y` will have dimensions `(Y1,Y2,...,O,N)` where\n",
+ "\n",
+ "```\n",
+ "Yi=1+floor((Xi+2*padding[i]-Wi)/stride[i])\n",
+ "```\n",
+ "\n",
+ "Here `I` is the number of input channels, `O` is the number of output channels, `N` is the number of instances, and `Wi,Xi,Yi` are spatial dimensions. `padding` and `stride` are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.\n",
+ "\n",
+ "# Keywords\n",
+ "\n",
+ " * `padding=0`: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.\n",
+ " * `stride=1`: the number of elements to slide to reach the next filtering window.\n",
+ " * `upscale=1`: upscale factor for each dimension.\n",
+ " * `mode=0`: 0 for convolution and 1 for cross-correlation.\n",
+ " * `alpha=1`: can be used to scale the result.\n",
+ " * `handle`: handle to a previously created cuDNN context. Defaults to a Knet allocated handle.\n"
+ ],
+ "text/plain": [
+ "\u001b[36m conv4(w, x; kwargs...)\u001b[39m\n",
+ "\n",
+ " Execute convolutions or cross-correlations using filters specified\n",
+ " with \u001b[36mw\u001b[39m over tensor \u001b[36mx\u001b[39m.\n",
+ "\n",
+ " Currently KnetArray{Float32/64,4/5} and Array{Float32/64,4} are\n",
+ " supported as \u001b[36mw\u001b[39m and \u001b[36mx\u001b[39m. If \u001b[36mw\u001b[39m has dimensions \u001b[36m(W1,W2,...,I,O)\u001b[39m and \u001b[36mx\u001b[39m has\n",
+ " dimensions \u001b[36m(X1,X2,...,I,N)\u001b[39m, the result \u001b[36my\u001b[39m will have dimensions\n",
+ " \u001b[36m(Y1,Y2,...,O,N)\u001b[39m where\n",
+ "\n",
+ "\u001b[36m Yi=1+floor((Xi+2*padding[i]-Wi)/stride[i])\u001b[39m\n",
+ "\n",
+ " Here \u001b[36mI\u001b[39m is the number of input channels, \u001b[36mO\u001b[39m is the number of output\n",
+ " channels, \u001b[36mN\u001b[39m is the number of instances, and \u001b[36mWi,Xi,Yi\u001b[39m are spatial\n",
+ " dimensions. \u001b[36mpadding\u001b[39m and \u001b[36mstride\u001b[39m are keyword arguments that can be\n",
+ " specified as a single number (in which case they apply to all\n",
+ " dimensions), or an array/tuple with entries for each spatial\n",
+ " dimension.\n",
+ "\n",
+ "\u001b[1m Keywords\u001b[22m\n",
+ "\u001b[1m ≡≡≡≡≡≡≡≡≡≡\u001b[22m\n",
+ "\n",
+ " • \u001b[36mpadding=0\u001b[39m: the number of extra zeros implicitly\n",
+ " concatenated at the start and at the end of each\n",
+ " dimension.\n",
+ "\n",
+ " • \u001b[36mstride=1\u001b[39m: the number of elements to slide to reach the\n",
+ " next filtering window.\n",
+ "\n",
+ " • \u001b[36mupscale=1\u001b[39m: upscale factor for each dimension.\n",
+ "\n",
+ " • \u001b[36mmode=0\u001b[39m: 0 for convolution and 1 for cross-correlation.\n",
+ "\n",
+ " • \u001b[36malpha=1\u001b[39m: can be used to scale the result.\n",
+ "\n",
+ " • \u001b[36mhandle\u001b[39m: handle to a previously created cuDNN context.\n",
+ " Defaults to a Knet allocated handle."
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Convolution operator in Knet\n",
"@doc conv4"
@@ -73,13 +175,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w = [1.0; 2.0; 3.0]\n",
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]\n",
+ "y = conv4(w, x) = [10.0; 16.0; 22.0; 28.0; 34.0]\n"
+ ]
+ }
+ ],
"source": [
"# Convolution in 1-D\n",
"w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w\n",
@@ -89,13 +201,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w = [1.0; 2.0; 3.0]\n",
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]\n",
+ "y2 = conv4(w, x, padding=(1, 0)) = [4.0; 10.0; 16.0; 22.0; 28.0; 34.0; 32.0]\n"
+ ]
+ }
+ ],
"source": [
"# Padding\n",
"w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w\n",
@@ -106,13 +228,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w = [1.0; 2.0; 3.0]\n",
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]\n",
+ "y3 = conv4(w, x; padding=(1, 0), stride=3) = [4.0; 22.0; 32.0]\n"
+ ]
+ }
+ ],
"source": [
"# Stride\n",
"w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w\n",
@@ -122,13 +254,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w = [1.0; 2.0; 3.0]\n",
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]\n",
+ "y4 = conv4(w, x, mode=0) = [10.0; 16.0; 22.0; 28.0; 34.0]\n",
+ "y5 = conv4(w, x, mode=1) = [14.0; 20.0; 26.0; 32.0; 38.0]\n"
+ ]
+ }
+ ],
"source": [
"# Mode\n",
"w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w\n",
@@ -139,13 +282,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×3×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0 4.0 7.0\n",
+ " 2.0 5.0 8.0\n",
+ " 3.0 6.0 9.0"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Convolution in more dimensions\n",
"x = reshape([1.0:9.0...], (3,3,1,1))"
@@ -153,39 +311,87 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×2×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0 3.0\n",
+ " 2.0 4.0"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"w = reshape([1.0:4.0...], (2,2,1,1))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×2×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 23.0 53.0\n",
+ " 33.0 63.0"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"y = conv4(w, x)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×3×2×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0 4.0 7.0\n",
+ " 2.0 5.0 8.0\n",
+ " 3.0 6.0 9.0\n",
+ "\n",
+ "[:, :, 2, 1] =\n",
+ " 10.0 13.0 16.0\n",
+ " 11.0 14.0 17.0\n",
+ " 12.0 15.0 18.0"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Convolution with multiple channels, filters, and instances\n",
"# size X = [X1,X2,...,Xd,Cx,N] where d is the number of dimensions, Cx is channels, N is instances\n",
@@ -194,7 +400,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -208,13 +414,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×2×3×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 328.0 436.0\n",
+ " 364.0 472.0\n",
+ "\n",
+ "[:, :, 2, 1] =\n",
+ " 808.0 1108.0\n",
+ " 908.0 1208.0\n",
+ "\n",
+ "[:, :, 3, 1] =\n",
+ " 1288.0 1780.0\n",
+ " 1452.0 1944.0"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# size Y = [Y1,Y2,...,Yd,Cy,N] where Yi = 1 + floor((Xi+2Pi-Wi)/Si), Cy is channels, N is instances\n",
"y = conv4(w,x)"
@@ -244,13 +472,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "\\begin{verbatim}\n",
+ "pool(x; kwargs...)\n",
+ "\\end{verbatim}\n",
+ "Compute pooling of input values (i.e., the maximum or average of several adjacent values) to produce an output with smaller height and/or width.\n",
+ "\n",
+ "Currently 4 or 5 dimensional KnetArrays with \\texttt{Float32} or \\texttt{Float64} entries are supported. If \\texttt{x} has dimensions \\texttt{(X1,X2,...,I,N)}, the result \\texttt{y} will have dimensions \\texttt{(Y1,Y2,...,I,N)} where\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "Yi=1+floor((Xi+2*padding[i]-window[i])/stride[i])\n",
+ "\\end{verbatim}\n",
+ "Here \\texttt{I} is the number of input channels, \\texttt{N} is the number of instances, and \\texttt{Xi,Yi} are spatial dimensions. \\texttt{window}, \\texttt{padding} and \\texttt{stride} are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.\n",
+ "\n",
+ "\\section{Keywords:}\n",
+ "\\begin{itemize}\n",
+ "\\item \\texttt{window=2}: the pooling window size for each dimension.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{padding=0}: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{stride=window}: the number of elements to slide to reach the next pooling window.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{mode=0}: 0 for max, 1 for average including padded values, 2 for average excluding padded values.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{maxpoolingNanOpt=0}: Nan numbers are not propagated if 0, they are propagated if 1.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{alpha=1}: can be used to scale the result.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{handle}: Handle to a previously created cuDNN context. Defaults to a Knet allocated handle.\n",
+ "\n",
+ "\\end{itemize}\n"
+ ],
+ "text/markdown": [
+ "```\n",
+ "pool(x; kwargs...)\n",
+ "```\n",
+ "\n",
+ "Compute pooling of input values (i.e., the maximum or average of several adjacent values) to produce an output with smaller height and/or width.\n",
+ "\n",
+ "Currently 4 or 5 dimensional KnetArrays with `Float32` or `Float64` entries are supported. If `x` has dimensions `(X1,X2,...,I,N)`, the result `y` will have dimensions `(Y1,Y2,...,I,N)` where\n",
+ "\n",
+ "```\n",
+ "Yi=1+floor((Xi+2*padding[i]-window[i])/stride[i])\n",
+ "```\n",
+ "\n",
+ "Here `I` is the number of input channels, `N` is the number of instances, and `Xi,Yi` are spatial dimensions. `window`, `padding` and `stride` are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.\n",
+ "\n",
+ "# Keywords:\n",
+ "\n",
+ " * `window=2`: the pooling window size for each dimension.\n",
+ " * `padding=0`: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.\n",
+ " * `stride=window`: the number of elements to slide to reach the next pooling window.\n",
+ " * `mode=0`: 0 for max, 1 for average including padded values, 2 for average excluding padded values.\n",
+ " * `maxpoolingNanOpt=0`: Nan numbers are not propagated if 0, they are propagated if 1.\n",
+ " * `alpha=1`: can be used to scale the result.\n",
+ " * `handle`: Handle to a previously created cuDNN context. Defaults to a Knet allocated handle.\n"
+ ],
+ "text/plain": [
+ "\u001b[36m pool(x; kwargs...)\u001b[39m\n",
+ "\n",
+ " Compute pooling of input values (i.e., the maximum or average of\n",
+ " several adjacent values) to produce an output with smaller height\n",
+ " and/or width.\n",
+ "\n",
+ " Currently 4 or 5 dimensional KnetArrays with \u001b[36mFloat32\u001b[39m or \u001b[36mFloat64\u001b[39m\n",
+ " entries are supported. If \u001b[36mx\u001b[39m has dimensions \u001b[36m(X1,X2,...,I,N)\u001b[39m, the\n",
+ " result \u001b[36my\u001b[39m will have dimensions \u001b[36m(Y1,Y2,...,I,N)\u001b[39m where\n",
+ "\n",
+ "\u001b[36m Yi=1+floor((Xi+2*padding[i]-window[i])/stride[i])\u001b[39m\n",
+ "\n",
+ " Here \u001b[36mI\u001b[39m is the number of input channels, \u001b[36mN\u001b[39m is the number of\n",
+ " instances, and \u001b[36mXi,Yi\u001b[39m are spatial dimensions. \u001b[36mwindow\u001b[39m, \u001b[36mpadding\u001b[39m and\n",
+ " \u001b[36mstride\u001b[39m are keyword arguments that can be specified as a single\n",
+ " number (in which case they apply to all dimensions), or an\n",
+ " array/tuple with entries for each spatial dimension.\n",
+ "\n",
+ "\u001b[1m Keywords:\u001b[22m\n",
+ "\u001b[1m ≡≡≡≡≡≡≡≡≡≡≡\u001b[22m\n",
+ "\n",
+ " • \u001b[36mwindow=2\u001b[39m: the pooling window size for each dimension.\n",
+ "\n",
+ " • \u001b[36mpadding=0\u001b[39m: the number of extra zeros implicitly\n",
+ " concatenated at the start and at the end of each\n",
+ " dimension.\n",
+ "\n",
+ " • \u001b[36mstride=window\u001b[39m: the number of elements to slide to reach\n",
+ " the next pooling window.\n",
+ "\n",
+ " • \u001b[36mmode=0\u001b[39m: 0 for max, 1 for average including padded values,\n",
+ " 2 for average excluding padded values.\n",
+ "\n",
+ " • \u001b[36mmaxpoolingNanOpt=0\u001b[39m: Nan numbers are not propagated if 0,\n",
+ " they are propagated if 1.\n",
+ "\n",
+ " • \u001b[36malpha=1\u001b[39m: can be used to scale the result.\n",
+ "\n",
+ " • \u001b[36mhandle\u001b[39m: Handle to a previously created cuDNN context.\n",
+ " Defaults to a Knet allocated handle."
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Pooling operator in Knet\n",
"@doc pool"
@@ -258,13 +598,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0]\n",
+ "pool(x) = [2.0; 4.0; 6.0]\n"
+ ]
+ }
+ ],
"source": [
"# 1-D pooling example\n",
"x = reshape([1.0:6.0...], (6,1,1,1)); @show x\n",
@@ -273,13 +622,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0]\n",
+ "pool(x; window=3) = [3.0; 6.0]\n"
+ ]
+ }
+ ],
"source": [
"# Window size\n",
"x = reshape([1.0:6.0...], (6,1,1,1)); @show x\n",
@@ -288,13 +646,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0]\n",
+ "pool(x; padding=(1, 0)) = [1.0; 3.0; 5.0; 6.0]\n"
+ ]
+ }
+ ],
"source": [
"# Padding\n",
"x = reshape([1.0:6.0...], (6,1,1,1)); @show x\n",
@@ -303,13 +670,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0; 8.0; 9.0; 10.0]\n",
+ "pool(x; stride=4) = [2.0; 6.0; 10.0]\n"
+ ]
+ }
+ ],
"source": [
"# Stride\n",
"x = reshape([1.0:10.0...], (10,1,1,1)); @show x\n",
@@ -318,13 +694,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x = K[1.0; 2.0; 3.0; 4.0; 5.0; 6.0]\n",
+ "pool(x; padding=(1, 0), mode=0) = K[1.0; 3.0; 5.0; 6.0]\n",
+ "pool(x; padding=(1, 0), mode=1) = K[0.5; 2.5; 4.5; 3.0]\n",
+ "pool(x; padding=(1, 0), mode=2) = K[1.0; 2.5; 4.5; 6.0]\n"
+ ]
+ }
+ ],
"source": [
"# Mode (using KnetArray here; not all modes are implemented on the CPU)\n",
"x = KnetArray(reshape([1.0:6.0...], (6,1,1,1))); @show x\n",
@@ -335,13 +722,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4×4×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0 5.0 9.0 13.0\n",
+ " 2.0 6.0 10.0 14.0\n",
+ " 3.0 7.0 11.0 15.0\n",
+ " 4.0 8.0 12.0 16.0"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# More dimensions\n",
"x = reshape([1.0:16.0...], (4,4,1,1))"
@@ -349,26 +752,62 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×2×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 6.0 14.0\n",
+ " 8.0 16.0"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pool(x)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4×4×2×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0 5.0 9.0 13.0\n",
+ " 2.0 6.0 10.0 14.0\n",
+ " 3.0 7.0 11.0 15.0\n",
+ " 4.0 8.0 12.0 16.0\n",
+ "\n",
+ "[:, :, 2, 1] =\n",
+ " 17.0 21.0 25.0 29.0\n",
+ " 18.0 22.0 26.0 30.0\n",
+ " 19.0 23.0 27.0 31.0\n",
+ " 20.0 24.0 28.0 32.0"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Multiple channels and instances\n",
"x = reshape([1.0:32.0...], (4,4,2,1))"
@@ -376,13 +815,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×2×2×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 6.0 14.0\n",
+ " 8.0 16.0\n",
+ "\n",
+ "[:, :, 2, 1] =\n",
+ " 22.0 30.0\n",
+ " 24.0 32.0"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# each channel and each instance is pooled separately\n",
"pool(x) # size Y = (Y1,...,Yd,Cx,N) where Yi are spatial dims, Cx and N are identical to input X"
@@ -401,13 +858,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading MNIST...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/mnist.jl:33\n"
+ ]
+ }
+ ],
"source": [
"# Load data (see mnist.ipynb)\n",
"include(Knet.dir(\"data\",\"mnist.jl\")) # Load data\n",
@@ -416,13 +882,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "28×28×1×100 KnetArray{Float32,4}\n",
+ "100-element Array{UInt8,1}\n"
+ ]
+ }
+ ],
"source": [
"(x,y) = first(dtst)\n",
"println.(summary.((x,y)));"
@@ -430,13 +905,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "trainresults (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# For running experiments\n",
"function trainresults(file,model; o...)\n",
@@ -469,13 +955,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Conv"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Define a convolutional layer:\n",
"struct Conv; w; b; f; p; end\n",
@@ -485,13 +982,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Dense"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Redefine dense layer (See mlp.ipynb):\n",
"struct Dense; w; b; f; p; end\n",
@@ -501,7 +1009,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -521,14 +1029,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4-element Array{String,1}:\n",
+ " \"5×5×1×20 AutoGrad.Param{KnetArray{Float32,4}}\" \n",
+ " \"5×5×20×50 AutoGrad.Param{KnetArray{Float32,4}}\"\n",
+ " \"500×800 AutoGrad.Param{KnetArray{Float32,2}}\" \n",
+ " \"10×500 AutoGrad.Param{KnetArray{Float32,2}}\" "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"lenet = Chain(Conv(5,5,1,20), \n",
" Conv(5,5,20,50), \n",
@@ -539,13 +1062,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.2857893f0"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"lenet(x,y)"
]
@@ -563,21 +1097,32 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 31,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "1.08e-02 100.00%┣████████████████▉┫ 60000/60000 [03:50/03:50, 260.67i/s]\n",
+ "Float32[0.000135032; 0.0196918; 0.0; 0.0053]\n"
+ ]
+ }
+ ],
"source": [
- "# 225s [0.000159805; 0.0163911; 0.0; 0.0046]\n",
+ "# 1.08e-02 100.00%┣████████████████▉┫ 60000/60000 [03:50/03:50, 260.67i/s]\n",
+ "# [0.000135032; 0.0196918; 0.0; 0.0053]\n",
"cnn = trainresults(\"cnn113.jld2\", lenet);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 32,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -590,7 +1135,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 33,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -603,13 +1148,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 34,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Comparison to MLP shows faster convergence, better generalization\n",
"plot([mlp[1,:], mlp[2,:], cnn[1,:], cnn[2,:]],ylim=(0.0,0.1),\n",
@@ -618,14 +1172,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 35,
"metadata": {
"scrolled": false,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"plot([mlp[3,:], mlp[4,:], cnn[3,:], cnn[4,:]],ylim=(0.0,0.03),\n",
" labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel=\"Epochs\",ylabel=\"Error\") "
@@ -644,13 +1207,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 36,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w = reshape([1.0, 2.0, 3.0], (3, 1, 1, 1)) = [1.0; 2.0; 3.0]\n",
+ "x = reshape([1.0:7.0...], (7, 1, 1, 1)) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]\n",
+ "y = conv4(w, x) = [10.0; 16.0; 22.0; 28.0; 34.0]\n"
+ ]
+ }
+ ],
"source": [
"# Convolution and matrix multiplication can be implemented in terms of each other.\n",
"# Convolutional networks have no additional representational power, only statistical efficiency.\n",
@@ -662,13 +1235,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 37,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5×7 Array{Float64,2}:\n",
+ " 3.0 2.0 1.0 0.0 0.0 0.0 0.0\n",
+ " 0.0 3.0 2.0 1.0 0.0 0.0 0.0\n",
+ " 0.0 0.0 3.0 2.0 1.0 0.0 0.0\n",
+ " 0.0 0.0 0.0 3.0 2.0 1.0 0.0\n",
+ " 0.0 0.0 0.0 0.0 3.0 2.0 1.0"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Convolution as matrix multiplication (1)\n",
"# Turn w into a (Y,X) sparse matrix\n",
@@ -677,26 +1266,48 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "y2 = w2 * mat(x) = [10.0; 16.0; 22.0; 28.0; 34.0]\n"
+ ]
+ }
+ ],
"source": [
"@show y2 = w2 * mat(x);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 39,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×5 Array{Float64,2}:\n",
+ " 1.0 2.0 3.0 4.0 5.0\n",
+ " 2.0 3.0 4.0 5.0 6.0\n",
+ " 3.0 4.0 5.0 6.0 7.0"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Convolution as matrix multiplication (2)\n",
"# Turn x into a (W,Y) dense matrix (aka the im2col operation)\n",
@@ -706,13 +1317,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "w3 = [3.0 2.0 1.0] = [3.0 2.0 1.0]\n",
+ "y3 = w3 * x3 = [10.0 16.0 22.0 28.0 34.0]\n"
+ ]
+ }
+ ],
"source": [
"@show w3 = [3.0 2.0 1.0]\n",
"@show y3 = w3 * x3;"
@@ -720,13 +1340,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 41,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×3 Array{Float64,2}:\n",
+ " 1.0 3.0 5.0\n",
+ " 2.0 4.0 6.0"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Matrix multiplication as convolution\n",
"# This could be used to make a fully connected network accept variable sized inputs.\n",
@@ -735,39 +1368,86 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 42,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×1 Array{Float64,2}:\n",
+ " 1.0\n",
+ " 2.0\n",
+ " 3.0"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"x = reshape([1.0:3.0...], (3,1))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 43,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×1 Array{Float64,2}:\n",
+ " 22.0\n",
+ " 28.0"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"y = w * x"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 44,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×1×1×2 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0\n",
+ " 3.0\n",
+ " 5.0\n",
+ "\n",
+ "[:, :, 1, 2] =\n",
+ " 2.0\n",
+ " 4.0\n",
+ " 6.0"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Consider w with size (Y,X)\n",
"# Treat each of the Y rows of w as a convolution filter\n",
@@ -776,13 +1456,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 45,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3×1×1×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 1.0\n",
+ " 2.0\n",
+ " 3.0"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Reshape x for convolution\n",
"x2 = reshape(x, (3,1,1,1))"
@@ -790,13 +1485,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 46,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×1×2×1 Array{Float64,4}:\n",
+ "[:, :, 1, 1] =\n",
+ " 22.0\n",
+ "\n",
+ "[:, :, 2, 1] =\n",
+ " 28.0"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Use conv4 for matrix multiplication\n",
"y2 = conv4(w2, x2; mode=1)"
diff --git a/tutorial/60.rnn.ipynb b/tutorial/60.rnn.ipynb
index 3bd86cf82..5cd67d1c9 100644
--- a/tutorial/60.rnn.ipynb
+++ b/tutorial/60.rnn.ipynb
@@ -9,27 +9,24 @@
},
"source": [
"# Introduction to Recurrent Neural Networks\n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "source": [
+ "(c) Deniz Yuret, 2019\n",
"* Objectives: learn about RNNs, the RNN layer, compare with MLP on a tagging task.\n",
"* Prerequisites: [MLP models](40.mlp.ipynb)\n",
"* New functions: \n",
- "[RNN](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.RNN),\n",
- "[adam](http://denizyuret.github.io/Knet.jl/latest/reference.html#Knet.adam)"
+ "[RNN](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.RNN),\n",
+ "[adam](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.adam)\n",
+ "\n",
+ "![image](https://github.com/denizyuret/Knet.jl/blob/master/docs/src/images/RNN-unrolled.png?raw=true)([image\n",
+ "source](http://colah.github.io/posts/2015-08-Understanding-LSTMs))\n",
+ "\n",
+ "In this notebook we will see how to implement a recurrent neural network (RNN) in Knet. In RNNs, connections between units form a directed cycle, which allows them to keep a persistent state over time. This gives them the ability to process sequences of arbitrary length one element at a time, while keeping track of what happened at previous elements. One can view the current state of the RNN as a representation for the sequence processed so far.\n",
+ "\n",
+ "We will build a part-of-speech tagger using a large annotated corpus of English. We will represent words with numeric vectors appropriate as inputs to a neural network. These word vectors will be initialized randomly and learned during training just like other model parameters. We will compare three network architectures: (1) an MLP which tags each word independently of its neighbors, (2) a simple RNN that can represent the neighboring words to the left, (3) a bidirectional RNN that can represent both left and right contexts. As can be expected, performance improves in the order (1) < (2) < (3). More surprisingly, the three models are very similar to each other: we will see their model diagrams are identical except for the horizontal connections that carry information across the sequence."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -59,13 +56,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The Brown Corpus has 57340 sentences, 1161192 tokens, with a word vocabulary of 56057 and a tag vocabulary of 472.\n"
+ ]
+ }
+ ],
"source": [
"include(Knet.dir(\"data/nltk.jl\"))\n",
"(data,words,tags) = brown()\n",
@@ -85,13 +90,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "57340-element Array{Tuple{Array{UInt16,1},Array{UInt16,1}},1}\n",
+ "56057-element Array{String,1}\n",
+ "472-element Array{String,1}\n"
+ ]
+ }
+ ],
"source": [
"println.(summary.((data,words,tags)));"
]
@@ -109,13 +124,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2×25 Array{Int64,2}:\n",
+ " 15 5860 1296 5597 17468 60 … 14 9 85 10004 221 189 3\n",
+ " 3 40 21 39 21 13 29 14 46 7 13 1 5"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "2×25 Array{String,2}:\n",
+ " \"The\" \"Fulton\" \"County\" \"Grand\" \"Jury\" … \"took\" \"place\" \".\"\n",
+ " \"at\" \"np-tl\" \"nn-tl\" \"jj-tl\" \"nn-tl\" \"vbd\" \"nn\" \".\""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"(w,t) = first(data)\n",
"display(permutedims(Int[w t]))\n",
@@ -135,7 +173,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -165,7 +203,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -193,7 +231,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -219,14 +257,52 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×7 Array{String,2}:\n",
+ " \"Rapping\" \"the\" \"stick\" \"against\" \"the\" \"desk\" \".\""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1×7 Array{Int64,2}:\n",
+ " 47900 1 3014 163 1 1719 3"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "8×7 Knet.KnetArray{Float32,2}:\n",
+ " -5.76045e-5 0.00222079 … 0.000327385 0.00192072 \n",
+ " -0.00522177 -0.000249661 -0.000252162 0.000350792\n",
+ " 0.00511935 -0.000636698 0.00414317 -0.000444564\n",
+ " -0.00366786 0.000273602 0.00106341 -0.0026508 \n",
+ " 0.00499445 0.00397028 0.00135022 0.00303491 \n",
+ " -0.00151867 0.00453946 … 0.000968743 -0.000920549\n",
+ " 0.00123059 0.0011105 -5.77072e-5 0.00345715 \n",
+ " -0.00209608 0.00351814 0.00290676 7.46992e-5 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"embedlayer = Embed(length(words),8)\n",
"(w,t) = data[52855]\n",
@@ -248,14 +324,283 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "\\begin{verbatim}\n",
+ "rnn = RNN(inputSize, hiddenSize; opts...)\n",
+ "rnn(x; batchSizes) => y\n",
+ "rnn.h, rnn.c # hidden and cell states\n",
+ "\\end{verbatim}\n",
+ "\\texttt{RNN} returns a callable RNN object \\texttt{rnn}. Given a minibatch of sequences \\texttt{x}, \\texttt{rnn(x)} returns \\texttt{y}, the hidden states of the final layer for each time step. \\texttt{rnn.h} and \\texttt{rnn.c} fields can be used to set the initial hidden states and read the final hidden states of all layers. Note that the final time step of \\texttt{y} always contains the final hidden state of the last layer, equivalent to \\texttt{rnn.h} for a single layer network.\n",
+ "\n",
+ "\\textbf{Dimensions:} The input \\texttt{x} can be 1, 2, or 3 dimensional and \\texttt{y} will have the same number of dimensions as \\texttt{x}. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D \\texttt{x} represents a single instance for a single time step, a 2-D \\texttt{x} represents a single minibatch for a single time step, and a 3-D \\texttt{x} represents a sequence of identically sized minibatches for multiple time steps. The output \\texttt{y} gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields \\texttt{rnn.h} and \\texttt{rnn.c} represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.\n",
+ "\n",
+ "\\textbf{batchSizes:} If \\texttt{batchSizes=nothing} (default), all sequences in a minibatch are assumed to be the same length. If \\texttt{batchSizes} is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case \\texttt{x} will typically be 2-D with the second dimension representing variable size batches for time steps. If \\texttt{batchSizes} is used, \\texttt{sum(batchSizes)} should equal \\texttt{length(x) ÷ size(x,1)}. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.\n",
+ "\n",
+ "\\textbf{Hidden states:} The hidden and cell states are kept in \\texttt{rnn.h} and \\texttt{rnn.c} fields (the cell state is only used by LSTM). They can be initialized during construction using the \\texttt{h} and \\texttt{c} keyword arguments, or modified later by direct assignment. Valid values are \\texttt{nothing} (default), \\texttt{0}, or an array of the right type and size possibly wrapped in a \\texttt{Param}. If the value is \\texttt{nothing} the initial state is assumed to be zero and the final state is discarded keeping the value \\texttt{nothing}. If the value is \\texttt{0} the initial state is assumed to be zero and \\texttt{0} is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.\n",
+ "\n",
+ "In a differentiation context the returned final hidden states will be wrapped in \\texttt{Result} types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. \\texttt{rnn.h = value(rnn.h)} to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the \\href{https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb}{CharLM Tutorial} for an example.\n",
+ "\n",
+ "\\textbf{Keyword arguments for RNN:}\n",
+ "\n",
+ "\\begin{itemize}\n",
+ "\\item \\texttt{h=nothing}: Initial hidden state.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{c=nothing}: Initial cell state.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{rnnType=:lstm} Type of RNN: One of :relu, :tanh, :lstm, :gru.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{numLayers=1}: Number of RNN layers.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{bidirectional=false}: Create a bidirectional RNN if \\texttt{true}.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{dropout=0}: Dropout probability. Applied to input and between layers.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{skipInput=false}: Do not multiply the input with a matrix if \\texttt{true}.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{dataType=Float32}: Data type to use for weights.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{algo=0}: Algorithm to use, see CUDNN docs for details.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{seed=0}: Random number seed for dropout. Uses \\texttt{time()} if 0.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{winit=xavier}: Weight initialization method for matrices.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{binit=zeros}: Weight initialization method for bias vectors.\n",
+ "\n",
+ "\n",
+ "\\item \\texttt{usegpu=(gpu()>=0)}: GPU used by default if one exists.\n",
+ "\n",
+ "\\end{itemize}\n",
+ "\\textbf{Formulas:} RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:\n",
+ "\n",
+ "\\texttt{:relu} and \\texttt{:tanh}: Single gate RNN with activation function f:\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)\n",
+ "\\end{verbatim}\n",
+ "\\texttt{:gru}: Gated recurrent unit:\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\n",
+ "r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate\n",
+ "n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate\n",
+ "h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]\n",
+ "\\end{verbatim}\n",
+ "\\texttt{:lstm}: Long short term memory unit with no peephole connections:\n",
+ "\n",
+ "\\begin{verbatim}\n",
+ "i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\n",
+ "f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate\n",
+ "o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate\n",
+ "n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate\n",
+ "c[t] = f[t] .* c[t-1] .+ i[t] .* n[t] # cell output\n",
+ "h[t] = o[t] .* tanh(c[t])\n",
+ "\\end{verbatim}\n"
+ ],
+ "text/markdown": [
+ "```\n",
+ "rnn = RNN(inputSize, hiddenSize; opts...)\n",
+ "rnn(x; batchSizes) => y\n",
+ "rnn.h, rnn.c # hidden and cell states\n",
+ "```\n",
+ "\n",
+ "`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers. Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.\n",
+ "\n",
+ "**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.\n",
+ "\n",
+ "**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.\n",
+ "\n",
+ "**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.\n",
+ "\n",
+ "In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.\n",
+ "\n",
+ "**Keyword arguments for RNN:**\n",
+ "\n",
+ " * `h=nothing`: Initial hidden state.\n",
+ " * `c=nothing`: Initial cell state.\n",
+ " * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.\n",
+ " * `numLayers=1`: Number of RNN layers.\n",
+ " * `bidirectional=false`: Create a bidirectional RNN if `true`.\n",
+ " * `dropout=0`: Dropout probability. Applied to input and between layers.\n",
+ " * `skipInput=false`: Do not multiply the input with a matrix if `true`.\n",
+ " * `dataType=Float32`: Data type to use for weights.\n",
+ " * `algo=0`: Algorithm to use, see CUDNN docs for details.\n",
+ " * `seed=0`: Random number seed for dropout. Uses `time()` if 0.\n",
+ " * `winit=xavier`: Weight initialization method for matrices.\n",
+ " * `binit=zeros`: Weight initialization method for bias vectors.\n",
+ " * `usegpu=(gpu()>=0)`: GPU used by default if one exists.\n",
+ "\n",
+ "**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:\n",
+ "\n",
+ "`:relu` and `:tanh`: Single gate RNN with activation function f:\n",
+ "\n",
+ "```\n",
+ "h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)\n",
+ "```\n",
+ "\n",
+ "`:gru`: Gated recurrent unit:\n",
+ "\n",
+ "```\n",
+ "i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\n",
+ "r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate\n",
+ "n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate\n",
+ "h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]\n",
+ "```\n",
+ "\n",
+ "`:lstm`: Long short term memory unit with no peephole connections:\n",
+ "\n",
+ "```\n",
+ "i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\n",
+ "f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate\n",
+ "o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate\n",
+ "n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate\n",
+ "c[t] = f[t] .* c[t-1] .+ i[t] .* n[t] # cell output\n",
+ "h[t] = o[t] .* tanh(c[t])\n",
+ "```\n"
+ ],
+ "text/plain": [
+ "\u001b[36m rnn = RNN(inputSize, hiddenSize; opts...)\u001b[39m\n",
+ "\u001b[36m rnn(x; batchSizes) => y\u001b[39m\n",
+ "\u001b[36m rnn.h, rnn.c # hidden and cell states\u001b[39m\n",
+ "\n",
+ " \u001b[36mRNN\u001b[39m returns a callable RNN object \u001b[36mrnn\u001b[39m. Given a minibatch of\n",
+ " sequences \u001b[36mx\u001b[39m, \u001b[36mrnn(x)\u001b[39m returns \u001b[36my\u001b[39m, the hidden states of the final layer\n",
+ " for each time step. \u001b[36mrnn.h\u001b[39m and \u001b[36mrnn.c\u001b[39m fields can be used to set the\n",
+ " initial hidden states and read the final hidden states of all\n",
+ " layers. Note that the final time step of \u001b[36my\u001b[39m always contains the final\n",
+ " hidden state of the last layer, equivalent to \u001b[36mrnn.h\u001b[39m for a single\n",
+ " layer network.\n",
+ "\n",
+ " \u001b[1mDimensions:\u001b[22m The input \u001b[36mx\u001b[39m can be 1, 2, or 3 dimensional and \u001b[36my\u001b[39m will\n",
+ " have the same number of dimensions as \u001b[36mx\u001b[39m. size(x)=(X,[B,T]) and\n",
+ " size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is\n",
+ " seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default\n",
+ " a 1-D \u001b[36mx\u001b[39m represents a single instance for a single time step, a 2-D \u001b[36mx\u001b[39m\n",
+ " represents a single minibatch for a single time step, and a 3-D \u001b[36mx\u001b[39m\n",
+ " represents a sequence of identically sized minibatches for multiple\n",
+ " time steps. The output \u001b[36my\u001b[39m gives the hidden state (of the final layer\n",
+ " for multi-layer RNNs) for each time step. The fields \u001b[36mrnn.h\u001b[39m and \u001b[36mrnn.c\u001b[39m\n",
+ " represent the hidden states of all layers in a single time step and\n",
+ " have size (H,B,L/2L) where L is numLayers and 2L is for\n",
+ " bidirectional RNNs.\n",
+ "\n",
+ " \u001b[1mbatchSizes:\u001b[22m If \u001b[36mbatchSizes=nothing\u001b[39m (default), all sequences in a\n",
+ " minibatch are assumed to be the same length. If \u001b[36mbatchSizes\u001b[39m is an\n",
+ " array of (non-increasing) integers, it gives us the batch size for\n",
+ " each time step (allowing different sequences in the minibatch to\n",
+ " have different lengths). In this case \u001b[36mx\u001b[39m will typically be 2-D with\n",
+ " the second dimension representing variable size batches for time\n",
+ " steps. If \u001b[36mbatchSizes\u001b[39m is used, \u001b[36msum(batchSizes)\u001b[39m should equal \u001b[36mlength(x)\n",
+ " ÷ size(x,1)\u001b[39m. When the batch size is different in every time step,\n",
+ " hidden states will have size (H,B,L/2L) where B is always the size\n",
+ " of the first (largest) minibatch.\n",
+ "\n",
+ " \u001b[1mHidden states:\u001b[22m The hidden and cell states are kept in \u001b[36mrnn.h\u001b[39m and\n",
+ " \u001b[36mrnn.c\u001b[39m fields (the cell state is only used by LSTM). They can be\n",
+ " initialized during construction using the \u001b[36mh\u001b[39m and \u001b[36mc\u001b[39m keyword arguments,\n",
+ " or modified later by direct assignment. Valid values are \u001b[36mnothing\u001b[39m\n",
+ " (default), \u001b[36m0\u001b[39m, or an array of the right type and size possibly\n",
+ " wrapped in a \u001b[36mParam\u001b[39m. If the value is \u001b[36mnothing\u001b[39m the initial state is\n",
+ " assumed to be zero and the final state is discarded keeping the\n",
+ " value \u001b[36mnothing\u001b[39m. If the value is \u001b[36m0\u001b[39m the initial state is assumed to be\n",
+ " zero and \u001b[36m0\u001b[39m is replaced by the final state on return. If the value is\n",
+ " a valid state, it is used as the initial state and is replaced by\n",
+ " the final state on return.\n",
+ "\n",
+ " In a differentiation context the returned final hidden states will\n",
+ " be wrapped in \u001b[36mResult\u001b[39m types. This is necessary if the same RNN object\n",
+ " is to be called multiple times in a single iteration. Between\n",
+ " iterations (i.e. after diff/update) the hidden states need to be\n",
+ " unboxed with e.g. \u001b[36mrnn.h = value(rnn.h)\u001b[39m to prevent spurious\n",
+ " dependencies. This happens automatically during the backward pass\n",
+ " for GPU RNNs but needs to be done manually for CPU RNNs. See the\n",
+ " CharLM Tutorial\n",
+ " (https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb)\n",
+ " for an example.\n",
+ "\n",
+ " \u001b[1mKeyword arguments for RNN:\u001b[22m\n",
+ "\n",
+ " • \u001b[36mh=nothing\u001b[39m: Initial hidden state.\n",
+ "\n",
+ " • \u001b[36mc=nothing\u001b[39m: Initial cell state.\n",
+ "\n",
+ " • \u001b[36mrnnType=:lstm\u001b[39m Type of RNN: One of :relu, :tanh, :lstm,\n",
+ " :gru.\n",
+ "\n",
+ " • \u001b[36mnumLayers=1\u001b[39m: Number of RNN layers.\n",
+ "\n",
+ " • \u001b[36mbidirectional=false\u001b[39m: Create a bidirectional RNN if \u001b[36mtrue\u001b[39m.\n",
+ "\n",
+ " • \u001b[36mdropout=0\u001b[39m: Dropout probability. Applied to input and\n",
+ " between layers.\n",
+ "\n",
+ " • \u001b[36mskipInput=false\u001b[39m: Do not multiply the input with a matrix\n",
+ " if \u001b[36mtrue\u001b[39m.\n",
+ "\n",
+ " • \u001b[36mdataType=Float32\u001b[39m: Data type to use for weights.\n",
+ "\n",
+ " • \u001b[36malgo=0\u001b[39m: Algorithm to use, see CUDNN docs for details.\n",
+ "\n",
+ " • \u001b[36mseed=0\u001b[39m: Random number seed for dropout. Uses \u001b[36mtime()\u001b[39m if 0.\n",
+ "\n",
+ " • \u001b[36mwinit=xavier\u001b[39m: Weight initialization method for matrices.\n",
+ "\n",
+ " • \u001b[36mbinit=zeros\u001b[39m: Weight initialization method for bias\n",
+ " vectors.\n",
+ "\n",
+ " • \u001b[36musegpu=(gpu()>=0)\u001b[39m: GPU used by default if one exists.\n",
+ "\n",
+ " \u001b[1mFormulas:\u001b[22m RNNs compute the output h[t] for a given iteration from\n",
+ " the recurrent input h[t-1] and the previous layer input x[t] given\n",
+ " matrices W, R and biases bW, bR from the following equations:\n",
+ "\n",
+ " \u001b[36m:relu\u001b[39m and \u001b[36m:tanh\u001b[39m: Single gate RNN with activation function f:\n",
+ "\n",
+ "\u001b[36m h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)\u001b[39m\n",
+ "\n",
+ " \u001b[36m:gru\u001b[39m: Gated recurrent unit:\n",
+ "\n",
+ "\u001b[36m i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\u001b[39m\n",
+ "\u001b[36m r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate\u001b[39m\n",
+ "\u001b[36m n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate\u001b[39m\n",
+ "\u001b[36m h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]\u001b[39m\n",
+ "\n",
+ " \u001b[36m:lstm\u001b[39m: Long short term memory unit with no peephole connections:\n",
+ "\n",
+ "\u001b[36m i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate\u001b[39m\n",
+ "\u001b[36m f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate\u001b[39m\n",
+ "\u001b[36m o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate\u001b[39m\n",
+ "\u001b[36m n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate\u001b[39m\n",
+ "\u001b[36m c[t] = f[t] .* c[t-1] .+ i[t] .* n[t] # cell output\u001b[39m\n",
+ "\u001b[36m h[t] = o[t] .* tanh(c[t])\u001b[39m"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"@doc RNN"
]
@@ -312,7 +657,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -342,7 +687,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -356,7 +701,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -391,13 +736,50 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "64×32 Array{String,2}:\n",
+ " \"The\" \"Fulton\" … \"term-end\" \"presentments\" \n",
+ " \"director\" \"of\" \"gifts\" \"from\" \n",
+ " \"local\" \"club\" \"who\" \"will\" \n",
+ " \".\" \"The\" \"of\" \"the\" \n",
+ " \"plans\" \"titled\" \"this\" \"fall\" \n",
+ " \"admitted\" \"for\" … \"''\" \".\" \n",
+ " \"American\" \"system\" \"patriotic\" \"apprehensions\"\n",
+ " \"of\" \"the\" \"Then\" \"cometh\" \n",
+ " \"the\" \"misdeeds\" \"is\" \"entitled\" \n",
+ " \"entertaining\" \"and\" \"from\" \"these\" \n",
+ " \"is\" \"merely\" … \"to\" \"the\" \n",
+ " \"is\" \"not\" \"that\" \"any\" \n",
+ " \"has\" \"said\" \"important\" \"that\" \n",
+ " ⋮ ⋱ ⋮ \n",
+ " \".\" \"But\" \"would\" \"I\" \n",
+ " \"not\" \"worth\" \"Constable's\" \"explanation\" \n",
+ " \",\" \"I\" \"carries\" \",\" \n",
+ " \"not\" \"hurt\" … \"happy\" \"to\" \n",
+ " \"you\" \"do\" \"away\" \".\" \n",
+ " \"west\" \",\" \"Water\" \"splashed\" \n",
+ " \"caught\" \"sight\" \"rapidly\" \".\" \n",
+ " \"city\" \".\" \".\" \"It\" \n",
+ " \"her\" \"life\" … \"him\" \"lead\" \n",
+ " \"dog\" \".\" \"that's\" \"the\" \n",
+ " \"what's-his-name\" \"got\" \"spent\" \"the\" \n",
+ " \"solid\" \",\" \"dislike\" \"them\" "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"(x,y) = first(d)\n",
"words[x]"
@@ -417,13 +799,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"128×64×32 Knet.KnetArray{Float32,3}\""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"embedlayer = Embed(length(words),128)\n",
"summary(embedlayer(x))"
@@ -442,13 +835,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(556, 10)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# shuffle and split minibatches into train and test portions\n",
"shuffle!(d)\n",
@@ -459,22 +863,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "trainresults (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# For running experiments we will use the Adam algorithm which typically converges faster than SGD.\n",
- "function trainresults(file,model)\n",
+ "function trainresults(file,model,savemodel)\n",
" if (print(\"Train from scratch? \"); readline()[1]=='y')\n",
" takeevery(n,itr) = (x for (i,x) in enumerate(itr) if i % n == 1)\n",
" results = ((nll(model,dtst), zeroone(model,dtst))\n",
" for x in takeevery(100, progress(adam(model,repeat(dtrn,5)))))\n",
" results = reshape(collect(Float32,flatten(results)),(2,:))\n",
- " Knet.save(file,\"model\",model,\"results\",results)\n",
+ " Knet.save(file,\"model\",(savemodel ? model : nothing),\"results\",results)\n",
" Knet.gc() # To save gpu memory\n",
" else\n",
" isfile(file) || download(\"http://people.csail.mit.edu/deniz/models/tutorial/$file\",file)\n",
@@ -487,7 +902,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -503,52 +918,82 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "2.06e-01 100.00%┣████████████████████┫ 2780/2780 [00:32/00:32, 88.10i/s]\n",
+ "Float32[0.322172; 0.0997559]\n"
+ ]
+ }
+ ],
"source": [
"# 2.35e-01 100.00%┣┫ 2780/2780 [00:13/00:13, 216.36i/s] [0.295007; 0.0972656]\n",
"t0 = Tagger0(VOCABSIZE,EMBEDSIZE,HIDDENSIZE,OUTPUTSIZE)\n",
- "(t0,r0) = trainresults(\"tagger113a.jld2\",t0);"
+ "(t0,r0) = trainresults(\"tagger113a.jld2\",t0,false);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "9.81e-02 100.00%┣████████████████████┫ 2780/2780 [00:32/00:32, 86.17i/s]\n",
+ "Float32[0.246886; 0.0693848]\n"
+ ]
+ }
+ ],
"source": [
"# 1.49e-01 100.00%┣┫ 2780/2780 [00:19/00:19, 142.58i/s] [0.21358; 0.0616211]\n",
"t1 = Tagger1(VOCABSIZE,EMBEDSIZE,HIDDENSIZE,OUTPUTSIZE)\n",
- "(t1,r1) = trainresults(\"tagger113b.jld2\",t1);"
+ "(t1,r1) = trainresults(\"tagger113b.jld2\",t1,false);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "4.25e-02 100.00%┣████████████████████┫ 2780/2780 [00:34/00:34, 81.05i/s]\n",
+ "Float32[0.191677; 0.0494141]\n"
+ ]
+ }
+ ],
"source": [
"# 9.37e-02 100.00%┣┫ 2780/2780 [00:25/00:25, 109.77i/s] [0.156669; 0.044043]\n",
"t2 = Tagger2(VOCABSIZE,EMBEDSIZE,HIDDENSIZE,OUTPUTSIZE)\n",
- "(t2,r2) = trainresults(\"tagger113c.jld2\",t2);"
+ "(t2,r2) = trainresults(\"tagger113c.jld2\",t2,true);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -561,13 +1006,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"plot([r0[2,:], r1[2,:], r2[2,:]]; xlabel=\"x100 updates\", ylabel=\"error\",\n",
" ylim=(0,0.15), yticks=0:0.01:0.15, labels=[\"MLP\",\"RNN\",\"biRNN\"])"
@@ -575,13 +1029,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"plot([r0[1,:], r1[1,:], r2[1,:]]; xlabel=\"x100 updates\", ylabel=\"loss\",\n",
" ylim=(0,.5), yticks=0:0.1:.5, labels=[\"MLP\",\"RNN\",\"biRNN\"])"
@@ -601,13 +1064,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tag (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"wdict=Dict{String,UInt16}(); for (i,w) in enumerate(words); wdict[w]=i; end\n",
"unk = UInt16(length(words))\n",
@@ -621,13 +1095,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "stdin> colorless green ideas sleep furiously\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "2×5 Array{AbstractString,2}:\n",
+ " \"colorless\" \"green\" \"ideas\" \"sleep\" \"furiously\"\n",
+ " \"jj\" \"jj\" \"nns\" \"vb\" \"rb\" "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"tag(t2,readline())"
]
diff --git a/tutorial/70.imdb.ipynb b/tutorial/70.imdb.ipynb
index bf115d4b0..616f9e542 100644
--- a/tutorial/70.imdb.ipynb
+++ b/tutorial/70.imdb.ipynb
@@ -9,20 +9,14 @@
},
"source": [
"# Sequence classification model for IMDB Sentiment Analysis\n",
- "(c) Deniz Yuret, 2019"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
+ "(c) Deniz Yuret, 2019\n",
"* Objectives: Learn the structure of the IMDB dataset and train a simple RNN model.\n",
"* Prerequisites: [RNN models](60.rnn.ipynb)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -35,13 +29,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0e-8"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Set constants for the model and training\n",
"EPOCHS=3 # Number of training epochs\n",
@@ -51,7 +56,7 @@
"MAXLEN=150 # maximum size of the word sequence, pad shorter sequences, truncate longer ones\n",
"VOCABSIZE=30000 # maximum vocabulary size, keep the most frequent 30K, map the rest to UNK token\n",
"NUMCLASS=2 # number of output classes\n",
- "DROPOUT=0.0 # Dropout rate\n",
+ "DROPOUT=0.5 # Dropout rate\n",
"LR=0.001 # Learning rate\n",
"BETA_1=0.9 # Adam optimization parameter\n",
"BETA_2=0.999 # Adam optimization parameter\n",
@@ -67,65 +72,222 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "imdb"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"include(Knet.dir(\"data\",\"imdb.jl\")) # defines imdb loader"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/latex": [
+ "\\begin{verbatim}\n",
+ "imdb()\n",
+ "\\end{verbatim}\n",
+ "Load the IMDB Movie reviews sentiment classification dataset from https://keras.io/datasets and return (xtrn,ytrn,xtst,ytst,dict) tuple.\n",
+ "\n",
+ "\\section{Keyword Arguments:}\n",
+ "\\begin{itemize}\n",
+ "\\item url=https://s3.amazonaws.com/text-datasets: where to download the data (imdb.npz) from.\n",
+ "\n",
+ "\n",
+ "\\item dir=Pkg.dir(\"Knet/data\"): where to cache the data.\n",
+ "\n",
+ "\n",
+ "\\item maxval=nothing: max number of token values to include. Words are ranked by how often they occur (in the training set) and only the most frequent words are kept. nothing means keep all, equivalent to maxval = vocabSize + pad + stoken.\n",
+ "\n",
+ "\n",
+ "\\item maxlen=nothing: truncate sequences after this length. nothing means do not truncate.\n",
+ "\n",
+ "\n",
+ "\\item seed=0: random seed for sample shuffling. Use system seed if 0.\n",
+ "\n",
+ "\n",
+ "\\item pad=true: whether to pad short sequences (padding is done at the beginning of sequences). pad\\_token = maxval.\n",
+ "\n",
+ "\n",
+ "\\item stoken=true: whether to add a start token to the beginning of each sequence. start\\_token = maxval - pad.\n",
+ "\n",
+ "\n",
+ "\\item oov=true: whether to replace words >= oov\\emph{token with oov}token (the alternative is to skip them). oov\\_token = maxval - pad - stoken.\n",
+ "\n",
+ "\\end{itemize}\n"
+ ],
+ "text/markdown": [
+ "```\n",
+ "imdb()\n",
+ "```\n",
+ "\n",
+ "Load the IMDB Movie reviews sentiment classification dataset from https://keras.io/datasets and return (xtrn,ytrn,xtst,ytst,dict) tuple.\n",
+ "\n",
+ "# Keyword Arguments:\n",
+ "\n",
+ " * url=https://s3.amazonaws.com/text-datasets: where to download the data (imdb.npz) from.\n",
+ " * dir=Pkg.dir(\"Knet/data\"): where to cache the data.\n",
+ " * maxval=nothing: max number of token values to include. Words are ranked by how often they occur (in the training set) and only the most frequent words are kept. nothing means keep all, equivalent to maxval = vocabSize + pad + stoken.\n",
+ " * maxlen=nothing: truncate sequences after this length. nothing means do not truncate.\n",
+ " * seed=0: random seed for sample shuffling. Use system seed if 0.\n",
+ " * pad=true: whether to pad short sequences (padding is done at the beginning of sequences). pad_token = maxval.\n",
+ " * stoken=true: whether to add a start token to the beginning of each sequence. start_token = maxval - pad.\n",
+ " * oov=true: whether to replace words >= oov*token with oov*token (the alternative is to skip them). oov_token = maxval - pad - stoken.\n"
+ ],
+ "text/plain": [
+ "\u001b[36m imdb()\u001b[39m\n",
+ "\n",
+ " Load the IMDB Movie reviews sentiment classification dataset from\n",
+ " https://keras.io/datasets and return (xtrn,ytrn,xtst,ytst,dict)\n",
+ " tuple.\n",
+ "\n",
+ "\u001b[1m Keyword Arguments:\u001b[22m\n",
+ "\u001b[1m ≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡\u001b[22m\n",
+ "\n",
+ " • url=https://s3.amazonaws.com/text-datasets: where to\n",
+ " download the data (imdb.npz) from.\n",
+ "\n",
+ " • dir=Pkg.dir(\"Knet/data\"): where to cache the data.\n",
+ "\n",
+ " • maxval=nothing: max number of token values to include.\n",
+ " Words are ranked by how often they occur (in the training\n",
+ " set) and only the most frequent words are kept. nothing\n",
+ " means keep all, equivalent to maxval = vocabSize + pad +\n",
+ " stoken.\n",
+ "\n",
+ " • maxlen=nothing: truncate sequences after this length.\n",
+ " nothing means do not truncate.\n",
+ "\n",
+ " • seed=0: random seed for sample shuffling. Use system seed\n",
+ " if 0.\n",
+ "\n",
+ " • pad=true: whether to pad short sequences (padding is done\n",
+ " at the beginning of sequences). pad_token = maxval.\n",
+ "\n",
+ " • stoken=true: whether to add a start token to the beginning\n",
+ " of each sequence. start_token = maxval - pad.\n",
+ "\n",
+ " • oov=true: whether to replace words >= oov\u001b[4mtoken with\n",
+ " oov\u001b[24mtoken (the alternative is to skip them). oov_token =\n",
+ " maxval - pad - stoken."
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"@doc imdb"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Loading IMDB...\n",
+ "└ @ Main /home/deniz/.julia/dev/Knet/data/imdb.jl:57\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " 6.811008 seconds (29.27 M allocations: 1.493 GiB, 7.70% gc time)\n"
+ ]
+ }
+ ],
"source": [
"@time (xtrn,ytrn,xtst,ytst,imdbdict)=imdb(maxlen=MAXLEN,maxval=VOCABSIZE);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "25000-element Array{Array{Int32,1},1}\n",
+ "25000-element Array{Int8,1}\n",
+ "25000-element Array{Array{Int32,1},1}\n",
+ "25000-element Array{Int8,1}\n",
+ "Dict{String,Int32} with 88584 entries\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(nothing, nothing, nothing, nothing, nothing)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"println.(summary.((xtrn,ytrn,xtst,ytst,imdbdict)))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×150 LinearAlgebra.Adjoint{Int32,Array{Int32,1}}:\n",
+ " 30000 30000 30000 30000 30000 … 1908 92 11 6 1 17 15 22"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Words are encoded with integers\n",
"rand(xtrn)'"
@@ -133,13 +295,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×25000 LinearAlgebra.Adjoint{Int64,Array{Int64,1}}:\n",
+ " 150 150 150 150 150 150 150 … 150 150 150 150 150 150"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Each word sequence is padded or truncated to length 150\n",
"length.(xtrn)'"
@@ -147,13 +321,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "reviewstring (generic function with 2 methods)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Define a function that can print the actual words:\n",
"imdbvocab = Array{String}(undef,length(imdbdict))\n",
@@ -167,11 +352,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"scrolled": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Positive review:\n",
+ "who definitely needed a hug these evil people capture the yokai and throw them into a red pit along with unwanted objects like and other mechanical things and these meld into one horribly violent robotic monsters whose only job is to kill takashi a young boy is the one to become their saviour alongside a red man dragon a turtle man and a river princess as well as a cute little creature that if it had been america they could have turned it into a cuddly toy and sold it at all good toy stores the lines are good especially the don't try this at home kids and other gems that bring a smile to your lips suspend belief and watch this with a child or on your own and enjoy though i must admit that the end was a wee bit sad and not necessarily so cheers \n"
+ ]
+ }
+ ],
"source": [
"# Hit Ctrl-Enter to see random reviews:\n",
"r = rand(1:length(xtrn))\n",
@@ -180,13 +374,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1×25000 LinearAlgebra.Adjoint{Int8,Array{Int8,1}}:\n",
+ " 1 2 2 1 1 1 1 1 2 2 1 … 1 2 2 1 1 2 1 1 1 2 2"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Here are the labels: 1=negative, 2=positive\n",
"ytrn'"
@@ -205,7 +411,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -214,9 +420,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SequenceClassifier"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"SequenceClassifier(input::Int, embed::Int, hidden::Int, output::Int; pdrop=0) =\n",
" SequenceClassifier(param(embed,input), RNN(embed,hidden,rnnType=:gru), param(output,hidden), pdrop)"
@@ -224,7 +441,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -248,9 +465,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(390, 390)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"dtrn = minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)\n",
"dtst = minibatch(xtst,ytst,BATCHSIZE)\n",
@@ -259,14 +487,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "trainresults (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# For running experiments\n",
"function trainresults(file,model; o...)\n",
" if (print(\"Train from scratch? \"); readline()[1]=='y')\n",
- " progress!(adam(model,dtrn;lr=LR,beta1=BETA_1,beta2=BETA_2,eps=EPS))\n",
+ " progress!(adam(model,repeat(dtrn,EPOCHS);lr=LR,beta1=BETA_1,beta2=BETA_2,eps=EPS))\n",
" Knet.save(file,\"model\",model)\n",
" Knet.gc() # To save gpu memory\n",
" else\n",
@@ -279,9 +518,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(0.69312066f0, 0.69312423f0, 0.5135817307692307, 0.5096153846153846)"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"model = SequenceClassifier(VOCABSIZE,EMBEDSIZE,NUMHIDDEN,NUMCLASS,pdrop=DROPOUT)\n",
"nll(model,dtrn), nll(model,dtst), accuracy(model,dtrn), accuracy(model,dtst)"
@@ -289,20 +539,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> y\n",
+ "1.53e-01 100.00%┣████████████████████┫ 1170/1170 [00:18/00:18, 64.14i/s]\n"
+ ]
+ }
+ ],
"source": [
+ "# 2.51e-01 100.00%┣████████████████████┫ 1170/1170 [00:16/00:16, 75.46i/s]\n",
"model = trainresults(\"imdbmodel113.jld2\",model);"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(0.05890469f0, 0.38913542f0, 0.9833733974358975, 0.8548477564102565)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 33s (0.059155148f0, 0.3877507f0, 0.9846153846153847, 0.8583733974358975)\n",
+ "# (0.059155148f0, 0.3877507f0, 0.9846153846153847, 0.8583733974358975)\n",
"nll(model,dtrn), nll(model,dtst), accuracy(model,dtrn), accuracy(model,dtst)"
]
},
@@ -315,9 +586,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "str2ids (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"predictstring(x)=\"\\nPrediction: \" * (\"Negative\",\"Positive\")[argmax(Array(vec(model([x]))))]\n",
"UNK = VOCABSIZE-2\n",
@@ -326,9 +608,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Negative review:\n",
+ " this is an emperor's new clothes situation someone needs to say that's not a funny and original etc etc film that is an inferior film don't waste your money on it the film is trashy and the people in it are embarrassingly inferior trailer trash they are all too realistically only themselves they have no lines they don't act the american dream is not to create shoddy no quality films or anything else shoddy and of no quality it is to achieve something of quality and thereby success only people who are desperate to praise any film not made in hollywood it can't have been made in hollywood can it would try to any kind of quality to this film it's worse than ed woods another film about a film maker without standards these films shouldn't have been made and you shouldn't go see american movie\n",
+ "\n",
+ "Prediction: Negative\n"
+ ]
+ }
+ ],
"source": [
"# Here we can see predictions for random reviews from the test set; hit Ctrl-Enter to sample:\n",
"r = rand(1:length(xtst))\n",
@@ -338,9 +631,19 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "stdin> this was not a great movie\n",
+ "\n",
+ "Prediction: Negative\n"
+ ]
+ }
+ ],
"source": [
"# Here the user can enter their own reviews and classify them:\n",
"println(predictstring(str2ids(readline(stdin))))"
diff --git a/tutorial/80.charlm.ipynb b/tutorial/80.charlm.ipynb
index 1733f5e7e..96e4e5203 100644
--- a/tutorial/80.charlm.ipynb
+++ b/tutorial/80.charlm.ipynb
@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -58,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -71,7 +71,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -87,9 +87,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CharLM (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# The h=0,c=0 options to RNN enable a persistent state between iterations\n",
"CharLM(vocab::Int,embed::Int,hidden::Int; o...) = \n",
@@ -105,7 +116,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -130,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -166,7 +177,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
@@ -185,13 +196,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(\"4934845-element Array{UInt8,1}\", \"526731-element Array{UInt8,1}\", \"84-element Array{Char,1}\")"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Load 'The Complete Works of William Shakespeare'\n",
"include(Knet.dir(\"data\",\"gutenberg.jl\"))\n",
@@ -201,13 +223,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r\n",
+ " Cheated of feature by dissembling nature,\r\n",
+ " Deform'd, unfinish'd, sent before my time\r\n",
+ " Into this breathing world scarce half made up,\r\n",
+ " And that so lamely and unfashionable\r\n",
+ " \n"
+ ]
+ }
+ ],
"source": [
"# Print a sample\n",
"println(string(shakechars[trn[1020:1210]]...))"
@@ -215,13 +250,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(192, 20)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Minibatch data\n",
"function mb(a)\n",
@@ -235,18 +281,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(\"256×100 Array{UInt8,2}\", \"256×100 Array{UInt8,2}\")"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"summary.(first(dtrn)) # each x and y have dimensions (BATCHSIZE,SEQLENGTH)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> n\n"
+ ]
+ }
+ ],
"source": [
"# 3.30e+00 ┣ / / / / / ┫ 122 [04:46, 2.35s/i]\n",
"Knet.gc()\n",
@@ -256,22 +321,66 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3.2993853f0"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"exp(shakemodel(dtst)) # Perplexity = 3.30"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Floudg, \n",
+ " Kent. My lord, of more, youth away, his gracious\n",
+ " forgot rules over a gentlewisold, how thou reads,\n",
+ " The business, Romeo, Hastings, the field\n",
+ " GENTLOW, MACBETH, with the TRIER\n",
+ "\n",
+ "\n",
+ "Flourish coults\n",
+ " DUKE, SILVIA, and, Monten\n",
+ "\n",
+ "Enter CLOWN and CLARENCE of OARSMA\n",
+ "\n",
+ " FLUELLEN. He is good, let your sooving themselves shin excelsions\n",
+ " banished you are not acquainting the now. Yet she comes\n",
+ " that doth make defeat of mine are that he makes an oath,\n",
+ " there is a connnivation dospish from these hands, upon the\n",
+ " coverworth as certain physice.\n",
+ " PISTOL. T' never soft as come dead.\n",
+ " Well, are the god sitting odds foo my business?'\n",
+ " The bloody king, this fast show rank and runk.\n",
+ " PERDIATA. Well, let's awly ladies.'\n",
+ " WIDOW. Take your hands, you are poison nor the valiant man\n",
+ " proph on matches sent out 'Hang.\n",
+ " CELIA. They are the new-my woman's character; I am both about the\n",
+ " friend of the duncatar; I, a drum in the searchine! \n",
+ " HO\n"
+ ]
+ }
+ ],
"source": [
"generate(shakemodel,shakechars,1000)"
]
@@ -285,7 +394,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -300,9 +409,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9168446"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Read julia base library source code\n",
"base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, \"julia\")\n",
@@ -319,9 +439,46 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3642×2 Array{Any,2}:\n",
+ " ' ' 1981523\n",
+ " 'e' 550283\n",
+ " 't' 479801\n",
+ " 'n' 344556\n",
+ " 'r' 339428\n",
+ " 'i' 330713\n",
+ " 's' 327190\n",
+ " 'a' 317875\n",
+ " 'o' 277083\n",
+ " '\\n' 266644\n",
+ " 'l' 204455\n",
+ " ',' 201044\n",
+ " ')' 194869\n",
+ " ⋮ \n",
+ " 'ה' 1\n",
+ " '🍢' 1\n",
+ " '𝗾' 1\n",
+ " '𝔔' 1\n",
+ " 'É' 1\n",
+ " '𝓟' 1\n",
+ " '𝚿' 1\n",
+ " '𝕨' 1\n",
+ " 'ɛ' 1\n",
+ " 'Χ' 1\n",
+ " '🕙' 1\n",
+ " 'ℚ' 1"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Find unique chars, sort by frequency, assign integer ids.\n",
"charcnt = Dict{Char,Int}()\n",
@@ -334,9 +491,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(9168446, 8644158, 524288)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Keep only VOCABSIZE most frequent chars, split into train and test\n",
"data = map(c->charid[c], collect(text))\n",
@@ -349,9 +517,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{}, Union{}) === Const(true) # any result is ok\n",
+ " @test subtype_tfunc(Union{}, Type{typeof(Union{})}) === Const(true) # any result is ok\n",
+ " @test subtype_tfunc(Union{}, Const(typeof(Union{}))) === Const(true) # any result is ok\n",
+ " @test subtype_tfunc(typeof(Union{}), Const(typeof(Union{}))) === Const(true) # Union{} <: typeof(Union{})\n",
+ " @test subtype_tfunc(typeof(Union{}), Const(Int)) === Const(true) # Union{} <: Int\n",
+ " @test subtype_tfunc(typeof(Union{}), Const(Union{})) === Const(true) # Union{} <: Union{}\n",
+ " @test subtype_tfunc(typeof(Union{}), Type{typeof(Union{})}) === Const(true) # Union{} <: Union{}\n",
+ " @test subtype_tfunc(typeof(Union{}), Type{typeof(Union{})}) === Const(true) # Union{} <: typeof(Union{})\n",
+ " @test subtype_tfunc(typeof(Union{}), Type{Union{}}) === Const(true) # Union{} <: Union{}\n",
+ " @test subtype_tfunc(Type{Union{}}, typeof(Union{})) === Const(true) # Union{} <: Union{}\n",
+ " @test subtype_tfunc(Type{Union{}}, Const(typeof(Union{}))) === Const(true) # Uni\n"
+ ]
+ }
+ ],
"source": [
"# Print a sample\n",
"r = rand(1:(length(trn)-1000))\n",
@@ -360,9 +546,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2110, 127)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"# Minibatch data\n",
"function mb(a)\n",
@@ -376,18 +573,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(\"64×64 Array{Int64,2}\", \"64×64 Array{Int64,2}\")"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"summary.(first(dtrn)) # each x and y have dimensions (BATCHSIZE,SEQLENGTH)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train from scratch? stdin> n\n"
+ ]
+ }
+ ],
"source": [
"# 3.25e+00 ┣ / / / / /┫ 126 [05:43, 2.72s/i]\n",
"juliamodel = CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE; rnnType=RNNTYPE, numLayers=NUMLAYERS)\n",
@@ -396,18 +612,78 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3.27486f0"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"exp(juliamodel(dtst)) # Perplexity = 3.27"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " # optional\n",
+ " _ = Expr(expMreadcos, Expr(:meta, :stderr), :n, :default, ex, context[o.e.ex, ex.args[1] -typeinfo + Int])\n",
+ "\n",
+ " isprint((v\"), GotoNode(e))\n",
+ "end\n",
+ "\n",
+ "for (fname, getfield) do t\n",
+ " print(io, \":\")\n",
+ " new()\n",
+ " end\n",
+ "end\n",
+ "\n",
+ "if option\n",
+ " quote\n",
+ " bounds end\n",
+ " end\n",
+ " @sprintf(\"Other prompt\", ex.field, UV_REQ) == pop!(bb_start_off+1, i)\n",
+ " write(io, take!(builder_path))\n",
+ "end\n",
+ "\n",
+ "Base.:Table(io::IOContext) = write(io, position(s))\n",
+ "\n",
+ "function const_rerror(pre::GlobalRef)\n",
+ " ret = proty(d)\n",
+ " if !rel_key && length(blk)\n",
+ " return htstarted_keys(terminal(u, p))\n",
+ " end\n",
+ " write(io, (\"\\\\\\\\\\\" => \"\\n\\n\\n\\n\") ? \"\\n>\\n\"\n",
+ " p = empty(dir+stdout)\n",
+ " n = MD(count_ok_new_data(L) : n_power\n",
+ " while push!(blks[$ur], altbuf)\n",
+ " end\n",
+ " function prec_uninitual(p, keep='\\n')\n",
+ " print(io, \"1 2\")\n",
+ " else\n",
+ " p = blk + p0\n",
+ " out = Mair(1)\n",
+ " elseif occursin(\".cmd\", keep=ks) != 0\n",
+ " res = write(io, c)\n",
+ " end\n",
+ " while take!(word)\n",
+ " \n"
+ ]
+ }
+ ],
"source": [
"generate(juliamodel,juliachars,1000)"
]