add note

add nci60-pca-clustering-svm
math4mad · Oct 14, 2023 · 61270a8 · 61270a8
1 parent 2341f2d
commit 61270a8
Show file tree

Hide file tree

Showing 4 changed files with 195 additions and 0 deletions.
diff --git a/_freeze/category/dimension-reduction/1-nci60-pca-svm/execute-results/html.json b/_freeze/category/dimension-reduction/1-nci60-pca-svm/execute-results/html.json
diff --git a/...ze/category/dimension-reduction/1-nci60-pca-svm/figure-html/cell-8-output-1.png b/...ze/category/dimension-reduction/1-nci60-pca-svm/figure-html/cell-8-output-1.png
diff --git a/category/dimension-reduction/1-nci60-pca-svm.qmd b/category/dimension-reduction/1-nci60-pca-svm.qmd
@@ -0,0 +1,115 @@
+---
+title: "1-nci60-pca-clustering-svm"
+code-fold: true
+---
+
+:::{.callout-note title="简介"}
+
+ 参见 [ISLR-nci60]:An Introduction to Statistical Learning.pdf page 18
+
+ 流程为:`pca->clustering->svm`  半监督学习方法,首先对数据降维, 然后聚类, 最后使用 SVM 进行分类学习
+:::
+
+
+## 1. load package
+```{julia}
+
+import MLJ:transform,predict
+using DataFrames,MLJ,CSV,MLJModelInterface,GLMakie,Random
+Random.seed!(45454)
+```
+
+##  2.  import data
+```{julia}
+    df= CSV.File("./data/NCI60.csv") |> DataFrame |> dropmissing
+    Xtr = df[:,2:end]
+    Xtr_labels = Vector(df[:,1])
+    # # split other half to testing set
+    Xte=df[1:3:end,2:end]
+    Xte_labels = Vector(df[1:3:end,1])
+    first(df,10)
+```
+
+## 2. MLJ WorkFlow
+### 2.1 define models
+需要定义三个模型:
+
+1. pca model
+2. clustering model
+3. classficiation model
+
+```{julia}
+ PCA = @load PCA pkg=MultivariateStats
+ KMeans = @load KMeans pkg=Clustering
+ SVC = @load SVC pkg=LIBSVM
+
+ model=PCA(maxoutdim=2) # pca model
+ model2 = KMeans(k=3)   # clustering model
+ model3 = SVC()        # svm dodel
+```
+
+### 2.2 PCA 
+在 PCA 流程中要完成两步:
+
+1. PCA 模型训练(如果为了便于可视化, 维度为 2或者 3)
+2. 将原始数据映射到降维的空间上
+```{julia}
+mach = machine(model, Xtr) |> fit!
+Xproj =transform(mach, Xtr)
+first(Xproj,10)
+```
+
+### 2.3 生成决策边界测试数据集
+```{julia}
+function boundary_data(df,;n=200)
+    n1=n2=n
+    xlow,xhigh=extrema(df[:,:x1])
+    ylow,yhigh=extrema(df[:,:x2])
+    tx = range(xlow,xhigh; length=n1)
+    ty = range(ylow,yhigh; length=n2)
+    x_test = mapreduce(collect, hcat, Iterators.product(tx, ty));
+    xtest=MLJ.table(x_test')
+    return tx,ty, xtest
+end
+tx,ty, xtest=boundary_data(Xproj)  #xtest  生成决策边界的数据
+```
+
+### 2.5 Clustering and  SVM  training
+```{julia}
+ mach2= machine(model2, Xproj) |> fit!
+ yhat = predict(mach2, Xproj)  # 聚类结果
+ cat=yhat|>Array|>levels
+
+ mach3 = machine(model3, Xproj, yhat)|>fit!
+ ypred=predict(mach3, xtest)|>Array|>d->reshape(d,200,200) #SVM 结果
+
+```
+
+### 2.6 plot results
+```{julia}
+    function plot_model()
+        fig = Figure()
+        ax = Axis(fig[1, 1],title="NCI60 Machine Learning",subtitle="pca->clustering->svm")
+
+        colors = [:red, :orange, :blue]
+        contourf!(ax, tx,ty,ypred)
+        for (i, c) in enumerate(Array(yhat))
+            data = Xproj[i, :]
+            
+            scatter!(ax, data.x1, data.x2;marker=:circle,markersize=12,color=(colors[c],0.3),strokewidth=1,strokecolor=:black)
+            text!(ax,data.x1, data.x2;text="v$(i)")
+        end
+
+        fig
+        #save("NCI60 Machine Learning:pca->clustering->svm-with-tag.png",fig)
+    end
+
+    plot_model()
+```
+
+
+
+
+
+
+
diff --git a/category/dimension-reduction/data/NCI60.csv b/category/dimension-reduction/data/NCI60.csv