Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding code for SVM using Python and C++ #514

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions SVM_Python_Cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Build script for the libsvm classification/regression examples.
# Fixes: (1) the original referenced ${svm_test_SOURCE_DIR}, which is never
# defined because the project is named svm_cpp, so the runtime output dir was
# silently empty; use PROJECT_SOURCE_DIR instead. (2) CMake < 3.5 minimums are
# rejected by modern CMake releases.
cmake_minimum_required(VERSION 3.5)
project(svm_cpp)

# The library prefix
set(LIB_PREFIX _svm_test)

set(CMAKE_CXX_FLAGS "-g -Wall")

# Put the built executables into <project-root>/bin, as documented in README
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)

# libsvm support library shared by both example executables
add_library(svm_lib
    svm.h svm.cpp
)
add_executable(svm_classification SVM_Classification.cpp)
add_executable(svm_regression SVM_Regression.cpp)

set_target_properties(svm_lib PROPERTIES OUTPUT_NAME ${LIB_PREFIX}_svm_lib)
set_target_properties(svm_classification PROPERTIES OUTPUT_NAME svm_classification)
set_target_properties(svm_regression PROPERTIES OUTPUT_NAME svm_regression)

# link the library to the executables
target_link_libraries(svm_classification svm_lib)
target_link_libraries(svm_regression svm_lib)
28 changes: 28 additions & 0 deletions SVM_Python_Cpp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SVM using Python and C++

## Requirements

In order to run the Python scripts, you will need to install the `scikit-learn` module using `pip install scikit-learn`.

For running the C++ code, make sure you download and place all the files provided in the repository following the same folder structure.

## Compilation Instructions

To run the Python scripts, use:

```
python SVM_Regression.py
python SVM_Classification.py
```

To build the C++ code, use:

```
mkdir build
cd build
cmake ..
cmake --build . --config Release
cd ..
```

The executable files will be created in the folder `bin/`. You can run the regression example using the `svm_regression` binary and the classification example using the `svm_classification` binary.
158 changes: 158 additions & 0 deletions SVM_Python_Cpp/SVM_Classification.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
// Classification using SVM

#include "svm.h"
#include <ctype.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
using namespace std;

// create data
// Generate a random dataset for the classification example.
//
// Produces `problemSize` samples of `featureNum` features each. Features are
// drawn in pairs: each pair (value, value_2) is resampled from [0, 1000)
// until the two values differ by at least 40, leaving a margin between the
// two classes that generateLabels() later derives from the pair ordering.
//
// Fix: the original inner loop ran featureNum-1 times while pushing two
// values per iteration, yielding 2*(featureNum-1) features per sample —
// correct only for featureNum == 2. Using featureNum/2 pair iterations
// yields exactly featureNum features (featureNum is expected to be even).
vector<vector<double>> generateData(int problemSize, int featureNum) {
    vector<vector<double>> data;
    data.reserve(problemSize);
    for(int i = 0; i < problemSize; i++) {
        // create feature vector
        vector<double> featureSet;
        featureSet.reserve(featureNum);
        for(int j = 0; j < featureNum / 2; j++) {
            int value = 0;
            int value_2 = 0;
            // resample until there is a gap of at least 40 between the pair,
            // which keeps the two classes separable
            while(abs(value_2 - value) < 40){
                value = rand() % 1000;
                value_2 = rand() % 1000;
            }
            featureSet.push_back(value);
            featureSet.push_back(value_2);
        }
        data.push_back(featureSet);
    }
    return data;
}

// create labels
// Derive a class label for each sample: +1 when the first feature exceeds
// the second, otherwise -1. Afterwards, a small random fraction of labels
// (rand() % 1000 > 980, i.e. roughly 2%) is flipped to simulate label noise.
vector<int> generateLabels(int labelsSize, vector<vector<double>> data) {
    vector<int> labels;
    labels.reserve(labelsSize);
    // assign the clean label for every sample
    for(int idx = 0; idx < labelsSize; ++idx) {
        labels.push_back(data[idx][0] > data[idx][1] ? 1 : -1);
    }
    // flip a few labels at random to introduce noise
    for(int idx = 0; idx < labelsSize; ++idx) {
        if(rand() % 1000 > 980) {
            labels[idx] = -labels[idx];
        }
    }
    return labels;
}

// utility function to scale data
// Linearly rescale every feature from the raw range [0, 999] into [-1, 1].
//
// Each raw value v maps to 2 * v / 999 - 1, matching the fixed output range
// of the rand() % 1000 draws in generateData(). Fix: the input is now taken
// by const reference instead of by value, avoiding a deep copy of the whole
// dataset on every call.
vector<vector<double>> scale_data(const vector<vector<double>>& data) {
    vector<vector<double>> scaled_data;
    scaled_data.reserve(data.size());
    for(const auto& sample : data) {
        vector<double> featureSet;
        featureSet.reserve(sample.size());
        for(double raw : sample) {
            // map [0, 999] -> [-1, 1]
            featureSet.push_back(2 * (raw - 0) / (999.0 - 0) - 1);
        }
        scaled_data.push_back(featureSet);
    }
    return scaled_data;
}

int main(){
// Training and testing data
int test_size = 300;
int train_size = 700;
int featureNum = 2;

vector<vector<double>> test_data = generateData(test_size, featureNum);
vector<int> test_labels = generateLabels(test_size, test_data);
vector<vector<double>> train_data = generateData(train_size, featureNum);
vector<int> train_labels = generateLabels(train_size, train_data);

// Scale data
//train_data = scale_data(train_data);
//test_data = scale_data(test_data);

// Train model on the dataset
struct svm_parameter param; // parameters of svm
struct svm_problem prob; // contains the training data in svm_node format
// set parameters
param.svm_type = C_SVC;
param.kernel_type = RBF;
param.degree = 3;
param.gamma = 0.5;
param.coef0 = 0;
param.nu = 0.5;
param.cache_size = 100;
param.eps = 1e-3;
param.p = 0.1;
param.shrinking = 1;
param.probability = 0;
param.nr_weight = 0;
param.weight_label = NULL;
param.weight = NULL;
param.C = 10;

// Number of training examples
prob.l = train_size;

// training dataset in svm_node matrix format
svm_node** svm_x_train = (svm_node**)malloc((prob.l) * sizeof(svm_node*));

// iterate over each sample
for (int sample=0; sample < prob.l; sample++){
svm_node* x_space = (svm_node*)malloc((featureNum+1) * sizeof(svm_node));
for (int feature=0; feature < featureNum; feature++){
// feature value
x_space[feature].value= train_data[sample][feature];
// feature index
x_space[feature].index = feature+1;
}
// each sample's last feature should be -1 in libSVM
x_space[featureNum].index = -1;
svm_x_train[sample] = x_space;
}

// store training data in prob
prob.x = svm_x_train;

// store labels
prob.y = (double *)malloc(prob.l * sizeof(double));
for (int sample = 0; sample < prob.l; sample++){
prob.y[sample] = train_labels[sample];
}

// train the model
struct svm_model *model;
model = svm_train(&prob, &param);

// Evaluating the trained model on test dataset
// svm_predict returns the predicted value in C++
int prediction;

// iterate over each test sample
for (int sample=0; sample < test_data.size(); sample++){
svm_node* x_space = (svm_node*)malloc((featureNum+1) * sizeof(svm_node));
for (int feature=0; feature < featureNum; feature++){
// feature value
x_space[feature].value= test_data[sample][feature];
// feature index
x_space[feature].index = feature+1;
}
// each sample's last feature should be -1 in libSVM
x_space[featureNum].index = -1;
prediction = svm_predict(model, x_space);
std::cout << "Prediction: " << prediction << ", Groundtruth: " << test_labels[sample] << std::endl;
}
}
30 changes: 30 additions & 0 deletions SVM_Python_Cpp/SVM_Classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Classification using SVM

## Import required modules
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm

## Build a synthetic binary-classification dataset
# 1000 samples, 2 features (1 informative, 0 redundant), 1 cluster per class
X, Y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=1,
    n_clusters_per_class=1,
    n_redundant=0,
)

## Split the data: 70% for training, 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

## Standardize features using statistics computed on the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Fit a support-vector classifier on the training data
clf = svm.SVC()
clf.fit(X_train, y_train)

## Predict labels for the held-out test data
clf_predictions = clf.predict(X_test)
138 changes: 138 additions & 0 deletions SVM_Python_Cpp/SVM_Regression.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Regression using SVM

#include "svm.h"
#include <ctype.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
using namespace std;

// generate data for regression task
// Generate random samples for the regression task; every feature is an
// integer drawn uniformly from [0, 1000).
//
// Fix: the original inner loop ran featureNum-1 times while pushing two
// values per iteration, yielding 2*(featureNum-1) features per sample —
// correct only for featureNum == 2. Using featureNum/2 pair iterations
// yields exactly featureNum features (featureNum is expected to be even).
vector<vector<double>> generateData(int problemSize, int featureNum) {
    vector<vector<double>> data;
    data.reserve(problemSize);
    for(int i = 0; i < problemSize; i++) {
        vector<double> featureSet;
        featureSet.reserve(featureNum);
        for(int j = 0; j < featureNum / 2; j++) {
            int value = rand() % 1000;
            int value_2 = rand() % 1000;
            featureSet.push_back(value);
            featureSet.push_back(value_2);
        }
        data.push_back(featureSet);
    }
    return data;
}

// generate labels for the data provided
// Regression target for each sample: the mean of its two features. The mean
// is computed in double and truncated toward zero when stored as an int.
vector<int> generateLabels(int labelsSize, vector<vector<double>> data) {
    vector<int> labels;
    labels.reserve(labelsSize);
    for(int idx = 0; idx < labelsSize; ++idx) {
        double mean = (data[idx][0] + data[idx][1]) / 2;
        labels.push_back(mean); // narrowing double -> int, truncates
    }
    return labels;
}

// utility function to scale data
// Linearly rescale every feature from the raw range [0, 999] into [-1, 1].
//
// Each raw value v maps to 2 * v / 999 - 1, matching the fixed output range
// of the rand() % 1000 draws in generateData(). Fix: the input is now taken
// by const reference instead of by value, avoiding a deep copy of the whole
// dataset on every call.
vector<vector<double>> scale_data(const vector<vector<double>>& data) {
    vector<vector<double>> scaled_data;
    scaled_data.reserve(data.size());
    for(const auto& sample : data) {
        vector<double> featureSet;
        featureSet.reserve(sample.size());
        for(double raw : sample) {
            // map [0, 999] -> [-1, 1]
            featureSet.push_back(2 * (raw - 0) / (999.0 - 0) - 1);
        }
        scaled_data.push_back(featureSet);
    }
    return scaled_data;
}

int main(){
// Training and testing data
int test_size = 300;
int featureNum = 2;
int train_size = 700;

vector<vector<double>> test_data = generateData(test_size, featureNum);
vector<int> test_labels = generateLabels(test_size, test_data);
vector<vector<double>> train_data = generateData(train_size, featureNum);
vector<int> train_labels = generateLabels(train_size, train_data);

// Scale data
train_data = scale_data(train_data);
test_data = scale_data(test_data);

// Train model on the dataset
struct svm_parameter param; // parameters of svm
struct svm_problem prob; // contains the training data in svm_node format
// set parameters
param.svm_type = EPSILON_SVR;
param.kernel_type = RBF;
param.gamma = 0.5;
param.degree = 3;
param.coef0 = 0;
param.nu = 0.5;
param.C = 10;
param.eps = 1e-3;
param.p = 0.1;
param.shrinking = 1;
param.probability = 0;
param.nr_weight = 0;
param.weight_label = NULL;
param.weight = NULL;

// Number of training examples
prob.l = train_size;

// training dataset in svm_node matrix format
svm_node** svm_x_train = (svm_node**)malloc((prob.l) * sizeof(svm_node*));

// iterate over each sample
for (int sample=0; sample < prob.l; sample++){
svm_node* x_space = (svm_node*)malloc((featureNum+1) * sizeof(svm_node));
for (int feature=0; feature < featureNum; feature++){
// feature value
x_space[feature].value= train_data[sample][feature];
// feature index
x_space[feature].index = feature+1;
}
// each sample's last feature should be -1 in libSVM
x_space[featureNum].index = -1;
svm_x_train[sample] = x_space;
}

// store training data in prob
prob.x = svm_x_train;

// store labels
prob.y = (double *)malloc(prob.l * sizeof(double));
for (int sample = 0; sample < prob.l; sample++){
prob.y[sample] = train_labels[sample];
}

// train the model
struct svm_model *model;
model = svm_train(&prob, &param);

// Evaluating the trained model on test dataset
// svm_predict returns the predicted value in C++
int prediction;

// iterate over each test sample
for (int sample=0; sample < test_data.size(); sample++){
svm_node* x_space = (svm_node*)malloc((featureNum+1) * sizeof(svm_node));
for (int feature=0; feature < featureNum; feature++){
// feature value
x_space[feature].value= train_data[sample][feature];
// feature index
x_space[feature].index = feature+1;
}
// each sample's last feature should be -1 in libSVM
x_space[featureNum].index = -1;
prediction = svm_predict(model, x_space);
std::cout << "Prediction: " << prediction << ", Groundtruth: " << test_labels[sample] << std::endl;
}
}
Loading