Skip to content

Commit

Permalink
PerfProfiler can now execute Rust code
Browse files Browse the repository at this point in the history
  • Loading branch information
BhavyeMathur committed Dec 30, 2024
1 parent 472c663 commit cba3c32
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 25 deletions.
8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@ name = "chela"
version = "0.1.0"
edition = "2021"

[[bin]]
name = "fill"
path = "benches/fill.rs"

[[bin]]
name = "fill_naive"
path = "benches/fill_naive.rs"

[dependencies]

[lints.rust]
Expand Down
14 changes: 12 additions & 2 deletions benches/fill.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

class TensorFill(TimingSuite):
def __init__(self, n):
self.n = n

self.ndarray = np.zeros(n, dtype="float32")
self.tensor_cpu = torch.zeros(n, dtype=torch.float32)
self.tensor_mps = torch.zeros(n, device="mps", dtype=torch.float32)
Expand All @@ -24,8 +26,16 @@ def run(self):
# def run(self):
# self.tensor_mps.fill_(5)

# Benchmark entry for the "Chela CPU (Accelerate)" series.
# `measure_rust_performance` builds the Cargo binary target "fill" and wraps this
# method so that the path of the built executable arrives as `executable`.
@measure_rust_performance("Chela CPU (Accelerate)", target="fill")
def run(self, executable):
# Run the binary with the element count as its argument and return the integer it prints.
return self.run_rust(executable, self.n)

# Benchmark entry for the "Chela CPU (Naïve)" series.
# `measure_rust_performance` builds the Cargo binary target "fill_naive" and wraps
# this method so that the path of the built executable arrives as `executable`.
@measure_rust_performance("Chela CPU (Naïve)", target="fill_naive")
def run(self, executable):
# Run the binary with the element count as its argument and return the integer it prints.
return self.run_rust(executable, self.n)


if __name__ == "__main__":
sizes = 10 ** np.arange(1, 7)
results = TensorFill.profile_each(sizes, n=30)
sizes = [128, 256, 499, 512, 1023, 1024, 2048]
results = TensorFill.profile_each(sizes, n=50)
plot_results(sizes, results)
19 changes: 19 additions & 0 deletions benches/fill.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use chela::*;
use std::env;
use std::time::Instant;


/// Times one `Tensor::fill` call on a freshly zeroed tensor of `size` elements.
///
/// The tensor is created before the clock starts, so only the `fill` call is
/// measured. Returns the elapsed time in nanoseconds.
fn profile(size: usize) -> u128 {
    let mut tensor = Tensor::zeros(size);

    let now = Instant::now();
    tensor.fill(5_f32);
    let elapsed = now.elapsed().as_nanos();

    // Mark the filled tensor as observed so the optimiser can never treat the
    // timed writes as dead stores. Done after the clock is read, so it adds
    // nothing to the measurement.
    std::hint::black_box(&tensor);

    elapsed
}

/// Entry point: `fill <size>`.
///
/// Reads the element count from the first command-line argument, runs one
/// timed fill and prints the elapsed nanoseconds as a single integer on
/// stdout. A missing or malformed argument produces a message on stderr and
/// exit status 2 instead of a panic.
fn main() {
    let size = match env::args().nth(1).map(|arg| arg.parse::<usize>()) {
        Some(Ok(size)) => size,
        Some(Err(err)) => {
            eprintln!("invalid size argument: {err}");
            std::process::exit(2);
        }
        None => {
            eprintln!("usage: fill <size>");
            std::process::exit(2);
        }
    };

    println!("{}", profile(size));
}
19 changes: 19 additions & 0 deletions benches/fill_naive.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use chela::*;
use std::env;
use std::time::Instant;


/// Times one `Tensor::fill_naive` call on a freshly zeroed tensor of `size` elements.
///
/// The tensor is created before the clock starts, so only the `fill_naive`
/// call is measured. Returns the elapsed time in nanoseconds.
fn profile(size: usize) -> u128 {
    let mut tensor = Tensor::zeros(size);

    let now = Instant::now();
    tensor.fill_naive(5_f32);
    let elapsed = now.elapsed().as_nanos();

    // The tensor is never read after being filled; without this the optimiser
    // is free to treat the timed stores as dead. Done after the clock is read,
    // so it adds nothing to the measurement.
    std::hint::black_box(&tensor);

    elapsed
}

/// Entry point: `fill_naive <size>`.
///
/// Reads the element count from the first command-line argument, runs one
/// timed naive fill and prints the elapsed nanoseconds as a single integer on
/// stdout. A missing or malformed argument produces a message on stderr and
/// exit status 2 instead of a panic.
fn main() {
    let size = match env::args().nth(1).map(|arg| arg.parse::<usize>()) {
        Some(Ok(size)) => size,
        Some(Err(err)) => {
            eprintln!("invalid size argument: {err}");
            std::process::exit(2);
        }
        None => {
            eprintln!("usage: fill_naive <size>");
            std::process::exit(2);
        }
    };

    println!("{}", profile(size));
}
6 changes: 3 additions & 3 deletions benches/perfprofiler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .suite import TimingSuite
from .profile import measure_performance
from .plot import plot_results
from .suite import *
from .profile import *
from .plot import *
13 changes: 6 additions & 7 deletions benches/perfprofiler/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,28 @@
from .result import Result

colors = {
"NumPy": "#4dabcf",
"PyTorch CPU": "#f2765d",
"PyTorch MPS": "#812ce5",
"NumPy": "#4dabcf",
"PyTorch CPU": "#f2765d",
"PyTorch MPS": "#812ce5",
"Chela CPU (Accelerate)": "#ce422b"
}


def plot_results(x, results: dict[str, list[Result]]) -> None:
plt.figure()
plt.figure(figsize=(10, 7))
ax = plt.gca()

for label, result in results.items():
upper_bound = [res.mean + res.se() for res in result]
lower_bound = [res.mean - res.se() for res in result]

print(upper_bound, lower_bound)

color = colors.get(label)

ax.fill_between(x, upper_bound, lower_bound, color=color, alpha=0.2, ec=None)
ax.plot(x, result, label=label, color=color)

ax.yaxis.set_major_formatter(FuncFormatter(lambda val, _: f"{val:.3f} ms"))
plt.xscale("symlog")
# plt.xscale("symlog")
# plt.yscale("symlog")
plt.legend()

Expand Down
26 changes: 23 additions & 3 deletions benches/perfprofiler/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import time

from .result import Result
from .util import compile_rust, get_method_class

profile_methods = defaultdict(dict)
rust_methods = defaultdict(dict)


# noinspection PyDecorator
@classmethod
def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[str, Result]:
total_time = {label: [] for label in cls.perf_methods.keys()}
total_time = defaultdict(list)

for _ in range(n):
suite_obj = cls(*args, **kwargs)
Expand All @@ -22,6 +24,10 @@ def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[st

total_time[label].append(end - start)

for label, function in cls.rust_methods.items():
elapsed = function(suite_obj)
total_time[label].append(elapsed)

results = {}
for label, times in total_time.items():
results[label] = Result(times)
Expand All @@ -33,11 +39,25 @@ def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[st

def measure_performance(label: str) -> Callable:
def decorator(function):
clsname = function.__qualname__.split(".")[0]
clsname = get_method_class(function)
profile_methods[clsname][label] = function
return function

return decorator


__all__ = ["measure_performance"]
def measure_rust_performance(label: str, target: str) -> Callable:
    """Register a method as a Rust-backed benchmark under ``label``.

    When the decorator is applied, the Cargo binary ``target`` is built via
    ``compile_rust``. The decorated method is replaced by a wrapper that
    inserts the built executable's path as the first argument after ``self``.
    The wrapper is also stored in ``rust_methods`` under the name returned by
    ``get_method_class`` for the decorated function.
    """

    def decorator(function):
        # Built once, at decoration time; every call of the wrapper reuses it.
        binary_path = compile_rust(target)

        def wrapper(self, *args, **kwargs):
            return function(self, binary_path, *args, **kwargs)

        owner = get_method_class(function)
        rust_methods[owner][label] = wrapper
        return wrapper

    return decorator


__all__ = ["measure_performance", "measure_rust_performance"]
15 changes: 12 additions & 3 deletions benches/perfprofiler/suite.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
from typing import Iterable
import subprocess

# from tqdm import tqdm
from tqdm import tqdm

# noinspection PyProtectedMember
from .profile import profile, profile_methods
from .profile import profile, profile_methods, rust_methods
from .util import merge_dicts
from .result import Result


# from tqdm import tqdm


class TimingSuiteMeta(type):
    """Metaclass that attaches the registered benchmark methods to each suite class.

    On class creation it moves the entries registered under the new class's
    name out of ``profile_methods`` and ``rust_methods`` into the class
    attributes ``perf_methods`` and ``rust_methods``, and installs ``profile``
    as the class's ``profile`` attribute.
    """

    perf_methods = {}

    def __new__(cls, name, bases, attrs):
        # `pop` removes the registry entry, so the same methods cannot be
        # picked up again by a later class with the same name.
        attrs.update(
            perf_methods=profile_methods.pop(name, {}),
            rust_methods=rust_methods.pop(name, {}),
            profile=profile,
        )
        return super().__new__(cls, name, bases, attrs)


class TimingSuite(metaclass=TimingSuiteMeta):
def run_rust(self, executable, *argv) -> int:
    """Run a benchmark executable and return the integer it prints.

    Each item of ``argv`` is converted with ``str`` and passed to the process
    as its own command-line argument. The process is started directly, not
    through a shell, so paths containing spaces need no quoting.

    Raises ``subprocess.CalledProcessError`` if the process exits with a
    non-zero status, and ``ValueError`` if its stdout is not a single integer.
    """
    command = [str(executable), *map(str, argv)]
    value = subprocess.check_output(command)
    return int(value)

def profile(*args, **kwargs) -> dict[str, Result]:
raise NotImplementedError() # implemented by TimingSuiteMeta

@classmethod
def profile_each(cls, args_array, n: int = 100) -> dict[str, list[Result]]:
results = []

for i, args in enumerate(args_array):
for i, args in enumerate(tqdm(args_array)):
if isinstance(args, dict):
result = cls.profile(**args, n=n, verbose=False)
elif isinstance(args, Iterable):
Expand Down
18 changes: 18 additions & 0 deletions benches/perfprofiler/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
import subprocess
import json


def merge_dicts(list_of_dicts):
    """Turn a list of dicts into a dict of lists.

    The keys are taken from the first dict; every dict in the list must
    contain each of them. For each key the result holds the values in list
    order. An empty list yields an empty dict.
    """
    if not list_of_dicts:
        return {}

    first = list_of_dicts[0]
    return {key: [d[key] for d in list_of_dicts] for key in first.keys()}


def compile_rust(target: str) -> str:
    """Build the Cargo binary ``target`` in release mode and return its path.

    Cargo is run with ``--message-format=json``. Its stdout is scanned for
    ``compiler-artifact`` messages, and the ``executable`` field of the last
    one that has one is returned.

    Raises ``subprocess.CalledProcessError`` if the build fails, and
    ``RuntimeError`` if cargo reports no executable.
    """
    out = subprocess.check_output(f"$HOME/.cargo/bin/cargo build --release --bin {target} "
                                  "-v --message-format=json", shell=True)

    executable = None
    for line in out.splitlines():
        line = line.strip()
        if not line.startswith(b"{"):
            continue  # not a JSON message line
        try:
            message = json.loads(line)
        except ValueError:
            continue
        # Library artifacts report a null executable; keep the last real one.
        if message.get("reason") == "compiler-artifact" and message.get("executable"):
            executable = message["executable"]

    if executable is None:
        raise RuntimeError(f"cargo did not report an executable for target {target!r}")
    return executable


def get_method_class(method) -> str:
    """Return the name of the class in which ``method`` was defined.

    The name is the ``__qualname__`` component immediately before the method
    name, so nested classes (``Outer.Inner.run`` -> ``Inner``) and classes
    defined inside functions (``make.<locals>.Suite.run`` -> ``Suite``)
    resolve to the class itself. A qualified name with no dots is returned
    unchanged.
    """
    parts = method.__qualname__.split(".")
    return parts[-2] if len(parts) > 1 else parts[0]


__all__ = ["merge_dicts", "compile_rust", "get_method_class"]
10 changes: 10 additions & 0 deletions src/tensor/data_buffer/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ pub trait DataBuffer: Index<usize> {

fn const_ptr(&self) -> *const Self::DType;

fn mut_ptr(&mut self) -> *mut Self::DType;

fn to_view(&self) -> DataView<Self::DType>;

// fn clone(&self) -> DataOwned<Self::DType>;
Expand All @@ -36,6 +38,10 @@ impl<T: RawDataType> DataBuffer for DataOwned<T> {
self.ptr.as_ptr()
}

/// Returns the buffer's stored pointer as a raw `*mut T`.
///
/// The pointer is read directly, without first creating a `&mut T`, so this
/// function needs no `unsafe` and asserts nothing about the validity or
/// uniqueness of the pointee. Callers take on those obligations when they
/// dereference the result.
fn mut_ptr(&mut self) -> *mut T {
    self.ptr.as_ptr()
}

fn to_view(&self) -> DataView<T> {
let ptr = self.ptr;
let len = self.len;
Expand All @@ -57,6 +63,10 @@ impl<T: RawDataType> DataBuffer for DataView<T> {
self.ptr.as_ptr()
}

/// Returns the view's stored pointer as a raw `*mut T`.
///
/// The pointer is read directly, without first creating a `&mut T`. A view
/// can be cloned, so several views may hold the same pointer; materialising a
/// unique reference here would assert exclusivity that this type cannot
/// guarantee. Callers are responsible for sound use of the returned pointer.
fn mut_ptr(&mut self) -> *mut T {
    self.ptr.as_ptr()
}

fn to_view(&self) -> DataView<T> {
(*self).clone()
}
Expand Down
35 changes: 31 additions & 4 deletions src/tensor/data_buffer/fill.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,52 @@ use crate::dtype::RawDataType;
use std::ffi::c_int;

#[cfg(target_vendor = "apple")]
use crate::accelerate::cblas::{catlas_dset, catlas_sset};
use crate::accelerate::cblas::catlas_dset;
use crate::accelerate::cblas::catlas_sset;

pub(in crate::tensor) trait Fill<T>
where
T: RawDataType,
{
fn fill(&self, value: T);
fn fill(&mut self, value: T);

fn fill_naive(&mut self, value: T);
}

#[cfg(target_vendor = "apple")]
impl Fill<f32> for DataOwned<f32> {
fn fill(&self, value: f32) {
/// Sets the buffer's elements to `value` through Accelerate's `catlas_sset`.
///
/// The call passes `self.len` (cast to `c_int`) as the count, the buffer
/// pointer, and `1` as the final argument.
// NOTE(review): the routine writes through the pointer, yet it is obtained via
// `const_ptr()` — confirm the binding's parameter type and consider `mut_ptr()`.
fn fill(&mut self, value: f32) {
// NOTE(review): `self.len as c_int` truncates silently if `len` does not fit in
// `c_int` — presumably buffers stay below that limit; verify or use `try_from`.
unsafe { catlas_sset(self.len as c_int, value, self.const_ptr(), 1) }
}

/// Writes `value` into each of the buffer's `self.len` slots, one at a time.
///
/// Kept out of line with `#[inline(never)]`.
#[inline(never)]
fn fill_naive(&mut self, value: f32) {
    let base = self.mut_ptr();

    for offset in 0..self.len {
        // SAFETY: requires that `mut_ptr()` addresses `self.len` contiguous,
        // writable `f32` slots; `offset < self.len` keeps every write inside
        // that range.
        unsafe { base.add(offset).write(value) };
    }
}
}

#[cfg(target_vendor = "apple")]
impl Fill<f64> for DataOwned<f64> {
fn fill(&self, value: f64) {
/// Sets the buffer's elements to `value` through Accelerate's `catlas_dset`.
///
/// The call passes `self.len` (cast to `c_int`) as the count, the buffer
/// pointer, and `1` as the final argument.
// NOTE(review): the routine writes through the pointer, yet it is obtained via
// `const_ptr()` — confirm the binding's parameter type and consider `mut_ptr()`.
fn fill(&mut self, value: f64) {
// NOTE(review): `self.len as c_int` truncates silently if `len` does not fit in
// `c_int` — presumably buffers stay below that limit; verify or use `try_from`.
unsafe { catlas_dset(self.len as c_int, value, self.const_ptr(), 1) }
}

/// Writes `value` into each of the buffer's `self.len` slots, one at a time.
///
/// Slots `0..self.len` are written, starting with the first element.
/// Marked `#[inline(never)]` to match the `f32` implementation.
#[inline(never)]
fn fill_naive(&mut self, value: f64) {
    let base = self.mut_ptr();

    for offset in 0..self.len {
        // SAFETY: requires that `mut_ptr()` addresses `self.len` contiguous,
        // writable `f64` slots; `offset < self.len` keeps every write inside
        // that range. Writing at `base + offset` (rather than advancing the
        // pointer first) covers element 0 and never touches one-past-the-end.
        unsafe { base.add(offset).write(value) };
    }
}
}
10 changes: 7 additions & 3 deletions src/tensor/fill.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ where
B: DataBuffer<DType=T> + Fill<T>,
T: RawDataType,
{
pub fn fill(&self, value: T) {
pub fn fill(&mut self, value: T) {
self.data.fill(value)
}

/// Fills the tensor with `value` by delegating to the data buffer's
/// `Fill::fill_naive` implementation.
pub fn fill_naive(&mut self, value: T) {
self.data.fill_naive(value)
}
}

#[cfg(target_vendor = "apple")]
Expand All @@ -20,7 +24,7 @@ mod tests {

#[test]
fn test_fill_f32() {
let a: Tensor<f32> = Tensor::zeros([3, 5, 3]);
let mut a: Tensor<f32> = Tensor::zeros([3, 5, 3]);

assert!(a.flat_iter().all(|x| x == 0.0));
a.fill(25.0);
Expand All @@ -29,7 +33,7 @@ mod tests {

#[test]
fn test_fill_f64() {
let a: Tensor<f64> = Tensor::zeros([15]);
let mut a: Tensor<f64> = Tensor::zeros([15]);

assert!(a.flat_iter().all(|x| x == 0.0));
a.fill(20.0);
Expand Down

0 comments on commit cba3c32

Please sign in to comment.