PerfProfiler can now execute Rust code

BhavyeMathur · Dec 30, 2024 · cba3c32 · cba3c32
1 parent 472c663
commit cba3c32
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 25 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,6 +3,14 @@ name = "chela"
 version = "0.1.0"
 edition = "2021"
 
+[[bin]]
+name = "fill"
+path = "benches/fill.rs"
+
+[[bin]]
+name = "fill_naive"
+path = "benches/fill_naive.rs"
+
 [dependencies]
 
 [lints.rust]

diff --git a/benches/fill.py b/benches/fill.py
@@ -6,6 +6,8 @@
 
 class TensorFill(TimingSuite):
     def __init__(self, n):
+        self.n = n
+
         self.ndarray = np.zeros(n, dtype="float32")
         self.tensor_cpu = torch.zeros(n, dtype=torch.float32)
         self.tensor_mps = torch.zeros(n, device="mps", dtype=torch.float32)
@@ -24,8 +26,16 @@ def run(self):
     # def run(self):
     #     self.tensor_mps.fill_(5)
 
+    @measure_rust_performance("Chela CPU (Accelerate)", target="fill")
+    def run(self, executable):
+        return self.run_rust(executable, self.n)
+
+    @measure_rust_performance("Chela CPU (Naïve)", target="fill_naive")
+    def run(self, executable):
+        return self.run_rust(executable, self.n)
+
 
 if __name__ == "__main__":
-    sizes = 10 ** np.arange(1, 7)
-    results = TensorFill.profile_each(sizes, n=30)
+    sizes = [128, 256, 499, 512, 1023, 1024, 2048]
+    results = TensorFill.profile_each(sizes, n=50)
     plot_results(sizes, results)
diff --git a/benches/fill.rs b/benches/fill.rs
@@ -0,0 +1,19 @@
+use chela::*;
+use std::env;
+use std::time::Instant;
+
+
+/// Times a single `fill` call on a tensor freshly created by `Tensor::zeros(size)`.
+///
+/// The tensor is constructed before the timer starts, so construction is not part of
+/// the measurement. Returns the elapsed time in nanoseconds.
+fn profile(size: usize) -> u128 {
+    let mut tensor = Tensor::zeros(size);
+
+    let now = Instant::now();
+    tensor.fill(5_f32);
+    // Mark the filled tensor as observed so the optimizer cannot treat the writes as
+    // dead stores to a buffer that is never read afterwards.
+    std::hint::black_box(&tensor);
+    now.elapsed().as_nanos()
+}
+
+/// Parses the tensor size from the first command-line argument and prints the measured
+/// time (in nanoseconds) on stdout, where the Python profiler reads it.
+fn main() {
+    // Fail with an explanatory message instead of an index-out-of-bounds / bare unwrap panic.
+    let size = env::args()
+        .nth(1)
+        .expect("usage: fill <size>")
+        .parse::<usize>()
+        .expect("<size> must be a non-negative integer");
+
+    println!("{}", profile(size));
+}
diff --git a/benches/fill_naive.rs b/benches/fill_naive.rs
@@ -0,0 +1,19 @@
+use chela::*;
+use std::env;
+use std::time::Instant;
+
+
+/// Times a single `fill_naive` call on a tensor freshly created by `Tensor::zeros(size)`.
+///
+/// The tensor is constructed before the timer starts, so construction is not part of
+/// the measurement. Returns the elapsed time in nanoseconds.
+fn profile(size: usize) -> u128 {
+    let mut tensor = Tensor::zeros(size);
+
+    let now = Instant::now();
+    tensor.fill_naive(5_f32);
+    // Mark the filled tensor as observed so the optimizer cannot treat the writes as
+    // dead stores to a buffer that is never read afterwards.
+    std::hint::black_box(&tensor);
+    now.elapsed().as_nanos()
+}
+
+/// Parses the tensor size from the first command-line argument and prints the measured
+/// time (in nanoseconds) on stdout, where the Python profiler reads it.
+fn main() {
+    // Fail with an explanatory message instead of an index-out-of-bounds / bare unwrap panic.
+    let size = env::args()
+        .nth(1)
+        .expect("usage: fill_naive <size>")
+        .parse::<usize>()
+        .expect("<size> must be a non-negative integer");
+
+    println!("{}", profile(size));
+}
diff --git a/benches/perfprofiler/__init__.py b/benches/perfprofiler/__init__.py
@@ -1,3 +1,3 @@
-from .suite import TimingSuite
-from .profile import measure_performance
-from .plot import plot_results
+from .suite import *
+from .profile import *
+from .plot import *
diff --git a/benches/perfprofiler/plot.py b/benches/perfprofiler/plot.py
@@ -4,29 +4,28 @@
 from .result import Result
 
 colors = {
-    "NumPy":       "#4dabcf",
-    "PyTorch CPU": "#f2765d",
-    "PyTorch MPS": "#812ce5",
+    "NumPy":                  "#4dabcf",
+    "PyTorch CPU":            "#f2765d",
+    "PyTorch MPS":            "#812ce5",
+    "Chela CPU (Accelerate)": "#ce422b"
 }
 
 
 def plot_results(x, results: dict[str, list[Result]]) -> None:
-    plt.figure()
+    plt.figure(figsize=(10, 7))
     ax = plt.gca()
 
     for label, result in results.items():
         upper_bound = [res.mean + res.se() for res in result]
         lower_bound = [res.mean - res.se() for res in result]
 
-        print(upper_bound, lower_bound)
-
         color = colors.get(label)
 
         ax.fill_between(x, upper_bound, lower_bound, color=color, alpha=0.2, ec=None)
         ax.plot(x, result, label=label, color=color)
 
     ax.yaxis.set_major_formatter(FuncFormatter(lambda val, _: f"{val:.3f} ms"))
-    plt.xscale("symlog")
+    # plt.xscale("symlog")
     # plt.yscale("symlog")
     plt.legend()
 

diff --git a/benches/perfprofiler/profile.py b/benches/perfprofiler/profile.py
@@ -3,14 +3,16 @@
 import time
 
 from .result import Result
+from .util import compile_rust, get_method_class
 
 profile_methods = defaultdict(dict)
+rust_methods = defaultdict(dict)
 
 
 # noinspection PyDecorator
 @classmethod
 def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[str, Result]:
-    total_time = {label: [] for label in cls.perf_methods.keys()}
+    total_time = defaultdict(list)
 
     for _ in range(n):
         suite_obj = cls(*args, **kwargs)
@@ -22,6 +24,10 @@ def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[st
 
             total_time[label].append(end - start)
 
+        for label, function in cls.rust_methods.items():
+            elapsed = function(suite_obj)
+            total_time[label].append(elapsed)
+
     results = {}
     for label, times in total_time.items():
         results[label] = Result(times)
@@ -33,11 +39,25 @@ def profile(cls, *args, n: int = 100, verbose: bool = True, **kwargs) -> dict[st
 
 def measure_performance(label: str) -> Callable:
     def decorator(function):
-        clsname = function.__qualname__.split(".")[0]
+        clsname = get_method_class(function)
         profile_methods[clsname][label] = function
         return function
 
     return decorator
 
 
-__all__ = ["measure_performance"]
+def measure_rust_performance(label: str, target: str) -> Callable:
+    def decorator(function):
+        executable = compile_rust(target)
+
+        def wrapper(self, *args, **kwargs):
+            return function(self, executable, *args, **kwargs)
+
+        clsname = get_method_class(function)
+        rust_methods[clsname][label] = wrapper
+        return wrapper
+
+    return decorator
+
+
+__all__ = ["measure_performance", "measure_rust_performance"]
diff --git a/benches/perfprofiler/suite.py b/benches/perfprofiler/suite.py
@@ -1,31 +1,40 @@
 from typing import Iterable
+import subprocess
 
-# from tqdm import tqdm
+from tqdm import tqdm
 
 # noinspection PyProtectedMember
-from .profile import profile, profile_methods
+from .profile import profile, profile_methods, rust_methods
 from .util import merge_dicts
 from .result import Result
 
 
+# from tqdm import tqdm
+
+
 class TimingSuiteMeta(type):
     perf_methods = {}
 
     def __new__(cls, name, bases, attrs):
         attrs["perf_methods"] = profile_methods.pop(name, {})
+        attrs["rust_methods"] = rust_methods.pop(name, {})
         attrs["profile"] = profile
         return type.__new__(cls, name, bases, attrs)
 
 
 class TimingSuite(metaclass=TimingSuiteMeta):
+    def run_rust(self, executable, *argv) -> float:
+        value = subprocess.check_output(f"{executable} {''.join(map(str, argv))}", shell=True)
+        return int(value)
+
     def profile(*args, **kwargs) -> dict[str, Result]:
         raise NotImplementedError()  # implemented by TimingSuiteMeta
 
     @classmethod
     def profile_each(cls, args_array, n: int = 100) -> dict[str, list[Result]]:
         results = []
 
-        for i, args in enumerate(args_array):
+        for i, args in enumerate(tqdm(args_array)):
             if isinstance(args, dict):
                 result = cls.profile(**args, n=n, verbose=False)
             elif isinstance(args, Iterable):

diff --git a/benches/perfprofiler/util.py b/benches/perfprofiler/util.py
@@ -1,5 +1,23 @@
+import subprocess
+import json
+
+
 def merge_dicts(list_of_dicts):
     result = {}
     for k in list_of_dicts[0].keys():
         result[k] = [d[k] for d in list_of_dicts]
     return result
+
+
+def compile_rust(target: str) -> str:
+    out = subprocess.check_output(f"$HOME/.cargo/bin/cargo build --release --bin {target} "
+                                  "-v --message-format=json", shell=True)
+    out = out.split(b"\n")[-3]
+    return json.loads(out)["executable"]
+
+
+def get_method_class(method) -> str:
+    return method.__qualname__.split(".")[0]
+
+
+__all__ = ["merge_dicts", "compile_rust", "get_method_class"]
diff --git a/src/tensor/data_buffer/buffer.rs b/src/tensor/data_buffer/buffer.rs
@@ -12,6 +12,8 @@ pub trait DataBuffer: Index<usize> {
 
     fn const_ptr(&self) -> *const Self::DType;
 
+    fn mut_ptr(&mut self) -> *mut Self::DType;
+
     fn to_view(&self) -> DataView<Self::DType>;
 
     // fn clone(&self) -> DataOwned<Self::DType>;
@@ -36,6 +38,10 @@ impl<T: RawDataType> DataBuffer for DataOwned<T> {
         self.ptr.as_ptr()
     }
 
+    /// Returns the buffer's underlying pointer as a raw mutable pointer.
+    fn mut_ptr(&mut self) -> *mut T {
+        // `as_ptr` hands back the raw pointer directly; going through `as_mut` would
+        // materialize a `&mut T` to a single element and requires `unsafe` for no benefit.
+        self.ptr.as_ptr()
+    }
+
     fn to_view(&self) -> DataView<T> {
         let ptr = self.ptr;
         let len = self.len;
@@ -57,6 +63,10 @@ impl<T: RawDataType> DataBuffer for DataView<T> {
         self.ptr.as_ptr()
     }
 
+    /// Returns the view's underlying pointer as a raw mutable pointer.
+    fn mut_ptr(&mut self) -> *mut T {
+        // `as_ptr` hands back the raw pointer directly; going through `as_mut` would
+        // materialize a `&mut T` to a single element and requires `unsafe` for no benefit.
+        self.ptr.as_ptr()
+    }
+
     fn to_view(&self) -> DataView<T> {
         (*self).clone()
     }

diff --git a/src/tensor/data_buffer/fill.rs b/src/tensor/data_buffer/fill.rs
@@ -3,25 +3,52 @@ use crate::dtype::RawDataType;
 use std::ffi::c_int;
 
 #[cfg(target_vendor = "apple")]
-use crate::accelerate::cblas::{catlas_dset, catlas_sset};
+use crate::accelerate::cblas::catlas_dset;
+use crate::accelerate::cblas::catlas_sset;
 
 pub(in crate::tensor) trait Fill<T>
 where
     T: RawDataType,
 {
-    fn fill(&self, value: T);
+    fn fill(&mut self, value: T);
+
+    fn fill_naive(&mut self, value: T);
 }
 
 #[cfg(target_vendor = "apple")]
 impl Fill<f32> for DataOwned<f32> {
-    fn fill(&self, value: f32) {
+    fn fill(&mut self, value: f32) {
         unsafe { catlas_sset(self.len as c_int, value, self.const_ptr(), 1) }
     }
+
+    #[inline(never)]
+    fn fill_naive(&mut self, value: f32) {
+        let mut ptr = self.mut_ptr();
+        let end_ptr = unsafe { ptr.add(self.len) };
+
+        while ptr != end_ptr {
+            unsafe {
+                std::ptr::write(ptr, value);
+                ptr = ptr.add(1);
+            }
+        }
+    }
 }
 
 #[cfg(target_vendor = "apple")]
 impl Fill<f64> for DataOwned<f64> {
-    fn fill(&self, value: f64) {
+    fn fill(&mut self, value: f64) {
         unsafe { catlas_dset(self.len as c_int, value, self.const_ptr(), 1) }
     }
+
+    /// Writes `value` into each of the `self.len` elements, one at a time, starting at
+    /// the pointer returned by `mut_ptr`.
+    fn fill_naive(&mut self, value: f64) {
+        let mut ptr = self.mut_ptr();
+
+        for _ in 0..self.len {
+            // SAFETY: assumes the buffer holds `self.len` contiguous `f64`s starting at
+            // `mut_ptr()` (TODO confirm against the buffer's constructor). Writing before
+            // advancing keeps every write inside that range; advancing first skipped
+            // element 0 and wrote one element past the end.
+            unsafe {
+                std::ptr::write(ptr, value);
+                ptr = ptr.add(1);
+            }
+        }
+    }
 }
diff --git a/src/tensor/fill.rs b/src/tensor/fill.rs
@@ -8,9 +8,13 @@ where
     B: DataBuffer<DType=T> + Fill<T>,
     T: RawDataType,
 {
-    pub fn fill(&self, value: T) {
+    pub fn fill(&mut self, value: T) {
         self.data.fill(value)
     }
+
+    pub fn fill_naive(&mut self, value: T) {
+        self.data.fill_naive(value)
+    }
 }
 
 #[cfg(target_vendor = "apple")]
@@ -20,7 +24,7 @@ mod tests {
 
     #[test]
     fn test_fill_f32() {
-        let a: Tensor<f32> = Tensor::zeros([3, 5, 3]);
+        let mut a: Tensor<f32> = Tensor::zeros([3, 5, 3]);
 
         assert!(a.flat_iter().all(|x| x == 0.0));
         a.fill(25.0);
@@ -29,7 +33,7 @@ mod tests {
 
     #[test]
     fn test_fill_f64() {
-        let a: Tensor<f64> = Tensor::zeros([15]);
+        let mut a: Tensor<f64> = Tensor::zeros([15]);
 
         assert!(a.flat_iter().all(|x| x == 0.0));
         a.fill(20.0);