diff --git a/.gitignore b/.gitignore
index a9d37c560..9c858b5d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 target
 Cargo.lock
+pkg
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 29fb129c3..641a667c5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,10 @@ homepage = "https://github.com/matterinc/bellman"
 license = "MIT/Apache-2.0"
 name = "bellman"
 repository = "https://github.com/matterinc/bellman"
-version = "0.1.2"
+version = "0.1.3"
+
+[lib]
+crate-type = ["cdylib", "lib", "staticlib"]
 
 [dependencies]
 rand = "0.4"
@@ -18,8 +21,6 @@ crossbeam = "0.3"
 pairing = { git = 'https://github.com/matterinc/pairing' }
 byteorder = "1"
 ff = { git = 'https://github.com/matterinc/ff', features = ["derive"] }
-pbr = "1.0.1"
-time = "0.1"
 
 [features]
 default = []
diff --git a/src/domain.rs b/src/domain.rs
index e7b9cc35a..43f3d38ed 100644
--- a/src/domain.rs
+++ b/src/domain.rs
@@ -509,3 +509,59 @@ fn parallel_fft_consistency() {
 
     test_consistency::<Bls12, _>(rng);
 }
+
+/// Benchmark: times one `EvaluationDomain::mul_assign` call on two random `SAMPLES`-element domains.
+/// Ignored by default because it builds two 2^27-element vectors and asserts nothing; run it with
+/// `cargo test --release -- --ignored --nocapture` to see the printed timings.
+#[test]
+#[ignore]
+fn test_field_element_multiplication_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::{Bn256, Fr};
+    use num_cpus;
+
+    const SAMPLES: usize = 1 << 27;
+    let cpus = num_cpus::get();
+
+    let rng = &mut rand::thread_rng();
+    let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+    let v2 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+    let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
+    let v2 = EvaluationDomain::from_coeffs(v2).unwrap();
+    let pool = Worker::new();
+
+    // Only the multiplication is timed; input generation and pool setup are excluded.
+    let start = std::time::Instant::now();
+    v1.mul_assign(&pool, &v2);
+    let duration_ns = start.elapsed().as_nanos() as f64;
+
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    println!("Tested on {} samples on {} CPUs with {} ns per field element multiplication", SAMPLES, cpus, duration_ns / (SAMPLES as f64));
+}
+
+/// Benchmark: times a single `EvaluationDomain::ifft` call over `SAMPLES` random coefficients.
+/// Ignored by default because it builds a 2^27-element vector and asserts nothing; run it with
+/// `cargo test --release -- --ignored --nocapture` to see the printed timings.
+#[test]
+#[ignore]
+fn test_fft_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::{Bn256, Fr};
+    use num_cpus;
+
+    const SAMPLES: usize = 1 << 27;
+    let cpus = num_cpus::get();
+
+    let rng = &mut rand::thread_rng();
+    let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+    let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
+    let pool = Worker::new();
+
+    // Only the transform is timed; input generation and pool setup are excluded.
+    let start = std::time::Instant::now();
+    v1.ifft(&pool);
+    let duration_ns = start.elapsed().as_nanos() as f64;
+
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    println!("Tested on {} samples on {} CPUs with {} ns per element of the inverse FFT", SAMPLES, cpus, duration_ns / (SAMPLES as f64));
+}
\ No newline at end of file
diff --git a/src/groth16/generator.rs b/src/groth16/generator.rs
index 98a384cb1..bb72ef0f1 100644
--- a/src/groth16/generator.rs
+++ b/src/groth16/generator.rs
@@ -1,9 +1,5 @@
-extern crate time;
-
 use super::super::verbose_flag;
 
-use self::time::PreciseTime;
-
 use rand::Rng;
 
 use std::sync::Arc;
@@ -255,7 +251,9 @@ pub fn generate_parameters<E, C>(
     {
         // Compute powers of tau
         if verbose {eprintln!("computing powers of tau...")};
-        let start = PreciseTime::now();
+
+        let start = std::time::Instant::now();
+
         {
             let powers_of_tau = powers_of_tau.as_mut();
             worker.scope(powers_of_tau.len(), |scope, chunk| {
@@ -272,14 +270,16 @@ pub fn generate_parameters<E, C>(
                 }
             });
         }
-        if verbose {eprintln!("powers of tau stage 1 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+        if verbose {eprintln!("powers of tau stage 1 done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
 
         // coeff = t(x) / delta
         let mut coeff = powers_of_tau.z(&tau);
         coeff.mul_assign(&delta_inverse);
 
         if verbose {eprintln!("computing the H query with multiple threads...")};
-        let start = PreciseTime::now();
+
+        let start = std::time::Instant::now();
+
         // Compute the H query with multiple threads
         worker.scope(h.len(), |scope, chunk| {
             for (h, p) in h.chunks_mut(chunk).zip(powers_of_tau.as_ref().chunks(chunk))
@@ -302,17 +302,18 @@ pub fn generate_parameters<E, C>(
                 });
             }
         });
-        if verbose {eprintln!("computing the H query done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+        if verbose {eprintln!("computing the H query done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
     }
 
     if verbose {eprintln!("using inverse FFT to convert powers of tau to Lagrange coefficients...")};
-    let start = PreciseTime::now();
+    
+    let start = std::time::Instant::now();
 
     // Use inverse FFT to convert powers of tau to Lagrange coefficients
     powers_of_tau.ifft(&worker);
     let powers_of_tau = powers_of_tau.into_coeffs();
 
-    if verbose {eprintln!("powers of tau stage 2 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0)};
+    if verbose {eprintln!("powers of tau stage 2 done in {} s", start.elapsed().as_millis() as f64 / 1000.0)};
 
     let mut a = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
     let mut b_g1 = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
@@ -321,7 +322,7 @@ pub fn generate_parameters<E, C>(
     let mut l = vec![E::G1::zero(); assembly.num_aux];
 
     if verbose {eprintln!("evaluating polynomials...")};
-    let start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     fn eval<E: Engine>(
         // wNAF window tables
@@ -474,7 +475,7 @@ pub fn generate_parameters<E, C>(
         &worker
     );
 
-    if verbose {eprintln!("evaluating polynomials done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+    if verbose {eprintln!("evaluating polynomials done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
 
     // Don't allow any elements be unconstrained, so that
     // the L query is always fully dense.
diff --git a/src/groth16/prover.rs b/src/groth16/prover.rs
index ef19ebf6d..98f2b628e 100644
--- a/src/groth16/prover.rs
+++ b/src/groth16/prover.rs
@@ -1,6 +1,3 @@
-extern crate time;
-use self::time::PreciseTime;
-
 use super::super::verbose_flag;
 
 use rand::Rng;
@@ -173,7 +170,7 @@ impl<E:Engine> PreparedProver<E> {
 
         let vk = params.get_vk(self.assignment.input_assignment.len())?;
 
-        let h_start = PreciseTime::now();
+        let start = std::time::Instant::now();
 
         let h = {
             let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@@ -209,10 +206,9 @@ impl<E:Engine> PreparedProver<E> {
             multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
         };
 
-        let h_end = PreciseTime::now();
-        if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
+        if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_millis() as f64 / 1000.0)}; // as_secs() would truncate to whole seconds
 
-        let points_start = PreciseTime::now();
+        let start = std::time::Instant::now();
 
         // TODO: Check that difference in operations for different chunks is small
 
@@ -283,8 +279,7 @@ impl<E:Engine> PreparedProver<E> {
         g_c.add_assign(&h.wait()?);
         g_c.add_assign(&l.wait()?);
 
-        let points_end = PreciseTime::now();
-        if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
+        if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_millis() as f64 / 1000.0)}; // as_secs() would truncate to whole seconds
 
         Ok(Proof {
             a: g_a.into_affine(),
@@ -437,7 +432,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
 
     let vk = params.get_vk(prover.input_assignment.len())?;
 
-    let h_start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     let h = {
         let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@@ -473,10 +468,9 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
         multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
     };
 
-    let h_end = PreciseTime::now();
-    if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
+    if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_millis() as f64 / 1000.0)}; // as_secs() would truncate to whole seconds
 
-    let points_start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     // TODO: Check that difference in operations for different chunks is small
 
@@ -547,8 +541,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
     g_c.add_assign(&h.wait()?);
     g_c.add_assign(&l.wait()?);
 
-    let points_end = PreciseTime::now();
-    if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
+    if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_millis() as f64 / 1000.0)}; // as_secs() would truncate to whole seconds
 
     Ok(Proof {
         a: g_a.into_affine(),
diff --git a/src/lib.rs b/src/lib.rs
index 53ccdbbf7..c43cdd1a6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,7 +14,6 @@ pub mod multicore;
 mod multiexp;
 pub mod domain;
 pub mod groth16;
-pub mod progress_bar;
 
 use pairing::{Engine};
 use ff::Field;
diff --git a/src/multiexp.rs b/src/multiexp.rs
index b32327e33..e834f4c8b 100644
--- a/src/multiexp.rs
+++ b/src/multiexp.rs
@@ -142,6 +142,36 @@ impl DensityTracker {
     }
 }
 
+/// This ingenious piece of code works in the following way:
+/// - choose `c` - the bit length of the region that one thread works on
+/// - make `2^c - 1` buckets and initialize them with `G = infinity` (that's equivalent of zero)
+/// - there is no bucket for "zero" because it's not necessary
+/// - go over the pairs `(base, scalar)`
+/// - for each scalar calculate `scalar % 2^c` and add the base (without any multiplications!) to the
+/// corresponding bucket
+/// - at the end each bucket will have an accumulated value that should be multiplied by the corresponding factor
+/// between `1` and `2^c - 1` to get the right value
+/// - here comes the first trick - you don't need to do multiplications at all, just add all the buckets together
+/// starting from the first one `(a + b + c + ...)` and then add to the first sum another sum of the form
+/// `(b + c + d + ...)`, and then the third one `(c + d + ...)`, that will result in the proper prefactor in front of every
+/// accumulator, without any multiplication operations at all
+/// - that's of course not enough, so spawn the next thread
+/// - this thread works with the same bit width `c`, but SKIPS the lower bits completely, so it actually takes values
+/// in the form `(scalar >> c) % 2^c`, so works on the next region
+/// - spawn more threads until you exhaust all the bit length
+/// - you will get roughly `[bitlength / c] + 1` accumulators
+/// - double the highest accumulator enough times, add to the next one, double the result, add the next accumulator, continue
+///
+/// Demo why it works:
+/// ```text
+///     a * G + b * H = (a_2 * (2^c)^2 + a_1 * (2^c)^1 + a_0) * G + (b_2 * (2^c)^2 + b_1 * (2^c)^1 + b_0) * H
+/// ```
+/// - make buckets over `0` labeled coefficients
+/// - make buckets over `1` labeled coefficients
+/// - make buckets over `2` labeled coefficients
+/// - accumulators over each set of buckets will have an implicit factor of `(2^c)^i`, so before summing them up
+/// the "higher" accumulator at level `i` must be doubled `c * i` times
+///
 fn multiexp_inner<Q, D, G, S>(
     pool: &Worker,
     bases: S,
@@ -195,7 +225,7 @@ fn multiexp_inner<Q, D, G, S>(
                     } else {
                         // Place multiplication into the bucket: Separate s * P as 
                         // (s/2^c) * P + (s mod 2^c) P
-                        // First multiplication is c bits less, do one can do it,
+                        // First multiplication is c bits less, so one can do it,
                         // sum results from different buckets and double it c times,
                         // then add with (s mod 2^c) P parts
                         let mut exp = exp;
@@ -317,3 +347,34 @@ fn test_with_bls12() {
 
     assert_eq!(naive, fast);
 }
+
+/// Benchmark: times one `multiexp` call over `SAMPLES` random BN256 G1 bases and scalars.
+/// Ignored by default because it generates 2^22 random points and scalars and asserts nothing;
+/// run it with `cargo test --release -- --ignored --nocapture` to see the printed timings.
+#[test]
+#[ignore]
+fn test_speed_with_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::Bn256;
+    use num_cpus;
+
+    const SAMPLES: usize = 1 << 22;
+    let cpus = num_cpus::get();
+
+    let rng = &mut rand::thread_rng();
+    let v = Arc::new((0..SAMPLES).map(|_| <Bn256 as ScalarEngine>::Fr::rand(rng).into_repr()).collect::<Vec<_>>());
+    let g = Arc::new((0..SAMPLES).map(|_| <Bn256 as Engine>::G1::rand(rng).into_affine()).collect::<Vec<_>>());
+
+    let pool = Worker::new();
+
+    // Only the multi-exponentiation is timed; input generation and pool setup are excluded.
+    let start = std::time::Instant::now();
+    let _fast = multiexp(&pool, (g, 0), FullDensity, v)
+        .wait()
+        .unwrap();
+    let duration_ns = start.elapsed().as_nanos() as f64;
+
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    let time_per_sample = duration_ns / (SAMPLES as f64);
+    println!("Tested on {} samples on {} CPUs with {} ns per multiplication", SAMPLES, cpus, time_per_sample);
+}
diff --git a/src/progress_bar.rs b/src/progress_bar.rs
deleted file mode 100644
index f8574c0bf..000000000
--- a/src/progress_bar.rs
+++ /dev/null
@@ -1,146 +0,0 @@
-extern crate time;
-
-use std::io::{Write};
-use std::sync::{
-    mpsc::{channel, Sender, Receiver}, 
-    Arc, 
-    atomic::{AtomicUsize, Ordering}
-};
-use self::time::precise_time_ns;
-use std::time::Duration;
-
-static UPDATE_INTERVAL: u64 = 1000_000 * 1000; // ms
-
-pub struct MultiBar {
-    n_workers:  u64,
-
-    total:      u64,
-    cur:        u64,
-
-    prev:       u64,
-    prev_time:  u64,
-
-    total_elapsed: u64,
-
-    step:       Arc<AtomicUsize>,
-    tx:         Sender<u64>,
-    rx:         Receiver<u64>,
-}
-
-pub struct ProgressBar {
-    //chunk:  u64,
-    acc:    u64,
-    step:   Arc<AtomicUsize>,
-    tx:     Option<Sender<u64>>,
-}
-
-/// Simple efficient thread-safe progress indicator
-/// It follows the interface of [https://github.com/a8m/pb](https://github.com/a8m/pb)
-impl MultiBar {
-
-    /// Create a new MultiBar for stdout
-    pub fn new() -> Self {
-        let (tx, rx) = channel();
-        Self{
-            n_workers:  0,
-            total:      0,
-            cur:        0,
-            prev:       0,
-            prev_time:  precise_time_ns(),
-            total_elapsed: 0,
-            step:       Arc::new(AtomicUsize::new(1)),
-            tx, 
-            rx,
-        }
-    }
-
-    // Create a ProgressBar for a process of `total` steps
-    pub fn create_bar(&mut self, chunk: u64) -> ProgressBar {
-        self.n_workers += 1;
-        self.total += chunk;
-        //println!("step 0 of {}", chunk);
-        ProgressBar{
-            //chunk,
-            acc:    0,
-            tx:     Some(Sender::clone(&self.tx)),
-            step:   Arc::clone(&self.step),
-        }
-    }
-
-    /// Start listening for updates from ProgressBars in different threads
-    pub fn listen(&mut self) {
-        //println!("");
-        for d in &self.rx {
-            if d == 0 {
-                self.n_workers -= 1;
-            }
-            if self.n_workers == 0 { 
-                break; 
-            }
-
-            self.cur += d;
-            let processed = self.cur - self.prev;
-            if processed > self.step.load(Ordering::Acquire) as u64 * self.n_workers {
-                let now = time::precise_time_ns();
-                let elapsed = now - self.prev_time;
-
-                if elapsed > UPDATE_INTERVAL {
-                    self.prev = self.cur;
-                    self.prev_time = now;
-                    self.total_elapsed += elapsed;
-
-                    print!("\rprocessed {:2}%: {} of {}.", self.cur * 100 / self.total, self.cur, self.total);
-
-                    let r = Duration::from_nanos((self.total - self.cur) * self.total_elapsed / self.cur).as_secs();
-                    print!(" Remaining estimated: {} h {} min {} s", r / 3600, r % 3600 / 60, r % 60);
-
-                    let new_step = (self.cur * UPDATE_INTERVAL / self.total_elapsed) / self.n_workers;
-                    self.step.store(new_step as usize, Ordering::Release);
-                    
-                    std::io::stdout().flush().unwrap();
-                }
-            }
-
-        }
-        println!("\rdone                                                                   ");
-    }
-}
-
-impl ProgressBar {
-
-    /// Increment progress by `d` steps
-    pub fn add(&mut self, d: u64) {
-        self.acc += d;
-        if self.acc > (self.step.load(Ordering::Relaxed) as u64) {
-            if let Some(tx) = &self.tx { 
-                tx.send(self.acc).unwrap(); 
-            }
-            self.acc = 0;
-        }
-    }
-
-    /// Finish the process
-    pub fn finish(&mut self) {
-        let tx = self.tx.take().unwrap();
-        tx.send(0).unwrap();
-        drop(tx);
-    }
-}
-
-#[test]
-fn test_progress_display() {
-
-    let mut mb = MultiBar::new();
-
-    for _j in 1..=0 { 
-        let mut pb = mb.create_bar(3600000); 
-        std::thread::spawn(move || {
-            for _i in 0..3600000 {
-                std::thread::sleep(Duration::from_millis(1));
-                pb.add(1);
-            }
-            pb.finish();
-        });
-    };
-    //mb.listen();
-}