- add some benches

- cleanup dependencies - will work on WASM and GM17
matter-labs · Jan 23, 2019 · e775b47 · e775b47
1 parent 6e5cfe2
commit e775b47
Show file tree

Hide file tree

Showing 8 changed files with 144 additions and 178 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 target
 Cargo.lock
+pkg
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,7 +6,10 @@ homepage = "https://github.com/matterinc/bellman"
 license = "MIT/Apache-2.0"
 name = "bellman"
 repository = "https://github.com/matterinc/bellman"
-version = "0.1.2"
+version = "0.1.3"
+
+[lib]
+crate-type = ["cdylib", "lib", "staticlib"]
 
 [dependencies]
 rand = "0.4"
@@ -18,8 +21,6 @@ crossbeam = "0.3"
 pairing = { git = 'https://github.com/matterinc/pairing' }
 byteorder = "1"
 ff = { git = 'https://github.com/matterinc/ff', features = ["derive"] }
-pbr = "1.0.1"
-time = "0.1"
 
 [features]
 default = []
diff --git a/src/domain.rs b/src/domain.rs
@@ -509,3 +509,59 @@ fn parallel_fft_consistency() {
 
     test_consistency::<Bls12, _>(rng);
 }
+
+#[test]
+fn test_field_element_multiplication_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::Bn256;
+    use pairing::bn256::Fr;
+    use num_cpus;
+
+    let cpus = num_cpus::get();
+    const SAMPLES: usize = 1 << 27;
+
+    let rng = &mut rand::thread_rng();
+    let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+    let v2 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+
+    let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
+    let v2 = EvaluationDomain::from_coeffs(v2).unwrap();
+
+    let pool = Worker::new();
+
+    let start = std::time::Instant::now();
+
+    v1.mul_assign(&pool, &v2);
+
+    let duration_ns = start.elapsed().as_nanos() as f64;
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    let time_per_sample = duration_ns/(SAMPLES as f64);
+    println!("Tested on {} samples on {} CPUs with {} ns per field element multiplication", SAMPLES, cpus, time_per_sample);
+}
+
+#[test]
+fn test_fft_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::Bn256;
+    use pairing::bn256::Fr;
+    use num_cpus;
+
+    let cpus = num_cpus::get();
+    const SAMPLES: usize = 1 << 27;
+
+    let rng = &mut rand::thread_rng();
+    let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
+
+    let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
+
+    let pool = Worker::new();
+
+    let start = std::time::Instant::now();
+
+    v1.ifft(&pool);
+
+    let duration_ns = start.elapsed().as_nanos() as f64;
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    let time_per_sample = duration_ns/(SAMPLES as f64);
+    println!("Tested on {} samples on {} CPUs with {} ns per field element multiplication", SAMPLES, cpus, time_per_sample);
+}
diff --git a/src/groth16/generator.rs b/src/groth16/generator.rs
@@ -1,9 +1,5 @@
-extern crate time;
-
 use super::super::verbose_flag;
 
-use self::time::PreciseTime;
-
 use rand::Rng;
 
 use std::sync::Arc;
@@ -255,7 +251,9 @@ pub fn generate_parameters<E, C>(
     {
         // Compute powers of tau
         if verbose {eprintln!("computing powers of tau...")};
-        let start = PreciseTime::now();
+
+        let start = std::time::Instant::now();
+
         {
             let powers_of_tau = powers_of_tau.as_mut();
             worker.scope(powers_of_tau.len(), |scope, chunk| {
@@ -272,14 +270,16 @@ pub fn generate_parameters<E, C>(
                 }
             });
         }
-        if verbose {eprintln!("powers of tau stage 1 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+        if verbose {eprintln!("powers of tau stage 1 done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
 
         // coeff = t(x) / delta
         let mut coeff = powers_of_tau.z(&tau);
         coeff.mul_assign(&delta_inverse);
 
         if verbose {eprintln!("computing the H query with multiple threads...")};
-        let start = PreciseTime::now();
+
+        let start = std::time::Instant::now();
+
         // Compute the H query with multiple threads
         worker.scope(h.len(), |scope, chunk| {
             for (h, p) in h.chunks_mut(chunk).zip(powers_of_tau.as_ref().chunks(chunk))
@@ -302,17 +302,18 @@ pub fn generate_parameters<E, C>(
                 });
             }
         });
-        if verbose {eprintln!("computing the H query done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+        if verbose {eprintln!("computing the H query done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
     }
 
     if verbose {eprintln!("using inverse FFT to convert powers of tau to Lagrange coefficients...")};
-    let start = PreciseTime::now();
+
+    let start = std::time::Instant::now();
 
     // Use inverse FFT to convert powers of tau to Lagrange coefficients
     powers_of_tau.ifft(&worker);
     let powers_of_tau = powers_of_tau.into_coeffs();
 
-    if verbose {eprintln!("powers of tau stage 2 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0)};
+    if verbose {eprintln!("powers of tau stage 2 done in {} s", start.elapsed().as_millis() as f64 / 1000.0)};
 
     let mut a = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
     let mut b_g1 = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
@@ -321,7 +322,7 @@ pub fn generate_parameters<E, C>(
     let mut l = vec![E::G1::zero(); assembly.num_aux];
 
     if verbose {eprintln!("evaluating polynomials...")};
-    let start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     fn eval<E: Engine>(
         // wNAF window tables
@@ -474,7 +475,7 @@ pub fn generate_parameters<E, C>(
         &worker
     );
 
-    if verbose {eprintln!("evaluating polynomials done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
+    if verbose {eprintln!("evaluating polynomials done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
 
     // Don't allow any elements be unconstrained, so that
     // the L query is always fully dense.

diff --git a/src/groth16/prover.rs b/src/groth16/prover.rs
@@ -1,6 +1,3 @@
-extern crate time;
-use self::time::PreciseTime;
-
 use super::super::verbose_flag;
 
 use rand::Rng;
@@ -173,7 +170,7 @@ impl<E:Engine> PreparedProver<E> {
 
         let vk = params.get_vk(self.assignment.input_assignment.len())?;
 
-        let h_start = PreciseTime::now();
+        let start = std::time::Instant::now();
 
         let h = {
             let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@@ -209,10 +206,9 @@ impl<E:Engine> PreparedProver<E> {
             multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
         };
 
-        let h_end = PreciseTime::now();
-        if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
+        if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_secs())};
 
-        let points_start = PreciseTime::now();
+        let start = std::time::Instant::now();
 
         // TODO: Check that difference in operations for different chunks is small
 
@@ -283,8 +279,7 @@ impl<E:Engine> PreparedProver<E> {
         g_c.add_assign(&h.wait()?);
         g_c.add_assign(&l.wait()?);
 
-        let points_end = PreciseTime::now();
-        if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
+        if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_secs())};
 
         Ok(Proof {
             a: g_a.into_affine(),
@@ -437,7 +432,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
 
     let vk = params.get_vk(prover.input_assignment.len())?;
 
-    let h_start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     let h = {
         let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@@ -473,10 +468,9 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
         multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
     };
 
-    let h_end = PreciseTime::now();
-    if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
+    if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_secs())};
 
-    let points_start = PreciseTime::now();
+    let start = std::time::Instant::now();
 
     // TODO: Check that difference in operations for different chunks is small
 
@@ -547,8 +541,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
     g_c.add_assign(&h.wait()?);
     g_c.add_assign(&l.wait()?);
 
-    let points_end = PreciseTime::now();
-    if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
+    if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_secs())};
 
     Ok(Proof {
         a: g_a.into_affine(),

diff --git a/src/lib.rs b/src/lib.rs
@@ -14,7 +14,6 @@ pub mod multicore;
 mod multiexp;
 pub mod domain;
 pub mod groth16;
-pub mod progress_bar;
 
 use pairing::{Engine};
 use ff::Field;

diff --git a/src/multiexp.rs b/src/multiexp.rs
@@ -142,6 +142,36 @@ impl DensityTracker {
     }
 }
 
+/// This genius piece of code works in the following way:
+/// - choose `c` - the bit length of the region that one thread works on
+/// - make `2^c - 1` buckets and initialize them with `G = infinity` (that's equivalent of zero)
+/// - there is no bucket for "zero" because it's not necessary
+/// - go over the pairs `(base, scalar)`
+/// - for each scalar calculate `scalar % 2^c` and add the base (without any multiplications!) to the 
+/// corresponding bucket
+/// - at the end each bucket will have an accumulated value that should be multiplied by the corresponding factor
+/// between `1` and `2^c - 1` to get the right value
+/// - here comes the first trick - you don't need to do multiplications at all, just add all the buckets together
+/// starting from the first one `(a + b + c + ...)` and then add to the first sum another sum of the form
+/// `(b + c + d + ...)`, and then the third one `(c + d + ...)`, that will result in the proper prefactor in front of every
+/// accumulator, without any multiplication operations at all
+/// - that's of course not enough, so spawn the next thread
+/// - this thread works with the same bit width `c`, but SKIPS the lower bits completely, so it actually takes values
+/// in the form `(scalar >> c) % 2^c`, so it works on the next region
+/// - spawn more threads until you exhaust all the bit length
+/// - you will get roughly `[bitlength / c] + 1` accumulators
+/// - double the highest accumulator enough times, add to the next one, double the result, add the next accumulator, continue
+/// 
+/// Demo why it works:
+/// ```
+///     a * G + b * H = (a_2 * (2^c)^2 + a_1 * (2^c)^1 + a_0) * G + (b_2 * (2^c)^2 + b_1 * (2^c)^1 + b_0) * H
+/// ```
+/// - make buckets over `0` labeled coefficients
+/// - make buckets over `1` labeled coefficients
+/// - make buckets over `2` labeled coefficients
+/// - accumulators over each set of buckets will have an implicit factor of `(2^c)^i`, so before summing them up
+/// "higher" accumulators must be doubled `c` times
+///
 fn multiexp_inner<Q, D, G, S>(
     pool: &Worker,
     bases: S,
@@ -195,7 +225,7 @@ fn multiexp_inner<Q, D, G, S>(
                     } else {
                         // Place multiplication into the bucket: Separate s * P as 
                         // (s/2^c) * P + (s mod 2^c) P
-                        // First multiplication is c bits less, do one can do it,
+                        // First multiplication is c bits less, so one can do it,
                         // sum results from different buckets and double it c times,
                         // then add with (s mod 2^c) P parts
                         let mut exp = exp;
@@ -317,3 +347,34 @@ fn test_with_bls12() {
 
     assert_eq!(naive, fast);
 }
+
+#[test]
+fn test_speed_with_bn256() {
+    use rand::{self, Rand};
+    use pairing::bn256::Bn256;
+    use num_cpus;
+
+    let cpus = num_cpus::get();
+    const SAMPLES: usize = 1 << 22;
+
+    let rng = &mut rand::thread_rng();
+    let v = Arc::new((0..SAMPLES).map(|_| <Bn256 as ScalarEngine>::Fr::rand(rng).into_repr()).collect::<Vec<_>>());
+    let g = Arc::new((0..SAMPLES).map(|_| <Bn256 as Engine>::G1::rand(rng).into_affine()).collect::<Vec<_>>());
+
+    let pool = Worker::new();
+
+    let start = std::time::Instant::now();
+
+    let _fast = multiexp(
+        &pool,
+        (g, 0),
+        FullDensity,
+        v
+    ).wait().unwrap();
+
+
+    let duration_ns = start.elapsed().as_nanos() as f64;
+    println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
+    let time_per_sample = duration_ns/(SAMPLES as f64);
+    println!("Tested on {} samples on {} CPUs with {} ns per multiplication", SAMPLES, cpus, time_per_sample);
+}