HazyResearch · mleszczy · Apr 28, 2019
diff --git a/butterfly/__init__.py b/butterfly/__init__.py
diff --git a/butterfly/factor_multiply/setup.py b/butterfly/factor_multiply/setup.py
diff --git a/tests/test_butterfly.py b/tests/test_butterfly.py
@@ -1,16 +1,12 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 import math
 import unittest
 
 import numpy as np
 
 import torch
 
-from butterfly import Butterfly
-from butterfly.butterfly import ButterflyBmm
-
+from torch_butterfly.butterfly import Butterfly
+from torch_butterfly.butterfly import ButterflyBmm
 
 class ButterflyTest(unittest.TestCase):
 

diff --git a/tests/test_butterfly_multiply.py b/tests/test_butterfly_multiply.py
@@ -1,19 +1,15 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 import math
 import unittest
 
 import torch
 import torch.nn.functional as F
 
-from butterfly import Butterfly
-from cnn.models.butterfly_conv import ButterflyConv2d
+from torch_butterfly import Butterfly
 
-from butterfly.butterfly_multiply import butterfly_mult_torch, butterfly_mult, butterfly_mult_inplace, butterfly_mult_factors
-from butterfly.butterfly_multiply import butterfly_mult_untied_torch, butterfly_mult_untied
-from butterfly.butterfly_multiply import butterfly_mult_conv2d_torch, butterfly_mult_conv2d
-from butterfly.butterfly_multiply import butterfly_mult_untied_svd_torch, butterfly_mult_untied_svd
+from torch_butterfly.butterfly_multiply import butterfly_mult_torch, butterfly_mult, butterfly_mult_inplace, butterfly_mult_factors
+from torch_butterfly.butterfly_multiply import butterfly_mult_untied_torch, butterfly_mult_untied
+from torch_butterfly.butterfly_multiply import butterfly_mult_conv2d_torch, butterfly_mult_conv2d
+from torch_butterfly.butterfly_multiply import butterfly_mult_untied_svd_torch, butterfly_mult_untied_svd
 
 
 class ButterflyMultTest(unittest.TestCase):

diff --git a/tests/test_permutation.py b/tests/test_permutation.py
@@ -1,14 +1,11 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 import math
 import unittest
 
 import numpy as np
 
 import torch
 
-from butterfly.permutation import Permutation, FixedPermutation, PermutationFactor
+from torch_butterfly.permutation import Permutation, FixedPermutation, PermutationFactor
 
 
 class PermutationTest(unittest.TestCase):

diff --git a/tests/test_permutation_multiply.py b/tests/test_permutation_multiply.py
@@ -1,13 +1,10 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 import math
 import unittest
 
 import torch
 
-from butterfly.permutation_multiply import permutation_mult_torch, permutation_mult
-from butterfly.permutation_multiply import permutation_mult_single_factor_torch, permutation_mult_single
+from torch_butterfly.permutation_multiply import permutation_mult_torch, permutation_mult
+from torch_butterfly.permutation_multiply import permutation_mult_single_factor_torch, permutation_mult_single
 
 
 class PermutationMultTest(unittest.TestCase):

diff --git a/torch_butterfly/__init__.py b/torch_butterfly/__init__.py
@@ -0,0 +1 @@
+from .butterfly import Butterfly
diff --git a/butterfly/benchmark.py → torch_butterfly/benchmark.py b/butterfly/benchmark.py → torch_butterfly/benchmark.py
@@ -1,11 +1,7 @@
-import os, sys
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0, project_root)
-
 import torch
 
-from butterfly import Butterfly
-from butterfly.butterfly_multiply import butterfly_mult, butterfly_mult_untied, butterfly_mult_untied_svd, butterfly_mult_factors, butterfly_mult_inplace
+from torch_butterfly.butterfly import Butterfly
+from torch_butterfly.butterfly_multiply import butterfly_mult, butterfly_mult_untied, butterfly_mult_untied_svd, butterfly_mult_factors, butterfly_mult_inplace
 
 batch_size = 8192
 n = 256

diff --git a/cnn/benchmark_cnn.py → torch_butterfly/benchmark_cnn.py b/cnn/benchmark_cnn.py → torch_butterfly/benchmark_cnn.py
@@ -1,12 +1,6 @@
-import os, sys
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0, project_root)
-
 import torch
 
-from cnn.models.butterfly_conv import ButterflyConv2d
-from butterfly.butterfly import ButterflyBmm
-from butterfly.butterfly_multiply import butterfly_conv2d
+from torch_butterfly.butterfly import ButterflyConv2d
 
 import time
 nsteps = 1000

diff --git a/butterfly/butterfly.py → torch_butterfly/butterfly.py b/butterfly/butterfly.py → torch_butterfly/butterfly.py
@@ -2,8 +2,10 @@
 
 import torch
 from torch import nn
+import torch.nn.functional as F
 
-from .butterfly_multiply import butterfly_mult, butterfly_mult_untied, butterfly_mult_untied_svd
+from torch_butterfly.butterfly_multiply import butterfly_mult, butterfly_mult_untied, butterfly_mult_untied_svd
+from torch_butterfly.butterfly_multiply import butterfly_mult_conv2d
 
 class Butterfly(nn.Module):
     """Product of log N butterfly factors, each is a block 2x2 of diagonal matrices.
@@ -218,4 +220,217 @@ def post_process(self, output, batch):
                 output = output.view(batch, self.matrix_batch, self.in_size_extended // out_size_extended, out_size_extended, 2).mean(dim=2)
         if self.out_size != out_size_extended:  # Take top rows
             output = output[:, :, :self.out_size]
-        return output if self.bias is None else output + self.bias
+        return output if self.bias is None else output + self.bias
+
+class Butterfly1x1Conv(Butterfly):
+    """Butterfly matrix applied along the channel dimension at every spatial position (1x1 conv).
+    """
+
+    def forward(self, input):
+        """
+        Parameters:
+            input: (batch, c, h, w), real only (the complex (batch, c, h, w, 2) layout is not handled)
+        Return:
+            output: (batch, nstack * c, h, w)
+        """
+        # TODO: Only doing real for now. reshape (not view) so non-contiguous inputs are accepted.
+        batch, c, h, w = input.shape
+        input_reshape = input.reshape(batch, c, h * w).transpose(1, 2).reshape(-1, c)
+        output = super().forward(input_reshape)
+        return output.reshape(batch, h * w, self.nstack * c).transpose(1, 2).reshape(batch, self.nstack * c, h, w)
+
+
+class ButterflyConv2d(ButterflyBmm):
+    """Conv2d built on ButterflyBmm: one butterfly matrix per kernel position, outputs averaged.
+
+    Parameters:
+        in_channels: size of input
+        out_channels: size of output
+        kernel_size: int or (int, int)
+        stride: int or (int, int)
+        padding: int or (int, int)
+        dilation: int or (int, int)
+        bias: If set to False, the layer will not learn an additive bias. Default: ``True``
+        tied_weight: whether the weights in the butterfly factors are tied.
+            If True, will have 4N parameters, else will have 2 N log N parameters (not counting bias)
+        increasing_stride: whether to multiply with increasing stride (e.g. 1, 2, ..., n/2) or
+            decreasing stride (e.g., n/2, n/4, ..., 1).
+            Note that this only changes the order of multiplication, not how twiddle is stored.
+            In other words, twiddle[@log_stride] always stores the twiddle for @stride.
+        ortho_init: whether the weight matrix should be initialized to be orthogonal/unitary.
+        param: The parameterization of the 2x2 butterfly factors, either 'regular' or 'ortho' or 'svd'.
+            'ortho' and 'svd' only support real, not complex.
+        max_gain: (only for svd parameterization) controls the maximum and minimum singular values
+            of the whole matrix (not of each factor).
+            For example, max_gain=10.0 means that the singular values are in [0.1, 10.0].
+        fused_unfold: if True, CUDA inputs with at most 1024 channels use the fused butterfly_mult_conv2d path.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=True,
+                 tied_weight=True, increasing_stride=True, ortho_init=False, param='regular', max_gain=10.0,
+                 fused_unfold=False):
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        self.stride = (stride, stride) if isinstance(stride, int) else stride
+        self.padding = (padding, padding) if isinstance(padding, int) else padding
+        self.dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
+        self.fused_unfold = fused_unfold
+        super().__init__(in_channels, out_channels, self.kernel_size[0] * self.kernel_size[1], bias, False,
+                         tied_weight, increasing_stride, ortho_init, param, max_gain)
+
+    def forward(self, input):
+        """
+        Parameters:
+            input: (batch, c, h, w), real only (the complex (batch, c, h, w, 2) layout is not handled)
+        Return:
+            output: (batch, out_channels, h_out, w_out)
+        """
+        # TODO: Only doing real for now
+        batch, c, h, w = input.shape
+        h_out = (h + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // self.stride[0] + 1
+        w_out = (w + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) // self.stride[1] + 1
+        if not self.fused_unfold or c > 1024 or not input.is_cuda:
+            # unfold input into patches and call batch matrix multiply
+            input_patches = F.unfold(input, self.kernel_size, self.dilation, self.padding, self.stride).view(batch, c, self.kernel_size[0] * self.kernel_size[1], h_out * w_out)
+            input = input_patches.permute(0, 3, 2, 1).reshape(batch * h_out * w_out, self.kernel_size[0] * self.kernel_size[1], c)
+            output = super().forward(input)
+        else:
+            # NOTE(review): only kernel_size[0] and padding[0] reach the fused kernel; stride and dilation are not passed
+            batch_out = batch * h_out * w_out
+            output = butterfly_mult_conv2d(self.twiddle, input, self.kernel_size[0],
+                self.padding[0], self.increasing_stride)
+            output = super().post_process(output, batch_out)
+        # combine matrix batches
+        output = output.mean(dim=1)
+        return output.view(batch, h_out * w_out, self.out_channels).transpose(1, 2).view(batch, self.out_channels, h_out, w_out)
+
+
+class ButterflyConv2dBBT(nn.Module):
+    """Conv2d built from a product of nblocks BBT blocks of ButterflyBmm factors, averaged over kernel positions.
+
+    Parameters:
+        in_channels: size of input
+        out_channels: size of output
+        kernel_size: int or (int, int)
+        stride: int or (int, int)
+        padding: int or (int, int)
+        dilation: int or (int, int)
+        bias: If set to False, the layer will not learn an additive bias.
+                Default: ``True``. Only the last butterfly factor of the last block carries it.
+        nblocks: number of BBT blocks in the product
+        tied_weight: whether the weights in the butterfly factors are tied.
+            If True, will have 4N parameters, else will have 2 N log N parameters (not counting bias)
+        (increasing_stride is not a parameter of this class: every block is one factor with
+            decreasing stride (e.g., n/2, n/4, ..., 1) followed by one factor with increasing
+            stride (e.g. 1, 2, ..., n/2). The first factor of the first block maps in_channels
+            to out_channels; all later factors map out_channels to out_channels.)
+        ortho_init: whether the weight matrix should be initialized to be orthogonal/unitary.
+        param: The parameterization of the 2x2 butterfly factors, either 'regular' or 'ortho' or 'svd'.
+            'ortho' and 'svd' only support real, not complex.
+        max_gain: (only for svd parameterization) controls the maximum and minimum singular values
+            of the whole BB^T matrix (not of each factor).
+            For example, max_gain=10.0 means that the singular values are in [0.1, 10.0].
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=True,
+                 tied_weight=True, nblocks=1, ortho_init=False, param='regular', max_gain=10.0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        self.stride = (stride, stride) if isinstance(stride, int) else stride
+        self.padding = (padding, padding) if isinstance(padding, int) else padding
+        self.dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
+        self.nblocks = nblocks
+        max_gain_per_block = max_gain ** (1 / (2 * nblocks))
+        layers = []
+        for i in range(nblocks):
+            layers.append(ButterflyBmm(in_channels if i == 0 else out_channels, out_channels,
+                                       self.kernel_size[0] * self.kernel_size[1], False, False,
+                                       tied_weight, increasing_stride=False,
+                                       ortho_init=ortho_init, param=param,
+                                       max_gain=max_gain_per_block))
+            # bias precedes complex positionally; only the last factor gets the bias and every factor stays real
+            layers.append(ButterflyBmm(out_channels, out_channels,
+                                       self.kernel_size[0] * self.kernel_size[1],
+                                       bias if i == nblocks - 1 else False, False,
+                                       tied_weight, increasing_stride=True,
+                                       ortho_init=ortho_init, param=param,
+                                       max_gain=max_gain_per_block))
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, input):
+        """
+        Parameters:
+            input: (batch, c, h, w), real only (the complex (batch, c, h, w, 2) layout is not handled)
+        Return:
+            output: (batch, out_channels, h_out, w_out)
+        """
+        # TODO: Only doing real for now
+        batch, c, h, w = input.shape
+        h_out = (h + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // self.stride[0] + 1
+        w_out = (w + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) // self.stride[1] + 1
+        input_patches = F.unfold(input, self.kernel_size, self.dilation, self.padding, self.stride).view(batch, c, self.kernel_size[0] * self.kernel_size[1], h_out * w_out)
+        input_reshape = input_patches.permute(0, 3, 2, 1).reshape(batch * h_out * w_out, self.kernel_size[0] * self.kernel_size[1], c)
+        output = self.layers(input_reshape).mean(dim=1)
+        return output.view(batch, h_out * w_out, self.out_channels).transpose(1, 2).view(batch, self.out_channels, h_out, w_out)
+
+
+class ButterflyConv2dBBTBBT(nn.Module):
+    """Conv2d built from a product of four ButterflyBmm factors (BB^T BB^T), averaged over kernel positions.
+
+    Parameters:
+        in_channels: size of input
+        out_channels: size of output
+        kernel_size: int or (int, int)
+        stride: int or (int, int)
+        padding: int or (int, int)
+        dilation: int or (int, int)
+        bias: If set to False, the layer will not learn an additive bias.
+                Default: ``True``. Only the last of the four butterfly factors carries it.
+        tied_weight: whether the weights in the butterfly factors are tied.
+            If True, will have 4N parameters, else will have 2 N log N parameters (not counting bias)
+        (increasing_stride is not a parameter of this class: the four butterfly factors alternate
+            between decreasing stride (e.g., n/2, n/4, ..., 1) and increasing stride
+            (e.g. 1, 2, ..., n/2), starting with decreasing. The first factor maps in_channels
+            to out_channels; the other three map out_channels to out_channels.)
+        ortho_init: whether the weight matrix should be initialized to be orthogonal/unitary.
+        param: The parameterization of the 2x2 butterfly factors, either 'regular' or 'ortho' or 'svd'.
+            'ortho' and 'svd' only support real, not complex.
+        max_gain: (only for svd parameterization) controls the maximum and minimum singular values
+            of the whole BB^T BB^T matrix (not of each factor).
+            For example, max_gain=10.0 means that the singular values are in [0.1, 10.0].
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=True,
+                 tied_weight=True, ortho_init=False, param='regular', max_gain=10.0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        self.stride = (stride, stride) if isinstance(stride, int) else stride
+        self.padding = (padding, padding) if isinstance(padding, int) else padding
+        self.dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
+        self.layers = nn.Sequential(
+            ButterflyBmm(in_channels, out_channels, self.kernel_size[0] * self.kernel_size[1], False, False, tied_weight, increasing_stride=False, ortho_init=ortho_init, param=param, max_gain=max_gain ** (1 / 4)),
+            ButterflyBmm(out_channels, out_channels, self.kernel_size[0] * self.kernel_size[1], False, False, tied_weight, increasing_stride=True, ortho_init=ortho_init, param=param, max_gain=max_gain ** (1 / 4)),
+            ButterflyBmm(out_channels, out_channels, self.kernel_size[0] * self.kernel_size[1], False, False, tied_weight, increasing_stride=False, ortho_init=ortho_init, param=param, max_gain=max_gain ** (1 / 4)),
+            ButterflyBmm(out_channels, out_channels, self.kernel_size[0] * self.kernel_size[1], bias, False, tied_weight, increasing_stride=True, ortho_init=ortho_init, param=param, max_gain=max_gain ** (1 / 4))
+            )
+
+    def forward(self, input):
+        """
+        Parameters:
+            input: (batch, c, h, w), real only (the complex (batch, c, h, w, 2) layout is not handled)
+        Return:
+            output: (batch, out_channels, h_out, w_out)
+        """
+        # TODO: Only doing real for now
+        batch, c, h, w = input.shape
+        h_out = (h + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // self.stride[0] + 1
+        w_out = (w + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) // self.stride[1] + 1
+        input_patches = F.unfold(input, self.kernel_size, self.dilation, self.padding, self.stride).view(batch, c, self.kernel_size[0] * self.kernel_size[1], h_out * w_out)
+        input_reshape = input_patches.permute(0, 3, 2, 1).reshape(batch * h_out * w_out, self.kernel_size[0] * self.kernel_size[1], c)
+        output = self.layers(input_reshape).mean(dim=1)
+        return output.view(batch, h_out * w_out, self.out_channels).transpose(1, 2).view(batch, self.out_channels, h_out, w_out)
diff --git a/butterfly/butterfly_multiply.py → torch_butterfly/butterfly_multiply.py b/butterfly/butterfly_multiply.py → torch_butterfly/butterfly_multiply.py
diff --git a/butterfly/complex_utils.py → torch_butterfly/complex_utils.py b/butterfly/complex_utils.py → torch_butterfly/complex_utils.py
diff --git a/...erfly/factor_multiply/factor_multiply.cpp → ...erfly/factor_multiply/factor_multiply.cpp b/...erfly/factor_multiply/factor_multiply.cpp → ...erfly/factor_multiply/factor_multiply.cpp
diff --git a/...y/factor_multiply/factor_multiply_cuda.cu → ...y/factor_multiply/factor_multiply_cuda.cu b/...y/factor_multiply/factor_multiply_cuda.cu → ...y/factor_multiply/factor_multiply_cuda.cu
diff --git a/butterfly/permutation.py → torch_butterfly/permutation.py b/butterfly/permutation.py → torch_butterfly/permutation.py
diff --git a/butterfly/permutation_multiply.py → torch_butterfly/permutation_multiply.py b/butterfly/permutation_multiply.py → torch_butterfly/permutation_multiply.py
diff --git a/butterfly/utils.py → torch_butterfly/utils.py b/butterfly/utils.py → torch_butterfly/utils.py