Merge remote-tracking branch 'upstream/main' into release/v0.7.0
devinrsmith committed Dec 9, 2021
2 parents 1de945f + ae32bfd commit 2ce3ebe
Showing 7 changed files with 1,182 additions and 10 deletions.
18 changes: 9 additions & 9 deletions Integrations/python/deephaven/__init__.py
@@ -97,7 +97,6 @@

 from .csv import read as read_csv
 from .csv import write as write_csv
-from .conversion_utils import convertToJavaList as as_list

 # NB: this must be defined BEFORE importing .jvm_init or .start_jvm (circular import)
 def initialize():
@@ -414,15 +413,16 @@ def doLocked(f, lock_type="shared"):
         raise ValueError("Unsupported lock type: lock_type={}".format(lock_type))


-def combo_agg(agg_list):
+def as_list(values):
     """
-    Combines aggregations.
+    Creates a Java list containing the values.
-    :param agg_list: list of aggregations
-    :return: combined aggregations
+    :param values: values
+    :return: Java list containing the values.
     """
     _JArrayList = jpy.get_type("java.util.ArrayList")
-    j_agg_list = _JArrayList(len(agg_list))
-    for agg in agg_list:
-        j_agg_list.add(agg)
-    return j_agg_list
+    j_list = _JArrayList(len(values))
+    for value in values:
+        j_list.add(value)
+    return j_list
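
For reference, a minimal usage sketch of the renamed helper (not part of the commit; it assumes the JVM has already been initialized, e.g. inside a running Deephaven console):

# Minimal sketch, assuming an initialized JVM.
from deephaven import as_list

j_list = as_list([1, 2, 3])   # builds a java.util.ArrayList from the Python values
print(j_list.size())          # -> 3
print(j_list.get(0))          # -> 1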

6 changes: 5 additions & 1 deletion Integrations/python/deephaven/learn/__init__.py
@@ -30,13 +30,13 @@ def _defineSymbols():
         raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

     global _Input_, _Output_, _Computer_, _Scatterer_

     if _Input_ is None:
         _Input_ = jpy.get_type("io.deephaven.integrations.learn.Input")
         _Output_ = jpy.get_type("io.deephaven.integrations.learn.Output")
         _Computer_ = jpy.get_type("io.deephaven.integrations.learn.Computer")
         _Scatterer_ = jpy.get_type("io.deephaven.integrations.learn.Scatterer")


 # every module method that invokes Java classes should be decorated with @_passThrough
 @wrapt.decorator
 def _passThrough(wrapped, instance, args, kwargs):
@@ -55,6 +55,10 @@ def _passThrough(wrapped, instance, args, kwargs):
     _defineSymbols()
     return wrapped(*args, **kwargs)

+try:
+    _defineSymbols()
+except Exception as e:
+    pass

 @_passThrough
 class Input:
… (rest of file unchanged)
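
The hunks above follow the module's lazy-initialization pattern: Java types are looked up through jpy only once a JVM exists, and the newly added module-level try/except attempts that lookup eagerly at import time while silently deferring it when no JVM is running yet. A condensed, self-contained sketch of the same pattern (illustrative only, not Deephaven code; the Java type is arbitrary):

# Condensed illustration of the lazy jpy-symbol pattern (not part of the commit).
import jpy
import wrapt

_SomeType_ = None   # placeholder name; resolved lazily

def _define_symbols():
    if not jpy.has_jvm():
        raise SystemError("JVM not initialized")
    global _SomeType_
    if _SomeType_ is None:
        _SomeType_ = jpy.get_type("java.util.ArrayList")   # any Java type works for the sketch

@wrapt.decorator
def _pass_through(wrapped, instance, args, kwargs):
    _define_symbols()                  # make sure Java symbols exist before the call
    return wrapped(*args, **kwargs)

try:
    _define_symbols()                  # eager attempt at import time
except Exception:
    pass                               # defer until a decorated function is first called

@_pass_through
def make_list():
    return _SomeType_()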
122 changes: 122 additions & 0 deletions Integrations/python/deephaven/learn/gather/__init__.py
@@ -0,0 +1,122 @@
#
# Copyright (c) 2016 - 2021 Deephaven Data Labs and Patent Pending
#
"""
Utilities for gathering Deephaven table data into Python objects
"""

import numpy as np
import enum
import jpy
import wrapt

# None until the first _defineSymbols() call
_gatherer = None

def _defineSymbols():
    if not jpy.has_jvm():
        raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

    global _gatherer
    global Layout

    if _gatherer is None:
        _gatherer = jpy.get_type("io.deephaven.integrations.learn.gather.NumPy")

class MemoryLayout(enum.Enum):
    """
    Memory layouts for an array.
    """
    ROW_MAJOR = True
    """Row-major memory layout."""
    COLUMN_MAJOR = False
    """Column-major memory layout."""
    C = True
    """Memory layout consistent with C arrays (row-major)."""
    FORTRAN = False
    """Memory layout consistent with Fortran arrays (column-major)."""

    def __init__(self, is_row_major):
        self.is_row_major = is_row_major


# Every method that depends on symbols defined via _defineSymbols() should be decorated with @_passThrough
@wrapt.decorator
def _passThrough(wrapped, instance, args, kwargs):
    """
    For decoration of module methods, to define necessary symbols at runtime
    :param wrapped: the method to be decorated
    :param instance: the object to which the wrapped function was bound when it was called
    :param args: the argument list for `wrapped`
    :param kwargs: the keyword argument dictionary for `wrapped`
    :return: the decorated version of the method
    """

    _defineSymbols()
    return wrapped(*args, **kwargs)

try:
    _defineSymbols()
except Exception as e:
    pass

@_passThrough
def convert_to_numpy_dtype(dtype):
"""
Convert an input type to the corresponding NumPy data type
:param dtype: A Python type
"""
if dtype.__module__ == np.__name__:
return dtype
elif dtype == bool:
dtype = np.bool_
elif dtype == float:
dtype = np.double
elif dtype == int:
dtype = np.intc
else:
raise ValueError(f"{dtype} is not a data type that can be converted to a NumPy dtype.")
return dtype

@_passThrough
def table_to_numpy_2d(row_set, col_set, order: MemoryLayout = MemoryLayout.ROW_MAJOR, dtype: np.dtype = np.intc):
    """
    Convert Deephaven table data to a 2d NumPy array of the appropriate size
    :param row_set: A RowSequence describing the number of rows in the table
    :param col_set: ColumnSources describing which columns to copy
    :param order: The desired memory layout of the output array
    :param dtype: The desired NumPy data type of the output NumPy array
    :return: A NumPy ndarray
    """

    if not(isinstance(order, MemoryLayout)):
        raise ValueError(f"Invalid major order {order}. Please use an enum value from MemoryLayout.")

    dtype = convert_to_numpy_dtype(dtype)

    if dtype == np.byte:
        buffer = _gatherer.tensorBuffer2DByte(row_set, col_set, order.is_row_major)
    elif dtype == np.short:
        buffer = _gatherer.tensorBuffer2DShort(row_set, col_set, order.is_row_major)
    elif dtype == np.intc:
        buffer = _gatherer.tensorBuffer2DInt(row_set, col_set, order.is_row_major)
    elif dtype == np.int_:
        buffer = _gatherer.tensorBuffer2DLong(row_set, col_set, order.is_row_major)
    elif dtype == np.single:
        buffer = _gatherer.tensorBuffer2DFloat(row_set, col_set, order.is_row_major)
    elif dtype == np.double:
        buffer = _gatherer.tensorBuffer2DDouble(row_set, col_set, order.is_row_major)
    else:
        raise ValueError(f"Data type {dtype} is not supported.")

    tensor = np.frombuffer(buffer, dtype = dtype)

    if order.is_row_major:
        tensor.shape = (len(col_set), row_set.intSize())
        return tensor.T
    else:
        tensor.shape = (row_set.intSize(), len(col_set))
        return tensor
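
For reference, a hedged usage sketch of the new gather module (not part of the commit; the table contents and column names are illustrative, and a running Deephaven session with an initialized JVM is assumed):

# Illustrative sketch, assuming an initialized JVM / running Deephaven session.
import numpy as np
from deephaven import TableTools
from deephaven.learn import gather

t = TableTools.emptyTable(10).update("X = i", "Y = 2 * i")
rows = t.getRowSet()
cols = [t.getColumnSource(name) for name in ["X", "Y"]]

# Gather the table into a row-major 2-D NumPy array of 32-bit integers.
data = gather.table_to_numpy_2d(rows, cols, gather.MemoryLayout.ROW_MAJOR, np.intc)
print(data.shape)   # -> (10, 2): one row per table row, one column per column source

Note that MemoryLayout.C and MemoryLayout.FORTRAN are aliases of ROW_MAJOR and COLUMN_MAJOR (enum members that share a value become aliases), and that in the row-major branch the gathered buffer is reshaped to (columns, rows) and returned transposed, so the result always has one row per table row.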
170 changes: 170 additions & 0 deletions Integrations/python/test/test_learn_gather.py
@@ -0,0 +1,170 @@
#
# Copyright (c) 2016-2021 Deephaven Data Labs and Patent Pending
#

##############################################################################
# NOTE: the jvm should have been initialized, or this test will certainly fail
##############################################################################

import pandas as pd
import numpy as np
import unittest
import jpy
import sys
import os

from deephaven import learn, tableToDataFrame, TableTools
from deephaven.learn import gather

class TestGather(unittest.TestCase):
"""
Test cases for deephaven.learn submodule
"""

@classmethod
def setUpClass(cls):
"""
Inherited method allowing initialization of test environment
"""
# Tables
cls.bool_table = TableTools.emptyTable(100).update(
"X = true",
"Y = false",
"Z = (i % 2 == 0) ? true : false"
)
cls.byte_table = TableTools.emptyTable(100).update(
"X = (byte)i",
"Y = (byte)(100 - X)",
"Z = (byte)(-101 + X)"
)
cls.short_table = TableTools.emptyTable(100).update(
"X = (short)i",
"Y = (short)(100 - X)",
"Z = (short)(-101 + X)"
)
cls.int_table = TableTools.emptyTable(100).update(
"X = (int)i",
"Y = 100 - X",
"Z = -101 + X"
)
cls.long_table = TableTools.emptyTable(100).update(
"X = (long)i",
"Y = 100 - X",
"Z = -101 + X"
)
cls.float_table = TableTools.emptyTable(100).update(
"X = (float)i",
"Y = (float)sqrt(X)",
"Z = (float)sqrt(Y)"
)
cls.double_table = TableTools.emptyTable(100).update(
"X = (double)i",
"Y = sqrt(X)",
"Z = sqrt(Y)"
)
# NumPy arrays
cls.bool_array = \
np.array([[True, False, True], [True, False, False]] * 50,
dtype = np.bool_)
cls.byte_array = np.vstack((
np.arange(0, 100, dtype = np.byte),
np.arange(100, 0, -1, dtype = np.byte),
np.arange(-101, -1, dtype = np.byte)
)).T
cls.short_array = np.vstack((
np.arange(0, 100, dtype = np.short),
np.arange(100, 0, -1, dtype = np.short),
np.arange(-101, -1, dtype = np.short)
)).T
cls.int_array = np.vstack((
np.arange(0, 100, dtype = np.intc),
np.arange(100, 0, -1, dtype = np.intc),
np.arange(-101, -1, dtype = np.intc)
)).T
cls.long_array = np.vstack((
np.arange(0, 100, dtype = np.int_),
np.arange(100, 0, -1, dtype = np.int_),
np.arange(-101, -1, dtype = np.int_)
)).T
cls.float_array = np.vstack((
np.arange(0, 100, dtype = np.single),
np.sqrt(np.arange(0, 100, dtype = np.single)),
np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.single)))
)).T
cls.double_array = np.vstack((
np.arange(0, 100, dtype = np.double),
np.sqrt(np.arange(0, 100, dtype = np.double)),
np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.double)))
)).T

# Model for learn to use when dtype = [np.bool_]
def boolean_model(self, features):
return np.count_nonzero(features, axis = 1) < 2

# Model for learn to use when dtype = [np.byte, np.short, np.intc, np.int_]
def integer_model(self, features):
return np.sum(features, axis = 1)

# Model for learn to use when dtype = [np.single, np.double]
def decimal_model(self, features):
return np.prod(features, axis = 1)

# Test byte data types
def test_byte(self):
self.base_test(source = self.byte_table, model = self.integer_model, np_dtype = np.byte)

# Test short data types
def test_short(self):
self.base_test(source = self.short_table, model = self.integer_model, np_dtype = np.short)

# Test int data types
def test_int(self):
self.base_test(source = self.int_table, model = self.integer_model, np_dtype = np.intc)

# Test long data types
def test_long(self):
self.base_test(source = self.long_table, model = self.integer_model, np_dtype = np.int_)

# Test float data types
def test_float(self):
self.base_test(source = self.float_table, model = self.decimal_model, np_dtype = np.single)

# Test double data types
def test_double(self):
self.base_test(source = self.double_table, model = self.decimal_model, np_dtype = np.double)

# The base test, which other tests will be built from
def base_test(self, source, model, np_dtype):

rows = source.getRowSet()
cols = [source.getColumnSource(col) for col in ["X", "Y", "Z"]]

gatherer_rowmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.ROW_MAJOR, np_dtype)
gatherer_colmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

array_from_table = tableToDataFrame(source).values

gathered_rowmajor = gatherer_rowmajor(rows, cols)
gathered_colmajor = gatherer_colmajor(rows, cols)

with self.subTest(msg = "Array shape"):
self.assertTrue(gathered_rowmajor.shape == array_from_table.shape)
print("Row major gathered shape: {}".format(gathered_rowmajor.shape))
self.assertTrue(gathered_colmajor.shape == array_from_table.shape)
print("Column major gathered shape: {}".format(gathered_colmajor.shape))
with self.subTest(msg = "Values in array"):
self.assertTrue(np.allclose(gathered_rowmajor, array_from_table))
print("All row-major array values are equal")
self.assertTrue(np.allclose(gathered_colmajor, array_from_table))
print("All column-major array values are equal")
with self.subTest(msg = "Array data type"):
self.assertTrue(gathered_rowmajor.dtype == np_dtype)
self.assertTrue(gathered_rowmajor.dtype == array_from_table.dtype)
self.assertTrue(gathered_colmajor.dtype == np_dtype)
self.assertTrue(gathered_colmajor.dtype == array_from_table.dtype)
self.assertTrue(gathered_rowmajor.dtype == gathered_colmajor.dtype)
print("Array dtype: {}".format(np_dtype))
with self.subTest(msg = "Contiguity"):
self.assertTrue(gathered_rowmajor.flags["C_CONTIGUOUS"] or gathered_rowmajor.flags["F_CONTIGUOUS"])
self.assertTrue(gathered_colmajor.flags["C_CONTIGUOUS"] or gathered_colmajor.flags["F_CONTIGUOUS"])
print("Array contiguity checked")
39 changes: 39 additions & 0 deletions Integrations/python/test/test_sort.py
@@ -0,0 +1,39 @@
#
# Copyright (c) 2016-2021 Deephaven Data Labs and Patent Pending
#

# Test python sort syntax

import unittest
from unittest import TestCase
from deephaven import as_list, SortColumn, ColumnName
from deephaven.TableTools import newTable, intCol, diff

class SortTestCase(TestCase):
    def test_sort_syntax(self):
        source = newTable(
            intCol("A", 1, 2, 3, 1, 2, 3),
            intCol("B", 0, 1, 0, 1, 0, 1),
            intCol("C", 1, 2, 3, 4, 5, 6),
        )

        sort_columns = as_list([
            SortColumn.asc(ColumnName.of("A")),
            SortColumn.desc(ColumnName.of("B"))
        ])

        actual = source.sort(sort_columns)

        target = newTable(
            intCol("A", 1, 1, 2, 2, 3, 3),
            intCol("B", 1, 0, 1, 0, 1, 0),
            intCol("C", 4, 1, 2, 5, 6, 3),
        )

        diff_string = diff(actual, target, 1)
        self.assertEqual("", diff_string)



if __name__ == '__main__':
    unittest.main()
… (remaining changed files not shown)
