From 460a218ade942eada7d81bb2d0955f1fbee517b4 Mon Sep 17 00:00:00 2001 From: Oscar Nydza <33619748+nipsn@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:30:41 +0100 Subject: [PATCH 1/5] Added null check methods (#27) * Added initial isnull isna and notna implementations and tests * Proper alias, added tests and documentation * Reworked tests and added notnull method --- docs/user-guide/advanced/Pandas_API.ipynb | 140 +++++++++++++++++++++- src/pykx/pandas_api/pandas_meta.py | 16 +++ tests/test_pandas_api.py | 31 +++++ 3 files changed, 186 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index 239c4c8..a9443bf 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2335,6 +2335,144 @@ "tab.any()" ] }, + { + "cell_type": "markdown", + "id": "5e21bef1", + "metadata": {}, + "source": [ + "### Table.isna()\n", + "\n", + "```\n", + "Table.isna()\n", + "```\n", + "\n", + "Detects null values on a Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | A Table with the same shape as the original but containing boolean values. 1b represents a null value was on its place and 0b represents the opposite. 
|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8ff16e1", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isna()" + ] + }, + { + "cell_type": "markdown", + "id": "47d20b00", + "metadata": {}, + "source": [ + "### Table.isnull()\n", + "\n", + "```\n", + "Table.isnull()\n", + "```\n", + "\n", + "Alias of Table.isna().\n", + "\n", + "Detects null values on a Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | A Table with the same shape as the original but containing boolean values. 1b represents a null value was on its place and 0b represents the opposite. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400c209e", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isnull()" + ] + }, + { + "cell_type": "markdown", + "id": "fb3164d5", + "metadata": {}, + "source": [ + "### Table.notna()\n", + "\n", + "```\n", + "Table.notna()\n", + "```\n", + "\n", + "Boolean inverse of Table.isna().\n", + "\n", + "Detects non-null values on a Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | A Table with the same shape as the original but containing boolean values. 1b represents a non null value was on its place and 0b represents the opposite. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4206eec3", + "metadata": {}, + "outputs": [], + "source": [ + "tab.notna()" + ] + }, + { + "cell_type": "markdown", + "id": "4e8e5c07", + "metadata": {}, + "source": [ + "### Table.notnull()\n", + "\n", + "```\n", + "Table.notna()\n", + "```\n", + "\n", + "Boolean inverse of Table.isnull(). 
Alias of Table.notna()\n", + "\n", + "Detects non-null values on a Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | A Table with the same shape as the original but containing boolean values. 1b indicates a non-null value at that position and 0b indicates a null value. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3138d21a", + "metadata": {}, + "outputs": [], + "source": [ + "tab.notnull()" + ] + }, { "cell_type": "markdown", "id": "a3c3fccd", @@ -2360,7 +2498,7 @@ "\n", "| Type | Description |\n", "| :----------------: | :------------------------------------------------------------------- |\n", - "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `max` on that column / row. |" + "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `max` on that column / row. 
|" ] }, { diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 39668d5..31bd263 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -311,3 +311,19 @@ def agg(self, func, axis=0, *args, **kwargs): # noqa: C901 return data else: return (q('{(flip enlist[`function]!enlist x)!y}', keyname, data)) + + @api_return + def isna(self): + return q.null(self) + + @api_return + def isnull(self): + return self.isna() + + @api_return + def notna(self): + return q('not', self.isna()) + + @api_return + def notnull(self): + return self.notna() diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index acfe55f..467a89c 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2029,3 +2029,34 @@ def test_keyed_loc_fixes(q): mkt[['k1', 'y']] with pytest.raises(KeyError): mkt['k1'] + + +def test_isnull(q): + tab = q('''([] + g:1#0Ng; h:1#0Nh; i1:1#0Ni; j:1#0Nj; + e:1#0Ne; f:1#0Nf; s:1#` ; p:1#0Np; + m:1#0Nm; d:1#0Nd; n:1#0Nn; u:1#0Nu; + v:1#0Nv; t:1#0Nt; c:1#" "; + g2:1?0Ng; h2:1?0Wh; i2:1?10i; j2:1?10j; + e2:1?10e; f2:1?10f; s2:1#`foo;p2:1?10p; + m2:1?"m"$10;d2:1?"d"$10;n2:1?10n; u2:1?10u; + v2:1?10v; t2:1?10t; c2:1?" 
") + ''') + + cols = ["g", "h", "i1", "j", + "e", "f", "s", "p", + "m", "d", "n", "u", + "v", "t", "c", + "g2", "h2", "i2", "j2", + "e2", "f2", "s2", "p2", + "m2", "d2", "n2", "u2", + "v2", "t2", "c2"] + + expected = pd.DataFrame.from_dict({c: [True] if i < 15 else [False] + for i, c in enumerate(cols)}) + expected_inv = ~expected + + pd.testing.assert_frame_equal(tab.isna().pd(), expected) + pd.testing.assert_frame_equal(tab.isnull().pd(), expected) + pd.testing.assert_frame_equal(tab.notna().pd(), expected_inv) + pd.testing.assert_frame_equal(tab.notnull().pd(), expected_inv) From bb1549ae4d8a248d2de472f4d07a12ab60848eff Mon Sep 17 00:00:00 2001 From: Oscar Nydza <33619748+nipsn@users.noreply.github.com> Date: Tue, 13 Feb 2024 08:57:11 +0100 Subject: [PATCH 2/5] Add kurt and sem functions (#32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added implementation of kurt function (#23) * Added implementation of sem function (#22) * Refactored kurt and sem functions --------- Co-authored-by: Miguel Gómez Co-authored-by: Francisco Tórtola Vivo --- docs/user-guide/advanced/Pandas_API.ipynb | 187 ++++++++++++++++++++++ src/pykx/pandas_api/pandas_meta.py | 47 ++++++ tests/test_pandas_api.py | 153 ++++++++++++++++++ 3 files changed, 387 insertions(+) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index a9443bf..ddc3980 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -436,6 +436,91 @@ "tab.mean(axis=1)" ] }, + { + "cell_type": "markdown", + "id": "fe565b65-fbf2-47ba-a26e-791d09fd4f55", + "metadata": {}, + "source": [ + "### Table.kurt()\n", + "\n", + "```\n", + "Table.kurt(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Return unbiased kurtosis over requested axis. Kurtosis obtained using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). 
Normalized by N-1.\n", + "\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | Axis for the function to be applied on. 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | not yet implemented | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :--------: | :--------------------------------------------------------------------------------------- |\n", + "| Dictionary | Map of columns and their yielded kurtosis values |" + ] + }, + { + "cell_type": "markdown", + "id": "e6069cac-d260-4f80-9688-3d1ec273cd22", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the kurt across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4219c826-a84b-4722-9847-372d3837acdb", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "437ab485-bf73-4209-b63e-aa0d1bfa5d58", + "metadata": {}, + "outputs": [], + "source": [ + "tab.kurt()" + ] + }, + { + "cell_type": "markdown", + "id": "ea3e1cf6-2304-4061-a846-1cbc0572ea9d", + "metadata": {}, + "source": [ + "Calculate the kurtosis across the rows of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63312e8b-76f0-46eb-b4d7-b2213561c86e", + "metadata": {}, + "outputs": [], + "source": [ + "tab.kurt(axis=1)" + ] + }, { "cell_type": "markdown", "id": "7bf853c5", @@ -646,6 +731,108 @@ "tab.mode(dropna=False)" ] }, + { + "cell_type": "markdown", + "id": "b248fef1", + "metadata": {}, + "source": [ + 
"### Table.sem()\n", + "\n", + "```\n", + "Table.sem(axis=0, skipna=True, numeric_only=False, ddof=1)\n", + "```\n", + "Return unbiased standard error of the mean over requested axis. Normalized by N-1 by default. This can be changed using the ddof argument.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the sem across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | not yet implemented | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | The sem across each row / column with the key corresponding to the row number or column name. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "71bd1d6f", + "metadata": {}, + "source": [ + "**Examples**\n", + "\n", + "Calculate the sem across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "350c2b7c", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14],\n", + " }\n", + " )\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b89307e9", + "metadata": {}, + "outputs": [], + "source": [ + "tab.sem()" + ] + }, + { + "cell_type": "markdown", + "id": "6933f01f", + "metadata": {}, + "source": [ + "Calculate the sem across the rows of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3edd3feb", + "metadata": {}, + "outputs": [], + "source": [ + "tab.sem(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "ae7afe5a", + "metadata": {}, + "source": [ + "Calculate sem accross columns with ddof=0:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de626961", + "metadata": {}, + "outputs": [], + "source": [ + "tab.sem(ddof=0)" + ] + }, { "cell_type": "markdown", "id": "7e2813b4", diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 31bd263..eccd654 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -154,6 +154,32 @@ def mean(self, axis: int = 0, numeric_only: bool = False): tab ) + @api_return + def kurt(self, axis: int = 0, numeric_only: bool = False): + tab = self + if 'Keyed' in str(type(tab)): + tab = q.value(tab) + if numeric_only: + tab = _get_numeric_only_subtable(tab) + + axis_keys = q('{[axis;tab] $[0~axis;cols;`$string til count @] tab}', axis, tab) + + return q( + '''{[tab;axis;axis_keys] + tab:$[0~axis;(::);flip] value flip tab; + kurt:{[x] + res: x - avg x; + n: count x; + m2: sum rsq: res xexp 2; + m4: 
sum rsq xexp 2; + adj: 3 * xexp[n - 1;2] % (n - 2) * (n - 3); + num: n * (n + 1) * (n - 1) * m4; + den: (n - 2) * (n - 3) * m2 xexp 2; + (num % den) - adj}; + axis_keys!kurt each tab} + ''', tab, axis, axis_keys + ) + @api_return def median(self, axis: int = 0, numeric_only: bool = False): tab = self @@ -203,6 +229,27 @@ def mode(self, axis: int = 0, numeric_only: bool = False, dropna: bool = True): tab ) + @api_return + def sem(self, axis: int = 0, ddof: int = 1, numeric_only: bool = False): + tab = self + if 'Keyed' in str(type(tab)): + tab = q.value(tab) + if numeric_only: + tab = _get_numeric_only_subtable(tab) + + axis_keys = q('{[axis;tab] $[0~axis;cols;`$string til count @] tab}', axis, tab) + + if ddof == len(tab): + return q('{x!count[x]#0n}', axis_keys) + + return q( + '''{[tab;axis;ddof;axis_keys] + tab:$[0~axis;(::);flip] value flip tab; + d:{dev[x] % sqrt count[x] - y}[;ddof]; + axis_keys!d each tab} + ''', tab, axis, ddof, axis_keys + ) + @api_return def abs(self, numeric_only=False): tab = self diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 467a89c..7b06d65 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -1489,6 +1489,86 @@ def test_df_sample(kx, q): t.sample(ignore_index=True) +def test_sem(kx, q): + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': [7, 11, 14, 14] + } + ) + precision = 1e-16 + tab = kx.toq(df) + p_m = df.sem() + q_m = tab.sem() + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + p_m = df.sem(axis=1) + q_m = tab.sem(axis=1) + assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision) + for c in range(len(q.cols(tab)))]) + + p_m = df.sem(ddof=0) + q_m = tab.sem(ddof=0) + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + p_m = df.sem(ddof=4) + q_m = tab.sem(ddof=4) + assert all([np.isnan(p_m[c]) & np.isnan(q_m[c].py()) + for c in 
q.key(q_m).py()]) + + q['tab'] = kx.toq(df) + tab = q('1!`idx xcols update idx: til count tab from tab') + p_m = df.sem() + q_m = tab.sem() + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + p_m = df.sem(axis=1) + q_m = tab.sem(axis=1) + assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision) + for c in range(len(q.cols(tab)) - 1)]) + + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': ['foo', 'bar', 'baz', 'qux'] + } + ) + tab = kx.toq(df) + p_m = df.sem(numeric_only=True) + q_m = tab.sem(numeric_only=True) + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + p_m = df.sem(axis=1, numeric_only=True) + q_m = tab.sem(axis=1, numeric_only=True) + assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision) + for c in range(len(q.cols(tab)))]) + + with pytest.raises(kx.QError): + q_m = tab.sem() + with pytest.raises(kx.QError): + q_m = tab.sem(axis=1) + + df = pd.DataFrame({'a': [1]}) + tab = kx.toq(df) + p_m = df.sem() + q_m = tab.sem() + assert all([np.isnan(p_m[c]) & np.isnan(q_m[c].py()) + for c in q.key(q_m).py()]) + + p_m = df.sem(ddof=0) + q_m = tab.sem(ddof=0) + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + def test_mean(kx, q): df = pd.DataFrame( { @@ -1543,6 +1623,79 @@ def test_mean(kx, q): q_m = tab.mean(axis=1) +def test_kurt(kx, q): + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': [7, 11, 14, 14] + } + ) + tab = kx.toq(df) + p_m = df.kurt() + q_m = tab.kurt() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.kurt(axis=1) + q_m = tab.kurt(axis=1) + for c in range(len(q.cols(tab))): + assert p_m[c] == q_m[q('{`$string x}', c)].py() + + q['tab'] = kx.toq(df) + tab = q('1!`idx xcols update idx: til count tab from tab') + p_m = df.kurt() + q_m = tab.kurt() + for c in 
q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.kurt(axis=1) + q_m = tab.kurt(axis=1) + for c in range(len(q.cols(tab)) - 1): + assert p_m[c] == q_m[q('{`$string x}', c)].py() + + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': ['foo', 'bar', 'baz', 'qux'] + } + ) + tab = kx.toq(df) + p_m = df.kurt(numeric_only=True) + q_m = tab.kurt(numeric_only=True) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.kurt(axis=1, numeric_only=True) + q_m = tab.kurt(axis=1, numeric_only=True) + for c in range(len(q.cols(tab))): + assert np.isnan(p_m[c]) & np.isnan(q_m[q('{`$string x}', c)].py()) + + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': [11, 12, 13, 14], + 'e': ['foo', 'bar', 'baz', 'qux'] + } + ) + tab = kx.toq(df) + p_m = df.kurt(numeric_only=True) + q_m = tab.kurt(numeric_only=True) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.kurt(axis=1, numeric_only=True) + q_m = tab.kurt(axis=1, numeric_only=True) + for c in range(len(q.cols(tab)) - 1): + assert p_m[c] == q_m[q('{`$string x}', c)].py() + + with pytest.raises(kx.QError): + q_m = tab.kurt() + with pytest.raises(kx.QError): + q_m = tab.kurt(axis=1) + + def test_median(kx, q): df = pd.DataFrame( { From 7ef4142393c6803f4893845b5480ad2a47c0a3cf Mon Sep 17 00:00:00 2001 From: Oscar Nydza <33619748+nipsn@users.noreply.github.com> Date: Tue, 13 Feb 2024 08:58:47 +0100 Subject: [PATCH 3/5] Added idxmin implementation (#34) --- docs/user-guide/advanced/Pandas_API.ipynb | 66 +++++++++++++++++++++++ src/pykx/pandas_api/pandas_meta.py | 26 ++++++--- tests/test_pandas_api.py | 23 ++++++++ 3 files changed, 108 insertions(+), 7 deletions(-) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index ddc3980..d4811ff 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2698,6 +2698,72 @@ 
"tab.max()" ] }, + { + "cell_type": "markdown", + "id": "d98b298c", + "metadata": {}, + "source": [ + "### Table.idxmin()\n", + "\n", + "```\n", + "Table.idxmax(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Return index of first occurrence of minimum over requested axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the idxmin across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `idxmin` on that column / row. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "143f5483", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the idxmin across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da7cbf8f", + "metadata": {}, + "outputs": [], + "source": [ + "tab.idxmin()" + ] + }, + { + "cell_type": "markdown", + "id": "fb531e00", + "metadata": {}, + "source": [ + "Calculate the idxmin across the rows of a table using only columns thar are of a numeric data type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9907226a", + "metadata": {}, + "outputs": [], + "source": [ + "tab.idxmin(axis=1, numeric_only=True)" + ] + }, { "cell_type": "markdown", "id": "301ab2c2", diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index eccd654..7bb7805 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -67,7 +67,7 @@ def preparse_computations(tab, axis=0, skipna=True, numeric_only=False, bool_onl skipna, axis ) - return (res, cols if axis == 0 else q.til(len(res))) + return (res, cols if axis == 0 else q.til(len(res)), cols) # The simple computation functions all return a tuple of the results and the col names the results @@ -259,17 +259,17 @@ def abs(self, numeric_only=False): @convert_result def all(self, axis=0, bool_only=False, skipna=True): - res, cols = preparse_computations(self, axis, skipna, bool_only=bool_only) + res, cols, _ = preparse_computations(self, axis, skipna, bool_only=bool_only) return (q('{"b"$x}', [all(x) for x in res]), cols) @convert_result def any(self, axis=0, bool_only=False, skipna=True): - res, cols = preparse_computations(self, axis, skipna, bool_only=bool_only) + res, cols, _ = preparse_computations(self, axis, skipna, bool_only=bool_only) return (q('{"b"$x}', [any(x) for x in res]), cols) @convert_result def max(self, axis=0, skipna=True, numeric_only=False): - res, cols = 
preparse_computations(self, axis, skipna, numeric_only) + res, cols, _ = preparse_computations(self, axis, skipna, numeric_only) return (q( '{[row] {$[11h=type x; {[x1; y1] $[x1 > y1; x1; y1]} over x; max x]} each row}', res @@ -277,15 +277,27 @@ def max(self, axis=0, skipna=True, numeric_only=False): @convert_result def min(self, axis=0, skipna=True, numeric_only=False): - res, cols = preparse_computations(self, axis, skipna, numeric_only) + res, cols, _ = preparse_computations(self, axis, skipna, numeric_only) return (q( '{[row] {$[11h=type x; {[x1; y1] $[x1 < y1; x1; y1]} over x; min x]} each row}', res ), cols) + @convert_result + def idxmin(self, axis=0, skipna=True, numeric_only=False): + tab = self + axis = q('{$[11h~type x; `index`columns?x; x]}', axis) + res, cols, ix = preparse_computations(tab, axis, skipna, numeric_only) + return (q( + '''{[row;tab;axis] + row:{$[11h~type x; {[x1; y1] $[x1 < y1; x1; y1]} over x; min x]} each row; + m:$[0~axis; (::); flip] value flip tab; + $[0~axis; (::); cols tab] m {$[abs type y;x]?y}' row} + ''', res, tab[ix], axis), cols) + @convert_result def prod(self, axis=0, skipna=True, numeric_only=False, min_count=0): - res, cols = preparse_computations(self, axis, skipna, numeric_only) + res, cols, _ = preparse_computations(self, axis, skipna, numeric_only) return (q( '{[row; minc] {$[y > 0; $[y>count[x]; 0N; prd x]; prd x]}[;minc] each row}', res, @@ -294,7 +306,7 @@ def prod(self, axis=0, skipna=True, numeric_only=False, min_count=0): @convert_result def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): - res, cols = preparse_computations(self, axis, skipna, numeric_only) + res, cols, _ = preparse_computations(self, axis, skipna, numeric_only) return (q( '{[row; minc]' '{$[y > 0;' diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 7b06d65..2fe13fc 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -1964,6 +1964,29 @@ def test_pandas_max(q): assert float(qmax[i]) == 
float(pmax[i]) +def test_pandas_idxmin(q): + tab = q('([] sym: 100?`foo`bar`baz`qux; price: 250.0f - 100?500.0f; ints: 100 - 100?200)') + df = tab.pd() + + p_m = df.idxmin() + q_m = tab.idxmin() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + q_m = tab.idxmin(axis=1, numeric_only=True, skipna=True) + p_m = df.idxmin(axis=1, numeric_only=True, skipna=True) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + tab = q('([]price: 250.0f - 100?500.0f; ints: 100 - 100?200)') + df = tab.pd() + + q_m = tab.idxmin(axis=1) + p_m = df.idxmin(axis=1) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + def test_pandas_all(q): tab = q( '([] sym: 100?`foo`bar`baz`qux; price: 250.0f - 100?500.0f; ints: 100 - 100?200;' From 4f59eddf23b6407e251b21a9fe0d2f92cbaba7af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20T=C3=B3rtola?= <59147331+tortolavivo23@users.noreply.github.com> Date: Tue, 13 Feb 2024 09:28:11 +0100 Subject: [PATCH 4/5] Added idxmax implementation (#25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added idxmax implementation, tests and documentation * fix error change python version in documentation * Change implementation, add tests and improve documentation * fix typo error * Alternative implementation of idxmax (#33) Co-authored-by: Jesús López-González --------- Co-authored-by: Jesús López-González Co-authored-by: Jesús López-González Co-authored-by: Jesús López-González --- docs/user-guide/advanced/Pandas_API.ipynb | 68 ++++++++++++++++++++++- src/pykx/pandas_api/pandas_meta.py | 12 ++++ tests/test_pandas_api.py | 23 ++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index d4811ff..c301be5 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2685,7 +2685,7 @@ "\n", "| Type | Description |\n", "| :----------------: | 
:------------------------------------------------------------------- |\n", - "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `max` on that column / row. |" + "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `max` on that column / row. |" ] }, { @@ -2764,6 +2764,72 @@ "tab.idxmin(axis=1, numeric_only=True)" ] }, + { + "cell_type": "markdown", + "id": "d98b298c", + "metadata": {}, + "source": [ + "### Table.idxmax()\n", + "\n", + "```\n", + "Table.idxmax(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Return index of first occurrence of maximum over requested axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the idxmax across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `idxmax` on that column / row. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "143f5483", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the idxmax across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da7cbf8f", + "metadata": {}, + "outputs": [], + "source": [ + "tab.idxmax()" + ] + }, + { + "cell_type": "markdown", + "id": "fb531e00", + "metadata": {}, + "source": [ + "Calculate the idxmax across the rows of a table using only columns thar are of a numeric data type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9907226a", + "metadata": {}, + "outputs": [], + "source": [ + "tab.idxmax(axis=1, numeric_only=True)" + ] + }, { "cell_type": "markdown", "id": "301ab2c2", diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 7bb7805..b270e69 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -283,6 +283,18 @@ def min(self, axis=0, skipna=True, numeric_only=False): res ), cols) + @convert_result + def idxmax(self, axis=0, skipna=True, numeric_only=False): + tab = self + axis = q('{$[11h~type x; `index`columns?x; x]}', axis) + res, cols, ix = preparse_computations(tab, axis, skipna, numeric_only) + return (q( + '''{[row;tab;axis] + row:{$[11h~type x; {[x1; y1] $[x1 > y1; x1; y1]} over x; max x]} each row; + m:$[0~axis; (::); flip] value flip tab; + $[0~axis; (::); cols tab] m {$[abs type y;x]?y}' row} + ''', res, tab[ix], axis), cols) + @convert_result def idxmin(self, axis=0, skipna=True, numeric_only=False): tab = self diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 2fe13fc..bfcc8d5 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -1964,6 +1964,29 @@ def test_pandas_max(q): assert float(qmax[i]) == float(pmax[i]) +def test_pandas_idxmax(q): + tab = q('([] sym: 100?`foo`bar`baz`qux; price: 250.0f - 100?500.0f; ints: 100 - 100?200)') + df = tab.pd() + + p_m = df.idxmax() + q_m 
= tab.idxmax() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + q_m = tab.idxmax(axis=1, numeric_only=True, skipna=True) + p_m = df.idxmax(axis=1, numeric_only=True, skipna=True) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + tab = q('([]price: 250.0f - 100?500.0f; ints: 100 - 100?200)') + df = tab.pd() + + q_m = tab.idxmax(axis=1) + p_m = df.idxmax(axis=1) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + def test_pandas_idxmin(q): tab = q('([] sym: 100?`foo`bar`baz`qux; price: 250.0f - 100?500.0f; ints: 100 - 100?200)') df = tab.pd() From 0019cabf1f27f660bfc69be0501f38fd3935e9df Mon Sep 17 00:00:00 2001 From: Oscar Nydza Date: Tue, 13 Feb 2024 13:08:11 +0100 Subject: [PATCH 5/5] Reordered nb documentation --- docs/user-guide/advanced/Pandas_API.ipynb | 96 +++++++++++------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index a751d4a..c35d64b 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -1884,19 +1884,19 @@ "id": "d98b298c", "metadata": {}, "source": [ - "### Table.idxmin()\n", + "### Table.min()\n", "\n", "```\n", - "Table.idxmax(axis=0, skipna=True, numeric_only=False)\n", + "Table.min(axis=0, skipna=True, numeric_only=False)\n", "```\n", "\n", - "Return index of first occurrence of minimum over requested axis.\n", + "Returns the minimum value across the given axis.\n", "\n", "**Parameters:**\n", "\n", "| Name | Type | Description | Default |\n", "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate the idxmin across 0 is columns, 1 is rows. | 0 |\n", + "| axis | int | The axis to calculate the minimum across 0 is columns, 1 is rows. | 0 |\n", "| skipna | bool | Ignore any null values along the axis. 
| True |\n", "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", "\n", @@ -1904,50 +1904,22 @@ "\n", "| Type | Description |\n", "| :----------------: | :------------------------------------------------------------------- |\n", - "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `idxmin` on that column / row. |" - ] - }, - { - "cell_type": "markdown", - "id": "143f5483", - "metadata": {}, - "source": [ - "**Examples:**\n", - "\n", - "Calculate the idxmin across the columns of a table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da7cbf8f", - "metadata": {}, - "outputs": [], - "source": [ - "tab.idxmin()" - ] - }, - { - "cell_type": "markdown", - "id": "fb531e00", - "metadata": {}, - "source": [ - "Calculate the idxmin across the rows of a table using only columns thar are of a numeric data type" + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `min` on that column / row. 
|" ] }, { "cell_type": "code", "execution_count": null, - "id": "9907226a", + "id": "9f13e8a7", "metadata": {}, "outputs": [], "source": [ - "tab.idxmin(axis=1, numeric_only=True)" + "tab.min()" ] }, { "cell_type": "markdown", - "id": "d98b298c", + "id": "b52627d2", "metadata": {}, "source": [ "### Table.idxmax()\n", @@ -1975,7 +1947,7 @@ }, { "cell_type": "markdown", - "id": "143f5483", + "id": "838a07dd", "metadata": {}, "source": [ "**Examples:**\n", @@ -1986,7 +1958,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da7cbf8f", + "id": "6c6d3384", "metadata": {}, "outputs": [], "source": [ @@ -1995,7 +1967,7 @@ }, { "cell_type": "markdown", - "id": "fb531e00", + "id": "30738846", "metadata": {}, "source": [ "Calculate the idxmax across the rows of a table using only columns thar are of a numeric data type" @@ -2004,7 +1976,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9907226a", + "id": "edf9e128", "metadata": {}, "outputs": [], "source": [ @@ -2013,22 +1985,22 @@ }, { "cell_type": "markdown", - "id": "301ab2c2", + "id": "fdb4114c-640a-41ac-a4e7-6c236e9d93ea", "metadata": {}, "source": [ - "### Table.min()\n", + "### Table.idxmin()\n", "\n", "```\n", - "Table.min(axis=0, skipna=True, numeric_only=False)\n", + "Table.idxmin(axis=0, skipna=True, numeric_only=False)\n", "```\n", "\n", - "Returns the minimum value across the given axis.\n", + "Return index of first occurrence of minimum over requested axis.\n", "\n", "**Parameters:**\n", "\n", "| Name | Type | Description | Default |\n", "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate the minimum across 0 is columns, 1 is rows. | 0 |\n", + "| axis | int | The axis to calculate the idxmin across 0 is columns, 1 is rows. | 0 |\n", "| skipna | bool | Ignore any null values along the axis. 
| True |\n", "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", "\n", @@ -2036,17 +2008,45 @@ "\n", "| Type | Description |\n", "| :----------------: | :------------------------------------------------------------------- |\n", - "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `min` on that column / row. |" + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `idxmin` on that column / row. |" + ] + }, + { + "cell_type": "markdown", + "id": "57053c36-932e-4805-916c-4cd5e3d33e82", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the idxmin across the columns of a table" ] }, { "cell_type": "code", "execution_count": null, - "id": "9f13e8a7", + "id": "da7cbf8f", "metadata": {}, "outputs": [], "source": [ - "tab.min()" + "tab.idxmin()" + ] + }, + { + "cell_type": "markdown", + "id": "fb531e00", + "metadata": {}, + "source": [ + "Calculate the idxmin across the rows of a table using only columns thar are of a numeric data type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9907226a", + "metadata": {}, + "outputs": [], + "source": [ + "tab.idxmin(axis=1, numeric_only=True)" ] }, {