diff --git a/docs/_static/css/awkward.css b/docs/_static/css/awkward.css index 8205cb3730..5735e42ccb 100644 --- a/docs/_static/css/awkward.css +++ b/docs/_static/css/awkward.css @@ -47,3 +47,7 @@ html[data-theme="dark"] .sd-card { margin-top: 0.15em; margin-bottom: 0.15em; } + +div.toctree-wrapper { + display: none; +} diff --git a/docs/_static/js/awkward.js b/docs/_static/js/awkward.js index cb035cee71..cb9cf116bf 100644 --- a/docs/_static/js/awkward.js +++ b/docs/_static/js/awkward.js @@ -2,4 +2,7 @@ document.addEventListener("DOMContentLoaded", function() { document.querySelectorAll('a[href="_static/try-it.html"]').forEach(a => { a.target = "_blank"; }); + document.querySelectorAll('a[href="https://awkward-array.org/doc/main/_static/try-it.html"]').forEach(a => { + a.target = "_blank"; + }); }); diff --git a/docs/_static/try-it.html b/docs/_static/try-it.html index 8c376b4d89..7988a08d87 100644 --- a/docs/_static/try-it.html +++ b/docs/_static/try-it.html @@ -2,6 +2,7 @@ + @@ -281,6 +282,9 @@ ` import numpy as np import awkward as ak + if hasattr(ak._util, "STDOUT"): + import sys + ak._util.STDOUT.stream = sys.stdout example = ak.Array([ [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], [], diff --git a/docs/_toc.yml b/docs/_toc.yml index baa4f240fb..2f4ff6663f 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -10,14 +10,15 @@ subtrees: - file: getting-started/index subtrees: - entries: -# - file: getting-started/try-awkward-array - - file: getting-started/community-tutorials + - file: getting-started/what-is-an-awkward-array + - file: getting-started/10-minutes-to-awkward-array + - file: getting-started/uproot-awkward-columnar-hats + - file: getting-started/jagged-ragged-awkward-arrays + - file: getting-started/thinking-in-arrays - file: getting-started/papers-and-talks - file: user-guide/index subtrees: - entries: - - file: user-guide/10-minutes-to-awkward-array - - file: user-guide/how-to-convert title: "Converting arrays" 
subtrees: @@ -65,30 +66,30 @@ subtrees: - file: user-guide/how-to-examine-type title: "Data type" - file: user-guide/how-to-examine-single-item - title: "Single item detail [todo]" + title: "Single item detail" - file: user-guide/how-to-examine-list-fields title: "Listing fields/keys/columns" - file: user-guide/how-to-examine-simple-slicing - title: "Simple slicing [todo]" + title: "Simple slicing" - file: user-guide/how-to-examine-checking-validity - title: "Checking validity [todo]" + title: "Checking validity" - file: user-guide/how-to-math title: "Numerical math" subtrees: - entries: - file: user-guide/how-to-math-numpy - title: "NumPy functions [todo]" + title: "NumPy functions" - file: user-guide/how-to-math-broadcasting - title: "Awkward broadcasting [todo]" + title: "Awkward broadcasting" - file: user-guide/how-to-math-reducing - title: "Reducing (sum/min/any/all) [todo]" + title: "Reducing (sum/min/any/all)" - file: user-guide/how-to-math-statistics - title: "Statistics (mean/var/std) [todo]" + title: "Statistics (mean/var/std)" - file: user-guide/how-to-math-argminmax - title: "Using argmin/argmax [todo]" + title: "Min/max/sort one array by another" - file: user-guide/how-to-math-gpu - title: "On GPUs [todo]" + title: "Awkward Arrays on GPUs" - file: user-guide/how-to-strings title: "Working with strings" @@ -106,9 +107,9 @@ subtrees: subtrees: - entries: - file: user-guide/how-to-filter-num - title: "By number of items [todo]" + title: "By number of items" - file: user-guide/how-to-filter-cut-mask - title: "Cuts vs. masks [todo]" + title: "Cuts vs. 
masks" - file: user-guide/how-to-filter-ragged title: "Using ragged arrays" - file: user-guide/how-to-filter-masked @@ -122,34 +123,30 @@ subtrees: title: "Zip/unzip and project" - file: user-guide/how-to-restructure-add-fields title: "Adding fields to records" - - file: user-guide/how-to-restructure-rename-records - title: "Renaming records [todo]" - file: user-guide/how-to-restructure-flatten title: "Flattening for plots" - file: user-guide/how-to-restructure-pad title: "Padding/clipping for machine learning" - file: user-guide/how-to-restructure-concatenate - title: "Concatenating and interleaving [todo]" - - file: user-guide/how-to-restructure-sort - title: "Sorting [todo]" + title: "Concatenating and interleaving" - file: user-guide/how-to-combinatorics title: "Combinatorics" subtrees: - entries: - file: user-guide/how-to-combinatorics-cartesian-combinations - title: 'Cartesian and "n choose k" [todo]' + title: 'Cartesian and "n choose k"' - file: user-guide/how-to-combinatorics-best-match - title: "Best match between collections [todo]" + title: "Best match between collections" - file: user-guide/how-to-use-in-numba title: "Using arrays in Numba" subtrees: - entries: + - file: user-guide/how-to-use-in-numba-intro.md + title: "Introduction" - file: user-guide/how-to-use-in-numba-features - title: "Supported features [todo]" - - file: user-guide/how-to-use-in-numba-arraybuilder - title: "Building array output [todo]" + title: "Supported features" - file: user-guide/how-to-use-in-numba-cuda title: "Working with CUDA" @@ -163,17 +160,9 @@ subtrees: title: "JIT compiling operations with C++ in cppyy" - file: user-guide/how-to-specialize - title: "Specialized behavior" + title: "Special topics" subtrees: - entries: - - file: user-guide/how-to-specialize-subclass - title: "Subclassing Array/Record [todo]" - - file: user-guide/how-to-specialize-override-numpy - title: "Overriding NumPy functions [todo]" - - file: user-guide/how-to-specialize-in-numba - title: "In 
Numba [todo]" - - file: user-guide/how-to-specialize-lorentz - title: "For physics: Lorentz vectors [todo]" - file: user-guide/how-to-specialize-differentiate-jax title: "Differentiation using JAX" diff --git a/docs/user-guide/10-minutes-to-awkward-array.md b/docs/getting-started/10-minutes-to-awkward-array.md similarity index 98% rename from docs/user-guide/10-minutes-to-awkward-array.md rename to docs/getting-started/10-minutes-to-awkward-array.md index a6b69195c2..c380224b28 100644 --- a/docs/user-guide/10-minutes-to-awkward-array.md +++ b/docs/getting-started/10-minutes-to-awkward-array.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.16.1 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -29,8 +29,12 @@ Our dataset is formatted as a 611 MB [Apache Parquet](https://parquet.apache.org Given that this file is so large, let's first look at the *metadata* with `ak.metadata_from_parquet` to see what we're working with: ```{code-cell} ipython3 -:tags: [hide-cell] - +--- +editable: true +slideshow: + slide_type: '' +tags: [hide-cell] +--- %config InteractiveShell.ast_node_interactivity = "last_expr_or_assign" ``` diff --git a/docs/getting-started/8-layer_cube.jpg b/docs/getting-started/8-layer_cube.jpg new file mode 100644 index 0000000000..08acd8f951 Binary files /dev/null and b/docs/getting-started/8-layer_cube.jpg differ diff --git a/docs/getting-started/awkward-motivation-venn-diagram.svg b/docs/getting-started/awkward-motivation-venn-diagram.svg new file mode 100644 index 0000000000..82909c93c7 --- /dev/null +++ b/docs/getting-started/awkward-motivation-venn-diagram.svg @@ -0,0 +1,112 @@ + + + + + + + + + + array-orientedinterface andperformance + arbitrary datastructures + + + + diff --git a/docs/getting-started/cartoon-cartesian.png b/docs/getting-started/cartoon-cartesian.png new file mode 120000 index 0000000000..d0bf1a45e9 --- /dev/null +++ 
b/docs/getting-started/cartoon-cartesian.png @@ -0,0 +1 @@ +../../docs-img/diagrams/cartoon-cartesian.png \ No newline at end of file diff --git a/docs/getting-started/cartoon-combinations.png b/docs/getting-started/cartoon-combinations.png new file mode 120000 index 0000000000..b5ffe4f16a --- /dev/null +++ b/docs/getting-started/cartoon-combinations.png @@ -0,0 +1 @@ +../../docs-img/diagrams/cartoon-combinations.png \ No newline at end of file diff --git a/docs/getting-started/community-tutorials.md b/docs/getting-started/community-tutorials.md deleted file mode 100644 index 75a05207e1..0000000000 --- a/docs/getting-started/community-tutorials.md +++ /dev/null @@ -1,58 +0,0 @@ -# Community tutorials - -## Jagged, ragged, Awkward Arrays -An HSF-provided tutorial aimed at High Energy Physics (HEP) researchers on using Awkward Array to obtain a dimuon-mass spectrum. - -### Format -- [{fas}`external-link-alt` Webpage](https://hsf-training.github.io/hsf-training-scikit-hep-webpage/04-awkward/index.html) with example code snippets. - -### Objectives -- Filter ragged arrays using innermost lists. -- Compute quantities on combinations of fields with {func}`ak.combinations`. -- Unzip arrays with {func}`ak.unzip`. -- Ravel (flatten) ragged arrays with {func}`ak.ravel`. - -## Loopy and unloopy programming techniques (SciPy 2022) -A tutorial presented at the SciPy conference on July 11, 2022. - -### Format -- [{fab}`github` GitHub repository](https://github.com/jpivarski-talks/2022-07-11-scipy-loopy-tutorial) - with Jupyter Notebooks that can be run on [MyBinder](https://mybinder.org/v2/gh/jpivarski-talks/2022-07-11-scipy-loopy-tutorial/v1.0?urlpath=lab/tree/narrative.ipynb). -- [{fab}`youtube` YouTube](https://www.youtube.com/watch?v=Dovyd72eD70) recording of presentation. - -### Objectives -- Load data from a remote Parquet source with {func}`ak.from_parquet`. -- Explore a complex dataset. -- Mask and slice ragged array with {func}`ak.mask`. 
-- Perform ragged reduction and broadcasting. -- Flatten ragged arrays with {func}`ak.flatten`. - -## Columnar data analysis (CoDaS-HEP 2022) -A tutorial aimed at HEP researchers, given at CODAS-HEP, to reconstruct Z masses and the Higgs mass from four leptons (4μ, 4e, 2μ2e) using Awkward Array and uproot. - -### Format -- [{fab}`github` GitHub repository](https://github.com/jpivarski-talks/2022-08-03-codas-hep-columnar-tutorial) with Jupyter Notebooks that can be run on [MyBinder](https://mybinder.org/). - -### Objectives -- Restructure/reformat arrays with {func}`ak.zip`. -- Compute kinematic quantities with [`vector`](https://github.com/scikit-hep/vector). -- Add new fields to an array. -- Explore combinatorics with {func}`ak.cartesian` and {func}`ak.combinations`. - -## Uproot-Awkward columnar HATS (2020) -Tutorials for Uproot Awkward Columnar HATS, a hands-on tutorial hosted by the [Fermilab LPC](https://lpc.fnal.gov/). - -### Format -- [{fab}`github` GitHub repository](https://github.com/jpivarski-talks/2020-06-08-uproot-awkward-columnar-hats) with Jupyter Notebooks that can be run on [MyBinder](https://mybinder.org/). - -### Objectives -- Index nested record arrays. -- Perform ragged reduction and broadcasting. -- Restructure/reformat arrays with {func}`ak.zip`. -- Write high-performance imperative routines that operate upon Awkward Arrays with Numba. -- Build Awkward Arrays imperatively with {class}`ak.ArrayBuilder`. -- Flatten ragged arrays with {func}`ak.flatten`. -- Unzip arrays with {func}`ak.unzip`. -- Explore combinatorics with {func}`ak.cartesian` and {func}`ak.combinations`. -- Mask and slice ragged array with {func}`ak.mask`. -- Explode Awkward Arrays into DataFrames with {func}`ak.pandas.df`. 
diff --git a/docs/getting-started/demo/example-reduction-sum.svg b/docs/getting-started/demo/example-reduction-sum.svg deleted file mode 120000 index da8db5cc3e..0000000000 --- a/docs/getting-started/demo/example-reduction-sum.svg +++ /dev/null @@ -1 +0,0 @@ -../../../docs-img/diagrams/example-reduction-sum.svg \ No newline at end of file diff --git a/docs/getting-started/demo/what-is-an-awkward-array.ipynb b/docs/getting-started/demo/what-is-an-awkward-array.ipynb deleted file mode 100644 index 3bde721da0..0000000000 --- a/docs/getting-started/demo/what-is-an-awkward-array.ipynb +++ /dev/null @@ -1,577 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": "
\n\n
" - }, - { - "cell_type": "markdown", - "id": "53a4d322", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "- To run the code, click on the first cell (gray box) and press Shift+Enter (or click the play button) to run each cell.\n", - "- Or, select `Run All Cells` from the `Run` menu.\n", - "- Feel free to experiment, but if you need to restore the original code, reload this browser page. Any changes you make will be lost when you reload!\n", - "- When you leave the page, you might get a \"Changes you made may not be saved\" message, which you can ignore." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f9ccda1", - "metadata": {}, - "outputs": [], - "source": [ - "# Install Awkward Array in the browser\n", - "import piplite; await piplite.install(\"awkward-cpp\"); await piplite.install(\"awkward==2.5.0\")\n", - "\n", - "# Import normal libraries\n", - "import numpy as np\n", - "import awkward as ak" - ] - }, - { - "cell_type": "markdown", - "id": "55548891", - "metadata": {}, - "source": [ - "## Versatile Arrays\n", - "Awkward Arrays are general tree-like data structures, like JSON, but contiguous in memory and operated upon with compiled, vectorized code like NumPy.\n", - "\n", - "They look like NumPy arrays:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6cec1a5", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array([1, 2, 3])" - ] - }, - { - "cell_type": "markdown", - "id": "b7a202fc", - "metadata": {}, - "source": [ - "Like NumPy, they can have multiple dimensions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63833e5f", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array([\n", - " [1, 2, 3],\n", - " [4, 5, 6]\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "eda805b4", - "metadata": {}, - "source": [ - "These dimensions can have varying lengths; arrays can be [ragged](https://en.wikipedia.org/wiki/Jagged_array):" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "id": "776cd2b8", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array([\n", - " [1, 2, 3],\n", - " [4],\n", - " [5, 6]\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "249fb561", - "metadata": {}, - "source": [ - "Each dimension can contain missing values:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1990a7c0", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array([\n", - " [1, 2, 3],\n", - " [4],\n", - " [5, 6, None]\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "d741190c", - "metadata": {}, - "source": [ - "Awkward Arrays can store _numbers_:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a25bbee", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array([\n", - " [3, 141], \n", - " [59, 26, 535], \n", - " [8]\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "1082c77a", - "metadata": {}, - "source": [ - "They can also work with _dates_:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1c1e4c6", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array(\n", - " [\n", - " [np.datetime64(\"1815-12-10\"), np.datetime64(\"1969-07-16\")],\n", - " [np.datetime64(\"1564-04-26\")],\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c8961b3e", - "metadata": {}, - "source": [ - "They can even work with _strings_:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79f9fb2b", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array(\n", - " [\n", - " [\n", - " \"Benjamin List\",\n", - " \"David MacMillan\",\n", - " ],\n", - " [\n", - " \"Emmanuelle Charpentier\",\n", - " \"Jennifer A. 
Doudna\",\n", - " ],\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "5fc94b45", - "metadata": {}, - "source": [ - "Awkward Arrays can have structure through _records_:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e544e3aa", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array(\n", - " [\n", - " [\n", - " {\"name\": \"Benjamin List\", \"age\": 53},\n", - " {\"name\": \"David MacMillan\", \"age\": 53},\n", - " ],\n", - " [\n", - " {\"name\": \"Emmanuelle Charpentier\", \"age\": 52},\n", - " {\"name\": \"Jennifer A. Doudna\", \"age\": 57},\n", - " ],\n", - " [\n", - " {\"name\": \"Akira Yoshino\", \"age\": 73},\n", - " {\"name\": \"M. Stanley Whittingham\", \"age\": 79},\n", - " {\"name\": \"John B. Goodenough\", \"age\": 98},\n", - " ],\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "5f4cb826", - "metadata": {}, - "source": [ - "In fact, Awkward Arrays can represent many kinds of jagged data. They can possess complex structures that mix records, and primitive types." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e966aee", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ak.Array(\n", - " [\n", - " [\n", - " {\n", - " \"name\": \"Benjamin List\",\n", - " \"age\": 53,\n", - " \"institutions\": [\n", - " \"University of Cologne\",\n", - " \"Max Planck Institute for Coal Research\",\n", - " \"Hokkaido University\",\n", - " ],\n", - " },\n", - " {\n", - " \"name\": \"David MacMillan\",\n", - " \"age\": 53,\n", - " \"institutions\": None,\n", - " },\n", - " ]\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f8e40f6f", - "metadata": {}, - "source": [ - "They can even contain unions!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b1dfd2f", - "metadata": {}, - "outputs": [], - "source": [ - "ak.Array(\n", - " [\n", - " [np.datetime64(\"1815-12-10\"), \"Cassini\"],\n", - " [np.datetime64(\"1564-04-26\")],\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "d1642099", - "metadata": {}, - "source": [ - "## NumPy-like interface\n", - "\n", - "Awkward Array _looks like_ NumPy. It behaves identically to NumPy for regular arrays" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3a8f2f5", - "metadata": {}, - "outputs": [], - "source": [ - "x = ak.Array([\n", - " [1, 2, 3],\n", - " [4, 5, 6]\n", - "]);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "544fb936", - "metadata": {}, - "outputs": [], - "source": [ - "ak.sum(x, axis=-1)" - ] - }, - { - "cell_type": "markdown", - "id": "e5e2bf85", - "metadata": {}, - "source": [ - "providing a similar high-level API, and implementing the [ufunc](https://numpy.org/doc/stable/reference/ufuncs.html) mechanism:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7782d9eb", - "metadata": {}, - "outputs": [], - "source": [ - "powers_of_two = ak.Array(\n", - " [\n", - " [1, 2, 4],\n", - " [None, 8],\n", - " [16],\n", - " ]\n", - ");" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9be7b61", - "metadata": {}, - "outputs": [], - "source": [ - "ak.sum(powers_of_two)" - ] - }, - { - "cell_type": "markdown", - "id": "46a564ca", - "metadata": {}, - "source": [ - "But generalises to the tricky kinds of data that NumPy struggles to work with. 
It can perform reductions through varying length lists:\n", - "\n", - "![](example-reduction-sum.svg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ab3b18a", - "metadata": {}, - "outputs": [], - "source": [ - "ak.sum(powers_of_two, axis=0)" - ] - }, - { - "cell_type": "markdown", - "id": "72006260", - "metadata": {}, - "source": [ - "## Lightweight structures\n", - "Awkward makes it east to pull apart record structures:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e962d648", - "metadata": {}, - "outputs": [], - "source": [ - "nobel_prize_winner = ak.Array(\n", - " [\n", - " [\n", - " {\"name\": \"Benjamin List\", \"age\": 53},\n", - " {\"name\": \"David MacMillan\", \"age\": 53},\n", - " ],\n", - " [\n", - " {\"name\": \"Emmanuelle Charpentier\", \"age\": 52},\n", - " {\"name\": \"Jennifer A. Doudna\", \"age\": 57},\n", - " ],\n", - " [\n", - " {\"name\": \"Akira Yoshino\", \"age\": 73},\n", - " {\"name\": \"M. Stanley Whittingham\", \"age\": 79},\n", - " {\"name\": \"John B. 
Goodenough\", \"age\": 98},\n", - " ],\n", - " ]\n", - ");" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b032fbff", - "metadata": {}, - "outputs": [], - "source": [ - "nobel_prize_winner.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be833804", - "metadata": {}, - "outputs": [], - "source": [ - "nobel_prize_winner.age" - ] - }, - { - "cell_type": "markdown", - "id": "8ee15959", - "metadata": {}, - "source": [ - "These records are lightweight, and simple to compose:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "802172ee", - "metadata": {}, - "outputs": [], - "source": [ - "nobel_prize_winner_with_birth_year = ak.zip({\n", - " \"name\": nobel_prize_winner.name,\n", - " \"age\": nobel_prize_winner.age,\n", - " \"birth_year\": 2021 - nobel_prize_winner.age\n", - "});" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa369ccd", - "metadata": {}, - "outputs": [], - "source": [ - "nobel_prize_winner_with_birth_year.show()" - ] - }, - { - "cell_type": "markdown", - "id": "7fe7fdfe", - "metadata": {}, - "source": [ - "## High performance\n", - "Like NumPy, Awkward Array performs computations in fast, optimised kernels." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d59b6173", - "metadata": {}, - "outputs": [], - "source": [ - "large_array = ak.Array([[1, 2, 3], [], [4, 5]] * 1_000_000)" - ] - }, - { - "cell_type": "markdown", - "id": "a809dcb1", - "metadata": {}, - "source": [ - "We can compute the sum in `3.37 ms ± 107 µs` on a reference CPU:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d064ae4f", - "metadata": {}, - "outputs": [], - "source": [ - "ak.sum(large_array)" - ] - }, - { - "cell_type": "markdown", - "id": "b6998bbc", - "metadata": {}, - "source": [ - "The same sum can be computed with pure-Python over the flattened array in `369 ms ± 8.07 ms`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13fcea0e", - "metadata": {}, - "outputs": [], - "source": [ - "large_flat_array = ak.ravel(large_array)\n", - "\n", - "sum(large_flat_array)" - ] - }, - { - "cell_type": "markdown", - "id": "a7bd3774", - "metadata": {}, - "source": [ - "These performance values are not benchmarks; they are only an indication of the speed of Awkward Array.\n", - "\n", - "Some problems are hard to solve with array-oriented programming. 
Awkward Array supports [Numba](https://numba.pydata.org/) out of the box:\n", - "\n", - "```python\n", - "import numba as nb\n", - "ak.numba.register_and_check()\n", - "\n", - "@nb.njit\n", - "def cumulative_sum(arr):\n", - " result = 0\n", - " for x in arr:\n", - " for y in x:\n", - " result += y\n", - " return result\n", - " \n", - "result = cumulative_sum(large_array)\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "mystnb": { - "execution_mode": "off" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/getting-started/example-reducer-2d.svg b/docs/getting-started/example-reducer-2d.svg new file mode 100644 index 0000000000..b8b8523a0f --- /dev/null +++ b/docs/getting-started/example-reducer-2d.svg @@ -0,0 +1,429 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 100 + 10 + 1 + 200 + 20 + 2 + 300 + 30 + 3 + 400 + 40 + 4 + + + + 111 + 222 + 333 + 444 + 1000 + 100 + 10 + axis=0 + axis=1 + + + + + + + + + diff --git a/docs/getting-started/example-reducer-ragged.svg b/docs/getting-started/example-reducer-ragged.svg new file mode 100644 index 0000000000..358adbbd82 --- /dev/null +++ b/docs/getting-started/example-reducer-ragged.svg @@ -0,0 +1,396 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 100 + 10 + 1 + 200 + None + 2 + 30 + 3 + 4 + + + + 111 + 202 + 33 + 4 + 300 + 40 + 10 + axis=0 + axis=1 + + + + + + + + + diff --git a/docs/getting-started/example-reduction-sum.svg b/docs/getting-started/example-reduction-sum.svg new file mode 120000 index 0000000000..66cca97961 --- /dev/null +++ b/docs/getting-started/example-reduction-sum.svg @@ -0,0 +1 @@ 
+../../docs-img/diagrams/example-reduction-sum.svg \ No newline at end of file diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md index 8a5f30dcf6..6af720698b 100644 --- a/docs/getting-started/index.md +++ b/docs/getting-started/index.md @@ -2,157 +2,632 @@ ## Installation -::::::{grid} 1 1 2 2 -:::::{grid-item-card} Working with Conda? +:::::{card} If you use pip, rip, pixi, or uv to install packages, -Awkward Array can be installed from the conda-forge channel. ```bash -conda install -c conda-forge awkward +pip install awkward ``` - ::::: -:::::{grid-item-card} Prefer pip? +:::::{card} If you use conda or mamba, it's in the conda-forge channel. -Binary wheels for Awkward Array are available on PyPI. ```bash -pip install awkward +conda install -c conda-forge awkward ``` ::::: -:::::: -## Overview +If you're installing as a developer or testing updates that haven't been released in a package manager yet, see the [developer installation instructions](https://github.com/scikit-hep/awkward/blob/main/CONTRIBUTING.md#building-and-testing-locally) in the [Contributor guide](https://github.com/scikit-hep/awkward/blob/main/CONTRIBUTING.md). + +## Tutorials + +See the left side-bar (or bring it into view by clicking on the upper-left `≡`) for tutorials that illustrate the purpose and main concepts behind Awkward Arrays. + +## Frequently asked questions + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + ::::::::{grid} 1 :::::::{grid-item} -::::::{dropdown} What kind of data does Awkward Array handle? +::::::{dropdown} What is Awkward Array for? How does it compare to other libraries? + +Python's builtin lists, dicts, and classes can be used to analyze arbitrary data structures, but at a cost in speed and memory. Therefore, they can't be used (easily) with large datasets. 
+ +[Pandas](https://pandas.pydata.org/) DataFrames (as well as [Polars](https://pola.rs/), [cuDF](https://docs.rapids.ai/api/cudf/stable/), and [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html)) are well-suited to tabular data, including tables with relational indexes, but not arbitrary data structures. If a DataFrame is filled with Python's builtin types, then it offers no speed or memory advantage over Python itself. + +[NumPy](https://numpy.org/) is ideal for rectangular arrays of numbers, but not arbitrary data structures. If a NumPy array is filled with Python's builtin types, then it offers no speed or memory advantage over Python itself. -Awkward Array is designed to make working with [_ragged_ arrays](https://en.wikipedia.org/wiki/Jagged_array) as trivial as manipulating regular (non-ragged) N-dimensional arrays in NumPy. It understands data with variable-length lists, -```pycon ->>> ak.Array([ -... [1, 2, 3], -... [4] -... ]) - +[Apache Arrow](https://arrow.apache.org/) ([pyarrow](https://arrow.apache.org/docs/python/)) manages arrays of arbitrary data structures (including those in [Polars](https://pola.rs/), [cuDF](https://docs.rapids.ai/api/cudf/stable/), and to some extent, [Pandas](https://pandas.pydata.org/)), with great language interoperability and interprocess communication, but without manipulation functions oriented toward data analysts. + +Awkward Array is a data analyst-friendly extension of NumPy-like idioms for arbitrary data structures. It is intended to be used interchangeably with NumPy and share data with Arrow and DataFrames. Like NumPy, it simplifies and accelerates computations that transform arrays into arrays—all computations over elements in an array are compiled. Also like NumPy, imperative-style computations can be accelerated with [Numba](https://numba.pydata.org/). 
+ +Note that there is also a [ragged](https://github.com/scikit-hep/ragged) array library with simpler (but still non-rectangular) data types that more closely adheres to [array APIs](https://data-apis.org/array-api/latest/API_specification). + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} Where is an Awkward Array's `shape` and `dtype`? + +Since Awkward Arrays can contain arbitrary data structures, their type can't be separated into a `shape` and a `dtype`, the way a NumPy array can. + +For an array of records like + +```python +import awkward as ak + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) ``` -missing ({data}`None`) values, -```pycon ->>> ak.Array([1, None]) - + +the `x` field contains floating point numbers and the `y` field contains lists of integers. They would have different `dtypes`, as well as different numbers of dimensions. This array also can't be separated into `x` and `y` columns with different `dtypes`, as in a DataFrame, since both fields are inside of records in a variable-length list. + +Instead, Awkward Arrays have a `type`, which looks like + +```python +3 * var * {x: float64, y: var * int64} ``` -record structures, -```pycon ->>> ak.Array([{'x': 1, 'y': 2}]) - + +for the above. This combines `shape` and `dtype` information in the following way: the length of the array is `3`, the first dimension has `var` or variable length, it contains records with `x` and `y` field names in `{` `}`, the `x` field has `float64` primitive type and the `y` field is a `var` variable length list of `int64`. You can `print(array.type)` or `array.type.show()` to see the type of any `array`. (For more, see the [DataShape language](https://datashape.readthedocs.io/).) 
+ +See the [ragged](https://github.com/scikit-hep/ragged) array library for variable-length dimensions that are nevertheless separable into a `shape` and `dtype`, like a conventional array. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I get Awkward Arrays, or read or write files of them? + +After importing Awkward Array with + +```python +import awkward as ak ``` -and even union-types! -```pycon ->>> ak.Array([1, "hi", None]) - + +the `ak.Array` constructor takes [NumPy arrays](https://numpy.org/), [CuPy arrays](https://cupy.dev/), [pyarrow arrays](https://arrow.apache.org/docs/python/), or an iterable of Python builtin lists and dicts, such as + +```python +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) ``` + +This is a shorthand for functions such as {func}`ak.from_numpy`, {func}`ak.from_cupy`, {func}`ak.from_arrow`, and {func}`ak.from_iter`, which you can call explicitly for more control. Similarly, functions like {func}`ak.to_numpy`, {func}`ak.to_cupy`, {func}`ak.to_arrow`, and {func}`ak.to_list` convert Awkward Arrays into other types of arrays, or Python lists. + +Several file formats have `ak.from_*` and `ak.to_*` functions, such as JSON, Parquet, and Feather. To read and write ROOT files, see [Uproot](https://uproot.readthedocs.io/). + +In addition, there are low-level routines, {func}`ak.from_buffers` and {func}`ak.to_buffers`, to build new file or line protocol interfaces. + :::::: ::::::: :::::::{grid-item} -::::::{dropdown} How do I read and write ragged arrays? +::::::{dropdown} How do I slice Awkward Arrays? + +Like NumPy: all [NumPy slicing rules](https://numpy.org/doc/stable/user/basics.indexing.html) are supported, with generalizations to support more data types, as well as slicing rules that have no analog in rectangular arrays; see {func}`ak.Array.__getitem__`. 
-Awkward Array provides a suite of high-level IO functions (`ak.to_*` and `ak.from_*`), such as {func}`ak.to_parquet` and {func}`ak.from_parquet` that make it simple to serialise Awkward Arrays to disk, or read ragged arrays from other formats. +Some common examples using + +```python +import awkward as ak + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +are + +* `example[0:-1]` to select a range in the first dimension, +* `example[:, 1:]` to keep all elements of the first dimension but drop the first element of each nested list, +* `example["x"]` or `example.x` to select the `x` field of all records, +* `example[example.x > 3]` to select all records in which the field `x` is greater than `3`, +* `example.y[:, :, [0, -1]]` to select field `y` and take the first (`0`) and last (`-1`) element of each list of field `y`, +* and so on. -In addition to specialised IO reading and writing routines, Awkward Arrays can also be serialised to/from a set of one dimensional buffers with the {func}`ak.to_buffers`/{func}`ak.from_buffers` functions. These buffers can then be written to/read from a wide range of existing array serialisation formats that understand NumPy arrays, e.g. {func}`numpy.savez`. :::::: ::::::: :::::::{grid-item} -::::::{dropdown} How do I see the type and shape of an array? +::::::{dropdown} How do I use NumPy functions with Awkward Arrays? -Ragged arrays do not have shapes that can be described by a collection of integers. Instead, Awkward Array uses an extended version of the [DataShape](https://datashape.readthedocs.io/en/latest/) layout language to describe the structure and type of an Array. The {attr}`ak.Array.type` attribute of an array reveals its DataShape: -```pycon ->>> array = ak.Array([[{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [2, 2]}], -... [], -...
[{"x": 3.3, "y": [3, 3, 3]}]]) ->>> array.type -3 * var * {"x": float64, "y": var * int64} +All NumPy [universal functions](https://numpy.org/doc/stable/reference/ufuncs.html) can be applied to Awkward Arrays that do not contain record structures, as well as any other functions that have Awkward equivalents, such as {func}`ak.sum`, {func}`ak.argmax`, {func}`ak.mean`, {func}`ak.sort`, and {func}`ak.concatenate`. + +For example, with + +```python +import awkward as ak +import numpy as np + +y = ak.Array([ + [[1], [1, 2], [1, 2, 3]], + [], + [[1, 2, 3, 4], [1, 2, 3, 4, 5]] +]) ``` + +you can call + +* `np.sqrt(y)` to get an array of lists of lists of the square roots of the numbers above, +* `y * 2` or `y + y` to multiply every value by 2 (which calls `np.multiply` or `np.add`, which are NumPy ufuncs), +* `np.sum(y)` to get the sum of all values, +* `np.argmax(y, axis=-1)` to get the position of the maximum value of each inner list, +* `np.mean(y, axis=0)` to get the mean of the first elements of each list, the second elements, and so on, +* `np.sort(y)` to sort lists, +* `np.concatenate((y, y))` to concatenate them, +* and so on. + :::::: ::::::: :::::::{grid-item} -::::::{dropdown} How do I select a subset of an array? +::::::{dropdown} How do I flatten a ragged array for plotting? + +{func}`ak.flatten` eliminates one level of nested lists, and {func}`ak.ravel` eliminates them all. {func}`ak.flatten` also removes missing values (`None`), which plotting libraries might not recognize. -Awkward Array extends the rich indexing syntax used by NumPy to support named fields and ragged indexing: -```pycon ->>> array = ak.Array([ -... [1, 2, 3], -... [6, 7, 8, 9] -... ]) ->>> is_even = (array % 2) == 0 ->>> array[is_even].to_list() - -``` +Depending on what you're trying to plot, selecting the first element of each list or computing the {func}`ak.sum` or {func}`ak.mean` of each list might be more meaningful. 
-Meanwhile, the {attr}`ak.Array.mask` interface makes it easy to select a subset of an array whilst preserving its structure: -```pycon ->>> array.mask[is_even].to_list() -[[None, 2, None], [4, None], [6, None, 8, None]] -``` :::::: ::::::: +:::::::{grid-item} +::::::{dropdown} How do I make ragged dimensions regular for (ML) algorithms that require it? + +The {func}`ak.to_regular` function changes the _data type_ from variable-length (`var`) to fixed-length _if_ all lists in that dimension happen to have the same length anyway. + +If you need to _change_ the data to make it conform to a rectangular shape, you can + +* slice it to the minimum {func}`ak.num` (and then use {func}`ak.to_regular` to formalize the data type as being regular) +* perform a reduction over a ragged dimension, such as {func}`ak.sum` or {func}`ak.mean`, +* {func}`ak.pad_none` to pad the lists to the maximum {func}`ak.num` and then use {func}`ak.fill_none` to replace the missing values with a value of your choice, +* use {func}`ak.pad_none` with `clip=True` to pad and clip in one step. + +:::::: +::::::: :::::::{grid-item} -::::::{dropdown} How do I reshape ragged arrays to change their dimensions? -New, regular, dimensions can be added using {data}`numpy.newaxis`, whilst {func}`ak.unflatten` can be used to introduce a new _ragged_ axis. -```pycon ->>> array = ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9]) ->>> array[:, np.newaxis] - ->>> ak.unflatten(array, [3, 2, 4]) - -``` -The {func}`ak.flatten` and {func}`ak.ravel` functions can be used to remove surplus (or all) dimensions from Awkward Arrays. -```pycon ->>> array = ak.Array([ -... [1, 2, 3], -... [6, 7, 8, 9] -... ]) ->>> ak.flatten(array, axis=1) - ->>> ak.ravel(array) - -``` +::::::{dropdown} How can I make or break records in arrays? + +Record (struct/class) data structures may come from JSON objects, Arrow Tables, Parquet columns, etc. 
This small Python dataset produces an array of lists of records: + +```python +import awkward as ak + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +Individual fields can be extracted by slicing it: `example["x"]` (`example.x`) and `example["y"]` (`example.y`), and all fields can be extracted at once with {func}`ak.unzip`: + +```python +x, y = ak.unzip(example) +``` + +The following is a particularly useful idiom, for turning an array of records into a Python dict of arrays, using both {func}`ak.fields` and {func}`ak.unzip`: + +```python +dict_of_arrays = dict(zip(ak.fields(example), ak.unzip(example))) +``` + +The opposite, {func}`ak.zip`, takes a Python dict of arrays and makes a record array: + +```python +ak.zip(dict_of_arrays) +``` + +When a set of Awkward Arrays are zipped together, it's not clear which level of nested lists should be populated with records; {func}`ak.zip` attempts to create records at the deepest level, inside of all nested lists (which might not even be possible, if the Awkward Arrays don't have the same list lengths at all levels). The `depth_limit` argument of {func}`ak.zip` controls this: + +```python +ak.zip(dict_of_arrays, depth_limit=2) +``` + +reproduces the original `example`, in which the `y` field has one more dimension than the `x` field (scalar `x` values sit beside `y` values that are lists). + :::::: ::::::: +:::::::{grid-item} +::::::{dropdown} How do I add a field to an existing record array? + +As a shorthand for {func}`ak.unzip`, add a field, and {func}`ak.zip` (see the question above), new fields can be assigned with {func}`ak.Array.__setitem__`. 
+ +For example, with + +```python +import awkward as ak + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +you can add a third field, `z` to the record with + +```python +example["z"] = example.x * 10 +``` + +Note that for assignment, the left-hand side must be expressed with square brackets, not a dot. This is to support assignment into records nested within records. + +:::::: +::::::: :::::::{grid-item} -::::::{dropdown} How do I compute reductions or summary statistics? +::::::{dropdown} Why can't I assign numerical values in an array? -Awkward Array supports NumPy's {np:doc}`reference/ufuncs` mechanism, and many of the high-level NumPy reducers (e.g. {func}`numpy.sum`). +Awkward Arrays are immutable, and almost all operations on them view parts of a data structure and only replace the parts that have changed. Therefore, with an array like -:::::{grid} 1 1 2 2 +```python +import awkward as ak -::::{grid-item} -```pycon ->>> array = ak.Array([ -... [1, 2, 4], -... [ ], -... [None, 8 ], -... [16 ] -... ]) ->>> ak.sum(array, axis=0) - ->>> ak.sum(array, axis=1) - -``` -:::: +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` -::::{grid-item} -:::{figure} ../image/example-reduction-sum-only.svg -::: -:::: +attempting to assign + +```python +example[0, "x", 0] = 999 +``` + +results in an error. (If it were allowed, it could have unpredictable consequences.) Immutability is not enforced at a very low level, so if you know what you're doing, you can deconstruct the array, view it in NumPy, Arrow, or as raw memory buffers, and change it. + +Problems that would be solved by assigning values in place can usually be solved by {func}`ak.where`. 
+ +The only kind of assignment that _is_ allowed is to add new fields, such as + +```python +example["z"] = 999 +``` + +(see the question above). This kind of assignment won't cause values in another array to change unpredictably. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I get rid of missing values (`None`)? + +Some functions, such as {func}`ak.min` or {func}`ak.max` on empty lists, produce missing values. For example, with + +```python +import awkward as ak + +x = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) +``` + +`ak.max(x, axis=1)` returns + +``` + +``` + +`None` represents a missing value (distinct from floating-point `nan`), and the `?` or `option[float64]` in the type means that values _could_ be missing. Such an array can be used in numerical calculations—missing values pass through most functions as missing values in the output—but third-party libraries might not recognize them. + +* {func}`ak.drop_none` simply removes the missing values, changing the lengths of lists and the data type to reflect the fact that no values are missing. +* {func}`ak.flatten` removes missing values in the process of flattening nested lists (it treats `None` like `[]`). +* {func}`ak.fill_none` lets you replace missing values with a specified value. +* {func}`ak.firsts` and {func}`ak.singletons` convert between representing option-type data as `option[T]` and `var * T`. In the latter, a missing value is an empty list and a non-missing value is a length-1 list. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} Why am I getting ValueError or IndexError in mathematical operations? + +Most likely, your arrays don't line up at every level of nested lists. This is a generalization of a `shape` mismatch in rectangular arrays. 
+ +For example, with + +```python +import awkward as ak + +x = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) +y = ak.Array([[1.1, 2.2, 3.3, 999], [], [4.4, 5.5]]) +``` + +an attempt to add `x + y` would fail because even though `x` and `y` have the same array length (3), the length of the first list differs (3 versus 4). + +This type of error is often more subtle than the example above. It won't happen if two arrays are derived from the same array with shape-preserving operations, but if, for instance, you remove outlier data from one array and not another, they may fail to line up somewhere in the middle of a large dataset. + +One way to avoid that is to introduce missing values (`None`) instead of removing outliers. Whereas + +```python +x[x > 2] +``` + +makes an array without values smaller than 2, + +``` + +``` + +a mask, + +```python +x.mask[x > 2] +``` + +replaces the values smaller than 2 with `None`: + +``` + +``` + +This preserves the shape of the array so that it can continue to be used in mathematical expressions. For instance, `x + x.mask[x > 2]` returns + +``` + +``` + +(the missing value propagates through to the output). + +Missing values can be dropped, using {func}`ak.drop_none`, or replaced, using {func}`ak.fill_none`, as described in the question above. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I use Awkward Array with Numba? + +Awkward Arrays can be passed into and out of functions that have been JIT-compiled with [Numba](https://numba.pydata.org/).
For example, with + +```python +import awkward as ak +import numpy as np +import numba as nb + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +A function that sums `x` in each entry (like {func}`ak.sum`) can be written in JIT-compiled imperative Python like this: + +```python +@nb.jit +def sum_over_x(array): + output = np.zeros(len(array)) + for i, list_of_records in enumerate(array): + for record in list_of_records: + output[i] += record.x + return output + +sum_over_x(example) +``` + +Since Numba JIT-compiled the function, it doesn't suffer the usual slow-down of iterating in Python. On the other hand, all variables in the function must have fixed data type and adhere to Numba's set of [supported Python features](https://numba.readthedocs.io/en/stable/reference/pysupported.html) and [supported NumPy features](https://numba.readthedocs.io/en/stable/reference/numpysupported.html) to be compiled. None of Awkward Array's `ak.*` functions can be used—only iteration over values. + +A JIT-compiled function can also return a part of the input Awkward Array: + +```python +@nb.jit +def record_in_which_y_sums_to_10(array): + for list_of_records in array: + for record in list_of_records: + if np.asarray(record.y).sum() == 10: + return record + +record_in_which_y_sums_to_10(example) +``` + +returns + +``` + +``` + +which is the `record` that has `np.asarray(record.y).sum() == 10`. (One-dimensional Awkward Arrays may be cast as NumPy arrays, to take advantage of NumPy functions.) + +Awkward Arrays are immutable inside of JIT-compiled functions, just as they are outside. To create new Awkward Arrays with Numba, use {obj}`ak.ArrayBuilder`. + +Awkward Arrays with {func}`ak.backend` equal to `"cuda"` can be passed to Numba functions on GPUs, compiled with `@nb.cuda.jit`. See {doc}`../user-guide/how-to-use-in-numba-cuda` for more. 
+ +The choice between computing inside of a Numba JIT-compiled function and outside of one is an either/or choice between imperative style in Numba (only iteration is allowed, no `ak.*` functions or fancy slices) and array-oriented style outside (iteration is slow in Python; `ak.*` functions are encouraged). + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I perform computations on arrays of spatial or momentum vectors? + +For 2-D, 3-D, and 4-D space and space-time vectors, see the [Vector](https://vector.readthedocs.io/) library. These can be used as momentum vectors for physics with a variety of coordinate transformations, including special relativity. As Awkward Arrays, each vector is a record whose fields are coordinates, such as `x`, `y`, `z` or `rho`, `phi`, `theta`. + +To enable Awkward Arrays of vectors, import Vector as + +```python +import vector +vector.register_awkward() +``` + +and now any Awkward records with an appropriate name, such as + +```python +example = ak.zip({ + "x": ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]), + "y": ak.Array([[ 1, 2, 3], [], [ 4, 5]]), + "z": ak.Array([[ 10, 20, 30], [], [ 40, 50]]) +}, with_name="Momentum3D") +``` + +is recognized as an array of vectors with methods like + +```python +example.phi +``` + +and + +```python +(2 * example).is_parallel(example) +``` + +These methods also work in Numba JIT-compiled functions. (See the above question.) + +Several array-constructing functions accept a `with_name` argument, including the {obj}`ak.Array` constructor and {func}`ak.zip`. There's also a {func}`ak.with_name` function to add a name after an array has already been created. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How would I write my own suite of functions, like Vector? + +Add new classes or functions to {obj}`ak.behavior`, which links record names to Python code.
+ +Names are strings that can be saved in files or transferred across networks, but Python code is not always serializable. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How can I delay or distribute a computation? + +Use [Dask](https://www.dask.org/). The [dask-awkward](https://dask-awkward.readthedocs.io/) library provides a new high-level collection for Awkward Arrays, similar to `dask.array` and `dask.dataframe`. + +For example, with + +```python +import awkward as ak +import dask_awkward as dak + +example = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +you can make delayed data with + +```python +dak.from_awkward(example, npartitions=1) +``` + +although it's more common to use [dak.from_parquet](https://dask-awkward.readthedocs.io/en/stable/api/generated/dask_awkward.from_parquet.html) or [uproot.dask](https://uproot.readthedocs.io/en/latest/uproot._dask.dask.html). + +Any operations on this delayed array are collected as a Directed Acyclic Graph (DAG) that is computed when you call [dask.compute](https://distributed.dask.org/en/stable/manage-computation.html). The computation may be [distributed](https://distributed.dask.org/) across multiple CPUs on one computer or across multiple computers in a network. + +The [dask-awkward project](https://github.com/dask-contrib/dask-awkward) intends to cover the same interface as Awkward Array, though there may be some functions implemented in `ak.*` that aren't in `dak.*` yet. See [dask-awkward's GitHub Issues](https://github.com/dask-contrib/dask-awkward/issues) for Dask-specific issues. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I use Awkward Array with ROOT? + +[Uproot](https://uproot.readthedocs.io/) can read and write ROOT files, and works with Awkward Arrays by default. 
+ +Also, {func}`ak.to_rdataframe` and {func}`ak.from_rdataframe` convert Awkward Arrays in memory to and from ROOT's [RDataFrame](https://root.cern/doc/master/classROOT_1_1RDataFrame.html) for computations. See {doc}`../user-guide/how-to-convert-rdataframe` for details. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I use Awkward Array with C++? + +One method is to convert Awkward Arrays to or from ROOT's [RDataFrame](https://root.cern/doc/master/classROOT_1_1RDataFrame.html) using {func}`ak.to_rdataframe` and {func}`ak.from_rdataframe`. RDataFrame supports computation in JIT-compiled C++. + +Another method is to pass Awkward Arrays into JIT-compiled C++ functions defined with [cppyy](https://cppyy.readthedocs.io/)'s [cppdef](https://cppyy.readthedocs.io/en/latest/toplevel.html#loading-c). This interface is similar to Numba, in that the JIT-compiled functions have arbitrary arguments and return values, rather than fitting into a pipeline like RDataFrame, but it also means that you need to set up the loop over entries manually and inside the compiled block. See {doc}`../user-guide/how-to-use-in-cpp-cppyy` for details. + +If you are a library developer wishing to produce and/or consume Awkward Arrays in ahead-of-time compiled code (not JIT), like [fastjet](https://github.com/scikit-hep/fastjet), you'll want to use {doc}`LayoutBuilder <../user-guide/how-to-use-header-only-layoutbuilder>`, {func}`ak.from_buffers`/{func}`ak.to_buffers`, or both. LayoutBuilder constructs an append-only array object like {obj}`ak.ArrayBuilder`, but with statically typed array type in header-only C++ that can be integrated with CMake. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I use Awkward Array with Julia? + +[AwkwardArray.jl](https://github.com/JuliaHEP/AwkwardArray.jl) is a [Julia](https://julialang.org/) implementation of Awkward Array, sharing the same memory layout.
It can therefore be used as a JIT-compilation target like Numba and C++ (see questions above), but with more flexibility: a single array data type can be used as an {obj}`ak.Array` and as an {obj}`ak.ArrayBuilder`. Whereas Pythonic Awkward Arrays are only _borrowed_ by JIT-compiled Numba or C++ (Python continues to own the memory and decide when it will be deleted), Julia's JIT-compiled environment is the entire environment, so such decisions don't need to be made. Julia can act as a producer and/or a consumer of Python Awkward Arrays. + +See the [AwkwardArray.jl documentation](https://juliahep.github.io/AwkwardArray.jl/dev/) for details. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} How do I emulate nested for-loops in Awkward Array (combinatorics)? + +In the simplest cases, imperative code like + +```python +output = [] +for x in awkward_array: + output.append(compute(x)) +``` + +can be replaced with + +```python +output = compute(awkward_array) +``` + +But some problems would be solved by imperative code like + +```python +output = [] +for x in awkward_array1: + for y in awkward_array2: + output.append(compute(x, y)) +``` + +or even + +```python +output = [] +for i, x in enumerate(awkward_array): + for j in range(i + 1, len(awkward_array)): # avoid repeating x + y = awkward_array[j] + output.append(compute(x, y)) +``` + +These cases involve combinatorics: a Cartesian product and sampling without replacement. To perform such operations at compiled speeds on Awkward Arrays, you may either + +* JIT-compile these for loops with Numba, C++, or Julia (as in the questions above), +* use Awkward Array's combinatorics primitives. + +{func}`ak.cartesian` is Awkward Array's primitive for Cartesian products: it makes an array of all pairs drawn from two (or more) provided arrays. It emulates nested, unrestricted for loops. 
+ +{func}`ak.combinations` is Awkward Array's primitive for sampling without replacement: it makes an array of all pairs drawn from an array and itself without duplicates. It emulates nested for loops that avoid repeating the same element. + +These pairs (or triples, etc.) are tuples, which are records without field names. Often, `nested=True` is a useful argument to avoid flattening the output. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} Why don't my arrays broadcast as in NumPy? + +See the last section of {doc}`../user-guide/how-to-math-broadcasting`. + +:::::: +::::::: + +:::::::{grid-item} +::::::{dropdown} What if I need more help? What if I think I've found a bug? + +After checking the tutorials on the left-bar of this Getting started guide, the User guide, and the API reference, you can ask questions about how to use a feature or solve a problem on Awkward Array's [GitHub Discussions](https://github.com/scikit-hep/awkward/discussions). + +If the behavior you're seeing looks like a bug, an error in Awkward Array itself, post some simplified code to reproduce it on Awkward Array's [GitHub Issues](https://github.com/scikit-hep/awkward/issues). -::::: :::::: ::::::: :::::::: +




diff --git a/docs/getting-started/jagged-ragged-awkward-arrays.ipynb b/docs/getting-started/jagged-ragged-awkward-arrays.ipynb new file mode 100644 index 0000000000..edfea7d053 --- /dev/null +++ b/docs/getting-started/jagged-ragged-awkward-arrays.ipynb @@ -0,0 +1,1547 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "558f5f22", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Jagged, Ragged, Awkward Arrays!" + ] + }, + { + "cell_type": "markdown", + "id": "9f76191e", + "metadata": {}, + "source": [ + "_Originally presented as [part](https://hsf-training.github.io/hsf-training-scikit-hep-webpage/04-awkward/index.html) of [HSF Scikit-HEP training on March 28, 2022](https://indico.cern.ch/event/1112526/)._" + ] + }, + { + "cell_type": "markdown", + "id": "8685b555", + "metadata": {}, + "source": [ + "


" + ] + }, + { + "cell_type": "markdown", + "id": "f8e3073f", + "metadata": {}, + "source": [ + "NumPy can't represent an array of variable-length lists without resorting to arrays of objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa4b4479", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# generates a ValueError\n", + "np.array([[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]])" + ] + }, + { + "cell_type": "markdown", + "id": "e8513978", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Awkward Array is intended to fill this gap:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8bceb0e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import awkward as ak\n", + "\n", + "ak.Array([[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]])" + ] + }, + { + "cell_type": "markdown", + "id": "b2bc2788", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Arrays like this are sometimes called \"[jagged arrays](https://en.wikipedia.org/wiki/Jagged_array)\" and sometimes \"ragged arrays.\"" + ] + }, + { + "cell_type": "markdown", + "id": "07303759", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Slicing in Awkward Array" + ] + }, + { + "cell_type": "markdown", + "id": "4d3e1679", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Basic slices are a generalization of NumPy's—what NumPy would do if it had variable-length lists." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "350fa738", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array = ak.Array([[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]])\n", + "array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efcbf09b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c9d32f6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[-1, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65e1b348", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[2:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2adb679b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[2:, 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdeea244", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "array[:, 0]" + ] + }, + { + "cell_type": "markdown", + "id": "d9c5aa3a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Quick quiz:** why does the last one raise an error?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "543260f6", + "metadata": {}, + "source": [ + "Boolean and integer slices work, too:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54783b4f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[[True, False, True, False, True]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7799c181", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[[2, 3, 3, 1]]" + ] + }, + { + "cell_type": "markdown", + "id": "2ba5a2da", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Like NumPy, boolean arrays for slices can be computed, and functions like [ak.num](https://awkward-array.readthedocs.io/en/latest/_auto/ak.num.html) are helpful for that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "528e98cf", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.num(array)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68dfaa13", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.num(array) > 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a84dc910", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[ak.num(array) > 0, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7749a21b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[ak.num(array) > 1, 1]" + ] + }, + { + "cell_type": "markdown", + "id": "e8c43b79", + "metadata": { + "editable": true, 
+ "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Now consider this (similar to an example from the first lesson):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a781f45b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cut = array * 10 % 2 == 0\n", + "cut" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bd0fa71", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array[cut]" + ] + }, + { + "cell_type": "markdown", + "id": "3fc32773", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This array, `cut`, is not just an array of booleans. It's a jagged array of booleans. All of its nested lists fit into `array`'s nested lists, so it can deeply select numbers, rather than selecting lists." + ] + }, + { + "cell_type": "markdown", + "id": "347891c0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Application: selecting particles, rather than events" + ] + }, + { + "cell_type": "markdown", + "id": "2e515351", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Returning to the big TTree from the previous lesson," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10d7130c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import uproot\n", + "\n", + "file = uproot.open(\n", + " \"https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/raw/main/data/SMHiggsToZZTo4L.root\"\n", + ")\n", + "tree = file[\"Events\"]\n", + "\n", + "muon_pt = tree[\"Muon_pt\"].array(entry_stop=10)" + ] + }, + { + "cell_type": "markdown", + "id": "20cf355e", 
+ "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This jagged array of booleans selects all *muons* with at least 20 GeV:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b24110bd", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "particle_cut = muon_pt > 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5811556c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "muon_pt[particle_cut]" + ] + }, + { + "cell_type": "markdown", + "id": "746a9944", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "and this non-jagged array of booleans (made with [ak.any](https://awkward-array.readthedocs.io/en/latest/_auto/ak.any.html)) selects all events *that have* a muon with at least 20 GeV:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "176255e3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "event_cut = ak.any(muon_pt > 20, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8589d804", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "muon_pt[event_cut]" + ] + }, + { + "cell_type": "markdown", + "id": "fb0e32ee", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Quick quiz:** construct exactly the same `event_cut` using [ak.max](https://awkward-array.readthedocs.io/en/latest/_auto/ak.max.html)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3f3472fa", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Quick quiz:** apply both cuts; that is, select muons with over 20 GeV from events that have them." + ] + }, + { + "cell_type": "markdown", + "id": "e068fb31", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Hint: you'll want to make a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2af1c7bb", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cleaned = muon_pt[particle_cut]" + ] + }, + { + "cell_type": "markdown", + "id": "ec7ce43b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "intermediary and you can't use the variable `event_cut`, as-is." + ] + }, + { + "cell_type": "markdown", + "id": "8e178dad", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Hint:** the final result should be a jagged array, just like muon_pt, but with fewer lists and fewer items in those lists." + ] + }, + { + "cell_type": "markdown", + "id": "2c896b99", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Combinatorics in Awkward Array" + ] + }, + { + "cell_type": "markdown", + "id": "6ecc3524", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Variable-length lists present more problems than just slicing and computing formulas array-at-a-time. Often, we want to combine particles in all possible pairs (within each event) to look for decay chains." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e2914e2a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Pairs from two arrays, pairs from a single array" + ] + }, + { + "cell_type": "markdown", + "id": "2ec24b29", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Awkward Array has functions that generate these combinations. For instance, [ak.cartesian](https://awkward-array.readthedocs.io/en/latest/_auto/ak.cartesian.html) takes a Cartesian product per event (when `axis=1`, the default).\n", + "\n", + "![](cartoon-cartesian.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87277899", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "numbers = ak.Array([[1, 2, 3], [], [5, 7], [11]])\n", + "letters = ak.Array([[\"a\", \"b\"], [\"c\"], [\"d\"], [\"e\", \"f\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a7d18d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pairs = ak.cartesian((numbers, letters))" + ] + }, + { + "cell_type": "markdown", + "id": "2cd52754", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "These `pairs` are 2-tuples, which are like records in how they're sliced out of an array: using strings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09e285e5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pairs[\"0\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8571883", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pairs[\"1\"]" + ] + }, + { + "cell_type": "markdown", + "id": "c06781d6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "There's also [ak.unzip](https://awkward-array.readthedocs.io/en/latest/_auto/ak.unzip.html), which extracts every field into a separate array (opposite of [ak.zip](https://awkward-array.readthedocs.io/en/latest/_auto/ak.zip.html))." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7453f65", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lefts, rights = ak.unzip(pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0326b2aa", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lefts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c119522a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rights" + ] + }, + { + "cell_type": "markdown", + "id": "d27a1313", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Note that these `lefts` and `rights` are not the original `numbers` and `letters`: they have been duplicated and have the same shape.\n", + "\n", + "The Cartesian product is equivalent to this C++ `for` loop over two collections:\n", + "\n", + "```cpp\n", + "for (int i = 0; i < 
numbers.size(); i++) {\n", + " for (int j = 0; j < letters.size(); j++) {\n", + " // compute formula with numbers[i] and letters[j]\n", + " }\n", + "}\n", + "```\n", + "\n", + "Sometimes, though, we want to find all pairs within a single collection, without repetition. That would be equivalent to this C++ `for` loop:\n", + "\n", + "```cpp\n", + "for (int i = 0; i < numbers.size(); i++) {\n", + " for (int j = i + 1; j < numbers.size(); j++) {\n", + " // compute formula with numbers[i] and numbers[j]\n", + " }\n", + "}\n", + "```\n", + "\n", + "The Awkward function for this case is [ak.combinations](https://awkward-array.readthedocs.io/en/latest/_auto/ak.combinations.html)." + ] + }, + { + "cell_type": "markdown", + "id": "ad2ba0fd", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "![cartoon-combinations](cartoon-combinations.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637cc498", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pairs = ak.combinations(numbers, 2)\n", + "pairs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6949c0a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lefts, rights = ak.unzip(pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a204ffd5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lefts * rights # they line up, so we can compute formulas" + ] + }, + { + "cell_type": "markdown", + "id": "f891b767", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Application to dimuons" + ] + }, + { + "cell_type": "markdown", + "id": "536244e3", + "metadata": { + "editable": true, + "slideshow": { +
"slide_type": "" + }, + "tags": [] + }, + "source": [ + "The dimuon search in the previous lesson was a little naive in that we required *exactly two* muons to exist in every event and only computed the mass of that combination. If a third muon were present because it's a complex electroweak decay or because something was mismeasured, we would be blind to the other two muons. They might be real dimuons.\n", + "\n", + "A better procedure would be to look for all pairs of muons in an event and apply some criteria for selecting them.\n", + "\n", + "In this example, we'll [ak.zip](https://awkward-array.readthedocs.io/en/latest/_auto/ak.zip.html) the muon variables together into records." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2319e9f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import uproot\n", + "import awkward as ak\n", + "\n", + "file = uproot.open(\n", + " \"https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/raw/main/data/SMHiggsToZZTo4L.root\"\n", + ")\n", + "tree = file[\"Events\"]\n", + "\n", + "arrays = tree.arrays(filter_name=\"/Muon_(pt|eta|phi|charge)/\", entry_stop=10000)\n", + "\n", + "muons = ak.zip(\n", + " {\n", + " \"pt\": arrays[\"Muon_pt\"],\n", + " \"eta\": arrays[\"Muon_eta\"],\n", + " \"phi\": arrays[\"Muon_phi\"],\n", + " \"charge\": arrays[\"Muon_charge\"],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fab27117", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "arrays.type.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e22e64e8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "muons.type.show()" + ] + }, + { + "cell_type": "markdown", + "id": "a9073550", + 
"metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The difference between `arrays` and `muons` is that `arrays` contains separate lists of `\"Muon_pt\"`, `\"Muon_eta\"`, `\"Muon_phi\"`, `\"Muon_charge\"`, while `muons` contains lists of records with `\"pt\"`, `\"eta\"`, `\"phi\"`, `\"charge\"` fields." + ] + }, + { + "cell_type": "markdown", + "id": "e5d22c0c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Now we can compute pairs of muon *objects*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "405cd5e4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pairs = ak.combinations(muons, 2)\n", + "pairs.type.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4d35e896", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "and separate them into arrays of the first muon and the second muon in each pair." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a696fcf", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "mu1, mu2 = ak.unzip(pairs)" + ] + }, + { + "cell_type": "markdown", + "id": "47338d3e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Quick quiz:** how would you ensure that all lists of records in `mu1` and `mu2` have the same lengths? Hint: see [ak.num](https://awkward-array.readthedocs.io/en/latest/_auto/ak.num.html) and [ak.all](https://awkward-array.readthedocs.io/en/latest/_auto/ak.all.html).\n", + "\n", + "Since they do have the same lengths, we can use them in a formula." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73af7a52", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "mass = np.sqrt(\n", + " 2 * mu1.pt * mu2.pt * (np.cosh(mu1.eta - mu2.eta) - np.cos(mu1.phi - mu2.phi))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "17af8b2f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Quick quiz:** how many masses do we have in each event? How does this compare with `muons`, `mu1`, and `mu2`?" + ] + }, + { + "cell_type": "markdown", + "id": "889c348d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Plotting the jagged array" + ] + }, + { + "cell_type": "markdown", + "id": "f353e537", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Since this `mass` is a jagged array, it can't be directly histogrammed. Histograms take a set of *numbers* as inputs, but this array contains *lists*.\n", + "\n", + "Supposing you just want to plot the numbers from the lists, you can use [ak.flatten](https://awkward-array.readthedocs.io/en/latest/_auto/ak.flatten.html) to flatten one level of list or [ak.ravel](https://awkward-array.readthedocs.io/en/latest/_auto/ak.ravel.html) to flatten all levels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a25b9e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import hist\n", + "\n", + "hist.Hist(hist.axis.Regular(120, 0, 120, label=\"mass [GeV]\")).fill(\n", + " ak.ravel(mass)\n", + ").plot()\n", + "\n", + "None" + ] + }, + { + "cell_type": "markdown", + "id": "b9455a9c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Alternatively, suppose you want to plot the *maximum* mass-candidate in each event, biasing it toward Z bosons? [ak.max](https://awkward-array.readthedocs.io/en/latest/_auto/ak.max.html) is a different function that picks one element from each list, when used with `axis=1`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02c82332", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.max(mass, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "b5837499", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Some values are `None` because there is no maximum of an empty list. [ak.flatten](https://awkward-array.readthedocs.io/en/latest/_auto/ak.flatten.html)/[ak.ravel](https://awkward-array.readthedocs.io/en/latest/_auto/ak.ravel.html) remove missing values (`None`) as well as squashing lists," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "648d3ee0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.flatten(ak.max(mass, axis=1), axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "9b17abd2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "but so does removing the empty lists in the first place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1048884", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.max(mass[ak.num(mass) > 0], axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "c1b29eee", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Exercise: select pairs of muons with opposite charges" + ] + }, + { + "cell_type": "markdown", + "id": "85a10733", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is neither an event-level cut nor a particle-level cut, it is a cut on particle *pairs*." + ] + }, + { + "cell_type": "markdown", + "id": "7c1d770f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Solution\n", + "\n", + "The `mu1` and `mu2` variables are the left and right halves of muon pairs. Therefore," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faddaa48", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cut = (mu1.charge != mu2.charge)" + ] + }, + { + "cell_type": "markdown", + "id": "91b6506e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "has the right multiplicity to be applied to the `mass` array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e8ab66f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist.Hist(hist.axis.Regular(120, 0, 120, label=\"mass [GeV]\")).fill(\n", + "\n", + " ak.ravel(mass[cut])\n", + "\n", + ").plot()\n", + "\n", + "None" + ] + }, + { + "cell_type": "markdown", + "id": "d7ae0d9e-77a0-42a7-996c-8209dc21e493", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "plots the cleaned muon pairs." + ] + }, + { + "cell_type": "markdown", + "id": "1a780f85-c53c-4322-afe7-f53aa4e08bfd", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Exercise (harder): plot the one mass candidate per event that is strictly closest to the Z mass" + ] + }, + { + "cell_type": "markdown", + "id": "24b016ea-bfe8-456c-99b7-b51d060fc8ac", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Instead of just taking the maximum mass in each event, find the one with the minimum difference between computed mass and `zmass = 91`.\n", + "\n", + "**Hint:** use [ak.argmin](https://awkward-array.readthedocs.io/en/latest/_auto/ak.argmin.html) with `keepdims=True`.\n", + "\n", + "Anticipating one of the future lessons, you could get a more accurate mass by asking the Particle library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42029417", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import particle, hepunits\n", + "\n", + "zmass = particle.Particle.findall(\"Z0\")[0].mass / hepunits.GeV" + ] + }, + { + "cell_type": "markdown", + "id": "2f36f265", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Solution\n", + 
"\n", + "Instead of maximizing `mass`, we want to minimize `abs(mass - zmass)` and apply that choice to `mass`. [ak.argmin](https://awkward-array.readthedocs.io/en/latest/_auto/ak.argmin.html) returns the *index position* of this minimum difference, which we can then apply to the original `mass`. However, without `keepdims=True`, [ak.argmin](https://awkward-array.readthedocs.io/en/latest/_auto/ak.argmin.html) removes the dimension we would need for this array to have the same nested shape as `mass`. Therefore, we pass `keepdims=True` and then use [ak.ravel](https://awkward-array.readthedocs.io/en/latest/_auto/ak.ravel.html) (or, as in the code below, [ak.flatten](https://awkward-array.readthedocs.io/en/latest/_auto/ak.flatten.html) with `axis=None`) to get rid of missing values and flatten lists.\n", + "\n", + "Without that, the last step would require two applications of [ak.flatten](https://awkward-array.readthedocs.io/en/latest/_auto/ak.flatten.html): one for squashing lists at the first level and another for removing `None` at the second level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9865f55e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "which = ak.argmin(abs(mass - zmass), axis=1, keepdims=True)\n", + "\n", + "hist.Hist(hist.axis.Regular(120, 0, 120, label=\"mass [GeV]\")).fill(\n", + "\n", + " ak.flatten(mass[which], axis=None)\n", + "\n", + ").plot()\n", + "\n", + "None" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/getting-started/logo-arrow.svg b/docs/getting-started/logo-arrow.svg new
file mode 100644 index 0000000000..a72ef97bb5 --- /dev/null +++ b/docs/getting-started/logo-arrow.svg @@ -0,0 +1,96 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + diff --git a/docs/getting-started/logo-awkward.svg b/docs/getting-started/logo-awkward.svg new file mode 100644 index 0000000000..41281df94e --- /dev/null +++ b/docs/getting-started/logo-awkward.svg @@ -0,0 +1,152 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/getting-started/logo-parquet.svg b/docs/getting-started/logo-parquet.svg new file mode 100644 index 0000000000..cc7ee3b363 --- /dev/null +++ b/docs/getting-started/logo-parquet.svg @@ -0,0 +1,17 @@ + + + + Apache Parquet logo + + + + image/svg+xml + + Apache Parquet logo + + + + + + + diff --git a/docs/getting-started/papers-and-talks.md b/docs/getting-started/papers-and-talks.md index d7c41f60b1..3c56c433d8 100644 --- a/docs/getting-started/papers-and-talks.md +++ b/docs/getting-started/papers-and-talks.md @@ -4,27 +4,41 @@ * [Awkward Arrays in Python, C++, and Numba](https://doi.org/10.1051/epjconf/202024505023) (published in EPJ Web of Conferences; [arXiv](https://arxiv.org/abs/2001.06307)) on July 2, 2020. * [AwkwardForth: accelerating Uproot with an internal DSL](https://doi.org/10.1051/epjconf/202125103002) (published in EPJ Web of Conferences; [arXiv](https://arxiv.org/abs/2102.13516) on February 24, 2021. - * [An array-oriented Python interface for FastJet](https://doi.org/10.48550/arXiv.2202.03911) (to be published in ACAT 2021 proceedings; [arXiv](https://arxiv.org/abs/2202.03911)) on February 8, 2022. + * [An array-oriented Python interface for FastJet](https://doi.org/10.1088/1742-6596/2438/1/012011) (published in ACAT 2021 proceedings; [arXiv](https://arxiv.org/abs/2202.03911)) on February 8, 2022. 
+ * [Awkward to RDataFrame and back](https://doi.org/10.48550/arXiv.2302.09860) (to be published in ACAT 2022 proceedings; [arXiv](https://arxiv.org/abs/2302.09860)) on February 20, 2023. + * [Using a DSL to read ROOT TTrees faster in Uproot](https://doi.org/10.48550/arXiv.2303.02202) (to be published in ACAT 2022 proceedings; [arXiv](https://arxiv.org/abs/2303.02202)) on March 3, 2023. + * [The Awkward World of Python and C++](https://doi.org/10.48550/arXiv.2303.02205) (to be published in ACAT 2022 proceedings; [arXiv](https://arxiv.org/abs/2303.02205)) on March 3, 2023. + * [Awkward Just-In-Time (JIT) Compilation: A Developer's Experience](https://doi.org/10.48550/arXiv.2310.01461) (to be published in CHEP 2023 proceedings; [arXiv](https://arxiv.org/abs/2310.01461)) on October 2, 2023. ## Talks - * [StrangeLoop talk](https://youtu.be/2NxWpU7NArk) (video) on September 14, 2019. - * [PyHEP talk](https://indico.cern.ch/event/833895/contributions/3577882) on October 17, 2019. - * [CHEP talk](https://indico.cern.ch/event/773049/contributions/3473258) on November 7, 2019. - * [Summary poster](https://github.com/jpivarski/2020-02-27-irishep-poster/blob/master/pivarski-irishep-poster.pdf) on February 27, 2020. - * [Presentation at SciPy 2020](https://youtu.be/WlnUF3LRBj4) (video) on July 5, 2020. - * [Lessons learned in Python-C++ integration](https://indico.cern.ch/event/855454/contributions/4605044/) ([video](https://videos.cern.ch/record/2295164) and [slides](https://indico.cern.ch/event/855454/contributions/4605044/attachments/2349193/4006676/main.pdf)) on December 1, 2021. This talk describes the motivation for Awkward version 2.0. - * [Awkward Array updates](https://indico.cern.ch/event/1140031/) on April 6, 2022. A demonstration and overview of Awkward version 2.0. - * [Summary poster](https://github.com/jpivarski-talks/2022-07-25-cssi-meeting-poster/blob/main/pivarski-awkward-cssi-poster.pdf) on July 5, 2022.
+ * [StrangeLoop talk](https://www.thestrangeloop.com/2019/jagged-ragged-awkward-arrays.html) ([video](https://youtu.be/2NxWpU7NArk)) on September 14, 2019: motivation for Awkward Arrays. + * [PyHEP talk](https://indico.cern.ch/event/833895/contributions/3577882) on October 17, 2019: motivation for Awkward 1.0. + * [CHEP talk](https://indico.cern.ch/event/773049/contributions/3473258) on November 7, 2019: architecture of Awkward 1.0. + * [Summary poster](https://github.com/jpivarski/2020-02-27-irishep-poster/blob/master/pivarski-irishep-poster.pdf) on February 27, 2020: overview. + * [Presentation at SciPy 2020](http://conference.scipy.org.s3-website-us-east-1.amazonaws.com/proceedings/scipy2020/jim_pivarski.html) ([video](https://youtu.be/WlnUF3LRBj4)) on July 5, 2020: introduction beyond HEP. + * [Lessons learned in Python-C++ integration](https://indico.cern.ch/event/855454/contributions/4605044/) ([video](https://videos.cern.ch/record/2295164)) on December 1, 2021: the motivation for Awkward version 2.0. + * [Awkward Array updates](https://indico.cern.ch/event/1140031/) on April 6, 2022: an overview of Awkward version 2.0. + * [CSSI summary (poster)](https://github.com/jpivarski-talks/2022-07-25-cssi-meeting-poster/blob/main/pivarski-awkward-cssi-poster.pdf) on July 5, 2022: applications beyond HEP. + * [All about AwkwardForth](https://github.com/jpivarski-talks/2023-03-06-awkwardforth-for-atlas/blob/main/main.pdf) on March 6, 2023: how & why of the AwkwardForth minilanguage. + * [Awkward Array in Tiled](https://indico.cern.ch/event/1311780/) on October 25, 2023: integration with the Tiled database. + * [Awkward Array in Julia](https://indico.cern.ch/event/1292759/contributions/5613061/) on November 6, 2023: AwkwardArray.jl. + * [Awkward family (poster)](https://github.com/jpivarski-talks/2024-03-11-acat2024-awkward-family-poster/blob/main/pivarski-acat2024-poster.pdf) on March 11, 2024: the ecosystem built around Awkward Array. 
## Tutorials -(In addition to the ones in {doc}`community-tutorials`.) - - * [Demo for Electron Ion Collider users](https://github.com/jpivarski/2020-04-08-eic-jlab#readme) ([video](https://www.youtube.com/watch?v=FoxNS6nlbD0)) on April 8, 2020. - * [Tutorial at PyHEP 2020](https://youtu.be/ea-zYLQBS4U) (video with [interactive notebook on Binder](https://mybinder.org/v2/gh/jpivarski/2020-07-13-pyhep2020-tutorial.git/1.1?urlpath=lab/tree/tutorial.ipynb)) on July 13, 2020. - * [Tutorial at PyHEP 2021](https://youtu.be/5aWAxvdrszw?t=9189) (video with [interactive notebook on Binder](https://mybinder.org/v2/gh/jpivarski-talks/2021-07-06-pyhep-uproot-awkward-tutorial/v1.2?urlpath=lab/tree/uproot-awkward-tutorial.ipynb) on July 6, 2021. - * [Tutorial for STAR collaboration meeting](https://youtu.be/NnU_zp5s1MY) on September 13, 2021 (video with [notebooks on GitHub](https://github.com/jpivarski-talks/2021-09-13-star-uproot-awkward-tutorial#readme)). This is the first tutorial with extensive exercises to test your understanding. + * [Tutorial for the Electron Ion Collider (EIC)](https://github.com/jpivarski-talks/2020-04-08-eic-jlab) ([agenda](https://indico.bnl.gov/event/8242/), [video](https://www.youtube.com/watch?v=FoxNS6nlbD0)) on April 8, 2020. + * [Uproot Awkward Columnar HATS (2020)](https://github.com/jpivarski-talks/2020-06-08-uproot-awkward-columnar-hats) ([agenda](https://indico.cern.ch/event/917675)) on June 8, 2020. + * [Tutorial at PyHEP 2020](https://github.com/jpivarski-talks/2020-07-13-pyhep2020-tutorial) ([agenda](https://indico.cern.ch/event/882824/timetable), [video](https://youtu.be/ea-zYLQBS4U)) on July 13, 2020. + * [PyHEP Numba tutorial](https://github.com/jpivarski-talks/2021-02-03-pyhep-numba-tutorial) ([agenda](https://indico.cern.ch/event/985350/), [video](https://youtu.be/X_BJrmofRWQ)) on February 3, 2021. 
+ * [Uproot Awkward Columnar HATS (2021)](https://github.com/jpivarski-talks/2021-06-14-uproot-awkward-columnar-hats) ([agenda](https://indico.cern.ch/event/1042866/)) on June 14, 2021. + * [Tutorial at PyHEP 2021](https://github.com/jpivarski-talks/2021-07-06-pyhep-uproot-awkward-tutorial) ([agenda](https://indico.cern.ch/event/1019958/timetable/#20210705.detailed), [video](https://youtu.be/s47Nz0h0vcg)) on July 6, 2021. + * [Tutorial for the STAR collaboration](https://github.com/jpivarski-talks/2021-09-13-star-uproot-awkward-tutorial) ([video](https://youtu.be/NnU_zp5s1MY)) on September 13, 2021. + * [Loopy and Unloopy programming techniques (SciPy 2022)](https://github.com/jpivarski-talks/2022-07-11-scipy-loopy-tutorial) ([agenda](https://www.scipy2022.scipy.org/tutorials-schedule), [video](https://youtu.be/Dovyd72eD70)) on July 11, 2022. + * [CoDaS-HEP Columnar Analysis](https://github.com/jpivarski-talks/2022-08-03-codas-hep-columnar-tutorial) ([agenda](https://indico.cern.ch/event/1151367/timetable/#41-columnar-data-analysis)) on August 3, 2022. + * [HSF-India Mumbai](https://github.com/jpivarski-talks/2023-05-01-hsf-india-tutorial) ([agenda](https://indico.cern.ch/event/1254939/)) on May 1, 2023. + * [Thinking in Arrays (SciPy 2023)](https://github.com/jpivarski-talks/2023-07-11-scipy2023-tutorial-thinking-in-arrays) ([agenda](https://www.scipy2023.scipy.org/schedule/), [video](https://youtu.be/d7etLJeK20M?si=m9b3YttCtz8nP31g)) on July 11, 2023. + * [HSF-India Bhubaneswar](https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar) ([agenda](https://indico.cern.ch/event/1328624/)) on December 18, 2023. 
## Citations diff --git a/docs/getting-started/pivarski-one-slide-summary.svg b/docs/getting-started/pivarski-one-slide-summary.svg new file mode 100644 index 0000000000..4a049a2461 --- /dev/null +++ b/docs/getting-started/pivarski-one-slide-summary.svg @@ -0,0 +1,1271 @@ + + + +image/svg+xmloutput += [] +for +sublist +in +python_objects +: +tmp1 += [] +for +record +in +sublist +: +tmp2 += [] +for +number +in +record +[ +"y" +][ +1: +]: +tmp2 +. +append +( +np +.square +( +number +)) +tmp1 +. +append +( +tmp2 +) +output +. +append +( +tmp1 +) +output += +np +. +square +( +array +[ +"y" +, +... +, +1: +])[ [[], [ +4 +], [ +4 +, +9 +]], [], [[ +4 +, +9 +, +16 +], [ +4 +, +9 +, +16 +, +25 +]]] +(single-threaded on a 2.2 GHz processor with adataset 10 million times larger than the one shown) +array += +ak +. +Array +([ [{ +"x" +: +1.1 +, +"y" +: [ +1 +]}, { +"x" +: +2.2 +, +"y" +: [ +1 +, +2 +]}, { +"x" +: +3.3 +, +"y" +: [ +1 +, +2 +, +3 +]}], [], [{ +"x" +: +4.4 +, +"y" +: [ +1 +, +2 +, +3 +, +4 +]}, { +"x" +: +5.5 +, +"y" +: [ +1 +, +2 +, +3 +, +4 +, +5 +]}]]) +138 seconds to run (22 GB footprint)4.6 seconds to run (2 GB footprint) +equivalent PythonNumPy-like expression + \ No newline at end of file diff --git a/docs/getting-started/thinking-in-arrays.ipynb b/docs/getting-started/thinking-in-arrays.ipynb new file mode 100644 index 0000000000..d1476cb9dc --- /dev/null +++ b/docs/getting-started/thinking-in-arrays.ipynb @@ -0,0 +1,2033 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b9607e38-b639-4670-91f3-7f9a3c767402", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Thinking in arrays" + ] + }, + { + "cell_type": "markdown", + "id": "df28b3b5-449d-4ff5-8352-66f3d9f5ec1c", + "metadata": {}, + "source": [ + "_Originally presented as [part](https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/blob/main/lesson-3-awkward/lecture-slides.ipynb) of [HSF-India training on 
December 18, 2023](https://indico.cern.ch/event/1328624/)._" + ] + }, + { + "cell_type": "markdown", + "id": "81f1b565-ecc3-44be-a885-01010182f7ad", + "metadata": {}, + "source": [ + "


" + ] + }, + { + "cell_type": "markdown", + "id": "bf8f76ed-e777-4e58-83c9-a37698afdc81", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "So far, all the arrays we've dealt with have been rectangular (in $n$ dimensions; \"rectilinear\").\n", + "\n", + "![](8-layer_cube.jpg)" + ] + }, + { + "cell_type": "markdown", + "id": "f109c5fc-f23a-434a-b5ed-27b5ce7a1d17", + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "What if we had data like this?\n", + "\n", + "```json\n", + "[\n", + " [[1.84, 0.324]],\n", + " [[-1.609, -0.713, 0.005], [0.953, -0.993, 0.011, 0.718]],\n", + " [[0.459, -1.517, 1.545], [0.33, 0.292]],\n", + " [[-0.376, -1.46, -0.206], [0.65, 1.278]],\n", + " [[], [], [1.617]],\n", + " []\n", + "]\n", + "[\n", + " [[-0.106, 0.611]],\n", + " [[0.118, -1.788, 0.794, 0.658], [-0.105]]\n", + "]\n", + "[\n", + " [[-0.384], [0.697, -0.856]],\n", + " [[0.778, 0.023, -1.455, -2.289], [-0.67], [1.153, -1.669, 0.305, 1.517, -0.292]]\n", + "]\n", + "[\n", + " [[0.205, -0.355], [-0.265], [1.042]],\n", + " [[-0.004], [-1.167, -0.054, 0.726, 0.213]],\n", + " [[1.741, -0.199, 0.827]]\n", + "]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "8ee85cb4-c8fe-457b-99b4-b07ccf60e6bb", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "What if we had data like this?\n", + "\n", + "```json\n", + "[\n", + " {\"fill\": \"#b1b1b1\", \"stroke\": \"none\", \"points\": [{\"x\": 5.27453, \"y\": 1.03276},\n", + " {\"x\": -3.51280, \"y\": 1.74849}]},\n", + " {\"fill\": \"#b1b1b1\", \"stroke\": \"none\", \"points\": [{\"x\": 8.21630, \"y\": 4.07844},\n", + " {\"x\": -0.79157, \"y\": 3.49478}, {\"x\": 16.38932, \"y\": 5.29399},\n", + " {\"x\": 10.38641, \"y\": 0.10832}, {\"x\": -2.07070, \"y\": 14.07140},\n", + " {\"x\": 9.57021, \"y\": -0.94823}, {\"x\": 1.97332, \"y\": 3.62380},\n", + " {\"x\":
5.66760, \"y\": 11.38001}, {\"x\": 0.25497, \"y\": 3.39276},\n", + " {\"x\": 3.86585, \"y\": 6.22051}, {\"x\": -0.67393, \"y\": 2.20572}]},\n", + " {\"fill\": \"#d0d0ff\", \"stroke\": \"none\", \"points\": [{\"x\": 3.59528, \"y\": 7.37191},\n", + " {\"x\": 0.59192, \"y\": 2.91503}, {\"x\": 4.02932, \"y\": -1.13601},\n", + " {\"x\": -1.01593, \"y\": 1.95894}, {\"x\": 1.03666, \"y\": 0.05251}]},\n", + " {\"fill\": \"#d0d0ff\", \"stroke\": \"none\", \"points\": [{\"x\": -8.78510, \"y\": -0.00497},\n", + " {\"x\": -15.22688, \"y\": 3.90244}, {\"x\": 5.74593, \"y\": 4.12718}]},\n", + " {\"fill\": \"none\", \"stroke\": \"#000000\", \"points\": [{\"x\": 4.40625, \"y\": -6.953125},\n", + " {\"x\": 4.34375, \"y\": -7.09375}, {\"x\": 4.3125, \"y\": -7.140625},\n", + " {\"x\": 4.140625, \"y\": -7.140625}]},\n", + " {\"fill\": \"none\", \"stroke\": \"#808080\", \"points\": [{\"x\": 0.46875, \"y\": -0.09375},\n", + " {\"x\": 0.46875, \"y\": -0.078125}, {\"x\": 0.46875, \"y\": 0.53125}]}\n", + "]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "a4a85956-6430-497f-8357-9741d602a6df", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "What if we had data like this?\n", + "\n", + "```json\n", + "[\n", + " {\"movie\": \"Evil Dead\", \"year\": 1981, \"actors\":\n", + " [\"Bruce Campbell\", \"Ellen Sandweiss\", \"Richard DeManincor\", \"Betsy Baker\"]\n", + " },\n", + " {\"movie\": \"Darkman\", \"year\": 1900, \"actors\":\n", + " [\"Liam Neeson\", \"Frances McDormand\", \"Larry Drake\", \"Bruce Campbell\"]\n", + " },\n", + " {\"movie\": \"Army of Darkness\", \"year\": 1992, \"actors\":\n", + " [\"Bruce Campbell\", \"Embeth Davidtz\", \"Marcus Gilbert\", \"Bridget Fonda\",\n", + " \"Ted Raimi\", \"Patricia Tallman\"]\n", + " },\n", + " {\"movie\": \"A Simple Plan\", \"year\": 1998, \"actors\":\n", + " [\"Bill Paxton\", \"Billy Bob Thornton\", \"Bridget Fonda\", \"Brent Briscoe\"]\n", + " },\n", + " 
{\"movie\": \"Spider-Man 2\", \"year\": 2004, \"actors\":\n", + " [\"Tobey Maguire\", \"Kristen Dunst\", \"Alfred Molina\", \"James Franco\",\n", + " \"Rosemary Harris\", \"J.K. Simmons\", \"Stan Lee\", \"Bruce Campbell\"]\n", + " },\n", + " {\"movie\": \"Drag Me to Hell\", \"year\": 2009, \"actors\":\n", + " [\"Alison Lohman\", \"Justin Long\", \"Lorna Raver\", \"Dileep Rao\", \"David Paymer\"]\n", + " }\n", + "]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "704a1603-85f5-4680-a9b3-28137f2e7cea", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "What if we had data like this?\n", + "\n", + "```json\n", + "[\n", + " {\"run\": 1, \"luminosityBlock\": 156, \"event\": 46501,\n", + " \"PV\": {\"x\": 0.243, \"y\": 0.393, \"z\": 1.451},\n", + " \"electron\": [],\n", + " \"muon\": [\n", + " {\"pt\": 63.043, \"eta\": -0.718, \"phi\": 2.968, \"mass\": 0.105, \"charge\": 1},\n", + " {\"pt\": 38.120, \"eta\": -0.879, \"phi\": -1.032, \"mass\": 0.105, \"charge\": -1},\n", + " {\"pt\": 4.048, \"eta\": -0.320, \"phi\": 1.038, \"mass\": 0.105, \"charge\": 1}\n", + " ],\n", + " \"MET\": {\"pt\": 21.929, \"phi\": -2.730}\n", + " },\n", + " {\"run\": 1, \"luminosityBlock\": 156, \"event\": 46502,\n", + " \"PV\": {\"x\": 0.244, \"y\": 0.395, \"z\": -2.879},\n", + " \"electron\": [\n", + " {\"pt\": 21.902, \"eta\": -0.702, \"phi\": 0.133, \"mass\": 0.005, \"charge\": 1},\n", + " {\"pt\": 42.632, \"eta\": -0.979, \"phi\": -1.863, \"mass\": 0.008, \"charge\": 1},\n", + " {\"pt\": 78.012, \"eta\": -0.933, \"phi\": -2.207, \"mass\": 0.018, \"charge\": -1},\n", + " {\"pt\": 23.835, \"eta\": -1.362, \"phi\": -0.621, \"mass\": 0.008, \"charge\": -1}\n", + " ],\n", + " \"muon\": [],\n", + " \"MET\": {\"pt\": 16.972, \"phi\": 2.866}},\n", + " ...\n", + "]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "bcf2ffb7-dba0-477d-933f-ce8d178db643", + "metadata": { + "editable": true, + "slideshow": { + 
"slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "It might be possible to turn these datasets into tabular form using surrogate keys and database normalization, but\n", + "\n", + " * they could be inconvenient or less efficient in that form, depending on what we want to do,\n", + " * they were very likely _given_ in a ragged/untidy form. You can't ignore the data-cleaning step!" + ] + }, + { + "cell_type": "markdown", + "id": "22e1411d-4ed6-4fe3-8be6-a63e2125162a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "308210c5-eeda-4a3b-a420-a097a25d3248", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Dealing with these datasets as JSON or Python objects is inefficient for the same reason as for lists of numbers." + ] + }, + { + "cell_type": "markdown", + "id": "44c2b36f-3cba-4cef-a821-b2647c497070", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "9871edb0-31fe-485a-a8e5-922a6dc1e187", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We want arbitrary data structure with array-oriented interface and performance..." + ] + }, + { + "cell_type": "markdown", + "id": "c579226d-b2b7-4519-8b6a-c160597db348", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "![](awkward-motivation-venn-diagram.svg)" + ] + }, + { + "cell_type": "markdown", + "id": "a6590eff-e876-4c50-811d-51b870541911", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "## Libraries for irregular arrays" + ] + }, + { + "cell_type": "markdown", + "id": "f3df31d7-0185-414f-9c31-7e08a88fdce3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "c7aa6b97-c9ff-4b3c-8237-4888eb0a5f35", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "![](logo-arrow.svg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "853cd417-4bd2-413c-83d2-4bfb6b15d1c7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pyarrow as pa" + ] + }, + { + "cell_type": "markdown", + "id": "7c75130d-065b-4ed7-a8d3-df4b83eb6e8f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2233d1b-d680-4ee0-9d84-351422128a77", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "arrow_array = pa.array([\n", + " [{\"x\": 1.1, \"y\": [1]}, {\"x\": 2.2, \"y\": [1, 2]}, {\"x\": 3.3, \"y\": [1, 2, 3]}],\n", + " [],\n", + " [{\"x\": 4.4, \"y\": [1, 2, 3, 4]}, {\"x\": 5.5, \"y\": [1, 2, 3, 4, 5]}]\n", + "])" + ] + }, + { + "cell_type": "markdown", + "id": "f4aa8cb3-e350-4fbc-b5a2-76ad1d409c00", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc80bf50-aea5-41df-b5f0-9a35d2bae75f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "arrow_array.type" + ] + }, + { + "cell_type": "markdown", + "id": "c796e4b8-aa92-44aa-bb2e-8742550dc88a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e77c680-bd02-45c2-a899-e8af53bfcafc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "arrow_array" + ] + }, + { + "cell_type": "markdown", + "id": "830211c5-e747-4bdb-9beb-d90a07895f10", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "6e5b9cd8-cce1-427a-a97d-3707f9e2016a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "![](logo-awkward.svg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77739aec-06d9-4164-aa9b-86075c86874e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import awkward as ak" + ] + }, + { + "cell_type": "markdown", + "id": "c6212da5-dbdc-43ac-9859-fba6b7c66827", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87d9dff3-3107-45b0-b636-ae840e4ed638", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "awkward_array = ak.from_arrow(arrow_array)\n", + "awkward_array" + ] + }, + { + "cell_type": "markdown", + "id": "5ec486a3-44fd-436e-ac1d-eab6ddbe8c08", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "fdd4ce8c-3e17-44c3-9f7b-b18ac5e2449d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "![](logo-parquet.svg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "698e3514-659e-4392-a0cd-79ca4741449e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.to_parquet(awkward_array, \"/tmp/file.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "5e9be1a7-0ad7-4f25-a082-fd6ece2de7e7", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa676558-b980-42ef-bdc8-f7a2d9248684", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ak.from_parquet(\"/tmp/file.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "b49f0d23-016b-440c-8b25-f0fdd859ba5a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "## Awkward Array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ed60a0e-315f-4f39-917e-b23f998fffe1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ragged = ak.Array([\n", + " [\n", + " [[1.84, 0.324]],\n", + " [[-1.609, -0.713, 0.005], [0.953, -0.993, 0.011, 0.718]],\n", + " [[0.459, -1.517, 1.545], [0.33, 0.292]],\n", + " [[-0.376, -1.46, -0.206], [0.65, 1.278]],\n", + " [[], [], [1.617]],\n", + " []\n", + " ],\n", + " [\n", + " [[-0.106, 0.611]],\n", + " [[0.118, -1.788, 0.794, 0.658], [-0.105]]\n", + " ],\n", + " [\n", + " [[-0.384], [0.697, -0.856]],\n", + " [[0.778, 0.023, -1.455, -2.289], [-0.67], [1.153, -1.669, 0.305, 1.517, -0.292]]\n", + " ],\n", + " [\n", + " [[0.205, -0.355], [-0.265], [1.042]],\n", + " [[-0.004], [-1.167, -0.054, 0.726, 0.213]],\n", + " [[1.741, -0.199, 0.827]]\n", + " ]\n", + "])" + ] + }, + { + "cell_type": "markdown", + "id": "e2bc442c-dedb-4aa6-be20-bc7c5e47bab3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Multidimensional indexing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7165ea-fff7-4035-b18d-c05a2c2ca97d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ragged[3, 1, -1, 2]" + ] + }, + { + "cell_type": "markdown", + "id": "935b1dba-34ef-4e07-b175-be8b754010ef", + "metadata": {}, + "source": [ + "
\n", + "\n", + "**Basic slicing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ac1c3e3-741d-4855-9636-9e98f6690fa5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ragged[3, 1:, -1, 1:3]" + ] + }, + { + "cell_type": "markdown", + "id": "4e382ae9-81db-486f-9078-22295781c739", + "metadata": {}, + "source": [ + "
\n", + "\n", + "**Advanced slicing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6322cb65-eb27-45e6-abc5-9f35f81661cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ragged[[False, False, True, True], [0, -1, 0, -1], 0, -1]" + ] + }, + { + "cell_type": "markdown", + "id": "0adeb6c9-65c4-4d35-85f4-91f986b7291d", + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Awkward slicing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13105929-1f22-472a-9138-44ec73d52231", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ragged > 0" + ] + }, + { + "cell_type": "markdown", + "id": "dd809703-6d16-42b6-9d89-63d4e72bc40c", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55ea9417-62f6-4223-be2b-26e1abc4c42a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ragged[ragged > 0]" + ] + }, + { + "cell_type": "markdown", + "id": "9f9ee115-1343-469a-93ff-2cf50a432b7c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Reductions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d105c3cf-5b55-44cd-a163-b5d27767d227", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.sum(ragged)" + ] + }, + { + "cell_type": "markdown", + "id": "9f003781-8876-4d4e-ad14-851f7d40fd9e", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f7d467e-1fc2-4aed-8721-32eb411a1e6a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ak.sum(ragged, axis=-1)" + ] + }, + { + "cell_type": "markdown", + "id": "acdc2d38-2f6b-4531-893d-fbc189d54c64", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b7c68f-a1d3-41d0-9ca1-58e607753186", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.sum(ragged, axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "8f37d2af-c67e-4853-b672-793eee0763c1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "How do we even define reductions on an array with variable length lists?" + ] + }, + { + "cell_type": "markdown", + "id": "82441d9a-3242-4b21-b486-6ef36e35a69f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "![](example-reducer-2d.svg)" + ] + }, + { + "cell_type": "markdown", + "id": "d11a1d81-20f2-45fe-8767-09d95fa9e1c9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "How do we even define reductions on an array with variable length lists?" + ] + }, + { + "cell_type": "markdown", + "id": "f1377dfb-acd8-4a6f-aa3b-47b544c90645", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "![](example-reducer-ragged.svg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdff4345-b3a2-453a-9d8b-d47d03aca1e9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "array = ak.Array([[ 1, 2, 3, 4],\n", + " [ 10, None, 30 ],\n", + " [ 100, 200 ]])" + ] + }, + { + "cell_type": "markdown", + "id": "7474589a-4b79-4baa-98de-7d6c457a43d1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac7395e3-dbe9-479f-b595-a37fcb8cc977", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.sum(array, axis=0).tolist()" + ] + }, + { + "cell_type": "markdown", + "id": "ad53c2cb-b550-4158-8e44-11303e085050", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36727ca1-d85d-42aa-968e-04a4f7eb98ce", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.sum(array, axis=1).tolist()" + ] + }, + { + "cell_type": "markdown", + "id": "473f41e3-2c52-48de-8536-33210b5bca13", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "72050082-e01c-4eb2-84b7-220a85fff881", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "source": [ + "(You almost always want the deepest/maximum `axis`, which you can get with `axis=-1`.)" + ] + }, + { + "cell_type": "markdown", + "id": "3f335dcf-b128-4b4d-bbdb-dae52c43c03b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "bd72b617-a540-4ff8-9faa-cfc24befda51", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "### Awkward Arrays in particle physics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c8b9fb5-d73d-4689-b8c1-46e11bb73f5f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import uproot\n", + "\n", + "file = uproot.open(\"https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/raw/main/data/SMHiggsToZZTo4L.root\")\n", + "file" + ] + }, + { + "cell_type": "markdown", + "id": "c1a51aef-3bb9-4c25-8af7-388f0732bdef", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b25cb34b-ff02-4726-88b3-b33b51cc3a81", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tree = file[\"Events\"]\n", + "tree" + ] + }, + { + "cell_type": "markdown", + "id": "42fec548-3363-4eb1-8297-5a7e130815d1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bb09f4f-eb47-4f61-9614-a17fd89afa61", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tree.arrays(entry_stop=100)" + ] + }, + { + "cell_type": "markdown", + "id": "4b5fe20f-b93c-4e99-9251-9728a20cb767", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "The same data fits into Parquet files (a little more easily)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "296ae255-f894-4430-932c-a3ec67863897", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events = ak.from_parquet(\"https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/raw/main/data/SMHiggsToZZTo4L.parquet\")\n", + "events" + ] + }, + { + "cell_type": "markdown", + "id": "85dc100c-200a-455a-9827-e76b92a0b752", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "View the first event as Python lists and dicts (like JSON)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30661cff-00f8-4556-8a6e-b25642f13170", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events[0].to_list()" + ] + }, + { + "cell_type": "markdown", + "id": "6663c8f3-f84f-4185-bd79-a9c7f7b4f034", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Get one numeric field (also known as \"column\")." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68a0c07c-e7f3-4e98-a368-4c8899c833e2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events.electron.pt" + ] + }, + { + "cell_type": "markdown", + "id": "f21e3de6-8631-4cbf-bb39-f91f6e9abfb5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Compute something ($p_z = p_T \\sinh\\eta$)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b97c4bcf-58da-4e0a-a47d-c1c4ae1c9da8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "events.electron.pt * np.sinh(events.electron.eta)" + ] + }, + { + "cell_type": "markdown", + "id": "f3873822-10c9-4cc9-bbd9-7b7986695900", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Note that the Vector library works with Awkward Arrays, if it is imported this way:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f1de55-90cd-4a61-9627-9a4c110855ec", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import vector\n", + "vector.register_awkward()" + ] + }, + { + "cell_type": "markdown", + "id": "c032df1d-bc42-40ed-ab64-fa51726e5730", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "5c343480-9a4a-428d-b695-f9748e0e4641", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Records with `name=\"Momentum4D\"` and fields with coordinate names (`px`, `py`, `pz`, `E` or `pt`, `phi`, `eta`, `m`) automatically get Vector properties and methods." + ] + }, + { + "cell_type": "markdown", + "id": "a91c9d45-ecce-4b87-91db-0539b233a76f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8eb191-b606-4809-bfdf-ed4c690cbc6a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events.electron.type.show()" + ] + }, + { + "cell_type": "markdown", + "id": "311f608b-66f4-4b12-9426-f24a16bc8525", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13be5a96-b4f3-4b95-afe9-c2941778aefb", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# implicitly computes pz = pt * sinh(eta)\n", + "events.electron.pz" + ] + }, + { + "cell_type": "markdown", + "id": "7a992469-b5e0-402e-bef1-ee46ed234627", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "To make histograms or other plots, we need numbers without structure, so {func}`ak.flatten` the array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c24df813-578a-4f81-8934-46d657c11024", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from hist import Hist\n", + "\n", + "Hist.new.Regular(100, 0, 100, name=\" \").Double().fill(\n", + " ak.flatten(events.electron.pt)\n", + ").plot();" + ] + }, + { + "cell_type": "markdown", + "id": "d573f2e8-c0c2-4e23-b5c1-4cc4c4d87541", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Each event has a different number of electrons and muons ({func}`ak.num` to check)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf7a2d4a-2386-4f6b-bde0-c563c162b5f0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.num(events.electron), ak.num(events.muon)" + ] + }, + { + "cell_type": "markdown", + "id": "81d4d4c4-7235-493a-a293-ff4d45c76ba7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "source": [ + "
\n", + "\n", + "So what happens if we try to compute something with the electrons' $p_T$ and the muons' $\\eta$?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6efa2427-2732-45af-928e-0d649b7f92c2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "events.electron.pt * np.sinh(events.muon.eta)" + ] + }, + { + "cell_type": "markdown", + "id": "ad8d0d1a-722d-4eb7-b341-839be6919cf6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "This is data structure-aware, array-oriented programming." + ] + }, + { + "cell_type": "markdown", + "id": "c31d978c-f829-4e30-8558-01e14bab641a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Application:** Filtering events with an array of booleans." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbd40cdb-3764-4201-8384-714103194ca7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events.MET.pt, events.MET.pt > 20" + ] + }, + { + "cell_type": "markdown", + "id": "3a886516-e4ba-4fa1-9119-7a450a84562a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fa99f53-9b84-44ad-b0c7-39ac39fc15b3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(events), len(events[events.MET.pt > 20])" + ] + }, + { + "cell_type": "markdown", + "id": "20b44fe8-9311-4c47-b7de-4bcdff4d079f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "source": [ + "
\n", + "\n", + "**Application:** Filtering particles with an array of lists of booleans." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2319e94e-9bd7-490e-b6c8-c5f560dacebf", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events.electron.pt, events.electron.pt > 30" + ] + }, + { + "cell_type": "markdown", + "id": "4ff7bfda-1752-4f3e-9085-08f3e9f64c1b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f7b75c8-248e-43da-97be-c6525c36db62", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.num(events.electron), ak.num(events.electron[events.electron.pt > 30])" + ] + }, + { + "cell_type": "markdown", + "id": "4d48aa4f-49bb-455c-b664-3cb405064ab0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Quizlet:** Using the reducer {func}`ak.any`, how would we select _events_ in which any electron has $p_T > 30$ GeV/c$^2$?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85339038-b580-4eb7-9421-fb32d2a402d1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "events.electron[events.electron.pt > 30]" + ] + }, + { + "cell_type": "markdown", + "id": "e5bd3b3f-ef4b-48d4-873d-ff831a85a768", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "id": "28a196d8-56d8-4ee7-ae13-9622865a8076", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Awkward Array has two combinatorial primitives:" + ] + }, + { + "cell_type": "markdown", + "id": "e143afb0-4a0f-4188-8aa6-6385a1f7a816", + "metadata": {}, + "source": [ + "{func}`ak.cartesian` takes a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of lists from $N$ different arrays, producing an array of lists of $N$-tuples.\n", + "\n", + "{func}`ak.combinations` takes $N$ [samples without replacement](http://prob140.org/sp18/textbook/notebooks-md/5_04_Sampling_Without_Replacement.html) of lists from a single array, producing an array of lists of $N$-tuples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3a4d8d1-d210-4fe6-9b2e-6fe3c6c064af", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "numbers = ak.Array([[1, 2, 3], [], [4]])\n", + "letters = ak.Array([[\"a\", \"b\"], [\"c\"], [\"d\", \"e\"]])" + ] + }, + { + "cell_type": "markdown", + "id": "c27d876b-cd7f-4be5-9a88-5f75f4b5bc80", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a188c11-5af4-4bac-9d3d-273b98e62d44", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.cartesian([numbers, letters])" + ] + }, + { + "cell_type": "markdown", + "id": "f60ba73e-1b00-4b2f-9367-318b52f251b3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af07b963-cc1d-4367-ba48-ffe4da4303e5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "values = ak.Array([[1.1, 2.2, 3.3, 4.4], [], [5.5, 6.6]])" + ] + }, + { + "cell_type": "markdown", + "id": "18a0a5f3-3f49-4f77-a09c-fd5b5748b1df", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5713cc98-4ba3-4087-b9a3-506838b6b020", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.combinations(values, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "917c2815-6508-474e-b58b-39b57b37aa7e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "Often, it's useful to separate the separate the left-hand sides and right-hand sides of these pairs with {func}`ak.unzip`, so they can be used in mathematical expressions." + ] + }, + { + "cell_type": "markdown", + "id": "dc11ada3-92b0-4f7f-9118-a32ba41dc0d2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7bac853-59b1-4a52-887a-83c5ec845e32", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "electron_muon_pairs = ak.cartesian([events.electron, events.muon])\n", + "electron_muon_pairs.type.show()" + ] + }, + { + "cell_type": "markdown", + "id": "1a247a4c-01a6-4739-a828-00ec20ba2e7d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51960f1b-eeef-444c-860d-473ad7083396", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "electron_in_pair, muon_in_pair = ak.unzip(electron_muon_pairs)\n", + "electron_in_pair.type.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8403df4a-0a79-4220-ab3b-b538e818aa7d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4d696a8-a5a3-47b5-bdd9-a2a2f21b4403", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "electron_in_pair.pt, muon_in_pair.pt" + ] + }, + { + "cell_type": "markdown", + "id": "832690e2-7c09-4c18-989e-3fcdc6c90358", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "886f1267-6b6b-4a37-ba8c-2f2680c91e4e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "fragment" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ak.num(electron_in_pair), ak.num(muon_in_pair)" + ] + }, + { + "cell_type": "markdown", + "id": "eceaad8c-2fb3-4e32-bad7-12b1a6e45034", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "To use Vector's `deltaR` method ($\\Delta R = \\sqrt{\\Delta\\phi^2 + \\Delta\\eta^2}$), we need to have the electrons and muons in separate arrays." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32eecc20-2397-40c8-a6b1-b03b6211906b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "electron_in_pair, muon_in_pair = ak.unzip(ak.cartesian([events.electron, events.muon]))" + ] + }, + { + "cell_type": "markdown", + "id": "dd3a02fa-1b23-40cd-923c-9fb92341eb8e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d193dfa0-2027-420a-a583-7f6fe35826b7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "electron_in_pair.deltaR(muon_in_pair)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "464996c0-0a00-4c9f-b3d9-dc6baf70d96a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "first_electron_in_pair, second_electron_in_pair = ak.unzip(ak.combinations(events.electron, 2))" + ] + }, + { + "cell_type": "markdown", + "id": "c05fe323-5350-44b0-858f-849ca4ac4e76", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de6d4ddc-8410-443f-8c7c-ba669d30b233", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "first_electron_in_pair.deltaR(second_electron_in_pair)" + ] + }, + { + "cell_type": "markdown", + "id": "076cb4a3-c407-4d50-a4d4-3cdcc6b977dd", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "source": [ + "**Quizlet:** What's this?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5fcbc46-1bd9-4944-9321-df47e9e2845d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "(first_electron_in_pair + second_electron_in_pair).mass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c28f5f07-70eb-4827-99a5-cf5811f6bf68", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "Hist.new.Reg(120, 0, 120, name=\"mass (GeV)\").Double().fill(\n", + " ak.flatten((first_electron_in_pair + second_electron_in_pair).mass, axis=-1)\n", + ").plot();" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/getting-started/try-awkward-array.md b/docs/getting-started/try-awkward-array.md deleted file mode 100644 index 16aef0165d..0000000000 --- a/docs/getting-started/try-awkward-array.md +++ /dev/null @@ -1,9 +0,0 @@ -# Try it - - diff --git a/docs/getting-started/uproot-awkward-columnar-hats.ipynb 
b/docs/getting-started/uproot-awkward-columnar-hats.ipynb new file mode 100644 index 0000000000..dd91184423 --- /dev/null +++ b/docs/getting-started/uproot-awkward-columnar-hats.ipynb @@ -0,0 +1,717 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ae14d3ce-0dc5-4a62-bed2-a0f75e72887c", + "metadata": {}, + "source": [ + "# Uproot Awkward Columnar HATS" + ] + }, + { + "cell_type": "markdown", + "id": "d2fa2b35-5e1c-4be2-aa58-385f3b370683", + "metadata": {}, + "source": [ + "_Originally presented as [part](https://github.com/jpivarski-talks/2021-06-14-uproot-awkward-columnar-hats/blob/main/3-awkward-array.ipynb) of [CMS HATS training on June 14, 2021](https://indico.cern.ch/event/1042866/)._" + ] + }, + { + "cell_type": "markdown", + "id": "0f98f8c2-91ce-4a15-b06a-c20f1d40256b", + "metadata": {}, + "source": [ + "




" + ] + }, + { + "cell_type": "markdown", + "id": "8da7642a-d311-488c-be06-8fd51114b71c", + "metadata": {}, + "source": [ + "## What about an array of lists?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8e59475-25c6-41b1-a37c-3553517b3a98", + "metadata": {}, + "outputs": [], + "source": [ + "import skhep_testdata\n", + "import awkward as ak\n", + "import numpy as np\n", + "import uproot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3a79fec-71a0-40fd-83c6-0c3369cf7597", + "metadata": {}, + "outputs": [], + "source": [ + "events = uproot.open(skhep_testdata.data_path(\"uproot-HZZ.root\"))[\"events\"]\n", + "events.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39208788-6a41-4afe-be49-9b42321a899f", + "metadata": {}, + "outputs": [], + "source": [ + "events[\"Muon_Px\"].array()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98d28d9b-96bd-4316-b26c-42f12db10614", + "metadata": {}, + "outputs": [], + "source": [ + "events[\"Muon_Px\"].array(entry_stop=20).tolist()" + ] + }, + { + "cell_type": "markdown", + "id": "e163f018-cd77-47eb-be1a-e90e8252a796", + "metadata": {}, + "source": [ + "This is what Awkward Array was made for. NumPy's equivalent is cumbersome and inefficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a406416-f5b6-49aa-afb9-720446a8b990", + "metadata": {}, + "outputs": [], + "source": [ + "jagged_numpy = events[\"Muon_Px\"].array(entry_stop=20, library=\"np\")\n", + "jagged_numpy" + ] + }, + { + "cell_type": "markdown", + "id": "0f6366b4-a61f-4b59-9574-ce2a203d6d39", + "metadata": {}, + "source": [ + "What if I want the first item in each list as an array?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d93944f5-cdc2-4280-82a3-9bc4865e2f25", + "metadata": {}, + "outputs": [], + "source": [ + "np.array([x[0] for x in jagged_numpy])" + ] + }, + { + "cell_type": "markdown", + "id": "f611c7ac-aa33-4446-aea9-4dc1224e488a", + "metadata": {}, + "source": [ + "This violates the rule from [1-python-performance.ipynb](https://github.com/jpivarski-talks/2021-06-14-uproot-awkward-columnar-hats/blob/main/1-python-performance.ipynb): don't iterate in Python." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8bb7a35-4a43-4956-ab78-613742726ae5", + "metadata": {}, + "outputs": [], + "source": [ + "jagged_awkward = events[\"Muon_Px\"].array(entry_stop=20, library=\"ak\")\n", + "jagged_awkward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d7617c-5951-448b-8202-77ee6ae4354b", + "metadata": {}, + "outputs": [], + "source": [ + "jagged_awkward[:, 0]" + ] + }, + { + "cell_type": "markdown", + "id": "0c1cdce9-4b40-4878-a3a9-a42cf858a910", + "metadata": {}, + "source": [ + "




" + ] + }, + { + "cell_type": "markdown", + "id": "237987c8-97ff-4002-adf1-b735ff0bc640", + "metadata": {}, + "source": [ + "## Awkward Array is a general-purpose library: NumPy-like idioms on JSON-like data" + ] + }, + { + "cell_type": "markdown", + "id": "9eaca985-580b-4564-a9be-a05cf434fb89", + "metadata": {}, + "source": [ + "![](pivarski-one-slide-summary.svg)" + ] + }, + { + "cell_type": "markdown", + "id": "93577b1f-2008-4ae1-a4d9-d78da0859d44", + "metadata": {}, + "source": [ + "




" + ] + }, + { + "cell_type": "markdown", + "id": "3632e9fe-91c7-4319-9041-0abda61b0a62", + "metadata": {}, + "source": [ + "## Main idea: slicing through structure is computationally inexpensive" + ] + }, + { + "cell_type": "markdown", + "id": "bebb13ec-3c82-4c85-a4fa-8668fbe383f4", + "metadata": {}, + "source": [ + "Slicing by field name doesn't modify any large buffers and [ak.zip](https://awkward-array.readthedocs.io/en/latest/_auto/ak.zip.html) only scans them to ensure they're compatible (not even that if `depth_limit=1`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c272052a-1a9e-4fe3-951b-db38a6cceb40", + "metadata": {}, + "outputs": [], + "source": [ + "array = events.arrays()\n", + "array" + ] + }, + { + "cell_type": "markdown", + "id": "d93d9d83-a5f6-49d2-a1d6-9e985b94465c", + "metadata": {}, + "source": [ + "Think of this as zero-cost:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c6f8d39-75f3-4d6e-867f-c60bd16d83ba", + "metadata": {}, + "outputs": [], + "source": [ + "array.Muon_Px, array.Muon_Py, array.Muon_Pz" + ] + }, + { + "cell_type": "markdown", + "id": "e2ed505d-6eca-4807-b43b-880ed4c4fd0c", + "metadata": {}, + "source": [ + "Think of this as zero-cost:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684275da-c070-4523-ab77-1f4e1727cf0e", + "metadata": {}, + "outputs": [], + "source": [ + "ak.zip({\"px\": array.Muon_Px, \"py\": array.Muon_Py, \"pz\": array.Muon_Pz})" + ] + }, + { + "cell_type": "markdown", + "id": "f534ea92-4d94-4265-9166-c3789548cfb1", + "metadata": {}, + "source": [ + "(The above is a manual version of `how=\"zip\"`.)" + ] + }, + { + "cell_type": "markdown", + "id": "74f6e268-26ff-45ff-af49-24e1fc4be70c", + "metadata": {}, + "source": [ + "


\n", + "\n", + "NumPy ufuncs work on these arrays (if they're \"[broadcastable](https://awkward-array.readthedocs.io/en/latest/_auto/ak.broadcast_arrays.html)\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c107261e-e687-4844-aad3-65ce162531c3", + "metadata": {}, + "outputs": [], + "source": [ + "np.sqrt(array.Muon_Px**2 + array.Muon_Py**2)" + ] + }, + { + "cell_type": "markdown", + "id": "9f96c45a-dac4-4bf8-bc0d-e8e539129ee4", + "metadata": {}, + "source": [ + "


\n", + "\n", + "And there are specialized operations that only make sense in a variable-length context.\n", + "\n", + "{func}`ak.cartesian`\n", + "\n", + "![](cartoon-cartesian.png)\n", + "\n", + "{func}`ak.combinations`\n", + "\n", + "![](cartoon-combinations.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "020c5f92-13d5-48b6-a29d-8ae19827becf", + "metadata": {}, + "outputs": [], + "source": [ + "ak.cartesian((array.Muon_Px, array.Jet_Px))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7976a9d-4e14-4f71-821b-b07659701bec", + "metadata": {}, + "outputs": [], + "source": [ + "ak.combinations(array.Muon_Px, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "b836817e-1fea-405a-ae1e-92ba4f6c09cb", + "metadata": {}, + "source": [ + "




" + ] + }, + { + "cell_type": "markdown", + "id": "2b021955-b508-4fe1-9e91-98cd5ab93241", + "metadata": {}, + "source": [ + "## Arrays can have custom [behavior](https://awkward-array.readthedocs.io/en/latest/ak.behavior.html)" + ] + }, + { + "cell_type": "markdown", + "id": "f7f35dea-5745-4953-8053-36d744a5c196", + "metadata": {}, + "source": [ + "The following come from the new [Vector](https://github.com/scikit-hep/vector#readme) library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a09f2884-dc3b-4f15-a4e0-3afbcd77a984", + "metadata": {}, + "outputs": [], + "source": [ + "import vector\n", + "vector.register_awkward()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f497377-398d-4c11-ad08-8483c61f2239", + "metadata": {}, + "outputs": [], + "source": [ + "muons = ak.zip({\"px\": array.Muon_Px, \"py\": array.Muon_Py, \"pz\": array.Muon_Pz, \"E\": array.Muon_E}, with_name=\"Momentum4D\")\n", + "muons" + ] + }, + { + "cell_type": "markdown", + "id": "3099e3d5-2dc6-41ec-8cb9-372923904c45", + "metadata": {}, + "source": [ + "This is an array of lists of vectors, and methods like `pt`, `eta`, `phi` apply through the whole array." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "877cac01-693b-435d-9e6f-f67325cbe9d0", + "metadata": {}, + "outputs": [], + "source": [ + "muons.pt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67d0cb46-a576-4de4-8968-f7734a049fad", + "metadata": {}, + "outputs": [], + "source": [ + "muons.eta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "830140bb-fe03-4c78-8178-f15a8748dd60", + "metadata": {}, + "outputs": [], + "source": [ + "muons.phi" + ] + }, + { + "cell_type": "markdown", + "id": "7e56579b-d3e2-4fa3-9774-da3f15fbe0a5", + "metadata": {}, + "source": [ + "


" + ] + }, + { + "cell_type": "markdown", + "id": "a4e6b57c-09d2-4b2d-b112-8a89f04c9e75", + "metadata": {}, + "source": [ + "Let's try an example: ΔR(muons, jets)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0646a94b-7e04-48e5-b1b9-cf347e0b16d7", + "metadata": {}, + "outputs": [], + "source": [ + "jets = ak.zip({\"px\": array.Jet_Px, \"py\": array.Jet_Py, \"pz\": array.Jet_Pz, \"E\": array.Jet_E}, with_name=\"Momentum4D\")\n", + "jets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38732ecc-4850-4e11-956e-7413a0845cbb", + "metadata": {}, + "outputs": [], + "source": [ + "ak.num(muons), ak.num(jets)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1933b22d-42fe-45dd-a0cd-cfbf949053bc", + "metadata": {}, + "outputs": [], + "source": [ + "ms, js = ak.unzip(ak.cartesian((muons, jets)))\n", + "ms, js" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac76799e-b86d-4872-a09f-aeb9d3ed6fb7", + "metadata": {}, + "outputs": [], + "source": [ + "ak.num(ms), ak.num(js)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0206ebe6-b580-4872-9b16-58d606e92b09", + "metadata": {}, + "outputs": [], + "source": [ + "ms.deltaR(js)" + ] + }, + { + "cell_type": "markdown", + "id": "12b2c1e7-5cfd-44b8-8870-d37878422a28", + "metadata": {}, + "source": [ + "


\n", + "\n", + "And another: muon pairs (all combinations, not just the first two per event)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b366c0b7-a4e3-4ebc-b4e2-a6150019ca16", + "metadata": {}, + "outputs": [], + "source": [ + "ak.num(muons)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207bf9a9-84c0-428a-815e-6de6fb8694a3", + "metadata": {}, + "outputs": [], + "source": [ + "m1, m2 = ak.unzip(ak.combinations(muons, 2))\n", + "m1, m2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3b698ef-989d-4185-8de0-62a70087072c", + "metadata": {}, + "outputs": [], + "source": [ + "ak.num(m1), ak.num(m2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d2444a8-a7ef-4731-b3cd-923c0ed0c7ea", + "metadata": {}, + "outputs": [], + "source": [ + "m1 + m2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bc067cb-a97e-4333-92b4-48d705fe5107", + "metadata": {}, + "outputs": [], + "source": [ + "(m1 + m2).mass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2722fa73-649d-43f2-8312-2703776a9433", + "metadata": {}, + "outputs": [], + "source": [ + "import hist\n", + "\n", + "hist.Hist.new.Reg(120, 0, 120, name=\"mass\").Double().fill(\n", + " ak.flatten((m1 + m2).mass)\n", + ").plot()\n", + "\n", + "None" + ] + }, + { + "cell_type": "markdown", + "id": "0a2bbb34-0e56-42e3-9251-8e53b7df1f16", + "metadata": {}, + "source": [ + "


" + ] + }, + { + "cell_type": "markdown", + "id": "4e3f780c-5fcd-4281-b4dd-be0a1c5f1ace", + "metadata": {}, + "source": [ + "### It doesn't matter which coordinates were used to construct it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "118fd5b1-894f-486d-9e23-548ba1c84c63", + "metadata": {}, + "outputs": [], + "source": [ + "array2 = uproot.open(\n", + " \"https://github.com/jpivarski-talks/2023-12-18-hsf-india-tutorial-bhubaneswar/raw/main/data/SMHiggsToZZTo4L.root:Events\"\n", + ").arrays([\"Muon_pt\", \"Muon_eta\", \"Muon_phi\", \"Muon_charge\"], entry_stop=100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d266244f-461d-4590-9214-1d4380a8866d", + "metadata": {}, + "outputs": [], + "source": [ + "import particle\n", + "\n", + "muons2 = ak.zip({\"pt\": array2.Muon_pt, \"eta\": array2.Muon_eta, \"phi\": array2.Muon_phi, \"q\": array2.Muon_charge}, with_name=\"Momentum4D\")\n", + "muons2[\"mass\"] = particle.Particle.findall(\"mu-\")[0].mass / 1000.0\n", + "muons2" + ] + }, + { + "cell_type": "markdown", + "id": "d0391ff6-7281-46ff-801e-0b8928347fc3", + "metadata": {}, + "source": [ + "As long as you use properties (dots, not strings in brackets), you don't need to care what coordinates it's based on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb3b75b-f5a3-4658-a9cc-c0b29d1b0e4b", + "metadata": {}, + "outputs": [], + "source": [ + "muons2.px" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6566a49c-4ce4-481a-9931-b8c2e95e80a6", + "metadata": {}, + "outputs": [], + "source": [ + "muons2.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d13dc67-35b1-4fad-8e98-f0d9733f577d", + "metadata": {}, + "outputs": [], + "source": [ + "muons2.pz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eef9466-6d63-4e9e-b1c9-c0ea896a6118", + "metadata": {}, + "outputs": [], + "source": [ + "muons2.E" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0da4eb75-6b02-4cca-b770-0944f40a5da8", + "metadata": {}, + "outputs": [], + "source": [ + "m1, m2 = ak.unzip(ak.combinations(muons2, 2))\n", + "hist.Hist.new.Log(200, 0.1, 120, name=\"mass\").Double().fill(\n", + " ak.flatten((m1 + m2).mass)\n", + ").plot()\n", + "\n", + "None" + ] + }, + { + "cell_type": "markdown", + "id": "9758607a-8216-47ee-a41a-2e47694fd6cb", + "metadata": {}, + "source": [ + "


" + ] + }, + { + "cell_type": "markdown", + "id": "8b93a41f-3099-4889-ba46-55b87bd64e71", + "metadata": {}, + "source": [ + "## Awkward Arrays and Vector in Numba" + ] + }, + { + "cell_type": "markdown", + "id": "e84b6e59-dbee-4cf6-9349-8b989935e3ca", + "metadata": {}, + "source": [ + "Remember Numba, the JIT-compiler from [1-python-performance.ipynb](https://github.com/jpivarski-talks/2021-06-14-uproot-awkward-columnar-hats/blob/main/1-python-performance.ipynb)? Awkward Array and Vector have been implemented in Numba's compiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27f91719-7143-47d1-bc93-ece0e14d1515", + "metadata": {}, + "outputs": [], + "source": [ + "import numba as nb\n", + "\n", + "@nb.njit\n", + "def first_big_dimuon(events):\n", + " for event in events:\n", + " for i in range(len(event)):\n", + " mu1 = event[i]\n", + " for j in range(i + 1, len(event)):\n", + " mu2 = event[j]\n", + " dimuon = mu1 + mu2\n", + " if dimuon.mass > 10:\n", + " return dimuon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa36234f-f1f8-432d-80e4-697072a8be85", + "metadata": {}, + "outputs": [], + "source": [ + "first_big_dimuon(muons2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/getting-started/what-is-an-awkward-array.md b/docs/getting-started/what-is-an-awkward-array.md new file mode 100644 index 0000000000..d3a1205ffd --- /dev/null +++ b/docs/getting-started/what-is-an-awkward-array.md @@ -0,0 +1,300 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + 
jupytext_version: 1.16.1 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# What is an "Awkward" Array? + + +```{code-cell} ipython3 +import numpy as np +import awkward as ak +``` + +## Versatile Arrays +Awkward Arrays are general tree-like data structures, like JSON, but contiguous in memory and operated upon with compiled, vectorized code like NumPy. + +They look like NumPy arrays: + + +```{code-cell} ipython3 +ak.Array([1, 2, 3]) +``` + +Like NumPy, they can have multiple dimensions: + + +```{code-cell} ipython3 +ak.Array([ + [1, 2, 3], + [4, 5, 6] +]) +``` + +These dimensions can have varying lengths; arrays can be [ragged](https://en.wikipedia.org/wiki/Jagged_array): + + +```{code-cell} ipython3 +ak.Array([ + [1, 2, 3], + [4], + [5, 6] +]) +``` + +Each dimension can contain missing values: + + +```{code-cell} ipython3 +ak.Array([ + [1, 2, 3], + [4], + [5, 6, None] +]) +``` + +Awkward Arrays can store _numbers_: + + +```{code-cell} ipython3 +ak.Array([ + [3, 141], + [59, 26, 535], + [8] +]) +``` + +They can also work with _dates_: + + +```{code-cell} ipython3 +ak.Array( + [ + [np.datetime64("1815-12-10"), np.datetime64("1969-07-16")], + [np.datetime64("1564-04-26")], + ] +) +``` + +They can even work with _strings_: + + +```{code-cell} ipython3 +ak.Array( + [ + [ + "Benjamin List", + "David MacMillan", + ], + [ + "Emmanuelle Charpentier", + "Jennifer A. Doudna", + ], + ] +) +``` + +Awkward Arrays can have structure through _records_: + + +```{code-cell} ipython3 +ak.Array( + [ + [ + {"name": "Benjamin List", "age": 53}, + {"name": "David MacMillan", "age": 53}, + ], + [ + {"name": "Emmanuelle Charpentier", "age": 52}, + {"name": "Jennifer A. Doudna", "age": 57}, + ], + [ + {"name": "Akira Yoshino", "age": 73}, + {"name": "M. Stanley Whittingham", "age": 79}, + {"name": "John B. Goodenough", "age": 98}, + ], + ] +) +``` + +In fact, Awkward Arrays can represent many kinds of jagged data. 
They can possess complex structures that mix records, and primitive types. + + +```{code-cell} ipython3 +ak.Array( + [ + [ + { + "name": "Benjamin List", + "age": 53, + "institutions": [ + "University of Cologne", + "Max Planck Institute for Coal Research", + "Hokkaido University", + ], + }, + { + "name": "David MacMillan", + "age": 53, + "institutions": None, + }, + ] + ] +) +``` + +They can even contain unions! + + +```{code-cell} ipython3 +ak.Array( + [ + [np.datetime64("1815-12-10"), "Cassini"], + [np.datetime64("1564-04-26")], + ] +) +``` + +## NumPy-like interface + +Awkward Array _looks like_ NumPy. It behaves identically to NumPy for regular arrays + + +```{code-cell} ipython3 +x = ak.Array([ + [1, 2, 3], + [4, 5, 6] +]); +``` + + +```{code-cell} ipython3 +ak.sum(x, axis=-1) +``` + +providing a similar high-level API, and implementing the [ufunc](https://numpy.org/doc/stable/reference/ufuncs.html) mechanism: + + +```{code-cell} ipython3 +powers_of_two = ak.Array( + [ + [1, 2, 4], + [None, 8], + [16], + ] +); +``` + + +```{code-cell} ipython3 +ak.sum(powers_of_two) +``` + +But generalises to the tricky kinds of data that NumPy struggles to work with. It can perform reductions through varying length lists: + +![](example-reduction-sum.svg) + + +```{code-cell} ipython3 +ak.sum(powers_of_two, axis=0) +``` + +## Lightweight structures +Awkward makes it easy to pull apart record structures: + + +```{code-cell} ipython3 +nobel_prize_winner = ak.Array( + [ + [ + {"name": "Benjamin List", "age": 53}, + {"name": "David MacMillan", "age": 53}, + ], + [ + {"name": "Emmanuelle Charpentier", "age": 52}, + {"name": "Jennifer A. Doudna", "age": 57}, + ], + [ + {"name": "Akira Yoshino", "age": 73}, + {"name": "M. Stanley Whittingham", "age": 79}, + {"name": "John B. 
Goodenough", "age": 98}, + ], + ] +); +``` + + +```{code-cell} ipython3 +nobel_prize_winner.name +``` + + +```{code-cell} ipython3 +nobel_prize_winner.age +``` + +These records are lightweight, and simple to compose: + + +```{code-cell} ipython3 +nobel_prize_winner_with_birth_year = ak.zip({ + "name": nobel_prize_winner.name, + "age": nobel_prize_winner.age, + "birth_year": 2021 - nobel_prize_winner.age +}); +``` + + +```{code-cell} ipython3 +nobel_prize_winner_with_birth_year.show() +``` + +## High performance +Like NumPy, Awkward Array performs computations in fast, optimised kernels. + + +```{code-cell} ipython3 +large_array = ak.Array([[1, 2, 3], [], [4, 5]] * 1_000_000) +``` + +We can compute the sum in `3.37 ms ± 107 µs` on a reference CPU: + + +```{code-cell} ipython3 +ak.sum(large_array) +``` + +The same sum can be computed with pure-Python over the flattened array in `369 ms ± 8.07 ms`: + + +```{code-cell} ipython3 +large_flat_array = ak.ravel(large_array) + +sum(large_flat_array) +``` + +These performance values are not benchmarks; they are only an indication of the speed of Awkward Array. + +Some problems are hard to solve with array-oriented programming. Awkward Array supports [Numba](https://numba.pydata.org/) out of the box: + +```{code-cell} ipython3 +import numba as nb + +@nb.njit +def cumulative_sum(arr): + result = 0 + for x in arr: + for y in x: + result += y + return result + +cumulative_sum(large_array) +``` diff --git a/docs/index.md b/docs/index.md index 68419d0f80..8579372535 100644 --- a/docs/index.md +++ b/docs/index.md @@ -95,12 +95,20 @@ Spotted a typo in the documentation? Want to add to the codebase? The contributi ::: +:::{grid-item-card} +:columns: 12 +:link: https://dask-awkward.readthedocs.io/ +:class-card: admonition warning + + Using dask-awkward arrays in Dask? Click this card. 
+:::: + :::{grid-item-card} :columns: 12 :link: https://juliahep.github.io/AwkwardArray.jl/dev/ :class-card: admonition warning - Looking for the documentation for Awkward Array in Julia? Click this card. + Using AwkwardArray.jl in Julia? Click this card. :::: - + ::::: diff --git a/docs/redirects-user-guide.json b/docs/redirects-user-guide.json index ed91511f83..626850c31e 100644 --- a/docs/redirects-user-guide.json +++ b/docs/redirects-user-guide.json @@ -47,12 +47,13 @@ "how-to-restructure-sort.any-ext": "user-guide/how-to-restructure-sort.html", "how-to-restructure-zip-project.any-ext": "user-guide/how-to-restructure-zip-project.html", "how-to-restructure.any-ext": "user-guide/how-to-restructure.html", - "how-to-specialize-in-numba.any-ext": "user-guide/how-to-specialize-in-numba.html", - "how-to-specialize-lorentz.any-ext": "user-guide/how-to-specialize-lorentz.html", - "how-to-specialize-override-numpy.any-ext": "user-guide/how-to-specialize-override-numpy.html", - "how-to-specialize-subclass.any-ext": "user-guide/how-to-specialize-subclass.html", - "how-to-specialize.any-ext": "user-guide/how-to-specialize.html", + "how-to-specialize-in-numba.any-ext": "reference/ak.behavior.html", + "how-to-specialize-lorentz.any-ext": "reference/ak.behavior.html", + "how-to-specialize-override-numpy.any-ext": "reference/ak.behavior.html", + "how-to-specialize-subclass.any-ext": "reference/ak.behavior.html", + "how-to-specialize.any-ext": "reference/ak.behavior.html", "how-to-use-in-numba-arraybuilder.any-ext": "user-guide/how-to-use-in-numba-arraybuilder.html", "how-to-use-in-numba-features.any-ext": "user-guide/how-to-use-in-numba-features.html", - "how-to-use-in-numba.any-ext": "user-guide/how-to-use-in-numba.html" + "how-to-use-in-numba.any-ext": "user-guide/how-to-use-in-numba.html", + "10-minutes-to-awkward-array.any-ext": "getting-started/10-minutes-to-awkward-array.html" } diff --git a/docs/reference/index.md b/docs/reference/index.md index f80fe62444..698c086895 
100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -1,9 +1,37 @@ # API reference +See the left side-bar (or bring it into view by clicking on the upper-left `≡`) for a detailed description of every public class and function in Awkward Array. You can use your browser's find-in-page to find a particular function. + +In a nutshell, the `awkward` library consists of + +* a high-level {obj}`ak.Array` class, as well as {obj}`ak.Record` for scalar records, +* a suite of functions in the `ak.*` and `ak.str.*` namespaces, which operate on arrays, +* high-level data {obj}`ak.types.Type` classes, a generalization of NumPy's shape and dtype, +* low-level array {obj}`ak.contents.Content`, which describe the memory layout of arrays, as well as their {obj}`ak.forms.Form` (low-level types), +* an {obj}`ak.behavior` dict to add functionality to arrays and records. + +For details about array slicing, see {func}`ak.Array.__getitem__`. + +For details about adding record fields to an array of records, see {func}`ak.Array.__setitem__`. + +To get a low-level {obj}`ak.contents.Content` from an array or record, see {obj}`ak.Array.layout` and {obj}`ak.Record.layout`. + +If you're looking for "how to..." guides arranged by task, rather than function, see the user guide instead. + +You can test any of these functions in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




+ :::{card} C++ Documentation {fas}`external-link-alt` :link: ../_static/doxygen/index.html -The C++ classes, cpu-kernels, and gpu-kernels are documented separately. Click here to go to the C++ API reference +The C++ code implementing the `awkward-cpp` helper library is documented separately. Click here to go to the C++ API reference. +::: + +:::{card} dask-awkward {fas}`external-link-alt` +:link: https://dask-awkward.readthedocs.io/ + +Although many of the functions have the same names and interfaces, the `dask-awkward` library is documented separately. Click here to learn about Awkward Arrays in Dask. ::: ```{eval-rst} diff --git a/docs/reference/toctree.txt b/docs/reference/toctree.txt index 8f0049450f..126e0a2b9c 100644 --- a/docs/reference/toctree.txt +++ b/docs/reference/toctree.txt @@ -5,65 +5,77 @@ generated/ak.Record .. toctree:: - :caption: Append-only builder + :caption: Converting untyped data - generated/ak.ArrayBuilder + generated/ak.from_iter + generated/ak.to_list + generated/ak.from_json + generated/ak.to_json .. toctree:: - :caption: Converting from other formats + :caption: Converting rectangular arrays - generated/ak.from_arrow - generated/ak.from_arrow_schema - generated/ak.from_avro_file - generated/ak.from_buffers + generated/ak.from_numpy + generated/ak.to_numpy generated/ak.from_cupy - generated/ak.from_feather - generated/ak.from_iter + generated/ak.to_cupy + generated/ak.from_dlpack generated/ak.from_jax - generated/ak.from_json - generated/ak.from_numpy - generated/ak.from_parquet - generated/ak.from_rdataframe - generated/ak.from_avro_file - generated/ak.metadata_from_parquet + generated/ak.to_jax .. 
toctree:: - :caption: Converting to other formats + :caption: Reading and writing files - generated/ak.to_arrow - generated/ak.to_arrow_table - generated/ak.to_buffers - generated/ak.to_cupy - generated/ak.to_dataframe - generated/ak.to_feather - generated/ak.to_jax - generated/ak.to_json - generated/ak.to_list - generated/ak.to_numpy - generated/ak.to_packed + generated/ak.from_parquet + generated/ak.metadata_from_parquet generated/ak.to_parquet generated/ak.to_parquet_dataset generated/ak.to_parquet_row_groups + generated/ak.from_arrow + generated/ak.from_arrow_schema + generated/ak.to_arrow + generated/ak.to_arrow_table + generated/ak.from_feather + generated/ak.to_feather + generated/ak.from_avro_file + +.. toctree:: + :caption: Converting to Pandas DataFrames + + generated/ak.to_dataframe + +.. toctree:: + :caption: Converting ROOT RDataFrames + + generated/ak.from_rdataframe generated/ak.to_rdataframe .. toctree:: - :caption: Correctness checking + :caption: Append-only builder - generated/ak.is_valid - generated/ak.validity_error + generated/ak.ArrayBuilder .. toctree:: - :caption: Lengths of lists + :caption: Low-level conversion primitives - generated/ak.num - generated/ak.count - generated/ak.count_nonzero + generated/ak.from_buffers + generated/ak.to_buffers + generated/ak.to_packed + generated/ak.copy + +.. toctree:: + :caption: Validity checking + + generated/ak.is_valid + generated/ak.validity_error .. toctree:: - :caption: Making and breaking records (columns) + :caption: Restructuring records (columns) generated/ak.zip generated/ak.unzip + generated/ak.merge_union_of_records + generated/ak.merge_option_of_records .. toctree:: :caption: Merging arrays (rows) @@ -79,32 +91,43 @@ generated/ak.ravel .. toctree:: - :caption: Functions that reduce dimension + :caption: Lengths of lists - generated/ak.all - generated/ak.any + generated/ak.num + generated/ak.count + +.. 
toctree:: + :caption: Reducers + + generated/ak.count_nonzero generated/ak.sum - generated/ak.prod - generated/ak.max - generated/ak.min - generated/ak.argmax - generated/ak.argmin generated/ak.nansum + generated/ak.prod generated/ak.nanprod - generated/ak.nanmax + generated/ak.any + generated/ak.all + +.. toctree:: + :caption: Minimum and maximum + + generated/ak.min generated/ak.nanmin - generated/ak.nanargmax + generated/ak.max + generated/ak.nanmax + generated/ak.argmin generated/ak.nanargmin + generated/ak.argmax + generated/ak.nanargmax .. toctree:: - :caption: Statistics that reduce dimension + :caption: Descriptive statistics generated/ak.moment generated/ak.mean - generated/ak.var - generated/ak.std generated/ak.nanmean + generated/ak.var generated/ak.nanvar + generated/ak.std generated/ak.nanstd generated/ak.covar generated/ak.corr @@ -122,8 +145,8 @@ :caption: Missing value handling generated/ak.mask - generated/ak.drop_none generated/ak.is_none + generated/ak.drop_none generated/ak.pad_none generated/ak.fill_none generated/ak.firsts @@ -132,8 +155,8 @@ .. toctree:: :caption: Ragged and regular dimensions - generated/ak.to_regular generated/ak.from_regular + generated/ak.to_regular .. toctree:: :caption: Broadcasting @@ -145,8 +168,8 @@ :caption: Combinatorics ("for loop" replacements) generated/ak.cartesian - generated/ak.combinations generated/ak.argcartesian + generated/ak.combinations generated/ak.argcombinations .. toctree:: @@ -230,10 +253,6 @@ generated/ak.nan_to_num generated/ak.values_astype generated/ak.strings_astype - generated/ak.round - generated/ak.real - generated/ak.imag - generated/ak.angle .. toctree:: :caption: Arrays of categorical data @@ -248,21 +267,9 @@ generated/ak.local_index generated/ak.run_lengths - -.. toctree:: - :caption: Restructuring records - - generated/ak.merge_union_of_records - generated/ak.merge_option_of_records .. 
toctree:: - :caption: Copying and packing arrays - - generated/ak.copy - generated/ak.to_packed - -.. toctree:: - :caption: Extracting metadata + :caption: Extracting type metadata generated/ak.type generated/ak.parameters @@ -270,12 +277,12 @@ generated/ak.is_tuple .. toctree:: - :caption: Manipulating metadata + :caption: Manipulating type metadata generated/ak.with_name generated/ak.with_field - generated/ak.with_parameter generated/ak.without_field + generated/ak.with_parameter generated/ak.without_parameters .. toctree:: @@ -292,17 +299,21 @@ generated/ak.backend .. toctree:: - :caption: NumPy compatibility + :caption: Approximation - generated/ak.full_like + generated/ak.round generated/ak.isclose - generated/ak.ones_like - generated/ak.zeros_like + generated/ak.almost_equal .. toctree:: - :caption: Array comparison + :caption: NumPy compatibility - generated/ak.almost_equal + generated/ak.real + generated/ak.imag + generated/ak.angle + generated/ak.zeros_like + generated/ak.ones_like + generated/ak.full_like .. toctree:: :caption: Third-party integration @@ -314,14 +325,26 @@ generated/ak.jax.register_behavior_class .. toctree:: - :caption: Array layout transformations + :caption: High-level data types - generated/ak.transform + generated/ak.types.Type + generated/ak.types.ArrayType + generated/ak.types.ScalarType + generated/ak.types.ListType + generated/ak.types.NumpyType + generated/ak.types.OptionType + generated/ak.types.RecordType + generated/ak.types.RegularType + generated/ak.types.UnionType + generated/ak.types.UnknownType + generated/ak.types.from_datashape + generated/ak.types.is_primitive + generated/ak.types.primitive_to_dtype + generated/ak.types.dtype_to_primitive .. 
toctree:: :caption: Low-level array layouts - generated/ak.to_layout generated/ak.contents.Content generated/ak.contents.BitMaskedArray generated/ak.contents.ByteMaskedArray @@ -336,6 +359,8 @@ generated/ak.contents.UnionArray generated/ak.contents.UnmaskedArray generated/ak.record.Record + generated/ak.to_layout + generated/ak.transform .. toctree:: :caption: Index for layout nodes @@ -347,24 +372,6 @@ generated/ak.index.IndexU32 generated/ak.index.Index64 -.. toctree:: - :caption: High-level data types - - generated/ak.types.from_datashape - generated/ak.types.Type - generated/ak.types.ArrayType - generated/ak.types.ScalarType - generated/ak.types.ListType - generated/ak.types.NumpyType - generated/ak.types.OptionType - generated/ak.types.RecordType - generated/ak.types.RegularType - generated/ak.types.UnionType - generated/ak.types.UnknownType - generated/ak.types.is_primitive - generated/ak.types.dtype_to_primitive - generated/ak.types.primitive_to_dtype - .. toctree:: :caption: Low-level types: "forms" @@ -386,16 +393,7 @@ generated/ak.forms.from_type .. toctree:: - :caption: Builtin behaviors - - generated/ak.ByteBehavior - generated/ak.ByteStringBehavior - generated/ak.CharBehavior - generated/ak.StringBehavior - generated/ak.CategoricalBehavior - -.. 
toctree:: - :caption: AwkwardForth for data ingest + :caption: Low-level data ingest :maxdepth: 1 awkwardforth diff --git a/docs/requirements.txt b/docs/requirements.txt index 8b23b3784b..e6f1efdede 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -18,11 +18,18 @@ sphinx-external-toc ipyleaflet numpy +scipy numba>=0.50.0;python_version<"3.11" pandas>=0.24.0 numexpr pyarrow>=7.0.0 fsspec +fsspec-xrootd +hist[plot] +particle +hepunits +vector +scikit-hep-testdata s3fs h5py matplotlib diff --git a/docs/switcher.json b/docs/switcher.json index 23f6684261..2f3af5be6a 100644 --- a/docs/switcher.json +++ b/docs/switcher.json @@ -10,7 +10,7 @@ { "version": "2.6", "url": "https://awkward-array.org/doc/2.6/" - }, + }, { "version": "2.5", "url": "https://awkward-array.org/doc/2.5/" diff --git a/docs/user-guide/cartoon-cartesian.png b/docs/user-guide/cartoon-cartesian.png new file mode 120000 index 0000000000..d0bf1a45e9 --- /dev/null +++ b/docs/user-guide/cartoon-cartesian.png @@ -0,0 +1 @@ +../../docs-img/diagrams/cartoon-cartesian.png \ No newline at end of file diff --git a/docs/user-guide/cartoon-combinations.png b/docs/user-guide/cartoon-combinations.png new file mode 120000 index 0000000000..b5ffe4f16a --- /dev/null +++ b/docs/user-guide/cartoon-combinations.png @@ -0,0 +1 @@ +../../docs-img/diagrams/cartoon-combinations.png \ No newline at end of file diff --git a/docs/user-guide/how-to-combinatorics-best-match.md b/docs/user-guide/how-to-combinatorics-best-match.md index b4a239bfce..c84b9ef602 100644 --- a/docs/user-guide/how-to-combinatorics-best-match.md +++ b/docs/user-guide/how-to-combinatorics-best-match.md @@ -14,10 +14,155 @@ kernelspec: How to find the best match between two collections using Cartesian (cross) product ================================================================================== -**This is a stub:** I intend to write this article, but haven't yet. 
+In high energy physics (HEP), {func}`ak.combinations` is often needed to find particles whose trajectories are close to each other, separately in many high-energy collision events (`axis=1`). In some applications, the two collections that need to be matched are simulated particles and reconstructed versions of those particles ("gen-reco matching"), and in other applications, the two collections are different types of particles, such as electrons and jets. -If you need it soon, create an issue saying so and I'll make it a higher priority. +I'll describe how to solve such a problem on this page, but avoid domain-specific jargon by casting it as a problem of finding the distance between bunnies and foxes—if a bunny is too close to a fox, it will get eaten! -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +## Setting up the problem + +In 1000 separate yards (big suburb), there's a random number of bunnies and a random number of foxes, each with random _x_, _y_ positions. We're making ragged arrays of records using {func}`ak.unflatten` and {func}`ak.zip`. 
+ +```{code-cell} ipython3 +np.random.seed(12345) + +number_of_bunnies = np.random.poisson(3.5, 1000) # average of 3.5 bunnies/yard +number_of_foxes = np.random.poisson(1.5, 1000) # average of 1.5 foxes/yard + +bunny_xy = np.random.normal(0, 1, (number_of_bunnies.sum(), 2)) +fox_xy = np.random.normal(0, 1, (number_of_foxes.sum(), 2)) + +bunnies = ak.unflatten(ak.zip({"x": bunny_xy[:, 0], "y": bunny_xy[:, 1]}), number_of_bunnies) +foxes = ak.unflatten(ak.zip({"x": fox_xy[:, 0], "y": fox_xy[:, 1]}), number_of_foxes) +``` + +```{code-cell} ipython3 +bunnies +``` + +```{code-cell} ipython3 +foxes +``` + +## Find all combinations + +In each yard, we find all bunny-fox pairs, regardless of whether they're close or not using {func}`ak.cartesian`, and then unpacking the pairs with {func}`ak.unzip`. + +```{code-cell} ipython3 +pair_bunnies, pair_foxes = ak.unzip(ak.cartesian([bunnies, foxes])) +``` + +These two arrays, `pair_bunnies` and `pair_foxes`, have the same type as `bunnies` and `foxes`, but different numbers of items in each list because now they're paired to match each other. Both kinds of animals are duplicated to enable this match. + +```{code-cell} ipython3 +pair_bunnies +``` + +```{code-cell} ipython3 +pair_foxes +``` + +The two arrays have the same list lengths as each other because they came from the same {func}`ak.unzip`. + +```{code-cell} ipython3 +ak.num(pair_bunnies), ak.num(pair_foxes) +``` + +## Calculating distances + +Since the arrays have the same shapes, they can be used in the same mathematical formula. Here's the formula for distance: + +```{code-cell} ipython3 +distances = np.sqrt((pair_bunnies.x - pair_foxes.x)**2 + (pair_bunnies.y - pair_foxes.y)**2) +distances +``` + +Let's say that 1 unit is close enough for a bunny to be eaten. + +```{code-cell} ipython3 +eaten = (distances < 1) +eaten +``` + +This is great (not for the bunnies, but perhaps for the foxes). 
However, if we want to use this information on the original arrays, we're stuck: this array has a different shape from the original `bunnies` (and the original `foxes`). + +Perhaps the question we really wanted to ask is, "For each bunny, is there _any_ fox that can eat it?" + +## Combinations with `nested=True` + +Asking a question about _any_ fox means performing a reducer, {func}`ak.any`, over lists, one list per bunny. The list would be all of the foxes in its yard. For that, we'll need to pass `nested=True` to {func}`ak.cartesian`. + +```{code-cell} ipython3 +pair_bunnies, pair_foxes = ak.unzip(ak.cartesian([bunnies, foxes], nested=True)) +``` + +Now `pair_bunnies` and `pair_foxes` are one list-depth deeper than the original `bunnies` and `foxes`. + +```{code-cell} ipython3 +pair_bunnies +``` + +```{code-cell} ipython3 +pair_foxes +``` + +We can compute `distances` in the same way, though it's also one list-depth deeper. + +```{code-cell} ipython3 +distances = np.sqrt((pair_bunnies.x - pair_foxes.x)**2 + (pair_bunnies.y - pair_foxes.y)**2) +distances +``` + +Similarly for `eaten`. + +```{code-cell} ipython3 +eaten = (distances < 1) +eaten +``` + +Now each inner list of booleans is answering the questions, "Can fox 0 eat me?", "Can fox 1 eat me?", ..., "Can fox _n_ eat me?" and there are exactly as many of these lists as there are bunnies. Applying {func}`ak.any` over the innermost lists (`axis=-1`), + +```{code-cell} ipython3 +bunny_eaten = ak.any(eaten, axis=-1) +bunny_eaten +``` + +We've now answered the question, "Can any fox eat me?" for each bunny. After the mayhem, these are the bunnies we have left: + +```{code-cell} ipython3 +bunnies[~bunny_eaten] +``` + +Whereas there was originally an average of 3.5 bunnies per yard, by construction, + +```{code-cell} ipython3 +ak.mean(ak.num(bunnies, axis=1)) +``` + +Now there's only + +```{code-cell} ipython3 +ak.mean(ak.num(bunnies[~bunny_eaten], axis=1)) +``` + +left. 
+ +## Asymmetry in the problem + +The way we performed this calculation was asymmetric: for each bunny, we asked if it was eaten. We could have performed a similar, but different, calculation to ask, which foxes get to eat? To do that, we must reverse the order of arguments because `nested=True` groups from the left. + +```{code-cell} ipython3 +pair_foxes, pair_bunnies = ak.unzip(ak.cartesian([foxes, bunnies], nested=True)) + +distances = np.sqrt((pair_foxes.x - pair_bunnies.x)**2 + (pair_foxes.y - pair_bunnies.y)**2) + +eating = (distances < 1) + +fox_eats = ak.any(eating, axis=-1) + +foxes[fox_eats] +``` diff --git a/docs/user-guide/how-to-combinatorics-cartesian-combinations.md b/docs/user-guide/how-to-combinatorics-cartesian-combinations.md index ae0485823d..5b5123006a 100644 --- a/docs/user-guide/how-to-combinatorics-cartesian-combinations.md +++ b/docs/user-guide/how-to-combinatorics-cartesian-combinations.md @@ -14,10 +14,193 @@ kernelspec: How to find all combinations of elements: Cartesian (cross) product and "n choose k" ==================================================================================== -**This is a stub:** I intend to write this article, but haven't yet. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -If you need it soon, create an issue saying so and I'll make it a higher priority. +## Motivation -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +In non-array code that operates on arbitrary data structures, such as Python for loops and Python objects, doubly nested for loops like the following are pretty common: -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. 
+```{code-cell} ipython3 +class City: + def __init__(self, name, latitude, longitude): + self.name = name + self.latitude = latitude + self.longitude = longitude + +cities_us = [ + City("New York", 40.7128, -74.0060), + City("Los Angeles", 34.0522, -118.2437), + City("Chicago", 41.8781, -87.6298), +] +cities_canada = [ + City("Toronto", 43.6510, -79.3470), + City("Vancouver", 49.2827, -123.1207), + City("Montreal", 45.5017, -73.5673), +] +``` + +Cartesian product: + +```{code-cell} ipython3 +class CityPair: + def __init__(self, city1, city2): + self.city1 = city1 + self.city2 = city2 + def __repr__(self): + return f"" + +pairs = [] + +for city_us in cities_us: + for city_canada in cities_canada: + pairs.append(CityPair(city_us, city_canada)) + +pairs +``` + +and "n choose k" (combinations without replacement): + +```{code-cell} ipython3 +all_cities = cities_us + cities_canada +``` + +```{code-cell} ipython3 +pairs = [] + +for i, city1 in enumerate(all_cities): + for city2 in all_cities[i + 1:]: + pairs.append(CityPair(city1, city2)) + +pairs +``` + +These kinds of combinations are common enough that there are special functions for them in Python's [itertools](https://docs.python.org/3/library/itertools.html) library: + +* [itertools.product](https://docs.python.org/3/library/itertools.html#itertools.product) for the Cartesian product, +* [itertools.combinations](https://docs.python.org/3/library/itertools.html#itertools.combinations) for combinations without replacement. 
+ +```{code-cell} ipython3 +import itertools +``` + +```{code-cell} ipython3 +list( + CityPair(city1, city2) + for city1, city2 in itertools.product(cities_us, cities_canada) +) +``` + +```{code-cell} ipython3 +list( + CityPair(city1, city2) + for city1, city2 in itertools.combinations(all_cities, 2) +) +``` + +Awkward Array has special functions for these kinds of combinations as well: + +* {func}`ak.cartesian` for the Cartesian product, +* {func}`ak.combinations` for combinations without replacement. + +```{code-cell} ipython3 +def instance_to_dict(city): + return {"name": city.name, "latitude": city.latitude, "longitude": city.longitude} + +cities_us = ak.Array([instance_to_dict(city) for city in cities_us]) +cities_canada = ak.Array([instance_to_dict(city) for city in cities_canada]) + +all_cities = ak.concatenate([cities_us, cities_canada]) +``` + +```{code-cell} ipython3 +ak.cartesian([cities_us, cities_canada], axis=0) +``` + +```{code-cell} ipython3 +ak.combinations(all_cities, 2, axis=0) +``` + +## Combinations with `axis=1` + +The default `axis` for these functions is 1, rather than 0, as in the motivating example. Problems that are big enough to benefit from vectorized combinations would produce very large output arrays, which likely wouldn't fit in any computer's memory. (Those problems are a better fit for SQL's `CROSS JOIN`; note that Python has a built-in interface to [sqlite3](https://docs.python.org/3/library/sqlite3.html) in-memory tables. You could even use SQL to populate an array of integer indexes to later slice an Awkward Array...) + +The most useful applications of Awkward Array combinatorics are on problems in which small, variable-length lists need to be combined—and there are many of them. This is `axis=1` (default) or `axis > 1`. 
+ +Here is an example of many Cartesian products: + +![](cartoon-cartesian.png) + +```{code-cell} ipython3 +numbers = ak.Array([[1, 2, 3], [], [4, 5], [6, 7, 8, 9]] * 250) +letters = ak.Array([["a", "b"], ["c"], ["d", "e", "f", "g"], ["h", "i"]] * 250) +``` + +```{code-cell} ipython3 +ak.cartesian([numbers, letters]) +``` + +Here is an example of many combinations without replacement: + +![](cartoon-combinations.png) + +```{code-cell} ipython3 +ak.combinations(numbers, 2) +``` + +## Calculations on pairs + +Usually, you'll want to do some calculation on each pair (or on each triple or quadruple, etc.). To get the left-side and right-side of each pair into separate arrays, so they can be used in a calculation, you could address the members of the tuple individually: + +```{code-cell} ipython3 +tuples = ak.combinations(numbers, 2) +``` + +```{code-cell} ipython3 +tuples["0"], tuples["1"] +``` + +Be sure to use integers in strings when addressing fields of a tuple ("columns") and plain integers when addressing array elements ("rows"). The above is different from + +```{code-cell} ipython3 +tuples[0], tuples[1] +``` + +Once they're in separate arrays, they can be used in a formula: + +```{code-cell} ipython3 +tuples["0"] * tuples["1"] +``` + +Another way to get fields of a tuple (or fields of a record) as individual arrays is to use {func}`ak.unzip`: + +```{code-cell} ipython3 +lefts, rights = ak.unzip(tuples) + +lefts * rights +``` + +## Maintaining groups + +Combinations like + +```{code-cell} ipython3 +ak.cartesian([np.arange(5), np.arange(4)], axis=0) +``` + +produce a flat list of combinations, but some calculations need triples with the same first or second value in the same list, for instance if they're going to {func}`ak.max` over lists ("find the best combination in which...") or compute {func}`ak.any` or {func}`ak.all` ("is there any combination in which...?"). The `nested` argument controls this. 
+ +```{code-cell} ipython3 +result = ak.cartesian([np.arange(5), np.arange(4)], axis=0, nested=True) +result +``` + +For instance, "is there any combination in which |_left_ - _right_| ≥ 3?" + +```{code-cell} ipython3 +lefts, rights = ak.unzip(result) + +ak.any(abs(lefts - rights) >= 3, axis=1) +``` diff --git a/docs/user-guide/how-to-combinatorics.md b/docs/user-guide/how-to-combinatorics.md index 10dd4dec39..674875fc3f 100644 --- a/docs/user-guide/how-to-combinatorics.md +++ b/docs/user-guide/how-to-combinatorics.md @@ -14,5 +14,10 @@ kernelspec: Combinatorics ============= -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-convert.md b/docs/user-guide/how-to-convert.md index ee0361582d..2818e67301 100644 --- a/docs/user-guide/how-to-convert.md +++ b/docs/user-guide/how-to-convert.md @@ -14,5 +14,10 @@ kernelspec: Converting arrays ================= -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-create-records.md b/docs/user-guide/how-to-create-records.md index aafd3449a8..d6a9a43a16 100644 --- a/docs/user-guide/how-to-create-records.md +++ b/docs/user-guide/how-to-create-records.md @@ -193,7 +193,7 @@ ak.cartesian( ) ``` -Names are for giving records [specialized behavior](how-to-specialize) through the {data}`ak.behavior` registry. These are like attaching methods to a class in the sense that all records with a particular name can be given Python properties and methods. +Names are for giving records specialized behavior through the {data}`ak.behavior` registry (see the {data}`ak.behavior` reference documentation for details). These are like attaching methods to a class in the sense that all records with a particular name can be given Python properties and methods. ```{code-cell} ipython3 class XYZRecord(ak.Record): diff --git a/docs/user-guide/how-to-create.md b/docs/user-guide/how-to-create.md index 3ab27797ad..2e34bbdd38 100644 --- a/docs/user-guide/how-to-create.md +++ b/docs/user-guide/how-to-create.md @@ -14,5 +14,10 @@ kernelspec: Creating arrays =============== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-examine-checking-validity.md b/docs/user-guide/how-to-examine-checking-validity.md index ae60f3688c..a3fab7bb31 100644 --- a/docs/user-guide/how-to-examine-checking-validity.md +++ b/docs/user-guide/how-to-examine-checking-validity.md @@ -14,10 +14,75 @@ kernelspec: How to ensure that an array is valid ==================================== -**This is a stub:** I intend to write this article, but haven't yet. +Awkward Arrays are complex data structures with their own rules for internal consistency. In principle, all data sources should serve valid array structures and all operations on valid structures should return valid structures. However, errors sometimes happen. -If you need it soon, create an issue saying so and I'll make it a higher priority. +Awkward Array's compiled routines check for validity in the course of computation, so that errors are reported as Python exceptions, rather than undefined behavior or segmentation faults. However, those errors can be hard to understand because the invalid structure might have been constructed much earlier in a program than the point where it is discovered. -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +For that reason, you have tools to check an Awkward Array's internal validity: {func}`ak.is_valid`, {func}`ak.validity_error`, and the `check_valid` argument to constructors like {obj}`ak.Array`. -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. 
+```{code-cell} ipython3 +import awkward as ak +``` + +To demonstrate, here's a valid array: + +```{code-cell} ipython3 +array_is_valid = ak.Array([[0, 1, 2], [], [3, 4], [5], [6, 7, 8, 9]]) +array_is_valid +``` + +and here is a copy of it that I will make invalid. + +```{code-cell} ipython3 +array_is_invalid = ak.copy(array_is_valid) +``` + +```{code-cell} ipython3 +array_is_invalid.layout +``` + +```{code-cell} ipython3 +array_is_invalid.layout.offsets.data +``` + +```{code-cell} ipython3 +array_is_invalid.layout.offsets.data[3] = 100 + +array_is_invalid.layout +``` + +The {func}`ak.is_valid` function only tells us whether an array is valid or not: + +```{code-cell} ipython3 +ak.is_valid(array_is_valid) +``` + +```{code-cell} ipython3 +ak.is_valid(array_is_invalid) +``` + +But the {func}`ak.validity_error` function tells us what the error was (if any). + +```{code-cell} ipython3 +ak.validity_error(array_is_valid) +``` + +```{code-cell} ipython3 +ak.validity_error(array_is_invalid) +``` + +If you suspect that an array is invalid or becomes invalid in the course of your program, you can either use these functions to check or construct arrays with `check_valid=True` in the {obj}`ak.Array` constructor. 
+ +```{code-cell} ipython3 +ak.Array(array_is_valid, check_valid=True) +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +ak.Array(array_is_invalid, check_valid=True) +``` diff --git a/docs/user-guide/how-to-examine-list-fields.md b/docs/user-guide/how-to-examine-list-fields.md index 6d5871e149..02d30ace51 100644 --- a/docs/user-guide/how-to-examine-list-fields.md +++ b/docs/user-guide/how-to-examine-list-fields.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.16.1 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -30,7 +30,6 @@ As seen in {doc}`how-to-create-records`, one of Awkward Array's most useful feat import awkward as ak import numpy as np - records = ak.Array( [ {"x": 0.014309631995020777, "y": 0.7077380205549498}, @@ -45,7 +44,11 @@ records = ak.Array( The type of an array gives an indication of the fields that it contains. 
We can see that the `records` array contains two fields `"x"` and `"y"`: ```{code-cell} ipython3 -records.type +print(records.type) +``` + +```{code-cell} ipython3 +records.type.show() ``` The {class}`ak.Array` object itself provides a convenient {attr}`ak.Array.fields` property that returns the list of field names @@ -78,7 +81,7 @@ tuples = ak.Array( These look very similar to records, but the fields are un-named: ```{code-cell} ipython3 -tuples.type +print(tuples.type) ``` Despite this, the {func}`ak.fields` function, and {attr}`ak.Array.fields` property both return non-empty lists of strings when used to query a tuple array: diff --git a/docs/user-guide/how-to-examine-simple-slicing.md b/docs/user-guide/how-to-examine-simple-slicing.md index d48ecee1d8..5ba8757528 100644 --- a/docs/user-guide/how-to-examine-simple-slicing.md +++ b/docs/user-guide/how-to-examine-simple-slicing.md @@ -14,10 +14,155 @@ kernelspec: How to examine an array with simple slicing =========================================== -**This is a stub:** I intend to write this article, but haven't yet. +Slicing data from an array is a basic operation in array-oriented data analysis. Awkward Array extends [NumPy's slicing capabilities](https://numpy.org/doc/stable/user/basics.indexing.html) to handle nested and ragged data structures. This tutorial illustrates several ways to slice an array. -If you need it soon, create an issue saying so and I'll make it a higher priority. +For a complete list of slicing features, see {func}`ak.Array.__getitem__`. -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. 
If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +## Basic slicing with ranges + +Much like NumPy, you can slice Awkward Arrays using simple ranges specified with colons (`start:stop:step`). Here’s an example of a regular (non-ragged) Awkward Array: + +```{code-cell} ipython3 +array = ak.Array(np.arange(10)**2) # squaring numbers for clarity +array +``` + +To select the first five elements: + +```{code-cell} ipython3 +array[:5] +``` + +To select from the fifth-to-last onward: + +```{code-cell} ipython3 +array[-5:] +``` + +To select every other element starting from the second: + +```{code-cell} ipython3 +array[1::2] +``` + +## Multiple ranges for multiple dimensions + +Similarly, for multidimensional data, + +```{code-cell} ipython3 +np_array3d = np.arange(2*3*5).reshape(2, 3, 5) +np_array3d +``` + +```{code-cell} ipython3 +array3d = ak.Array(np_array3d) +array3d +``` + +```{code-cell} ipython3 +np_array3d[1, ::2, 1:-1] +``` + +```{code-cell} ipython3 +array3d[1, ::2, 1:-1] +``` + +Just as with NumPy, a single colon (`:`) means "take everything from this dimension" and an ellipsis (`...`) expands to all dimensions between two slices. + +```{code-cell} ipython3 +array3d[:, :, 1:-1] +``` + +```{code-cell} ipython3 +array3d[..., 1:-1] +``` + +## Boolean array slices + +Like NumPy's [advanced slicing](https://numpy.org/doc/stable/user/basics.indexing.html#advanced-indexing), an array of booleans filters individual items. For instance, consider an array of booleans constructed by asking which elements of `array` are greater than 20: + +```{code-cell} ipython3 +array > 20 +``` + +When applied to `array` between square brackets, the boolean array eliminates all items in which `array > 20` is `False`: + +```{code-cell} ipython3 +array[array > 20] +``` + +Boolean array slicing is more powerful than range slicing because the `True` and `False` values may have any pattern. The following selects only even numbers. 
+ +```{code-cell} ipython3 +array % 2 == 0 +``` + +```{code-cell} ipython3 +array[array % 2 == 0] +``` + +## Integer array slices + +You can also use arrays of integer indices to select specific elements. + +```{code-cell} ipython3 +indices = ak.Array([2, 5, 3]) +array[indices] +``` + +If you are passing indexes directly between the `array`'s square brackets, be sure that they, too, are nested within square brackets (to be a list, rather than a tuple). + +```{code-cell} ipython3 +array[[2, 5, 3]] +``` + +In addition to picking elements out of order, you can pick the same element multiple times. + +```{code-cell} ipython3 +array[[2, 5, 5, 5, 5, 5, 3]] +``` + +Any slices that could be performed by boolean arrays can be performed by integer arrays, but only integer arrays can reorder and duplicate elements. + +## Ragged array slicing + +One of the unique features of Awkward Array is its ability to handle ragged arrays efficiently. Here's an example of a ragged array: + +```{code-cell} ipython3 +ragged_array = ak.Array([[10, 20, 30], [40], [], [50, 60]]) +ragged_array +``` + +You can slice individual sublists like this: + +```{code-cell} ipython3 +ragged_array[1] +``` + +And you can perform slices that operate across the sublists: + +```{code-cell} ipython3 +ragged_array[:, :2] # get first two elements of each sublist +``` + +Ranges and single indices mixed with slice notation allow the complexity of ragged slicing to express selecting ranges in nested lists, a feature unique to Awkward Array beyond NumPy’s capabilities. Here's an example where we skip the first element of each sublist that has more than one element: + +```{code-cell} ipython3 +ragged_array[ak.num(ragged_array) > 1, 1:] +``` + +## Boolean array slicing with missing data + +When working with boolean arrays for slicing, the arrays can include `None` (missing) values. 
Awkward Array handles missing data gracefully during boolean slicing: + +```{code-cell} ipython3 +bool_mask = ak.Array([True, None, False, True]) +array[bool_mask] +``` + +This ability to cope with missing data without failing or needing imputation is invaluable in data analysis tasks where missing data is common. diff --git a/docs/user-guide/how-to-examine-single-item.md b/docs/user-guide/how-to-examine-single-item.md index 5634bda1e3..4156ebdb77 100644 --- a/docs/user-guide/how-to-examine-single-item.md +++ b/docs/user-guide/how-to-examine-single-item.md @@ -14,10 +14,138 @@ kernelspec: How to examine a single item in detail ====================================== -**This is a stub:** I intend to write this article, but haven't yet. +It's often useful to pull out a single item from an array to inspect its contents, particularly in the early stages of a data analysis, to get a sense of the data's structure. This tutorial shows how to extract one item from an Awkward Array and examine it in different ways. -If you need it soon, create an issue saying so and I'll make it a higher priority. +For this example, we'll use the Chicago taxi trips dataset from [10 minutes to Awkward Array](https://awkward-array.org/doc/main/getting-started/10-minutes-to-awkward-array.html). Recall that this dataset includes information about trips by various taxis collected over a few years, enriched with GPS path data. -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) ++++ -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +## Loading the dataset + +First, let's load the dataset using the {func}`ak.from_parquet` function.
We will only load the first row group, for the sake of this demonstration: + +```{code-cell} ipython3 +import awkward as ak + +url = "https://pivarski-princeton.s3.amazonaws.com/chicago-taxi.parquet" +taxi = ak.from_parquet( + url, + row_groups=[0], + columns=["trip.km", "trip.begin.l*", "trip.end.l*", "trip.path.*"], +) +``` + +## What is a single item? + +The first "item" of this dataset could be a single taxi, which comprises many trips. + +```{code-cell} ipython3 +single_taxi = taxi[5] +single_taxi +``` + +Or it could be a single trip. + +```{code-cell} ipython3 +single_trip = single_taxi.trip[5] +single_trip +``` + +Or it could be a single latitude, longitude position along the path. + +```{code-cell} ipython3 +single_trip.path +``` + +```{code-cell} ipython3 +single_point = single_trip.path[5] +single_point +``` + +```{code-cell} ipython3 +print(f"longitude: {single_trip.begin.lon + single_point.londiff:.3f}") +print(f"latitude: {single_trip.begin.lat + single_point.latdiff:.3f}") +``` + +In Jupyter notebooks (and this documentation), the array contents are presented in a multi-line format with the data type below a dashed line. + +## Standard Python `repr` + +In a Python prompt, the format is more concise: + +```{code-cell} ipython3 +print(f"{single_taxi!r}") +``` + +```{code-cell} ipython3 +print(f"{single_trip!r}") +``` + +```{code-cell} ipython3 +print(f"{single_point!r}") +``` + +The long form can be obtained in a Python prompt with the `show` method: + +```{code-cell} ipython3 +single_taxi.show() +``` + +```{code-cell} ipython3 +single_trip.show() +``` + +```{code-cell} ipython3 +single_point.show() +``` + +## The `show` method + +The `show` method can take a `type=True` argument to include the type as well (at the top this time, because values are presented in the "most valuable real estate," which is the bottom of a print-out in the terminal, but the top in a Jupyter notebook). 
+ +```{code-cell} ipython3 +single_point.show(type=True) +``` + +Types also have a `show` method, so if you _only_ want the type, you can do + +```{code-cell} ipython3 +single_trip.type.show() +``` + +If you need to get this as a string or pass it to an output other than `sys.stdout`, use the `stream` parameter. + +```{code-cell} ipython3 +single_point.show(stream=None) +``` + +## Using `to_list` and Python’s `pprint` for a detailed view + +The `repr` and `show` representations print into a restricted space: 1 line (80 characters) for `repr`, and 20 lines (80 character width) for `show` without `type=True`. To do this, they replace data with ellipses (`...`) until it fits. + +You might want to ensure that you see everything. One way to do that is to turn the data into Python objects with {func}`ak.to_list` (or `to_list` or `tolist` as a method) and pretty-print them with Python's `pprint`. + +```{code-cell} ipython3 +import pprint + +trip_list = ak.to_list(single_trip) +pprint.pprint(trip_list) +``` + +Keep in mind that if you don't slice a small enough section of data, your terminal or Jupyter notebook may be overwhelmed with output! + +## Viewing data as JSON + +Another way you can dump everything is to convert the data to JSON with {func}`ak.to_json`. + +```{code-cell} ipython3 +print(ak.to_json(single_trip)) +``` + +That's not very readable, so we'll pass `num_indent_spaces=4` to add newlines and indentation, and `num_readability_spaces=1` to add spaces after commas (`,`) and colons (`:`). + +```{code-cell} ipython3 +print(ak.to_json(single_trip, num_indent_spaces=4, num_readability_spaces=1)) +``` + +{func}`ak.to_json` is also one of the bulk output methods, so it can write data to a file, as a single JSON object or as `line_delimited` JSON. 
diff --git a/docs/user-guide/how-to-examine.md b/docs/user-guide/how-to-examine.md index 55635026a2..a767217dca 100644 --- a/docs/user-guide/how-to-examine.md +++ b/docs/user-guide/how-to-examine.md @@ -14,5 +14,10 @@ kernelspec: Examining arrays ================ -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-filter-cut-mask.md b/docs/user-guide/how-to-filter-cut-mask.md index 69edc411fd..3b31bbf728 100644 --- a/docs/user-guide/how-to-filter-cut-mask.md +++ b/docs/user-guide/how-to-filter-cut-mask.md @@ -14,10 +14,162 @@ kernelspec: How to filter arrays: cutting vs. masking ========================================= -**This is a stub:** I intend to write this article, but haven't yet. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -If you need it soon, create an issue saying so and I'll make it a higher priority. +## The problem with slicing -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +When you write a mathematical formula using binary operators like `+` and `*`, or [NumPy universal functions (ufuncs)](https://numpy.org/doc/stable/reference/ufuncs.html) like `np.sqrt`, the shapes of nested lists must align. If the arrays in an expression were derived from a single array, this is often automatic. For instance, -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. 
+```{code-cell} ipython3 +original_array = ak.Array([ + [ + {"title": "zero", "x": 0, "y": 0}, + {"title": "one", "x": 1, "y": 1.1}, + {"title": "two", "x": 2, "y": 2.2}, + ], + [], + [ + {"title": "three", "x": 3, "y": 3.3}, + {"title": "four", "x": 4, "y": 4.4}, + ], + [ + {"title": "five", "x": 5, "y": 5.5}, + ], + [ + {"title": "six", "x": 6, "y": 6.6}, + {"title": "seven", "x": 7, "y": 7.7}, + {"title": "eight", "x": 8, "y": 8.8}, + {"title": "nine", "x": 9, "y": 9.9}, + ], +]) +``` + +```{code-cell} ipython3 +array_x = original_array.x +array_y = original_array.y +``` + +The `array_x` and `array_y` have the same number of lists and the same numbers of items in each list because they were both slices of the `original_array`. + +```{code-cell} ipython3 +array_x +``` + +```{code-cell} ipython3 +array_y +``` + +Thus, they can be used together in a mathematical formula. + +```{code-cell} ipython3 +array_x**2 + array_y**2 +``` + +However, if one array is sliced, or if the two arrays are sliced by different criteria, they would no longer line up: + +```{code-cell} ipython3 +sliced_x = array_x[array_x > 3] +sliced_y = array_y[array_y > 3] +``` + +```{code-cell} ipython3 +sliced_x +``` + +```{code-cell} ipython3 +sliced_y +``` + +Notice that the first was sliced with `array_x > 3` and the second was sliced with `array_y > 3`, and as a result, the third list differs in length between the two arrays: + +```{code-cell} ipython3 +sliced_x[2], sliced_y[2] +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +If we try to use these together, we get a ValueError: + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +sliced_x**2 + sliced_y**2 +``` + +Sometimes, these misalignments are overt, but sometimes they're subtle and embedded deep within a very large array. 
You can start investigating a problem like this with {func}`ak.num`: + +```{code-cell} ipython3 +ak.num(sliced_x) != ak.num(sliced_y) +``` + +```{code-cell} ipython3 +np.nonzero(ak.to_numpy(ak.num(sliced_x) != ak.num(sliced_y))) +``` + +But it's also possible to avoid them in the first place. + +## Masking with missing values + +The problem was that the two arrays' shapes changed differently; instead, we'll slice them in such a way that their shapes don't change at all. + +The {func}`ak.mask` function uses a boolean array like a slice, but takes values that line up with `False` and returns `None` instead of removing them. + +```{code-cell} ipython3 +ak.mask(array_x, array_x > 3) +``` + +It can also be accessed as an array property, with square brackets, so that it resembles a slice: + +```{code-cell} ipython3 +masked_x = array_x.mask[array_x > 3] +masked_y = array_y.mask[array_y > 3] +``` + +```{code-cell} ipython3 +masked_x +``` + +```{code-cell} ipython3 +masked_y +``` + +The results of these two masks can be used in a mathematical expression because they line up: + +```{code-cell} ipython3 +result = masked_x**2 + masked_y**2 +result +``` + +Now only one problem remains: the `None` (missing) values might be undesirable in the output. There are several ways to get rid of them: + +* {func}`ak.drop_none` eliminates `None`, like a slice, but it can be done once at the end of a calculation, +* {func}`ak.fill_none` replaces `None` with a chosen value, +* {func}`ak.flatten` removes list structure, and if the `None` values are at the level of a list (the ones in `result` aren't), they'll be removed too, +* {func}`ak.singletons` replaces `None` with `[]` and any other value `x` with `[x]`. The resulting lists all have length 0 or length 1. 
+ +```{code-cell} ipython3 +ak.drop_none(result, axis=1) +``` + +```{code-cell} ipython3 +ak.fill_none(result, -1, axis=1) +``` + +```{code-cell} ipython3 +ak.singletons(result, axis=1) +``` + +As a final note, the difference between using {func}`ak.drop_none` and slicing with the result of {func}`ak.is_none` is that {func}`ak.drop_none` also removes "missingness" from the data type; a slice does not. + +```{code-cell} ipython3 +result[~ak.is_none(result, axis=1)] +``` + +(Note the `?` for "option-type" before `float64`. This could have consequences, good or bad, at a later stage in processing.) diff --git a/docs/user-guide/how-to-filter-num.md b/docs/user-guide/how-to-filter-num.md index 055191460b..14f74530a4 100644 --- a/docs/user-guide/how-to-filter-num.md +++ b/docs/user-guide/how-to-filter-num.md @@ -14,10 +14,79 @@ kernelspec: How to filter arrays by number of items ======================================= -**This is a stub:** I intend to write this article, but haven't yet. +```{code-cell} ipython3 +import awkward as ak +``` -If you need it soon, create an issue saying so and I'll make it a higher priority. +In general, arrays are filtered using NumPy-like slicing. Numerical values can be filtered by numerical expressions in a way that is very similar to NumPy: -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +```{code-cell} ipython3 +array = ak.Array([ + [[0, 1.1, 2.2], []], [[3.3, 4.4]], [], [[5.5], [6.6, 7.7, 8.8, 9.9]] +]) +``` -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. 
+```{code-cell} ipython3 +array[array > 4] +``` + +but it's also common to want to filter arrays by the number of items in each list, for two reasons: + +* to exclude empty lists so that subsequent slices can select the item at index `0`, +* to make the list lengths rectangular for computational steps that require rectangular arrays (such as most forms of machine learning). + +There are two functions that provide the lengths of lists: {func}`ak.num` and {func}`ak.count`. To filter arrays, you'll most likely want {func}`ak.num`. + +## Use `ak.num` + +{func}`ak.num` can be applied at any `axis`, and it returns the number of items in lists at that `axis` with the same shape for all levels above that `axis`. + +```{code-cell} ipython3 +ak.num(array, axis=0) +``` + +```{code-cell} ipython3 +ak.num(array, axis=1) # default +``` + +```{code-cell} ipython3 +ak.num(array, axis=2) +``` + +Thus, if you want to select outer lists of `array` with length 2, you would use `axis=1`: + +```{code-cell} ipython3 +array[ak.num(array) == 2] +``` + +And if you want to select inner lists of `array` with length greater than 2, you would use `axis=2`: + +```{code-cell} ipython3 +array[ak.num(array, axis=2) > 2] +``` + +The ragged array of booleans that you get from comparing {func}`ak.num` with a number is exactly what is needed to slice the array. + +## Don't use `ak.count` + +By contrast, {func}`ak.count` returns structures that you can't use this way (for all but `axis=-1`): + +```{code-cell} ipython3 +ak.count(array, axis=None) # default +``` + +```{code-cell} ipython3 +ak.count(array, axis=0) +``` + +```{code-cell} ipython3 +ak.count(array, axis=1) +``` + +```{code-cell} ipython3 +ak.count(array, axis=2) # equivalent to axis=-1 for this array +``` + +Also, {func}`ak.num` can be used on arrays that contain records, whereas {func}`ak.count` (like other reducers) can't.
+ +As a reducer, {func}`ak.count` is intended to be used in a mathematical formula with other reducers, like {func}`ak.sum`, {func}`ak.max`, etc. (usually as a denominator). Its `axis` behavior matches that of other reducers, which is important for the shapes of nested lists to align. diff --git a/docs/user-guide/how-to-filter.md b/docs/user-guide/how-to-filter.md index 547f485af0..be3620a5d6 100644 --- a/docs/user-guide/how-to-filter.md +++ b/docs/user-guide/how-to-filter.md @@ -14,5 +14,10 @@ kernelspec: Filtering data ============== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-math-argminmax.md b/docs/user-guide/how-to-math-argminmax.md index b0648113ba..e21091c6d2 100644 --- a/docs/user-guide/how-to-math-argminmax.md +++ b/docs/user-guide/how-to-math-argminmax.md @@ -11,13 +11,146 @@ kernelspec: name: python3 --- -How to use argmin and argmax -============================ +Min/max/sort one array by another +================================= -**This is a stub:** I intend to write this article, but haven't yet. +A common task in data analysis is to select items from one array that minimize or maximize another, or to sort one array by the values of another. -If you need it soon, create an issue saying so and I'll make it a higher priority. +```{code-cell} ipython3 +import awkward as ak +``` -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +## Naive attempt goes wrong -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +For instance, in + +```{code-cell} ipython3 +data = ak.Array([ + [ + {"title": "zero", "x": 0, "y": 0}, + {"title": "two", "x": 2, "y": 2.2}, + {"title": "one", "x": 1, "y": 1.1}, + ], + [], + [ + {"title": "four", "x": 4, "y": 4.4}, + {"title": "three", "x": 3, "y": 3.3}, + ], + [ + {"title": "five", "x": 5, "y": 5.5}, + ], + [ + {"title": "eight", "x": 8, "y": 8.8}, + {"title": "six", "x": 6, "y": 6.6}, + {"title": "nine", "x": 9, "y": 9.9}, + {"title": "seven", "x": 7, "y": 7.7}, + ], +]) +``` + +you may want to score each record with a computed value, such as `x**2 + y**2`, and then select the record with the highest score from each list.
+ +```{code-cell} ipython3 +score = data.x**2 + data.y**2 +score +``` + +At first, it would seem that {func}`ak.argmax` is what you need to identify the item with the highest score from each list and select it from `data`. + +```{code-cell} ipython3 +best_index = ak.argmax(score, axis=1) +best_index +``` + +However, if you attempt to slice the `data` with this, you'll either get an indexing error or lists instead of records: + +```{code-cell} ipython3 +data[best_index] +``` + +## What happened? + +Following the logic for {doc}`reducers <how-to-math-reducing>`, the {func}`ak.argmax` function returns an array with one fewer dimension than the input: the `data` is an array of lists of records, but `best_index` is an array of integers. We want an array of lists of integers. + +The `keepdims=True` parameter can ensure that the output has the same number of dimensions as the input: + +```{code-cell} ipython3 +best_index = ak.argmax(score, axis=1, keepdims=True) +best_index +``` + +Now these integers are at the same level of depth as the records that we want to select: + +```{code-cell} ipython3 +result = data[best_index] +result +``` + +In the above, each length-1 list contains the record with the highest `score`. Even the empty list, for which the {func}`ak.argmax` is missing (`None`), is now a length-1 list containing `None`. We can remove this length-1 list structure with a slice: + +```{code-cell} ipython3 +result[:, 0] +``` + +To summarize this as a handy idiom, the way to get the record with maximum `data.x**2 + data.y**2` from an array of lists of records named `data` is + +```{code-cell} ipython3 +data[ak.argmax(data.x**2 + data.y**2, axis=1, keepdims=True)][:, 0] +``` + +For an array of lists of lists of records, `axis=2` and the final slice would be `[:, :, 0]`, and so on. + +## Sorting by another array + +In addition to selecting items corresponding to the minimum or maximum of some other array, we may want to sort by another array.
Just as {func}`ak.argmin` and {func}`ak.argmax` are the functions that would convey indexes from one array to another, {func}`ak.argsort` conveys sorted indexes from one array to another array. However, {func}`ak.argsort` always maintains the total number of dimensions, so we don't need to worry about `keepdims`. + +```{code-cell} ipython3 +sorted_indexes = ak.argsort(score) +sorted_indexes +``` + +```{code-cell} ipython3 +data[sorted_indexes] +``` + +This sorted data has the same type as `data`: + +```{code-cell} ipython3 +data.type.show() +``` + +It's exactly what we want. {func}`ak.argsort` is easier to use than {func}`ak.argmin` and {func}`ak.argmax`. + +## Getting the top _n_ items + +The {func}`ak.min`, {func}`ak.max`, {func}`ak.argmin`, and {func}`ak.argmax` functions select one extreme value. If you want the top _n_ items (with _n ≠ 1_), you can use {func}`ak.sort` or {func}`ak.argsort`, followed by a slice: + +```{code-cell} ipython3 +top2 = data[ak.argsort(score)][:, :2] +top2 +``` + +Notice, though, that not all of these lists have length 2. The lists with 0 or 1 input items have 0 or 1 output items: these lists have _up to_ length 2. That may be fine, but the example with {func}`ak.argmax`, above, resulted in `None` for an empty list. We could emulate that with {func}`ak.pad_none`. + +```{code-cell} ipython3 +padded = ak.pad_none(top2, 2, axis=1) +padded +``` + +The data type still says "`var *`", meaning that the lists are allowed to be variable-length, even though they happen to all have length 2. At this point, we might not care because that's all we need in order to convert these fields into NumPy arrays (e.g. 
for some machine learning process): + +```{code-cell} ipython3 +ak.to_numpy(padded.x) +``` + +```{code-cell} ipython3 +ak.to_numpy(padded.y) +``` + +Or we might want to force the data type to ensure that the lists have length 2, using {func}`ak.to_regular`, {func}`ak.enforce_type`, or just by passing `clip=True` in the original {func}`ak.pad_none`. + +```{code-cell} ipython3 +ak.to_regular(padded, axis=1) +``` + +(Now the list lengths are "`2 *`", rather than "`var *`".) diff --git a/docs/user-guide/how-to-math-broadcasting.md b/docs/user-guide/how-to-math-broadcasting.md index bb92efe766..2b1a3b7473 100644 --- a/docs/user-guide/how-to-math-broadcasting.md +++ b/docs/user-guide/how-to-math-broadcasting.md @@ -14,10 +14,234 @@ kernelspec: How Awkward broadcasting works ============================== -**This is a stub:** I intend to write this article, but haven't yet. +Functions that accept more than one array argument need to combine the elements of their array arguments somehow, particularly if the input arrays have different numbers of dimensions. That combination is called "broadcasting." Broadcasting in Awkward Array is very similar to [NumPy broadcasting](https://numpy.org/doc/stable/user/basics.broadcasting.html), with some minor differences described at the end of this section. -If you need it soon, create an issue saying so and I'll make it a higher priority. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +## Broadcasting in mathematical functions -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more.
+Any function that takes more than one array argument has to broadcast them together; a common case of that is in binary operators of a mathematical expression: + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], [], [4, 5]]) +array2 = ak.Array([10, 20, 30]) + +array1 + array2 +``` + +The single `10` in `array2` is added to every element of `[1, 2, 3]` in `array1`, and the single `30` is added to every element of `[4, 5]`. The single `20` in `array2` is not added to anything in `array1` because the corresponding list is empty. + +For broadcasting to be successful, the arrays need to have the same length in all dimensions except the one being broadcasted; `array1` and `array2` both had to be length 3 in the example above. That's why this example fails: + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +array1 = ak.Array([[1, 2, 3], [4, 5]]) +array2 = ak.Array([10, 20, 30]) + +array1 + array2 +``` + +The same applies to functions of multiple arguments that aren't associated with any binary operator: + +```{code-cell} ipython3 +array1 = ak.Array([[True, False, True], [], [False, True]]) +array2 = ak.Array([True, True, False]) + +np.logical_and(array1, array2) +``` + +And functions that aren't universal functions (ufuncs): + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], [], [4, 5]]) +array2 = ak.Array([10, 20, 30]) + +np.where(array1 % 2 == 0, array1, array2) +``` + +## Using `ak.broadcast_arrays` + +Sometimes, you may want to broadcast arrays to a common shape without performing an additional operation. 
The {func}`ak.broadcast_arrays` function allows you to do this: + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], [], [4, 5]]) +array2 = ak.Array([10, 20, 30]) + +ak.broadcast_arrays(array1, array2) +``` + +This code would align `array1` and `array2` into compatible shapes that can be used in subsequent operations, effectively showing how each element corresponds between the two original arrays. + ++++ + +## Missing data, heterogeneous data, and records + +One of the ways Awkward Arrays extend beyond NumPy is by allowing the use of `None` for missing data. These `None` values are broadcasted like empty lists: + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], None, [4, 5]]) +array2 = ak.Array([10, 20, 30]) + +array1 + array2 +``` + +Another difference from NumPy is that Awkward Arrays can contain data of mixed type, such as different numbers of dimensions. If numerical values _can_ be matched across such arrays, they are: + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], 4, 5]) +array2 = ak.Array([10, 20, 30]) + +array1 + array2 +``` + +Arrays containing records can also be broadcasted, though most mathematical operations cannot be applied to records. Here is an example using {func}`ak.broadcast_arrays`. + +```{code-cell} ipython3 +array1 = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}], +]) +array2 = ak.Array([10, 20, 30]) + +ak.broadcast_arrays(array1, array2) +``` + +## Differences from NumPy broadcasting + +Awkward Array broadcasting is identical to NumPy broadcasting in three respects: + +1. arrays with the same number of dimensions must match lengths (except for length 1) exactly, +2. length-1 dimensions expand like scalars (one to many), +3. for arrays with different numbers of dimensions, the smaller number of dimensions is expanded to match the largest number of dimensions. 
+ +Awkward Arrays with fixed-length dimensions—not "variable-length" or "ragged"—broadcast exactly like NumPy. + +Awkward Arrays with ragged dimensions expand the smaller number of dimensions on the right, whereas NumPy and Awkward-with-fixed-length expand the smaller number of dimensions on the left, when implementing point 3 above. This is the only difference. + +Here's a demonstration of NumPy broadcasting: + +```{code-cell} ipython3 +x = np.array([ + [1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], +]) +y = np.array([ + [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]], + [[100, 200, 300, 400], [500, 600, 700, 800], [900, 1000, 1100, 1200]], +]) + +x + y +``` + +And fixed-length Awkward Arrays made from these can be broadcasted the same way: + +```{code-cell} ipython3 +ak.Array(x) + ak.Array(y) +``` + +but only because the latter have completely regular dimensions, like their NumPy counterparts. + +```{code-cell} ipython3 +print(x.shape) +print(y.shape) +``` + +```{code-cell} ipython3 +print(ak.Array(x).type) +print(ak.Array(y).type) +``` + +In both NumPy and Awkward Array, `x` has fewer dimensions than `y`, so `x` is expanded on the _left_ from length-1 to length-2. + +However, if the Awkward Array has variable-length _type_, regardless of whether the actual lists have variable lengths, + +```{code-cell} ipython3 +print(ak.Array(x.tolist()).type) +print(ak.Array(y.tolist()).type) +``` + +this broadcasting does not work: + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +ak.Array(x.tolist()) + ak.Array(y.tolist()) +``` + +Instead of trying to add a dimension to the left of `x`'s shape, `(3, 4)`, to make `(2, 3, 4)`, the ragged broadcasting is trying to add a dimension to the right of `x`'s shape, and it doesn't line up. + +### Why does ragged broadcasting have to be different?
+ +Instead of adding a new dimension on the left, as NumPy and fixed-length Awkward Arrays do, ragged broadcasting tries to add a new dimension on the right in order to make it useful for emulating imperative code like + +```{code-cell} ipython3 +for x_i, y_i in zip(x, y): + for x_ij, y_ij in zip(x_i, y_i): + print("[", end=" ") + for y_ijk in y_ij: + print(x_ij + y_ijk, end=" ") + print("]") + print() +``` + +In the above, the value of `x_ij` is not varying while `y_ijk` varies in the innermost for-loop. In imperative code like this, it's natural for the outermost (left-most) dimensions of two nested lists to line up, while a scalar from the list with fewer dimensions, `x`, stays constant (is effectively duplicated) for each innermost `y` value. + +This is _not_ what NumPy's left-broadcasting does: + +```{code-cell} ipython3 +x + y +``` + +Notice that the numerical values are different! + +To get the behavior we expect from imperative code, we need to right-broadcast, which is what ragged broadcasting in Awkward Array does: + +```{code-cell} ipython3 +x = ak.Array([ + [1.1, 2.2, 3.3], + [], + [4.4, 5.5] +]) +y = ak.Array([ + [[1], [1, 2], [1, 2, 3]], + [], + [[1, 2, 3, 4], [1, 2, 3, 4, 5]] +]) + +for x_i, y_i in zip(x, y): + print("[") + for x_ij, y_ij in zip(x_i, y_i): + print(" [", end=" ") + for y_ijk in y_ij: + print(x_ij + y_ijk, end=" ") + print("]") + print("]\n") + +x + y +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +In summary, + +* NumPy left-broadcasts, +* Awkward Arrays with fixed-length lists left-broadcast, for consistency with NumPy, +* Awkward Arrays with variable-length lists right-broadcast, for consistency with imperative code. + +One way to control this is to ensure that all arrays involved in an expression have the same number of dimensions by explicitly expanding them. Implicit broadcasting only happens for arrays of different numbers of dimensions, or if the length of a dimension is 1. 
+ +But it might also be the case that your arrays have lists of equal length, so they seem to be regular like a NumPy array, yet their data type says that the lists _can_ be variable-length. Perhaps you got the NumPy-like data from a source that doesn't enforce fixed lengths, such as Python lists ({func}`ak.from_iter`), JSON ({func}`ak.from_json`), or Parquet ({func}`ak.from_parquet`). Check the array's {func}`ak.type` to see whether all dimensions are ragged (`var *`) or regular (some number `*`). + +The {func}`ak.from_regular` and {func}`ak.to_regular` functions toggle ragged (`var *`) and regular (some number `*`) dimensions, and {func}`ak.enforce_type` can be used to cast types like this in general. diff --git a/docs/user-guide/how-to-math-gpu.md b/docs/user-guide/how-to-math-gpu.md index fc021b0645..be682fd96b 100644 --- a/docs/user-guide/how-to-math-gpu.md +++ b/docs/user-guide/how-to-math-gpu.md @@ -1,23 +1,139 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- +Awkward Arrays on GPUs +====================== -How to use Awkward Arrays on GPUs -================================= +At the time when this article is being written, Awkward Arrays can be used on Nvidia GPUs on Linux, assuming that you have the [CuPy](https://cupy.dev/) package installed. In the future, we may support more GPU vendors on more platforms, so check [Awkward Array on GitHub](https://github.com/scikit-hep/awkward) for more up-to-date information. -**This is a stub:** I intend to write this article, but haven't yet. +```python +import awkward as ak +import numpy as np +import cupy as cp +``` -If you need it soon, create an issue saying so and I'll make it a higher priority. 
+## Copying data from RAM to a GPU -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +An Awkward Array might either reside in main memory (RAM), to be processed by the CPU, or in a GPU's global memory, to be processed by a GPU. Arrays can be copied between devices using the {func}`ak.to_backend` function, and their device can be checked with {func}`ak.backend`. -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +```python +array_cpu = ak.Array( + [[0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]] * 10000 +) +ak.backend(array_cpu) +``` + +``` +'cpu' +``` + +```python +array_gpu = ak.to_backend(array_cpu, "cuda") +ak.backend(array_gpu) +``` + +``` +'cuda' +``` + +The backend names, `"cpu"` and `"cuda"`, refer to the software that performs the calculations, which is written in conventional C code for the CPU and CUDA for (Nvidia) GPUs. By passing `"cpu"` to {func}`ak.to_backend`, you can copy an array from the GPU back to main memory. + +Arrays are never copied without an explicit {func}`ak.to_backend` call, so if you pass arrays from different devices to the same function, it will raise an error. + +## Array calculations on a GPU + +All of the `ak.*` functions (excluding `ak.str.*`), slices, and NumPy functions that work on Awkward Arrays on CPUs will work on Awkward Arrays on GPUs. At the time of writing, the implementation is nearing completion, so check [Awkward Array on GitHub](https://github.com/scikit-hep/awkward) if a function doesn't work. 
+ +Here's an example, using {func}`ak.num`: + +```python +ak.num(array_gpu) +``` + +``` +[3, + 0, + 2, + 1, + 4, + 3, + 0, + 2, + 1, + 4, + ..., + 0, + 2, + 1, + 4, + 3, + 0, + 2, + 1, + 4] +------------------- +type: 50000 * int64 +``` + +and here is a slice: + +```python +array_gpu[100:] +``` + +``` +[[0.0, 1.1, 2.2], + [], + [3.3, 4.4], + [5.5], + [6.6, 7.7, 8.8, 9.9], + [0.0, 1.1, 2.2], + [], + [3.3, 4.4], + [5.5], + [6.6, 7.7, 8.8, 9.9], + ..., + [], + [3.3, 4.4], + [5.5], + [6.6, 7.7, 8.8, 9.9], + [0.0, 1.1, 2.2], + [], + [3.3, 4.4], + [5.5], + [6.6, 7.7, 8.8, 9.9]] +--------------------------- +type: 49900 * var * float64 +``` + +All [NumPy universal functions (ufuncs)](https://numpy.org/doc/stable/reference/ufuncs.html) _for which there is a CuPy equivalent_ also work: + +```python +np.sqrt(array_gpu) +``` + +``` +[[0.0, 1.0488088481701516, 1.4832396974191326], + [], + [1.816590212458495, 2.0976176963403033], + [2.345207879911715], + [2.569046515733026, 2.7748873851023217, ..., 3.146426544510455], + [0.0, 1.0488088481701516, 1.4832396974191326], + [], + [1.816590212458495, 2.0976176963403033], + [2.345207879911715], + [2.569046515733026, 2.7748873851023217, ..., 3.146426544510455], + ..., + [], + [1.816590212458495, 2.0976176963403033], + [2.345207879911715], + [2.569046515733026, 2.7748873851023217, ..., 3.146426544510455], + [0.0, 1.0488088481701516, 1.4832396974191326], + [], + [1.816590212458495, 2.0976176963403033], + [2.345207879911715], + [2.569046515733026, 2.7748873851023217, ..., 3.146426544510455]] +----------------------------------------------------------------- +type: 50000 * var * float64 +``` + +## JIT-compilation in Numba + +Just as Awkward Arrays in main memory can be iterated over in functions that have been JIT-compiled by [Numba](https://numba.pydata.org/), Awkward Arrays on GPUs can be iterated over in functions JIT-compiled by `@numba.cuda.jit`. 
The same restrictions apply (iteration only; no `ak.*` functions); see {doc}`how-to-use-in-numba-cuda.md`. diff --git a/docs/user-guide/how-to-math-numpy.md b/docs/user-guide/how-to-math-numpy.md index 53669348c4..b54e469d7c 100644 --- a/docs/user-guide/how-to-math-numpy.md +++ b/docs/user-guide/how-to-math-numpy.md @@ -14,10 +14,174 @@ kernelspec: How to perform computations with NumPy ====================================== -**This is a stub:** I intend to write this article, but haven't yet. +Awkward Array's integration with NumPy allows you to use NumPy's array functions on data with complex structures, including ragged and heterogeneous arrays. -If you need it soon, create an issue saying so and I'll make it a higher priority. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +## Universal functions (ufuncs) -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +[NumPy's universal functions (ufuncs)](https://numpy.org/doc/stable/reference/ufuncs.html) are functions that operate elementwise on arrays. They are broadcasting-aware, so they can naturally handle data structures like ragged arrays that are common in Awkward Arrays. + +Here's an example of applying `np.sqrt`, a NumPy ufunc, to an Awkward Array: + +```{code-cell} ipython3 +data = ak.Array([[1, 4, 9], [], [16, 25]]) + +np.sqrt(data) +``` + +Notice that the ufunc applies to the numeric data, passing through all dimensions of nested lists, even if those lists have variable length. This also applies to heterogeneous data, in which the data are not all of the same type. 
+ +```{code-cell} ipython3 +data = ak.Array([[1, 4, 9], [], 16, [[[25]]]]) + +np.sqrt(data) +``` + +Unary and binary operations on Awkward Arrays, such as `+`, `-`, `>`, and `==`, are actually calling NumPy ufuncs. For instance, `+`: + +```{code-cell} ipython3 +array1 = ak.Array([[1, 2, 3], [], [4, 5]]) +array2 = ak.Array([[10, 20, 30], [], [40, 50]]) + +array1 + array2 +``` + +is actually `np.add`: + +```{code-cell} ipython3 +np.add(array1, array2) +``` + +### Arrays with record fields + +Ufuncs can only be applied to numerical data in lists, not records. + +```{code-cell} ipython3 +records = ak.Array([{"x": 4, "y": 9}, {"x": 16, "y": 25}]) +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +np.sqrt(records) +``` + +However, you can pull each field out of a record and apply the ufunc to it. + +```{code-cell} ipython3 +np.sqrt(records.x) +``` + +```{code-cell} ipython3 +np.sqrt(records.y) +``` + +If you want the result wrapped up in a new array of records, you can use {func}`ak.zip` to do that. + +```{code-cell} ipython3 +ak.zip({"x": np.sqrt(records.x), "y": np.sqrt(records.y)}) +``` + +Here's an idiom that would apply a ufunc to every field individually, and then wrap up the result as a new record with the same fields (using {func}`ak.fields`, {func}`ak.unzip`, and {func}`ak.zip`): + +```{code-cell} ipython3 +ak.zip({key: np.sqrt(value) for key, value in zip(ak.fields(records), ak.unzip(records))}) +``` + +The reason that Awkward Array does not do this automatically is to prevent mistakes: it's common for records to represent coordinates of data points, and if the coordinates are not Cartesian, the one-to-one application is not correct. + ++++ + +### Using non-NumPy ufuncs + +NumPy-compatible ufuncs exist in other libraries, like SciPy, and can be applied in the same way.
Here’s how you can apply `scipy.special.gamma` and `scipy.special.erf`: + +```{code-cell} ipython3 +import scipy.special + +data = ak.Array([[0.1, 0.2, 0.3], [], [0.4, 0.5]]) +``` + +```{code-cell} ipython3 +scipy.special.gamma(data) +``` + +```{code-cell} ipython3 +scipy.special.erf(data) +``` + +You can even create your own ufuncs using Numba's `@nb.vectorize`: + +```{code-cell} ipython3 +import numba as nb + +@nb.vectorize +def gcd_euclid(x, y): + # computation that is more complex than a formula + while y != 0: + x, y = y, x % y + return x +``` + +```{code-cell} ipython3 +x = ak.Array([[10, 20, 30], [], [40, 50]]) +y = ak.Array([[5, 40, 15], [], [24, 255]]) +``` + +```{code-cell} ipython3 +gcd_euclid(x, y) +``` + +Since Numba has JIT-compiled this function, it would run much faster on large arrays than custom Python code. + ++++ + +## Non-ufunc NumPy functions + +Some NumPy functions don't satisfy the ufunc protocol, but have been implemented for Awkward Arrays because they are useful. You can tell when a NumPy function has an Awkward Array implementation when a function with the same name and signature exists in both libraries. + +For instance, `np.where` works on Awkward Arrays because {func}`ak.where` exists: + +```{code-cell} ipython3 +np.where(y % 2 == 0, x, y) +``` + +(The above selects elements from `x` when `y` is even and elements from `y` when `y` is odd.) + +Similarly, `np.concatenate` works on Awkward Arrays because {func}`ak.concatenate` exists: + +```{code-cell} ipython3 +np.concatenate([x, y]) +``` + +```{code-cell} ipython3 +np.concatenate([x, y], axis=1) +``` + +Other NumPy functions, without an equivalent in the Awkward Array library, will work only if the Awkward Array can be converted into a NumPy array. 
+ +Ragged arrays can't be converted to NumPy: + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +np.fft.fft(ak.Array([[1.1, 2.2, 3.3], [], [7.7, 8.8, 9.9]])) +``` + +But arrays with equal-sized lists can: + +```{code-cell} ipython3 +np.fft.fft(ak.Array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) +``` diff --git a/docs/user-guide/how-to-math-reducing.md b/docs/user-guide/how-to-math-reducing.md index 712c42f8e3..8b3857917d 100644 --- a/docs/user-guide/how-to-math-reducing.md +++ b/docs/user-guide/how-to-math-reducing.md @@ -14,10 +14,246 @@ kernelspec: How to reduce dimensions (sum/min/any/all) ========================================== -**This is a stub:** I intend to write this article, but haven't yet. +After elementwise functions, dimension-reducer functions are the most commonly used. These functions replace a list of numbers with a single, scalar number by adding, multiplying, minimizing, maximizing, or performing logical-or ("any") or logical-and ("all"). -If you need it soon, create an issue saying so and I'll make it a higher priority. +These are also called aggregation functions; in relational databases, SQL, and data-frames, aggregations are applied after a "group by" operation. Awkward Array doesn't have "group by" operations; lists are already grouped. -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +```{code-cell} ipython3 +import awkward as ak +import numpy as np +``` -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +## First reducer: `ak.sum` + +To illustrate all of these functions, let's consider addition. 
Given an array: + +```{code-cell} ipython3 +array = ak.Array([[1, 2, 3], [4, 5], [], [6]]) +``` + +{func}`ak.sum` with no arguments adds all of the values in the nested lists, just like `np.sum`. + +```{code-cell} ipython3 +ak.sum(array) +``` + +With Awkward Arrays, it's usually more useful to supply an `axis` argument to reduce one dimension, rather than all dimensions. + +For reasons that will be explained below, `axis=-1` is the most frequently useful. + +```{code-cell} ipython3 +ak.sum(array, axis=-1) +``` + +### The `axis` argument + +Before getting deeper into the `axis` argument, let's consider a NumPy array with more dimensions. + +```{code-cell} ipython3 +array3d = np.array([ + [ + [ 1, 2, 3, 4, 5], + [ 10, 20, 30, 40, 50], + [ 100, 200, 300, 400, 500], + ], + [ + [0.1 , 0.2 , 0.3 , 0.4 , 0.5 ], + [0.01 , 0.02 , 0.03 , 0.04 , 0.05 ], + [0.001, 0.002, 0.003, 0.004, 0.005], + ], +]) + +with np.printoptions(suppress=True): + print(array3d) +``` + +This array has 3 dimensions, so in addition to `axis=None` (reduce everything to a scalar), there are 3 possible axis values. + +The first case, `axis=0`, adds the first 3×5 block to the second 3×5 block, i.e. summing over the first (length-2) dimension. Thus, the `1` is added to `0.1`, the `2` is added to `0.2`, and so on until the `500` is added to `0.005`. + +```{code-cell} ipython3 +with np.printoptions(suppress=True): + print(np.sum(array3d, axis=0)) +``` + +The second case, `axis=1`, adds vertically within each 3×5 block, i.e. summing over the second (length-3) dimension. What's left are two lists of length 5. + +```{code-cell} ipython3 +with np.printoptions(suppress=True): + print(np.sum(array3d, axis=1)) +``` + +The third case, `axis=2`, adds horizontally within each 3×5 block, i.e. summing over the third (length-5) dimension. What's left are two lists of length 3. 
+ +```{code-cell} ipython3 +with np.printoptions(suppress=True): + print(np.sum(array3d, axis=2)) +``` + +Since negative `axis` counts from the other end of the scale, + +* `axis=0` is equivalent to `axis=-3` +* `axis=1` is equivalent to `axis=-2` +* `axis=2` is equivalent to `axis=-1`. + +### The `axis` argument with ragged lists + +Awkward Arrays allow the lengths of lists in an array to differ, so we can have + +```{code-cell} ipython3 +array_ragged = ak.Array([ + [ 1, 2, 3 ], + [ 10, 20 ], + [100, 200, 300, 400], +]) +array_ragged +``` + +As before, `axis=-1` sums over the innermost lists, replacing each of the 3 horizontal rows with a sum. + +```{code-cell} ipython3 +ak.sum(array_ragged, axis=-1) +``` + +And `axis=-2` sums vertically, replacing each of the 4 vertical columns with a sum. Since the list lengths differ, some of the places where we might expect to see a value are empty gaps—they contribute nothing to the result. + +```{code-cell} ipython3 +ak.sum(array_ragged, axis=0) +``` + +We also have to choose a convention: should the values be left-aligned or right-aligned within their lists? Awkward Array chooses left-aligned. + +In ragged data from real datasets, summing over whole lists usually has more meaning than summing over parts of different lists, so `axis=-1` is usually the most meaningful choice of `axis`. + ++++ + +### The `axis` argument with missing data + ++++ + +Just as empty gaps contribute nothing to the sum, missing values (`None`) don't contribute anything, either. + +```{code-cell} ipython3 +array_ragged = ak.Array([ + [None, None, 3, 4], + [ 10, None, 30 ], + [ 100, 200, 300, 400], +]) +array_ragged +``` + +`axis=-1` sums over each inner list, horizontally, replacing it with a scalar. + +```{code-cell} ipython3 +ak.sum(array_ragged, axis=-1) +``` + +And `axis=-2` sums over the outer dimension, vertically.
+ +```{code-cell} ipython3 +ak.sum(array_ragged, axis=-2) +``` + +For {func}`ak.sum`, each `None` has the same effect as a `0` value, for {func}`ak.prod` (multiplication), each `None` has the same effect as a `1` value, etc. + +## The `keepdims` argument + +Sometimes, you want to replace lists with a length-1 list, rather than a scalar. `keepdims=True` does that. + +```{code-cell} ipython3 +ak.sum(array_ragged, axis=-1, keepdims=True) +``` + +```{code-cell} ipython3 +ak.sum(array_ragged, axis=-2, keepdims=True) +``` + +The `keepdims` argument is particularly useful for {func}`ak.argmin` and {func}`ak.argmax`, which return positions in a list where the value is minimized or maximized. Those positions can only be used as slice indexes if they're at the right nesting level, which `keepdims=True` maintains. + +## Other reducers + +* The {func}`ak.prod` reducer multiplies, rather than adding. +* {func}`ak.min` and {func}`ak.max` minimize and maximize, returning `None` for empty lists. +* {func}`ak.argmin` and {func}`ak.argmax` return the index positions of the minimum or maximum value, with `None` for empty lists. +* {func}`ak.nansum`, {func}`ak.nanprod`, {func}`ak.nanmin`, {func}`ak.nanmax`, {func}`ak.nanargmin`, and {func}`ak.nanargmax` ignore floating-point `nan` values before operating, the way that all reducers ignore `None` values before operating. +* {func}`ak.count_nonzero` counts non-zero values. +* {func}`ak.count` simply counts values. In NumPy, there's no need for such a function because it would return constants (drawn from the NumPy array's `shape`), but for ragged arrays, it counts the number of values that enter into a reduction. {func}`ak.num` also returns lengths of lists, but in a way that's more useful for slicing; {func}`ak.count` is useful as the denominator of expressions in which another reducer (with the same `axis` and `keepdims` choices) is in the numerator. 
+* {func}`ak.any` and {func}`ak.all` reduce like logical-or and logical-and, which makes them particularly useful in slices (below). + +## Reducing over "any" and "all" + +{func}`ak.any` and {func}`ak.all` reduce boolean arrays, asking if a predicate is satisfied by "any" item or "all" items, respectively. + +```{code-cell} ipython3 +array_bool = ak.Array([ + [False, False, True, True], + [False, True, False, True], + [False, True, True, True], +]) +array_bool +``` + +```{code-cell} ipython3 +ak.any(array_bool, axis=-1) +``` + +```{code-cell} ipython3 +ak.any(array_bool, axis=-2) +``` + +```{code-cell} ipython3 +ak.all(array_bool, axis=-1) +``` + +```{code-cell} ipython3 +ak.all(array_bool, axis=-2) +``` + +Since logical-or is like addition of booleans and logical-and is like multiplication, these reducers could have been replaced with {func}`ak.sum` and {func}`ak.prod`, but they're very useful to have because they make some boolean-array slices easier to read. + +```{code-cell} ipython3 +array = ak.Array([[0, 1, 2], [], [-3, 4], [-5], [-6, -7, -8, -9]]) +array +``` + +Select _whole lists_ if _any_ of their values are negative: + +```{code-cell} ipython3 +array[ak.any(array < 0, axis=-1)] +``` + +Select _whole lists_ if _all_ of their values are negative: + +```{code-cell} ipython3 +array[ak.all(array < 0, axis=-1)] +``` + +(If a list is empty, all of its elements satisfy a constraint.) + +In both cases above, the selection can be read like an English sentence, "select lists if _any_..." or "select lists if _all_...". + +## Heterogeneous data and records cannot be reduced + +These two kinds of data types are not reducible. 
Heterogeneous data allows an array to have multiple numbers of dimensions, so the problem is ill-posed: + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +ak.sum(ak.Array([[1.1, 2.2, 3.3], [], 4.4, 5.5])) +``` + +And records are sometimes used to represent data with coordinates; applying {func}`ak.sum` to non-Cartesian coordinates would be a subtle error. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +ak.sum(ak.Array([{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}]), axis=-1) +``` diff --git a/docs/user-guide/how-to-math-statistics.md b/docs/user-guide/how-to-math-statistics.md index 1f8d185a74..340b2a9d2d 100644 --- a/docs/user-guide/how-to-math-statistics.md +++ b/docs/user-guide/how-to-math-statistics.md @@ -14,10 +14,138 @@ kernelspec: How to compute statistics on dimensions (mean/var/std) ====================================================== -**This is a stub:** I intend to write this article, but haven't yet. +Awkward Array provides several functions for statistical analysis that operate on ragged arrays. These are dimensional reducers, like {func}`ak.sum`, {func}`ak.min`, {func}`ak.any`, and {func}`ak.all` in the {doc}`previous section <how-to-math-reducing>`, but they compute quantities such as mean, variance, standard deviation, and higher moments, as well as functions for linear regression and correlation. -If you need it soon, create an issue saying so and I'll make it a higher priority. +```{code-cell} +import awkward as ak +import numpy as np +``` -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +## Basic statistical functions -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to.
If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +### Mean, variance, and standard deviation + +To compute the [mean](https://en.wikipedia.org/wiki/Mean), [variance](https://en.wikipedia.org/wiki/Variance), and [standard deviation](https://en.wikipedia.org/wiki/Standard_deviation) of an array, use {func}`ak.mean`, {func}`ak.var`, and {func}`ak.std`. Unlike the NumPy functions with the same names, these functions apply to arrays with variable-length dimensions and missing values (but not heterogeneous dimensionality or records; see the last section of {doc}`reducing <how-to-math-reducing>`). + +```{code-cell} +array = ak.Array([[0, 1.1, 2.2], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]]) +``` + +```{code-cell} +ak.mean(array, axis=-1) +``` + +```{code-cell} +ak.var(array, axis=-1) +``` + +```{code-cell} +ak.std(array, axis=-1) +``` + +These functions also have counterparts that ignore `nan` values: {func}`ak.nanmean`, {func}`ak.nanvar`, and {func}`ak.nanstd`. + +```{code-cell} +array_with_nan = ak.Array([[0, 1.1, np.nan], [3.3, 4.4], [np.nan], [6.6, np.nan, 8.8, 9.9]]) +``` + +```{code-cell} +ak.nanmean(array_with_nan, axis=-1) +``` + +```{code-cell} +ak.nanvar(array_with_nan, axis=-1) +``` + +```{code-cell} +ak.nanstd(array_with_nan, axis=-1) +``` + +Note that floating-point `nan` is different from missing values (`None`). Unlike `nan`, missing values can appear in integer arrays, and whole lists can be missing as well. For both types of functions, missing values are ignored if they are in the dimension being reduced or pass through a function to the output otherwise, just as the `nan`-ignoring functions ignore `nan`. + +```{code-cell} +array_with_None = ak.Array([[0, 1.1, 2.2], None, [None, 4.4], [5.5], [6.6, np.nan, 8.8, 9.9]]) +``` + +```{code-cell} +ak.mean(array_with_None, axis=-1) +``` + +```{code-cell} +ak.nanmean(array_with_None, axis=-1) +``` + +### Moments + +For higher moments, use {func}`ak.moment`.
For example, to calculate the third [moment](https://en.wikipedia.org/wiki/Moment_(mathematics)) (skewness), you would do the following: + +```{code-cell} +ak.moment(array, 3, axis=-1) +``` + +## Correlation and covariance + +For [correlation](https://en.wikipedia.org/wiki/Correlation) and [covariance](https://en.wikipedia.org/wiki/Covariance) between two arrays, use {func}`ak.corr` and {func}`ak.covar`. + +```{code-cell} +array_x = ak.Array([[0, 1.1, 2.2], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]]) +array_y = ak.Array([[0, 1, 2], [3, 4], [5], [6, 7, 8, 9]]) +``` + +```{code-cell} +ak.corr(array_x, array_y, axis=-1) +``` + +```{code-cell} +ak.covar(array_x, array_y, axis=-1) +``` + +## Linear fits + +To perform [linear fits](https://en.wikipedia.org/wiki/Linear_regression), use {func}`ak.linear_fit`. Instead of reducing each list to a number, it reduces each list to a record that has `intercept`, `slope`, `intercept_error`, and `slope_error` fields. (These "errors" are uncertainty estimates of the intercept and slope parameters, assuming that the underlying generator of data is truly linear.) + +```{code-cell} +ak.linear_fit(array_x, array_y, axis=-1) +``` + +[Ordinary least squares](https://en.wikipedia.org/wiki/Ordinary_least_squares) linear fits can be computed by a formula, without approximation or iteration, so it can be thought of like computing the mean or other moments, but with greater fidelity to the data because it models a general correlation. For example, some statistical models achieve high granularity by segmenting a dataset in some meaningful way and then summarizing the data in each segment (such as a regression [decision tree](https://en.wikipedia.org/wiki/Decision_tree)). Performing linear fits on each segment fine-tunes the model more than just taking the average of data in each segment. + ++++ + +## Peak to peak + +The peak-to-peak function {func}`ak.ptp` can be used to find the range (maximum - minimum) of data along an axis.
It's more convenient than calling {func}`ak.min` and {func}`ak.max` separately. + +```{code-cell} +ak.ptp(array, axis=-1) +``` + +## Softmax + +The [softmax](https://en.wikipedia.org/wiki/Softmax_function) function is useful in machine learning, particularly in the context of logistic regression and neural networks. Awkward Array provides {func}`ak.softmax` to compute softmax values of an array. + +Note that this function does not _reduce_ a dimension; it computes one output value for each input value, but each output value is normalized by all the other values in the same list. + +Also note that only `axis=-1` (innermost lists) is supported by {func}`ak.softmax`. + +```{code-cell} +ak.softmax(array, axis=-1) +``` + +## Example uses in data analysis + +Here is an example that normalizes an input array to have an overall mean of 0 and standard deviation of 1: + +```{code-cell} +array = ak.Array([[1.1, 2.2, 3.3], [4.4, 5.5], [6.6, 7.7, 8.8, 9.9]]) +``` + +```{code-cell} +(array - ak.mean(array)) / ak.std(array) +``` + +And here's another example that normalizes each _list_ within the array to each have a mean of 0 and a standard deviation of 1: + +```{code-cell} +(array - ak.mean(array, axis=-1)) / ak.std(array, axis=-1) +``` diff --git a/docs/user-guide/how-to-math.md b/docs/user-guide/how-to-math.md index f93ed1f532..2ab0e399c3 100644 --- a/docs/user-guide/how-to-math.md +++ b/docs/user-guide/how-to-math.md @@ -14,5 +14,10 @@ kernelspec: Numerical math ============== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! 
⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-restructure-concatenate.md b/docs/user-guide/how-to-restructure-concatenate.md index a98038b513..e9f47fdef0 100644 --- a/docs/user-guide/how-to-restructure-concatenate.md +++ b/docs/user-guide/how-to-restructure-concatenate.md @@ -14,10 +14,190 @@ kernelspec: How to concatenate and interleave arrays ======================================== -**This is a stub:** I intend to write this article, but haven't yet. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +import pandas as pd +``` -If you need it soon, create an issue saying so and I'll make it a higher priority. +## Simple concatenation -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +{func}`ak.concatenate` is an analog of [np.concatenate](https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html) (in fact, you can use [np.concatenate](https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html) where you mean {func}`ak.concatenate`). However, it applies to data of arbitrary data structures: -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +```{code-cell} ipython3 +array1 = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}], +]) +array2 = ak.Array([ + [{"x": 6.6, "y": [1, 2, 3, 4, 5, 6]}], + [{"x": 7.7, "y": [1, 2, 3, 4, 5, 6, 7]}], +]) +``` + +```{code-cell} ipython3 +ak.concatenate([array1, array2]) +``` + +The arrays can even have different data types, in which case the output has union-type. 
+ +```{code-cell} ipython3 +array3 = ak.Array([{"z": None}, {"z": 0}, {"z": 123}]) +``` + +```{code-cell} ipython3 +ak.concatenate([array1, array2, array3]) +``` + +Keep in mind, however, that some operations can't deal with union-types (heterogeneous data), so you might want to avoid this. + +## Interleaving lists with `axis > 0` + +The default `axis=0` returns an array whose length is equal to the sum of the lengths of the input arrays. + +Other `axis` values combine lists within the arrays, as long as the arrays have the same lengths. + +```{code-cell} ipython3 +array1 = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) +array2 = ak.Array([[10, 20], [30], [40, 50, 60, 70]]) +``` + +```{code-cell} ipython3 +len(array1), len(array2) +``` + +```{code-cell} ipython3 +ak.concatenate([array1, array2], axis=1) +``` + +This can be used in some non-trivial ways: sometimes a problem that doesn't seem to have anything to do with concatenation can be solved this way. + +For instance, suppose that you have to pad some lists so that they start and stop with 0 (for some window-averaging procedure, perhaps). You can make the pad as a new array: + +```{code-cell} ipython3 +pad = np.zeros(len(array1))[:, np.newaxis] +pad +``` + +and concatenate it with `axis=1` to get the desired effect: + +```{code-cell} ipython3 +ak.concatenate([pad, array1, pad], axis=1) +``` + +Or similarly, to double the first value and double the last value (without affecting empty lists): + +```{code-cell} ipython3 +ak.concatenate([array1[:, :1], array1, array1[:, -1:]], axis=1) +``` + +The same applies for more deeply nested lists and `axis > 1`. Remember that `axis=-1` starts counting from the innermost dimension, outward. 
+ +## Emulating NumPy's "stack" functions + +[np.stack](https://numpy.org/doc/stable/reference/generated/numpy.stack.html), [np.hstack](https://numpy.org/doc/stable/reference/generated/numpy.hstack.html), [np.vstack](https://numpy.org/doc/stable/reference/generated/numpy.vstack.html), and [np.dstack](https://numpy.org/doc/stable/reference/generated/numpy.dstack.html) are concatenations with [np.newaxis](https://numpy.org/doc/stable/reference/constants.html#numpy.newaxis) (reshaping to add a dimension of length 1). + +```{code-cell} ipython3 +a = np.array([1, 2, 3]) +b = np.array([4, 5, 6]) +``` + +```{code-cell} ipython3 +np.stack([a, b]) +``` + +```{code-cell} ipython3 +np.concatenate([a[np.newaxis], b[np.newaxis]], axis=0) +``` + +```{code-cell} ipython3 +np.stack([a, b], axis=1) +``` + +```{code-cell} ipython3 +np.concatenate([a[:, np.newaxis], b[:, np.newaxis]], axis=1) +``` + +Since {func}`ak.concatenate` has the same interface as [np.concatenate](https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html) and Awkward Arrays can also be sliced with [np.newaxis](https://numpy.org/doc/stable/reference/constants.html#numpy.newaxis), they can be stacked the same way, with the addition of arbitrary data structures. + +```{code-cell} ipython3 +a = ak.Array([[1], [1, 2], [1, 2, 3]]) +b = ak.Array([[4], [4, 5], [4, 5, 6]]) +``` + +```{code-cell} ipython3 +ak.concatenate([a[np.newaxis], b[np.newaxis]], axis=0) +``` + +```{code-cell} ipython3 +ak.concatenate([a[:, np.newaxis], b[:, np.newaxis]], axis=1) +``` + +## Differences from Pandas + +Concatenation in Awkward Array combines arrays lengthwise: by adding the lengths of the arrays or adding the lengths of lists within an array. It does not refer to adding fields to a record (that is, "adding columns to a table"). To add fields to a record, see {func}`ak.zip` or {func}`ak.Array.__setitem__` in {doc}`how to zip/unzip and project <how-to-restructure-zip-project>` and {doc}`how to add fields <how-to-restructure-add-fields>`.
This is important to note because [pandas.concat](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) does both, depending on its `axis` argument (and there's no equivalent in NumPy). + +Here's a table-like example of concatenation in Awkward Array: + +```{code-cell} ipython3 +array1 = ak.Array({"column": [[1, 2, 3], [], [4, 5]]}) +array2 = ak.Array({"column": [[1.1, 2.2, 3.3], [], [4.4, 5.5]]}) +``` + +```{code-cell} ipython3 +array1 +``` + +```{code-cell} ipython3 +array2 +``` + +```{code-cell} ipython3 +ak.concatenate([array1, array2], axis=0) +``` + +This is like Pandas for `axis=0`, + +```{code-cell} ipython3 +df1 = pd.DataFrame({"column": [[1, 2, 3], [], [4, 5]]}) +df2 = pd.DataFrame({"column": [[1.1, 2.2, 3.3], [], [4.4, 5.5]]}) +``` + +```{code-cell} ipython3 +df1 +``` + +```{code-cell} ipython3 +df2 +``` + +```{code-cell} ipython3 +pd.concat([df1, df2], axis=0) +``` + +But for `axis=1`, they're quite different: + +```{code-cell} ipython3 +ak.concatenate([array1, array2], axis=1) +``` + +```{code-cell} ipython3 +pd.concat([df1, df2], axis=1) +``` + +{func}`ak.concatenate` accepts any `axis` less than the number of dimensions in the arrays, but Pandas has only two choices, `axis=0` and `axis=1`. + +Fields ("columns") of an Awkward Array are unrelated to array dimensions. If you want what [pandas.concat](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) does with `axis=1`, you would use {func}`ak.zip`: + +```{code-cell} ipython3 +ak.zip({"column1": array1.column, "column2": array2.column}, depth_limit=1) +``` + +The `depth_limit` prevents {func}`ak.zip` from interleaving the lists further: + +```{code-cell} ipython3 +ak.zip({"column1": array1.column, "column2": array2.column}) +``` + +which Pandas doesn't do because lists in Pandas cells are Python objects that it doesn't modify. 
diff --git a/docs/user-guide/how-to-restructure-rename-records.md b/docs/user-guide/how-to-restructure-rename-records.md deleted file mode 100644 index f423f0ffaf..0000000000 --- a/docs/user-guide/how-to-restructure-rename-records.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to restructure arrays by renaming records -============================================= - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. - -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-restructure-sort.md b/docs/user-guide/how-to-restructure-sort.md deleted file mode 100644 index fb5b4625bb..0000000000 --- a/docs/user-guide/how-to-restructure-sort.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to sort arrays and inner arrays -=================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. 
- -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-restructure.md b/docs/user-guide/how-to-restructure.md index 2229029d73..4483108e93 100644 --- a/docs/user-guide/how-to-restructure.md +++ b/docs/user-guide/how-to-restructure.md @@ -14,5 +14,10 @@ kernelspec: Restructuring data ================== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-specialize-differentiate-jax.md b/docs/user-guide/how-to-specialize-differentiate-jax.md index 133a072432..2a112aed36 100644 --- a/docs/user-guide/how-to-specialize-differentiate-jax.md +++ b/docs/user-guide/how-to-specialize-differentiate-jax.md @@ -24,23 +24,21 @@ JAX, amongst other things, is a powerful tool for computing derivatives of nativ How to differentiate Awkward Arrays? ------------------------------------ -Before using JAX on functions which deal with Awkward Arrays we need to configure JAX to use only the CPU +For this notebook (which is evaluated on a CPU), we need to configure JAX to use only the CPU. ```{code-cell} import jax - jax.config.update("jax_platform_name", "cpu") ``` -Next, we must call {func}`ak.jax.register_and_check()` to register Awkward's JAX integration +Next, we must call {func}`ak.jax.register_and_check()` to register Awkward's JAX integration. ```{code-cell} import awkward as ak - ak.jax.register_and_check() ``` -Let's define a simple function that accepts an Awkward Array +Let's define a simple function that accepts an Awkward Array. ```{code-cell} def reverse_sum(array): diff --git a/docs/user-guide/how-to-specialize-in-numba.md b/docs/user-guide/how-to-specialize-in-numba.md deleted file mode 100644 index ea808e8b3f..0000000000 --- a/docs/user-guide/how-to-specialize-in-numba.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to specialize behavior in Numba-compiled functions -====================================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. 
- -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-specialize-lorentz.md b/docs/user-guide/how-to-specialize-lorentz.md deleted file mode 100644 index 1749c71656..0000000000 --- a/docs/user-guide/how-to-specialize-lorentz.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to use specialized Lorentz vectors -====================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. - -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. 
diff --git a/docs/user-guide/how-to-specialize-override-numpy.md b/docs/user-guide/how-to-specialize-override-numpy.md deleted file mode 100644 index b79e5a7e72..0000000000 --- a/docs/user-guide/how-to-specialize-override-numpy.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to specialize behavior by overriding NumPy functions -======================================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. - -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-specialize-subclass.md b/docs/user-guide/how-to-specialize-subclass.md deleted file mode 100644 index 0f29913006..0000000000 --- a/docs/user-guide/how-to-specialize-subclass.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to specialize behavior by subclassing Array/Record -====================================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. 
- -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-specialize.md b/docs/user-guide/how-to-specialize.md index 81bb5627a9..627198c356 100644 --- a/docs/user-guide/how-to-specialize.md +++ b/docs/user-guide/how-to-specialize.md @@ -11,8 +11,13 @@ kernelspec: name: python3 --- -Specialized behavior -==================== +Special topics +============== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-strings.md b/docs/user-guide/how-to-strings.md new file mode 100644 index 0000000000..fd74237f21 --- /dev/null +++ b/docs/user-guide/how-to-strings.md @@ -0,0 +1,23 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.10.3 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +Working with strings +==================== + +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-use-in-cpp-cppyy.ipynb b/docs/user-guide/how-to-use-in-cpp-cppyy.ipynb index bccb72b5ee..bb7b0a6c5f 100644 --- a/docs/user-guide/how-to-use-in-cpp-cppyy.ipynb +++ b/docs/user-guide/how-to-use-in-cpp-cppyy.ipynb @@ -20,11 +20,11 @@ "source": [ ":::{warning}\n", "\n", - "Awkward Array can only work with `cppyy` 3.0.1 or later. As of writing, this is _only_ available from Git.\n", + "Awkward Array can only work with `cppyy` 3.1 or later.\n", ":::\n", "\n", ":::{warning}\n", - "`cppyy` must be in a different venv or conda environment from ROOT, if you have installed ROOT.\n", + "`cppyy` must be in a different venv or conda environment from ROOT, if you have installed ROOT, because the two packages define modules with conflicting names.\n", ":::\n", "\n", "The [cppyy](https://cppyy.readthedocs.io/en/latest/index.html) is an automatic, run-time, Python-C++ bindings generator, for calling C++ from Python and Python from C++. `cppyy` is based on the C++ interpreter `Cling`.\n", diff --git a/docs/user-guide/how-to-use-in-cpp.md b/docs/user-guide/how-to-use-in-cpp.md index 07cde4ba30..86ac3a1f1d 100644 --- a/docs/user-guide/how-to-use-in-cpp.md +++ b/docs/user-guide/how-to-use-in-cpp.md @@ -14,5 +14,10 @@ kernelspec: Using arrays in C++ ===================== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/how-to-use-in-numba-arraybuilder.md b/docs/user-guide/how-to-use-in-numba-arraybuilder.md deleted file mode 100644 index 95292f7349..0000000000 --- a/docs/user-guide/how-to-use-in-numba-arraybuilder.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.10.3 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -How to output Awkward Array structures from a Numba-compiled function -===================================================================== - -**This is a stub:** I intend to write this article, but haven't yet. - -If you need it soon, create an issue saying so and I'll make it a higher priority. - -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) - -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. diff --git a/docs/user-guide/how-to-use-in-numba-features.md b/docs/user-guide/how-to-use-in-numba-features.md index ef73bf1943..a307e73238 100644 --- a/docs/user-guide/how-to-use-in-numba-features.md +++ b/docs/user-guide/how-to-use-in-numba-features.md @@ -14,10 +14,183 @@ kernelspec: Awkward Array features that are supported in Numba-compiled functions ===================================================================== -**This is a stub:** I intend to write this article, but haven't yet. +See the [Numba documentation](https://numba.readthedocs.io/), which maintains lists of -If you need it soon, create an issue saying so and I'll make it a higher priority. 
+* [supported Python language features](https://numba.pydata.org/numba-doc/dev/reference/pysupported.html) and +* [supported NumPy library features](https://numba.readthedocs.io/en/stable/reference/numpysupported.html) -[![](../image/github-issues-documentation.png)](https://github.com/scikit-hep/awkward-1.0/issues/new?assignees=&labels=docs&template=documentation.md&title=) +in JIT-compiled functions. This page describes the supported Awkward Array library features. -The text of your issue doesn't have to be much more than a link to this page, so I can be sure which page you're referring to. If you add details about how and why you need it, however, I may be able to tailor the text to help you more. +```{code-cell} ipython3 +import awkward as ak +import numpy as np +import numba as nb +``` + +## Passing Awkward Arrays as arguments to a function + +The main use is to pass an Awkward Array into a function that has been JIT-compiled by Numba. As many arguments as you want can be Awkward Arrays, and they don't have to have the same length or shape. + +```{code-cell} ipython3 +array1 = ak.Array([[0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]]) +array2 = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}] +]) +``` + +```{code-cell} ipython3 +@nb.jit +def first_array(array): + for i, list_of_numbers in enumerate(array): + for x in list_of_numbers: + if x == 3.3: + return i + +@nb.jit +def second_array(array): + for i, list_of_records in enumerate(array): + for record in list_of_records: + if record.x == 3.3: + return i + +@nb.jit +def where_is_3_point_3(a, b): + return first_array(a), second_array(b) +``` + +```{code-cell} ipython3 +where_is_3_point_3(array1, array2) +``` + +The only constraint is that union types can't be _accessed_ within the compiled function. 
(Heterogeneous _parts_ of an array can be ignored and passed through a compiled function.) + +## Returning Awkward Arrays from a function + +Parts of the input array can be returned from a compiled function. + +```{code-cell} ipython3 +@nb.jit +def first_array(array): + for list_of_numbers in array: + for x in list_of_numbers: + if x == 3.3: + return list_of_numbers + +@nb.jit +def second_array(array): + for list_of_records in array: + for record in list_of_records: + if record.x == 3.3: + return record + +@nb.jit +def find_3_point_3(a, b): + return first_array(a), second_array(b) +``` + +```{code-cell} ipython3 +found_a, found_b = find_3_point_3(array1, array2) +``` + +```{code-cell} ipython3 +found_a +``` + +```{code-cell} ipython3 +found_b +``` + +## Cannot use `ak.*` functions or ufuncs + +Outside of a compiled function, Awkward's vectorized `ak.*` functions and NumPy's [universal functions (ufuncs)](https://numpy.org/doc/stable/reference/ufuncs.html) should be highly preferred over for-loop iteration because they are much faster. + +Inside of a compiled function, however, they can't be used at all. Use for-loops and if-statements instead. + +This is an either-or choice at the boundary of a `@nb.jit`-compiled function. (Even if `ak.*` had been implemented in Numba's compiled context, it would be slower than _compiled_ for-loops and if-statements because of the intermediate arrays they would necessarily create.) + +## Cannot use fancy slicing + +Similarly, any slicing other than + +* a single integer, like `array[i]` where `i` is an integer, or +* a single record field as a _constant, literal_ string, like `array["x"]` or `array.x`, + +is not allowed. Unpack the data structures one level at a time. + +## Casting one-dimensional arrays as NumPy + +One-dimensional Awkward Arrays of numbers, which are completely equivalent to NumPy arrays, can be _cast_ as NumPy arrays within the compiled function. 
+ +```{code-cell} ipython3 +@nb.jit +def return_last_y_list_squared(array): + y_list_squared = None + for list_of_records in array: + for record in list_of_records: + y_list_squared = np.asarray(record.y)**2 + return y_list_squared +``` + +```{code-cell} ipython3 +return_last_y_list_squared(array2) +``` + +This ability to cast Awkward Arrays as NumPy arrays, and then use NumPy's ufuncs or fancy slicing, softens the law against vectorized functions in the compiled context. (However, making intermediate NumPy arrays is just as bad as making intermediate Awkward Arrays.) + +## Creating new arrays with `ak.ArrayBuilder` + +Numba can create NumPy arrays inside a compiled function and return them as NumPy arrays in Python, but Awkward Arrays are more complex and this is not possible. (Aside from implementation, what would be the interface? Data in Numba's compiled context must be fully typed, and Awkward Array types are complex.) + +Instead, arrays can be built with {obj}`ak.ArrayBuilder`, which can be used in compiled contexts and discovers type dynamically. 
Each {obj}`ak.ArrayBuilder` must be instantiated outside of a compiled function and passed in, and then its {func}`ak.ArrayBuilder.snapshot` (which creates the {obj}`ak.Array`) must be called outside of the compiled function, like this: + +```{code-cell} ipython3 +@nb.jit +def create_ragged_array(builder, n): + for i in range(n): + builder.begin_list() + for j in range(i): + builder.integer(j) + builder.end_list() + return builder +``` + +```{code-cell} ipython3 +builder = ak.ArrayBuilder() + +create_ragged_array(builder, 10) + +array = builder.snapshot() + +array +``` + +or, more succinctly, + +```{code-cell} ipython3 +create_ragged_array(ak.ArrayBuilder(), 10).snapshot() +``` + +Note that we didn't need to specify that the type of the data would be `var * int64`; this was determined by the way that {obj}`ak.ArrayBuilder` was called: {func}`ak.ArrayBuilder.integer` was only ever called between {func}`ak.ArrayBuilder.begin_list` and {func}`ak.ArrayBuilder.end_list`, and hence the type is `var * int64`. + +Note that {obj}`ak.ArrayBuilder` can be used outside of compiled functions, too, so it can be tested interactively: + +```{code-cell} ipython3 +with builder.record(): + builder.field("x").real(3.14) + with builder.field("y").list(): + builder.string("one") + builder.string("two") + builder.string("three") +``` + +```{code-cell} ipython3 +builder.snapshot() +``` + +But the context managers, `with builder.record()` and `with builder.list()`, don't work in Numba-compiled functions because Numba does not yet support them as a language feature. + +## Overriding behavior with `ak.behavior` + +Just as behaviors can be customized for Awkward Arrays in general, they can be customized in the compiled context as well. See the last section of the {obj}`ak.behavior` reference for details. 
diff --git a/docs/user-guide/how-to-use-in-numba-intro.md b/docs/user-guide/how-to-use-in-numba-intro.md new file mode 100644 index 0000000000..25229827cb --- /dev/null +++ b/docs/user-guide/how-to-use-in-numba-intro.md @@ -0,0 +1,126 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.10.3 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +Using Awkward Array with Numba +============================== + +## Why Numba? + +The array-oriented (NumPy-like) interface that Awkward Array provides is often more convenient than imperative code and it's always faster than pure Python. But sometimes it's less convenient than imperative code and it's always slower than C, C++, Julia, Rust, or other compiled code. + +* The matching problem described in {doc}`how-to-combinatorics-best-match` is already rather complex—if a problem is more intricate than that, you may want to consider doing it in imperative code, so that you or anyone reading your code don't get lost in indices. +* Although all iterations over arrays in Awkward Array are precompiled, most operations involve several passes over the data, which are not cache-friendly and might exceed your working memory budget. + +For this reason, Awkward Arrays were made to be interchangeable with [Numba](https://numba.pydata.org/), a JIT-compiler for Python. Recently, JIT-compiled C++ and Julia have been added as well. Our intention is not to make you choose upfront whether to use array-oriented syntax or JIT-compiled code, but to mix them in the most convenient ways for each task. 
+ +## Small example + +```{code-cell} ipython3 +import awkward as ak +import numpy as np +import numba as nb +``` + +```{code-cell} ipython3 +array = ak.Array([ + [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}], + [], + [{"x": 4.4, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}], + [{"x": 6.6, "y": [1, 2, 3, 4, 5, 6]}], +])[np.tile([0, 1, 2, 3], 250000)] +array +``` + +Suppose we want to compute the sum of all `y` values in each of the million entries above. We can do that with a simple Awkward expression, + +```{code-cell} ipython3 +ak.sum(ak.sum(array.y, axis=-1), axis=-1) +``` + +Although it's faster than iterating over pure Python loops, it makes intermediate arrays that aren't necessary for the final result. Allocating them and iterating over all of them slows down the Awkward Array expression relative to compiled code. + +```{code-cell} ipython3 +%%timeit + +ak.sum(ak.sum(array.y, axis=-1), axis=-1) +``` + +```{code-cell} ipython3 +@nb.jit +def sum_of_y(array): + out = np.zeros(len(array), dtype=np.int64) + + for i, list_of_records in enumerate(array): + for record in list_of_records: + for y in record.y: + out[i] += y + + return out +``` + +```{code-cell} ipython3 +ak.Array(sum_of_y(array)) +``` + +The JIT-compiled function is faster. + +```{code-cell} ipython3 +%%timeit + +ak.Array(sum_of_y(array)) +``` + +## Combining features of Awkward Array and Numba + +Even on a per-task level, Awkward Array's array-oriented functions and Numba's JIT-compilation don't need to be exclusive. Numba can be used to prepare steps of an array-oriented process, such as generating boolean or integer-valued arrays to use as slices for an Awkward Array. 
+ +```{code-cell} ipython3 +@nb.jit +def sum_of_y_is_more_than_10(array): + out = np.zeros(len(array), dtype=np.bool_) + + for i, list_of_records in enumerate(array): + total = 0 + for record in list_of_records: + for y in record.y: + total += y + if total > 10: + out[i] = True + + return out +``` + +```{code-cell} ipython3 +array[sum_of_y_is_more_than_10(array)] +``` + +## Relative strengths and weaknesses + +Awkward Array's array-oriented interface is + +* good for reading and writing data to and from columnar file formats like Parquet, +* good for interactive exploration in Jupyter, applying a sequence of simple operations to a whole dataset and observing its effects after each operation, +* good for speed and memory use, relative to pure Python, +* bad for very intricate calculations with many indices, +* bad for large intermediate arrays, +* bad for speed and memory use, relative to custom-compiled code. + +Numba's JIT-compilation is + +* good for writing understandable algorithms with many moving parts, +* good for speed and memory use, on par with other compiled languages, +* bad for interactive exploration of data and iterative data analysis, since you have to write whole functions, +* bad for working through type errors, as you would have in any compiled language (unlike pure Python), +* bad for unboxing and boxing large non-array data when entering and exiting a compiled function. + +The {doc}`next section <how-to-use-in-numba-features>` lists what you can and can't do with Awkward Arrays in Numba-compiled code. diff --git a/docs/user-guide/how-to-use-in-numba.md b/docs/user-guide/how-to-use-in-numba.md index 4c73fe33bf..757e64cf8c 100644 --- a/docs/user-guide/how-to-use-in-numba.md +++ b/docs/user-guide/how-to-use-in-numba.md @@ -14,5 +14,10 @@ kernelspec: Using arrays in Numba ===================== -```{tableofcontents} -``` +The user guide is a collection of "how to..." guides for common tasks. 
See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. + +If you're looking for documentation on a specific function, see the API reference instead. + +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index c8f373864b..522e32a53c 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -1,8 +1,9 @@ # User guide -This user guide provides examples of using Awkward Array, organised by high-level topic into subsections. Each subsection provides an introduction to the topic, and is split into smaller pages that focus on a particular set of features. +The user guide is a collection of "how to..." guides for common tasks. See the left side-bar (or bring it into view by clicking on the upper-left `≡`) to access the guides, grouped by topic. -New users should start with {doc}`10-minutes-to-awkward-array`. +If you're looking for documentation on a specific function, see the API reference instead. -```{tableofcontents} -``` +You can test any examples in a new window/tab by clicking on [![Try It! ⭷](https://img.shields.io/badge/-Try%20It%21%20%E2%86%97-orange?style=for-the-badge)](https://awkward-array.org/doc/main/_static/try-it.html). + +




diff --git a/src/awkward/_util.py b/src/awkward/_util.py index 5a4c430c85..9c68aaf89b 100644 --- a/src/awkward/_util.py +++ b/src/awkward/_util.py @@ -86,6 +86,9 @@ def __repr__(self): UNSET = Sentinel("UNSET", __name__) +STDOUT = Sentinel("STDOUT", __name__) +STDOUT.stream = sys.stdout + T = TypeVar("T") diff --git a/src/awkward/highlevel.py b/src/awkward/highlevel.py index a7ae97eb70..e8a0c92b3e 100644 --- a/src/awkward/highlevel.py +++ b/src/awkward/highlevel.py @@ -12,7 +12,6 @@ import keyword import pickle import re -import sys from collections.abc import Iterable, Mapping, Sequence, Sized from awkward_cpp.lib import _ext @@ -35,6 +34,7 @@ from awkward._prettyprint import Formatter from awkward._regularize import is_non_string_like_iterable from awkward._typing import Any, TypeVar +from awkward._util import STDOUT __all__ = ("Array", "ArrayBuilder", "Record") @@ -1337,7 +1337,7 @@ def show( limit_rows=20, limit_cols=80, type=False, - stream=sys.stdout, + stream=STDOUT, *, formatter=None, precision=3, @@ -1380,6 +1380,8 @@ def show( if stream is None: return out else: + if stream is STDOUT: + stream = STDOUT.stream stream.write(out + "\n") def _repr_mimebundle_(self, include=None, exclude=None): @@ -2202,7 +2204,7 @@ def show( limit_rows=20, limit_cols=80, type=False, - stream=sys.stdout, + stream=STDOUT, *, formatter=None, precision=3, @@ -2243,6 +2245,8 @@ def show( if stream is None: return out else: + if stream is STDOUT: + stream = STDOUT.stream stream.write(out + "\n") def _repr_mimebundle_(self, include=None, exclude=None): @@ -2651,7 +2655,7 @@ def show( limit_rows=20, limit_cols=80, type=False, - stream=sys.stdout, + stream=STDOUT, *, formatter=None, precision=3, diff --git a/src/awkward/operations/ak_max.py b/src/awkward/operations/ak_max.py index 471101309d..a01a0d64c5 100644 --- a/src/awkward/operations/ak_max.py +++ b/src/awkward/operations/ak_max.py @@ -130,7 +130,7 @@ def nanmax( # Implementation return _impl( - 
ak.operations.ak_nan_to_none._impl(array, False, None), + ak.operations.ak_nan_to_none._impl(array, False, None, None), axis, keepdims, initial, @@ -159,6 +159,7 @@ def _impl(array, axis, keepdims, initial, mask_identity, highlevel, behavior, at @ak._connect.numpy.implements("amax") +@ak._connect.numpy.implements("max") def _nep_18_impl_amax( a, axis=None, out=UNSUPPORTED, keepdims=False, initial=None, where=UNSUPPORTED ): diff --git a/src/awkward/operations/ak_mean.py b/src/awkward/operations/ak_mean.py index c40e9689a0..fa74a89b61 100644 --- a/src/awkward/operations/ak_mean.py +++ b/src/awkward/operations/ak_mean.py @@ -169,6 +169,7 @@ def nanmean( mask_identity, highlevel=highlevel, behavior=behavior, + attrs=attrs, ) diff --git a/src/awkward/operations/ak_min.py b/src/awkward/operations/ak_min.py index 081bc91faf..05e583d430 100644 --- a/src/awkward/operations/ak_min.py +++ b/src/awkward/operations/ak_min.py @@ -130,7 +130,7 @@ def nanmin( # Implementation return _impl( - ak.operations.ak_nan_to_none._impl(array, False, None), + ak.operations.ak_nan_to_none._impl(array, False, None, None), axis, keepdims, initial, @@ -159,6 +159,7 @@ def _impl(array, axis, keepdims, initial, mask_identity, highlevel, behavior, at @ak._connect.numpy.implements("amin") +@ak._connect.numpy.implements("min") def _nep_18_impl_amin( a, axis=None, diff --git a/src/awkward/operations/ak_round.py b/src/awkward/operations/ak_round.py index f0e6d3feff..6d6e0e0471 100644 --- a/src/awkward/operations/ak_round.py +++ b/src/awkward/operations/ak_round.py @@ -13,6 +13,7 @@ np = NumpyMetadata.instance() +@ak._connect.numpy.implements("around") @ak._connect.numpy.implements("round") @high_level_function() def round( diff --git a/src/awkward/operations/ak_to_categorical.py b/src/awkward/operations/ak_to_categorical.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/awkward/types/__init__.py b/src/awkward/types/__init__.py index 3c8eb0b619..01ce161f2f 100644 --- 
a/src/awkward/types/__init__.py +++ b/src/awkward/types/__init__.py @@ -4,7 +4,12 @@ from awkward.types.arraytype import ArrayType # noqa: F401 from awkward.types.listtype import ListType # noqa: F401 -from awkward.types.numpytype import NumpyType # noqa: F401 +from awkward.types.numpytype import ( # noqa: F401 + NumpyType, + dtype_to_primitive, + is_primitive, + primitive_to_dtype, +) from awkward.types.optiontype import OptionType # noqa: F401 from awkward.types.recordtype import RecordType # noqa: F401 from awkward.types.regulartype import RegularType # noqa: F401 diff --git a/src/awkward/types/arraytype.py b/src/awkward/types/arraytype.py index f1b71d9e04..df3e317d00 100644 --- a/src/awkward/types/arraytype.py +++ b/src/awkward/types/arraytype.py @@ -2,13 +2,13 @@ from __future__ import annotations -import sys from collections.abc import Mapping import awkward as ak from awkward._nplikes.shape import ShapeItem, unknown_length from awkward._regularize import is_integer from awkward._typing import Any +from awkward._util import STDOUT from awkward.types.type import Type @@ -43,8 +43,14 @@ def behavior(self) -> Mapping | None: def __str__(self) -> str: return "".join(self._str("", True)) - def show(self, stream=sys.stdout): - stream.write("".join([*self._str("", False), "\n"])) + def show(self, stream=STDOUT): + out = "".join(self._str("", False)) + if out is None: + return out + else: + if stream is STDOUT: + stream = STDOUT.stream + stream.write(out + "\n") def _str(self, indent: str, compact: bool) -> list[str]: return [ diff --git a/src/awkward/types/scalartype.py b/src/awkward/types/scalartype.py index 9061405e5d..578ff8d810 100644 --- a/src/awkward/types/scalartype.py +++ b/src/awkward/types/scalartype.py @@ -2,11 +2,11 @@ from __future__ import annotations -import sys from collections.abc import Mapping import awkward as ak from awkward._typing import Any +from awkward._util import STDOUT from awkward.types.type import Type @@ -30,8 +30,14 @@ def 
behavior(self) -> Mapping | None: def __str__(self) -> str: return "".join(self._str("", True)) - def show(self, stream=sys.stdout): - stream.write("".join([*self._str("", False), "\n"])) + def show(self, stream=STDOUT): + out = "".join(self._str("", False)) + if out is None: + return out + else: + if stream is STDOUT: + stream = STDOUT.stream + stream.write(out + "\n") def _str(self, indent: str, compact: bool) -> list[str]: return self._content._str( diff --git a/src/awkward/types/type.py b/src/awkward/types/type.py index 78c8bddb4d..963213da91 100644 --- a/src/awkward/types/type.py +++ b/src/awkward/types/type.py @@ -3,13 +3,12 @@ from __future__ import annotations import json -import sys from collections.abc import Mapping import awkward as ak from awkward._nplikes.numpy_like import NumpyMetadata from awkward._typing import Any, JSONMapping, JSONSerializable, Self -from awkward._util import UNSET, Sentinel +from awkward._util import STDOUT, UNSET, Sentinel from awkward.types._awkward_datashape_parser import Lark_StandAlone, Transformer np = NumpyMetadata.instance() @@ -39,9 +38,14 @@ def __str__(self) -> str: def _str(self, indent: str, compact: bool, behavior: Mapping | None) -> list[str]: raise NotImplementedError - def show(self, stream=sys.stdout): - # TODO: deprecate lowlevel show - stream.write("".join([*self._str("", False, None), "\n"])) + def show(self, stream=STDOUT): + out = "".join(self._str("", False, None)) + if out is None: + return out + else: + if stream is STDOUT: + stream = STDOUT.stream + stream.write(out + "\n") _str_parameters_exclude: tuple[str, ...] = ("__categorical__",)