diff --git a/CHANGELOG.md b/CHANGELOG.md index cae5edf38e..a0ae7ed486 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,14 @@ Recommendation: for ease of reading, use the following order: - Fixed --> +## [Unreleased] +### Changed +- Flight SQL protocol now fully supports authentication (anonymous and bearer token) +- The `kamu notebook` command now defaults to the `DataFusion` engine for speed, but you can switch to Spark with the `--engine spark` argument +- The `kamu notebook` command uses a new image based on the latest Jupyter and the new [`kamu-client-python`](https://github.com/kamu-data/kamu-client-python) library +- The `kamu sql server` command interface changed to use `--engine datafusion/spark`, removing the `--flight-sql` flag +- Examples in `examples/flight-sql/python` were updated to the new auth and now showcase the `kamu` Python library + ## [0.215.1] - 2024-12-30 ### Fixed - GraphQL: in a multi-tenant workspace, `datasets.createEmpty` and `datasets.createFromSnapshot` mutations now return dataset aliases prefixed with account name. diff --git a/Cargo.lock b/Cargo.lock index 8ddda3044b..6aeb4eb468 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5495,21 +5495,30 @@ version = "0.215.1" dependencies = [ "arrow-flight", "async-trait", + "base32", "base64 0.22.1", + "bytes", "chrono", + "database-common", "datafusion", "dill", "futures", + "http 1.2.0", + "http-body 1.0.1", "indoc 2.0.5", + "kamu-accounts", "kamu-core", "kamu-data-utils", "like", + "mockall", "prost", + "rand", "test-log", "time-source", "tokio", "tokio-stream", "tonic", + "tower 0.5.2", "tracing", "tracing-subscriber", "uuid", diff --git a/examples/archive/commercial-fishing/.kamuconfig b/examples/archive/commercial-fishing/.kamuconfig new file mode 100644 index 0000000000..40ca8eb190 --- /dev/null +++ b/examples/archive/commercial-fishing/.kamuconfig @@ -0,0 +1,14 @@ +kind: CLIConfig +version: 1 +content: + users: + predefined: + - accountName: kamu + isAdmin: true + avatarUrl: https://avatars.githubusercontent.com/u/50896974?s=200&v=4 + - accountName: acme.fishing.co + accountType: Organization + avatarUrl: https://cdn-icons-png.flaticon.com/512/1090/1090630.png + - accountName: globalfishingwatch.org + accountType: Organization + avatarUrl: https://cdn-icons-png.flaticon.com/512/744/744480.png diff --git a/examples/archive/commercial-fishing/init.sh b/examples/archive/commercial-fishing/init.sh index 341478c909..015eb505d6 100755 --- a/examples/archive/commercial-fishing/init.sh +++ b/examples/archive/commercial-fishing/init.sh @@ -4,6 +4,7 @@ set -e KAMU_NODE_URL="odf+https://node.demo.kamu.dev/" kamu init --multi-tenant --exists-ok +cp -f .kamuconfig .kamu/ kamu --account acme.fishing.co pull "${KAMU_NODE_URL}acme.fishing.co/vessels.gps" kamu --account acme.fishing.co pull "${KAMU_NODE_URL}acme.fishing.co/vessels.trawl" diff --git a/examples/archive/commercial-fishing/analysis.ipynb b/examples/archive/commercial-fishing/notebook.ipynb similarity index 83% rename from examples/archive/commercial-fishing/analysis.ipynb rename to examples/archive/commercial-fishing/notebook.ipynb index 8c215f565a..b7c1cd4a32 100644 --- a/examples/archive/commercial-fishing/analysis.ipynb +++ b/examples/archive/commercial-fishing/notebook.ipynb @@ -13,15 +13,14 @@ { "cell_type": "code", "execution_count": null, - "id": "f0d0f5b5", + "id": "d5947b7d-4cfd-4a8b-b12f-c1a76402438b", "metadata": {}, "outputs": [], "source": [ - "%import_dataset acme.fishing.co/vessels.gps --alias gps\n", - "%import_dataset acme.fishing.co/vessels.trawl --alias trawl\n", - 
"%import_dataset acme.fishing.co/vessels.fuel --alias fuel\n", - "%import_dataset acme.fishing.co/vessels.location-annotated --alias loc\n", - "%import_dataset globalfishingwatch.org/protected-areas --alias areas" + "import kamu\n", + "import kamu.utils\n", + "\n", + "con = kamu.connect(engine=\"spark\")" ] }, { @@ -31,14 +30,12 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "import os\n", "import pandas as pd\n", "import plotly.graph_objects as go\n", "import plotly.express as px\n", "from mapboxgl.viz import *\n", "from mapboxgl.utils import *\n", - "from utils.plotting import *\n", "\n", "# Must be a public token, starting with `pk`\n", "token = os.getenv('MAPBOX_ACCESS_TOKEN')\n", @@ -68,7 +65,7 @@ " longitude,\n", " latitude,\n", " is_trawling\n", - "from loc" + "from `acme.fishing.co/vessels.location-annotated`" ] }, { @@ -78,7 +75,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "fig = go.Figure()\n", "\n", "for vessel_name in gps['vessel_name'].unique():\n", @@ -136,7 +132,7 @@ " date,\n", " wdpa_pid,\n", " gis_area\n", - "from areas" + "from `globalfishingwatch.org/protected-areas`" ] }, { @@ -146,9 +142,8 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "viz = ChoroplethViz(\n", - " df_to_geojson(areas),\n", + " kamu.utils.df_to_geojson(areas),\n", " style=mapbox_style,\n", " center=(2, 51),\n", " zoom=5,\n", @@ -186,7 +181,8 @@ " name,\n", " gis_area,\n", " geometry\n", - "from areas where parent_iso in (\"NLD\", \"FRA\", \"DMK\", \"BEL\")" + "from `globalfishingwatch.org/protected-areas`\n", + "where parent_iso in (\"NLD\", \"FRA\", \"DMK\", \"BEL\")" ] }, { @@ -197,22 +193,27 @@ "outputs": [], "source": [ "%%sql -o isect -q\n", - "select\n", - " gps.event_time,\n", - " gps.vessel_name,\n", - " gps.latitude,\n", - " gps.longitude\n", - "from (\n", + "with location_trawling as (\n", " select\n", " event_time, vessel_name, latitude, longitude, st_point(longitude, latitude) as geometry \n", - " from loc where is_trawling = 1\n", - ") gps,\n", - "(\n", + " from `acme.fishing.co/vessels.location-annotated`\n", + " where is_trawling = 1\n", + "),\n", + "protected_areas as (\n", " select\n", " st_geomfromgeojson(geometry) as geometry\n", - " from areas where parent_iso = \"NLD\"\n", - ") areas\n", - "where st_contains(areas.geometry, gps.geometry)" + " from `globalfishingwatch.org/protected-areas`\n", + " where parent_iso = \"NLD\"\n", + ")\n", + "select\n", + " loc.event_time,\n", + " loc.vessel_name,\n", + " loc.latitude,\n", + " loc.longitude\n", + "from\n", + " location_trawling as loc,\n", + " protected_areas as area\n", + "where st_contains(area.geometry, loc.geometry)" ] }, { @@ -222,8 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "isect_areas_geojson = df_to_geojson(isect_areas)\n", + "isect_areas_geojson = kamu.utils.df_to_geojson(isect_areas)\n", "\n", "fig = go.Figure()\n", "\n", @@ -283,19 +283,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/archive/commercial-fishing/utils/plotting.py 
b/examples/archive/commercial-fishing/utils/plotting.py deleted file mode 100644 index ef857fe314..0000000000 --- a/examples/archive/commercial-fishing/utils/plotting.py +++ /dev/null @@ -1,28 +0,0 @@ -import json - -def to_plain(v): - if type(v) in [int, float, str]: - return v - else: - return str(v) - -# For every row we first combine GeoJson geometry with other columns into a Feature object -# Then we combine all Features into a FeatureCollection -def df_to_geojson(df, geom='geometry', props=None): - if props is None: - props = [ - c for c in df.columns - if c != geom - ] - - return { - "type": "FeatureCollection", - "features": [ - { - "type": "Feature", - "geometry": json.loads(row[geom]), - "properties": {p: to_plain(row[p]) for p in props} - } - for _, row in df.iterrows() - ] - } \ No newline at end of file diff --git a/examples/archive/water-management/.kamuconfig b/examples/archive/water-management/.kamuconfig new file mode 100644 index 0000000000..8b5a0dceca --- /dev/null +++ b/examples/archive/water-management/.kamuconfig @@ -0,0 +1,12 @@ +kind: CLIConfig +version: 1 +content: + users: + predefined: + - accountName: kamu + isAdmin: true + avatarUrl: https://avatars.githubusercontent.com/u/50896974?s=200&v=4 + - accountName: rijkswaterstaat.nl + avatarUrl: https://www.shutterstock.com/image-vector/royal-exclusive-badge-logo-two-260nw-236025661.jpg + - accountName: deltares.nl + avatarUrl: https://avatars.githubusercontent.com/u/6613768?s=200&v=4 \ No newline at end of file diff --git a/examples/archive/water-management/init.sh b/examples/archive/water-management/init.sh index 366394ce0d..50c73b00ee 100755 --- a/examples/archive/water-management/init.sh +++ b/examples/archive/water-management/init.sh @@ -4,6 +4,7 @@ set -e KAMU_NODE_URL="odf+https://node.demo.kamu.dev/" kamu init --multi-tenant --exists-ok +cp -f .kamuconfig .kamu/ kamu --account rijkswaterstaat.nl pull "${KAMU_NODE_URL}rijkswaterstaat.nl/stations" kamu --account rijkswaterstaat.nl pull "${KAMU_NODE_URL}rijkswaterstaat.nl/measurements.boven-rijn" diff --git a/examples/archive/water-management/analyze.ipynb b/examples/archive/water-management/notebook.ipynb similarity index 83% rename from examples/archive/water-management/analyze.ipynb rename to examples/archive/water-management/notebook.ipynb index fb5b4ad12f..71b28f10bd 100644 --- a/examples/archive/water-management/analyze.ipynb +++ b/examples/archive/water-management/notebook.ipynb @@ -3,35 +3,35 @@ { "cell_type": "code", "execution_count": null, - "id": "ab79739d", + "id": "4a15a1c9-7e81-4a17-8282-a32c06bdbaf4", "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "import os\n", - "import numpy as np\n", - "import xarray as xr\n", - "import pandas as pd\n", - "import geopandas as gpd\n", - "import matplotlib.pyplot as plt\n", - "import hvplot.pandas # noqa\n", - "import hvplot.xarray # noqa\n", - "from datetime import datetime\n", - "from mapboxgl.utils import create_color_stops, create_numeric_stops, df_to_geojson\n", - "from mapboxgl.viz import CircleViz" + "%load_ext kamu" ] }, { "cell_type": "code", "execution_count": null, - "id": "d4774ffb", + "id": "1d60c661-237b-4850-bafe-705ac04bd7c9", "metadata": {}, "outputs": [], "source": [ - "%load_ext kamu\n", + "import kamu\n", + "\n", + "import os\n", + "import numpy as np\n", + "import xarray as xr\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import hvplot.pandas\n", + "import hvplot.xarray\n", + "from datetime import datetime\n", + "from 
mapboxgl.utils import create_color_stops, create_numeric_stops, df_to_geojson\n", + "from mapboxgl.viz import CircleViz\n", "\n", - "%import_dataset rijkswaterstaat.nl/stations\n", - "%import_dataset rijkswaterstaat.nl/measurements.boven-rijn" + "con = kamu.connect()" ] }, { @@ -42,7 +42,7 @@ "outputs": [], "source": [ "%%sql\n", - "select * from `rijkswaterstaat.nl/stations` limit 5" + "select * from 'rijkswaterstaat.nl/stations' limit 3" ] }, { @@ -53,7 +53,7 @@ "outputs": [], "source": [ "%%sql\n", - "select * from `rijkswaterstaat.nl/measurements.boven-rijn` limit 5" + "select * from 'rijkswaterstaat.nl/measurements.boven-rijn' limit 3" ] }, { @@ -65,16 +65,6 @@ "Here we show the dataset using a scatterplot. We select the 25th timestep, which corresponds to 2020-01-01 00:00 . We visualize the waterlevels for that timestep, using a scatterplot with 1 value per station." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c4470be", - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset deltares.nl/rhine-basin.netherlands" - ] - }, { "cell_type": "code", "execution_count": null, @@ -94,8 +84,8 @@ " waterlevel,\n", " velocity,\n", " discharge\n", - "from `deltares.nl/rhine-basin.netherlands` m\n", - "left join `rijkswaterstaat.nl/stations` s \n", + "from 'deltares.nl/rhine-basin.netherlands' as m\n", + "left join 'rijkswaterstaat.nl/stations' as s \n", " on m.station_id = s.station_id" ] }, @@ -106,7 +96,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "ds = df.set_index(['event_time', 'station_id']).to_xarray()\n", "ds.plot.scatter(x='lon', y='lat', hue='waterlevel', edgecolors='none')" ] @@ -118,7 +107,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "viz = CircleViz(\n", " df_to_geojson(\n", " df, properties=['station_id', 'station_name', 'waterlevel'], lat='lat', lon='lon', precision=3\n", @@ -145,16 +133,6 @@ "# Simulating Predicted Water Levels" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "f77e1cab", - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset deltares.nl/rhine-basin.netherlands.sim" - ] - }, { "cell_type": "code", "execution_count": null, @@ -171,8 +149,8 @@ " waterlevel,\n", " velocity,\n", " discharge,\n", - " round((cast(sim_time as long) - cast(analysis_time as long))/3600) as lookahead\n", - "from `deltares.nl/rhine-basin.netherlands.sim` m\n", + " round((cast(sim_time as bigint) - cast(analysis_time as bigint))/3600) as lookahead\n", + "from 'deltares.nl/rhine-basin.netherlands.sim' as m\n", "where \n", " station_id = 'BR_0863.00'\n", " and waterlevel is not null\n", @@ -186,7 +164,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "df2.hvplot.scatter(\n", " x='sim_time', y=['waterlevel', 'velocity', 'discharge'], shared_axes=False, c='lookahead', \n", " cmap='magma', s=2, height=300, width=800, subplots=True\n", @@ -216,7 +193,7 @@ " analysis_time,\n", " sim_time,\n", " discharge\n", - "from `deltares.nl/rhine-basin.netherlands.sim`\n", + "from 'deltares.nl/rhine-basin.netherlands.sim'\n", "where station_id = 'BR_0863.00'\n", "order by analysis_time, sim_time" ] @@ -236,7 +213,7 @@ " sim_time,\n", " waterlevel,\n", " velocity\n", - "from `deltares.nl/rhine-basin.netherlands.sim`\n", + "from 'deltares.nl/rhine-basin.netherlands.sim'\n", "where station_id = 'WA_0913.00'\n", "order by analysis_time, sim_time" ] @@ -264,7 +241,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "lds = lobith.to_xarray()\n", "tds = thiel.to_xarray()" ] @@ -276,7 +252,6 @@ 
"metadata": {}, "outputs": [], "source": [ - "%%local\n", "fig, axes = plt.subplots(ncols=3, figsize=(13, 6))\n", "axes[0].hist(lds.discharge.values.ravel())\n", "axes[1].hist(tds.waterlevel.values.ravel())\n", @@ -299,7 +274,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "fig, ax = plt.subplots(figsize=(13, 8))\n", "ax.plot(lds.discharge.values.ravel(), tds.waterlevel.values.ravel(), 'k.', alpha=0.1)\n", "ax.set_xlabel('Discharge @ Lobith [m3/s]')\n", @@ -333,19 +307,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/covid/Covid Using Graphs in Jupyter Notebook.ipynb b/examples/covid/notebook.ipynb similarity index 53% rename from examples/covid/Covid Using Graphs in Jupyter Notebook.ipynb rename to examples/covid/notebook.ipynb index 8968bbfb12..7bbe254d26 100644 --- a/examples/covid/Covid Using Graphs in Jupyter Notebook.ipynb +++ b/examples/covid/notebook.ipynb @@ -74,165 +74,171 @@ }, { "cell_type": "markdown", - "id": "c2c61eee", + "id": "ff03afd2-8f27-4b4b-bf37-98e4b8aee997", "metadata": {}, "source": [ - "## Load Kamu Extension\n", + "## Connect to Kamu\n", + "First we need to import `kamu` library and create a connection to the server. We will let the library to figure out where to find the server, but you can connect to other nodes by providing a URL.\n", + "\n", "
\n", - "Start by loading kamu Jupyter extension in your terminal:\n", + "\n", + "Connect to `kamu` server.\n", + "\n", "
" ] }, { "cell_type": "code", "execution_count": null, - "id": "28c1c94a", - "metadata": { - "scrolled": true - }, + "id": "77ff78d6-d674-41ac-a762-b43fd22d428d", + "metadata": {}, "outputs": [], "source": [ - "%load_ext kamu" + "import kamu\n", + "\n", + "con = kamu.connect()" ] }, { "cell_type": "markdown", - "id": "b4c3ad82", + "id": "2c8f5a2f-74c9-4476-a91e-792f45e542b1", "metadata": {}, "source": [ - "## Import and Test Data\n", - "
\n", - "Now it is time to start importing your Covid data by province. First import the data from the province of BC by using the command %import dataset. An alias was created to make it easier to call this file.\n", - "
" + "You can already query data using the connection object." ] }, { "cell_type": "code", "execution_count": null, - "id": "f665cd74", - "metadata": { - "scrolled": true - }, + "id": "144502a9-831f-4072-9fa6-b9add039c5d1", + "metadata": {}, "outputs": [], "source": [ - "%import_dataset covid19.british-columbia.case-details --alias cases_bc" + "con.query(\"select 1 as value\")" ] }, { "cell_type": "markdown", - "id": "64b3449f", + "id": "c2c61eee", "metadata": {}, "source": [ - "
\n", - "To test if the data was loaded correctly a SQL querry is run.\n", - "
" + "## Load Kamu Extension\n", + "To avoid typying `con.query(\"...\")` all the time let's load kamu Jupyter extension." ] }, { "cell_type": "code", "execution_count": null, - "id": "cb505b23", + "id": "28c1c94a", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "%%sql\n", - "SELECT * FROM cases_bc\n", - "ORDER BY id DESC\n", - "LIMIT 10" + "%load_ext kamu" ] }, { "cell_type": "markdown", - "id": "665855a7", + "id": "b4c3ad82", "metadata": {}, "source": [ - "
\n", - "Now it is time to import the rest of the Covid data files and create aliases for them\n", - "
" + "The extension provides a convenient `%%sql` cell magic. Let's use it to look at the data from the province of BC." ] }, { "cell_type": "code", "execution_count": null, - "id": "50a8426b", - "metadata": {}, + "id": "cb505b23", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "%import_dataset covid19.ontario.case-details --alias cases_on\n", - "%import_dataset covid19.alberta.case-details --alias cases_ab\n", - "%import_dataset covid19.quebec.case-details --alias cases_qb" + "%%sql\n", + "select * from 'covid19.british-columbia.case-details' limit 3" ] }, { "cell_type": "markdown", - "id": "7b1540af", + "id": "821818ad-90c8-4034-9b7f-6ac16ea6c48b", "metadata": {}, "source": [ - "
\n", - "Time to test again if the data was imported correctly. You can test the Alberta files by changing cases_on to cases_ab. For Quebec change it to cases_qb and id to row_id. \n", - "
" + "## Explore Data" + ] + }, + { + "cell_type": "markdown", + "id": "665855a7", + "metadata": {}, + "source": [ + "We can use the same approach to sample data from other provinces:" ] }, { "cell_type": "code", "execution_count": null, - "id": "5c50f473", + "id": "599cac31-e7d7-4313-a768-dbb0d1c5fdae", "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%sql\n", - "SELECT * FROM cases_on\n", - "ORDER BY id DESC\n", - "LIMIT 10" + "select * from 'covid19.alberta.case-details' limit 3" ] }, { - "cell_type": "markdown", - "id": "314078ef", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "id": "ad28d886-cb48-40c9-b2b5-161644f304b3", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ - "
\n", - "The next file that you import is case details for the four provinces combined. The file covid19.canada.case-details uses an SQL query in the yaml file to combine that data so that you don't have to combine them with 'UNION ALL'.\n", - "The SQL queries that harmonize the data of each province can be found in (province).case-details.hm. If you open these yamls, there are queries that make the datasets be able to be compared without semantic differences between them. For example only two provinces have a 90+ whereas the other two has age ranges of 80+. Therefore, we need to switch the age ranges to 80+ to compare the data.\n", - "
" + "%%sql\n", + "select * from 'covid19.ontario.case-details' limit 3" ] }, { "cell_type": "code", "execution_count": null, - "id": "d8c32581", - "metadata": {}, + "id": "874863d9-e4af-41fc-9a90-a48fe8072a9d", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "%import_dataset covid19.canada.case-details --alias cases_four_provinces" + "%%sql\n", + "select * from 'covid19.quebec.case-details' limit 3" ] }, { "cell_type": "markdown", - "id": "03b616c3", + "id": "314078ef", "metadata": {}, "source": [ + "Notice how data schemas and column semantics are slightly different between provinces. This makes pretty difficult to work with data across all provinces.\n", + "\n", + "To tackle that we have created several harmonization datasets `{province}.case-details.hm` that bring data from all provinces under a common format. The `covid19.canada.case-details` dataset then uses `UNION ALL` operation to derive a new pan-Canadian dataset.\n", + "\n", "
\n", - "Again, test to see if the data worked by showing the last 10 data rows.\n", - "
" + "Take a minute to study the definitions of these datasets.\n", + "\n", + "\n", + "Let's sample the pan-Canadian dataset now." ] }, { "cell_type": "code", "execution_count": null, "id": "ee795d89", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "%%sql\n", - "SELECT * FROM cases_four_provinces\n", - "LIMIT 10" + "select * from 'covid19.canada.case-details' limit 3" ] }, { @@ -240,9 +246,7 @@ "id": "027313e1", "metadata": {}, "source": [ - "
\n", - "To use this file, a SQL query is created to combine all of the cases by age group and by province\n", - "
" + "Let's write a query that counts the number of cases by age group and by province." ] }, { @@ -253,10 +257,13 @@ "outputs": [], "source": [ "%%sql -o age_cases\n", - "SELECT province, age_group, COUNT(*) \n", - "FROM cases_four_provinces\n", - "GROUP BY province, age_group\n", - "ORDER BY province, age_group;" + "select\n", + " province,\n", + " age_group,\n", + " count(*)\n", + "from 'covid19.canada.case-details'\n", + "group by province, age_group\n", + "order by province, age_group" ] }, { @@ -264,8 +271,7 @@ "id": "27e8856f", "metadata": {}, "source": [ - "
\n", - " Through With plotly.express.pie a pie chart can be created to compare the cases per province then per age group. As can bee seen over a third of Quebec's cases are unknow which is probably to to Quebec's strict privacy act laws that are part of the Act Respecting Access to Documents Held by Public Bodies and the Protection of Personal Information. These differences in law can cause errors when comparing data.
" + "We can use `plotly` to visualize this data as a pie chart." ] }, { @@ -275,9 +281,16 @@ "metadata": {}, "outputs": [], "source": [ - "%%local \n", "import plotly.express \n", - "plotly.express.pie(age_cases, values='count(1)', names='age_group', color='age_group', title='Cases by Age Group and Province', facet_col='province')" + "plotly.express.pie(age_cases, values='count(*)', names='age_group', color='age_group', title='Cases by Age Group and Province', facet_col='province')" + ] + }, + { + "cell_type": "markdown", + "id": "932dfe7c-1765-466b-b5bf-0b8f1fc7ff67", + "metadata": {}, + "source": [ + "As can bee seen over a third of Quebec's cases are unknow which is probably due to Quebec's strict privacy act laws that are part of the Act Respecting Access to Documents Held by Public Bodies and the Protection of Personal Information. These differences in law can cause errors when comparing data!" ] }, { @@ -285,9 +298,7 @@ "id": "fa22f18f", "metadata": {}, "source": [ - "
\n", - "Another piece of data we can get from this yaml is gender. Therefore, a SQL query is created to combine all of the cases by gender and by province\n", - "
" + "Now let's look at the distribution of cases by gender and by province" ] }, { @@ -300,20 +311,13 @@ "outputs": [], "source": [ "%%sql -o total_cases\n", - "SELECT province, gender, COUNT(*) \n", - "FROM cases_four_provinces\n", - "GROUP BY province, gender\n", - "ORDER BY province, gender;" - ] - }, - { - "cell_type": "markdown", - "id": "65858b3b", - "metadata": {}, - "source": [ - "
\n", - " Through plotly.express.bar a bar chart can be created to compare the cases per province then per gender (male, female, unspecified).\n", - "
" + "select\n", + " province,\n", + " gender,\n", + " count(*)\n", + "from 'covid19.canada.case-details'\n", + "group by province, gender\n", + "order by province, gender" ] }, { @@ -323,9 +327,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%local \n", - "import plotly.express \n", - "plotly.express.bar(total_cases, x='province', y='count(1)', color='gender', title='Cases per Gender')\n" + "plotly.express.bar(total_cases, x='province', y='count(*)', color='gender', title='Cases per Gender')\n" ] }, { @@ -333,9 +335,7 @@ "id": "f0c746b6", "metadata": {}, "source": [ - "
\n", - " By looking through the data you can see that Quebec has a large amount of people who were classified as undefined. This is probably again due to Quebec's strict privacy laws.\n", - "
" + "Here you can see that Quebec has a large amount of people who were classified as undefined. This is probably again due to Quebec's strict privacy laws." ] }, { @@ -343,29 +343,7 @@ "id": "d65f66df", "metadata": {}, "source": [ - "
\n", - "The last dataset that we are importing is daily cases for the four provinces.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b54ceff", - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset covid19.canada.daily-cases --alias daily_cases" - ] - }, - { - "cell_type": "markdown", - "id": "44ff2b2a", - "metadata": {}, - "source": [ - "
\n", - "Now test again to see if the data was succcesfully installed for this file.\n", - "
" + "The last dataset that we will look at is daily cases aggregation for the four provinces." ] }, { @@ -376,7 +354,7 @@ "outputs": [], "source": [ "%%sql -o daily_cases\n", - "select * from daily_cases" + "select * from 'covid19.canada.daily-cases'" ] }, { @@ -384,9 +362,7 @@ "id": "083fecc2", "metadata": {}, "source": [ - "
\n", - "The last step is to create a line plot graph to compare the different amount of cases per day by province.\n", - "
" + "We can use it to create a line plot graph to compare the different amount of cases per day by province." ] }, { @@ -398,8 +374,6 @@ }, "outputs": [], "source": [ - "%%local\n", - "import plotly.express\n", "plotly.express.line(daily_cases, x=\"reported_date\" , y=\"total_daily\", color=\"province\")" ] }, @@ -408,14 +382,12 @@ "id": "6bd2a6b0", "metadata": {}, "source": [ - "
\n", - "\n", - "As seen in the graph above, the case data has multiple spikes, including two significant ones in Quebec from late December 2020 and early January 2021. As explained in [this data source issue](https://github.com/ccodwg/Covid19Canada/issues/44) these spikes don't reflect an actual surge in cases, but rather a **delay in data entry** due to the holidays and weekends, with cases being attributed to the day they are entered on instead of amending the past data for the days they were registered on. This issue makes data hard to work with, often requiring some \"smoothing\" to get approximate number of cases on a cetrain date.\n", + "As seen in the graph above, the case data has multiple spikes, including two extreme ones in Quebec from late December 2020 and early January 2021. As explained in [this data source issue](https://github.com/ccodwg/Covid19Canada/issues/44) these spikes don't reflect an actual surge in cases, but rather a **delay in data entry** due to the holidays and weekends, with cases being attributed to the day they are entered on instead of amending the past data for the days they were registered on. This issue makes data hard to work with, often requiring some \"smoothing\" to get approximate number of cases on a cetrain date.\n", "\n", "\n", - "Kamu offers a combination of techniques like [watermarks](https://docs.kamu.dev/glossary/#watermark), explicit [retractions and corrections](https://docs.kamu.dev/glossary/#retractions-and-corrections) to automatically account for late arriving data and simultaneously provide **minimal latency** and **accuracy and consistency** of data. Continue to [other examples](https://docs.kamu.dev/cli/get-started/examples/) to learn more.\n", + "Kamu offers a combination of techniques like [watermarks](https://docs.kamu.dev/glossary/#watermark), explicit [retractions and corrections](https://docs.kamu.dev/glossary/#retractions-and-corrections) to automatically account for late arriving data and simultaneously provide **minimal latency** and **accuracy and consistency** of data.\n", "\n", - "
" + "Continue to [other examples](https://docs.kamu.dev/cli/get-started/examples/) to learn more!" ] }, { @@ -429,19 +401,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/currency_conversion/rates.ipynb b/examples/currency_conversion/rates.ipynb index 072c9e31df..6cbfedaa6b 100644 --- a/examples/currency_conversion/rates.ipynb +++ b/examples/currency_conversion/rates.ipynb @@ -7,7 +7,7 @@ "outputs": [], "source": [ "%load_ext kamu\n", - "# Loads kamu extension to use `import_dataset` command" + "# Loads kamu extension to use `%%sql` cell magic and auto-viz" ] }, { @@ -16,29 +16,10 @@ "metadata": {}, "outputs": [], "source": [ - "%import_dataset ca.bankofcanada.exchange-rates.daily --alias rates\n", - "# Imports dataset and gives it an *SQL table / PySpark* alias" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# By default all code is executed remotely via PySpark and has direct access to imported datasets\n", - "rates.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "# The above makes python code execute in a *local* notebook kernel \n", - "print(\"This runs in the notebook\")" + "import kamu\n", + "\n", + "# Create onnection to kamu server\n", + "con = kamu.connect()" ] }, { @@ -50,7 +31,7 @@ "%%sql -o rates\n", "-- We can run SQL queries directly\n", "-- By adding `-o ` we can download the result into the local notebook as Pandas dataframe!\n", - "select * from rates" + "select * from 'ca.bankofcanada.exchange-rates.daily'" ] }, { @@ -59,21 +40,9 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "# Local notebook now has `rates` variable\n", "rates.info()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset my.trading.transactions --alias transactions\n", - "%import_dataset my.trading.transactions.cad --alias transactions_cad" - ] - }, { "cell_type": "code", "execution_count": null, @@ -81,7 +50,7 @@ "outputs": [], "source": [ "%%sql -o tx\n", - "-- Let's use SQL to shape the data via Spark and download the processed result into the notebook\n", + "-- Let's use SQL to shape the data and download the processed result into the notebook\n", "select \n", " offset,\n", " system_time,\n", @@ -92,7 +61,7 @@ " cast(price_cad as double) as price_cad,\n", " cast(settlement_usd as double) as settlement_usd,\n", " cast(settlement_cad as double) as settlement_cad\n", - "from transactions_cad" + "from 'my.trading.transactions.cad'" ] }, { @@ -101,14 +70,13 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "import os\n", "import numpy as np\n", "import xarray as xr\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", - "import hvplot.pandas # noqa\n", - "import hvplot.xarray # noqa" + "import hvplot.pandas\n", + "import hvplot.xarray" ] }, { @@ -117,7 +85,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "# Now we can visualize it!\n", 
"rates.hvplot.line(\n", " x=\"date\", \n", @@ -135,7 +102,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "tx.hvplot.scatter(\n", " x=\"event_time\", \n", " y=[\"settlement_cad\"], \n", @@ -156,19 +122,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/flight-sql/python/client_flightsql_adbc.py b/examples/flight-sql/python/client_flightsql_adbc.py index 96d979b885..afcb6787b4 100644 --- a/examples/flight-sql/python/client_flightsql_adbc.py +++ b/examples/flight-sql/python/client_flightsql_adbc.py @@ -2,34 +2,43 @@ import adbc_driver_flightsql.dbapi import pandas -# No-TLS local connection -# -# To test with local server use: -# cd examples/reth-vs-snp500 -# kamu -vv sql server --flight-sql --port 50050 --address 0.0.0.0 -# +# # Secure remote connection con = adbc_driver_flightsql.dbapi.connect( - "grpc://localhost:50050", + "grpc+tls://node.demo.kamu.dev:50050", db_kwargs={ - adbc_driver_manager.DatabaseOptions.USERNAME.value: "kamu", - adbc_driver_manager.DatabaseOptions.PASSWORD.value: "kamu", + # Anonymous users have to authenticate using basic auth so they could be assigned a session token + adbc_driver_manager.DatabaseOptions.USERNAME.value: "anonymous", + adbc_driver_manager.DatabaseOptions.PASSWORD.value: "anonymous", + # Registered users can provide a bearer token directy + # adbc_driver_flightsql.DatabaseOptions.AUTHORIZATION_HEADER.value: "Bearer ", }, autocommit=True, ) -# Secure remote connection +# No-TLS local connection +# +# To test with local server use: +# cd examples/reth-vs-snp500 +# kamu -vv sql server --port 50050 --address 0.0.0.0 +# # con = adbc_driver_flightsql.dbapi.connect( -# "grpc+tls://node.demo.kamu.dev:50050", +# "grpc://localhost:50050", # db_kwargs={ -# adbc_driver_manager.DatabaseOptions.USERNAME.value: "kamu", -# adbc_driver_manager.DatabaseOptions.PASSWORD.value: "kamu", +# # Anonymous users have to authenticate using basic auth so they could be assigned a session token +# adbc_driver_manager.DatabaseOptions.USERNAME.value: "anonymous", +# adbc_driver_manager.DatabaseOptions.PASSWORD.value: "anonymous", +# # Registered users can provide a bearer token directy +# # adbc_driver_flightsql.DatabaseOptions.AUTHORIZATION_HEADER.value: "Bearer ", # }, # autocommit=True, # ) with con: + df = pandas.read_sql("select 1", con) + print(df) + df = pandas.read_sql("show tables", con) print(df) - df = pandas.read_sql("select * from 'co.alphavantage.tickers.daily.spy' limit 10", con) + df = pandas.read_sql("select * from 'kamu/co.alphavantage.tickers.daily.spy' limit 10", con) print(df) diff --git a/examples/flight-sql/python/client_flightsql_dbapi2.py b/examples/flight-sql/python/client_flightsql_dbapi2.py index 68da8057a7..449bb8da4f 100644 --- a/examples/flight-sql/python/client_flightsql_dbapi2.py +++ b/examples/flight-sql/python/client_flightsql_dbapi2.py @@ -1,35 +1,45 @@ from flightsql import connect, FlightSQLClient +# Secure remote connection +client = FlightSQLClient( + host="node.demo.kamu.dev", + port=50050, + # Anonymous users have to authenticate using basic auth so 
they could be assigned a session token + user="anonymous", + password="anonymous", + # Registered users can provide a bearer token + # token="", +) + # No-TLS local connection # # To test with local server use: # cd examples/reth-vs-snp500 -# kamu -vv sql server --flight-sql --port 50050 --address 0.0.0.0 +# kamu -vv sql server --port 50050 --address 0.0.0.0 # -client = FlightSQLClient( - host='localhost', - port=50050, - user='kamu', - password='kamu', - insecure=True, -) - -# Secure remote connection # client = FlightSQLClient( -# host='node.demo.kamu.dev', +# host="localhost", # port=50050, -# user='kamu', -# password='kamu', +# insecure=True, +# # Anonymous users have to authenticate using basic auth so they could be assigned a session token +# user="anonymous", +# password="anonymous", +# # Registered users can provide a bearer token +# # token="", # ) con = connect(client) cursor = con.cursor() +cursor.execute("select 1 as value") +print("columns:", cursor.description) +print("rows:", [r for r in cursor]) + cursor.execute("show tables") print("columns:", cursor.description) print("rows:", [r for r in cursor]) -cursor.execute("select * from 'co.alphavantage.tickers.daily.spy' limit 10") +cursor.execute("select * from 'kamu/co.alphavantage.tickers.daily.spy' limit 10") print("columns:", cursor.description) print("rows:", [r for r in cursor]) @@ -40,5 +50,5 @@ df = pandas.read_sql("show tables", con) print(df) -df = pandas.read_sql("select * from 'co.alphavantage.tickers.daily.spy' limit 10", con) +df = pandas.read_sql("select * from 'kamu/co.alphavantage.tickers.daily.spy' limit 10", con) print(df) diff --git a/examples/flight-sql/python/client_flightsql_sqlalchemy.py b/examples/flight-sql/python/client_flightsql_sqlalchemy.py index 11acc5bc67..a546ff4fef 100644 --- a/examples/flight-sql/python/client_flightsql_sqlalchemy.py +++ b/examples/flight-sql/python/client_flightsql_sqlalchemy.py @@ -2,24 +2,33 @@ import sqlalchemy import pandas as pd +# Secure remote connection +engine = sqlalchemy.create_engine( + # Anonymous users have to authenticate using basic auth so they could be assigned a session token + "datafusion+flightsql://anonymous:anonymous@node.demo.kamu.dev:50050" + # Registered users can provide a bearer token directy + # "datafusion+flightsql://node.demo.kamu.dev:50050?token=kamu-token" +) + # No-TLS local connection # # To test with local server use: # cd examples/reth-vs-snp500 -# kamu -vv sql server --flight-sql --port 50050 --address 0.0.0.0 +# kamu -vv sql server --port 50050 --address 0.0.0.0 # -engine = sqlalchemy.create_engine( - "datafusion+flightsql://kamu:kamu@localhost:50050?insecure=True" -) - -# Secure remote connection # engine = sqlalchemy.create_engine( -# "datafusion+flightsql://kamu:kamu@node.demo.kamu.dev:50050" +# # Anonymous users have to authenticate using basic auth so they could be assigned a session token +# "datafusion+flightsql://anonymous:anonymous@localhost:50050?insecure=True" +# # Registered users can provide a bearer token directy +# # "datafusion+flightsql://localhost:50050?insecure=True&token=kamu-token" # ) with engine.connect() as con: + df = pd.read_sql(sql="select 1 as value", con=con.connection) + print(df) + df = pd.read_sql(sql="show tables", con=con.connection) print(df) - df = pd.read_sql(sql="select * from 'co.alphavantage.tickers.daily.spy' limit 10", con=con.connection) + df = pd.read_sql(sql="select * from 'kamu/co.alphavantage.tickers.daily.spy' limit 10", con=con.connection) print(df) diff --git 
a/examples/flight-sql/python/client_kamu.py b/examples/flight-sql/python/client_kamu.py new file mode 100644 index 0000000000..4fa1df8842 --- /dev/null +++ b/examples/flight-sql/python/client_kamu.py @@ -0,0 +1,32 @@ +import kamu + +# See more examples at: https://github.com/kamu-data/kamu-client-python + +# Secure remote connection +con = kamu.connect( + "grpc+tls://node.demo.kamu.dev:50050", + # Registered users can provide a bearer token + # token="", +) + +# No-TLS local connection +# +# To test with local server use: +# cd examples/reth-vs-snp500 +# kamu -vv sql server --port 50050 --address 0.0.0.0 +# +# con = kamu.connect( +# "grpc://localhost:50050", +# # Registered users can provide a bearer token +# # token="", +# ) + +with con: + df = con.query("select 1 as value") + print(df) + + df = con.query("show tables") + print(df) + + df = con.query("select * from 'kamu/co.alphavantage.tickers.daily.spy' limit 10") + print(df) diff --git a/examples/flight-sql/python/requirements.txt b/examples/flight-sql/python/requirements.txt index a0b91edfe4..c38038faf7 100644 --- a/examples/flight-sql/python/requirements.txt +++ b/examples/flight-sql/python/requirements.txt @@ -1,6 +1,8 @@ adbc_driver_manager adbc_driver_flightsql flightsql-dbapi +kamu sqlalchemy pandas +pyarrow jpype1 diff --git a/examples/flight-sql/python/shell.nix b/examples/flight-sql/python/shell.nix new file mode 100644 index 0000000000..c3a543d7ac --- /dev/null +++ b/examples/flight-sql/python/shell.nix @@ -0,0 +1,16 @@ +# Development shell for NixOS +# Currently has to be kept in sync with `requirements.txt` manually +let + pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/nixos-unstable.tar.gz") {}; +in pkgs.mkShell { + packages = [ + (pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ + # adbc_driver_manager + # adbc_driver_flightsql + # flightsql-dbapi + sqlalchemy + pandas + pyarrow + ])) + ]; +} \ No newline at end of file diff --git a/examples/housing_prices/ca.vancouver.opendata.property.tax-reports.yaml b/examples/housing_prices/ca.vancouver.opendata.property.tax-reports.yaml index 8ae86cc6b7..24325fa46c 100644 --- a/examples/housing_prices/ca.vancouver.opendata.property.tax-reports.yaml +++ b/examples/housing_prices/ca.vancouver.opendata.property.tax-reports.yaml @@ -84,4 +84,4 @@ content: - REPORT_YEAR - PID - kind: SetVocab - eventTimeColumn: report_year + eventTimeColumn: REPORT_YEAR diff --git a/examples/housing_prices/heatmap.ipynb b/examples/housing_prices/heatmap.ipynb deleted file mode 100644 index 8761ea7ed9..0000000000 --- a/examples/housing_prices/heatmap.ipynb +++ /dev/null @@ -1,197 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext kamu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset ca.vancouver.opendata.property.parcel-polygons --alias lots\n", - "%import_dataset ca.vancouver.opendata.property.tax-reports --alias tax" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(lots.count())\n", - "lots.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(tax.count())\n", - "tax.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "select * from tax limit 10" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "CREATE OR REPLACE TEMP VIEW lot_tax AS (\n", - "SELECT\n", - " t.*,\n", - " l.geometry\n", - "FROM lots as l\n", - "JOIN tax as t\n", - "ON l.tax_coord = t.land_coordinate\n", - "WHERE\n", - " t.legal_type = 'LAND'\n", - " AND t.tax_assessment_year = 2020\n", - " AND t.current_land_value is not null\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql -o df\n", - "SELECT\n", - " land_coordinate,\n", - " geometry,\n", - " CAST(current_land_value AS DOUBLE) + CAST(current_improvement_value AS DOUBLE) AS current_total_value\n", - "FROM lot_tax" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import json\n", - "\n", - "# For every row we first combine GeoJson geometry with other columns into a Feature object\n", - "# Then we combine all Features into a FeatureCollection\n", - "def df_to_geojson(df, geom='geometry', props=None):\n", - " if props is None:\n", - " props = [\n", - " c for c in df.columns\n", - " if c != geom\n", - " ]\n", - " \n", - " return {\n", - " \"type\": \"FeatureCollection\",\n", - " \"features\": [\n", - " {\n", - " \"type\": \"Feature\",\n", - " \"geometry\": json.loads(row[geom]),\n", - " \"properties\": {p: row[p] for p in props}\n", - " }\n", - " for _, row in df.iterrows()\n", - " ]\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "gj = df_to_geojson(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import os\n", - "from mapboxgl.viz import *\n", - "from mapboxgl.utils import *\n", - "\n", - "# Must be a public token, starting with `pk`\n", - "token = os.getenv('MAPBOX_ACCESS_TOKEN')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "%%local\n", - "viz = ChoroplethViz(\n", - " gj,\n", - " style='mapbox://styles/mapbox/dark-v10',\n", - " center=(-123.1207, 49.2827),\n", - " zoom=10,\n", - " access_token=token,\n", - " color_property='current_total_value',\n", - " color_stops=create_color_stops([1000000, 2000000, 3000000, 5000000, 10000000], colors='YlOrRd'),\n", - " color_default='rgb(158,202,195)',\n", - " line_width=0,\n", - " opacity=1.0,\n", - " legend_layout='horizontal',\n", - " legend_key_shape='bar',\n", - " legend_key_borders_on=False)\n", - "\n", - "viz.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "PySpark", - "language": "python", - "name": "pysparkkernel" - }, - "language_info": { - "codemirror_mode": { - "name": "python", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/housing_prices/notebook.ipynb b/examples/housing_prices/notebook.ipynb new file mode 100644 index 0000000000..3dbe6560a6 --- /dev/null +++ b/examples/housing_prices/notebook.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "090c009f-34ae-4262-b9d7-36ca94d88baa", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext 
kamu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "213fc2aa-5f6f-48f9-87be-38e76ad731e5", + "metadata": {}, + "outputs": [], + "source": [ + "import kamu\n", + "con = kamu.connect(engine=\"spark\", connection_params=dict(driver_memory=\"1000m\", executor_memory=\"2000m\"))" + ] + }, + { + "cell_type": "markdown", + "id": "9654a8c1-4981-42a1-aef1-91dc36dded5c", + "metadata": {}, + "source": [ + "# Land value heatmap\n", + "Let's join land tax report records to their corresponding geographical boundaries and visualize their price on a map." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7086bb8c-08b8-4a40-b51d-d74165dcbb18", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select * from `ca.vancouver.opendata.property.tax-reports` limit 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaa1ed5d-f65b-40dd-b749-1dc327b5677b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select * from `ca.vancouver.opendata.property.parcel-polygons` limit 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36538d2f-04ee-4f43-ac14-de708a066175", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql -o heatmap -q\n", + "select\n", + " tax.report_year,\n", + " tax.pid,\n", + " tax.legal_type,\n", + " tax.zoning_district,\n", + " cast(tax.current_land_value as double) + cast(tax.current_improvement_value as double) as current_total_value,\n", + " polys.geometry\n", + "from `ca.vancouver.opendata.property.parcel-polygons` as polys\n", + "inner join `ca.vancouver.opendata.property.tax-reports` as tax\n", + " on tax.land_coordinate = polys.tax_coord\n", + "where\n", + " tax.legal_type = 'LAND'\n", + " and tax.tax_assessment_year = 2024\n", + " and tax.current_land_value is not null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b54cbaf0-9a08-429d-8949-e9f884e13683", + "metadata": {}, + "outputs": [], + "source": [ + "import kamu.utils\n", + "\n", + "heatmap_gj = kamu.utils.df_to_geojson(heatmap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2500525-2095-4e7f-855b-830cd1c1d549", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import kamu.utils\n", + "from mapboxgl.viz import *\n", + "from mapboxgl.utils import *\n", + "\n", + "# Must be a public token, starting with `pk`\n", + "token = os.getenv('MAPBOX_ACCESS_TOKEN')\n", + "\n", + "viz = ChoroplethViz(\n", + " heatmap_gj,\n", + " style='mapbox://styles/mapbox/dark-v10',\n", + " center=(-123.1207, 49.2827),\n", + " zoom=10,\n", + " access_token=token,\n", + " color_property='current_total_value',\n", + " color_stops=create_color_stops([1000000, 2000000, 3000000, 5000000, 10000000], colors='YlOrRd'),\n", + " color_default='rgb(158,202,195)',\n", + " line_width=0,\n", + " opacity=1.0,\n", + " legend_layout='horizontal',\n", + " legend_key_shape='bar',\n", + " legend_key_borders_on=False)\n", + "\n", + "viz.show()" + ] + }, + { + "cell_type": "markdown", + "id": "bcc55e34-4871-44fb-828d-364323f8f339", + "metadata": {}, + "source": [ + "# Spatial JOIN\n", + "We have two GIS datasets with outlines of every city block and geo boundaries of city neighbourhoods. Let's classify which neighbourhood every city block belongs to by joining two datasets using `st_contains` to test that a block polygon is fully contained within a neighbourhood polygon." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba53b40c-7360-4fdc-9f37-175b433121c3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select * from `ca.vancouver.opendata.property.block-outlines` limit 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a464c56b-945a-4437-b880-e42c219a070a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select * from `ca.vancouver.opendata.property.local-area-boundaries` limit 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eea7076e-0362-403f-a15a-183664d85f89", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql -o blocks_by_hood -q\n", + "with blocks as (\n", + " select\n", + " st_geomfromgeojson(geometry) as geometry\n", + " from `ca.vancouver.opendata.property.block-outlines`\n", + "),\n", + "hoods as (\n", + " select\n", + " st_geomfromgeojson(geometry) as geometry,\n", + " name\n", + " from `ca.vancouver.opendata.property.local-area-boundaries`\n", + "),\n", + "blocks_by_hood as (\n", + " select hoods.name, blocks.geometry\n", + " from\n", + " blocks,\n", + " hoods\n", + " where st_intersects(blocks.geometry, hoods.geometry)\n", + ")\n", + "select\n", + " st_asgeojson(geometry) as geometry,\n", + " name,\n", + " -- calculating median value is left as an excercise :)\n", + " rand() as median_value\n", + "from blocks_by_hood" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d75afd5d-edbe-4d0b-8a5b-6c5b476a201d", + "metadata": {}, + "outputs": [], + "source": [ + "import kamu.utils\n", + "\n", + "blocks_by_hood_gj = kamu.utils.df_to_geojson(blocks_by_hood)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f52343f-2889-42a9-83fb-e4068398d267", + "metadata": {}, + "outputs": [], + "source": [ + "viz = ChoroplethViz(\n", + " blocks_by_hood_gj,\n", + " style='mapbox://styles/mapbox/dark-v10',\n", + " center=(-123.1207, 49.2827),\n", + " zoom=10,\n", + " access_token=token,\n", + " color_property='median_value',\n", + " color_stops=create_color_stops([0.25, 0.5, 0.75, 1.0], colors='YlOrRd'),\n", + " line_stroke='solid',\n", + " line_width=0.1,\n", + " line_color='rgb(128,0,38)',\n", + " opacity=0.8,\n", + " legend_layout='horizontal',\n", + " legend_key_shape='bar',\n", + " legend_key_borders_on=False)\n", + "\n", + "viz.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "924d86c3-41b9-40a8-b2c1-ac70b601164e", + "metadata": {}, + "outputs": [], + "source": [ + "# adjust view angle\n", + "viz.bearing = -15\n", + "viz.pitch = 45\n", + "\n", + "# add extrusion to viz using interpolation keyed on density in GeoJSON features\n", + "viz.height_property = 'median_value'\n", + "viz.height_stops = create_numeric_stops([0, 1], 0, 500)\n", + "viz.height_function_type = 'interpolate'\n", + "viz.opacity = 1\n", + "\n", + "# render again\n", + "viz.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f15c06e8-9b55-4ca6-a15a-b3f9d3321518", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/examples/housing_prices/spatial_join.ipynb b/examples/housing_prices/spatial_join.ipynb deleted file mode 100644 index 62fd4296d7..0000000000 --- a/examples/housing_prices/spatial_join.ipynb +++ /dev/null @@ -1,192 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext kamu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset ca.vancouver.opendata.property.block-outlines --alias blocks\n", - "%import_dataset ca.vancouver.opendata.property.local-area-boundaries --alias hoods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "blocks.printSchema()\n", - "hoods.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "create or replace temp view blocks_by_hood as (\n", - " select h.name, b.geometry\n", - " from\n", - " (select st_geomfromgeojson(geometry) as geometry from blocks) b,\n", - " (select st_geomfromgeojson(geometry) as geometry, name from hoods) h\n", - " where st_intersects(b.geometry, h.geometry)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql -o df\n", - "select\n", - " st_asgeojson(geometry) as geometry, \n", - " name, \n", - " rand() as median_value\n", - "from blocks_by_hood" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import json\n", - "\n", - "# For every row we first combine GeoJson geometry with other columns into a Feature object\n", - "# Then we combine all Features into a FeatureCollection\n", - "def df_to_geojson(df, geom='geometry', props=None):\n", - " if props is None:\n", - " props = [\n", - " c for c in df.columns\n", - " if c != geom\n", - " ]\n", - " \n", - " return {\n", - " \"type\": \"FeatureCollection\",\n", - " \"features\": [\n", - " {\n", - " \"type\": \"Feature\",\n", - " \"geometry\": json.loads(row[geom]),\n", - " \"properties\": {p: row[p] for p in props}\n", - " }\n", - " for _, row in df.iterrows()\n", - " ]\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "gj = df_to_geojson(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import os\n", - "from mapboxgl.viz import *\n", - "from mapboxgl.utils import *\n", - "\n", - "# Must be a public token, starting with `pk`\n", - "token = os.getenv('MAPBOX_ACCESS_TOKEN')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "viz = ChoroplethViz(\n", - " gj,\n", - " style='mapbox://styles/mapbox/dark-v10',\n", - " center=(-123.1207, 49.2827),\n", - " zoom=10,\n", - " access_token=token,\n", - " color_property='median_value',\n", - " color_stops=create_color_stops([0.25, 0.5, 0.75, 1.0], colors='YlOrRd'),\n", - " line_stroke='solid',\n", - " line_width=0.1,\n", - " line_color='rgb(128,0,38)',\n", - " opacity=0.8,\n", - " legend_layout='horizontal',\n", - " legend_key_shape='bar',\n", - " legend_key_borders_on=False)\n", - "\n", - "viz.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "# adjust view 
angle\n", - "viz.bearing = -15\n", - "viz.pitch = 45\n", - "\n", - "# add extrusion to viz using interpolation keyed on density in GeoJSON features\n", - "viz.height_property = 'median_value'\n", - "viz.height_stops = create_numeric_stops([0, 1], 0, 500)\n", - "viz.height_function_type = 'interpolate'\n", - "viz.opacity = 1\n", - "\n", - "# render again\n", - "viz.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "PySpark", - "language": "python", - "name": "pysparkkernel" - }, - "language_info": { - "codemirror_mode": { - "name": "python", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/reth-vs-snp500/init-s3-all.sh b/examples/reth-vs-snp500/init-s3-all.sh deleted file mode 100755 index bf2fe96112..0000000000 --- a/examples/reth-vs-snp500/init-s3-all.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -e - -S3_CONTRIB_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/contrib/" -S3_EXAMPLE_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/example/" - -kamu init || true - -# Root -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-minted" -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-burned" -kamu pull "${S3_CONTRIB_URL}com.cryptocompare.ohlcv.eth-usd" -kamu pull "${S3_CONTRIB_URL}co.alphavantage.tickers.daily.spy" - -kamu pull "${S3_EXAMPLE_URL}account.transactions" -kamu pull "${S3_EXAMPLE_URL}account.tokens.transfers" - -# Deriv -kamu pull "${S3_EXAMPLE_URL}net.rocketpool.reth.mint-burn" -kamu pull "${S3_EXAMPLE_URL}account.tokens.portfolio" -kamu pull "${S3_EXAMPLE_URL}account.tokens.portfolio.market-value" -kamu pull "${S3_EXAMPLE_URL}account.tokens.portfolio.usd" -kamu pull "${S3_EXAMPLE_URL}account.whatif.reth-vs-snp500.market-value" -kamu pull "${S3_EXAMPLE_URL}account.whatif.reth-vs-snp500.portfolio" diff --git a/examples/reth-vs-snp500/init-s3-ipfs.sh b/examples/reth-vs-snp500/init-s3-ipfs.sh deleted file mode 100755 index 84226e57c1..0000000000 --- a/examples/reth-vs-snp500/init-s3-ipfs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh -set -e - -S3_CONTRIB_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/contrib/" -S3_EXAMPLE_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/example/" - -kamu init || true - -# Pull from S3 for speed but then alias to IPFS -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-minted" --no-alias -kamu repo alias add --pull net.rocketpool.reth.tokens-minted "ipns://net.rocketpool.reth.tokens-minted.ipns.kamu.dev" - -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-burned" --no-alias -kamu repo alias add --pull net.rocketpool.reth.tokens-burned "ipns://net.rocketpool.reth.tokens-burned.ipns.kamu.dev" - -kamu pull "${S3_CONTRIB_URL}com.cryptocompare.ohlcv.eth-usd" --no-alias -kamu repo alias add --pull com.cryptocompare.ohlcv.eth-usd "ipns://com.cryptocompare.ohlcv.eth-usd.ipns.kamu.dev" - -kamu pull "${S3_CONTRIB_URL}co.alphavantage.tickers.daily.spy" --no-alias -kamu repo alias add --pull co.alphavantage.tickers.daily.spy "ipns://co.alphavantage.tickers.daily.spy.ipns.kamu.dev" - -kamu pull "${S3_EXAMPLE_URL}account.transactions" --no-alias -kamu pull "${S3_EXAMPLE_URL}account.tokens.transfers" --no-alias - -kamu add -r . 
diff --git a/examples/reth-vs-snp500/init-s3.sh b/examples/reth-vs-snp500/init-s3.sh deleted file mode 100755 index 264ebd6084..0000000000 --- a/examples/reth-vs-snp500/init-s3.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh -set -e - -S3_CONTRIB_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/contrib/" -S3_EXAMPLE_URL="https://s3.us-west-2.amazonaws.com/datasets.kamu.dev/odf/v2/example/" - -kamu init || true - -# Root -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-minted" -kamu pull "${S3_CONTRIB_URL}net.rocketpool.reth.tokens-burned" -kamu pull "${S3_CONTRIB_URL}com.cryptocompare.ohlcv.eth-usd" -kamu pull "${S3_CONTRIB_URL}co.alphavantage.tickers.daily.spy" - -kamu pull "${S3_EXAMPLE_URL}account.transactions" -kamu pull "${S3_EXAMPLE_URL}account.tokens.transfers" - -kamu add -r . diff --git a/examples/reth-vs-snp500/analysis.ipynb b/examples/reth-vs-snp500/notebook.ipynb similarity index 83% rename from examples/reth-vs-snp500/analysis.ipynb rename to examples/reth-vs-snp500/notebook.ipynb index d9b3060559..836de66a6b 100644 --- a/examples/reth-vs-snp500/analysis.ipynb +++ b/examples/reth-vs-snp500/notebook.ipynb @@ -16,17 +16,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "import os\n", - "import numpy as np\n", - "import xarray as xr\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import hvplot.pandas # noqa\n", - "import hvplot.xarray # noqa\n", - "import holoviews as hv\n", - "from datetime import datetime\n", - "pd.set_option('max_colwidth', None)" + "%load_ext kamu" ] }, { @@ -35,7 +25,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext kamu" + "import kamu\n", + "\n", + "con = kamu.connect()" ] }, { @@ -44,36 +36,34 @@ "metadata": {}, "outputs": [], "source": [ - "%import_dataset net.rocketpool.reth.mint-burn\n", - "%import_dataset com.cryptocompare.ohlcv.eth-usd\n", - "\n", - "%import_dataset account.transactions\n", - "%import_dataset account.tokens.transfers\n", - "%import_dataset account.tokens.portfolio\n", - "%import_dataset account.tokens.portfolio.market-value\n", + "import os\n", + "import numpy as np\n", + "import xarray as xr\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import hvplot.pandas\n", + "import hvplot.xarray\n", + "import holoviews as hv\n", + "from datetime import datetime\n", "\n", - "%import_dataset co.alphavantage.tickers.daily.spy\n", - "%import_dataset account.whatif.reth-vs-snp500.portfolio\n", - "%import_dataset account.whatif.reth-vs-snp500.market-value" + "pd.set_option('max_colwidth', None)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "%%sql -o reth_pool\n", "select \n", " event_time, \n", " case \n", - " when event_name = \"TokensMinted\" then \"Mint\"\n", - " when event_name = \"TokensBurned\" then \"Burn\"\n", + " when event_name = 'TokensMinted' then 'Mint'\n", + " when event_name = 'TokensBurned' then 'Burn'\n", " end as event_name, \n", " avg(eth_amount / amount) as ratio \n", - "from `net.rocketpool.reth.mint-burn` \n", + "from 'net.rocketpool.reth.mint-burn'\n", "group by event_time, event_name\n", "order by 1" ] @@ -84,7 +74,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "reth_pool.hvplot.step(\n", " x=\"event_time\",\n", " by=\"event_name\",\n", @@ -101,7 +90,7 @@ "outputs": [], "source": [ "%%sql -o eth2usd\n", - "select event_time, open, close from `com.cryptocompare.ohlcv.eth-usd` order by event_time" + "select event_time, 
open, close from 'com.cryptocompare.ohlcv.eth-usd' order by event_time" ] }, { @@ -110,7 +99,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "eth2usd.hvplot.step(x=\"event_time\", y=\"close\", height=500, width=800)" ] }, @@ -121,7 +109,7 @@ "outputs": [], "source": [ "%%sql -o portfolio\n", - "select * from `account.tokens.portfolio` order by block_time" + "select * from 'account.tokens.portfolio' order by block_time" ] }, { @@ -130,7 +118,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "portfolio[\n", " portfolio.token_symbol == \"rETH\"\n", "].hvplot.scatter(\n", @@ -148,7 +135,7 @@ "outputs": [], "source": [ "%%sql -o reth_mv\n", - "select * from `account.tokens.portfolio.market-value` order by event_time" + "select * from 'account.tokens.portfolio.market-value' order by event_time" ] }, { @@ -157,7 +144,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "reth_mv.hvplot.line(\n", " x=\"event_time\", \n", " y=[\"token_book_value_eth\", \"token_market_value_eth\"], \n", @@ -174,7 +160,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "reth_mv.hvplot.line(\n", " x=\"event_time\", \n", " y=[\"token_book_value_eth_as_usd\", \"token_market_value_usd\"], \n", @@ -192,7 +177,7 @@ "outputs": [], "source": [ "%%sql -o spy_ticks\n", - "select * from `co.alphavantage.tickers.daily.spy`" + "select * from 'co.alphavantage.tickers.daily.spy'" ] }, { @@ -201,7 +186,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "spy_ticks.hvplot.step(\n", " x=\"event_time\", \n", " y=[\"close\"],\n", @@ -218,7 +202,7 @@ "outputs": [], "source": [ "%%sql -o market_value -q\n", - "select * from `account.tokens.portfolio.market-value`" + "select * from 'account.tokens.portfolio.market-value'" ] }, { @@ -228,7 +212,7 @@ "outputs": [], "source": [ "%%sql -o alternative_market_value -q\n", - "select * from `account.whatif.reth-vs-snp500.market-value`" + "select * from 'account.whatif.reth-vs-snp500.market-value'" ] }, { @@ -237,8 +221,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "\n", "max_height = max(\n", " alternative_market_value[\"alt_spy_market_value_usd\"].max(),\n", " market_value[\"token_market_value_usd\"].max(),\n", @@ -270,20 +252,6 @@ "# )" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -369,19 +337,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/trading/trading.ipynb b/examples/trading/notebook.ipynb similarity index 61% rename from examples/trading/trading.ipynb rename to examples/trading/notebook.ipynb index ca3a33c7d9..c267bec220 100644 --- a/examples/trading/trading.ipynb +++ b/examples/trading/notebook.ipynb @@ -15,10 +15,9 @@ "metadata": {}, "outputs": [], "source": [ - "%import_dataset com.yahoo.finance.tickers.daily --alias tickers\n", - "%import_dataset my.trading.transactions --alias transactions\n", - 
"%import_dataset my.trading.holdings --alias holdings\n", - "%import_dataset my.trading.holdings.market-value --alias value" + "import kamu\n", + "\n", + "con = kamu.connect()" ] }, { @@ -28,7 +27,7 @@ "outputs": [], "source": [ "%%sql\n", - "select * from value" + "select * from 'my.trading.holdings.market-value'" ] }, { @@ -41,21 +40,23 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/images/Makefile b/images/Makefile index a68675ea1d..9c7fecba1e 100644 --- a/images/Makefile +++ b/images/Makefile @@ -1,6 +1,6 @@ IMAGE_PLATFORMS = linux/amd64,linux/arm64 IMAGE_REPO = ghcr.io/kamu-data -IMAGE_JUPYTER_TAG = 0.6.3 +IMAGE_JUPYTER_TAG = 0.7.0 KAMU_VERSION = $(shell cargo metadata --format-version 1 | jq -r '.packages[] | select( .name == "kamu") | .version') diff --git a/images/demo/Dockerfile.jupyter b/images/demo/Dockerfile.jupyter index ae291e7e3a..89fcfd3913 100644 --- a/images/demo/Dockerfile.jupyter +++ b/images/demo/Dockerfile.jupyter @@ -1,7 +1,7 @@ # Base image info: https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html # Base image tags: https://quay.io/repository/jupyter/minimal-notebook # Customization is based on: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -FROM quay.io/jupyter/minimal-notebook:2024-02-13 +FROM quay.io/jupyter/minimal-notebook:2024-12-09 ARG TARGETPLATFORM ARG KAMU_VERSION ARG dev_mode=false @@ -10,15 +10,10 @@ ARG dev_mode=false ######################################################################################### USER root -# Podman +# Podman & tools # Source: https://github.com/containers/podman/blob/056f492f59c333d521ebbbe186abde0278e815db/contrib/podmanimage/stable/Dockerfile RUN apt update && \ - apt -y install ca-certificates curl wget gnupg unzip jq && \ - . 
/etc/os-release && \ - echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${VERSION_ID}/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list && \ - curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${VERSION_ID}/Release.key" | apt-key add - && \ - apt update && \ - apt -y install podman fuse-overlayfs && \ + apt -y install ca-certificates curl wget gnupg unzip jq podman fuse-overlayfs && \ apt-get clean && rm -rf /var/lib/apt/lists /var/cache/apt/archives COPY podman/containers.conf /etc/containers/containers.conf @@ -47,15 +42,12 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2 # Sparkmagic and tools -COPY jupyter/requirements/$TARGETPLATFORM/requirements.txt requirements.txt - -# TODO: Semi-permanent hack for `mapboxgl` package being broken in conda-forge -# See: https://github.com/kamu-data/kamu-cli/issues/533 -RUN mamba install -y --file requirements.txt && \ - mamba uninstall mapboxgl && pip install --no-cache-dir mapboxgl && \ - mamba clean --all -f -y && \ - rm requirements.txt && \ - fix-permissions "${CONDA_DIR}" && \ +COPY jupyter/requirements/$TARGETPLATFORM/env.yaml env.yaml + +RUN mamba env update -y -f env.yaml && \ + mamba clean --all -f -y && \ + rm env.yaml && \ + fix-permissions "${CONDA_DIR}" && \ fix-permissions "/home/${NB_USER}" @@ -70,15 +62,9 @@ COPY jupyter/.kamuconfig /.kamuconfig ######################################################################################### USER $NB_USER -COPY jupyter/kamu.py /opt/conda/lib/python3.11/site-packages/kamu.py -COPY jupyter/sparkmagic.json /home/$NB_USER/.sparkmagic/config.json - -RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension -#RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/sparkkernel -RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/pysparkkernel -#RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/sparkrkernel -RUN jupyter serverextension enable --py sparkmagic +COPY jupyter/overrides.json /opt/conda/share/jupyter/lab/settings/overrides.json +RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements" ######################################################################################### USER root @@ -91,6 +77,4 @@ RUN fix-permissions "/home/${NB_USER}" ######################################################################################### USER $NB_USER -# TODO: Remove show_banner option after Sparkmagic supports novebook >= 7.0.0 -# See: https://github.com/jupyter-incubator/sparkmagic/issues/885 -CMD ["jupyter", "notebook", "--ip", "0.0.0.0", "--port", "8080", "--NotebookApp.iopub_data_rate_limit=1e10", "--NotebookApp.show_banner=False"] +CMD ["jupyter", "notebook", "--ip", "0.0.0.0", "--port", "8080", "--NotebookApp.iopub_data_rate_limit=1e10"] diff --git a/images/demo/Makefile b/images/demo/Makefile index c992364a90..afa1c4941e 100644 --- a/images/demo/Makefile +++ b/images/demo/Makefile @@ -1,7 +1,9 @@ IMAGE_PLATFORMS = linux/amd64,linux/arm64 IMAGE_REPO = ghcr.io/kamu-data KAMU_VERSION = $(shell cargo metadata --format-version 1 | jq -r '.packages[] | select( .name == "kamu") | .version') -DEMO_VERSION = 0.16.6 + +# Keep in sync with versions of Jupyter and Minio in docker-compose.yml +DEMO_VERSION = 0.17.0 
######################################################################################### diff --git a/images/demo/docker-compose.yml b/images/demo/docker-compose.yml index f35a8795b4..9f9bf5fa03 100644 --- a/images/demo/docker-compose.yml +++ b/images/demo/docker-compose.yml @@ -5,7 +5,7 @@ networks: services: jupyter: - image: ghcr.io/kamu-data/kamu-cli-demo-jupyter:0.16.6 + image: ghcr.io/kamu-data/kamu-cli-demo-jupyter:0.17.0 # Unfortunately running podman within another container requires elevated permissions privileged: true command: @@ -36,7 +36,7 @@ services: - minio minio: - image: ghcr.io/kamu-data/kamu-cli-demo-minio:0.16.6 + image: ghcr.io/kamu-data/kamu-cli-demo-minio:0.17.0 command: - "server" - "--address" diff --git a/images/demo/jupyter/Makefile b/images/demo/jupyter/Makefile index 4a36ef57ad..488ddb1ceb 100644 --- a/images/demo/jupyter/Makefile +++ b/images/demo/jupyter/Makefile @@ -1,5 +1,5 @@ PLATFORM=linux/amd64 -BASE_IMAGE:=quay.io/jupyter/minimal-notebook:2024-02-13 +BASE_IMAGE:=quay.io/jupyter/minimal-notebook:2024-12-09 # Requires QEMU @@ -23,7 +23,22 @@ requirements-platform: # Executed from inside the base image +# +# The stupidity of Python package management ecosystems is unbelievable. Jupyter images are +# based on conda, but some packages we have are only installable by pip. We want to make +# the environment reproducible, but the `dependencies.pip` section of `conda env export` includes only +# **top-level** packages, ignoring all direct and transitive dependencies. +# +# To make the environment fully reproducible we have to resort to: +# - Run `conda env export` to lock conda packages (and a partial set of pip packages) +# - Strip the partial `pip` packages from the conda env +# - Run `pip freeze` to lock pip packages +# - Filter out conda packages from `pip freeze` output +# - Merge the rest into the `dependencies.pip` section of `conda env export` .PHONY: requirements-install-freeze requirements-install-freeze: - mamba install -y --file requirements/$(PLATFORM)/requirements.in - mamba list --export > requirements/$(PLATFORM)/requirements.txt + pip install -r requirements/$(PLATFORM)/requirements.in + pip freeze > requirements/$(PLATFORM)/requirements.txt + mamba env export --no-builds > requirements/$(PLATFORM)/env.yaml + python ./merge_requirements.py requirements/$(PLATFORM)/env.yaml requirements/$(PLATFORM)/requirements.txt + rm requirements/$(PLATFORM)/requirements.txt diff --git a/images/demo/jupyter/kamu.py b/images/demo/jupyter/kamu.py deleted file mode 100644 index b451f6dd14 --- a/images/demo/jupyter/kamu.py +++ /dev/null @@ -1,282 +0,0 @@ -import os -import re -import json -import time -import socket -import signal -import subprocess -from collections import namedtuple -from IPython.core import magic_arguments -from IPython.core.magic import line_magic, cell_magic, line_cell_magic, Magics, magics_class -from IPython.display import clear_output - - -SPARK_INIT_CODE = """ -spark.sparkContext._jvm.org.datasyslab.geosparksql.utils.GeoSparkSQLRegistrator.registerAll(sc._jvm.SQLContext(sc._jsc.sc())) -""" - - -SPARK_IMPORT_DATASET_CODE = """ -import os - -def resolve_dataset_ref(dataset_ref): - if "/" not in dataset_ref: - # Single-tenant - data_path = os.path.join(dataset_ref, "data") - if os.path.exists(data_path): - return data_path - else: - # Multi-tenant - # Assumptions: - # - Layout of the data directory is `//info/alias` - # - Alias file contains `/` - account_name, dataset_name = dataset_ref.split("/", 1) - if os.path.isdir(account_name): - for dataset_id in 
os.listdir(account_name): - alias_path = os.path.join(account_name, dataset_id, "info", "alias") - if not os.path.exists(alias_path): - continue - with open(alias_path) as f: - alias = f.read().strip() - if alias != dataset_ref: - continue - return os.path.join(account_name, dataset_id, "data") - - raise Exception(f"Dataset {{dataset_ref}} not found") - -data_path = resolve_dataset_ref("{ref}") -{alias} = spark.read.parquet(os.path.join(data_path, "*")) -{alias}.createOrReplaceTempView("`{ref}`") -{alias}.createOrReplaceTempView("{alias}") -""" - - -LIVY_START_TIMEOUT = 60 -LIVY_PIDFILE = os.path.expanduser("~/.local/kamu/livy.pid") -LIVY_STDOUT = os.path.expanduser("~/.local/kamu/livy.out.txt") -LIVY_STDERR = os.path.expanduser("~/.local/kamu/livy.err.txt") - - -@magics_class -class KamuMagics(Magics): - @line_magic - @magic_arguments.magic_arguments() - @magic_arguments.argument( - '--executor-instances', - type=int, - default=2, - help='Number of executor instances to run' - ) - def kamu(self, line): - self._ensure_livy_is_running() - - args = magic_arguments.parse_argstring(self.kamu, line) - code = SPARK_INIT_CODE - self.shell.run_cell_magic('spark', '', code) - - @line_magic - @magic_arguments.magic_arguments() - @magic_arguments.argument('dataset_ref', - nargs=1, - help='Dataset to load' - ) - @magic_arguments.argument('--alias', - help='Also registers the dataset under provided alias' - ) - def import_dataset(self, line): - self._ensure_images() - self._ensure_livy_is_running() - - args = magic_arguments.parse_argstring(self.import_dataset, line) - dataset_ref = args.dataset_ref[0] - if not args.alias: - args.alias = re.sub(r"[\.\-/]", "_", dataset_ref) - code = SPARK_IMPORT_DATASET_CODE.format( - ref=dataset_ref, - alias=args.alias, - ) - self.shell.run_cell_magic('spark', '', code) - - def _ensure_livy_is_running(self): - livy = LivyProcessHelper() - procinfo = livy.get_proc_info(check_running=True) - if procinfo is None: - print("Starting Livy server") - livy.start(timeout=LIVY_START_TIMEOUT) - clear_output() - - def _ensure_images(self): - out = subprocess.run(["kamu", "init", "--pull-images", "--list-only"], capture_output=True) - assert out.returncode == 0, "Failed to list images from kamu" - images = [ - img for img in out.stdout.decode("utf-8").split("\n") - if "spark" in img - ] - assert len(images) > 0, "No images in output" - - touch_image_statuses = ( - subprocess.run(["podman", "inspect", img], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - for img in images - ) - images_pulled = all( - status.returncode == 0 - for status in touch_image_statuses - ) - if images_pulled: - return - - print("First time run. 
Please wait while we pull the necessary images.") - for image in images: - print(f"Pulling: {image}") - out = subprocess.run(["podman", "pull", image]) - assert out.returncode == 0, f"Failed to pull image: {image}" - - clear_output() - - @line_magic - def stop_livy(self, line): - livy = LivyProcessHelper() - livy.stop() - - -LivyProcInfo = namedtuple("LivyProcInfo", ["pid", "port"]) - - -class LivyProcessHelper: - def __init__(self, pidfile=LIVY_PIDFILE): - self._pidfile = pidfile - - def get_proc_info(self, check_running=True): - if not os.path.exists(self._pidfile): - return None - - with open(self._pidfile, 'r') as f: - procinfo = LivyProcInfo(**json.load(f)) - - if not check_running: - return procinfo - - if not self.is_running(procinfo=procinfo): - return None - - return procinfo - - def save_proc_info(self, procinfo): - pi_dir, _ = os.path.split(self._pidfile) - os.makedirs(pi_dir, exist_ok=True) - with open(self._pidfile, "w") as f: - json.dump(procinfo._asdict(), f) - - def is_running(self, procinfo=None): - if procinfo is None: - procinfo = self.get_proc_info(check_running=False) - if procinfo is None: - return False - - return ( - self.is_process_running(procinfo.pid) and - self.is_port_open(procinfo.port) - ) - - def is_process_running(self, pid=None): - if pid is None: - procinfo = self.get_proc_info(check_running=False) - if procinfo is None: - return False - pid = procinfo.pid - - try: - os.kill(pid, 0) - return True - except OSError: - return False - - def is_port_open(self, port=None): - if port is None: - procinfo = self.get_proc_info(check_running=False) - if procinfo is None: - return False - port = procinfo.port - - try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect(("127.0.0.1", port)) - s.close() - return True - except socket.error: - return False - - def start(self, timeout): - if not self.is_in_workspace(): - raise Exception( - "Current directory is not under kamu workspace. " - "Create a workspace in the desired location by running `kamu init` in the terminal " - "and place your notebook in that directory." 
- ) - - # TODO: Other ports are not supported due to podman running in host networking mode - port = 8998 - - out_dir, _ = os.path.split(LIVY_STDOUT) - os.makedirs(out_dir, exist_ok=True) - - p = subprocess.Popen( - ["/usr/local/bin/kamu", "sql", "server", "--livy", "--port", str(port)], - stdout=open(LIVY_STDOUT, "w"), - stderr=open(LIVY_STDERR, "w"), - close_fds=True - ) - - deadline = time.time() + timeout - while True: - try: - status = p.wait(1) - raise Exception( - f"Livy failed to start with status code: {status}\n" - f"See logs for details:\n" - f"- {LIVY_STDOUT}\n" - f"- {LIVY_STDERR}" - ) - except subprocess.TimeoutExpired: - pass - - if self.is_port_open(port): - break - - if time.time() >= deadline: - p.send_signal(signal.SIGTERM) - raise Exception( - f"Livy failed to start within {timeout} seconds\n" - f"See logs for details:\n" - f"- {LIVY_STDOUT}\n" - f"- {LIVY_STDERR}" - ) - - procinfo = LivyProcInfo(pid=p.pid, port=port) - self.save_proc_info(procinfo) - return procinfo - - def stop(self): - procinfo = self.get_proc_info(check_running=False) - if procinfo is None: - return - - try: - os.kill(procinfo.pid, signal.SIGTERM) - print("Stopping Livy") - except OSError: - pass - - def is_in_workspace(self, cwd=None): - p = subprocess.run( - ["/usr/local/bin/kamu", "list"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - cwd=cwd, - ) - - return p.returncode == 0 - - -def load_ipython_extension(ipython): - ipython.register_magics(KamuMagics) diff --git a/images/demo/jupyter/merge_requirements.py b/images/demo/jupyter/merge_requirements.py new file mode 100644 index 0000000000..6ff0b294e2 --- /dev/null +++ b/images/demo/jupyter/merge_requirements.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import sys +import yaml + +env_path = sys.argv[1] +req_path = sys.argv[2] + +# Read files +with open(env_path) as f: + env = yaml.safe_load(f) + +with open(req_path) as f: + reqs = [r.strip() for r in f.readlines()] + +# Filter out pip packages from `conda env export` +env['dependencies'] = [ + dep for dep in env['dependencies'] + if not isinstance(dep, dict) or 'pip' not in dep +] + +# Filter conda packages from `pip freeze` output +reqs = [r for r in reqs if not '@ file://' in r] + +# Merge into environment +env['dependencies'].append({'pip': reqs}) + +# Replace env file +with open(env_path, 'w') as f: + yaml.safe_dump(env, f) diff --git a/images/demo/jupyter/overrides.json b/images/demo/jupyter/overrides.json new file mode 100644 index 0000000000..36a8a9e7b5 --- /dev/null +++ b/images/demo/jupyter/overrides.json @@ -0,0 +1,5 @@ +{ + "@jupyterlab/apputils-extension:themes": { + "adaptive-theme": true + } +} \ No newline at end of file diff --git a/images/demo/jupyter/requirements/linux/amd64/env.yaml b/images/demo/jupyter/requirements/linux/amd64/env.yaml new file mode 100644 index 0000000000..42c02bd385 --- /dev/null +++ b/images/demo/jupyter/requirements/linux/amd64/env.yaml @@ -0,0 +1,274 @@ +channels: +- conda-forge +dependencies: +- _libgcc_mutex=0.1 +- _openmp_mutex=4.5 +- alembic=1.14.0 +- annotated-types=0.7.0 +- anyio=4.7.0 +- archspec=0.2.3 +- argon2-cffi=23.1.0 +- argon2-cffi-bindings=21.2.0 +- arrow=1.3.0 +- asttokens=3.0.0 +- async-lru=2.0.4 +- async_generator=1.10 +- attrs=24.2.0 +- babel=2.16.0 +- beautifulsoup4=4.12.3 +- bleach=6.2.0 +- blinker=1.9.0 +- boltons=24.0.0 +- brotli-python=1.1.0 +- bzip2=1.0.8 +- c-ares=1.34.3 +- ca-certificates=2024.8.30 +- cached-property=1.5.2 +- cached_property=1.5.2 +- certifi=2024.8.30 +- certipy=0.2.1 +- cffi=1.17.1 +- 
charset-normalizer=3.4.0 +- colorama=0.4.6 +- comm=0.2.2 +- conda=24.11.0 +- conda-libmamba-solver=24.11.1 +- conda-package-handling=2.4.0 +- conda-package-streaming=0.11.0 +- cpp-expected=1.1.0 +- cryptography=44.0.0 +- debugpy=1.8.9 +- decorator=5.1.1 +- defusedxml=0.7.1 +- distro=1.9.0 +- entrypoints=0.4 +- exceptiongroup=1.2.2 +- executing=2.1.0 +- fmt=11.0.2 +- fqdn=1.5.1 +- frozendict=2.4.6 +- greenlet=3.1.1 +- h11=0.14.0 +- h2=4.1.0 +- hpack=4.0.0 +- httpcore=1.0.7 +- httpx=0.28.1 +- hyperframe=6.0.1 +- idna=3.10 +- importlib-metadata=8.5.0 +- importlib_resources=6.4.5 +- ipykernel=6.29.5 +- ipython=8.30.0 +- ipython_genutils=0.2.0 +- isoduration=20.11.0 +- jedi=0.19.2 +- jinja2=3.1.4 +- json5=0.10.0 +- jsonpatch=1.33 +- jsonpointer=3.0.0 +- jsonschema=4.23.0 +- jsonschema-specifications=2024.10.1 +- jsonschema-with-format-nongpl=4.23.0 +- jupyter-lsp=2.2.5 +- jupyter_client=8.6.3 +- jupyter_core=5.7.2 +- jupyter_events=0.10.0 +- jupyter_server=2.14.2 +- jupyter_server_terminals=0.5.3 +- jupyterhub-base=5.2.1 +- jupyterhub-singleuser=5.2.1 +- jupyterlab=4.3.2 +- jupyterlab_pygments=0.3.0 +- jupyterlab_server=2.27.3 +- keyutils=1.6.1 +- krb5=1.21.3 +- ld_impl_linux-64=2.43 +- libarchive=3.7.7 +- libcurl=8.10.1 +- libedit=3.1.20191231 +- libev=4.33 +- libexpat=2.6.4 +- libffi=3.4.2 +- libgcc=14.2.0 +- libgcc-ng=14.2.0 +- libgomp=14.2.0 +- libiconv=1.17 +- liblzma=5.6.3 +- libmamba=2.0.4 +- libmambapy=2.0.4 +- libnghttp2=1.64.0 +- libnsl=2.0.1 +- libsodium=1.0.20 +- libsolv=0.7.30 +- libsqlite=3.47.0 +- libssh2=1.11.1 +- libstdcxx=14.2.0 +- libstdcxx-ng=14.2.0 +- libuuid=2.38.1 +- libxcrypt=4.4.36 +- libxml2=2.13.5 +- libzlib=1.3.1 +- lz4-c=1.10.0 +- lzo=2.10 +- make=4.4.1 +- mako=1.3.8 +- mamba=2.0.4 +- markupsafe=3.0.2 +- matplotlib-inline=0.1.7 +- menuinst=2.2.0 +- mistune=3.0.2 +- nbclassic=1.1.0 +- nbclient=0.10.1 +- nbconvert-core=7.16.4 +- nbformat=5.10.4 +- ncurses=6.5 +- nest-asyncio=1.6.0 +- nlohmann_json=3.11.3 +- notebook=7.3.1 +- notebook-shim=0.2.4 +- oauthlib=3.2.2 +- openssl=3.4.0 +- overrides=7.7.0 +- packaging=24.2 +- pamela=1.2.0 +- pandocfilters=1.5.0 +- parso=0.8.4 +- pexpect=4.9.0 +- pickleshare=0.7.5 +- pip=24.3.1 +- pkgutil-resolve-name=1.3.10 +- platformdirs=4.3.6 +- pluggy=1.5.0 +- prometheus_client=0.21.1 +- prompt-toolkit=3.0.48 +- psutil=6.1.0 +- ptyprocess=0.7.0 +- pure_eval=0.2.3 +- pybind11-abi=4 +- pycosat=0.6.6 +- pycparser=2.22 +- pydantic=2.10.3 +- pydantic-core=2.27.1 +- pygments=2.18.0 +- pyjwt=2.10.1 +- pysocks=1.7.1 +- python=3.12.8 +- python-dateutil=2.9.0.post0 +- python-fastjsonschema=2.21.1 +- python-json-logger=2.0.7 +- python_abi=3.12 +- pytz=2024.2 +- pyyaml=6.0.2 +- pyzmq=26.2.0 +- readline=8.2 +- referencing=0.35.1 +- reproc=14.2.5.post0 +- reproc-cpp=14.2.5.post0 +- requests=2.32.3 +- rfc3339-validator=0.1.4 +- rfc3986-validator=0.1.1 +- rpds-py=0.22.3 +- ruamel.yaml=0.18.6 +- ruamel.yaml.clib=0.2.8 +- send2trash=1.8.3 +- setuptools=75.6.0 +- simdjson=3.10.1 +- six=1.17.0 +- sniffio=1.3.1 +- soupsieve=2.5 +- spdlog=1.14.1 +- sqlalchemy=2.0.36 +- stack_data=0.6.3 +- terminado=0.18.1 +- tinycss2=1.4.0 +- tk=8.6.13 +- tomli=2.2.1 +- tornado=6.4.2 +- tqdm=4.67.1 +- traitlets=5.14.3 +- truststore=0.10.0 +- types-python-dateutil=2.9.0.20241206 +- typing-extensions=4.12.2 +- typing_extensions=4.12.2 +- typing_utils=0.1.0 +- tzdata=2024b +- uri-template=1.3.0 +- urllib3=2.2.3 +- wcwidth=0.2.13 +- webcolors=24.11.1 +- webencodings=0.5.1 +- websocket-client=1.8.0 +- wheel=0.45.1 +- yaml=0.2.5 +- yaml-cpp=0.8.0 +- zeromq=4.3.5 +- zipp=3.21.0 +- 
zstandard=0.23.0 +- zstd=1.5.6 +- pip: + - adbc-driver-flightsql==1.3.0 + - adbc-driver-manager==1.3.0 + - altair==5.5.0 + - autovizwidget==0.22.0 + - bokeh==3.6.2 + - branca==0.8.1 + - cftime==1.6.4.post1 + - chroma-py==0.1.0.dev1 + - click==8.1.8 + - cloudpickle==3.1.0 + - colorcet==3.1.0 + - colour==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - dask==2024.12.1 + - folium==0.19.2 + - fonttools==4.55.3 + - fsspec==2024.12.0 + - geojson==3.2.0 + - geopandas==1.0.1 + - hdijupyterutils==0.22.0 + - holoviews==1.20.0 + - hvplot==0.11.2 + - ipywidgets==8.1.5 + - jupyter==1.1.1 + - jupyter-console==6.6.3 + - jupyterlab_widgets==3.0.13 + - kamu==0.6.0 + - kiwisolver==1.4.8 + - linkify-it-py==2.0.3 + - livy==0.8.0 + - locket==1.0.0 + - mapboxgl==0.10.2 + - Markdown==3.7 + - markdown-it-py==3.0.0 + - matplotlib==3.10.0 + - mdit-py-plugins==0.4.2 + - mdurl==0.1.2 + - narwhals==1.19.1 + - netCDF4==1.7.2 + - numpy==2.2.1 + - pandas==2.2.3 + - pandas-bokeh==0.5.5 + - panel==1.5.5 + - param==2.2.0 + - partd==1.4.2 + - pillow==11.0.0 + - plotly==5.24.1 + - pyarrow==18.1.0 + - pyogrio==0.10.0 + - pyparsing==3.2.0 + - pyproj==3.7.0 + - pyviz_comms==3.0.3 + - setuptools==75.6.0 + - shapely==2.0.6 + - tenacity==9.0.0 + - toolz==1.0.0 + - tzdata==2024.2 + - uc-micro-py==1.0.3 + - wheel==0.45.1 + - widgetsnbextension==4.0.13 + - xarray==2024.11.0 + - xyzservices==2024.9.0 + - zstandard==0.23.0 +name: base +prefix: /opt/conda diff --git a/images/demo/jupyter/requirements/linux/amd64/requirements.in b/images/demo/jupyter/requirements/linux/amd64/requirements.in index d7b27f8f59..68140ad718 100644 --- a/images/demo/jupyter/requirements/linux/amd64/requirements.in +++ b/images/demo/jupyter/requirements/linux/amd64/requirements.in @@ -1,21 +1,16 @@ -# TODO: Pinned due to sparkmagic installation issue -# See: https://github.com/jupyter-incubator/sparkmagic/issues/825 -# See workaround applied in: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -notebook==6.5.5 +kamu[jupyter-autoviz,jupyter-sql,spark] -sparkmagic - -pandas +dask geopandas geojson -xarray netcdf4 -dask +pandas +xarray +altair bokeh -hvplot -pandas-bokeh folium -altair +hvplot mapboxgl +pandas-bokeh shapely diff --git a/images/demo/jupyter/requirements/linux/amd64/requirements.txt b/images/demo/jupyter/requirements/linux/amd64/requirements.txt deleted file mode 100644 index c24a4e3dcf..0000000000 --- a/images/demo/jupyter/requirements/linux/amd64/requirements.txt +++ /dev/null @@ -1,414 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -_libgcc_mutex=0.1=conda_forge -_openmp_mutex=4.5=2_gnu -alembic=1.13.1=pyhd8ed1ab_1 -altair=5.2.0=pyhd8ed1ab_0 -anyio=4.2.0=pyhd8ed1ab_0 -archspec=0.2.2=pyhd8ed1ab_0 -argon2-cffi=23.1.0=pyhd8ed1ab_0 -argon2-cffi-bindings=21.2.0=py311h459d7ec_4 -arrow=1.3.0=pyhd8ed1ab_0 -asttokens=2.4.1=pyhd8ed1ab_0 -async-lru=2.0.4=pyhd8ed1ab_0 -async_generator=1.10=py_0 -attrs=23.2.0=pyh71513ae_0 -autovizwidget=0.21.0=pyh1a96a4e_1 -aws-c-auth=0.7.16=h70caa3e_0 -aws-c-cal=0.6.9=h14ec70c_3 -aws-c-common=0.9.12=hd590300_0 -aws-c-compression=0.2.17=h572eabf_8 -aws-c-event-stream=0.4.2=h17cd1f3_0 -aws-c-http=0.8.0=hc6da83f_5 -aws-c-io=0.14.3=h3c8c088_1 -aws-c-mqtt=0.10.2=h0ef3971_0 -aws-c-s3=0.5.1=h2910485_1 -aws-c-sdkutils=0.1.14=h572eabf_0 -aws-checksums=0.1.17=h572eabf_7 -aws-crt-cpp=0.26.2=ha623a59_3 -aws-sdk-cpp=1.11.267=h0bb408c_0 -azure-core-cpp=1.10.3=h91d86a7_1 -azure-storage-blobs-cpp=12.10.0=h00ab1b0_0 
-azure-storage-common-cpp=12.5.0=hb858b4b_2 -babel=2.14.0=pyhd8ed1ab_0 -beautifulsoup4=4.12.3=pyha770c72_0 -bleach=6.1.0=pyhd8ed1ab_0 -blinker=1.7.0=pyhd8ed1ab_0 -blosc=1.21.5=h0f2a231_0 -bokeh=3.3.4=pyhd8ed1ab_0 -boltons=23.1.1=pyhd8ed1ab_0 -branca=0.7.1=pyhd8ed1ab_0 -brotli=1.1.0=hd590300_1 -brotli-bin=1.1.0=hd590300_1 -brotli-python=1.1.0=py311hb755f60_1 -bzip2=1.0.8=hd590300_5 -c-ares=1.26.0=hd590300_0 -ca-certificates=2024.2.2=hbcca054_0 -cached-property=1.5.2=hd8ed1ab_1 -cached_property=1.5.2=pyha770c72_1 -cairo=1.18.0=h3faef2a_0 -certifi=2024.2.2=pyhd8ed1ab_0 -certipy=0.1.3=py_0 -cffi=1.16.0=py311hb3a22ac_0 -cfitsio=4.3.1=hbdc6101_0 -cftime=1.6.3=py311h1f0f07a_0 -charset-normalizer=3.3.2=pyhd8ed1ab_0 -chroma-py=0.1.0.dev1=py_0 -click=8.1.7=unix_pyh707e725_0 -click-plugins=1.1.1=py_0 -cligj=0.7.2=pyhd8ed1ab_1 -cloudpickle=3.0.0=pyhd8ed1ab_0 -colorama=0.4.6=pyhd8ed1ab_0 -colorcet=3.0.1=pyhd8ed1ab_0 -colour=0.1.5=pyhd8ed1ab_1 -comm=0.2.1=pyhd8ed1ab_0 -conda=23.11.0=py311h38be061_1 -conda-libmamba-solver=24.1.0=pyhd8ed1ab_0 -conda-package-handling=2.2.0=pyh38be061_0 -conda-package-streaming=0.9.0=pyhd8ed1ab_0 -configurable-http-proxy=4.6.1=h92b4e83_0 -contourpy=1.2.0=py311h9547e67_0 -cryptography=42.0.2=py311hcb13ee4_0 -cycler=0.12.1=pyhd8ed1ab_0 -cytoolz=0.12.3=py311h459d7ec_0 -dask=2024.2.0=pyhd8ed1ab_0 -dask-core=2024.2.0=pyhd8ed1ab_0 -debugpy=1.8.1=py311hb755f60_0 -decorator=5.1.1=pyhd8ed1ab_0 -defusedxml=0.7.1=pyhd8ed1ab_0 -distributed=2024.2.0=pyhd8ed1ab_0 -distro=1.9.0=pyhd8ed1ab_0 -entrypoints=0.4=pyhd8ed1ab_0 -exceptiongroup=1.2.0=pyhd8ed1ab_2 -executing=2.0.1=pyhd8ed1ab_0 -expat=2.5.0=hcb278e6_1 -fiona=1.9.5=py311hf8e0aa6_3 -fmt=10.2.1=h00ab1b0_0 -folium=0.15.1=pyhd8ed1ab_0 -font-ttf-dejavu-sans-mono=2.37=hab24e00_0 -font-ttf-inconsolata=3.000=h77eed37_0 -font-ttf-source-code-pro=2.038=h77eed37_0 -font-ttf-ubuntu=0.83=h77eed37_1 -fontconfig=2.14.2=h14ed4e7_0 -fonts-conda-ecosystem=1=0 -fonts-conda-forge=1=0 -fonttools=4.49.0=py311h459d7ec_0 -fqdn=1.5.1=pyhd8ed1ab_0 -freetype=2.12.1=h267a509_2 -freexl=2.0.0=h743c826_0 -fsspec=2024.2.0=pyhca7485f_0 -gdal=3.8.4=py311h8be719e_0 -geojson=3.1.0=pyhd8ed1ab_0 -geopandas=0.14.3=pyhd8ed1ab_0 -geopandas-base=0.14.3=pyha770c72_0 -geos=3.12.1=h59595ed_0 -geotiff=1.7.1=h6b2125f_15 -gettext=0.21.1=h27087fc_0 -gflags=2.2.2=he1b5a44_1004 -giflib=5.2.1=h0b41bf4_3 -glog=0.6.0=h6f12383_0 -greenlet=3.0.3=py311hb755f60_0 -h11=0.14.0=pyhd8ed1ab_0 -h2=4.1.0=pyhd8ed1ab_0 -hdf4=4.2.15=h2a13503_7 -hdf5=1.14.3=nompi_h4f84152_100 -hdijupyterutils=0.21.0=pyh1a96a4e_1 -holoviews=1.18.3=pyhd8ed1ab_0 -hpack=4.0.0=pyh9f0ad1d_0 -httpcore=1.0.2=pyhd8ed1ab_0 -httpx=0.26.0=pyhd8ed1ab_0 -hvplot=0.9.2=pyhd8ed1ab_0 -hyperframe=6.0.1=pyhd8ed1ab_0 -icu=73.2=h59595ed_0 -idna=3.6=pyhd8ed1ab_0 -importlib-metadata=7.0.1=pyha770c72_0 -importlib_metadata=7.0.1=hd8ed1ab_0 -importlib_resources=6.1.1=pyhd8ed1ab_0 -ipykernel=6.29.2=pyhd33586a_0 -ipython=8.21.0=pyh707e725_0 -ipython_genutils=0.2.0=py_1 -ipywidgets=8.1.2=pyhd8ed1ab_0 -isoduration=20.11.0=pyhd8ed1ab_0 -jedi=0.19.1=pyhd8ed1ab_0 -jinja2=3.1.3=pyhd8ed1ab_0 -joblib=1.3.2=pyhd8ed1ab_0 -json-c=0.17=h7ab15ed_0 -json5=0.9.14=pyhd8ed1ab_0 -jsonpatch=1.33=pyhd8ed1ab_0 -jsonpointer=2.4=py311h38be061_3 -jsonschema=4.21.1=pyhd8ed1ab_0 -jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 -jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 -jupyter=1.0.0=pyhd8ed1ab_10 -jupyter-lsp=2.2.2=pyhd8ed1ab_0 -jupyter_client=7.4.9=pyhd8ed1ab_0 -jupyter_console=6.6.3=pyhd8ed1ab_0 -jupyter_core=5.7.1=py311h38be061_0 
-jupyter_events=0.9.0=pyhd8ed1ab_0 -jupyter_server=2.12.5=pyhd8ed1ab_0 -jupyter_server_terminals=0.5.2=pyhd8ed1ab_0 -jupyter_telemetry=0.1.0=pyhd8ed1ab_1 -jupyterhub=4.0.2=pyh31011fe_0 -jupyterhub-base=4.0.2=pyh31011fe_0 -jupyterlab=4.1.1=pyhd8ed1ab_0 -jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 -jupyterlab_server=2.25.2=pyhd8ed1ab_0 -jupyterlab_widgets=3.0.10=pyhd8ed1ab_0 -kealib=1.5.3=h2f55d51_0 -keyutils=1.6.1=h166bdaf_0 -kiwisolver=1.4.5=py311h9547e67_1 -krb5=1.21.2=h659d440_0 -lcms2=2.16=hb7c19ff_0 -ld_impl_linux-64=2.40=h41732ed_0 -lerc=4.0.0=h27087fc_0 -libabseil=20230802.1=cxx17_h59595ed_0 -libaec=1.1.2=h59595ed_1 -libarchive=3.7.2=h2aa1ff5_1 -libarrow=15.0.0=h49c8883_4_cpu -libarrow-acero=15.0.0=h59595ed_4_cpu -libarrow-dataset=15.0.0=h59595ed_4_cpu -libarrow-flight=15.0.0=hdc44a87_4_cpu -libarrow-flight-sql=15.0.0=hfbc7f12_4_cpu -libarrow-gandiva=15.0.0=h308e607_4_cpu -libarrow-substrait=15.0.0=hfbc7f12_4_cpu -libblas=3.9.0=21_linux64_openblas -libboost-headers=1.84.0=ha770c72_1 -libbrotlicommon=1.1.0=hd590300_1 -libbrotlidec=1.1.0=hd590300_1 -libbrotlienc=1.1.0=hd590300_1 -libcblas=3.9.0=21_linux64_openblas -libcrc32c=1.1.2=h9c3ff4c_0 -libcurl=8.5.0=hca28451_0 -libdeflate=1.19=hd590300_0 -libedit=3.1.20191231=he28a2e2_2 -libev=4.33=hd590300_2 -libevent=2.1.12=hf998b51_1 -libexpat=2.5.0=hcb278e6_1 -libffi=3.4.2=h7f98852_5 -libgcc-ng=13.2.0=h807b86a_5 -libgdal=3.8.4=h9323651_0 -libgfortran-ng=13.2.0=h69a702a_5 -libgfortran5=13.2.0=ha4646dd_5 -libglib=2.78.4=h783c2da_0 -libgomp=13.2.0=h807b86a_5 -libgoogle-cloud=2.12.0=hef10d8f_5 -libgrpc=1.60.1=h74775cd_0 -libiconv=1.17=hd590300_2 -libjpeg-turbo=3.0.0=hd590300_1 -libkml=1.3.0=h01aab08_1018 -liblapack=3.9.0=21_linux64_openblas -libllvm15=15.0.7=hb3ce162_4 -libmamba=1.5.6=had39da4_0 -libmambapy=1.5.6=py311hf2555c7_0 -libnetcdf=4.9.2=nompi_h9612171_113 -libnghttp2=1.58.0=h47da74e_1 -libnl=3.9.0=hd590300_0 -libnsl=2.0.1=hd590300_0 -libnuma=2.0.16=h0b41bf4_1 -libopenblas=0.3.26=pthreads_h413a1c8_0 -libparquet=15.0.0=h352af49_4_cpu -libpng=1.6.42=h2797004_0 -libpq=16.2=h33b98f1_0 -libprotobuf=4.25.1=hf27288f_2 -libre2-11=2023.06.02=h7a70373_0 -librttopo=1.1.0=h8917695_15 -libsodium=1.0.18=h36c2ea0_1 -libsolv=0.7.28=hfc55251_0 -libspatialindex=1.9.3=h9c3ff4c_4 -libspatialite=5.1.0=h7bd4643_4 -libsqlite=3.45.1=h2797004_0 -libssh2=1.11.0=h0841786_0 -libstdcxx-ng=13.2.0=h7e041cc_5 -libthrift=0.19.0=hb90f79a_1 -libtiff=4.6.0=ha9c0a0a_2 -libutf8proc=2.8.0=h166bdaf_0 -libuuid=2.38.1=h0b41bf4_0 -libuv=1.46.0=hd590300_0 -libwebp-base=1.3.2=hd590300_0 -libxcb=1.15=h0b41bf4_0 -libxcrypt=4.4.36=hd590300_1 -libxml2=2.12.5=h232c23b_0 -libzip=1.10.1=h2629f0a_3 -libzlib=1.2.13=hd590300_5 -linkify-it-py=2.0.3=pyhd8ed1ab_0 -locket=1.0.0=pyhd8ed1ab_0 -lz4=4.3.3=py311h38e4bf4_0 -lz4-c=1.9.4=hcb278e6_0 -lzo=2.10=h516909a_1000 -make=4.3=hd18ef5c_1 -mako=1.3.2=pyhd8ed1ab_0 -mamba=1.5.6=py311h3072747_0 -mapboxgl=0.10.2=py_1 -mapclassify=2.6.1=pyhd8ed1ab_0 -markdown=3.5.2=pyhd8ed1ab_0 -markdown-it-py=3.0.0=pyhd8ed1ab_0 -markupsafe=2.1.5=py311h459d7ec_0 -matplotlib-base=3.8.3=py311h54ef318_0 -matplotlib-inline=0.1.6=pyhd8ed1ab_0 -mdit-py-plugins=0.4.0=pyhd8ed1ab_0 -mdurl=0.1.2=pyhd8ed1ab_0 -menuinst=2.0.2=py311h38be061_0 -minizip=4.0.4=h0ab5242_0 -mistune=3.0.2=pyhd8ed1ab_0 -msgpack-python=1.0.7=py311h9547e67_0 -munkres=1.1.4=pyh9f0ad1d_0 -nbclassic=1.0.0=pyhb4ecaf3_1 -nbclient=0.8.0=pyhd8ed1ab_0 -nbconvert=7.16.0=pyhd8ed1ab_0 -nbconvert-core=7.16.0=pyhd8ed1ab_0 -nbconvert-pandoc=7.16.0=pyhd8ed1ab_0 -nbformat=5.9.2=pyhd8ed1ab_0 -ncurses=6.4=h59595ed_2 
-nest-asyncio=1.6.0=pyhd8ed1ab_0 -netcdf4=1.6.5=nompi_py311he8ad708_100 -networkx=3.2.1=pyhd8ed1ab_0 -nodejs=20.9.0=hb753e55_0 -notebook=6.5.5=pyha770c72_0 -notebook-shim=0.2.3=pyhd8ed1ab_0 -nspr=4.35=h27087fc_0 -nss=3.98=h1d7d5a4_0 -numpy=1.26.4=py311h64a7726_0 -oauthlib=3.2.2=pyhd8ed1ab_0 -openjpeg=2.5.0=h488ebb8_3 -openssl=3.2.1=hd590300_0 -orc=1.9.2=h7829240_1 -overrides=7.7.0=pyhd8ed1ab_0 -packaging=23.2=pyhd8ed1ab_0 -pamela=1.1.0=pyh1a96a4e_0 -pandas=1.5.3=py311h2872171_1 -pandas-bokeh=0.5.5=pyhd8ed1ab_0 -pandoc=3.1.11.1=ha770c72_0 -pandocfilters=1.5.0=pyhd8ed1ab_0 -panel=1.3.8=pyhd8ed1ab_0 -param=2.0.2=pyhca7485f_0 -parso=0.8.3=pyhd8ed1ab_0 -partd=1.4.1=pyhd8ed1ab_0 -pcre2=10.42=hcad00b1_0 -pexpect=4.9.0=pyhd8ed1ab_0 -pickleshare=0.7.5=py_1003 -pillow=10.2.0=py311ha6c5da5_0 -pip=24.0=pyhd8ed1ab_0 -pixman=0.43.2=h59595ed_0 -pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 -platformdirs=4.2.0=pyhd8ed1ab_0 -plotly=5.19.0=pyhd8ed1ab_0 -pluggy=1.4.0=pyhd8ed1ab_0 -poppler=24.02.0=h590f24d_0 -poppler-data=0.4.12=hd8ed1ab_0 -postgresql=16.2=h7387d8b_0 -proj=9.3.1=h1d62c97_0 -prometheus_client=0.19.0=pyhd8ed1ab_0 -prompt-toolkit=3.0.42=pyha770c72_0 -prompt_toolkit=3.0.42=hd8ed1ab_0 -psutil=5.9.8=py311h459d7ec_0 -pthread-stubs=0.4=h36c2ea0_1001 -ptyprocess=0.7.0=pyhd3deb0d_0 -pure_eval=0.2.2=pyhd8ed1ab_0 -pyarrow=15.0.0=py311h39c9aba_4_cpu -pyarrow-hotfix=0.6=pyhd8ed1ab_0 -pybind11-abi=4=hd8ed1ab_3 -pycosat=0.6.6=py311h459d7ec_0 -pycparser=2.21=pyhd8ed1ab_0 -pyct=0.5.0=pyhd8ed1ab_0 -pycurl=7.45.1=py311hae980a4_3 -pygments=2.17.2=pyhd8ed1ab_0 -pyjwt=2.8.0=pyhd8ed1ab_1 -pyopenssl=24.0.0=pyhd8ed1ab_0 -pyparsing=3.1.1=pyhd8ed1ab_0 -pyproj=3.6.1=py311hca0b8b9_5 -pysocks=1.7.1=pyha2e5f31_6 -pyspnego=0.9.1=py311h459d7ec_2 -python=3.11.7=hab00c5b_1_cpython -python-dateutil=2.8.2=pyhd8ed1ab_0 -python-fastjsonschema=2.19.1=pyhd8ed1ab_0 -python-json-logger=2.0.7=pyhd8ed1ab_0 -python_abi=3.11=4_cp311 -pytz=2024.1=pyhd8ed1ab_0 -pyviz_comms=3.0.0=pyhd8ed1ab_0 -pyyaml=6.0.1=py311h459d7ec_1 -pyzmq=24.0.1=py311ha4b6469_1 -qtconsole-base=5.5.1=pyha770c72_0 -qtpy=2.4.1=pyhd8ed1ab_0 -rdma-core=50.0=hd3aeb46_1 -re2=2023.06.02=h2873b5e_0 -readline=8.2=h8228510_1 -referencing=0.33.0=pyhd8ed1ab_0 -reproc=14.2.4.post0=hd590300_1 -reproc-cpp=14.2.4.post0=h59595ed_1 -requests=2.31.0=pyhd8ed1ab_0 -requests-kerberos=0.14.0=pyhd8ed1ab_1 -rfc3339-validator=0.1.4=pyhd8ed1ab_0 -rfc3986-validator=0.1.1=pyh9f0ad1d_0 -rpds-py=0.17.1=py311h46250e7_0 -rtree=1.2.0=py311h3bb2b0f_0 -ruamel.yaml=0.18.6=py311h459d7ec_0 -ruamel.yaml.clib=0.2.8=py311h459d7ec_0 -s2n=1.4.3=h06160fa_0 -scikit-learn=1.4.1.post1=py311hc009520_0 -scipy=1.12.0=py311h64a7726_2 -send2trash=1.8.2=pyh41d4057_0 -setuptools=69.0.3=pyhd8ed1ab_0 -shapely=2.0.3=py311h2032efe_0 -six=1.16.0=pyh6c4a22f_0 -snappy=1.1.10=h9fff704_0 -sniffio=1.3.0=pyhd8ed1ab_0 -sortedcontainers=2.4.0=pyhd8ed1ab_0 -soupsieve=2.5=pyhd8ed1ab_1 -sparkmagic=0.21.0=pyhd8ed1ab_1 -sqlalchemy=2.0.26=py311h459d7ec_0 -sqlite=3.45.1=h2c6b66d_0 -stack_data=0.6.2=pyhd8ed1ab_0 -tblib=3.0.0=pyhd8ed1ab_0 -tenacity=8.2.3=pyhd8ed1ab_0 -terminado=0.18.0=pyh0d859eb_0 -threadpoolctl=3.3.0=pyhc1e730c_0 -tiledb=2.20.0=h4386cac_0 -tinycss2=1.2.1=pyhd8ed1ab_0 -tk=8.6.13=noxft_h4845f30_101 -tomli=2.0.1=pyhd8ed1ab_0 -toolz=0.12.1=pyhd8ed1ab_0 -tornado=6.3.3=py311h459d7ec_1 -tqdm=4.66.2=pyhd8ed1ab_0 -traitlets=5.9.0=pyhd8ed1ab_0 -truststore=0.8.0=pyhd8ed1ab_0 -types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 -typing-extensions=4.9.0=hd8ed1ab_0 -typing_extensions=4.9.0=pyha770c72_0 -typing_utils=0.1.0=pyhd8ed1ab_0 
-tzcode=2024a=h3f72095_0 -tzdata=2024a=h0c530f3_0 -uc-micro-py=1.0.3=pyhd8ed1ab_0 -ucx=1.15.0=h75e419f_3 -uri-template=1.3.0=pyhd8ed1ab_0 -uriparser=0.9.7=hcb278e6_1 -urllib3=2.2.0=pyhd8ed1ab_0 -wcwidth=0.2.13=pyhd8ed1ab_0 -webcolors=1.13=pyhd8ed1ab_0 -webencodings=0.5.1=pyhd8ed1ab_2 -websocket-client=1.7.0=pyhd8ed1ab_0 -wheel=0.42.0=pyhd8ed1ab_0 -widgetsnbextension=4.0.10=pyhd8ed1ab_0 -xarray=2024.2.0=pyhd8ed1ab_0 -xerces-c=3.2.5=hac6953d_0 -xorg-kbproto=1.0.7=h7f98852_1002 -xorg-libice=1.1.1=hd590300_0 -xorg-libsm=1.2.4=h7391055_0 -xorg-libx11=1.8.7=h8ee46fc_0 -xorg-libxau=1.0.11=hd590300_0 -xorg-libxdmcp=1.1.3=h7f98852_0 -xorg-libxext=1.3.4=h0b41bf4_2 -xorg-libxrender=0.9.11=hd590300_0 -xorg-renderproto=0.11.1=h7f98852_1002 -xorg-xextproto=7.3.0=h0b41bf4_1003 -xorg-xproto=7.0.31=h7f98852_1007 -xyzservices=2023.10.1=pyhd8ed1ab_0 -xz=5.2.6=h166bdaf_0 -yaml=0.2.5=h7f98852_2 -yaml-cpp=0.8.0=h59595ed_0 -zeromq=4.3.5=h59595ed_0 -zict=3.0.0=pyhd8ed1ab_0 -zipp=3.17.0=pyhd8ed1ab_0 -zlib=1.2.13=hd590300_5 -zstandard=0.22.0=py311haa97af0_0 -zstd=1.5.5=hfc55251_0 diff --git a/images/demo/jupyter/requirements/linux/arm64/env.yaml b/images/demo/jupyter/requirements/linux/arm64/env.yaml new file mode 100644 index 0000000000..1b0f42d63d --- /dev/null +++ b/images/demo/jupyter/requirements/linux/arm64/env.yaml @@ -0,0 +1,274 @@ +channels: +- conda-forge +dependencies: +- _openmp_mutex=4.5 +- alembic=1.14.0 +- annotated-types=0.7.0 +- anyio=4.7.0 +- archspec=0.2.3 +- argon2-cffi=23.1.0 +- argon2-cffi-bindings=21.2.0 +- arrow=1.3.0 +- asttokens=3.0.0 +- async-lru=2.0.4 +- async_generator=1.10 +- attrs=24.2.0 +- babel=2.16.0 +- beautifulsoup4=4.12.3 +- bleach=6.2.0 +- blinker=1.9.0 +- boltons=24.0.0 +- brotli-python=1.1.0 +- bzip2=1.0.8 +- c-ares=1.34.3 +- ca-certificates=2024.8.30 +- cached-property=1.5.2 +- cached_property=1.5.2 +- certifi=2024.8.30 +- certipy=0.2.1 +- cffi=1.17.1 +- charset-normalizer=3.4.0 +- colorama=0.4.6 +- comm=0.2.2 +- conda=24.11.0 +- conda-libmamba-solver=24.11.1 +- conda-package-handling=2.4.0 +- conda-package-streaming=0.11.0 +- cpp-expected=1.1.0 +- cryptography=44.0.0 +- debugpy=1.8.9 +- decorator=5.1.1 +- defusedxml=0.7.1 +- distro=1.9.0 +- entrypoints=0.4 +- exceptiongroup=1.2.2 +- executing=2.1.0 +- fmt=11.0.2 +- fqdn=1.5.1 +- frozendict=2.4.6 +- greenlet=3.1.1 +- h11=0.14.0 +- h2=4.1.0 +- hpack=4.0.0 +- httpcore=1.0.7 +- httpx=0.28.1 +- hyperframe=6.0.1 +- icu=75.1 +- idna=3.10 +- importlib-metadata=8.5.0 +- importlib_resources=6.4.5 +- ipykernel=6.29.5 +- ipython=8.30.0 +- ipython_genutils=0.2.0 +- isoduration=20.11.0 +- jedi=0.19.2 +- jinja2=3.1.4 +- json5=0.10.0 +- jsonpatch=1.33 +- jsonpointer=3.0.0 +- jsonschema=4.23.0 +- jsonschema-specifications=2024.10.1 +- jsonschema-with-format-nongpl=4.23.0 +- jupyter-lsp=2.2.5 +- jupyter_client=8.6.3 +- jupyter_core=5.7.2 +- jupyter_events=0.10.0 +- jupyter_server=2.14.2 +- jupyter_server_terminals=0.5.3 +- jupyterhub-base=5.2.1 +- jupyterhub-singleuser=5.2.1 +- jupyterlab=4.3.2 +- jupyterlab_pygments=0.3.0 +- jupyterlab_server=2.27.3 +- keyutils=1.6.1 +- krb5=1.21.3 +- ld_impl_linux-aarch64=2.43 +- libarchive=3.7.7 +- libcurl=8.10.1 +- libedit=3.1.20191231 +- libev=4.33 +- libexpat=2.6.4 +- libffi=3.4.2 +- libgcc=14.2.0 +- libgcc-ng=14.2.0 +- libgomp=14.2.0 +- libiconv=1.17 +- liblzma=5.6.3 +- libmamba=2.0.4 +- libmambapy=2.0.4 +- libnghttp2=1.64.0 +- libnsl=2.0.1 +- libsodium=1.0.20 +- libsolv=0.7.30 +- libsqlite=3.47.0 +- libssh2=1.11.1 +- libstdcxx=14.2.0 +- libstdcxx-ng=14.2.0 +- libuuid=2.38.1 +- libxcrypt=4.4.36 +- 
libxml2=2.13.5 +- libzlib=1.3.1 +- lz4-c=1.10.0 +- lzo=2.10 +- make=4.4.1 +- mako=1.3.8 +- mamba=2.0.4 +- markupsafe=3.0.2 +- matplotlib-inline=0.1.7 +- menuinst=2.2.0 +- mistune=3.0.2 +- nbclassic=1.1.0 +- nbclient=0.10.1 +- nbconvert-core=7.16.4 +- nbformat=5.10.4 +- ncurses=6.5 +- nest-asyncio=1.6.0 +- nlohmann_json=3.11.3 +- notebook=7.3.1 +- notebook-shim=0.2.4 +- oauthlib=3.2.2 +- openssl=3.4.0 +- overrides=7.7.0 +- packaging=24.2 +- pamela=1.2.0 +- pandocfilters=1.5.0 +- parso=0.8.4 +- pexpect=4.9.0 +- pickleshare=0.7.5 +- pip=24.3.1 +- pkgutil-resolve-name=1.3.10 +- platformdirs=4.3.6 +- pluggy=1.5.0 +- prometheus_client=0.21.1 +- prompt-toolkit=3.0.48 +- psutil=6.1.0 +- ptyprocess=0.7.0 +- pure_eval=0.2.3 +- pybind11-abi=4 +- pycosat=0.6.6 +- pycparser=2.22 +- pydantic=2.10.3 +- pydantic-core=2.27.1 +- pygments=2.18.0 +- pyjwt=2.10.1 +- pysocks=1.7.1 +- python=3.12.8 +- python-dateutil=2.9.0.post0 +- python-fastjsonschema=2.21.1 +- python-json-logger=2.0.7 +- python_abi=3.12 +- pytz=2024.2 +- pyyaml=6.0.2 +- pyzmq=26.2.0 +- readline=8.2 +- referencing=0.35.1 +- reproc=14.2.4.post0 +- reproc-cpp=14.2.4.post0 +- requests=2.32.3 +- rfc3339-validator=0.1.4 +- rfc3986-validator=0.1.1 +- rpds-py=0.22.3 +- ruamel.yaml=0.18.6 +- ruamel.yaml.clib=0.2.8 +- send2trash=1.8.3 +- setuptools=75.6.0 +- simdjson=3.10.1 +- six=1.17.0 +- sniffio=1.3.1 +- soupsieve=2.5 +- spdlog=1.14.1 +- sqlalchemy=2.0.36 +- stack_data=0.6.3 +- terminado=0.18.1 +- tinycss2=1.4.0 +- tk=8.6.13 +- tomli=2.2.1 +- tornado=6.4.2 +- tqdm=4.67.1 +- traitlets=5.14.3 +- truststore=0.10.0 +- types-python-dateutil=2.9.0.20241206 +- typing-extensions=4.12.2 +- typing_extensions=4.12.2 +- typing_utils=0.1.0 +- tzdata=2024b +- uri-template=1.3.0 +- urllib3=2.2.3 +- wcwidth=0.2.13 +- webcolors=24.11.1 +- webencodings=0.5.1 +- websocket-client=1.8.0 +- wheel=0.45.1 +- yaml=0.2.5 +- yaml-cpp=0.8.0 +- zeromq=4.3.5 +- zipp=3.21.0 +- zstandard=0.23.0 +- zstd=1.5.6 +- pip: + - adbc-driver-flightsql==1.3.0 + - adbc-driver-manager==1.3.0 + - altair==5.5.0 + - autovizwidget==0.22.0 + - bokeh==3.6.2 + - branca==0.8.1 + - cftime==1.6.4.post1 + - chroma-py==0.1.0.dev1 + - click==8.1.8 + - cloudpickle==3.1.0 + - colorcet==3.1.0 + - colour==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - dask==2024.12.1 + - folium==0.19.2 + - fonttools==4.55.3 + - fsspec==2024.12.0 + - geojson==3.2.0 + - geopandas==1.0.1 + - hdijupyterutils==0.22.0 + - holoviews==1.20.0 + - hvplot==0.11.2 + - ipywidgets==8.1.5 + - jupyter==1.1.1 + - jupyter-console==6.6.3 + - jupyterlab_widgets==3.0.13 + - kamu==0.6.0 + - kiwisolver==1.4.8 + - linkify-it-py==2.0.3 + - livy==0.8.0 + - locket==1.0.0 + - mapboxgl==0.10.2 + - Markdown==3.7 + - markdown-it-py==3.0.0 + - matplotlib==3.10.0 + - mdit-py-plugins==0.4.2 + - mdurl==0.1.2 + - narwhals==1.19.1 + - netCDF4==1.7.2 + - numpy==2.2.1 + - pandas==2.2.3 + - pandas-bokeh==0.5.5 + - panel==1.5.5 + - param==2.2.0 + - partd==1.4.2 + - pillow==11.0.0 + - plotly==5.24.1 + - pyarrow==18.1.0 + - pyogrio==0.10.0 + - pyparsing==3.2.0 + - pyproj==3.7.0 + - pyviz_comms==3.0.3 + - setuptools==75.6.0 + - shapely==2.0.6 + - tenacity==9.0.0 + - toolz==1.0.0 + - tzdata==2024.2 + - uc-micro-py==1.0.3 + - wheel==0.45.1 + - widgetsnbextension==4.0.13 + - xarray==2024.11.0 + - xyzservices==2024.9.0 + - zstandard==0.23.0 +name: base +prefix: /opt/conda diff --git a/images/demo/jupyter/requirements/linux/arm64/requirements.in b/images/demo/jupyter/requirements/linux/arm64/requirements.in index d7b27f8f59..68140ad718 100644 --- 
a/images/demo/jupyter/requirements/linux/arm64/requirements.in +++ b/images/demo/jupyter/requirements/linux/arm64/requirements.in @@ -1,21 +1,16 @@ -# TODO: Pinned due to sparkmagic installation issue -# See: https://github.com/jupyter-incubator/sparkmagic/issues/825 -# See workaround applied in: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -notebook==6.5.5 +kamu[jupyter-autoviz,jupyter-sql,spark] -sparkmagic - -pandas +dask geopandas geojson -xarray netcdf4 -dask +pandas +xarray +altair bokeh -hvplot -pandas-bokeh folium -altair +hvplot mapboxgl +pandas-bokeh shapely diff --git a/images/demo/jupyter/requirements/linux/arm64/requirements.txt b/images/demo/jupyter/requirements/linux/arm64/requirements.txt deleted file mode 100644 index 832732c7db..0000000000 --- a/images/demo/jupyter/requirements/linux/arm64/requirements.txt +++ /dev/null @@ -1,411 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-aarch64 -_openmp_mutex=4.5=2_gnu -alembic=1.13.1=pyhd8ed1ab_1 -altair=5.2.0=pyhd8ed1ab_0 -anyio=4.2.0=pyhd8ed1ab_0 -archspec=0.2.2=pyhd8ed1ab_0 -argon2-cffi=23.1.0=pyhd8ed1ab_0 -argon2-cffi-bindings=21.2.0=py311hcd402e7_4 -arrow=1.3.0=pyhd8ed1ab_0 -asttokens=2.4.1=pyhd8ed1ab_0 -async-lru=2.0.4=pyhd8ed1ab_0 -async_generator=1.10=py_0 -attrs=23.2.0=pyh71513ae_0 -autovizwidget=0.21.0=pyh1a96a4e_1 -aws-c-auth=0.7.16=h570bf23_5 -aws-c-cal=0.6.10=h967b9ec_1 -aws-c-common=0.9.13=h31becfc_0 -aws-c-compression=0.2.18=h00d1b86_1 -aws-c-event-stream=0.4.2=h10e8a16_3 -aws-c-http=0.8.1=hf0788a4_4 -aws-c-io=0.14.4=h87c19fb_2 -aws-c-mqtt=0.10.2=he8e29e5_3 -aws-c-s3=0.5.1=h71a96cc_6 -aws-c-sdkutils=0.1.15=h00d1b86_1 -aws-checksums=0.1.18=h00d1b86_1 -aws-crt-cpp=0.26.2=h8568a09_5 -aws-sdk-cpp=1.11.267=hfce6cab_1 -azure-core-cpp=1.10.3=hcd87347_1 -azure-storage-blobs-cpp=12.10.0=h2a328a1_0 -azure-storage-common-cpp=12.5.0=hee0c750_2 -babel=2.14.0=pyhd8ed1ab_0 -beautifulsoup4=4.12.3=pyha770c72_0 -bleach=6.1.0=pyhd8ed1ab_0 -blinker=1.7.0=pyhd8ed1ab_0 -blosc=1.21.5=h2f3a684_0 -bokeh=3.3.4=pyhd8ed1ab_0 -boltons=23.1.1=pyhd8ed1ab_0 -branca=0.7.1=pyhd8ed1ab_0 -brotli=1.1.0=h31becfc_1 -brotli-bin=1.1.0=h31becfc_1 -brotli-python=1.1.0=py311h8715677_1 -bzip2=1.0.8=h31becfc_5 -c-ares=1.26.0=h31becfc_0 -ca-certificates=2024.2.2=hcefe29a_0 -cached-property=1.5.2=hd8ed1ab_1 -cached_property=1.5.2=pyha770c72_1 -cairo=1.18.0=ha13f110_0 -certifi=2024.2.2=pyhd8ed1ab_0 -certipy=0.1.3=py_0 -cffi=1.16.0=py311h7963103_0 -cfitsio=4.3.1=hf28c5f1_0 -cftime=1.6.3=py311hf13da56_0 -charset-normalizer=3.3.2=pyhd8ed1ab_0 -chroma-py=0.1.0.dev1=py_0 -click=8.1.7=unix_pyh707e725_0 -click-plugins=1.1.1=py_0 -cligj=0.7.2=pyhd8ed1ab_1 -cloudpickle=3.0.0=pyhd8ed1ab_0 -colorama=0.4.6=pyhd8ed1ab_0 -colorcet=3.0.1=pyhd8ed1ab_0 -colour=0.1.5=pyhd8ed1ab_1 -comm=0.2.1=pyhd8ed1ab_0 -conda=23.11.0=py311hec3470c_1 -conda-libmamba-solver=24.1.0=pyhd8ed1ab_0 -conda-package-handling=2.2.0=pyh38be061_0 -conda-package-streaming=0.9.0=pyhd8ed1ab_0 -configurable-http-proxy=4.6.1=h4e45a9e_0 -contourpy=1.2.0=py311h098ece5_0 -cryptography=42.0.2=py311h2245af3_0 -cycler=0.12.1=pyhd8ed1ab_0 -cytoolz=0.12.3=py311hc8f2f60_0 -dask=2024.2.0=pyhd8ed1ab_0 -dask-core=2024.2.0=pyhd8ed1ab_0 -debugpy=1.8.1=py311h8715677_0 -decorator=5.1.1=pyhd8ed1ab_0 -defusedxml=0.7.1=pyhd8ed1ab_0 -distributed=2024.2.0=pyhd8ed1ab_0 -distro=1.9.0=pyhd8ed1ab_0 -entrypoints=0.4=pyhd8ed1ab_0 -exceptiongroup=1.2.0=pyhd8ed1ab_2 -executing=2.0.1=pyhd8ed1ab_0 -expat=2.5.0=hd600fc2_1 
-fiona=1.9.5=py311he15760a_3 -fmt=10.2.1=h2a328a1_0 -folium=0.15.1=pyhd8ed1ab_0 -font-ttf-dejavu-sans-mono=2.37=hab24e00_0 -font-ttf-inconsolata=3.000=h77eed37_0 -font-ttf-source-code-pro=2.038=h77eed37_0 -font-ttf-ubuntu=0.83=h77eed37_1 -fontconfig=2.14.2=ha9a116f_0 -fonts-conda-ecosystem=1=0 -fonts-conda-forge=1=0 -fonttools=4.49.0=py311hcd402e7_0 -fqdn=1.5.1=pyhd8ed1ab_0 -freetype=2.12.1=hf0a5ef3_2 -freexl=2.0.0=h5428426_0 -fsspec=2024.2.0=pyhca7485f_0 -gdal=3.8.4=py311h3b5b607_0 -geojson=3.1.0=pyhd8ed1ab_0 -geopandas=0.14.3=pyhd8ed1ab_0 -geopandas-base=0.14.3=pyha770c72_0 -geos=3.12.1=h2f0025b_0 -geotiff=1.7.1=h3e58e51_15 -gettext=0.21.1=ha18d298_0 -gflags=2.2.2=h54f1f3f_1004 -giflib=5.2.1=hb4cce97_3 -glog=0.6.0=h8ab10f1_0 -greenlet=3.0.3=py311h8715677_0 -h11=0.14.0=pyhd8ed1ab_0 -h2=4.1.0=pyhd8ed1ab_0 -hdf4=4.2.15=hb6ba311_7 -hdf5=1.14.3=nompi_ha486f32_100 -hdijupyterutils=0.21.0=pyh1a96a4e_1 -holoviews=1.18.3=pyhd8ed1ab_0 -hpack=4.0.0=pyh9f0ad1d_0 -httpcore=1.0.2=pyhd8ed1ab_0 -httpx=0.26.0=pyhd8ed1ab_0 -hvplot=0.9.2=pyhd8ed1ab_0 -hyperframe=6.0.1=pyhd8ed1ab_0 -icu=73.2=h787c7f5_0 -idna=3.6=pyhd8ed1ab_0 -importlib-metadata=7.0.1=pyha770c72_0 -importlib_metadata=7.0.1=hd8ed1ab_0 -importlib_resources=6.1.1=pyhd8ed1ab_0 -ipykernel=6.29.2=pyhd33586a_0 -ipython=8.21.0=pyh707e725_0 -ipython_genutils=0.2.0=py_1 -ipywidgets=8.1.2=pyhd8ed1ab_0 -isoduration=20.11.0=pyhd8ed1ab_0 -jedi=0.19.1=pyhd8ed1ab_0 -jinja2=3.1.3=pyhd8ed1ab_0 -joblib=1.3.2=pyhd8ed1ab_0 -json-c=0.17=h9d1147b_0 -json5=0.9.14=pyhd8ed1ab_0 -jsonpatch=1.33=pyhd8ed1ab_0 -jsonpointer=2.4=py311hec3470c_3 -jsonschema=4.21.1=pyhd8ed1ab_0 -jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 -jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 -jupyter=1.0.0=pyhd8ed1ab_10 -jupyter-lsp=2.2.2=pyhd8ed1ab_0 -jupyter_client=7.4.9=pyhd8ed1ab_0 -jupyter_console=6.6.3=pyhd8ed1ab_0 -jupyter_core=5.7.1=py311hec3470c_0 -jupyter_events=0.9.0=pyhd8ed1ab_0 -jupyter_server=2.12.5=pyhd8ed1ab_0 -jupyter_server_terminals=0.5.2=pyhd8ed1ab_0 -jupyter_telemetry=0.1.0=pyhd8ed1ab_1 -jupyterhub=4.0.2=pyh31011fe_0 -jupyterhub-base=4.0.2=pyh31011fe_0 -jupyterlab=4.1.1=pyhd8ed1ab_0 -jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 -jupyterlab_server=2.25.2=pyhd8ed1ab_0 -jupyterlab_widgets=3.0.10=pyhd8ed1ab_0 -kealib=1.5.3=h4670d8b_0 -keyutils=1.6.1=h4e544f5_0 -kiwisolver=1.4.5=py311h0d5d7b0_1 -krb5=1.21.2=hc419048_0 -lcms2=2.16=h922389a_0 -ld_impl_linux-aarch64=2.40=h2d8c526_0 -lerc=4.0.0=h4de3ea5_0 -libabseil=20230802.1=cxx17_h2f0025b_0 -libaec=1.1.2=h2f0025b_1 -libarchive=3.7.2=hd2f85e0_1 -libarrow=15.0.0=h606a0d5_4_cpu -libarrow-acero=15.0.0=h2f0025b_4_cpu -libarrow-dataset=15.0.0=h2f0025b_4_cpu -libarrow-flight=15.0.0=he69d72d_4_cpu -libarrow-flight-sql=15.0.0=h1fc705f_4_cpu -libarrow-gandiva=15.0.0=h90362dd_4_cpu -libarrow-substrait=15.0.0=h0599332_4_cpu -libblas=3.9.0=21_linuxaarch64_openblas -libboost-headers=1.84.0=h8af1aa0_1 -libbrotlicommon=1.1.0=h31becfc_1 -libbrotlidec=1.1.0=h31becfc_1 -libbrotlienc=1.1.0=h31becfc_1 -libcblas=3.9.0=21_linuxaarch64_openblas -libcrc32c=1.1.2=h01db608_0 -libcurl=8.5.0=h4e8248e_0 -libdeflate=1.19=h31becfc_0 -libedit=3.1.20191231=he28a2e2_2 -libev=4.33=h31becfc_2 -libevent=2.1.12=h4ba1bb4_1 -libexpat=2.5.0=hd600fc2_1 -libffi=3.4.2=h3557bc0_5 -libgcc-ng=13.2.0=hf8544c7_5 -libgdal=3.8.4=h79c3f81_0 -libgfortran-ng=13.2.0=he9431aa_5 -libgfortran5=13.2.0=h582850c_5 -libglib=2.78.4=h311d5f7_0 -libgomp=13.2.0=hf8544c7_5 -libgoogle-cloud=2.12.0=h3b99733_5 -libgrpc=1.60.1=heeb7df3_0 -libiconv=1.17=h31becfc_2 -libjpeg-turbo=3.0.0=h31becfc_1 
-libkml=1.3.0=h7d16752_1018 -liblapack=3.9.0=21_linuxaarch64_openblas -libllvm15=15.0.7=hb4f23b0_4 -libmamba=1.5.6=hea3be6c_0 -libmambapy=1.5.6=py311h765b69a_0 -libnetcdf=4.9.2=nompi_h33102a8_113 -libnghttp2=1.58.0=hb0e430d_1 -libnsl=2.0.1=h31becfc_0 -libnuma=2.0.16=hb4cce97_1 -libopenblas=0.3.26=pthreads_h5a5ec62_0 -libparquet=15.0.0=hb18b541_4_cpu -libpng=1.6.42=h194ca79_0 -libpq=16.2=h58720eb_0 -libprotobuf=4.25.1=h87e877f_2 -libre2-11=2023.06.02=hf48c5ca_0 -librttopo=1.1.0=hd8968fb_15 -libsodium=1.0.18=hb9de7d4_1 -libsolv=0.7.28=hd84c7bf_0 -libspatialindex=1.9.3=h01db608_4 -libspatialite=5.1.0=h896d346_4 -libsqlite=3.45.1=h194ca79_0 -libssh2=1.11.0=h492db2e_0 -libstdcxx-ng=13.2.0=h9a76618_5 -libthrift=0.19.0=h043aeee_1 -libtiff=4.6.0=h1708d11_2 -libutf8proc=2.8.0=h4e544f5_0 -libuuid=2.38.1=hb4cce97_0 -libuv=1.46.0=h31becfc_0 -libwebp-base=1.3.2=h31becfc_0 -libxcb=1.15=h2a766a3_0 -libxcrypt=4.4.36=h31becfc_1 -libxml2=2.12.5=h3091e33_0 -libzip=1.10.1=h4156a30_3 -libzlib=1.2.13=h31becfc_5 -linkify-it-py=2.0.3=pyhd8ed1ab_0 -locket=1.0.0=pyhd8ed1ab_0 -lz4=4.3.3=py311h6a4b261_0 -lz4-c=1.9.4=hd600fc2_0 -lzo=2.10=h516909a_1000 -make=4.3=h309ac5b_1 -mako=1.3.2=pyhd8ed1ab_0 -mamba=1.5.6=py311hb6c5aa6_0 -mapboxgl=0.10.2=py_1 -mapclassify=2.6.1=pyhd8ed1ab_0 -markdown=3.5.2=pyhd8ed1ab_0 -markdown-it-py=3.0.0=pyhd8ed1ab_0 -markupsafe=2.1.5=py311hc8f2f60_0 -matplotlib-base=3.8.3=py311h1f11223_0 -matplotlib-inline=0.1.6=pyhd8ed1ab_0 -mdit-py-plugins=0.4.0=pyhd8ed1ab_0 -mdurl=0.1.2=pyhd8ed1ab_0 -menuinst=2.0.2=py311hec3470c_0 -minizip=4.0.4=hb75dd74_0 -mistune=3.0.2=pyhd8ed1ab_0 -msgpack-python=1.0.7=py311h0d5d7b0_0 -munkres=1.1.4=pyh9f0ad1d_0 -nbclassic=1.0.0=pyhb4ecaf3_1 -nbclient=0.8.0=pyhd8ed1ab_0 -nbconvert=7.16.0=pyhd8ed1ab_0 -nbconvert-core=7.16.0=pyhd8ed1ab_0 -nbconvert-pandoc=7.16.0=pyhd8ed1ab_0 -nbformat=5.9.2=pyhd8ed1ab_0 -ncurses=6.4=h0425590_2 -nest-asyncio=1.6.0=pyhd8ed1ab_0 -netcdf4=1.6.5=nompi_py311hcd50196_100 -networkx=3.2.1=pyhd8ed1ab_0 -nodejs=20.9.0=hc1f8a26_0 -notebook=6.5.5=pyha770c72_0 -notebook-shim=0.2.3=pyhd8ed1ab_0 -nspr=4.35=h4de3ea5_0 -nss=3.98=hc5a5cc2_0 -numpy=1.26.4=py311h69ead2a_0 -oauthlib=3.2.2=pyhd8ed1ab_0 -openjpeg=2.5.0=h0d9d63b_3 -openssl=3.2.1=h31becfc_0 -orc=1.9.2=h5960ff3_1 -overrides=7.7.0=pyhd8ed1ab_0 -packaging=23.2=pyhd8ed1ab_0 -pamela=1.1.0=pyh1a96a4e_0 -pandas=1.5.3=py311hff2c139_1 -pandas-bokeh=0.5.5=pyhd8ed1ab_0 -pandoc=3.1.11.1=h8af1aa0_0 -pandocfilters=1.5.0=pyhd8ed1ab_0 -panel=1.3.8=pyhd8ed1ab_0 -param=2.0.2=pyhca7485f_0 -parso=0.8.3=pyhd8ed1ab_0 -partd=1.4.1=pyhd8ed1ab_0 -pcre2=10.42=hd0f9c67_0 -pexpect=4.9.0=pyhd8ed1ab_0 -pickleshare=0.7.5=py_1003 -pillow=10.2.0=py311hbcc2232_0 -pip=24.0=pyhd8ed1ab_0 -pixman=0.43.2=h2f0025b_0 -pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 -platformdirs=4.2.0=pyhd8ed1ab_0 -plotly=5.19.0=pyhd8ed1ab_0 -pluggy=1.4.0=pyhd8ed1ab_0 -poppler=24.02.0=h3cd87ed_0 -poppler-data=0.4.12=hd8ed1ab_0 -postgresql=16.2=he703394_0 -proj=9.3.1=h7b42f86_0 -prometheus_client=0.19.0=pyhd8ed1ab_0 -prompt-toolkit=3.0.42=pyha770c72_0 -prompt_toolkit=3.0.42=hd8ed1ab_0 -psutil=5.9.8=py311hcd402e7_0 -pthread-stubs=0.4=hb9de7d4_1001 -ptyprocess=0.7.0=pyhd3deb0d_0 -pure_eval=0.2.2=pyhd8ed1ab_0 -pyarrow=15.0.0=py311h1eb6f34_4_cpu -pyarrow-hotfix=0.6=pyhd8ed1ab_0 -pybind11-abi=4=hd8ed1ab_3 -pycosat=0.6.6=py311hcd402e7_0 -pycparser=2.21=pyhd8ed1ab_0 -pyct=0.5.0=pyhd8ed1ab_0 -pycurl=7.45.1=py311h4769251_3 -pygments=2.17.2=pyhd8ed1ab_0 -pyjwt=2.8.0=pyhd8ed1ab_1 -pyopenssl=24.0.0=pyhd8ed1ab_0 -pyparsing=3.1.1=pyhd8ed1ab_0 -pyproj=3.6.1=py311ha6273e5_5 
-pysocks=1.7.1=pyha2e5f31_6 -pyspnego=0.9.1=py311hcd402e7_2 -python=3.11.7=h43d1f9e_1_cpython -python-dateutil=2.8.2=pyhd8ed1ab_0 -python-fastjsonschema=2.19.1=pyhd8ed1ab_0 -python-json-logger=2.0.7=pyhd8ed1ab_0 -python_abi=3.11=4_cp311 -pytz=2024.1=pyhd8ed1ab_0 -pyviz_comms=3.0.0=pyhd8ed1ab_0 -pyyaml=6.0.1=py311hcd402e7_1 -pyzmq=24.0.1=py311h22a2215_1 -qtconsole-base=5.5.1=pyha770c72_0 -qtpy=2.4.1=pyhd8ed1ab_0 -re2=2023.06.02=h887e66c_0 -readline=8.2=h8fc344f_1 -referencing=0.33.0=pyhd8ed1ab_0 -reproc=14.2.4.post0=h31becfc_1 -reproc-cpp=14.2.4.post0=h2f0025b_1 -requests=2.31.0=pyhd8ed1ab_0 -requests-kerberos=0.14.0=pyhd8ed1ab_1 -rfc3339-validator=0.1.4=pyhd8ed1ab_0 -rfc3986-validator=0.1.1=pyh9f0ad1d_0 -rpds-py=0.17.1=py311h32437ce_0 -rtree=1.2.0=py311h04fbf56_0 -ruamel.yaml=0.18.6=py311hcd402e7_0 -ruamel.yaml.clib=0.2.8=py311hcd402e7_0 -s2n=1.4.4=h5a25046_0 -scikit-learn=1.4.1.post1=py311hb93614b_0 -scipy=1.12.0=py311h69ead2a_2 -send2trash=1.8.2=pyh41d4057_0 -setuptools=69.0.3=pyhd8ed1ab_0 -shapely=2.0.3=py311hbbe59c9_0 -six=1.16.0=pyh6c4a22f_0 -snappy=1.1.10=he8610fa_0 -sniffio=1.3.0=pyhd8ed1ab_0 -sortedcontainers=2.4.0=pyhd8ed1ab_0 -soupsieve=2.5=pyhd8ed1ab_1 -sparkmagic=0.21.0=pyhd8ed1ab_1 -sqlalchemy=2.0.26=py311hc8f2f60_0 -sqlite=3.45.1=h3b3482f_0 -stack_data=0.6.2=pyhd8ed1ab_0 -tblib=3.0.0=pyhd8ed1ab_0 -tenacity=8.2.3=pyhd8ed1ab_0 -terminado=0.18.0=pyh0d859eb_0 -threadpoolctl=3.3.0=pyhc1e730c_0 -tiledb=2.20.0=hf61e980_0 -tinycss2=1.2.1=pyhd8ed1ab_0 -tk=8.6.13=h194ca79_0 -tomli=2.0.1=pyhd8ed1ab_0 -toolz=0.12.1=pyhd8ed1ab_0 -tornado=6.3.3=py311hc8f2f60_1 -tqdm=4.66.2=pyhd8ed1ab_0 -traitlets=5.9.0=pyhd8ed1ab_0 -truststore=0.8.0=pyhd8ed1ab_0 -types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 -typing-extensions=4.9.0=hd8ed1ab_0 -typing_extensions=4.9.0=pyha770c72_0 -typing_utils=0.1.0=pyhd8ed1ab_0 -tzcode=2024a=h31becfc_0 -tzdata=2024a=h0c530f3_0 -uc-micro-py=1.0.3=pyhd8ed1ab_0 -ucx=1.15.0=hedb98eb_3 -uri-template=1.3.0=pyhd8ed1ab_0 -uriparser=0.9.7=hd600fc2_1 -urllib3=2.2.0=pyhd8ed1ab_0 -wcwidth=0.2.13=pyhd8ed1ab_0 -webcolors=1.13=pyhd8ed1ab_0 -webencodings=0.5.1=pyhd8ed1ab_2 -websocket-client=1.7.0=pyhd8ed1ab_0 -wheel=0.42.0=pyhd8ed1ab_0 -widgetsnbextension=4.0.10=pyhd8ed1ab_0 -xarray=2024.2.0=pyhd8ed1ab_0 -xerces-c=3.2.5=hf13c1fb_0 -xorg-kbproto=1.0.7=h3557bc0_1002 -xorg-libice=1.1.1=h7935292_0 -xorg-libsm=1.2.4=h5a01bc2_0 -xorg-libx11=1.8.7=h055a233_0 -xorg-libxau=1.0.11=h31becfc_0 -xorg-libxdmcp=1.1.3=h3557bc0_0 -xorg-libxext=1.3.4=h2a766a3_2 -xorg-libxrender=0.9.11=h7935292_0 -xorg-renderproto=0.11.1=h3557bc0_1002 -xorg-xextproto=7.3.0=h2a766a3_1003 -xorg-xproto=7.0.31=h3557bc0_1007 -xyzservices=2023.10.1=pyhd8ed1ab_0 -xz=5.2.6=h9cdd2b7_0 -yaml=0.2.5=hf897c2e_2 -yaml-cpp=0.8.0=h2f0025b_0 -zeromq=4.3.5=h2f0025b_0 -zict=3.0.0=pyhd8ed1ab_0 -zipp=3.17.0=pyhd8ed1ab_0 -zlib=1.2.13=h31becfc_5 -zstandard=0.22.0=py311hb827a26_0 -zstd=1.5.5=h4c53e97_0 diff --git a/images/demo/jupyter/sparkmagic.json b/images/demo/jupyter/sparkmagic.json deleted file mode 100644 index c1849ef986..0000000000 --- a/images/demo/jupyter/sparkmagic.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "kernel_python_credentials": { - "username": "", - "password": "", - "url": "http://localhost:8998", - "auth": "None" - }, - "kernel_scala_credentials": { - "username": "", - "password": "", - "url": "http://localhost:8998", - "auth": "None" - }, - "kernel_r_credentials": { - "username": "", - "password": "", - "url": "http://localhost:8998" - }, - "logging_config": { - "version": 1, - "formatters": { - "magicsFormatter": { - 
"format": "%(asctime)s\t%(levelname)s\t%(message)s", - "datefmt": "" - } - }, - "handlers": { - "magicsHandler": { - "class": "hdijupyterutils.filehandler.MagicsFileHandler", - "formatter": "magicsFormatter", - "home_path": "~/.sparkmagic" - } - }, - "loggers": { - "magicsLogger": { - "handlers": [ - "magicsHandler" - ], - "level": "DEBUG", - "propagate": 0 - } - } - }, - "wait_for_idle_timeout_seconds": 15, - "livy_session_startup_timeout_seconds": 60, - "fatal_error_suggestion": "The code failed because of a fatal error:\n\t{}.\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.", - "ignore_ssl_errors": false, - "session_configs": { - "driverMemory": "1000M", - "executorCores": 2 - }, - "use_auto_viz": true, - "coerce_dataframe": true, - "default_maxrows": 1000000, - "pyspark_dataframe_encoding": "utf-8", - "heartbeat_refresh_seconds": 30, - "livy_server_heartbeat_timeout_seconds": 0, - "heartbeat_retry_seconds": 10, - "server_extension_default_kernel_name": "pysparkkernel", - "custom_headers": {}, - "retry_policy": "configurable", - "retry_seconds_to_sleep_list": [ - 0.2, - 0.5, - 1, - 3, - 5 - ], - "configurable_retry_policy_max_retries": 8 -} \ No newline at end of file diff --git a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/01 - Introduction.ipynb b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/01 - Introduction.ipynb index 66516c71e4..79a943335d 100644 --- a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/01 - Introduction.ipynb +++ b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/01 - Introduction.ipynb @@ -64,8 +64,9 @@ "
\n", "New to Jupyter?\n", "\n", - "* Go back to the Jupyter's main tab that shows the list of files \n", - "* In the top right corner click New -> Terminal\n", + "* Open the File menu at the top of the window\n", + "* Select New -> Terminal\n", + "* This will open a terminal in a new browser tab\n", "* Now you can switch between the terminal tab and this lesson as you continue\n", "\n", "
\n", @@ -308,7 +309,7 @@ "source": [ "## Analyzing Data\n", "\n", - "Getting raw data in is just a small first step on our journey towards collaboration on data, but before we continue, let's take a quick break and see how you can analyze the data that we already have.\n", + "Getting raw data in is just a small first step on our journey towards collaboration on data, but before we continue, let's take a quick break and see what we can do with data we already have.\n", "\n", "### SQL Shell\n", "\n", @@ -347,13 +348,24 @@ "\n", "### Notebooks\n", "\n", - "When you install `kamu` on your computer you can use `kamu notebook` command to start an integrated Jupyter \n", - "Notebook environment, identical to the one you are currently using.\n", + "When you install Kamu CLI on your computer you can use `kamu notebook` command to start an integrated Jupyter \n", + "Notebook environment identical to the one you are currently using.\n", "\n", "Since we're already in the notebook environment - let's give this integration a try!\n", "\n", "
\n", - "Start by loading kamu Jupyter extension:\n", + "Start by creating a connection to kamu SQL server:\n", + "
\n", + "\n", + "
\n", + "
\n", + "New to Jupyter?\n", + "\n", + "Jupyter notebooks contain cells that are **executable**, so static text can me mixed with computations and data visualization.\n", + "\n", + "**You** are in control of what runs when, so you'll need to **select the code cell below** and then click the **\"Run\"** button on the top panel, or press `Shift + Enter`.\n", + "\n", + "
\n", "
" ] }, @@ -363,28 +375,49 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext kamu" + "import kamu\n", + "\n", + "con = kamu.connect(\"file://\")\n", + "print(\"Connected to kamu via\", con)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "
\n", - "New to Jupyter?\n", + "Using `kamu` Python library we can connect to any remote kamu node by providing a URL.\n", "\n", - "Jupyter notebooks contain cells that are **executable**, so static text can me mixed with computations and data visualization.\n", - "\n", - "**You** are in control of what runs when, so you'll need to **select the code cell above** and then click the **\"Run\"** button on the top panel, or press `Shift + Enter`.\n", + "When URL is a local path - `kamu` library will automatically start an SQL server for that local workspace and connect to it. Super convenient!\n", "\n", - "
\n", - "
\n", - "\n", - "We can now import the dataset we have in our workspace into this notebook environment. We can also give it a less verbose alias.\n", + "We can now send SQL requests using `query(sql)` method. The result will be returned as Pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "con.query(\"select 1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "con.query(\"select * from 'covid19.british-columbia.case-details' limit 3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Writing `con.query(...)` many times can get old fast, but `kamu` Jupyter extension can help with that.\n", "\n", "
\n", - "Run the below to import the dataset (may take 15 or so seconds first time):\n", + "Load kamu Jupyter extension:\n", "
" ] }, @@ -394,7 +427,14 @@ "metadata": {}, "outputs": [], "source": [ - "%import_dataset covid19.british-columbia.case-details --alias cases_bc" + "%load_ext kamu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The extension provides a very convenient `%%sql` cell magic:" ] }, { @@ -412,30 +452,25 @@ "metadata": {}, "outputs": [], "source": [ - "cases_bc.printSchema()\n", - "cases_bc.count()" + "%%sql\n", + "select count(*) from 'covid19.british-columbia.case-details'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "describe 'covid19.british-columbia.case-details'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "
\n", - "What did we just run?\n", - "\n", - "The code you type into a regular cell is executed by [PySpark](https://spark.apache.org/docs/latest/api/python/) server that `kamu` runs when you are working with notebooks.\n", - "\n", - "So it's a Python code, but it is **executed remotely**, not in the notebook kernel. We will discuss benefits of this later.\n", - "\n", - "
\n", - "
\n", - "\n", - "You can use the `%%sql` cell command to run SQL queries on the imported datasets.\n", - "\n", - "
\n", - "To see a sample of data run:\n", - "
" + "To see a sample of data run:" ] }, { @@ -445,26 +480,19 @@ "outputs": [], "source": [ "%%sql\n", - "select * from cases_bc \n", + "select\n", + " *\n", + "from 'covid19.british-columbia.case-details'\n", "order by reported_date desc\n", - "limit 5" + "limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "
\n", - "What did we just run?\n", - "\n", - "Similarly to the PySpark code, the queries in `%%sql` cells are sent to and executed by the Spark SQL engine. The results are then returned back to the notebook kernel.\n", - "\n", - "
\n", - "
\n", - "\n", "
\n", - "Let's run this simple SQL query to build a histogram of cases by the age group:\n", + "Run this simple SQL query to count number of cases per age group:\n", "
" ] }, @@ -478,7 +506,7 @@ "select\n", " age_group,\n", " count(*) as case_count \n", - "from cases_bc\n", + "from 'covid19.british-columbia.case-details'\n", "group by age_group" ] }, @@ -486,17 +514,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "The `kamu` extension also provides a convenient auto-viz widget that you can use to quickly plot data in a data frame.\n", + "\n", "
\n", " \n", - "Once you get the results, try using the built-in data visualizer to plot the data as a **bar chart**\n", + "Once you get the results, try switching results view from \"Table\" to \"Bar\" tab and build a histogram.\n", "\n", "
\n", "\n", - "SQL is great for shaping and aggregating data, but for more advanced processing or visualizations you might need more tools. Using `-o ` parameter of the `%%sql` command we can ask for the result of a query to be returned into the notebook as **Pandas dataframe**.\n", + "Using `kamu` with Jupyter lets you offload complex computations to a selection of powerful SQL engines. It avoids having to download all data (which often may not fit into memory) into the notebook - instead you can shape and aggregate data on the SQL engine side and only download often much smaller results for the final visualization.\n", + "\n", + "Using `-o ` parameter of the `%%sql` cell magic we can save the result into a variable.\n", + "\n", + "When you expect a lot of data and don't want to display a table you can also use `-q` or `--quiet` flag.\n", "\n", "
\n", "\n", - "Let's count the number of cases per day and pull the result from Spark into our notebook:\n", + "Let's count the number of cases per day and pull the result from SQL engine into our notebook:\n", " \n", "
" ] @@ -507,11 +541,11 @@ "metadata": {}, "outputs": [], "source": [ - "%%sql -o df\n", + "%%sql -o df -q\n", "select\n", " reported_date as date,\n", " count(*) as case_count\n", - "from cases_bc\n", + "from 'covid19.british-columbia.case-details'\n", "group by date\n", "order by date" ] @@ -522,18 +556,12 @@ "source": [ "We now have a variable `df` containing the data as Pandas dataframe, and you are free to do with it anything you'd normally do in Jupyter.\n", "\n", - "
\n", - "\n", - "Note that if you just type `df` in a cell - you will get an error. That's because by default this kernel executes operations in the remote PySpark environment. To access `df` you need to use `%%local` cell command which will execute code in this local Python kernel.\n", - " \n", - "
\n", - "\n", "This environment already comes with some popular plotting libraries pre-installed (like `plotly`, `bokeh`, `mapbox`, etc.), but if your favorite library is missing - you can always `pip install` it from the terminal.\n", "\n", "
\n", - " \n", + "\n", "Let's do some basic plotting:\n", - " \n", + "\n", "
" ] }, @@ -543,7 +571,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "import plotly.express as px\n", "\n", "fig = px.scatter(\n", @@ -569,19 +596,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/02 - Collaboration.ipynb b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/02 - Collaboration.ipynb index 413414ea7e..29e60f8f59 100644 --- a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/02 - Collaboration.ipynb +++ b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/02 - Collaboration.ipynb @@ -517,19 +517,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/03 - Trust.ipynb b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/03 - Trust.ipynb index 362c194317..2d4aa14b2c 100644 --- a/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/03 - Trust.ipynb +++ b/images/demo/user-home/01 - Kamu Basics (COVID-19 example)/03 - Trust.ipynb @@ -257,19 +257,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/01 - Working with Web3 data.ipynb b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/01 - Working with Web3 data.ipynb index 6406cb3a24..4875c21afc 100644 --- a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/01 - Working with Web3 data.ipynb +++ b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/01 - Working with Web3 data.ipynb @@ -326,35 +326,17 @@ "Jupyter notebook you're using now runs either on our demo server (https://demo.kamu.dev) or can be launched with `kamu notebook` command in your own workspace when you have the tool installed.\n", " \n", "To start working with data:\n", - "- First run `%load_ext kamu` to load our extension\n", - "- Then use `%import_dataset dataset_name` to import datasets from your workspace\n", + "- Import `kamu` Python library\n", + "- Create a connection to the node\n", + "- Using `file://` as a URL will start and connec to a local SQL server\n", + "- (Optionally) Load Jupyter extension to enable 
`%%sql` cell magic\n", "\n", - "Above commands will start the Apache Spark SQL server in the background and connect to it.\n", - " \n", - "By default all code cells execute in PySpark environment, which is most of the time not what we want.\n", - " \n", - "Instead we use `%%sql` cells to run SQL queries in Spark. It's a great way to explore and shape your data.\n", - " \n", - "You can download the result of any SQL query into the notebook's Python process using `%%sql -o pandas_dataframe_variable -n records_limit`.\n", - " \n", - "You can then use `%%local` cells to execute Python code inside the notebook to further process or visualize the data.\n", + "The `%%sql` cells will execute queries in `kamu`'s powerful SQL engines and return the results as Pandas dataframe.\n", " \n", "\n", "" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import pandas as pd\n", - "import hvplot.pandas\n", - "pd.set_option('max_colwidth', None)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -362,7 +344,10 @@ "outputs": [], "source": [ "%load_ext kamu\n", - "%import_dataset net.rocketpool.reth.mint-burn" + "import kamu\n", + "\n", + "con = kamu.connect(\"file://\")\n", + "print(\"Connected to kamu via\", con)" ] }, { @@ -372,7 +357,7 @@ "outputs": [], "source": [ "%%sql\n", - "select * from `net.rocketpool.reth.mint-burn` limit 5" + "select * from 'net.rocketpool.reth.mint-burn' limit 3" ] }, { @@ -382,17 +367,17 @@ "outputs": [], "source": [ "%%sql -o reth_pool -q\n", + "--## The -o option above downloads the SQL query result into the notebook as Pandas dataframe\n", + "--## The -q flag skips displaying the data\n", "\n", - "--## The -o option above downloads the SQL query result\n", - "--## into the local notebook as Pandas dataframe\n", "select \n", " event_time, \n", " case \n", - " when event_name = \"TokensMinted\" then \"Mint\"\n", - " when event_name = \"TokensBurned\" then \"Burn\"\n", + " when event_name = 'TokensMinted' then 'Mint'\n", + " when event_name = 'TokensBurned' then 'Burn'\n", " end as event_name, \n", " avg(eth_amount / amount) as rate\n", - "from `net.rocketpool.reth.mint-burn` \n", + "from \"net.rocketpool.reth.mint-burn\"\n", "group by event_time, event_name\n", "order by 1" ] @@ -403,7 +388,10 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", + "import pandas as pd\n", + "import hvplot.pandas\n", + "pd.set_option('max_colwidth', None)\n", + "\n", "reth_pool.hvplot.step(\n", " x=\"event_time\", \n", " by=\"event_name\", \n", @@ -448,15 +436,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset com.cryptocompare.ohlcv.eth-usd" - ] - }, { "cell_type": "code", "execution_count": null, @@ -464,9 +443,9 @@ "outputs": [], "source": [ "%%sql\n", - "select * from `com.cryptocompare.ohlcv.eth-usd` \n", + "select * from \"com.cryptocompare.ohlcv.eth-usd\"\n", "order by event_time desc \n", - "limit 5" + "limit 3" ] }, { @@ -476,7 +455,12 @@ "outputs": [], "source": [ "%%sql -o eth2usd -q\n", - "select event_time, open, close from `com.cryptocompare.ohlcv.eth-usd` order by event_time" + "select\n", + " event_time,\n", + " open,\n", + " close\n", + "from \"com.cryptocompare.ohlcv.eth-usd\"\n", + "order by event_time" ] }, { @@ -485,7 +469,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "eth2usd.hvplot.line(\n", " x=\"event_time\",\n", " y=\"close\",\n", @@ -553,15 +536,6 @@ "" ] }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.tokens.transfers" - ] - }, { "cell_type": "code", "execution_count": null, @@ -569,9 +543,9 @@ "outputs": [], "source": [ "%%sql\n", - "select * from `account.tokens.transfers` \n", + "select * from \"account.tokens.transfers\"\n", "order by block_number desc\n", - "limit 5" + "limit 3" ] }, { @@ -582,9 +556,9 @@ "source": [ "%%sql\n", "select\n", - " token_name as `Token`, \n", - " sum(abs(value) / pow(10, token_decimal)) as `Volume Traded` \n", - "from `account.tokens.transfers`\n", + " token_name as 'Token', \n", + " sum(abs(cast(value as double)) / pow(10, cast(token_decimal as int))) as 'Volume Traded'\n", + "from \"account.tokens.transfers\"\n", "group by 1" ] }, @@ -605,15 +579,6 @@ "This is why we need the `account.transactions` dataset that contains all account transactions along with their `ETH` value." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.transactions" - ] - }, { "cell_type": "code", "execution_count": null, @@ -622,9 +587,9 @@ "source": [ "%%sql\n", "select *\n", - "from `account.transactions` \n", + "from \"account.transactions\"\n", "order by block_number desc\n", - "limit 5" + "limit 3" ] }, { @@ -636,8 +601,8 @@ "%%sql -o transactions -q\n", "select\n", " *, \n", - " value / pow(10, 18) as value_eth \n", - "from `account.transactions` \n", + " cast(value as double) / pow(10, 18) as value_eth \n", + "from \"account.transactions\"\n", "order by block_number desc" ] }, @@ -647,8 +612,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "transactions\n", "transactions.hvplot.scatter(\n", " x=\"block_time\",\n", " y=\"value_eth\",\n", @@ -737,15 +700,6 @@ "In the next chapter we will explore why stream processing model is such a big deal." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.tokens.portfolio" - ] - }, { "cell_type": "code", "execution_count": null, @@ -753,7 +707,7 @@ "outputs": [], "source": [ "%%sql -o portfolio -q\n", - "select * from `account.tokens.portfolio` " + "select * from \"account.tokens.portfolio\"" ] }, { @@ -762,7 +716,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "portfolio[\n", " portfolio.token_symbol == \"rETH\"\n", "].hvplot.scatter(\n", @@ -779,7 +732,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "r = portfolio[\n", " portfolio.token_symbol == \"rETH\"\n", "]\n", @@ -957,15 +909,6 @@ "```" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.tokens.portfolio.market-value" - ] - }, { "cell_type": "code", "execution_count": null, @@ -973,7 +916,7 @@ "outputs": [], "source": [ "%%sql -o market_value -q\n", - "select * from `account.tokens.portfolio.market-value` " + "select * from \"account.tokens.portfolio.market-value\"" ] }, { @@ -982,7 +925,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "market_value.hvplot.line(\n", " x=\"event_time\", \n", " y=[\"token_book_value_eth\", \"token_market_value_eth\"],\n", @@ -1001,7 +943,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "market_value.hvplot.line(\n", " x=\"event_time\",\n", " y=[\"token_book_value_eth_as_usd\", \"token_market_value_usd\"],\n", @@ -1054,19 +995,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/02 - Watermarks.ipynb b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/02 - Watermarks.ipynb index 74a1167578..25c868d13c 100644 --- a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/02 - Watermarks.ipynb +++ b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/02 - Watermarks.ipynb @@ -66,18 +66,6 @@ "And just to confirm:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%local\n", - "import pandas as pd\n", - "import hvplot.pandas\n", - "pd.set_option('max_colwidth', None)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -85,9 +73,8 @@ "outputs": [], "source": [ "%load_ext kamu\n", - "%import_dataset net.rocketpool.reth.mint-burn\n", - "%import_dataset account.tokens.portfolio.market-value\n", - "%import_dataset account.tokens.portfolio" + "import kamu\n", + "con = kamu.connect(\"file://\")" ] }, { @@ -99,7 +86,7 @@ "%%sql\n", "select \n", " event_time, eth_amount, amount\n", - "from `net.rocketpool.reth.mint-burn`\n", + "from \"net.rocketpool.reth.mint-burn\"\n", "order by 1 desc\n", "limit 1" ] @@ -124,7 +111,7 @@ "%%sql\n", "select\n", " event_time, token_symbol, token_balance, token_market_value_eth, token_market_value_usd\n", - "from `account.tokens.portfolio.market-value` \n", + "from \"account.tokens.portfolio.market-value\"\n", "order by event_time 
desc\n", "limit 1" ] @@ -179,8 +166,8 @@ "--## so let's filter out all other types\n", "reth_portfolio as (\n", " select * \n", - " from `account.tokens.portfolio`\n", - " where token_symbol = \"rETH\"\n", + " from \"account.tokens.portfolio\"\n", + " where token_symbol = 'rETH'\n", "),\n", "\n", "--## Join every exchange rate data point\n", @@ -194,7 +181,7 @@ " pf.token_symbol,\n", " pf.token_balance,\n", " reth.eth_amount / reth.amount * pf.token_balance as token_market_value_eth\n", - " from `net.rocketpool.reth.mint-burn` as reth\n", + " from \"net.rocketpool.reth.mint-burn\" as reth\n", " join reth_portfolio as pf\n", " on reth.event_time >= pf.block_time\n", "),\n", @@ -221,7 +208,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are multiple ways to perform a JOIN based on closest preceding timestamp, but all of them will give you the same result.\n", + "There are many ways to perform a JOIN based on closest preceding timestamp, but all of them will give you the same result.\n", "\n", "Let's compare this \"batch\" result to the \"streaming\" result we get from `kamu`:" ] @@ -233,7 +220,7 @@ "outputs": [], "source": [ "%%sql -o mv_streaming -q\n", - "select * from `account.tokens.portfolio.market-value`" + "select * from \"account.tokens.portfolio.market-value\"" ] }, { @@ -242,7 +229,10 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", + "import pandas as pd\n", + "import hvplot.pandas\n", + "pd.set_option('max_colwidth', None)\n", + "\n", "mv_batch.hvplot.line(\n", " x=\"event_time\", \n", " y=\"token_market_value_eth\",\n", @@ -371,15 +361,6 @@ "Let's have a look now:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.tokens.portfolio.market-value" - ] - }, { "cell_type": "code", "execution_count": null, @@ -387,7 +368,7 @@ "outputs": [], "source": [ "%%sql -o mv_streaming -q\n", - "select * from `account.tokens.portfolio.market-value`\n", + "select * from \"account.tokens.portfolio.market-value\"\n", "order by event_time desc" ] }, @@ -397,7 +378,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", "mv_batch.hvplot.line(\n", " x=\"event_time\", \n", " y=\"token_market_value_eth\",\n", @@ -452,19 +432,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/03 - Alternative reality pipeline.ipynb b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/03 - Alternative reality pipeline.ipynb index 47ebe82399..dd4d1a064d 100644 --- a/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/03 - Alternative reality pipeline.ipynb +++ b/images/demo/user-home/02 - Web3 Data (Ethereum trading example)/03 - Alternative reality pipeline.ipynb @@ -84,21 +84,15 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", + "%load_ext kamu\n", + "import kamu\n", "import pandas as pd\n", "import hvplot.pandas\n", "import holoviews as hv\n", - "pd.set_option('max_colwidth', None)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext kamu\n", - "%import_dataset co.alphavantage.tickers.daily.spy" + "\n", + "pd.set_option('max_colwidth', None)\n", + "\n", + "con = kamu.connect(\"file://\")" ] }, { @@ -110,7 +104,7 @@ "%%sql\n", "select \n", " event_time, close\n", - "from `co.alphavantage.tickers.daily.spy`\n", + "from \"co.alphavantage.tickers.daily.spy\"\n", "where event_time > '2010-01-01'\n", "\n", "--## Switch to \"Area\" viz type to view results" @@ -164,17 +158,6 @@ "Drum roll... Here comes the exciting part - let's compare the two investments!" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%import_dataset account.tokens.portfolio.usd\n", - "%import_dataset account.tokens.portfolio.market-value\n", - "%import_dataset account.whatif.reth-vs-snp500.market-value" - ] - }, { "cell_type": "code", "execution_count": null, @@ -182,8 +165,8 @@ "outputs": [], "source": [ "%%sql -o portfolio -q\n", - "select * from `account.tokens.portfolio.usd`\n", - "where token_symbol = \"rETH\"" + "select * from \"account.tokens.portfolio.usd\"\n", + "where token_symbol = 'rETH'" ] }, { @@ -193,7 +176,7 @@ "outputs": [], "source": [ "%%sql -o market_value -q\n", - "select * from `account.tokens.portfolio.market-value`" + "select * from \"account.tokens.portfolio.market-value\"" ] }, { @@ -203,7 +186,7 @@ "outputs": [], "source": [ "%%sql -o alternative_market_value -q\n", - "select * from `account.whatif.reth-vs-snp500.market-value`" + "select * from \"account.whatif.reth-vs-snp500.market-value\"" ] }, { @@ -212,8 +195,6 @@ "metadata": {}, "outputs": [], "source": [ - "%%local\n", - "\n", "max_height = max(\n", " alternative_market_value[\"alt_spy_market_value_usd\"].max(),\n", " market_value[\"token_market_value_usd\"].max(),\n", @@ -294,19 +275,21 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" } }, "nbformat": 4, diff --git a/images/jupyter/Dockerfile b/images/jupyter/Dockerfile index 1db823667d..661127342f 100644 --- a/images/jupyter/Dockerfile +++ b/images/jupyter/Dockerfile @@ -1,7 +1,7 @@ # Base image info: https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html # Base image tags: https://quay.io/repository/jupyter/minimal-notebook # Customization is based on: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -FROM quay.io/jupyter/minimal-notebook:2024-02-13 +FROM quay.io/jupyter/minimal-notebook:2024-12-09 ARG TARGETPLATFORM @@ -9,33 +9,24 @@ ARG TARGETPLATFORM USER root RUN apt update && \ - apt -y install curl wget gnupg unzip jq && \ + apt -y install netcat-traditional curl wget gnupg unzip jq && \ apt-get clean && rm -rf /var/lib/apt/lists /var/cache/apt/archives -COPY requirements/$TARGETPLATFORM/requirements.txt requirements.txt +COPY requirements/$TARGETPLATFORM/env.yaml env.yaml -# TODO: Semi-permanent hack for `mapboxgl` package being broken in conda-forge -# See: https://github.com/kamu-data/kamu-cli/issues/533 -RUN mamba install -y --file requirements.txt && \ - mamba uninstall mapboxgl && 
pip install --no-cache-dir mapboxgl && \ - mamba clean --all -f -y && \ - rm requirements.txt && \ - fix-permissions "${CONDA_DIR}" && \ +RUN mamba env update -y -f env.yaml && \ + mamba clean --all -f -y && \ + rm env.yaml && \ + fix-permissions "${CONDA_DIR}" && \ fix-permissions "/home/${NB_USER}" ######################################################################################### USER $NB_USER -COPY kamu.py /opt/conda/lib/python3.11/site-packages/kamu.py -COPY sparkmagic.json /home/$NB_USER/.sparkmagic/config.json - -RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension -RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/sparkkernel -RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/pysparkkernel -RUN jupyter-kernelspec install --user $(pip show sparkmagic | grep Location | cut -d" " -f2)/sparkmagic/kernels/sparkrkernel -RUN jupyter serverextension enable --py sparkmagic +COPY overrides.json /opt/conda/share/jupyter/lab/settings/overrides.json +RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements" ######################################################################################### USER root @@ -43,6 +34,4 @@ RUN fix-permissions "/home/${NB_USER}" USER $NB_USER -# TODO: Remove show_banner option after Sparkmagic supports novebook >= 7.0.0 -# See: https://github.com/jupyter-incubator/sparkmagic/issues/885 -CMD ["jupyter", "notebook", "--ip", "0.0.0.0", "--port", "8080", "--NotebookApp.show_banner=False"] +CMD ["jupyter", "lab", "--ip", "0.0.0.0", "--port", "8080", "--NotebookApp.iopub_data_rate_limit=1e10"] diff --git a/images/jupyter/Makefile b/images/jupyter/Makefile index 4a36ef57ad..488ddb1ceb 100644 --- a/images/jupyter/Makefile +++ b/images/jupyter/Makefile @@ -1,5 +1,5 @@ PLATFORM=linux/amd64 -BASE_IMAGE:=quay.io/jupyter/minimal-notebook:2024-02-13 +BASE_IMAGE:=quay.io/jupyter/minimal-notebook:2024-12-09 # Requires QEMU @@ -23,7 +23,22 @@ requirements-platform: # Executed from inside the base image +# +# The stupidity of Python package management ecosystems is unbelievabe. Jupyter images are +# based on conda, but some packages we have are only installable by pip. We want to make +# environment reproducible, but `conda env export` in `dependencies.pip` section includes only +# **top-level** packages, ignoring all direct and transitive dependencies. 
+# +# To make environment fully reproducible we have to resort to: +# - Run `conda env export` to lock conda packages (and part of pip packages) +# - Strig partial `pip` packages from conda env +# - Run `pip freeze` to lock pip packages +# - Filter out conda packages from `pip freeze` output +# - Merge the rest into `dependencies.pip` section of `conda env export` .PHONY: requirements-install-freeze requirements-install-freeze: - mamba install -y --file requirements/$(PLATFORM)/requirements.in - mamba list --export > requirements/$(PLATFORM)/requirements.txt + pip install -r requirements/$(PLATFORM)/requirements.in + pip freeze > requirements/$(PLATFORM)/requirements.txt + mamba env export --no-builds > requirements/$(PLATFORM)/env.yaml + python ./merge_requirements.py requirements/$(PLATFORM)/env.yaml requirements/$(PLATFORM)/requirements.txt + rm requirements/$(PLATFORM)/requirements.txt diff --git a/images/jupyter/merge_requirements.py b/images/jupyter/merge_requirements.py new file mode 100644 index 0000000000..6ff0b294e2 --- /dev/null +++ b/images/jupyter/merge_requirements.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import sys +import yaml + +env_path = sys.argv[1] +req_path = sys.argv[2] + +# Read files +with open(env_path) as f: + env = yaml.safe_load(f) + +with open(req_path) as f: + reqs = [r.strip() for r in f.readlines()] + +# Filter out pip packages from `conda env export` +env['dependencies'] = [ + dep for dep in env['dependencies'] + if not isinstance(dep, dict) or 'pip' not in dep +] + +# Filter conda packages from `pip freeze` output +reqs = [r for r in reqs if not '@ file://' in r] + +# Merge into environment +env['dependencies'].append({'pip': reqs}) + +# Replace env file +with open(env_path, 'w') as f: + yaml.safe_dump(env, f) diff --git a/images/jupyter/overrides.json b/images/jupyter/overrides.json new file mode 100644 index 0000000000..36a8a9e7b5 --- /dev/null +++ b/images/jupyter/overrides.json @@ -0,0 +1,5 @@ +{ + "@jupyterlab/apputils-extension:themes": { + "adaptive-theme": true + } +} \ No newline at end of file diff --git a/images/jupyter/requirements/linux/amd64/env.yaml b/images/jupyter/requirements/linux/amd64/env.yaml new file mode 100644 index 0000000000..42c02bd385 --- /dev/null +++ b/images/jupyter/requirements/linux/amd64/env.yaml @@ -0,0 +1,274 @@ +channels: +- conda-forge +dependencies: +- _libgcc_mutex=0.1 +- _openmp_mutex=4.5 +- alembic=1.14.0 +- annotated-types=0.7.0 +- anyio=4.7.0 +- archspec=0.2.3 +- argon2-cffi=23.1.0 +- argon2-cffi-bindings=21.2.0 +- arrow=1.3.0 +- asttokens=3.0.0 +- async-lru=2.0.4 +- async_generator=1.10 +- attrs=24.2.0 +- babel=2.16.0 +- beautifulsoup4=4.12.3 +- bleach=6.2.0 +- blinker=1.9.0 +- boltons=24.0.0 +- brotli-python=1.1.0 +- bzip2=1.0.8 +- c-ares=1.34.3 +- ca-certificates=2024.8.30 +- cached-property=1.5.2 +- cached_property=1.5.2 +- certifi=2024.8.30 +- certipy=0.2.1 +- cffi=1.17.1 +- charset-normalizer=3.4.0 +- colorama=0.4.6 +- comm=0.2.2 +- conda=24.11.0 +- conda-libmamba-solver=24.11.1 +- conda-package-handling=2.4.0 +- conda-package-streaming=0.11.0 +- cpp-expected=1.1.0 +- cryptography=44.0.0 +- debugpy=1.8.9 +- decorator=5.1.1 +- defusedxml=0.7.1 +- distro=1.9.0 +- entrypoints=0.4 +- exceptiongroup=1.2.2 +- executing=2.1.0 +- fmt=11.0.2 +- fqdn=1.5.1 +- frozendict=2.4.6 +- greenlet=3.1.1 +- h11=0.14.0 +- h2=4.1.0 +- hpack=4.0.0 +- httpcore=1.0.7 +- httpx=0.28.1 +- hyperframe=6.0.1 +- idna=3.10 +- importlib-metadata=8.5.0 +- importlib_resources=6.4.5 +- ipykernel=6.29.5 +- ipython=8.30.0 +- 
ipython_genutils=0.2.0 +- isoduration=20.11.0 +- jedi=0.19.2 +- jinja2=3.1.4 +- json5=0.10.0 +- jsonpatch=1.33 +- jsonpointer=3.0.0 +- jsonschema=4.23.0 +- jsonschema-specifications=2024.10.1 +- jsonschema-with-format-nongpl=4.23.0 +- jupyter-lsp=2.2.5 +- jupyter_client=8.6.3 +- jupyter_core=5.7.2 +- jupyter_events=0.10.0 +- jupyter_server=2.14.2 +- jupyter_server_terminals=0.5.3 +- jupyterhub-base=5.2.1 +- jupyterhub-singleuser=5.2.1 +- jupyterlab=4.3.2 +- jupyterlab_pygments=0.3.0 +- jupyterlab_server=2.27.3 +- keyutils=1.6.1 +- krb5=1.21.3 +- ld_impl_linux-64=2.43 +- libarchive=3.7.7 +- libcurl=8.10.1 +- libedit=3.1.20191231 +- libev=4.33 +- libexpat=2.6.4 +- libffi=3.4.2 +- libgcc=14.2.0 +- libgcc-ng=14.2.0 +- libgomp=14.2.0 +- libiconv=1.17 +- liblzma=5.6.3 +- libmamba=2.0.4 +- libmambapy=2.0.4 +- libnghttp2=1.64.0 +- libnsl=2.0.1 +- libsodium=1.0.20 +- libsolv=0.7.30 +- libsqlite=3.47.0 +- libssh2=1.11.1 +- libstdcxx=14.2.0 +- libstdcxx-ng=14.2.0 +- libuuid=2.38.1 +- libxcrypt=4.4.36 +- libxml2=2.13.5 +- libzlib=1.3.1 +- lz4-c=1.10.0 +- lzo=2.10 +- make=4.4.1 +- mako=1.3.8 +- mamba=2.0.4 +- markupsafe=3.0.2 +- matplotlib-inline=0.1.7 +- menuinst=2.2.0 +- mistune=3.0.2 +- nbclassic=1.1.0 +- nbclient=0.10.1 +- nbconvert-core=7.16.4 +- nbformat=5.10.4 +- ncurses=6.5 +- nest-asyncio=1.6.0 +- nlohmann_json=3.11.3 +- notebook=7.3.1 +- notebook-shim=0.2.4 +- oauthlib=3.2.2 +- openssl=3.4.0 +- overrides=7.7.0 +- packaging=24.2 +- pamela=1.2.0 +- pandocfilters=1.5.0 +- parso=0.8.4 +- pexpect=4.9.0 +- pickleshare=0.7.5 +- pip=24.3.1 +- pkgutil-resolve-name=1.3.10 +- platformdirs=4.3.6 +- pluggy=1.5.0 +- prometheus_client=0.21.1 +- prompt-toolkit=3.0.48 +- psutil=6.1.0 +- ptyprocess=0.7.0 +- pure_eval=0.2.3 +- pybind11-abi=4 +- pycosat=0.6.6 +- pycparser=2.22 +- pydantic=2.10.3 +- pydantic-core=2.27.1 +- pygments=2.18.0 +- pyjwt=2.10.1 +- pysocks=1.7.1 +- python=3.12.8 +- python-dateutil=2.9.0.post0 +- python-fastjsonschema=2.21.1 +- python-json-logger=2.0.7 +- python_abi=3.12 +- pytz=2024.2 +- pyyaml=6.0.2 +- pyzmq=26.2.0 +- readline=8.2 +- referencing=0.35.1 +- reproc=14.2.5.post0 +- reproc-cpp=14.2.5.post0 +- requests=2.32.3 +- rfc3339-validator=0.1.4 +- rfc3986-validator=0.1.1 +- rpds-py=0.22.3 +- ruamel.yaml=0.18.6 +- ruamel.yaml.clib=0.2.8 +- send2trash=1.8.3 +- setuptools=75.6.0 +- simdjson=3.10.1 +- six=1.17.0 +- sniffio=1.3.1 +- soupsieve=2.5 +- spdlog=1.14.1 +- sqlalchemy=2.0.36 +- stack_data=0.6.3 +- terminado=0.18.1 +- tinycss2=1.4.0 +- tk=8.6.13 +- tomli=2.2.1 +- tornado=6.4.2 +- tqdm=4.67.1 +- traitlets=5.14.3 +- truststore=0.10.0 +- types-python-dateutil=2.9.0.20241206 +- typing-extensions=4.12.2 +- typing_extensions=4.12.2 +- typing_utils=0.1.0 +- tzdata=2024b +- uri-template=1.3.0 +- urllib3=2.2.3 +- wcwidth=0.2.13 +- webcolors=24.11.1 +- webencodings=0.5.1 +- websocket-client=1.8.0 +- wheel=0.45.1 +- yaml=0.2.5 +- yaml-cpp=0.8.0 +- zeromq=4.3.5 +- zipp=3.21.0 +- zstandard=0.23.0 +- zstd=1.5.6 +- pip: + - adbc-driver-flightsql==1.3.0 + - adbc-driver-manager==1.3.0 + - altair==5.5.0 + - autovizwidget==0.22.0 + - bokeh==3.6.2 + - branca==0.8.1 + - cftime==1.6.4.post1 + - chroma-py==0.1.0.dev1 + - click==8.1.8 + - cloudpickle==3.1.0 + - colorcet==3.1.0 + - colour==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - dask==2024.12.1 + - folium==0.19.2 + - fonttools==4.55.3 + - fsspec==2024.12.0 + - geojson==3.2.0 + - geopandas==1.0.1 + - hdijupyterutils==0.22.0 + - holoviews==1.20.0 + - hvplot==0.11.2 + - ipywidgets==8.1.5 + - jupyter==1.1.1 + - jupyter-console==6.6.3 + - 
jupyterlab_widgets==3.0.13 + - kamu==0.6.0 + - kiwisolver==1.4.8 + - linkify-it-py==2.0.3 + - livy==0.8.0 + - locket==1.0.0 + - mapboxgl==0.10.2 + - Markdown==3.7 + - markdown-it-py==3.0.0 + - matplotlib==3.10.0 + - mdit-py-plugins==0.4.2 + - mdurl==0.1.2 + - narwhals==1.19.1 + - netCDF4==1.7.2 + - numpy==2.2.1 + - pandas==2.2.3 + - pandas-bokeh==0.5.5 + - panel==1.5.5 + - param==2.2.0 + - partd==1.4.2 + - pillow==11.0.0 + - plotly==5.24.1 + - pyarrow==18.1.0 + - pyogrio==0.10.0 + - pyparsing==3.2.0 + - pyproj==3.7.0 + - pyviz_comms==3.0.3 + - setuptools==75.6.0 + - shapely==2.0.6 + - tenacity==9.0.0 + - toolz==1.0.0 + - tzdata==2024.2 + - uc-micro-py==1.0.3 + - wheel==0.45.1 + - widgetsnbextension==4.0.13 + - xarray==2024.11.0 + - xyzservices==2024.9.0 + - zstandard==0.23.0 +name: base +prefix: /opt/conda diff --git a/images/jupyter/requirements/linux/amd64/requirements.in b/images/jupyter/requirements/linux/amd64/requirements.in index d7b27f8f59..68140ad718 100644 --- a/images/jupyter/requirements/linux/amd64/requirements.in +++ b/images/jupyter/requirements/linux/amd64/requirements.in @@ -1,21 +1,16 @@ -# TODO: Pinned due to sparkmagic installation issue -# See: https://github.com/jupyter-incubator/sparkmagic/issues/825 -# See workaround applied in: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -notebook==6.5.5 +kamu[jupyter-autoviz,jupyter-sql,spark] -sparkmagic - -pandas +dask geopandas geojson -xarray netcdf4 -dask +pandas +xarray +altair bokeh -hvplot -pandas-bokeh folium -altair +hvplot mapboxgl +pandas-bokeh shapely diff --git a/images/jupyter/requirements/linux/amd64/requirements.txt b/images/jupyter/requirements/linux/amd64/requirements.txt deleted file mode 100644 index c24a4e3dcf..0000000000 --- a/images/jupyter/requirements/linux/amd64/requirements.txt +++ /dev/null @@ -1,414 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -_libgcc_mutex=0.1=conda_forge -_openmp_mutex=4.5=2_gnu -alembic=1.13.1=pyhd8ed1ab_1 -altair=5.2.0=pyhd8ed1ab_0 -anyio=4.2.0=pyhd8ed1ab_0 -archspec=0.2.2=pyhd8ed1ab_0 -argon2-cffi=23.1.0=pyhd8ed1ab_0 -argon2-cffi-bindings=21.2.0=py311h459d7ec_4 -arrow=1.3.0=pyhd8ed1ab_0 -asttokens=2.4.1=pyhd8ed1ab_0 -async-lru=2.0.4=pyhd8ed1ab_0 -async_generator=1.10=py_0 -attrs=23.2.0=pyh71513ae_0 -autovizwidget=0.21.0=pyh1a96a4e_1 -aws-c-auth=0.7.16=h70caa3e_0 -aws-c-cal=0.6.9=h14ec70c_3 -aws-c-common=0.9.12=hd590300_0 -aws-c-compression=0.2.17=h572eabf_8 -aws-c-event-stream=0.4.2=h17cd1f3_0 -aws-c-http=0.8.0=hc6da83f_5 -aws-c-io=0.14.3=h3c8c088_1 -aws-c-mqtt=0.10.2=h0ef3971_0 -aws-c-s3=0.5.1=h2910485_1 -aws-c-sdkutils=0.1.14=h572eabf_0 -aws-checksums=0.1.17=h572eabf_7 -aws-crt-cpp=0.26.2=ha623a59_3 -aws-sdk-cpp=1.11.267=h0bb408c_0 -azure-core-cpp=1.10.3=h91d86a7_1 -azure-storage-blobs-cpp=12.10.0=h00ab1b0_0 -azure-storage-common-cpp=12.5.0=hb858b4b_2 -babel=2.14.0=pyhd8ed1ab_0 -beautifulsoup4=4.12.3=pyha770c72_0 -bleach=6.1.0=pyhd8ed1ab_0 -blinker=1.7.0=pyhd8ed1ab_0 -blosc=1.21.5=h0f2a231_0 -bokeh=3.3.4=pyhd8ed1ab_0 -boltons=23.1.1=pyhd8ed1ab_0 -branca=0.7.1=pyhd8ed1ab_0 -brotli=1.1.0=hd590300_1 -brotli-bin=1.1.0=hd590300_1 -brotli-python=1.1.0=py311hb755f60_1 -bzip2=1.0.8=hd590300_5 -c-ares=1.26.0=hd590300_0 -ca-certificates=2024.2.2=hbcca054_0 -cached-property=1.5.2=hd8ed1ab_1 -cached_property=1.5.2=pyha770c72_1 -cairo=1.18.0=h3faef2a_0 -certifi=2024.2.2=pyhd8ed1ab_0 -certipy=0.1.3=py_0 -cffi=1.16.0=py311hb3a22ac_0 -cfitsio=4.3.1=hbdc6101_0 
-cftime=1.6.3=py311h1f0f07a_0 -charset-normalizer=3.3.2=pyhd8ed1ab_0 -chroma-py=0.1.0.dev1=py_0 -click=8.1.7=unix_pyh707e725_0 -click-plugins=1.1.1=py_0 -cligj=0.7.2=pyhd8ed1ab_1 -cloudpickle=3.0.0=pyhd8ed1ab_0 -colorama=0.4.6=pyhd8ed1ab_0 -colorcet=3.0.1=pyhd8ed1ab_0 -colour=0.1.5=pyhd8ed1ab_1 -comm=0.2.1=pyhd8ed1ab_0 -conda=23.11.0=py311h38be061_1 -conda-libmamba-solver=24.1.0=pyhd8ed1ab_0 -conda-package-handling=2.2.0=pyh38be061_0 -conda-package-streaming=0.9.0=pyhd8ed1ab_0 -configurable-http-proxy=4.6.1=h92b4e83_0 -contourpy=1.2.0=py311h9547e67_0 -cryptography=42.0.2=py311hcb13ee4_0 -cycler=0.12.1=pyhd8ed1ab_0 -cytoolz=0.12.3=py311h459d7ec_0 -dask=2024.2.0=pyhd8ed1ab_0 -dask-core=2024.2.0=pyhd8ed1ab_0 -debugpy=1.8.1=py311hb755f60_0 -decorator=5.1.1=pyhd8ed1ab_0 -defusedxml=0.7.1=pyhd8ed1ab_0 -distributed=2024.2.0=pyhd8ed1ab_0 -distro=1.9.0=pyhd8ed1ab_0 -entrypoints=0.4=pyhd8ed1ab_0 -exceptiongroup=1.2.0=pyhd8ed1ab_2 -executing=2.0.1=pyhd8ed1ab_0 -expat=2.5.0=hcb278e6_1 -fiona=1.9.5=py311hf8e0aa6_3 -fmt=10.2.1=h00ab1b0_0 -folium=0.15.1=pyhd8ed1ab_0 -font-ttf-dejavu-sans-mono=2.37=hab24e00_0 -font-ttf-inconsolata=3.000=h77eed37_0 -font-ttf-source-code-pro=2.038=h77eed37_0 -font-ttf-ubuntu=0.83=h77eed37_1 -fontconfig=2.14.2=h14ed4e7_0 -fonts-conda-ecosystem=1=0 -fonts-conda-forge=1=0 -fonttools=4.49.0=py311h459d7ec_0 -fqdn=1.5.1=pyhd8ed1ab_0 -freetype=2.12.1=h267a509_2 -freexl=2.0.0=h743c826_0 -fsspec=2024.2.0=pyhca7485f_0 -gdal=3.8.4=py311h8be719e_0 -geojson=3.1.0=pyhd8ed1ab_0 -geopandas=0.14.3=pyhd8ed1ab_0 -geopandas-base=0.14.3=pyha770c72_0 -geos=3.12.1=h59595ed_0 -geotiff=1.7.1=h6b2125f_15 -gettext=0.21.1=h27087fc_0 -gflags=2.2.2=he1b5a44_1004 -giflib=5.2.1=h0b41bf4_3 -glog=0.6.0=h6f12383_0 -greenlet=3.0.3=py311hb755f60_0 -h11=0.14.0=pyhd8ed1ab_0 -h2=4.1.0=pyhd8ed1ab_0 -hdf4=4.2.15=h2a13503_7 -hdf5=1.14.3=nompi_h4f84152_100 -hdijupyterutils=0.21.0=pyh1a96a4e_1 -holoviews=1.18.3=pyhd8ed1ab_0 -hpack=4.0.0=pyh9f0ad1d_0 -httpcore=1.0.2=pyhd8ed1ab_0 -httpx=0.26.0=pyhd8ed1ab_0 -hvplot=0.9.2=pyhd8ed1ab_0 -hyperframe=6.0.1=pyhd8ed1ab_0 -icu=73.2=h59595ed_0 -idna=3.6=pyhd8ed1ab_0 -importlib-metadata=7.0.1=pyha770c72_0 -importlib_metadata=7.0.1=hd8ed1ab_0 -importlib_resources=6.1.1=pyhd8ed1ab_0 -ipykernel=6.29.2=pyhd33586a_0 -ipython=8.21.0=pyh707e725_0 -ipython_genutils=0.2.0=py_1 -ipywidgets=8.1.2=pyhd8ed1ab_0 -isoduration=20.11.0=pyhd8ed1ab_0 -jedi=0.19.1=pyhd8ed1ab_0 -jinja2=3.1.3=pyhd8ed1ab_0 -joblib=1.3.2=pyhd8ed1ab_0 -json-c=0.17=h7ab15ed_0 -json5=0.9.14=pyhd8ed1ab_0 -jsonpatch=1.33=pyhd8ed1ab_0 -jsonpointer=2.4=py311h38be061_3 -jsonschema=4.21.1=pyhd8ed1ab_0 -jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 -jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 -jupyter=1.0.0=pyhd8ed1ab_10 -jupyter-lsp=2.2.2=pyhd8ed1ab_0 -jupyter_client=7.4.9=pyhd8ed1ab_0 -jupyter_console=6.6.3=pyhd8ed1ab_0 -jupyter_core=5.7.1=py311h38be061_0 -jupyter_events=0.9.0=pyhd8ed1ab_0 -jupyter_server=2.12.5=pyhd8ed1ab_0 -jupyter_server_terminals=0.5.2=pyhd8ed1ab_0 -jupyter_telemetry=0.1.0=pyhd8ed1ab_1 -jupyterhub=4.0.2=pyh31011fe_0 -jupyterhub-base=4.0.2=pyh31011fe_0 -jupyterlab=4.1.1=pyhd8ed1ab_0 -jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 -jupyterlab_server=2.25.2=pyhd8ed1ab_0 -jupyterlab_widgets=3.0.10=pyhd8ed1ab_0 -kealib=1.5.3=h2f55d51_0 -keyutils=1.6.1=h166bdaf_0 -kiwisolver=1.4.5=py311h9547e67_1 -krb5=1.21.2=h659d440_0 -lcms2=2.16=hb7c19ff_0 -ld_impl_linux-64=2.40=h41732ed_0 -lerc=4.0.0=h27087fc_0 -libabseil=20230802.1=cxx17_h59595ed_0 -libaec=1.1.2=h59595ed_1 -libarchive=3.7.2=h2aa1ff5_1 
-libarrow=15.0.0=h49c8883_4_cpu -libarrow-acero=15.0.0=h59595ed_4_cpu -libarrow-dataset=15.0.0=h59595ed_4_cpu -libarrow-flight=15.0.0=hdc44a87_4_cpu -libarrow-flight-sql=15.0.0=hfbc7f12_4_cpu -libarrow-gandiva=15.0.0=h308e607_4_cpu -libarrow-substrait=15.0.0=hfbc7f12_4_cpu -libblas=3.9.0=21_linux64_openblas -libboost-headers=1.84.0=ha770c72_1 -libbrotlicommon=1.1.0=hd590300_1 -libbrotlidec=1.1.0=hd590300_1 -libbrotlienc=1.1.0=hd590300_1 -libcblas=3.9.0=21_linux64_openblas -libcrc32c=1.1.2=h9c3ff4c_0 -libcurl=8.5.0=hca28451_0 -libdeflate=1.19=hd590300_0 -libedit=3.1.20191231=he28a2e2_2 -libev=4.33=hd590300_2 -libevent=2.1.12=hf998b51_1 -libexpat=2.5.0=hcb278e6_1 -libffi=3.4.2=h7f98852_5 -libgcc-ng=13.2.0=h807b86a_5 -libgdal=3.8.4=h9323651_0 -libgfortran-ng=13.2.0=h69a702a_5 -libgfortran5=13.2.0=ha4646dd_5 -libglib=2.78.4=h783c2da_0 -libgomp=13.2.0=h807b86a_5 -libgoogle-cloud=2.12.0=hef10d8f_5 -libgrpc=1.60.1=h74775cd_0 -libiconv=1.17=hd590300_2 -libjpeg-turbo=3.0.0=hd590300_1 -libkml=1.3.0=h01aab08_1018 -liblapack=3.9.0=21_linux64_openblas -libllvm15=15.0.7=hb3ce162_4 -libmamba=1.5.6=had39da4_0 -libmambapy=1.5.6=py311hf2555c7_0 -libnetcdf=4.9.2=nompi_h9612171_113 -libnghttp2=1.58.0=h47da74e_1 -libnl=3.9.0=hd590300_0 -libnsl=2.0.1=hd590300_0 -libnuma=2.0.16=h0b41bf4_1 -libopenblas=0.3.26=pthreads_h413a1c8_0 -libparquet=15.0.0=h352af49_4_cpu -libpng=1.6.42=h2797004_0 -libpq=16.2=h33b98f1_0 -libprotobuf=4.25.1=hf27288f_2 -libre2-11=2023.06.02=h7a70373_0 -librttopo=1.1.0=h8917695_15 -libsodium=1.0.18=h36c2ea0_1 -libsolv=0.7.28=hfc55251_0 -libspatialindex=1.9.3=h9c3ff4c_4 -libspatialite=5.1.0=h7bd4643_4 -libsqlite=3.45.1=h2797004_0 -libssh2=1.11.0=h0841786_0 -libstdcxx-ng=13.2.0=h7e041cc_5 -libthrift=0.19.0=hb90f79a_1 -libtiff=4.6.0=ha9c0a0a_2 -libutf8proc=2.8.0=h166bdaf_0 -libuuid=2.38.1=h0b41bf4_0 -libuv=1.46.0=hd590300_0 -libwebp-base=1.3.2=hd590300_0 -libxcb=1.15=h0b41bf4_0 -libxcrypt=4.4.36=hd590300_1 -libxml2=2.12.5=h232c23b_0 -libzip=1.10.1=h2629f0a_3 -libzlib=1.2.13=hd590300_5 -linkify-it-py=2.0.3=pyhd8ed1ab_0 -locket=1.0.0=pyhd8ed1ab_0 -lz4=4.3.3=py311h38e4bf4_0 -lz4-c=1.9.4=hcb278e6_0 -lzo=2.10=h516909a_1000 -make=4.3=hd18ef5c_1 -mako=1.3.2=pyhd8ed1ab_0 -mamba=1.5.6=py311h3072747_0 -mapboxgl=0.10.2=py_1 -mapclassify=2.6.1=pyhd8ed1ab_0 -markdown=3.5.2=pyhd8ed1ab_0 -markdown-it-py=3.0.0=pyhd8ed1ab_0 -markupsafe=2.1.5=py311h459d7ec_0 -matplotlib-base=3.8.3=py311h54ef318_0 -matplotlib-inline=0.1.6=pyhd8ed1ab_0 -mdit-py-plugins=0.4.0=pyhd8ed1ab_0 -mdurl=0.1.2=pyhd8ed1ab_0 -menuinst=2.0.2=py311h38be061_0 -minizip=4.0.4=h0ab5242_0 -mistune=3.0.2=pyhd8ed1ab_0 -msgpack-python=1.0.7=py311h9547e67_0 -munkres=1.1.4=pyh9f0ad1d_0 -nbclassic=1.0.0=pyhb4ecaf3_1 -nbclient=0.8.0=pyhd8ed1ab_0 -nbconvert=7.16.0=pyhd8ed1ab_0 -nbconvert-core=7.16.0=pyhd8ed1ab_0 -nbconvert-pandoc=7.16.0=pyhd8ed1ab_0 -nbformat=5.9.2=pyhd8ed1ab_0 -ncurses=6.4=h59595ed_2 -nest-asyncio=1.6.0=pyhd8ed1ab_0 -netcdf4=1.6.5=nompi_py311he8ad708_100 -networkx=3.2.1=pyhd8ed1ab_0 -nodejs=20.9.0=hb753e55_0 -notebook=6.5.5=pyha770c72_0 -notebook-shim=0.2.3=pyhd8ed1ab_0 -nspr=4.35=h27087fc_0 -nss=3.98=h1d7d5a4_0 -numpy=1.26.4=py311h64a7726_0 -oauthlib=3.2.2=pyhd8ed1ab_0 -openjpeg=2.5.0=h488ebb8_3 -openssl=3.2.1=hd590300_0 -orc=1.9.2=h7829240_1 -overrides=7.7.0=pyhd8ed1ab_0 -packaging=23.2=pyhd8ed1ab_0 -pamela=1.1.0=pyh1a96a4e_0 -pandas=1.5.3=py311h2872171_1 -pandas-bokeh=0.5.5=pyhd8ed1ab_0 -pandoc=3.1.11.1=ha770c72_0 -pandocfilters=1.5.0=pyhd8ed1ab_0 -panel=1.3.8=pyhd8ed1ab_0 -param=2.0.2=pyhca7485f_0 -parso=0.8.3=pyhd8ed1ab_0 
-partd=1.4.1=pyhd8ed1ab_0 -pcre2=10.42=hcad00b1_0 -pexpect=4.9.0=pyhd8ed1ab_0 -pickleshare=0.7.5=py_1003 -pillow=10.2.0=py311ha6c5da5_0 -pip=24.0=pyhd8ed1ab_0 -pixman=0.43.2=h59595ed_0 -pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 -platformdirs=4.2.0=pyhd8ed1ab_0 -plotly=5.19.0=pyhd8ed1ab_0 -pluggy=1.4.0=pyhd8ed1ab_0 -poppler=24.02.0=h590f24d_0 -poppler-data=0.4.12=hd8ed1ab_0 -postgresql=16.2=h7387d8b_0 -proj=9.3.1=h1d62c97_0 -prometheus_client=0.19.0=pyhd8ed1ab_0 -prompt-toolkit=3.0.42=pyha770c72_0 -prompt_toolkit=3.0.42=hd8ed1ab_0 -psutil=5.9.8=py311h459d7ec_0 -pthread-stubs=0.4=h36c2ea0_1001 -ptyprocess=0.7.0=pyhd3deb0d_0 -pure_eval=0.2.2=pyhd8ed1ab_0 -pyarrow=15.0.0=py311h39c9aba_4_cpu -pyarrow-hotfix=0.6=pyhd8ed1ab_0 -pybind11-abi=4=hd8ed1ab_3 -pycosat=0.6.6=py311h459d7ec_0 -pycparser=2.21=pyhd8ed1ab_0 -pyct=0.5.0=pyhd8ed1ab_0 -pycurl=7.45.1=py311hae980a4_3 -pygments=2.17.2=pyhd8ed1ab_0 -pyjwt=2.8.0=pyhd8ed1ab_1 -pyopenssl=24.0.0=pyhd8ed1ab_0 -pyparsing=3.1.1=pyhd8ed1ab_0 -pyproj=3.6.1=py311hca0b8b9_5 -pysocks=1.7.1=pyha2e5f31_6 -pyspnego=0.9.1=py311h459d7ec_2 -python=3.11.7=hab00c5b_1_cpython -python-dateutil=2.8.2=pyhd8ed1ab_0 -python-fastjsonschema=2.19.1=pyhd8ed1ab_0 -python-json-logger=2.0.7=pyhd8ed1ab_0 -python_abi=3.11=4_cp311 -pytz=2024.1=pyhd8ed1ab_0 -pyviz_comms=3.0.0=pyhd8ed1ab_0 -pyyaml=6.0.1=py311h459d7ec_1 -pyzmq=24.0.1=py311ha4b6469_1 -qtconsole-base=5.5.1=pyha770c72_0 -qtpy=2.4.1=pyhd8ed1ab_0 -rdma-core=50.0=hd3aeb46_1 -re2=2023.06.02=h2873b5e_0 -readline=8.2=h8228510_1 -referencing=0.33.0=pyhd8ed1ab_0 -reproc=14.2.4.post0=hd590300_1 -reproc-cpp=14.2.4.post0=h59595ed_1 -requests=2.31.0=pyhd8ed1ab_0 -requests-kerberos=0.14.0=pyhd8ed1ab_1 -rfc3339-validator=0.1.4=pyhd8ed1ab_0 -rfc3986-validator=0.1.1=pyh9f0ad1d_0 -rpds-py=0.17.1=py311h46250e7_0 -rtree=1.2.0=py311h3bb2b0f_0 -ruamel.yaml=0.18.6=py311h459d7ec_0 -ruamel.yaml.clib=0.2.8=py311h459d7ec_0 -s2n=1.4.3=h06160fa_0 -scikit-learn=1.4.1.post1=py311hc009520_0 -scipy=1.12.0=py311h64a7726_2 -send2trash=1.8.2=pyh41d4057_0 -setuptools=69.0.3=pyhd8ed1ab_0 -shapely=2.0.3=py311h2032efe_0 -six=1.16.0=pyh6c4a22f_0 -snappy=1.1.10=h9fff704_0 -sniffio=1.3.0=pyhd8ed1ab_0 -sortedcontainers=2.4.0=pyhd8ed1ab_0 -soupsieve=2.5=pyhd8ed1ab_1 -sparkmagic=0.21.0=pyhd8ed1ab_1 -sqlalchemy=2.0.26=py311h459d7ec_0 -sqlite=3.45.1=h2c6b66d_0 -stack_data=0.6.2=pyhd8ed1ab_0 -tblib=3.0.0=pyhd8ed1ab_0 -tenacity=8.2.3=pyhd8ed1ab_0 -terminado=0.18.0=pyh0d859eb_0 -threadpoolctl=3.3.0=pyhc1e730c_0 -tiledb=2.20.0=h4386cac_0 -tinycss2=1.2.1=pyhd8ed1ab_0 -tk=8.6.13=noxft_h4845f30_101 -tomli=2.0.1=pyhd8ed1ab_0 -toolz=0.12.1=pyhd8ed1ab_0 -tornado=6.3.3=py311h459d7ec_1 -tqdm=4.66.2=pyhd8ed1ab_0 -traitlets=5.9.0=pyhd8ed1ab_0 -truststore=0.8.0=pyhd8ed1ab_0 -types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 -typing-extensions=4.9.0=hd8ed1ab_0 -typing_extensions=4.9.0=pyha770c72_0 -typing_utils=0.1.0=pyhd8ed1ab_0 -tzcode=2024a=h3f72095_0 -tzdata=2024a=h0c530f3_0 -uc-micro-py=1.0.3=pyhd8ed1ab_0 -ucx=1.15.0=h75e419f_3 -uri-template=1.3.0=pyhd8ed1ab_0 -uriparser=0.9.7=hcb278e6_1 -urllib3=2.2.0=pyhd8ed1ab_0 -wcwidth=0.2.13=pyhd8ed1ab_0 -webcolors=1.13=pyhd8ed1ab_0 -webencodings=0.5.1=pyhd8ed1ab_2 -websocket-client=1.7.0=pyhd8ed1ab_0 -wheel=0.42.0=pyhd8ed1ab_0 -widgetsnbextension=4.0.10=pyhd8ed1ab_0 -xarray=2024.2.0=pyhd8ed1ab_0 -xerces-c=3.2.5=hac6953d_0 -xorg-kbproto=1.0.7=h7f98852_1002 -xorg-libice=1.1.1=hd590300_0 -xorg-libsm=1.2.4=h7391055_0 -xorg-libx11=1.8.7=h8ee46fc_0 -xorg-libxau=1.0.11=hd590300_0 -xorg-libxdmcp=1.1.3=h7f98852_0 -xorg-libxext=1.3.4=h0b41bf4_2 
-xorg-libxrender=0.9.11=hd590300_0 -xorg-renderproto=0.11.1=h7f98852_1002 -xorg-xextproto=7.3.0=h0b41bf4_1003 -xorg-xproto=7.0.31=h7f98852_1007 -xyzservices=2023.10.1=pyhd8ed1ab_0 -xz=5.2.6=h166bdaf_0 -yaml=0.2.5=h7f98852_2 -yaml-cpp=0.8.0=h59595ed_0 -zeromq=4.3.5=h59595ed_0 -zict=3.0.0=pyhd8ed1ab_0 -zipp=3.17.0=pyhd8ed1ab_0 -zlib=1.2.13=hd590300_5 -zstandard=0.22.0=py311haa97af0_0 -zstd=1.5.5=hfc55251_0 diff --git a/images/jupyter/requirements/linux/arm64/env.yaml b/images/jupyter/requirements/linux/arm64/env.yaml new file mode 100644 index 0000000000..1b0f42d63d --- /dev/null +++ b/images/jupyter/requirements/linux/arm64/env.yaml @@ -0,0 +1,274 @@ +channels: +- conda-forge +dependencies: +- _openmp_mutex=4.5 +- alembic=1.14.0 +- annotated-types=0.7.0 +- anyio=4.7.0 +- archspec=0.2.3 +- argon2-cffi=23.1.0 +- argon2-cffi-bindings=21.2.0 +- arrow=1.3.0 +- asttokens=3.0.0 +- async-lru=2.0.4 +- async_generator=1.10 +- attrs=24.2.0 +- babel=2.16.0 +- beautifulsoup4=4.12.3 +- bleach=6.2.0 +- blinker=1.9.0 +- boltons=24.0.0 +- brotli-python=1.1.0 +- bzip2=1.0.8 +- c-ares=1.34.3 +- ca-certificates=2024.8.30 +- cached-property=1.5.2 +- cached_property=1.5.2 +- certifi=2024.8.30 +- certipy=0.2.1 +- cffi=1.17.1 +- charset-normalizer=3.4.0 +- colorama=0.4.6 +- comm=0.2.2 +- conda=24.11.0 +- conda-libmamba-solver=24.11.1 +- conda-package-handling=2.4.0 +- conda-package-streaming=0.11.0 +- cpp-expected=1.1.0 +- cryptography=44.0.0 +- debugpy=1.8.9 +- decorator=5.1.1 +- defusedxml=0.7.1 +- distro=1.9.0 +- entrypoints=0.4 +- exceptiongroup=1.2.2 +- executing=2.1.0 +- fmt=11.0.2 +- fqdn=1.5.1 +- frozendict=2.4.6 +- greenlet=3.1.1 +- h11=0.14.0 +- h2=4.1.0 +- hpack=4.0.0 +- httpcore=1.0.7 +- httpx=0.28.1 +- hyperframe=6.0.1 +- icu=75.1 +- idna=3.10 +- importlib-metadata=8.5.0 +- importlib_resources=6.4.5 +- ipykernel=6.29.5 +- ipython=8.30.0 +- ipython_genutils=0.2.0 +- isoduration=20.11.0 +- jedi=0.19.2 +- jinja2=3.1.4 +- json5=0.10.0 +- jsonpatch=1.33 +- jsonpointer=3.0.0 +- jsonschema=4.23.0 +- jsonschema-specifications=2024.10.1 +- jsonschema-with-format-nongpl=4.23.0 +- jupyter-lsp=2.2.5 +- jupyter_client=8.6.3 +- jupyter_core=5.7.2 +- jupyter_events=0.10.0 +- jupyter_server=2.14.2 +- jupyter_server_terminals=0.5.3 +- jupyterhub-base=5.2.1 +- jupyterhub-singleuser=5.2.1 +- jupyterlab=4.3.2 +- jupyterlab_pygments=0.3.0 +- jupyterlab_server=2.27.3 +- keyutils=1.6.1 +- krb5=1.21.3 +- ld_impl_linux-aarch64=2.43 +- libarchive=3.7.7 +- libcurl=8.10.1 +- libedit=3.1.20191231 +- libev=4.33 +- libexpat=2.6.4 +- libffi=3.4.2 +- libgcc=14.2.0 +- libgcc-ng=14.2.0 +- libgomp=14.2.0 +- libiconv=1.17 +- liblzma=5.6.3 +- libmamba=2.0.4 +- libmambapy=2.0.4 +- libnghttp2=1.64.0 +- libnsl=2.0.1 +- libsodium=1.0.20 +- libsolv=0.7.30 +- libsqlite=3.47.0 +- libssh2=1.11.1 +- libstdcxx=14.2.0 +- libstdcxx-ng=14.2.0 +- libuuid=2.38.1 +- libxcrypt=4.4.36 +- libxml2=2.13.5 +- libzlib=1.3.1 +- lz4-c=1.10.0 +- lzo=2.10 +- make=4.4.1 +- mako=1.3.8 +- mamba=2.0.4 +- markupsafe=3.0.2 +- matplotlib-inline=0.1.7 +- menuinst=2.2.0 +- mistune=3.0.2 +- nbclassic=1.1.0 +- nbclient=0.10.1 +- nbconvert-core=7.16.4 +- nbformat=5.10.4 +- ncurses=6.5 +- nest-asyncio=1.6.0 +- nlohmann_json=3.11.3 +- notebook=7.3.1 +- notebook-shim=0.2.4 +- oauthlib=3.2.2 +- openssl=3.4.0 +- overrides=7.7.0 +- packaging=24.2 +- pamela=1.2.0 +- pandocfilters=1.5.0 +- parso=0.8.4 +- pexpect=4.9.0 +- pickleshare=0.7.5 +- pip=24.3.1 +- pkgutil-resolve-name=1.3.10 +- platformdirs=4.3.6 +- pluggy=1.5.0 +- prometheus_client=0.21.1 +- prompt-toolkit=3.0.48 +- 
psutil=6.1.0 +- ptyprocess=0.7.0 +- pure_eval=0.2.3 +- pybind11-abi=4 +- pycosat=0.6.6 +- pycparser=2.22 +- pydantic=2.10.3 +- pydantic-core=2.27.1 +- pygments=2.18.0 +- pyjwt=2.10.1 +- pysocks=1.7.1 +- python=3.12.8 +- python-dateutil=2.9.0.post0 +- python-fastjsonschema=2.21.1 +- python-json-logger=2.0.7 +- python_abi=3.12 +- pytz=2024.2 +- pyyaml=6.0.2 +- pyzmq=26.2.0 +- readline=8.2 +- referencing=0.35.1 +- reproc=14.2.4.post0 +- reproc-cpp=14.2.4.post0 +- requests=2.32.3 +- rfc3339-validator=0.1.4 +- rfc3986-validator=0.1.1 +- rpds-py=0.22.3 +- ruamel.yaml=0.18.6 +- ruamel.yaml.clib=0.2.8 +- send2trash=1.8.3 +- setuptools=75.6.0 +- simdjson=3.10.1 +- six=1.17.0 +- sniffio=1.3.1 +- soupsieve=2.5 +- spdlog=1.14.1 +- sqlalchemy=2.0.36 +- stack_data=0.6.3 +- terminado=0.18.1 +- tinycss2=1.4.0 +- tk=8.6.13 +- tomli=2.2.1 +- tornado=6.4.2 +- tqdm=4.67.1 +- traitlets=5.14.3 +- truststore=0.10.0 +- types-python-dateutil=2.9.0.20241206 +- typing-extensions=4.12.2 +- typing_extensions=4.12.2 +- typing_utils=0.1.0 +- tzdata=2024b +- uri-template=1.3.0 +- urllib3=2.2.3 +- wcwidth=0.2.13 +- webcolors=24.11.1 +- webencodings=0.5.1 +- websocket-client=1.8.0 +- wheel=0.45.1 +- yaml=0.2.5 +- yaml-cpp=0.8.0 +- zeromq=4.3.5 +- zipp=3.21.0 +- zstandard=0.23.0 +- zstd=1.5.6 +- pip: + - adbc-driver-flightsql==1.3.0 + - adbc-driver-manager==1.3.0 + - altair==5.5.0 + - autovizwidget==0.22.0 + - bokeh==3.6.2 + - branca==0.8.1 + - cftime==1.6.4.post1 + - chroma-py==0.1.0.dev1 + - click==8.1.8 + - cloudpickle==3.1.0 + - colorcet==3.1.0 + - colour==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - dask==2024.12.1 + - folium==0.19.2 + - fonttools==4.55.3 + - fsspec==2024.12.0 + - geojson==3.2.0 + - geopandas==1.0.1 + - hdijupyterutils==0.22.0 + - holoviews==1.20.0 + - hvplot==0.11.2 + - ipywidgets==8.1.5 + - jupyter==1.1.1 + - jupyter-console==6.6.3 + - jupyterlab_widgets==3.0.13 + - kamu==0.6.0 + - kiwisolver==1.4.8 + - linkify-it-py==2.0.3 + - livy==0.8.0 + - locket==1.0.0 + - mapboxgl==0.10.2 + - Markdown==3.7 + - markdown-it-py==3.0.0 + - matplotlib==3.10.0 + - mdit-py-plugins==0.4.2 + - mdurl==0.1.2 + - narwhals==1.19.1 + - netCDF4==1.7.2 + - numpy==2.2.1 + - pandas==2.2.3 + - pandas-bokeh==0.5.5 + - panel==1.5.5 + - param==2.2.0 + - partd==1.4.2 + - pillow==11.0.0 + - plotly==5.24.1 + - pyarrow==18.1.0 + - pyogrio==0.10.0 + - pyparsing==3.2.0 + - pyproj==3.7.0 + - pyviz_comms==3.0.3 + - setuptools==75.6.0 + - shapely==2.0.6 + - tenacity==9.0.0 + - toolz==1.0.0 + - tzdata==2024.2 + - uc-micro-py==1.0.3 + - wheel==0.45.1 + - widgetsnbextension==4.0.13 + - xarray==2024.11.0 + - xyzservices==2024.9.0 + - zstandard==0.23.0 +name: base +prefix: /opt/conda diff --git a/images/jupyter/requirements/linux/arm64/requirements.in b/images/jupyter/requirements/linux/arm64/requirements.in index d7b27f8f59..68140ad718 100644 --- a/images/jupyter/requirements/linux/arm64/requirements.in +++ b/images/jupyter/requirements/linux/arm64/requirements.in @@ -1,21 +1,16 @@ -# TODO: Pinned due to sparkmagic installation issue -# See: https://github.com/jupyter-incubator/sparkmagic/issues/825 -# See workaround applied in: https://github.com/jupyter-incubator/sparkmagic/blob/master/Dockerfile.jupyter -notebook==6.5.5 +kamu[jupyter-autoviz,jupyter-sql,spark] -sparkmagic - -pandas +dask geopandas geojson -xarray netcdf4 -dask +pandas +xarray +altair bokeh -hvplot -pandas-bokeh folium -altair +hvplot mapboxgl +pandas-bokeh shapely diff --git a/images/jupyter/requirements/linux/arm64/requirements.txt 
b/images/jupyter/requirements/linux/arm64/requirements.txt deleted file mode 100644 index 832732c7db..0000000000 --- a/images/jupyter/requirements/linux/arm64/requirements.txt +++ /dev/null @@ -1,411 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-aarch64 -_openmp_mutex=4.5=2_gnu -alembic=1.13.1=pyhd8ed1ab_1 -altair=5.2.0=pyhd8ed1ab_0 -anyio=4.2.0=pyhd8ed1ab_0 -archspec=0.2.2=pyhd8ed1ab_0 -argon2-cffi=23.1.0=pyhd8ed1ab_0 -argon2-cffi-bindings=21.2.0=py311hcd402e7_4 -arrow=1.3.0=pyhd8ed1ab_0 -asttokens=2.4.1=pyhd8ed1ab_0 -async-lru=2.0.4=pyhd8ed1ab_0 -async_generator=1.10=py_0 -attrs=23.2.0=pyh71513ae_0 -autovizwidget=0.21.0=pyh1a96a4e_1 -aws-c-auth=0.7.16=h570bf23_5 -aws-c-cal=0.6.10=h967b9ec_1 -aws-c-common=0.9.13=h31becfc_0 -aws-c-compression=0.2.18=h00d1b86_1 -aws-c-event-stream=0.4.2=h10e8a16_3 -aws-c-http=0.8.1=hf0788a4_4 -aws-c-io=0.14.4=h87c19fb_2 -aws-c-mqtt=0.10.2=he8e29e5_3 -aws-c-s3=0.5.1=h71a96cc_6 -aws-c-sdkutils=0.1.15=h00d1b86_1 -aws-checksums=0.1.18=h00d1b86_1 -aws-crt-cpp=0.26.2=h8568a09_5 -aws-sdk-cpp=1.11.267=hfce6cab_1 -azure-core-cpp=1.10.3=hcd87347_1 -azure-storage-blobs-cpp=12.10.0=h2a328a1_0 -azure-storage-common-cpp=12.5.0=hee0c750_2 -babel=2.14.0=pyhd8ed1ab_0 -beautifulsoup4=4.12.3=pyha770c72_0 -bleach=6.1.0=pyhd8ed1ab_0 -blinker=1.7.0=pyhd8ed1ab_0 -blosc=1.21.5=h2f3a684_0 -bokeh=3.3.4=pyhd8ed1ab_0 -boltons=23.1.1=pyhd8ed1ab_0 -branca=0.7.1=pyhd8ed1ab_0 -brotli=1.1.0=h31becfc_1 -brotli-bin=1.1.0=h31becfc_1 -brotli-python=1.1.0=py311h8715677_1 -bzip2=1.0.8=h31becfc_5 -c-ares=1.26.0=h31becfc_0 -ca-certificates=2024.2.2=hcefe29a_0 -cached-property=1.5.2=hd8ed1ab_1 -cached_property=1.5.2=pyha770c72_1 -cairo=1.18.0=ha13f110_0 -certifi=2024.2.2=pyhd8ed1ab_0 -certipy=0.1.3=py_0 -cffi=1.16.0=py311h7963103_0 -cfitsio=4.3.1=hf28c5f1_0 -cftime=1.6.3=py311hf13da56_0 -charset-normalizer=3.3.2=pyhd8ed1ab_0 -chroma-py=0.1.0.dev1=py_0 -click=8.1.7=unix_pyh707e725_0 -click-plugins=1.1.1=py_0 -cligj=0.7.2=pyhd8ed1ab_1 -cloudpickle=3.0.0=pyhd8ed1ab_0 -colorama=0.4.6=pyhd8ed1ab_0 -colorcet=3.0.1=pyhd8ed1ab_0 -colour=0.1.5=pyhd8ed1ab_1 -comm=0.2.1=pyhd8ed1ab_0 -conda=23.11.0=py311hec3470c_1 -conda-libmamba-solver=24.1.0=pyhd8ed1ab_0 -conda-package-handling=2.2.0=pyh38be061_0 -conda-package-streaming=0.9.0=pyhd8ed1ab_0 -configurable-http-proxy=4.6.1=h4e45a9e_0 -contourpy=1.2.0=py311h098ece5_0 -cryptography=42.0.2=py311h2245af3_0 -cycler=0.12.1=pyhd8ed1ab_0 -cytoolz=0.12.3=py311hc8f2f60_0 -dask=2024.2.0=pyhd8ed1ab_0 -dask-core=2024.2.0=pyhd8ed1ab_0 -debugpy=1.8.1=py311h8715677_0 -decorator=5.1.1=pyhd8ed1ab_0 -defusedxml=0.7.1=pyhd8ed1ab_0 -distributed=2024.2.0=pyhd8ed1ab_0 -distro=1.9.0=pyhd8ed1ab_0 -entrypoints=0.4=pyhd8ed1ab_0 -exceptiongroup=1.2.0=pyhd8ed1ab_2 -executing=2.0.1=pyhd8ed1ab_0 -expat=2.5.0=hd600fc2_1 -fiona=1.9.5=py311he15760a_3 -fmt=10.2.1=h2a328a1_0 -folium=0.15.1=pyhd8ed1ab_0 -font-ttf-dejavu-sans-mono=2.37=hab24e00_0 -font-ttf-inconsolata=3.000=h77eed37_0 -font-ttf-source-code-pro=2.038=h77eed37_0 -font-ttf-ubuntu=0.83=h77eed37_1 -fontconfig=2.14.2=ha9a116f_0 -fonts-conda-ecosystem=1=0 -fonts-conda-forge=1=0 -fonttools=4.49.0=py311hcd402e7_0 -fqdn=1.5.1=pyhd8ed1ab_0 -freetype=2.12.1=hf0a5ef3_2 -freexl=2.0.0=h5428426_0 -fsspec=2024.2.0=pyhca7485f_0 -gdal=3.8.4=py311h3b5b607_0 -geojson=3.1.0=pyhd8ed1ab_0 -geopandas=0.14.3=pyhd8ed1ab_0 -geopandas-base=0.14.3=pyha770c72_0 -geos=3.12.1=h2f0025b_0 -geotiff=1.7.1=h3e58e51_15 -gettext=0.21.1=ha18d298_0 -gflags=2.2.2=h54f1f3f_1004 -giflib=5.2.1=hb4cce97_3 
-glog=0.6.0=h8ab10f1_0 -greenlet=3.0.3=py311h8715677_0 -h11=0.14.0=pyhd8ed1ab_0 -h2=4.1.0=pyhd8ed1ab_0 -hdf4=4.2.15=hb6ba311_7 -hdf5=1.14.3=nompi_ha486f32_100 -hdijupyterutils=0.21.0=pyh1a96a4e_1 -holoviews=1.18.3=pyhd8ed1ab_0 -hpack=4.0.0=pyh9f0ad1d_0 -httpcore=1.0.2=pyhd8ed1ab_0 -httpx=0.26.0=pyhd8ed1ab_0 -hvplot=0.9.2=pyhd8ed1ab_0 -hyperframe=6.0.1=pyhd8ed1ab_0 -icu=73.2=h787c7f5_0 -idna=3.6=pyhd8ed1ab_0 -importlib-metadata=7.0.1=pyha770c72_0 -importlib_metadata=7.0.1=hd8ed1ab_0 -importlib_resources=6.1.1=pyhd8ed1ab_0 -ipykernel=6.29.2=pyhd33586a_0 -ipython=8.21.0=pyh707e725_0 -ipython_genutils=0.2.0=py_1 -ipywidgets=8.1.2=pyhd8ed1ab_0 -isoduration=20.11.0=pyhd8ed1ab_0 -jedi=0.19.1=pyhd8ed1ab_0 -jinja2=3.1.3=pyhd8ed1ab_0 -joblib=1.3.2=pyhd8ed1ab_0 -json-c=0.17=h9d1147b_0 -json5=0.9.14=pyhd8ed1ab_0 -jsonpatch=1.33=pyhd8ed1ab_0 -jsonpointer=2.4=py311hec3470c_3 -jsonschema=4.21.1=pyhd8ed1ab_0 -jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 -jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 -jupyter=1.0.0=pyhd8ed1ab_10 -jupyter-lsp=2.2.2=pyhd8ed1ab_0 -jupyter_client=7.4.9=pyhd8ed1ab_0 -jupyter_console=6.6.3=pyhd8ed1ab_0 -jupyter_core=5.7.1=py311hec3470c_0 -jupyter_events=0.9.0=pyhd8ed1ab_0 -jupyter_server=2.12.5=pyhd8ed1ab_0 -jupyter_server_terminals=0.5.2=pyhd8ed1ab_0 -jupyter_telemetry=0.1.0=pyhd8ed1ab_1 -jupyterhub=4.0.2=pyh31011fe_0 -jupyterhub-base=4.0.2=pyh31011fe_0 -jupyterlab=4.1.1=pyhd8ed1ab_0 -jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 -jupyterlab_server=2.25.2=pyhd8ed1ab_0 -jupyterlab_widgets=3.0.10=pyhd8ed1ab_0 -kealib=1.5.3=h4670d8b_0 -keyutils=1.6.1=h4e544f5_0 -kiwisolver=1.4.5=py311h0d5d7b0_1 -krb5=1.21.2=hc419048_0 -lcms2=2.16=h922389a_0 -ld_impl_linux-aarch64=2.40=h2d8c526_0 -lerc=4.0.0=h4de3ea5_0 -libabseil=20230802.1=cxx17_h2f0025b_0 -libaec=1.1.2=h2f0025b_1 -libarchive=3.7.2=hd2f85e0_1 -libarrow=15.0.0=h606a0d5_4_cpu -libarrow-acero=15.0.0=h2f0025b_4_cpu -libarrow-dataset=15.0.0=h2f0025b_4_cpu -libarrow-flight=15.0.0=he69d72d_4_cpu -libarrow-flight-sql=15.0.0=h1fc705f_4_cpu -libarrow-gandiva=15.0.0=h90362dd_4_cpu -libarrow-substrait=15.0.0=h0599332_4_cpu -libblas=3.9.0=21_linuxaarch64_openblas -libboost-headers=1.84.0=h8af1aa0_1 -libbrotlicommon=1.1.0=h31becfc_1 -libbrotlidec=1.1.0=h31becfc_1 -libbrotlienc=1.1.0=h31becfc_1 -libcblas=3.9.0=21_linuxaarch64_openblas -libcrc32c=1.1.2=h01db608_0 -libcurl=8.5.0=h4e8248e_0 -libdeflate=1.19=h31becfc_0 -libedit=3.1.20191231=he28a2e2_2 -libev=4.33=h31becfc_2 -libevent=2.1.12=h4ba1bb4_1 -libexpat=2.5.0=hd600fc2_1 -libffi=3.4.2=h3557bc0_5 -libgcc-ng=13.2.0=hf8544c7_5 -libgdal=3.8.4=h79c3f81_0 -libgfortran-ng=13.2.0=he9431aa_5 -libgfortran5=13.2.0=h582850c_5 -libglib=2.78.4=h311d5f7_0 -libgomp=13.2.0=hf8544c7_5 -libgoogle-cloud=2.12.0=h3b99733_5 -libgrpc=1.60.1=heeb7df3_0 -libiconv=1.17=h31becfc_2 -libjpeg-turbo=3.0.0=h31becfc_1 -libkml=1.3.0=h7d16752_1018 -liblapack=3.9.0=21_linuxaarch64_openblas -libllvm15=15.0.7=hb4f23b0_4 -libmamba=1.5.6=hea3be6c_0 -libmambapy=1.5.6=py311h765b69a_0 -libnetcdf=4.9.2=nompi_h33102a8_113 -libnghttp2=1.58.0=hb0e430d_1 -libnsl=2.0.1=h31becfc_0 -libnuma=2.0.16=hb4cce97_1 -libopenblas=0.3.26=pthreads_h5a5ec62_0 -libparquet=15.0.0=hb18b541_4_cpu -libpng=1.6.42=h194ca79_0 -libpq=16.2=h58720eb_0 -libprotobuf=4.25.1=h87e877f_2 -libre2-11=2023.06.02=hf48c5ca_0 -librttopo=1.1.0=hd8968fb_15 -libsodium=1.0.18=hb9de7d4_1 -libsolv=0.7.28=hd84c7bf_0 -libspatialindex=1.9.3=h01db608_4 -libspatialite=5.1.0=h896d346_4 -libsqlite=3.45.1=h194ca79_0 -libssh2=1.11.0=h492db2e_0 -libstdcxx-ng=13.2.0=h9a76618_5 
-libthrift=0.19.0=h043aeee_1 -libtiff=4.6.0=h1708d11_2 -libutf8proc=2.8.0=h4e544f5_0 -libuuid=2.38.1=hb4cce97_0 -libuv=1.46.0=h31becfc_0 -libwebp-base=1.3.2=h31becfc_0 -libxcb=1.15=h2a766a3_0 -libxcrypt=4.4.36=h31becfc_1 -libxml2=2.12.5=h3091e33_0 -libzip=1.10.1=h4156a30_3 -libzlib=1.2.13=h31becfc_5 -linkify-it-py=2.0.3=pyhd8ed1ab_0 -locket=1.0.0=pyhd8ed1ab_0 -lz4=4.3.3=py311h6a4b261_0 -lz4-c=1.9.4=hd600fc2_0 -lzo=2.10=h516909a_1000 -make=4.3=h309ac5b_1 -mako=1.3.2=pyhd8ed1ab_0 -mamba=1.5.6=py311hb6c5aa6_0 -mapboxgl=0.10.2=py_1 -mapclassify=2.6.1=pyhd8ed1ab_0 -markdown=3.5.2=pyhd8ed1ab_0 -markdown-it-py=3.0.0=pyhd8ed1ab_0 -markupsafe=2.1.5=py311hc8f2f60_0 -matplotlib-base=3.8.3=py311h1f11223_0 -matplotlib-inline=0.1.6=pyhd8ed1ab_0 -mdit-py-plugins=0.4.0=pyhd8ed1ab_0 -mdurl=0.1.2=pyhd8ed1ab_0 -menuinst=2.0.2=py311hec3470c_0 -minizip=4.0.4=hb75dd74_0 -mistune=3.0.2=pyhd8ed1ab_0 -msgpack-python=1.0.7=py311h0d5d7b0_0 -munkres=1.1.4=pyh9f0ad1d_0 -nbclassic=1.0.0=pyhb4ecaf3_1 -nbclient=0.8.0=pyhd8ed1ab_0 -nbconvert=7.16.0=pyhd8ed1ab_0 -nbconvert-core=7.16.0=pyhd8ed1ab_0 -nbconvert-pandoc=7.16.0=pyhd8ed1ab_0 -nbformat=5.9.2=pyhd8ed1ab_0 -ncurses=6.4=h0425590_2 -nest-asyncio=1.6.0=pyhd8ed1ab_0 -netcdf4=1.6.5=nompi_py311hcd50196_100 -networkx=3.2.1=pyhd8ed1ab_0 -nodejs=20.9.0=hc1f8a26_0 -notebook=6.5.5=pyha770c72_0 -notebook-shim=0.2.3=pyhd8ed1ab_0 -nspr=4.35=h4de3ea5_0 -nss=3.98=hc5a5cc2_0 -numpy=1.26.4=py311h69ead2a_0 -oauthlib=3.2.2=pyhd8ed1ab_0 -openjpeg=2.5.0=h0d9d63b_3 -openssl=3.2.1=h31becfc_0 -orc=1.9.2=h5960ff3_1 -overrides=7.7.0=pyhd8ed1ab_0 -packaging=23.2=pyhd8ed1ab_0 -pamela=1.1.0=pyh1a96a4e_0 -pandas=1.5.3=py311hff2c139_1 -pandas-bokeh=0.5.5=pyhd8ed1ab_0 -pandoc=3.1.11.1=h8af1aa0_0 -pandocfilters=1.5.0=pyhd8ed1ab_0 -panel=1.3.8=pyhd8ed1ab_0 -param=2.0.2=pyhca7485f_0 -parso=0.8.3=pyhd8ed1ab_0 -partd=1.4.1=pyhd8ed1ab_0 -pcre2=10.42=hd0f9c67_0 -pexpect=4.9.0=pyhd8ed1ab_0 -pickleshare=0.7.5=py_1003 -pillow=10.2.0=py311hbcc2232_0 -pip=24.0=pyhd8ed1ab_0 -pixman=0.43.2=h2f0025b_0 -pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 -platformdirs=4.2.0=pyhd8ed1ab_0 -plotly=5.19.0=pyhd8ed1ab_0 -pluggy=1.4.0=pyhd8ed1ab_0 -poppler=24.02.0=h3cd87ed_0 -poppler-data=0.4.12=hd8ed1ab_0 -postgresql=16.2=he703394_0 -proj=9.3.1=h7b42f86_0 -prometheus_client=0.19.0=pyhd8ed1ab_0 -prompt-toolkit=3.0.42=pyha770c72_0 -prompt_toolkit=3.0.42=hd8ed1ab_0 -psutil=5.9.8=py311hcd402e7_0 -pthread-stubs=0.4=hb9de7d4_1001 -ptyprocess=0.7.0=pyhd3deb0d_0 -pure_eval=0.2.2=pyhd8ed1ab_0 -pyarrow=15.0.0=py311h1eb6f34_4_cpu -pyarrow-hotfix=0.6=pyhd8ed1ab_0 -pybind11-abi=4=hd8ed1ab_3 -pycosat=0.6.6=py311hcd402e7_0 -pycparser=2.21=pyhd8ed1ab_0 -pyct=0.5.0=pyhd8ed1ab_0 -pycurl=7.45.1=py311h4769251_3 -pygments=2.17.2=pyhd8ed1ab_0 -pyjwt=2.8.0=pyhd8ed1ab_1 -pyopenssl=24.0.0=pyhd8ed1ab_0 -pyparsing=3.1.1=pyhd8ed1ab_0 -pyproj=3.6.1=py311ha6273e5_5 -pysocks=1.7.1=pyha2e5f31_6 -pyspnego=0.9.1=py311hcd402e7_2 -python=3.11.7=h43d1f9e_1_cpython -python-dateutil=2.8.2=pyhd8ed1ab_0 -python-fastjsonschema=2.19.1=pyhd8ed1ab_0 -python-json-logger=2.0.7=pyhd8ed1ab_0 -python_abi=3.11=4_cp311 -pytz=2024.1=pyhd8ed1ab_0 -pyviz_comms=3.0.0=pyhd8ed1ab_0 -pyyaml=6.0.1=py311hcd402e7_1 -pyzmq=24.0.1=py311h22a2215_1 -qtconsole-base=5.5.1=pyha770c72_0 -qtpy=2.4.1=pyhd8ed1ab_0 -re2=2023.06.02=h887e66c_0 -readline=8.2=h8fc344f_1 -referencing=0.33.0=pyhd8ed1ab_0 -reproc=14.2.4.post0=h31becfc_1 -reproc-cpp=14.2.4.post0=h2f0025b_1 -requests=2.31.0=pyhd8ed1ab_0 -requests-kerberos=0.14.0=pyhd8ed1ab_1 -rfc3339-validator=0.1.4=pyhd8ed1ab_0 
-rfc3986-validator=0.1.1=pyh9f0ad1d_0 -rpds-py=0.17.1=py311h32437ce_0 -rtree=1.2.0=py311h04fbf56_0 -ruamel.yaml=0.18.6=py311hcd402e7_0 -ruamel.yaml.clib=0.2.8=py311hcd402e7_0 -s2n=1.4.4=h5a25046_0 -scikit-learn=1.4.1.post1=py311hb93614b_0 -scipy=1.12.0=py311h69ead2a_2 -send2trash=1.8.2=pyh41d4057_0 -setuptools=69.0.3=pyhd8ed1ab_0 -shapely=2.0.3=py311hbbe59c9_0 -six=1.16.0=pyh6c4a22f_0 -snappy=1.1.10=he8610fa_0 -sniffio=1.3.0=pyhd8ed1ab_0 -sortedcontainers=2.4.0=pyhd8ed1ab_0 -soupsieve=2.5=pyhd8ed1ab_1 -sparkmagic=0.21.0=pyhd8ed1ab_1 -sqlalchemy=2.0.26=py311hc8f2f60_0 -sqlite=3.45.1=h3b3482f_0 -stack_data=0.6.2=pyhd8ed1ab_0 -tblib=3.0.0=pyhd8ed1ab_0 -tenacity=8.2.3=pyhd8ed1ab_0 -terminado=0.18.0=pyh0d859eb_0 -threadpoolctl=3.3.0=pyhc1e730c_0 -tiledb=2.20.0=hf61e980_0 -tinycss2=1.2.1=pyhd8ed1ab_0 -tk=8.6.13=h194ca79_0 -tomli=2.0.1=pyhd8ed1ab_0 -toolz=0.12.1=pyhd8ed1ab_0 -tornado=6.3.3=py311hc8f2f60_1 -tqdm=4.66.2=pyhd8ed1ab_0 -traitlets=5.9.0=pyhd8ed1ab_0 -truststore=0.8.0=pyhd8ed1ab_0 -types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 -typing-extensions=4.9.0=hd8ed1ab_0 -typing_extensions=4.9.0=pyha770c72_0 -typing_utils=0.1.0=pyhd8ed1ab_0 -tzcode=2024a=h31becfc_0 -tzdata=2024a=h0c530f3_0 -uc-micro-py=1.0.3=pyhd8ed1ab_0 -ucx=1.15.0=hedb98eb_3 -uri-template=1.3.0=pyhd8ed1ab_0 -uriparser=0.9.7=hd600fc2_1 -urllib3=2.2.0=pyhd8ed1ab_0 -wcwidth=0.2.13=pyhd8ed1ab_0 -webcolors=1.13=pyhd8ed1ab_0 -webencodings=0.5.1=pyhd8ed1ab_2 -websocket-client=1.7.0=pyhd8ed1ab_0 -wheel=0.42.0=pyhd8ed1ab_0 -widgetsnbextension=4.0.10=pyhd8ed1ab_0 -xarray=2024.2.0=pyhd8ed1ab_0 -xerces-c=3.2.5=hf13c1fb_0 -xorg-kbproto=1.0.7=h3557bc0_1002 -xorg-libice=1.1.1=h7935292_0 -xorg-libsm=1.2.4=h5a01bc2_0 -xorg-libx11=1.8.7=h055a233_0 -xorg-libxau=1.0.11=h31becfc_0 -xorg-libxdmcp=1.1.3=h3557bc0_0 -xorg-libxext=1.3.4=h2a766a3_2 -xorg-libxrender=0.9.11=h7935292_0 -xorg-renderproto=0.11.1=h3557bc0_1002 -xorg-xextproto=7.3.0=h2a766a3_1003 -xorg-xproto=7.0.31=h3557bc0_1007 -xyzservices=2023.10.1=pyhd8ed1ab_0 -xz=5.2.6=h9cdd2b7_0 -yaml=0.2.5=hf897c2e_2 -yaml-cpp=0.8.0=h2f0025b_0 -zeromq=4.3.5=h2f0025b_0 -zict=3.0.0=pyhd8ed1ab_0 -zipp=3.17.0=pyhd8ed1ab_0 -zlib=1.2.13=h31becfc_5 -zstandard=0.22.0=py311hb827a26_0 -zstd=1.5.5=h4c53e97_0 diff --git a/images/jupyter/sparkmagic.json b/images/jupyter/sparkmagic.json deleted file mode 100644 index 6f3078dbfd..0000000000 --- a/images/jupyter/sparkmagic.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "kernel_python_credentials": { - "username": "", - "password": "", - "url": "http://kamu-livy:8998", - "auth": "None" - }, - "kernel_scala_credentials": { - "username": "", - "password": "", - "url": "http://kamu-livy:8998", - "auth": "None" - }, - "kernel_r_credentials": { - "username": "", - "password": "", - "url": "http://kamu-livy:8998" - }, - "logging_config": { - "version": 1, - "formatters": { - "magicsFormatter": { - "format": "%(asctime)s\t%(levelname)s\t%(message)s", - "datefmt": "" - } - }, - "handlers": { - "magicsHandler": { - "class": "hdijupyterutils.filehandler.MagicsFileHandler", - "formatter": "magicsFormatter", - "home_path": "~/.sparkmagic" - } - }, - "loggers": { - "magicsLogger": { - "handlers": [ - "magicsHandler" - ], - "level": "DEBUG", - "propagate": 0 - } - } - }, - "wait_for_idle_timeout_seconds": 15, - "livy_session_startup_timeout_seconds": 60, - "fatal_error_suggestion": "The code failed because of a fatal error:\n\t{}.\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to 
make sure the Spark magics library is configured correctly.\nc) Restart the kernel.", - "ignore_ssl_errors": false, - "session_configs": { - "driverMemory": "1000M", - "executorCores": 2 - }, - "use_auto_viz": true, - "coerce_dataframe": true, - "default_maxrows": 1000000, - "pyspark_dataframe_encoding": "utf-8", - "heartbeat_refresh_seconds": 30, - "livy_server_heartbeat_timeout_seconds": 0, - "heartbeat_retry_seconds": 10, - "server_extension_default_kernel_name": "pysparkkernel", - "custom_headers": {}, - "retry_policy": "configurable", - "retry_seconds_to_sleep_list": [ - 0.2, - 0.5, - 1, - 3, - 5 - ], - "configurable_retry_policy_max_retries": 8 -} \ No newline at end of file diff --git a/resources/cli-reference.md b/resources/cli-reference.md index 0ce96f5a46..270b5520f1 100644 --- a/resources/cli-reference.md +++ b/resources/cli-reference.md @@ -652,6 +652,10 @@ Starts the notebook server for exploring the data in the workspace * `--address
` — Expose HTTP server on specific network interface * `--http-port ` — Expose HTTP server on specific port +* `--engine ` — Engine type to use for the notebook + + Possible values: `datafusion`, `spark` + * `-e`, `--env ` — Propagate or set an environment variable in the notebook (e.g. `-e VAR` or `-e VAR=foo`) This command will run the Jupyter server and the Spark engine connected together, letting you query data with SQL before pulling it into the notebook for final processing and visualization. @@ -1076,7 +1080,7 @@ Executes an SQL query or drops you into an SQL shell **Subcommands:** -* `server` — Run JDBC server only +* `server` — Runs an SQL engine in a server mode **Options:** @@ -1145,16 +1149,38 @@ Note: Currently when connecting to a remote SQL kamu server you will need to man ## `kamu sql server` -Run JDBC server only +Runs an SQL engine in a server mode **Usage:** `kamu sql server [OPTIONS]` **Options:** -* `--address
` — Expose JDBC server on specific network interface -* `--port ` — Expose JDBC server on specific port -* `--livy` — Run Livy server instead of Spark JDBC -* `--flight-sql` — Run Flight SQL server instead of Spark JDBC +* `--address
` — Expose server on specific network interface +* `--port ` — Expose server on specific port +* `--engine ` — Engine type to use for this server + + Possible values: `datafusion`, `spark` + +* `--livy` — Run Livy server instead of JDBC + +**Examples:** + +By default, the command runs the DataFusion engine and exposes the FlightSQL protocol: + + kamu sql server + +To customize the interface and port: + + kamu sql server --address 0.0.0.0 --port 50050 + +To run with the Spark engine: + + kamu sql server --engine spark + +By default, Spark runs with the JDBC protocol; to run with the Livy HTTP gateway instead: + + kamu sql server --engine spark --livy + diff --git a/src/adapter/flight-sql/Cargo.toml b/src/adapter/flight-sql/Cargo.toml index aee8df5739..9fb9a13c6c 100644 --- a/src/adapter/flight-sql/Cargo.toml +++ b/src/adapter/flight-sql/Cargo.toml @@ -22,29 +22,39 @@ doctest = false [dependencies] +kamu-accounts = { workspace = true } kamu-core = { workspace = true } +database-common = { workspace = true } time-source = { workspace = true } arrow-flight = { version = "53", features = ["flight-sql-experimental"] } async-trait = { version = "0.1", default-features = false } +base32 = { version = "0.5", default-features = false } base64 = { version = "0.22", default-features = false } +bytes = { version = "1", default-features = false } chrono = { version = "0.4", default-features = false } datafusion = { version = "43", default-features = false } dill = { version = "0.10", default-features = false } futures = "0.3" +http = { version = "1", default-features = false } +http-body = { version = "1", default-features = false } like = { version = "0.3", default-features = false } prost = { version = "0.13", default-features = false } +rand = { version = "0.8", default-features = false } tokio = { version = "1", default-features = false, features = [] } tonic = { version = "0.12", default-features = false } +tower = { version = "0.5", default-features = false } tracing = { version = "0.1", default-features = false } uuid = { version = "1", default-features = false } [dev-dependencies] +kamu-accounts = { workspace = true, features = ["testing"] } kamu-core = { workspace = true, features = ["testing"] } kamu-data-utils = { workspace = true, features = ["testing"] } indoc = "2" +mockall = { version = "0.13", default-features = false } test-log = { version = "0.2", features = ["trace"] } tokio = { version = "1", default-features = false, features = [] } tokio-stream = { version = "0.1", default-features = false, features = ["net"] } diff --git a/src/adapter/flight-sql/src/auth_layer.rs b/src/adapter/flight-sql/src/auth_layer.rs new file mode 100644 index 0000000000..43cf956ef9 --- /dev/null +++ b/src/adapter/flight-sql/src/auth_layer.rs @@ -0,0 +1,199 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0.
+ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use database_common::DatabaseTransactionRunner; +use futures::Future; +use kamu_accounts::{ + Account, + AnonymousAccountReason, + AuthenticationService, + CurrentAccountSubject, + GetAccountInfoError, +}; +use tonic::body::BoxBody; +use tonic::Status; +use tower::{Layer, Service}; + +use crate::SessionId; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct SessionAuthConfig { + pub allow_anonymous: bool, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct AuthenticationLayer {} + +impl AuthenticationLayer { + pub fn new() -> Self { + Self {} + } +} + +impl Layer for AuthenticationLayer { + type Service = AuthenticationMiddleware; + + fn layer(&self, inner: Svc) -> Self::Service { + AuthenticationMiddleware { inner } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct AuthenticationMiddleware { + inner: Svc, +} + +impl AuthenticationMiddleware { + fn extract_service_method(request: &http::Request) -> (String, String) { + let path = request.uri().path(); + let mut parts = path.split('/').filter(|x| !x.is_empty()); + let service = parts.next().unwrap_or_default(); + let method = parts.next().unwrap_or_default(); + (service.to_string(), method.to_string()) + } + + fn extract_bearer_token(request: &http::Request) -> Option { + let auth = request.headers().get(http::header::AUTHORIZATION)?; + let auth = auth.to_str().ok()?; + + if auth.starts_with("Bearer ") || auth.starts_with("bearer ") { + return Some(auth["Bearer ".len()..].to_string()); + } + + None + } + + async fn get_account_by_token( + base_catalog: &dill::Catalog, + access_token: String, + ) -> Result { + use tracing::Instrument; + + DatabaseTransactionRunner::new(base_catalog.clone()) + .transactional_with( + |authentication_service: Arc| async move { + authentication_service.account_by_token(access_token).await + }, + ) + .instrument(tracing::debug_span!( + "AuthenticationMiddleware::current_account_subject" + )) + .await + } +} + +impl Service> for AuthenticationMiddleware +where + ReqBody: Send + 'static, + Svc: Service, Response = http::Response>, + Svc: Clone + Send + 'static, + Svc::Future: Send + 'static, +{ + type Response = http::Response; + type Error = Svc::Error; + type Future = + Pin> + Send + 'static>>; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, mut request: http::Request) -> Self::Future { + // Inspired by https://github.com/maxcountryman/axum-login/blob/5239b38b2698a3db3f92075b6ad430aea79c215a/axum-login/src/auth.rs + // TODO: PERF: Is cloning a performance concern? 
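+        // Cloning gives the returned future its own handle to the inner service, so + // the future does not borrow `self` (the usual `tower` middleware pattern).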
+ let mut inner = self.inner.clone(); + + Box::pin(async move { + let base_catalog = request + .extensions() + .get::() + .expect("Catalog not found in request extensions"); + + let conf: Arc = base_catalog.get_one().unwrap(); + + let token = Self::extract_bearer_token(&request); + let (service, method) = Self::extract_service_method(&request); + + let subject = match &token { + None if conf.allow_anonymous + && service == "arrow.flight.protocol.FlightService" + && method == "Handshake" => + { + CurrentAccountSubject::anonymous( + AnonymousAccountReason::NoAuthenticationProvided, + ) + } + Some(token) if conf.allow_anonymous && token.starts_with("anon_") => { + // TODO: SEC: Anonymous session tokens have to be validated + CurrentAccountSubject::anonymous( + AnonymousAccountReason::NoAuthenticationProvided, + ) + } + Some(token) => { + match Self::get_account_by_token(base_catalog, token.clone()).await { + Ok(account) => CurrentAccountSubject::logged( + account.id, + account.account_name, + account.is_admin, + ), + Err(e @ GetAccountInfoError::AccessToken(_)) => { + tracing::warn!("{e}"); + return Ok(Status::unauthenticated(e.to_string()).into_http()); + } + Err(e @ GetAccountInfoError::AccountUnresolved) => { + tracing::warn!("{e}"); + return Ok(Status::unauthenticated(e.to_string()).into_http()); + } + Err(e @ GetAccountInfoError::Internal(_)) => { + tracing::error!( + error = ?e, + error_msg = %e, + "Internal error during authentication", + ); + return Ok(Status::internal("Internal error").into_http()); + } + } + } + _ => { + // Disallow fully unauthorized access - anonymous users have to go through + // handshare procedure + return Ok(Status::unauthenticated( + "Unauthenticated access is not allowed. Provide a bearer token or use \ + basic auth and handshake endpoint to login as anonymous.", + ) + .into_http()); + } + }; + + let session_id = token.map(SessionId); + + tracing::debug!(?subject, ?session_id, "Authenticated request"); + + let mut derived_catalog_builder = dill::CatalogBuilder::new_chained(base_catalog); + if let Some(session_id) = session_id { + derived_catalog_builder.add_value(session_id); + } + derived_catalog_builder.add_value(subject); + + let derived_catalog = derived_catalog_builder.build(); + request.extensions_mut().insert(derived_catalog); + + inner.call(request).await + }) + } +} diff --git a/src/adapter/flight-sql/src/lib.rs b/src/adapter/flight-sql/src/lib.rs index 60846817ce..29fe7db572 100644 --- a/src/adapter/flight-sql/src/lib.rs +++ b/src/adapter/flight-sql/src/lib.rs @@ -9,19 +9,25 @@ #![feature(lint_reasons)] +mod auth_layer; mod service; -mod service_builder; +mod service_wrapper; mod session_auth; +mod session_auth_anon; +mod session_auth_bearer_only; mod session_manager; mod session_manager_caching; mod session_manager_singleton; +pub mod sql_info; +mod types; +pub use auth_layer::*; pub use service::*; -pub use service_builder::*; +pub use service_wrapper::*; pub use session_auth::*; +pub use session_auth_anon::*; +pub use session_auth_bearer_only::*; pub use session_manager::*; pub use session_manager_caching::*; pub use session_manager_singleton::*; - -pub type SessionToken = String; -pub type PlanToken = String; +pub use types::*; diff --git a/src/adapter/flight-sql/src/service.rs b/src/adapter/flight-sql/src/service.rs index 4569dca3bb..56d2f0dd7b 100644 --- a/src/adapter/flight-sql/src/service.rs +++ b/src/adapter/flight-sql/src/service.rs @@ -73,67 +73,41 @@ use tonic::codegen::tokio_stream::Stream; use tonic::metadata::MetadataValue; use 
tonic::{Request, Response, Status, Streaming}; -use crate::{KamuFlightSqlServiceBuilder, PlanToken, SessionManager, SessionToken}; +use crate::{PlanId, SessionAuth, SessionManager}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// const TABLE_TYPES: [&str; 2] = ["TABLE", "VIEW"]; +const CLOSE_SESSION: &str = "CloseSession"; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // KamuFlightSqlService //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct KamuFlightSqlService { - sql_info: SqlInfoData, + sql_info: Arc, + // LazyOnce ensures that these objects are instantiated once but only when they are needed - + // this is important because during some operations like `handshake` the `SessionId` is not + // available so an attempt to instantiate a `SessionManager` may fail + session_auth: LazyOnce>, + session_manager: LazyOnce>, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[dill::component(pub)] impl KamuFlightSqlService { - pub fn builder() -> KamuFlightSqlServiceBuilder { - KamuFlightSqlServiceBuilder::new() - } - - pub(crate) fn new(sql_info: SqlInfoData) -> Self { - Self { sql_info } - } - - // This type is a singleton. For it to play nicely with DB transactions we - // follow the same pattern as in Axum where middleware layers are responsible - // for attaching the Catalog to incoming requests. Here we extract catalog from - // the extensions to instantiate session manager. - fn get_session_manager(&self, req: &Request) -> Result, Status> { - let Some(catalog) = req.extensions().get::() else { - return Err(Status::internal("Catalog extension is not configured")); - }; - - catalog - .get_one() - .map_err(|_| Status::internal("Injection error")) - } - - fn get_token(&self, req: &Request) -> Result { - let auth = req - .metadata() - .get("authorization") - .ok_or_else(|| Status::internal("No authorization header!"))? - .to_str() - .map_err(|e| Status::internal(format!("Error parsing header: {e}")))? - .to_string(); - - let Some(session_token) = auth.strip_prefix("Bearer ") else { - return Err(Status::internal("Invalid auth header!")); - }; - - Ok(SessionToken::from(session_token)) - } - - async fn get_ctx(&self, req: &Request) -> Result, Status> { - let session_token = self.get_token(req)?; - - self.get_session_manager(req)? 
- .get_context(&session_token.to_string()) - .await + pub fn new( + sql_info: Arc, + session_auth: dill::Lazy>, + session_manager: dill::Lazy>, + ) -> Self { + Self { + sql_info, + session_auth: LazyOnce::new(session_auth), + session_manager: LazyOnce::new(session_manager), + } } fn get_sql_info( @@ -638,6 +612,11 @@ impl KamuFlightSqlService { let stream = futures::stream::iter(flights.into_iter().map(Ok)); Ok(Response::new(Box::pin(stream))) } + + #[tracing::instrument(level = "debug", skip_all)] + async fn do_action_close_session(&self, _request: Request) -> Result<(), Status> { + self.session_manager.close_session().await + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -657,6 +636,7 @@ impl FlightSqlService for KamuFlightSqlService { Response> + Send>>>, Status, > { + use base64::engine::{GeneralPurpose, GeneralPurposeConfig}; use base64::Engine; let basic = "Basic "; @@ -672,7 +652,12 @@ impl FlightSqlService for KamuFlightSqlService { )))?; } let base64 = &authorization[basic.len()..]; - let bytes = base64::engine::general_purpose::STANDARD + let b64engine = GeneralPurpose::new( + &base64::alphabet::STANDARD, + GeneralPurposeConfig::new() + .with_decode_padding_mode(base64::engine::DecodePaddingMode::Indifferent), + ); + let bytes = b64engine .decode(base64) .map_err(|_| Status::invalid_argument("authorization not parsable"))?; let str = String::from_utf8(bytes) @@ -684,10 +669,7 @@ impl FlightSqlService for KamuFlightSqlService { let username = parts[0]; let password = parts[1]; - let session_token = self - .get_session_manager(&request)? - .auth_basic(username, password) - .await?; + let session_token = self.session_auth.auth_basic(username, password).await?; let result = HandshakeResponse { protocol_version: 0, @@ -733,7 +715,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandStatementQuery, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let plan = Self::prepare_statement(&query.query, &ctx).await?; let df = ctx .execute_logical_plan(plan) @@ -754,18 +736,14 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandPreparedStatementQuery, request: Request, ) -> Result, Status> { - let session_token = self.get_token(&request)?; - - let plan_token = PlanToken::from_utf8(query.prepared_statement_handle.to_vec()) - .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?; - - let session_manager = self.get_session_manager(&request)?; + let plan_id = PlanId( + String::from_utf8(query.prepared_statement_handle.to_vec()) + .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?, + ); - let plan = session_manager - .get_plan(&session_token, &plan_token) - .await?; + let plan = self.session_manager.get_plan(&plan_id).await?; - let ctx = session_manager.get_context(&session_token).await?; + let ctx = self.session_manager.get_context().await?; let df = ctx .execute_logical_plan(plan) @@ -782,7 +760,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetCatalogs, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_catalogs(&ctx, &query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -793,7 +771,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetDbSchemas, request: Request, ) -> Result, Status> { - 
let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_schemas(&ctx, &query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -804,7 +782,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetTables, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_tables(ctx, &query, true).await?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -815,7 +793,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetTableTypes, request: Request, ) -> Result, Status> { - let _ctx = self.get_ctx(&request).await?; + let _ctx = self.session_manager.get_context().await?; let data = self.get_table_types(true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -826,7 +804,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetSqlInfo, request: Request, ) -> Result, Status> { - let _ctx = self.get_ctx(&request).await?; + let _ctx = self.session_manager.get_context().await?; let data = self.get_sql_info(&query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -837,7 +815,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetPrimaryKeys, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_primary_keys(&ctx, &query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -848,7 +826,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetExportedKeys, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_exported_keys(&ctx, &query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -859,7 +837,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetImportedKeys, request: Request, ) -> Result, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_imported_keys(&ctx, &query, true)?; self.record_batch_to_flight_info(&data, &query.as_any(), true) } @@ -893,7 +871,7 @@ impl FlightSqlService for KamuFlightSqlService { ticket: TicketStatementQuery, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let query = CommandStatementQuery::decode(ticket.statement_handle) .map_err(|e| Status::internal(format!("Invalid ticket: {e}")))?; @@ -915,18 +893,14 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandPreparedStatementQuery, request: Request, ) -> Result::DoGetStream>, Status> { - let session_token = self.get_token(&request)?; - - let plan_token = PlanToken::from_utf8(query.prepared_statement_handle.into()) - .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?; - - let session_manager = self.get_session_manager(&request)?; + let plan_id = PlanId( + String::from_utf8(query.prepared_statement_handle.to_vec()) + .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?, + ); - let plan = session_manager - .get_plan(&session_token, &plan_token) - .await?; + let plan = self.session_manager.get_plan(&plan_id).await?; - let ctx = session_manager.get_context(&session_token).await?; + let ctx = 
self.session_manager.get_context().await?; let df = ctx .execute_logical_plan(plan) @@ -942,7 +916,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetCatalogs, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_catalogs(&ctx, &query, false)?; self.record_batch_to_stream(data) } @@ -953,7 +927,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetDbSchemas, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_schemas(&ctx, &query, false)?; self.record_batch_to_stream(data) } @@ -964,7 +938,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetTables, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_tables(ctx, &query, false).await?; self.record_batch_to_stream(data) } @@ -975,7 +949,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetTableTypes, request: Request, ) -> Result::DoGetStream>, Status> { - let _ctx = self.get_ctx(&request).await?; + let _ctx = self.session_manager.get_context().await?; let data = self.get_table_types(false)?; self.record_batch_to_stream(data) } @@ -986,7 +960,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetSqlInfo, request: Request, ) -> Result::DoGetStream>, Status> { - let _ctx = self.get_ctx(&request).await?; + let _ctx = self.session_manager.get_context().await?; let data = self.get_sql_info(&query, false)?; self.record_batch_to_stream(data) } @@ -997,7 +971,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetPrimaryKeys, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_primary_keys(&ctx, &query, false)?; self.record_batch_to_stream(data) } @@ -1008,7 +982,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetExportedKeys, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_exported_keys(&ctx, &query, false)?; self.record_batch_to_stream(data) } @@ -1019,7 +993,7 @@ impl FlightSqlService for KamuFlightSqlService { query: CommandGetImportedKeys, request: Request, ) -> Result::DoGetStream>, Status> { - let ctx = self.get_ctx(&request).await?; + let ctx = self.session_manager.get_context().await?; let data = self.get_imported_keys(&ctx, &query, false)?; self.record_batch_to_stream(data) } @@ -1070,13 +1044,11 @@ impl FlightSqlService for KamuFlightSqlService { query: ActionCreatePreparedStatementRequest, request: Request, ) -> Result { - let session_token = self.get_token(&request)?; - let session_manager = self.get_session_manager(&request)?; - let ctx = session_manager.get_context(&session_token).await?; + let ctx = self.session_manager.get_context().await?; let plan = Self::prepare_statement(&query.query, &ctx).await?; let schema_bytes = self.df_schema_to_arrow(plan.schema())?; - let plan_token = session_manager.cache_plan(&session_token, plan).await?; + let plan_token = self.session_manager.cache_plan(plan).await?; tracing::debug!(%plan_token, "Prepared statement"); @@ -1094,14 +1066,12 @@ impl FlightSqlService for 
KamuFlightSqlService { query: ActionClosePreparedStatementRequest, request: Request, ) -> Result<(), Status> { - let session_token = self.get_token(&request)?; - - let plan_token = PlanToken::from_utf8(query.prepared_statement_handle.into()) - .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?; + let plan_id = PlanId( + String::from_utf8(query.prepared_statement_handle.to_vec()) + .map_err(|e| Status::internal(format!("Error decoding handle: {e}")))?, + ); - self.get_session_manager(&request)? - .remove_plan(&session_token, &plan_token) - .await?; + self.session_manager.remove_plan(&plan_id).await?; Ok(()) } @@ -1196,6 +1166,50 @@ impl FlightSqlService for KamuFlightSqlService { /// GetSqlInfo. #[tracing::instrument(level = "debug", skip_all, fields(%id, ?result))] async fn register_sql_info(&self, id: i32, result: &SqlInfo) {} + + async fn do_action_fallback( + &self, + request: Request, + ) -> Result::DoActionStream>, Status> { + // TODO: Base interface should handle CloseSession action + // See: https://github.com/apache/arrow-rs/issues/6516 + if request.get_ref().r#type == CLOSE_SESSION { + self.do_action_close_session(request).await?; + Ok(Response::new(Box::pin(futures::stream::empty()))) + } else { + Err(Status::invalid_argument(format!( + "do_action: The defined request is invalid: {:?}", + request.get_ref().r#type + ))) + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO: Consider upstreaming into `dill` +// One downside to this type is that it panics on ingestion errors rather than +// returning them +struct LazyOnce { + f: dill::Lazy, + v: std::sync::OnceLock, +} + +impl LazyOnce { + pub fn new(f: dill::Lazy) -> Self { + Self { + f, + v: std::sync::OnceLock::new(), + } + } +} + +impl std::ops::Deref for LazyOnce { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.v.get_or_init(|| self.f.get().unwrap()) + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/service_builder.rs b/src/adapter/flight-sql/src/service_builder.rs deleted file mode 100644 index 0d16da8a78..0000000000 --- a/src/adapter/flight-sql/src/service_builder.rs +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use arrow_flight::sql::metadata::SqlInfoDataBuilder; -use arrow_flight::sql::{ - SqlInfo, - SqlNullOrdering, - SqlSupportedCaseSensitivity, - SqlSupportedTransactions, - SupportedSqlGrammar, -}; - -use crate::KamuFlightSqlService; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub struct KamuFlightSqlServiceBuilder { - sql_info: SqlInfoDataBuilder, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -impl KamuFlightSqlServiceBuilder { - pub fn new() -> Self { - let sql_info = Self::default_sql_info(); - - Self { sql_info } - } - - pub fn build(self) -> KamuFlightSqlService { - KamuFlightSqlService::new(self.sql_info.build().unwrap()) - } - - pub fn with_server_name(mut self, name: &str, version: &str) -> Self { - self.sql_info.append(SqlInfo::FlightSqlServerName, name); - self.sql_info - .append(SqlInfo::FlightSqlServerVersion, version); - self - } - - // TODO: Revisit - fn default_sql_info() -> SqlInfoDataBuilder { - let mut builder = SqlInfoDataBuilder::new(); - // Server information - builder.append(SqlInfo::FlightSqlServerName, "Unknown"); - builder.append(SqlInfo::FlightSqlServerVersion, "0.0.0"); - // 1.3 comes from https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/Schema.fbs#L24 - builder.append(SqlInfo::FlightSqlServerArrowVersion, "1.3"); - builder.append(SqlInfo::FlightSqlServerReadOnly, true); - builder.append(SqlInfo::FlightSqlServerSql, true); - builder.append(SqlInfo::FlightSqlServerSubstrait, false); - builder.append( - SqlInfo::FlightSqlServerTransaction, - SqlSupportedTransactions::SqlTransactionUnspecified as i32, - ); - // don't yet support `CancelQuery` action - builder.append(SqlInfo::FlightSqlServerCancel, false); - builder.append(SqlInfo::FlightSqlServerStatementTimeout, 0i32); - builder.append(SqlInfo::FlightSqlServerTransactionTimeout, 0i32); - // SQL syntax information - builder.append(SqlInfo::SqlDdlCatalog, false); - builder.append(SqlInfo::SqlDdlSchema, false); - builder.append(SqlInfo::SqlDdlTable, false); - builder.append( - SqlInfo::SqlIdentifierCase, - SqlSupportedCaseSensitivity::SqlCaseSensitivityLowercase as i32, - ); - builder.append(SqlInfo::SqlIdentifierQuoteChar, r#"""#); - builder.append( - SqlInfo::SqlQuotedIdentifierCase, - SqlSupportedCaseSensitivity::SqlCaseSensitivityCaseInsensitive as i32, - ); - builder.append(SqlInfo::SqlAllTablesAreSelectable, true); - builder.append( - SqlInfo::SqlNullOrdering, - SqlNullOrdering::SqlNullsSortedHigh as i32, - ); - // builder.append(SqlInfo::SqlKeywords, SQL_INFO_SQL_KEYWORDS); - // builder.append(SqlInfo::SqlNumericFunctions, SQL_INFO_NUMERIC_FUNCTIONS); - // builder.append(SqlInfo::SqlStringFunctions, SQL_INFO_STRING_FUNCTIONS); - // builder.append(SqlInfo::SqlSystemFunctions, SQL_INFO_SYSTEM_FUNCTIONS); - // builder.append(SqlInfo::SqlDatetimeFunctions, SQL_INFO_DATE_TIME_FUNCTIONS); - builder.append(SqlInfo::SqlSearchStringEscape, "\\"); - builder.append(SqlInfo::SqlExtraNameCharacters, ""); - builder.append(SqlInfo::SqlSupportsColumnAliasing, true); - builder.append(SqlInfo::SqlNullPlusNullIsNull, true); - // Skip SqlSupportsConvert (which is the map of the conversions that are - // supported); .with_sql_info(SqlInfo::SqlSupportsConvert, TBD); - builder.append(SqlInfo::SqlSupportsTableCorrelationNames, false); - builder.append(SqlInfo::SqlSupportsDifferentTableCorrelationNames, false); - 
builder.append(SqlInfo::SqlSupportsExpressionsInOrderBy, true); - builder.append(SqlInfo::SqlSupportsOrderByUnrelated, true); - builder.append(SqlInfo::SqlSupportedGroupBy, 3i32); - builder.append(SqlInfo::SqlSupportsLikeEscapeClause, true); - builder.append(SqlInfo::SqlSupportsNonNullableColumns, true); - builder.append( - SqlInfo::SqlSupportedGrammar, - SupportedSqlGrammar::SqlCoreGrammar as i32, - ); - // report we support all ansi 92 - builder.append(SqlInfo::SqlAnsi92SupportedLevel, 0b111_i32); - builder.append(SqlInfo::SqlSupportsIntegrityEnhancementFacility, false); - builder.append(SqlInfo::SqlOuterJoinsSupportLevel, 2i32); - builder.append(SqlInfo::SqlSchemaTerm, "schema"); - builder.append(SqlInfo::SqlProcedureTerm, "procedure"); - builder.append(SqlInfo::SqlCatalogAtStart, false); - builder.append(SqlInfo::SqlSchemasSupportedActions, 0i32); - builder.append(SqlInfo::SqlCatalogsSupportedActions, 0i32); - builder.append(SqlInfo::SqlSupportedPositionedCommands, 0i32); - builder.append(SqlInfo::SqlSelectForUpdateSupported, false); - builder.append(SqlInfo::SqlStoredProceduresSupported, false); - builder.append(SqlInfo::SqlSupportedSubqueries, 15i32); - builder.append(SqlInfo::SqlCorrelatedSubqueriesSupported, true); - builder.append(SqlInfo::SqlSupportedUnions, 3i32); - // For max lengths, report max arrow string length - builder.append(SqlInfo::SqlMaxBinaryLiteralLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxCharLiteralLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnsInGroupBy, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnsInIndex, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnsInOrderBy, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnsInSelect, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxColumnsInTable, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxConnections, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxCursorNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxIndexLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlDbSchemaNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxProcedureNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxCatalogNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxRowSize, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxRowSizeIncludesBlobs, true); - builder.append(SqlInfo::SqlMaxStatementLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxStatements, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxTableNameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxTablesInSelect, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlMaxUsernameLength, i64::from(i32::MAX)); - builder.append(SqlInfo::SqlDefaultTransactionIsolation, 0i64); - builder.append(SqlInfo::SqlTransactionsSupported, false); - builder.append(SqlInfo::SqlSupportedTransactionsIsolationLevels, 0i32); - builder.append(SqlInfo::SqlDataDefinitionCausesTransactionCommit, false); - builder.append(SqlInfo::SqlDataDefinitionsInTransactionsIgnored, true); - builder.append(SqlInfo::SqlSupportedResultSetTypes, 0i32); - builder.append( - SqlInfo::SqlSupportedConcurrenciesForResultSetUnspecified, - 0i32, - ); - builder.append( - SqlInfo::SqlSupportedConcurrenciesForResultSetForwardOnly, - 0i32, - ); - builder.append( - SqlInfo::SqlSupportedConcurrenciesForResultSetScrollSensitive, - 0i32, - ); - builder.append( - 
SqlInfo::SqlSupportedConcurrenciesForResultSetScrollInsensitive, - 0i32, - ); - builder.append(SqlInfo::SqlBatchUpdatesSupported, false); - builder.append(SqlInfo::SqlSavepointsSupported, false); - builder.append(SqlInfo::SqlNamedParametersSupported, false); - builder.append(SqlInfo::SqlLocatorsUpdateCopy, false); - builder.append(SqlInfo::SqlStoredFunctionsUsingCallSyntaxSupported, false); - builder - } -} diff --git a/src/adapter/flight-sql/src/service_wrapper.rs b/src/adapter/flight-sql/src/service_wrapper.rs new file mode 100644 index 0000000000..00014f93cd --- /dev/null +++ b/src/adapter/flight-sql/src/service_wrapper.rs @@ -0,0 +1,598 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::pin::Pin; +use std::sync::Arc; + +use arrow_flight::flight_service_server::FlightService; +use arrow_flight::sql::server::{FlightSqlService, PeekableFlightDataStream}; +use arrow_flight::sql::{ + ActionBeginSavepointRequest, + ActionBeginSavepointResult, + ActionBeginTransactionRequest, + ActionBeginTransactionResult, + ActionCancelQueryRequest, + ActionCancelQueryResult, + ActionClosePreparedStatementRequest, + ActionCreatePreparedStatementRequest, + ActionCreatePreparedStatementResult, + ActionCreatePreparedSubstraitPlanRequest, + ActionEndSavepointRequest, + ActionEndTransactionRequest, + CommandGetCatalogs, + CommandGetCrossReference, + CommandGetDbSchemas, + CommandGetExportedKeys, + CommandGetImportedKeys, + CommandGetPrimaryKeys, + CommandGetSqlInfo, + CommandGetTableTypes, + CommandGetTables, + CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, + CommandPreparedStatementUpdate, + CommandStatementQuery, + CommandStatementSubstraitPlan, + CommandStatementUpdate, + DoPutPreparedStatementResult, + SqlInfo, + TicketStatementQuery, +}; +use arrow_flight::{ + Action, + FlightDescriptor, + FlightInfo, + HandshakeRequest, + HandshakeResponse, + Ticket, +}; +use tonic::codegen::tokio_stream::Stream; +use tonic::{Request, Response, Status, Streaming}; + +use crate::KamuFlightSqlService; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// This type is a singleton that is called by the gRPC server. For it to play nicely +// with DB transactions we follow the same pattern as in Axum where middleware +// layers are responsible for attaching the Catalog to incoming requests. This +// wrapper will extract the catalog from the request extensions and instantiate +// the inner service in the request context.
+pub struct KamuFlightSqlServiceWrapper; + +impl KamuFlightSqlServiceWrapper { + async fn exec(&self, mut request: Request, f: F) -> Result + where + F: FnOnce(Request, Arc) -> Fut, + Fut: std::future::Future>, + { + let Some(catalog) = request.extensions_mut().remove::() else { + return Err(Status::internal("Catalog extension is not configured")); + }; + + // TODO: Eventually this method should look like this: + // + // ``` + // let transaction_runner = database_common::DatabaseTransactionRunner::new(catalog); + // transaction_runner + // .transactional(|tx_catalog: dill::Catalog| async move { + // let inner: Arc = tx_catalog.get_one().int_err()?; + // Ok(f(request, inner).await) + // }) + // .await + // .map_err(internal_error::)? + // ``` + // + // We want it to open and close a DB transaction for the duration of the handler. + // + // Currently, however, because the construction of `datafusion::SessionContext` + // is expensive we cache it in memory for a short period of time. Because the + // context holds on to core objects it also holds on to the DB transaction, thus + // transactions outlive the duration of the handler, which would violate the + // transaction manager contract. So instead... + + // We extract the transaction manager + let db_transaction_manager = catalog + .get_one::() + .unwrap(); + + // Create a transaction + let transaction_ref = db_transaction_manager.make_transaction_ref().await.map_err(|e| { + tracing::error!(error = %e, error_dbg = ?e, "Failed to open database transaction for FlightSQL session"); + Status::internal("could not start database transaction") + })?; + + // Attach transaction to the new chained catalog. + // + // The transaction will therefore live for as long as `SessionContext` holds on to + // it. The DB connection will be returned to the pool when the session expires. + // + // In this approach the transaction manager never gets a chance to COMMIT, and the + // transaction will be automatically rolled back when it's dropped, but that's + // OK because all these interactions are read-only.
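For illustration only — a minimal sketch of the server-side wiring that the comment above assumes, mirroring the test setup further down in this diff. The function name and address parameter are placeholders, and it presumes `KamuFlightSqlServiceWrapper` is exported from the crate root, as the tests suggest:

```
use arrow_flight::flight_service_server::FlightServiceServer;
use kamu_adapter_flight_sql::KamuFlightSqlServiceWrapper;

// Sketch: a tonic layer clones the Catalog into every request's extensions,
// where `exec()` above takes it back out and builds the session-scoped service.
// The real wiring also adds an authentication layer (see the tests below).
async fn serve_flight_sql(
    catalog: dill::Catalog,
    addr: std::net::SocketAddr,
) -> Result<(), tonic::transport::Error> {
    tonic::transport::Server::builder()
        .layer(tonic::service::interceptor(
            move |mut req: tonic::Request<()>| {
                req.extensions_mut().insert(catalog.clone());
                Ok(req)
            },
        ))
        .add_service(FlightServiceServer::new(KamuFlightSqlServiceWrapper))
        .serve(addr)
        .await
}
```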
+ let session_catalog = catalog.builder_chained().add_value(transaction_ref).build(); + + let inner: Arc = session_catalog.get_one().map_err(internal_error)?; + + f(request, inner).await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO: Replace with a macro +#[expect(unused_variables)] +#[tonic::async_trait] +impl FlightSqlService for KamuFlightSqlServiceWrapper { + type FlightService = KamuFlightSqlServiceWrapper; + + async fn do_handshake( + &self, + request: Request>, + ) -> Result< + Response> + Send>>>, + Status, + > { + self.exec(request, |request, inner| async move { + inner.do_handshake(request).await + }) + .await + } + + async fn do_get_fallback( + &self, + request: Request, + message: arrow_flight::sql::Any, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_fallback(request, message).await + }) + .await + } + + async fn do_get_xdbc_type_info( + &self, + query: CommandGetXdbcTypeInfo, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_xdbc_type_info(query, request).await + }) + .await + } + + async fn get_flight_info_statement( + &self, + query: CommandStatementQuery, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_statement(query, request).await + }) + .await + } + + async fn get_flight_info_prepared_statement( + &self, + query: CommandPreparedStatementQuery, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner + .get_flight_info_prepared_statement(query, request) + .await + }) + .await + } + + async fn get_flight_info_catalogs( + &self, + query: CommandGetCatalogs, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_catalogs(query, request).await + }) + .await + } + + async fn get_flight_info_schemas( + &self, + query: CommandGetDbSchemas, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_schemas(query, request).await + }) + .await + } + + async fn get_flight_info_tables( + &self, + query: CommandGetTables, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_tables(query, request).await + }) + .await + } + + async fn get_flight_info_table_types( + &self, + query: CommandGetTableTypes, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_table_types(query, request).await + }) + .await + } + + async fn get_flight_info_sql_info( + &self, + query: CommandGetSqlInfo, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_sql_info(query, request).await + }) + .await + } + + async fn get_flight_info_primary_keys( + &self, + query: CommandGetPrimaryKeys, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_primary_keys(query, request).await + }) + .await + } + + async fn get_flight_info_exported_keys( + &self, + query: CommandGetExportedKeys, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_exported_keys(query, request).await + }) + .await + } + + async fn 
get_flight_info_imported_keys( + &self, + query: CommandGetImportedKeys, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_imported_keys(query, request).await + }) + .await + } + + async fn get_flight_info_cross_reference( + &self, + query: CommandGetCrossReference, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_cross_reference(query, request).await + }) + .await + } + + async fn get_flight_info_xdbc_type_info( + &self, + query: CommandGetXdbcTypeInfo, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_xdbc_type_info(query, request).await + }) + .await + } + + async fn do_get_statement( + &self, + query: TicketStatementQuery, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_statement(query, request).await + }) + .await + } + + async fn do_get_prepared_statement( + &self, + query: CommandPreparedStatementQuery, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_prepared_statement(query, request).await + }) + .await + } + + async fn do_get_catalogs( + &self, + query: CommandGetCatalogs, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_catalogs(query, request).await + }) + .await + } + + async fn do_get_schemas( + &self, + query: CommandGetDbSchemas, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_schemas(query, request).await + }) + .await + } + + async fn do_get_tables( + &self, + query: CommandGetTables, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_tables(query, request).await + }) + .await + } + + async fn do_get_table_types( + &self, + query: CommandGetTableTypes, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_table_types(query, request).await + }) + .await + } + + async fn do_get_sql_info( + &self, + query: CommandGetSqlInfo, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_sql_info(query, request).await + }) + .await + } + + async fn do_get_primary_keys( + &self, + query: CommandGetPrimaryKeys, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_primary_keys(query, request).await + }) + .await + } + + async fn do_get_exported_keys( + &self, + query: CommandGetExportedKeys, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_exported_keys(query, request).await + }) + .await + } + + async fn do_get_imported_keys( + &self, + query: CommandGetImportedKeys, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_imported_keys(query, request).await + }) + .await + } + + async fn do_get_cross_reference( + &self, + query: CommandGetCrossReference, + request: Request, + ) -> Result::DoGetStream>, Status> { + self.exec(request, |request, inner| async move { + inner.do_get_cross_reference(query, request).await + }) + .await + } + + async fn 
do_put_statement_update( + &self, + query: CommandStatementUpdate, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_put_statement_update(query, request).await + }) + .await + } + + async fn do_put_prepared_statement_query( + &self, + query: CommandPreparedStatementQuery, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_put_prepared_statement_query(query, request).await + }) + .await + } + + async fn do_put_prepared_statement_update( + &self, + query: CommandPreparedStatementUpdate, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_put_prepared_statement_update(query, request).await + }) + .await + } + + async fn do_action_create_prepared_statement( + &self, + query: ActionCreatePreparedStatementRequest, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner + .do_action_create_prepared_statement(query, request) + .await + }) + .await + } + + async fn do_action_close_prepared_statement( + &self, + query: ActionClosePreparedStatementRequest, + request: Request, + ) -> Result<(), Status> { + self.exec(request, |request, inner| async move { + inner + .do_action_close_prepared_statement(query, request) + .await + }) + .await + } + + async fn get_flight_info_substrait_plan( + &self, + query: CommandStatementSubstraitPlan, + request: Request, + ) -> Result, Status> { + self.exec(request, |request, inner| async move { + inner.get_flight_info_substrait_plan(query, request).await + }) + .await + } + + async fn do_put_substrait_plan( + &self, + query: CommandStatementSubstraitPlan, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_put_substrait_plan(query, request).await + }) + .await + } + + async fn do_action_create_prepared_substrait_plan( + &self, + query: ActionCreatePreparedSubstraitPlanRequest, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner + .do_action_create_prepared_substrait_plan(query, request) + .await + }) + .await + } + + async fn do_action_begin_transaction( + &self, + query: ActionBeginTransactionRequest, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_action_begin_transaction(query, request).await + }) + .await + } + + async fn do_action_end_transaction( + &self, + query: ActionEndTransactionRequest, + request: Request, + ) -> Result<(), Status> { + self.exec(request, |request, inner| async move { + inner.do_action_end_transaction(query, request).await + }) + .await + } + + async fn do_action_begin_savepoint( + &self, + query: ActionBeginSavepointRequest, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_action_begin_savepoint(query, request).await + }) + .await + } + + async fn do_action_end_savepoint( + &self, + query: ActionEndSavepointRequest, + request: Request, + ) -> Result<(), Status> { + self.exec(request, |request, inner| async move { + inner.do_action_end_savepoint(query, request).await + }) + .await + } + + async fn do_action_cancel_query( + &self, + query: ActionCancelQueryRequest, + request: Request, + ) -> Result { + self.exec(request, |request, inner| async move { + inner.do_action_cancel_query(query, request).await + }) + .await + } + + async fn register_sql_info(&self, id: i32, result: &SqlInfo) {} + + async fn do_action_fallback( + &self, + request: Request, + ) -> Result::DoActionStream>, 
Status> { + self.exec(request, |request, inner| async move { + inner.do_action_fallback(request).await + }) + .await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub(crate) fn internal_error(error: E) -> Status { + tracing::error!( + error = ?error, + error_msg = %error, + "Internal error", + ); + Status::internal("Internal error") +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/session_auth.rs b/src/adapter/flight-sql/src/session_auth.rs index 00a4d073f6..9c957b451e 100644 --- a/src/adapter/flight-sql/src/session_auth.rs +++ b/src/adapter/flight-sql/src/session_auth.rs @@ -7,38 +7,16 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::collections::HashMap; - use tonic::Status; +use crate::SessionToken; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Simplified trait that some session managers can delegate authentication to #[async_trait::async_trait] pub trait SessionAuth: Send + Sync { - async fn auth_basic(&self, _username: &str, _password: &str) -> Result<(), Status> { - Err(Status::unauthenticated("Invalid credentials")) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[dill::component] -#[dill::interface(dyn SessionAuth)] -pub struct SessionAuthBasicPredefined { - accounts_passwords: HashMap, -} - -#[async_trait::async_trait] -impl SessionAuth for SessionAuthBasicPredefined { - async fn auth_basic(&self, username: &str, password: &str) -> Result<(), Status> { - if let Some(expected_password) = self.accounts_passwords.get(username) { - if expected_password == password { - return Ok(()); - } - } - Err(Status::unauthenticated("Invalid credentials")) - } + async fn auth_basic(&self, username: &str, password: &str) -> Result; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/session_auth_anon.rs b/src/adapter/flight-sql/src/session_auth_anon.rs new file mode 100644 index 0000000000..9bf7b6f03c --- /dev/null +++ b/src/adapter/flight-sql/src/session_auth_anon.rs @@ -0,0 +1,55 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use rand::Rng; +use tonic::Status; + +use crate::{SessionAuth, SessionToken}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +const ANON_SESSION_TOKEN_BYTES_LENGTH: usize = 16; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Enables basic auth for `anonymous` account with no password, generating them +/// a special bearer token that is used to tell multiple anonymous clients +/// apart for session identification and individual rate-limiting. 
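For illustration only — a minimal client-side sketch of the anonymous flow described above, based on the `FlightSqlServiceClient` usage in the tests further below; the endpoint URL and the helper function are hypothetical:

```
use arrow_flight::sql::client::FlightSqlServiceClient;
use futures::TryStreamExt;

async fn query_anonymously() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder address; connect over a plain tonic channel
    let channel = tonic::transport::Endpoint::from_static("http://127.0.0.1:50050")
        .connect()
        .await?;
    let mut client = FlightSqlServiceClient::new(channel);

    // Anonymous basic auth with an empty password; the server responds with an
    // `anon_...` bearer token that identifies this session on later calls.
    client.handshake("anonymous", "").await?;
    // Authenticated users skip the handshake and present their token directly:
    // client.set_token("<access token>".to_string());

    let info = client.execute("select 1".to_string(), None).await?;
    let _batches: Vec<_> = client
        .do_get(info.endpoint[0].ticket.clone().unwrap())
        .await?
        .try_collect()
        .await?;
    Ok(())
}
```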
+#[dill::component] +#[dill::interface(dyn SessionAuth)] +pub struct SessionAuthAnonymous {} + +#[async_trait::async_trait] +impl SessionAuth for SessionAuthAnonymous { + async fn auth_basic(&self, username: &str, password: &str) -> Result { + match (username, password) { + // Some libraries have bugs that prevent using empty password + // kamu/kamu is deprecated - preserving compatibility with old credentials + ("anonymous", "" | "anonymous") | ("kamu", "kamu") => {} + _ => { + return Err(Status::unauthenticated( + "Basic auth is only supported for 'anonymous' accounts with no password. \ + Authenticated users should use Bearer token auth mechanism.", + )) + } + } + + let mut random_token_bytes = [0_u8; ANON_SESSION_TOKEN_BYTES_LENGTH]; + rand::thread_rng().fill(&mut random_token_bytes); + let base32_token = base32::encode(base32::Alphabet::Crockford, &random_token_bytes); + + // TODO: SEC: Anonymous tokens should be validated on subsequent requests, + // otherwise malicious clients can just generate them. This will require + // storing tokens in a cache (e.g. alongside rate limiting data). + let session_token = SessionToken(format!("anon_{}", &base32_token)); + Ok(session_token) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/session_auth_bearer_only.rs b/src/adapter/flight-sql/src/session_auth_bearer_only.rs new file mode 100644 index 0000000000..890e44e707 --- /dev/null +++ b/src/adapter/flight-sql/src/session_auth_bearer_only.rs @@ -0,0 +1,31 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use tonic::Status; + +use crate::{SessionAuth, SessionToken}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Rejects all basic auth attempts, requiring that users auth via bearer tokens +#[dill::component] +#[dill::interface(dyn SessionAuth)] +pub struct SessionAuthBearerOnly {} + +#[async_trait::async_trait] +impl SessionAuth for SessionAuthBearerOnly { + async fn auth_basic(&self, _username: &str, _password: &str) -> Result { + Err(Status::unauthenticated( + "Basic auth and anonymous access are not supported by this server. Users must \ + authenticate via Bearer token auth mechanism.", + )) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/session_manager.rs b/src/adapter/flight-sql/src/session_manager.rs index 26d052622c..418a2cdb84 100644 --- a/src/adapter/flight-sql/src/session_manager.rs +++ b/src/adapter/flight-sql/src/session_manager.rs @@ -13,22 +13,16 @@ use datafusion::logical_expr::LogicalPlan; use datafusion::prelude::SessionContext; use tonic::Status; -use crate::{PlanToken, SessionToken}; +use crate::PlanId; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Responsible for managing the state associated with the client session. #[async_trait::async_trait] pub trait SessionManager: Send + Sync { - /// Called during the handshake stage to authenticate the client. 
- /// - /// Returns a bearer token by which the client is associated with its - /// session in all subsequent calls. - async fn auth_basic(&self, username: &str, password: &str) -> Result; - /// Can be used to free the client session resources and state when /// connection is gracefully closed. - async fn end_session(&self, token: &SessionToken) -> Result<(), Status>; + async fn close_session(&self) -> Result<(), Status>; /// Called on every operation to get the session context for the client. /// Token argument represents the token returned at the authentication @@ -37,26 +31,17 @@ pub trait SessionManager: Send + Sync { /// Note that the session token should be treated as untrusted - it's the /// job of session manager implementation to verify it before returning /// the context. - async fn get_context(&self, token: &SessionToken) -> Result, Status>; + async fn get_context(&self) -> Result, Status>; /// Called to cache the logical plan of a prepared statement - async fn cache_plan( - &self, - token: &SessionToken, - plan: LogicalPlan, - ) -> Result; + async fn cache_plan(&self, plan: LogicalPlan) -> Result; /// Called to retrieve the previously cached logical plan of a prepared /// statement - async fn get_plan( - &self, - token: &SessionToken, - plan_token: &PlanToken, - ) -> Result; + async fn get_plan(&self, plan_id: &PlanId) -> Result; /// Called to clean up the previously cached logical plan - async fn remove_plan(&self, token: &SessionToken, plan_token: &PlanToken) - -> Result<(), Status>; + async fn remove_plan(&self, plan_id: &PlanId) -> Result<(), Status>; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/flight-sql/src/session_manager_caching.rs b/src/adapter/flight-sql/src/session_manager_caching.rs index c1d45a3cd4..9d8f09f23b 100644 --- a/src/adapter/flight-sql/src/session_manager_caching.rs +++ b/src/adapter/flight-sql/src/session_manager_caching.rs @@ -13,24 +13,27 @@ use std::sync::{Arc, Mutex}; use chrono::{DateTime, Utc}; use datafusion::logical_expr::LogicalPlan; use datafusion::prelude::SessionContext; +use kamu_accounts::CurrentAccountSubject; use kamu_core::QueryService; use time_source::SystemTimeSource; use tonic::Status; -use crate::{PlanToken, SessionAuth, SessionManager, SessionToken}; +use crate::{internal_error, PlanId, SessionId, SessionManager}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Clone)] pub struct SessionCachingConfig { - pub session_expiration_timeout: std::time::Duration, - pub session_inactivity_timeout: std::time::Duration, + pub authed_session_expiration_timeout: std::time::Duration, + pub authed_session_inactivity_timeout: std::time::Duration, + pub anon_session_expiration_timeout: std::time::Duration, + pub anon_session_inactivity_timeout: std::time::Duration, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct SessionManagerCachingState { - sessions: Mutex>, + sessions: Mutex>, } #[dill::component(pub)] @@ -45,7 +48,7 @@ impl SessionManagerCachingState { struct Session { ctx: Option>, - plans: HashMap, + plans: HashMap, created: DateTime, accessed: DateTime, } @@ -60,38 +63,66 @@ struct Session { #[dill::component(pub)] #[dill::interface(dyn SessionManager)] pub struct SessionManagerCaching { + subject: Arc, + session_id: SessionId, conf: Arc, timer: Arc, - auth: 
dill::Lazy>, query_svc: dill::Lazy>, state: Arc, } impl SessionManagerCaching { + fn get_or_create_session(&self, fun: F) -> Result + where + F: FnOnce(&mut Session) -> Result, + { + let mut sessions = self.state.sessions.lock().unwrap(); + + let session = if let Some(session) = sessions.get_mut(&self.session_id) { + session + } else { + tracing::debug!("Creating new FlightSQL session"); + let now = self.timer.now(); + + let session = Session { + ctx: None, + plans: HashMap::new(), + created: now, + accessed: now, + }; + + sessions.insert(self.session_id.clone(), session); + + self.schedule_expiration(); + + sessions.get_mut(&self.session_id).unwrap() + }; + + fun(session) + } + async fn create_context(&self) -> Result, Status> { - let query_svc = self - .query_svc - .get() - .map_err(|_| Status::internal("Injection error"))?; + let query_svc = self.query_svc.get().map_err(internal_error)?; - let ctx = query_svc - .create_session() - .await - .map_err(|e| Status::internal(e.to_string()))?; + let ctx = query_svc.create_session().await.map_err(internal_error)?; Ok(Arc::new(ctx)) } - fn schedule_expiration(&self, token: SessionToken) { - let expires_in = self.conf.session_expiration_timeout; + fn schedule_expiration(&self) { + let session_id = self.session_id.clone(); + let expires_in = match self.subject.as_ref() { + CurrentAccountSubject::Anonymous(_) => self.conf.anon_session_expiration_timeout, + CurrentAccountSubject::Logged(_) => self.conf.authed_session_expiration_timeout, + }; let state = Arc::clone(&self.state); tokio::task::spawn(async move { tokio::time::sleep(expires_in).await; let mut sessions = state.sessions.lock().unwrap(); - if let Some(session) = sessions.remove(&token) { + if let Some(session) = sessions.remove(&session_id) { tracing::debug!( - token, + %session_id, created_at = %session.created, "Expiring FlightSQL session", ); @@ -99,9 +130,13 @@ impl SessionManagerCaching { }); } - fn schedule_inactivity(&self, token: SessionToken, for_accessed_at: DateTime) { + fn schedule_inactivity(&self, for_accessed_at: DateTime) { + let session_id = self.session_id.clone(); let timer = self.timer.clone(); - let inactivity_timeout = self.conf.session_inactivity_timeout; + let inactivity_timeout = match self.subject.as_ref() { + CurrentAccountSubject::Anonymous(_) => self.conf.anon_session_inactivity_timeout, + CurrentAccountSubject::Logged(_) => self.conf.authed_session_inactivity_timeout, + }; let state = Arc::clone(&self.state); tokio::task::spawn(async move { @@ -112,7 +147,7 @@ impl SessionManagerCaching { tokio::time::sleep(inactive_in).await; let mut sessions = state.sessions.lock().unwrap(); - let Some(session) = sessions.get_mut(&token) else { + let Some(session) = sessions.get_mut(&session_id) else { // Session expired break; }; @@ -120,7 +155,7 @@ impl SessionManagerCaching { if session.accessed == accessed_cas { // Session was not accessed since this timer was scheduled - release the context tracing::debug!( - token, + %session_id, created_at = %session.created, accessed_at = %session.accessed, ?inactivity_timeout, @@ -132,8 +167,11 @@ impl SessionManagerCaching { // Re-schedule to updated time accessed_cas = session.accessed; - inactive_in = - inactivity_timeout - (timer.now() - session.accessed).to_std().unwrap(); + inactive_in = inactivity_timeout.saturating_sub( + (std::cmp::max(timer.now(), session.accessed) - session.accessed) + .to_std() + .unwrap(), + ); } }); } @@ -141,135 +179,83 @@ impl SessionManagerCaching { #[async_trait::async_trait] impl SessionManager for 
SessionManagerCaching { - async fn auth_basic(&self, username: &str, password: &str) -> Result { - self.auth - .get() - .map_err(|_| Status::internal("Injection error"))? - .auth_basic(username, password) - .await?; - - let now = self.timer.now(); - let token = SessionToken::from(uuid::Uuid::new_v4()); - - tracing::debug!(token, "Creating new FlightSQL session"); - - let session = Session { - ctx: Some(self.create_context().await?), - plans: HashMap::new(), - created: now, - accessed: now, - }; - - let mut sessions = self.state.sessions.lock().unwrap(); - sessions.insert(token.clone(), session); - - self.schedule_inactivity(token.clone(), now); - self.schedule_expiration(token.clone()); - - Ok(token) - } - - async fn end_session(&self, token: &SessionToken) -> Result<(), Status> { + async fn close_session(&self) -> Result<(), Status> { let mut sessions = self.state.sessions.lock().unwrap(); - sessions.remove(token); + if sessions.remove(&self.session_id).is_some() { + tracing::debug!( + session_id = %self.session_id, + "Closed FlightSQL session context" + ); + } Ok(()) } - async fn get_context(&self, token: &SessionToken) -> Result, Status> { + async fn get_context(&self) -> Result, Status> { // Try to get existing context - { - let mut sessions = self.state.sessions.lock().unwrap(); - - let Some(session) = sessions.get_mut(token) else { - return Err(Status::unauthenticated("Invalid token")); - }; - + if let Some(ctx) = self.get_or_create_session(|session| { if let Some(ctx) = &session.ctx { - tracing::debug!(token, "Reusing FlightSQL session context"); - // The inactivity timer will reschedule itself session.accessed = self.timer.now(); - return Ok(Arc::clone(ctx)); + Ok(Some(Arc::clone(ctx))) + } else { + Ok(None) } + })? { + tracing::debug!( + session_id = %self.session_id, + "Reusing FlightSQL session context" + ); + return Ok(ctx); } // Session was inactive - re-create the context - tracing::debug!(token, "Re-creating suspended FlightSQL session context"); - - let ctx = self.create_context().await?; - - { - let mut sessions = self.state.sessions.lock().unwrap(); - - let Some(session) = sessions.get_mut(token) else { - return Err(Status::unauthenticated("Invalid token")); - }; + tracing::debug!( + session_id = %self.session_id, + "Creating FlightSQL session context" + ); - session.accessed = self.timer.now(); + let new_ctx = self.create_context().await?; + self.get_or_create_session(move |session| { if let Some(ctx) = &session.ctx { // Oops, another thread created the context already - reuse the existing one Ok(Arc::clone(ctx)) } else { // Insert new context into the session - session.ctx = Some(Arc::clone(&ctx)); + session.ctx = Some(Arc::clone(&new_ctx)); // Schedule inactivity timer - self.schedule_inactivity(token.clone(), session.accessed); + self.schedule_inactivity(session.accessed); - Ok(ctx) + Ok(new_ctx) } - } + }) } - async fn cache_plan( - &self, - token: &SessionToken, - plan: LogicalPlan, - ) -> Result { - let mut sessions = self.state.sessions.lock().unwrap(); - - let Some(session) = sessions.get_mut(token) else { - return Err(Status::unauthenticated("Invalid token")); - }; - - let plan_token = PlanToken::from(uuid::Uuid::new_v4()); - session.plans.insert(plan_token.clone(), plan); + async fn cache_plan(&self, plan: LogicalPlan) -> Result { + let plan_id = PlanId(uuid::Uuid::new_v4().to_string()); - Ok(plan_token) + self.get_or_create_session(move |session| { + session.plans.insert(plan_id.clone(), plan); + Ok(plan_id) + }) } - async fn get_plan( - &self, - token: 
&SessionToken, - plan_token: &PlanToken, - ) -> Result { - let sessions = self.state.sessions.lock().unwrap(); - - let Some(session) = sessions.get(token) else { - return Err(Status::unauthenticated("Invalid token")); - }; - - let Some(plan) = session.plans.get(plan_token) else { - return Err(Status::unauthenticated("Invalid plan token")); - }; - - Ok(plan.clone()) + async fn get_plan(&self, plan_id: &PlanId) -> Result { + self.get_or_create_session(move |session| { + if let Some(plan) = session.plans.get(plan_id) { + Ok(plan.clone()) + } else { + Err(Status::unauthenticated("Invalid plan token")) + } + }) } - async fn remove_plan( - &self, - token: &SessionToken, - plan_token: &PlanToken, - ) -> Result<(), Status> { - let mut sessions = self.state.sessions.lock().unwrap(); - - let Some(session) = sessions.get_mut(token) else { - return Err(Status::unauthenticated("Invalid token")); - }; - - session.plans.remove(plan_token); - Ok(()) + async fn remove_plan(&self, plan_id: &PlanId) -> Result<(), Status> { + self.get_or_create_session(move |session| { + session.plans.remove(plan_id); + Ok(()) + }) } } diff --git a/src/adapter/flight-sql/src/session_manager_singleton.rs b/src/adapter/flight-sql/src/session_manager_singleton.rs index ef76817b5c..a2a90d5310 100644 --- a/src/adapter/flight-sql/src/session_manager_singleton.rs +++ b/src/adapter/flight-sql/src/session_manager_singleton.rs @@ -15,7 +15,7 @@ use datafusion::prelude::SessionContext; use kamu_core::QueryService; use tonic::Status; -use crate::{PlanToken, SessionAuth, SessionManager, SessionToken}; +use crate::{internal_error, PlanId, SessionManager}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -26,7 +26,6 @@ use crate::{PlanToken, SessionAuth, SessionManager, SessionToken}; #[dill::component] #[dill::interface(dyn SessionManager)] pub struct SessionManagerSingleton { - auth: dill::Lazy>, query_svc: dill::Lazy>, state: Arc, } @@ -47,44 +46,18 @@ impl SessionManagerSingletonState { struct Inner { ctx: Arc, - plans: HashMap, -} - -impl SessionManagerSingleton { - const DUMMY_TOKEN: &'static str = "singleton-token"; - - fn check_token(&self, token: &SessionToken) -> Result<(), Status> { - if token == Self::DUMMY_TOKEN { - Ok(()) - } else { - Err(Status::unauthenticated("Invalid token")) - } - } + plans: HashMap, } #[async_trait::async_trait] impl SessionManager for SessionManagerSingleton { - async fn auth_basic(&self, username: &str, password: &str) -> Result { - self.auth - .get() - .map_err(|_| Status::internal("Injection error"))? 
- .auth_basic(username, password) - .await?; - - Ok(SessionToken::from(Self::DUMMY_TOKEN)) - } - - async fn end_session(&self, token: &SessionToken) -> Result<(), Status> { - self.check_token(token)?; - + async fn close_session(&self) -> Result<(), Status> { let mut state = self.state.inner.lock().unwrap(); state.take(); Ok(()) } - async fn get_context(&self, token: &SessionToken) -> Result, Status> { - self.check_token(token)?; - + async fn get_context(&self) -> Result, Status> { { let state = self.state.inner.lock().unwrap(); if let Some(state) = &(*state) { @@ -92,10 +65,7 @@ impl SessionManager for SessionManagerSingleton { } } - let query_svc = self - .query_svc - .get() - .map_err(|_| Status::internal("Injection error"))?; + let query_svc = self.query_svc.get().map_err(internal_error)?; let ctx = query_svc .create_session() @@ -112,56 +82,38 @@ impl SessionManager for SessionManagerSingleton { Ok(ctx) } - async fn cache_plan( - &self, - token: &SessionToken, - plan: LogicalPlan, - ) -> Result { - self.check_token(token)?; - + async fn cache_plan(&self, plan: LogicalPlan) -> Result { let mut state = self.state.inner.lock().unwrap(); let Some(s) = &mut (*state) else { return Err(Status::internal("Invalid state")); }; - let plan_token = PlanToken::from(uuid::Uuid::new_v4()); + let plan_token = PlanId(uuid::Uuid::new_v4().to_string()); s.plans.insert(plan_token.clone(), plan); Ok(plan_token) } - async fn get_plan( - &self, - token: &SessionToken, - plan_token: &PlanToken, - ) -> Result { - self.check_token(token)?; - + async fn get_plan(&self, plan_id: &PlanId) -> Result { let mut state = self.state.inner.lock().unwrap(); let Some(s) = &mut (*state) else { return Err(Status::internal("Invalid state")); }; - let Some(plan) = s.plans.get(plan_token) else { + let Some(plan) = s.plans.get(plan_id) else { return Err(Status::internal("No such plan")); }; Ok(plan.clone()) } - async fn remove_plan( - &self, - token: &SessionToken, - plan_token: &PlanToken, - ) -> Result<(), Status> { - self.check_token(token)?; - + async fn remove_plan(&self, plan_id: &PlanId) -> Result<(), Status> { let mut state = self.state.inner.lock().unwrap(); let Some(s) = &mut (*state) else { return Err(Status::internal("Invalid state")); }; - s.plans.remove(plan_token); + s.plans.remove(plan_id); Ok(()) } } diff --git a/src/adapter/flight-sql/src/sql_info.rs b/src/adapter/flight-sql/src/sql_info.rs new file mode 100644 index 0000000000..b1a45867e8 --- /dev/null +++ b/src/adapter/flight-sql/src/sql_info.rs @@ -0,0 +1,144 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use arrow_flight::sql::metadata::SqlInfoDataBuilder; +use arrow_flight::sql::{ + SqlInfo, + SqlNullOrdering, + SqlSupportedCaseSensitivity, + SqlSupportedTransactions, + SupportedSqlGrammar, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub fn default_sql_info() -> SqlInfoDataBuilder { + let mut builder = SqlInfoDataBuilder::new(); + // Server information + builder.append(SqlInfo::FlightSqlServerName, "Unknown"); + builder.append(SqlInfo::FlightSqlServerVersion, "0.0.0"); + // 1.3 comes from https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/Schema.fbs#L24 + builder.append(SqlInfo::FlightSqlServerArrowVersion, "1.3"); + builder.append(SqlInfo::FlightSqlServerReadOnly, true); + builder.append(SqlInfo::FlightSqlServerSql, true); + builder.append(SqlInfo::FlightSqlServerSubstrait, false); + builder.append( + SqlInfo::FlightSqlServerTransaction, + SqlSupportedTransactions::SqlTransactionUnspecified as i32, + ); + // don't yet support `CancelQuery` action + builder.append(SqlInfo::FlightSqlServerCancel, false); + builder.append(SqlInfo::FlightSqlServerStatementTimeout, 0i32); + builder.append(SqlInfo::FlightSqlServerTransactionTimeout, 0i32); + // SQL syntax information + builder.append(SqlInfo::SqlDdlCatalog, false); + builder.append(SqlInfo::SqlDdlSchema, false); + builder.append(SqlInfo::SqlDdlTable, false); + builder.append( + SqlInfo::SqlIdentifierCase, + SqlSupportedCaseSensitivity::SqlCaseSensitivityLowercase as i32, + ); + builder.append(SqlInfo::SqlIdentifierQuoteChar, r#"""#); + builder.append( + SqlInfo::SqlQuotedIdentifierCase, + SqlSupportedCaseSensitivity::SqlCaseSensitivityCaseInsensitive as i32, + ); + builder.append(SqlInfo::SqlAllTablesAreSelectable, true); + builder.append( + SqlInfo::SqlNullOrdering, + SqlNullOrdering::SqlNullsSortedHigh as i32, + ); + // builder.append(SqlInfo::SqlKeywords, SQL_INFO_SQL_KEYWORDS); + // builder.append(SqlInfo::SqlNumericFunctions, SQL_INFO_NUMERIC_FUNCTIONS); + // builder.append(SqlInfo::SqlStringFunctions, SQL_INFO_STRING_FUNCTIONS); + // builder.append(SqlInfo::SqlSystemFunctions, SQL_INFO_SYSTEM_FUNCTIONS); + // builder.append(SqlInfo::SqlDatetimeFunctions, SQL_INFO_DATE_TIME_FUNCTIONS); + builder.append(SqlInfo::SqlSearchStringEscape, "\\"); + builder.append(SqlInfo::SqlExtraNameCharacters, ""); + builder.append(SqlInfo::SqlSupportsColumnAliasing, true); + builder.append(SqlInfo::SqlNullPlusNullIsNull, true); + // Skip SqlSupportsConvert (which is the map of the conversions that are + // supported); .with_sql_info(SqlInfo::SqlSupportsConvert, TBD); + builder.append(SqlInfo::SqlSupportsTableCorrelationNames, false); + builder.append(SqlInfo::SqlSupportsDifferentTableCorrelationNames, false); + builder.append(SqlInfo::SqlSupportsExpressionsInOrderBy, true); + builder.append(SqlInfo::SqlSupportsOrderByUnrelated, true); + builder.append(SqlInfo::SqlSupportedGroupBy, 3i32); + builder.append(SqlInfo::SqlSupportsLikeEscapeClause, true); + builder.append(SqlInfo::SqlSupportsNonNullableColumns, true); + builder.append( + SqlInfo::SqlSupportedGrammar, + SupportedSqlGrammar::SqlCoreGrammar as i32, + ); + // report we support all ansi 92 + builder.append(SqlInfo::SqlAnsi92SupportedLevel, 0b111_i32); + builder.append(SqlInfo::SqlSupportsIntegrityEnhancementFacility, false); + builder.append(SqlInfo::SqlOuterJoinsSupportLevel, 2i32); + builder.append(SqlInfo::SqlSchemaTerm, "schema"); + builder.append(SqlInfo::SqlProcedureTerm, 
"procedure"); + builder.append(SqlInfo::SqlCatalogAtStart, false); + builder.append(SqlInfo::SqlSchemasSupportedActions, 0i32); + builder.append(SqlInfo::SqlCatalogsSupportedActions, 0i32); + builder.append(SqlInfo::SqlSupportedPositionedCommands, 0i32); + builder.append(SqlInfo::SqlSelectForUpdateSupported, false); + builder.append(SqlInfo::SqlStoredProceduresSupported, false); + builder.append(SqlInfo::SqlSupportedSubqueries, 15i32); + builder.append(SqlInfo::SqlCorrelatedSubqueriesSupported, true); + builder.append(SqlInfo::SqlSupportedUnions, 3i32); + // For max lengths, report max arrow string length + builder.append(SqlInfo::SqlMaxBinaryLiteralLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxCharLiteralLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnsInGroupBy, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnsInIndex, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnsInOrderBy, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnsInSelect, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxColumnsInTable, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxConnections, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxCursorNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxIndexLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlDbSchemaNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxProcedureNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxCatalogNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxRowSize, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxRowSizeIncludesBlobs, true); + builder.append(SqlInfo::SqlMaxStatementLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxStatements, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxTableNameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxTablesInSelect, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlMaxUsernameLength, i64::from(i32::MAX)); + builder.append(SqlInfo::SqlDefaultTransactionIsolation, 0i64); + builder.append(SqlInfo::SqlTransactionsSupported, false); + builder.append(SqlInfo::SqlSupportedTransactionsIsolationLevels, 0i32); + builder.append(SqlInfo::SqlDataDefinitionCausesTransactionCommit, false); + builder.append(SqlInfo::SqlDataDefinitionsInTransactionsIgnored, true); + builder.append(SqlInfo::SqlSupportedResultSetTypes, 0i32); + builder.append( + SqlInfo::SqlSupportedConcurrenciesForResultSetUnspecified, + 0i32, + ); + builder.append( + SqlInfo::SqlSupportedConcurrenciesForResultSetForwardOnly, + 0i32, + ); + builder.append( + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollSensitive, + 0i32, + ); + builder.append( + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollInsensitive, + 0i32, + ); + builder.append(SqlInfo::SqlBatchUpdatesSupported, false); + builder.append(SqlInfo::SqlSavepointsSupported, false); + builder.append(SqlInfo::SqlNamedParametersSupported, false); + builder.append(SqlInfo::SqlLocatorsUpdateCopy, false); + builder.append(SqlInfo::SqlStoredFunctionsUsingCallSyntaxSupported, false); + builder +} diff --git a/src/adapter/flight-sql/src/types.rs b/src/adapter/flight-sql/src/types.rs new file mode 100644 index 0000000000..1525eb403a --- /dev/null +++ b/src/adapter/flight-sql/src/types.rs @@ -0,0 +1,59 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SessionToken(pub String); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SessionId(pub String); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PlanId(pub String); + +impl std::ops::Deref for SessionToken { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::Deref for SessionId { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::Deref for PlanId { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for SessionToken { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::fmt::Display for SessionId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::fmt::Display for PlanId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/src/adapter/flight-sql/tests/tests/test_service.rs b/src/adapter/flight-sql/tests/tests/test_service.rs index 9fb4ecd948..92c6dd69fc 100644 --- a/src/adapter/flight-sql/tests/tests/test_service.rs +++ b/src/adapter/flight-sql/tests/tests/test_service.rs @@ -13,9 +13,10 @@ use std::net::SocketAddr; use arrow_flight::flight_service_server::FlightServiceServer; use arrow_flight::sql::client::FlightSqlServiceClient; use datafusion::prelude::*; -use dill::Component; use futures::TryStreamExt; use indoc::indoc; +use kamu_accounts::testing::MockAuthenticationService; +use kamu_accounts::{Account, AuthenticationService, GetAccountInfoError}; use kamu_adapter_flight_sql::*; use kamu_core::{MockQueryService, QueryService}; use tokio::net::TcpListener; @@ -51,22 +52,47 @@ async fn run_server() -> FlightServer { .await .unwrap(); + let mut mock_authentication_service = MockAuthenticationService::new(); + mock_authentication_service + .expect_account_by_token() + .with(mockall::predicate::eq("valid-token".to_string())) + .returning(|_| Ok(Account::dummy())); + mock_authentication_service + .expect_account_by_token() + .with(mockall::predicate::eq("invalid-token".to_string())) + .returning(|_| { + Err(GetAccountInfoError::AccessToken( + kamu_accounts::AccessTokenError::Invalid("foo".into()), + )) + }); + let mut query_svc: kamu_core::MockQueryService = kamu_core::MockQueryService::new(); query_svc .expect_create_session() .return_once(move || Ok(ctx)); - let catalog = dill::Catalog::builder() - .add_builder( - SessionAuthBasicPredefined::builder() - .with_accounts_passwords([("admin".to_string(), "password".to_string())].into()), - ) - .bind::() + let mut b = dill::Catalog::builder(); + + b.add::() + .add_value(SessionAuthConfig { + allow_anonymous: true, + }) + .add_value(mock_authentication_service) + .bind::() .add_value(query_svc) .bind::() .add::() .add::() - .build(); + .add_value( + kamu_adapter_flight_sql::sql_info::default_sql_info() + .build() + .unwrap(), + ) + .add::(); + + database_common::NoOpDatabasePlugin::init_database_components(&mut b); + + let catalog = b.build(); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = 
listener.local_addr().unwrap(); @@ -77,9 +103,8 @@ async fn run_server() -> FlightServer { req.extensions_mut().insert(catalog.clone()); Ok(req) })) - .add_service(FlightServiceServer::new( - KamuFlightSqlService::builder().build(), - )) + .layer(AuthenticationLayer::new()) + .add_service(FlightServiceServer::new(KamuFlightSqlServiceWrapper)) .serve_with_incoming(tokio_stream::wrappers::TcpListenerStream::new(listener)); let task = tokio::task::spawn(service); @@ -100,7 +125,7 @@ async fn get_client(addr: &SocketAddr) -> FlightSqlServiceClient { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] -async fn test_auth_error() { +async fn test_basic_auth_disabled() { let server = run_server().await; let mut client = get_client(&server.addr).await; @@ -110,11 +135,79 @@ async fn test_auth_error() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] -async fn test_statement() { +async fn test_invalid_bearer_token() { + let server = run_server().await; + let mut client = get_client(&server.addr).await; + client.set_token("invalid-token".to_string()); + + assert_matches!( + client.execute("select * from test".to_string(), None).await, + Err(_) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_statement_anonymous() { + let server = run_server().await; + + let mut client = get_client(&server.addr).await; + client.handshake("anonymous", "").await.unwrap(); + + let fi = client + .execute("select * from test".to_string(), None) + .await + .unwrap(); + + let mut record_batches: Vec<_> = client + .do_get(fi.endpoint[0].ticket.clone().unwrap()) + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + assert_eq!(record_batches.len(), 1); + + let ctx = SessionContext::new(); + let df = ctx.read_batch(record_batches.pop().unwrap()).unwrap(); + + kamu_data_utils::testing::assert_schema_eq( + df.schema(), + indoc!( + " + message arrow_schema { + OPTIONAL INT32 id; + OPTIONAL BYTE_ARRAY name (STRING); + } + " + ), + ); + kamu_data_utils::testing::assert_data_eq( + df, + indoc!( + " + +----+------+ + | id | name | + +----+------+ + | 1 | a | + | 2 | b | + +----+------+ + " + ), + ) + .await; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_statement_bearer() { let server = run_server().await; let mut client = get_client(&server.addr).await; - client.handshake("admin", "password").await.unwrap(); + client.set_token("valid-token".to_string()); let fi = client .execute("select * from test".to_string(), None) diff --git a/src/adapter/http/src/middleware/authentication_layer.rs b/src/adapter/http/src/middleware/authentication_layer.rs index 6dc1c7b34e..9b9cc9d545 100644 --- a/src/adapter/http/src/middleware/authentication_layer.rs +++ b/src/adapter/http/src/middleware/authentication_layer.rs @@ -75,8 +75,8 @@ impl AuthenticationMiddleware { )) .await; - // TODO: Getting the full account info here is expensive while all we need is - // the caller identity + // TODO: PERF: Getting the full account info here is expensive while all we need + // is the caller identity match account_res { Ok(account) => Ok(CurrentAccountSubject::logged( account.id, @@ -100,7 
+100,14 @@ impl AuthenticationMiddleware { AnonymousAccountReason::AuthenticationInvalid, )) } - Err(GetAccountInfoError::Internal(_)) => Err(internal_server_error_response()), + Err(GetAccountInfoError::Internal(err)) => { + tracing::error!( + error = ?err, + error_msg = %err, + "Internal error during authentication", + ); + Err(internal_server_error_response()) + } } } else { Ok(CurrentAccountSubject::anonymous( diff --git a/src/app/cli/Cargo.toml b/src/app/cli/Cargo.toml index 2251f19dcf..3b23428b67 100644 --- a/src/app/cli/Cargo.toml +++ b/src/app/cli/Cargo.toml @@ -29,9 +29,8 @@ doctest = false [features] -default = ["flight-sql", "ingest-evm", "ingest-mqtt", "query-extensions-json"] +default = ["ingest-evm", "ingest-mqtt", "query-extensions-json"] -flight-sql = ["dep:kamu-adapter-flight-sql"] ingest-evm = ["kamu/ingest-evm"] ingest-ftp = ["kamu/ingest-ftp"] ingest-mqtt = ["kamu/ingest-mqtt"] @@ -55,7 +54,7 @@ kamu = { workspace = true } kamu-data-utils = { workspace = true } kamu-adapter-auth-oso = { workspace = true } -kamu-adapter-flight-sql = { optional = true, workspace = true } +kamu-adapter-flight-sql = { workspace = true } kamu-adapter-graphql = { workspace = true } kamu-adapter-http = { workspace = true, features = [ "e2e", diff --git a/src/app/cli/src/app.rs b/src/app/cli/src/app.rs index fa26dee027..6032087185 100644 --- a/src/app/cli/src/app.rs +++ b/src/app/cli/src/app.rs @@ -479,14 +479,15 @@ pub fn configure_base_catalog( b.add::(); - // TODO: Unstub FlightSQL authentication - b.add_builder( - kamu_adapter_flight_sql::SessionAuthBasicPredefined::builder() - .with_accounts_passwords([("kamu".to_string(), "kamu".to_string())].into()), - ); - b.bind::(); + b.add::(); b.add::(); b.add::(); + b.add_value( + kamu_adapter_flight_sql::sql_info::default_sql_info() + .build() + .unwrap(), + ); + b.add::(); if tenancy_config == TenancyConfig::MultiTenant { b.add::(); @@ -505,6 +506,10 @@ pub fn configure_base_catalog( b.add::(); b.add::(); + b.add::(); + b.add::(); + b.add::(); + register_message_dispatcher::( &mut b, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, @@ -714,16 +719,15 @@ pub fn register_config_in_catalog( }); catalog_builder.add_value(kamu::utils::ipfs_wrapper::IpfsClient::default()); - catalog_builder.add_value( - config - .protocol - .as_ref() - .unwrap() - .flight_sql - .as_ref() - .unwrap() - .to_system(), - ); + let flight_sql_conf = config + .protocol + .as_ref() + .unwrap() + .flight_sql + .as_ref() + .unwrap(); + catalog_builder.add_value(flight_sql_conf.to_session_auth_config()); + catalog_builder.add_value(flight_sql_conf.to_session_caching_config()); if tenancy_config == TenancyConfig::MultiTenant { let mut implicit_user_config = PredefinedAccountsConfig::new(); diff --git a/src/app/cli/src/cli.rs b/src/app/cli/src/cli.rs index dcf1398b0c..3634b90480 100644 --- a/src/app/cli/src/cli.rs +++ b/src/app/cli/src/cli.rs @@ -814,6 +814,10 @@ pub struct Notebook { #[arg(long)] pub http_port: Option, + /// Engine type to use for the notebook + #[arg(long, value_name = "ENG", value_enum)] + pub engine: Option, + /// Propagate or set an environment variable in the notebook (e.g. 
`-e VAR` /// or `-e VAR=foo`) #[arg(long, short = 'e', value_name = "VAR")] @@ -1287,24 +1291,47 @@ pub enum SqlSubCommand { Server(SqlServer), } -/// Run JDBC server only +/// Runs an SQL engine in a server mode #[derive(Debug, clap::Args)] +#[command(after_help = r#" +**Examples:** + +By default runs the DataFusion engine exposing the FlightSQL protocol: + + kamu sql server + +To customize interface and port: + + kamu sql server --address 0.0.0.0 --port 50050 + +To run with Spark engine: + + kamu sql server --engine spark + +By default Spark runs with JDBC protocol, to instead run with Livy HTTP gateway: + + kamu sql server --engine spark --livy +"#)] pub struct SqlServer { - /// Expose JDBC server on specific network interface + /// Expose server on specific network interface #[arg(long)] pub address: Option, - /// Expose JDBC server on specific port + /// Expose server on specific port #[arg(long)] pub port: Option, - /// Run Livy server instead of Spark JDBC - #[arg(long)] - pub livy: bool, + /// Engine type to use for this server. + /// + /// Currently `datafusion` engine will expose Flight SQL endpoint, while + /// `spark` engine will expose either JDBC (default) or Livy endpoint (if + /// `--livy` flag is set). + #[arg(long, value_name = "ENG", value_enum)] + pub engine: Option, - /// Run Flight SQL server instead of Spark JDBC + /// Run Livy server instead of JDBC #[arg(long)] - pub flight_sql: bool, + pub livy: bool, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/app/cli/src/cli_commands.rs b/src/app/cli/src/cli_commands.rs index d089c4edea..535b30cefb 100644 --- a/src/app/cli/src/cli_commands.rs +++ b/src/app/cli/src/cli_commands.rs @@ -259,12 +259,14 @@ pub fn get_command( None::<&str>, )), cli::Command::Notebook(c) => Box::new(NotebookCommand::new( - cli_catalog.get_one()?, + base_catalog.get_one()?, + base_catalog.get_one()?, cli_catalog.get_one()?, cli_catalog.get_one()?, cli_catalog.get_one()?, c.address, c.http_port, + c.engine, c.env.unwrap_or_default(), )), cli::Command::Pull(c) => { @@ -385,39 +387,18 @@ pub fn get_command( c.output_path, c.records_per_file, )), - Some(cli::SqlSubCommand::Server(sc)) => { - if sc.livy { - Box::new(SqlServerLivyCommand::new( - cli_catalog.get_one()?, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - sc.address, - sc.port, - )) - } else if sc.flight_sql { - cfg_if::cfg_if! 
{ - if #[cfg(feature = "flight-sql")] { - Box::new(SqlServerFlightSqlCommand::new( - cli_catalog.clone(), - sc.address, - sc.port, - )) - } else { - return Err(CLIError::usage_error("Kamu was compiled without Flight SQL support")) - } - } - } else { - Box::new(SqlServerCommand::new( - cli_catalog.get_one()?, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - sc.address, - sc.port, - )) - } - } + Some(cli::SqlSubCommand::Server(sc)) => Box::new(SqlServerCommand::new( + base_catalog.get_one()?, + base_catalog.get_one()?, + cli_catalog.get_one()?, + cli_catalog.get_one()?, + cli_catalog.get_one()?, + cli_catalog.get_one()?, + sc.address, + sc.port, + sc.engine, + sc.livy, + )), }, cli::Command::System(c) => match c.subcommand { cli::SystemSubCommand::ApiServer(sc) => match sc.subcommand { diff --git a/src/app/cli/src/commands/common.rs b/src/app/cli/src/commands/common.rs index 2523cab2d4..cf93feff66 100644 --- a/src/app/cli/src/commands/common.rs +++ b/src/app/cli/src/commands/common.rs @@ -7,21 +7,25 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::Duration; use kamu::domain::PullImageListener; +use crate::OutputConfig; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct PullImageProgress { + output_config: Arc, image_purpose: &'static str, progress_bar: Mutex>, } impl PullImageProgress { - pub fn new(image_purpose: &'static str) -> Self { + pub fn new(output_config: Arc, image_purpose: &'static str) -> Self { Self { + output_config, image_purpose, progress_bar: Mutex::new(None), } @@ -30,6 +34,13 @@ impl PullImageProgress { impl PullImageListener for PullImageProgress { fn begin(&self, image: &str) { + if !self.output_config.is_tty + || self.output_config.verbosity_level != 0 + || self.output_config.quiet + { + return; + } + let s = indicatif::ProgressBar::new_spinner(); let style = indicatif::ProgressStyle::default_spinner() .template("{spinner:.cyan} {msg}") diff --git a/src/app/cli/src/commands/mod.rs b/src/app/cli/src/commands/mod.rs index d1a8c7f9bf..29e07d9521 100644 --- a/src/app/cli/src/commands/mod.rs +++ b/src/app/cli/src/commands/mod.rs @@ -42,9 +42,6 @@ mod reset_command; mod search_command; mod set_watermark_command; mod sql_server_command; -#[cfg(feature = "flight-sql")] -mod sql_server_flightsql_command; -mod sql_server_livy_command; mod sql_shell_command; mod system_api_server_gql_query_command; mod system_api_server_gql_schema_command; @@ -95,9 +92,6 @@ pub use reset_command::*; pub use search_command::*; pub use set_watermark_command::*; pub use sql_server_command::*; -#[cfg(feature = "flight-sql")] -pub use sql_server_flightsql_command::*; -pub use sql_server_livy_command::*; pub use sql_shell_command::*; pub use system_api_server_gql_query_command::*; pub use system_api_server_gql_schema_command::*; diff --git a/src/app/cli/src/commands/notebook_command.rs b/src/app/cli/src/commands/notebook_command.rs index 28225bda1d..715b6933d1 100644 --- a/src/app/cli/src/commands/notebook_command.rs +++ b/src/app/cli/src/commands/notebook_command.rs @@ -16,30 +16,32 @@ use container_runtime::ContainerRuntime; use internal_error::*; use super::common::PullImageProgress; -use super::{CLIError, Command}; -use crate::config::JupyterConfig; -use crate::explore::NotebookServerImpl; +use super::{CLIError, Command, SqlShellEngine}; +use 
crate::explore::{FlightSqlServiceFactory, NotebookServerFactory, SparkLivyServerFactory}; use crate::output::OutputConfig; -use crate::WorkspaceLayout; pub struct NotebookCommand { - workspace_layout: Arc, - jupyter_config: Arc, + flight_sql_service_factory: Arc, + spark_livy_server_factory: Arc, + notebook_server_factory: Arc, output_config: Arc, container_runtime: Arc, address: Option, port: Option, + engine: Option, env_vars: Vec<(String, Option)>, } impl NotebookCommand { pub fn new( - workspace_layout: Arc, - jupyter_config: Arc, + flight_sql_service_factory: Arc, + spark_livy_server_factory: Arc, + notebook_server_factory: Arc, output_config: Arc, container_runtime: Arc, address: Option, port: Option, + engine: Option, env_vars: Iter, ) -> Self where @@ -47,12 +49,14 @@ impl NotebookCommand { Str: AsRef, { Self { - workspace_layout, - jupyter_config, + flight_sql_service_factory, + spark_livy_server_factory, + notebook_server_factory, output_config, container_runtime, address, port, + engine, env_vars: env_vars .into_iter() .map(|elem| { @@ -68,16 +72,9 @@ impl NotebookCommand { .collect(), } } -} - -#[async_trait::async_trait(?Send)] -impl Command for NotebookCommand { - async fn run(&mut self) -> Result<(), CLIError> { - let notebook_server = - NotebookServerImpl::new(self.container_runtime.clone(), self.jupyter_config.clone()); - let environment_vars = self - .env_vars + fn collect_env_vars(&self) -> Result, CLIError> { + self.env_vars .iter() .map(|(name, value)| { value @@ -88,15 +85,11 @@ impl Command for NotebookCommand { }) .map(|v| (name.to_owned(), v)) }) - .collect::, _>>()?; - - let spinner = if self.output_config.verbosity_level == 0 && !self.output_config.quiet { - let pull_progress = PullImageProgress::new("container"); - notebook_server - .ensure_images(&pull_progress) - .await - .int_err()?; + .collect::, _>>() + } + fn startup_spinner(&self) -> Option { + if self.output_config.verbosity_level == 0 && !self.output_config.quiet { let s = indicatif::ProgressBar::new_spinner(); let style = indicatif::ProgressStyle::default_spinner() .template("{spinner:.cyan} {msg}") @@ -107,14 +100,41 @@ impl Command for NotebookCommand { Some(s) } else { None - }; + } + } - notebook_server - .run( - &self.workspace_layout.datasets_dir, - &self.workspace_layout.run_info_dir, + async fn run_datafusion(&mut self) -> Result<(), CLIError> { + let environment_vars = self.collect_env_vars()?; + + let pull_progress = PullImageProgress::new(self.output_config.clone(), "Jupyter"); + + self.notebook_server_factory + .ensure_image(Some(&pull_progress)) + .await + .int_err()?; + + let spinner = self.startup_spinner(); + + // FIXME: We have to bind FlightSQL to 0.0.0.0 external interface instead of + // 127.0.0.1 as Jupyter will be connection from the outside + let flight_sql_svc = self + .flight_sql_service_factory + .start(Some(std::net::Ipv4Addr::UNSPECIFIED.into()), None) + .await?; + + let client_url = url::Url::parse(&format!( + "grpc://host.docker.internal:{}", + flight_sql_svc.local_addr().port() + )) + .unwrap(); + + let mut notebook_container = self + .notebook_server_factory + .start( + &client_url, self.address, self.port, + None, environment_vars, self.output_config.verbosity_level > 0, move |url| { @@ -129,10 +149,122 @@ impl Command for NotebookCommand { eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); let _ = webbrowser::open(url); }, - || eprintln!("{}", s("Shutting down").yellow()), ) .await .int_err()?; + + tokio::select! 
{ + _ = container_runtime::signal::graceful_stop() => { + eprintln!("{}", s("Shutting down").yellow()); + } + _ = flight_sql_svc.wait() => { + tracing::warn!("FlightSQL server terminated"); + eprintln!("{}", s("FlightSQL server terminated").yellow()); + } + exit_status = notebook_container.wait() => { + tracing::warn!(?exit_status, "Notebook server terminated"); + eprintln!("{}", s("Notebook server terminated").yellow()); + } + } + + notebook_container.terminate().await.int_err()?; + Ok(()) } + + async fn run_spark(&mut self) -> Result<(), CLIError> { + let environment_vars = self.collect_env_vars()?; + + // Pull images + self.spark_livy_server_factory + .ensure_image(Some(&PullImageProgress::new( + self.output_config.clone(), + "Spark", + ))) + .await + .int_err()?; + + self.notebook_server_factory + .ensure_image(Some(&PullImageProgress::new( + self.output_config.clone(), + "Jupyter", + ))) + .await + .int_err()?; + + // Start containers on one network + let spinner = self.startup_spinner(); + + let network = self + .container_runtime + .create_random_network_with_prefix("kamu-") + .await + .int_err()?; + + let mut livy = self + .spark_livy_server_factory + .start( + None, + None, + self.output_config.verbosity_level > 0, + Some(network.name()), + ) + .await + .int_err()?; + + let mut notebook = self + .notebook_server_factory + .start( + &url::Url::parse("http://kamu-livy:8998").unwrap(), + self.address, + self.port, + Some(network.name()), + environment_vars, + self.output_config.verbosity_level > 0, + move |url| { + if let Some(s) = spinner { + s.finish_and_clear(); + } + eprintln!( + "{}\n {}", + s("Jupyter server is now running at:").green().bold(), + s(url).bold(), + ); + eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); + let _ = webbrowser::open(url); + }, + ) + .await + .int_err()?; + + tokio::select! 
{ + _ = container_runtime::signal::graceful_stop() => { + eprintln!("{}", s("Shutting down").yellow()); + }, + exit_status = livy.wait() => { + tracing::warn!(?exit_status, "Livy container exited"); + }, + exit_status = notebook.wait() => { + tracing::warn!(?exit_status, "Jupyter container exited"); + }, + } + + notebook.terminate().await.int_err()?; + livy.terminate().await.int_err()?; + network.free().await.int_err()?; + + Ok(()) + } +} + +#[async_trait::async_trait(?Send)] +impl Command for NotebookCommand { + async fn run(&mut self) -> Result<(), CLIError> { + let engine = self.engine.unwrap_or(SqlShellEngine::Datafusion); + + match engine { + SqlShellEngine::Datafusion => self.run_datafusion().await, + SqlShellEngine::Spark => self.run_spark().await, + } + } } diff --git a/src/app/cli/src/commands/sql_server_command.rs b/src/app/cli/src/commands/sql_server_command.rs index 0b22383621..f61731152b 100644 --- a/src/app/cli/src/commands/sql_server_command.rs +++ b/src/app/cli/src/commands/sql_server_command.rs @@ -17,66 +17,103 @@ use internal_error::*; use kamu::*; use super::common::PullImageProgress; -use super::{CLIError, Command}; -use crate::explore::SqlShellImpl; +use super::{CLIError, Command, SqlShellEngine}; +use crate::explore::{FlightSqlServiceFactory, SparkLivyServerFactory, SqlShellImpl}; use crate::output::*; use crate::WorkspaceLayout; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub struct SqlServerCommand { + flight_sql_service_factory: Arc, + spark_livy_server_factory: Arc, workspace_layout: Arc, engine_prov_config: Arc, output_config: Arc, container_runtime: Arc, address: Option, port: Option, + engine: Option, + livy: bool, } impl SqlServerCommand { pub fn new( + flight_sql_service_factory: Arc, + spark_livy_server_factory: Arc, workspace_layout: Arc, engine_prov_config: Arc, output_config: Arc, container_runtime: Arc, address: Option, port: Option, + engine: Option, + livy: bool, ) -> Self { Self { + flight_sql_service_factory, + spark_livy_server_factory, workspace_layout, engine_prov_config, output_config, container_runtime, address, port, + engine, + livy, } } -} - -#[async_trait::async_trait(?Send)] -impl Command for SqlServerCommand { - async fn run(&mut self) -> Result<(), CLIError> { - let sql_shell = SqlShellImpl::new( - self.container_runtime.clone(), - self.engine_prov_config.spark_image.clone(), - ); - - let spinner = if self.output_config.verbosity_level == 0 && !self.output_config.quiet { - let mut pull_progress = PullImageProgress::new("engine"); - sql_shell - .ensure_images(&mut pull_progress) - .await - .int_err()?; + fn startup_spinner(&self, message: &str) -> Option { + if self.output_config.verbosity_level == 0 && !self.output_config.quiet { let s = indicatif::ProgressBar::new_spinner(); let style = indicatif::ProgressStyle::default_spinner() .template("{spinner:.cyan} {msg}") .unwrap(); s.set_style(style); - s.set_message("Starting SQL server"); + s.set_message(message.to_string()); s.enable_steady_tick(Duration::from_millis(100)); Some(s) } else { None - }; + } + } + + async fn run_datafusion_flight_sql(&mut self) -> Result<(), CLIError> { + let flight_sql_svc = self + .flight_sql_service_factory + .start(self.address, self.port) + .await?; + + eprintln!( + "{} {}", + s("Flight SQL server is now running on:").green().bold(), + s(format!("{}", flight_sql_svc.local_addr())).bold(), + ); + eprintln!( + "{}", + s("Protocol documentation: 
https://docs.kamu.dev/node/protocols/flight-sql/").dim() + ); + eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); + + flight_sql_svc.wait().await.int_err()?; + + Ok(()) + } + + async fn run_spark_jdbc(&mut self) -> Result<(), CLIError> { + let sql_shell = SqlShellImpl::new( + self.container_runtime.clone(), + self.engine_prov_config.spark_image.clone(), + ); + + let mut pull_progress = PullImageProgress::new(self.output_config.clone(), "engine"); + sql_shell + .ensure_images(&mut pull_progress) + .await + .int_err()?; + + let spinner = self.startup_spinner("Starting Spark JDBC server"); let address = self.address.unwrap_or("127.0.0.1".parse().unwrap()); let port = self.port.unwrap_or(10000); @@ -95,14 +132,96 @@ impl Command for SqlServerCommand { s.finish_and_clear(); } eprintln!( - "{}\n {}", - s("SQL server is now running at:").green().bold(), + "{} {}", + s("Spark JDBC server is now running on:").green().bold(), s(format!("jdbc:hive2://{address}:{port}")).bold(), ); + eprintln!( + "{}", + s("Protocol documentation: https://docs.kamu.dev/node/protocols/jdbc/").dim() + ); eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); - spark.wait().await?; + tokio::select! { + _ = container_runtime::signal::graceful_stop() => { + eprintln!("{}", s("Shutting down").yellow()); + spark.terminate().await?; + }, + exit_status = spark.wait() => { + exit_status?; + eprintln!("{}", s("Container exited").yellow()); + }, + } Ok(()) } + + async fn run_spark_livy(&mut self) -> Result<(), CLIError> { + let pull_progress = PullImageProgress::new(self.output_config.clone(), "engine"); + self.spark_livy_server_factory + .ensure_image(Some(&pull_progress)) + .await + .int_err()?; + + let spinner = self.startup_spinner("Starting Spark Livy server"); + + let mut livy = self + .spark_livy_server_factory + .start( + self.address, + self.port, + self.output_config.verbosity_level > 0, + None, + ) + .await + .int_err()?; + + livy.wait_for_socket(std::time::Duration::from_secs(30)) + .await + .int_err()?; + + if let Some(s) = spinner { + s.finish_and_clear(); + } + eprintln!( + "{} {}", + s("Spark Livy server is now running on:").green().bold(), + s(format!("http://{}", livy.local_addr())).bold(), + ); + eprintln!( + "{}", + s("This protocol is deprecated and will be replaced by FlightSQL").yellow() + ); + eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); + + tokio::select! { + _ = container_runtime::signal::graceful_stop() => { + eprintln!("{}", s("Shutting down").yellow()); + livy.terminate().await?; + }, + exit_status = livy.wait() => { + exit_status?; + eprintln!("{}", s("Container exited").yellow()); + }, + } + + Ok(()) + } +} + +#[async_trait::async_trait(?Send)] +impl Command for SqlServerCommand { + async fn run(&mut self) -> Result<(), CLIError> { + let engine = self.engine.unwrap_or(if !self.livy { + SqlShellEngine::Datafusion + } else { + SqlShellEngine::Spark + }); + + match engine { + SqlShellEngine::Datafusion => self.run_datafusion_flight_sql().await, + SqlShellEngine::Spark if self.livy => self.run_spark_livy().await, + SqlShellEngine::Spark => self.run_spark_jdbc().await, + } + } } diff --git a/src/app/cli/src/commands/sql_server_flightsql_command.rs b/src/app/cli/src/commands/sql_server_flightsql_command.rs deleted file mode 100644 index 3583d7b3b5..0000000000 --- a/src/app/cli/src/commands/sql_server_flightsql_command.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
-// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::net::IpAddr; - -use arrow_flight::flight_service_server::FlightServiceServer; -use console::style as s; -use internal_error::*; -use tokio::net::TcpListener; -use tonic::transport::Server; - -use super::{CLIError, Command}; - -pub struct SqlServerFlightSqlCommand { - catalog: dill::Catalog, - address: Option, - port: Option, -} - -impl SqlServerFlightSqlCommand { - pub fn new(catalog: dill::Catalog, address: Option, port: Option) -> Self { - Self { - catalog, - address, - port, - } - } -} - -#[async_trait::async_trait(?Send)] -impl Command for SqlServerFlightSqlCommand { - async fn run(&mut self) -> Result<(), CLIError> { - let listener = TcpListener::bind(( - self.address.unwrap_or("127.0.0.1".parse().unwrap()), - self.port.unwrap_or(0), - )) - .await - .unwrap(); - - let addr = listener.local_addr().unwrap(); - tracing::info!("Listening on {addr:?}"); - - eprintln!( - "{} {}", - s("Flight SQL server is now running on:").green().bold(), - s(addr).bold(), - ); - eprintln!( - "{}", - s(format!( - indoc::indoc!( - r#" - To connect via JDBC: - - Get latest driver from https://central.sonatype.com/artifact/org.apache.arrow/flight-sql-jdbc-driver - - Install driver in your client application - - Connect using URL: jdbc:arrow-flight-sql://{}?useEncryption=false - - Use 'kamu' as login and password"# - ), - addr - )).yellow() - ); - eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); - - // This catalog will be attached to every request by the middleware layer - let catalog = self.catalog.clone(); - - Server::builder() - .layer(tonic::service::interceptor( - move |mut req: tonic::Request<()>| { - req.extensions_mut().insert(catalog.clone()); - Ok(req) - }, - )) - .add_service(FlightServiceServer::new( - kamu_adapter_flight_sql::KamuFlightSqlService::builder() - .with_server_name(crate::BINARY_NAME, crate::VERSION) - .build(), - )) - .serve_with_incoming(tokio_stream::wrappers::TcpListenerStream::new(listener)) - .await - .int_err()?; - - Ok(()) - } -} diff --git a/src/app/cli/src/commands/sql_server_livy_command.rs b/src/app/cli/src/commands/sql_server_livy_command.rs deleted file mode 100644 index 402b7617ef..0000000000 --- a/src/app/cli/src/commands/sql_server_livy_command.rs +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use std::net::IpAddr; -use std::sync::Arc; -use std::time::Duration; - -use console::style as s; -use container_runtime::ContainerRuntime; -use internal_error::*; - -use super::common::PullImageProgress; -use super::{CLIError, Command}; -use crate::config::JupyterConfig; -use crate::explore::LivyServerImpl; -use crate::output::*; -use crate::WorkspaceLayout; - -pub struct SqlServerLivyCommand { - workspace_layout: Arc, - jupyter_config: Arc, - output_config: Arc, - container_runtime: Arc, - address: Option, - port: Option, -} - -impl SqlServerLivyCommand { - pub fn new( - workspace_layout: Arc, - jupyter_config: Arc, - output_config: Arc, - container_runtime: Arc, - address: Option, - port: Option, - ) -> Self { - Self { - workspace_layout, - jupyter_config, - output_config, - container_runtime, - address, - port, - } - } -} - -#[async_trait::async_trait(?Send)] -impl Command for SqlServerLivyCommand { - async fn run(&mut self) -> Result<(), CLIError> { - let livy_server = LivyServerImpl::new( - self.container_runtime.clone(), - self.jupyter_config.livy_image.clone().unwrap(), - ); - - let spinner = if self.output_config.is_tty - && self.output_config.verbosity_level == 0 - && !self.output_config.quiet - { - let mut pull_progress = PullImageProgress::new("engine"); - livy_server - .ensure_images(&mut pull_progress) - .await - .int_err()?; - - let s = indicatif::ProgressBar::new_spinner(); - let style = indicatif::ProgressStyle::default_spinner() - .template("{spinner:.cyan} {msg}") - .unwrap(); - s.set_style(style); - s.set_message("Starting Livy server"); - s.enable_steady_tick(Duration::from_millis(100)); - Some(s) - } else { - None - }; - - let address = self.address.unwrap_or("127.0.0.1".parse().unwrap()); - let port = self.port.unwrap_or(10000); - let url = format!("{address}:{port}"); - - livy_server - .run( - &address.to_string(), - port, - &self.workspace_layout.datasets_dir, - &self.workspace_layout.run_info_dir, - self.output_config.verbosity_level > 0, - move || { - if let Some(s) = spinner { - s.finish_and_clear(); - } - eprintln!( - "{} {}", - s("Livy server is now running on:").green().bold(), - s(url).bold(), - ); - eprintln!("{}", s("Use Ctrl+C to stop the server").yellow()); - }, - ) - .await - .int_err()?; - Ok(()) - } -} diff --git a/src/app/cli/src/commands/sql_shell_command.rs b/src/app/cli/src/commands/sql_shell_command.rs index 9209b68ce1..00276ce792 100644 --- a/src/app/cli/src/commands/sql_shell_command.rs +++ b/src/app/cli/src/commands/sql_shell_command.rs @@ -97,13 +97,13 @@ impl SqlShellCommand { self.engine_prov_config.spark_image.clone(), ); - let spinner = if self.output_config.verbosity_level == 0 && !self.output_config.quiet { - let mut pull_progress = PullImageProgress::new("container"); - sql_shell - .ensure_images(&mut pull_progress) - .await - .int_err()?; + let mut pull_progress = PullImageProgress::new(self.output_config.clone(), "container"); + sql_shell + .ensure_images(&mut pull_progress) + .await + .int_err()?; + let spinner = if self.output_config.verbosity_level == 0 && !self.output_config.quiet { let s = indicatif::ProgressBar::new_spinner(); let style = indicatif::ProgressStyle::default_spinner() .template("{spinner:.cyan} {msg}") diff --git a/src/app/cli/src/explore/flight_sql_service_factory.rs b/src/app/cli/src/explore/flight_sql_service_factory.rs new file mode 100644 index 0000000000..137f6979f3 --- /dev/null +++ b/src/app/cli/src/explore/flight_sql_service_factory.rs @@ -0,0 +1,92 @@ +// Copyright Kamu Data, Inc. and contributors. 
All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::assert_matches::assert_matches; +use std::future::Future; +use std::net::{IpAddr, SocketAddr}; +use std::pin::Pin; + +use arrow_flight::flight_service_server::FlightServiceServer; +use kamu_adapter_flight_sql::AuthenticationLayer; + +use crate::CLIError; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[dill::component] +pub struct FlightSqlServiceFactory { + catalog: dill::Catalog, +} + +impl FlightSqlServiceFactory { + pub async fn start( + &self, + address: Option, + port: Option, + ) -> Result { + assert_matches!( + self.catalog + .get_one::(), + Err(dill::InjectionError::Unregistered(_)), + "FlightSqlServiceFactory must be constructed from the base catalog" + ); + + let listener = tokio::net::TcpListener::bind(( + address.unwrap_or("127.0.0.1".parse().unwrap()), + port.unwrap_or(0), + )) + .await + .unwrap(); + + let address = listener.local_addr().unwrap(); + + // This catalog will be attached to every request by the middleware layer + let catalog = self.catalog.clone(); + + let server_future = tonic::transport::Server::builder() + .layer(observability::tonic::grpc_layer()) + .layer(tonic::service::interceptor( + move |mut req: tonic::Request<()>| { + req.extensions_mut().insert(catalog.clone()); + Ok(req) + }, + )) + .layer(AuthenticationLayer::new()) + .add_service(FlightServiceServer::new( + kamu_adapter_flight_sql::KamuFlightSqlServiceWrapper, + )) + .serve_with_incoming(tokio_stream::wrappers::TcpListenerStream::new(listener)); + + tracing::info!("FlightSQL is listening on {:?}", address); + + Ok(FlightSqlService { + address, + server_future: Box::pin(server_future), + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct FlightSqlService { + address: SocketAddr, + server_future: Pin>>>, +} + +impl FlightSqlService { + pub fn local_addr(&self) -> &SocketAddr { + &self.address + } + + pub async fn wait(self) -> Result<(), tonic::transport::Error> { + self.server_future.await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/app/cli/src/explore/livy_server_impl.rs b/src/app/cli/src/explore/livy_server_impl.rs deleted file mode 100644 index 4b9670a7c1..0000000000 --- a/src/app/cli/src/explore/livy_server_impl.rs +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use std::fs::File; -use std::path::{Path, PathBuf}; -use std::process::Stdio; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; -use std::time::Duration; - -use container_runtime::*; -use internal_error::*; - -use crate::error::{CommandRunError, SubprocessError}; - -pub struct LivyServerImpl { - container_runtime: Arc, - image: String, -} - -impl LivyServerImpl { - pub fn new(container_runtime: Arc, image: String) -> Self { - Self { - container_runtime, - image, - } - } - - pub async fn ensure_images( - &self, - listener: &mut dyn PullImageListener, - ) -> Result<(), ImagePullError> { - self.container_runtime - .ensure_image(&self.image, Some(listener)) - .await - } - - pub async fn run( - &self, - addr: &str, - host_port: u16, - datasets_dir: &PathBuf, - run_info_dir: &Path, - inherit_stdio: bool, - on_started: StartedClb, - ) -> Result<(), CommandRunError> - where - StartedClb: FnOnce() + Send + 'static, - { - const LIVY_PORT: u16 = 8998; - - let livy_stdout_path = run_info_dir.join("livy.out.txt"); - let livy_stderr_path = run_info_dir.join("livy.err.txt"); - - let mut livy = self - .container_runtime - .run_attached(&self.image) - .random_container_name_with_prefix("kamu-livy-") - .entry_point("/opt/livy/bin/livy-server") - .user("root") - .map_port_with_address(addr, host_port, LIVY_PORT) - .work_dir("/opt/bitnami/spark/work-dir") - .volume((&datasets_dir, "/opt/bitnami/spark/work-dir")) - .stdout(if inherit_stdio { - Stdio::inherit() - } else { - Stdio::from(File::create(&livy_stdout_path).int_err()?) - }) - .stderr(if inherit_stdio { - Stdio::inherit() - } else { - Stdio::from(File::create(&livy_stderr_path).int_err()?) - }) - .spawn() - .map_err(|err| { - CommandRunError::SubprocessError(SubprocessError::new( - vec![livy_stderr_path, livy_stdout_path], - err, - )) - })?; - - livy.wait_for_host_socket(LIVY_PORT, Duration::from_secs(60)) - .await - .int_err()?; - - on_started(); - - let exit = Arc::new(AtomicBool::new(false)); - signal_hook::flag::register(libc::SIGINT, exit.clone()).int_err()?; - signal_hook::flag::register(libc::SIGTERM, exit.clone()).int_err()?; - - while !exit.load(Ordering::Relaxed) { - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - - livy.terminate().await.int_err()?; - Ok(()) - } -} diff --git a/src/app/cli/src/explore/mod.rs b/src/app/cli/src/explore/mod.rs index 1b3db34885..387a1c5651 100644 --- a/src/app/cli/src/explore/mod.rs +++ b/src/app/cli/src/explore/mod.rs @@ -8,24 +8,21 @@ // by the Apache License, Version 2.0. 
mod api_server; -pub use api_server::*; - -mod livy_server_impl; -pub use livy_server_impl::*; - -mod notebook_server_impl; -pub use notebook_server_impl::*; - +mod flight_sql_service_factory; +mod notebook_server_factory; +mod spark_livy_server_factory; mod sql_shell_impl; -pub use sql_shell_impl::*; - +mod trace_server; mod ui_configuration; -pub(crate) use ui_configuration::*; - #[cfg(feature = "web-ui")] mod web_ui_server; -#[cfg(feature = "web-ui")] -pub use web_ui_server::*; -mod trace_server; +pub use api_server::*; +pub use flight_sql_service_factory::*; +pub use notebook_server_factory::*; +pub use spark_livy_server_factory::*; +pub use sql_shell_impl::*; pub use trace_server::*; +pub(crate) use ui_configuration::*; +#[cfg(feature = "web-ui")] +pub use web_ui_server::*; diff --git a/src/app/cli/src/explore/notebook_server_factory.rs b/src/app/cli/src/explore/notebook_server_factory.rs new file mode 100644 index 0000000000..38604f3310 --- /dev/null +++ b/src/app/cli/src/explore/notebook_server_factory.rs @@ -0,0 +1,280 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::net::IpAddr; +use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::sync::Arc; +use std::time::Duration; + +use container_runtime::*; +use internal_error::*; + +use crate::config::JupyterConfig; +use crate::error::{CommandRunError, SubprocessError}; +use crate::WorkspaceLayout; + +#[dill::component] +pub struct NotebookServerFactory { + container_runtime: Arc, + jupyter_config: Arc, + workspace_layout: Arc, +} + +impl NotebookServerFactory { + pub async fn ensure_image( + &self, + listener: Option<&dyn PullImageListener>, + ) -> Result<(), ImagePullError> { + self.container_runtime + .ensure_image(self.jupyter_config.image.as_ref().unwrap(), listener) + .await?; + + Ok(()) + } + + pub async fn start( + &self, + client_url: &url::Url, + address: Option, + port: Option, + network: Option<&str>, + environment_vars: Vec<(String, String)>, + inherit_stdio: bool, + on_started: StartedClb, + ) -> Result + where + StartedClb: FnOnce(&str) + Send + 'static, + { + let container_port = 8080; + + let cwd = Path::new(".").canonicalize().unwrap(); + + let stdout_path = self.workspace_layout.run_info_dir.join("jupyter.out.txt"); + let stderr_path = self.workspace_layout.run_info_dir.join("jupyter.err.txt"); + + let mut contaniner = self + .container_runtime + .run_attached(self.jupyter_config.image.as_ref().unwrap()) + .random_container_name_with_prefix("kamu-jupyter-") + .user("root") + // Start jupyter under root which suits better for rootless podman + // See: https://github.com/jupyter/docker-stacks/pull/2039 + .environment_vars([ + ("NB_USER", "root"), + ("NB_UID", "0"), + ("NB_GID", "0"), + ("KAMU_CLIENT_URL", client_url.as_str()), + ]) + .work_dir("/opt/workdir") + .map(network, container_runtime::ContainerRunCommand::network) + .extra_host(("host.docker.internal", "host-gateway")) + .map_or( + address, + |c, address| { + c.map_port_with_address( + address.to_string(), + port.expect("Must specify port"), + container_port, + ) + }, + |c| c.map_port(port.unwrap_or(0), container_port), + ) + .volume((&cwd, "/opt/workdir")) + .environment_vars(environment_vars) + .args([ + "jupyter".to_owned(), + 
"lab".to_owned(), + "--allow-root".to_owned(), + "--ip".to_owned(), + "0.0.0.0".to_string(), + "--port".to_owned(), + container_port.to_string(), + ]) + .stdout(if inherit_stdio { + Stdio::inherit() + } else { + Stdio::from(std::fs::File::create(&stdout_path).int_err()?) + }) + .stderr(Stdio::piped()) + .spawn() + .int_err()?; + + let port = contaniner + .wait_for_host_port(container_port, Duration::from_secs(30)) + .await + .int_err()?; + + // FIXME: When used with FlightSQL we have notebook container communicating with + // FlightSQL server running on the host using the special `host.docker.internal` + // bridge. This however causes host traffic use external interface instead of + // localhost and may result in firewalls trapping it. Here we perform a sanity + // check that client address is reachable from the Jupyter container to display + // a nice error message. + if client_url.as_str().contains("host.docker.internal") { + let cmd = format!( + "nc -zv -w 5 {} {}", + client_url.host_str().unwrap(), + client_url.port().unwrap(), + ); + + tracing::info!(cmd, "Checking Jupyter to data server connectivity"); + + let status = contaniner + .exec_shell_cmd(ExecArgs::default(), cmd.clone()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .await + .int_err()?; + + if !status.success() { + return Err(indoc::formatdoc!( + r#" + Connection test from Jupyter container failed: + {cmd} + + Because Jupyter container is trying to reach the FlightSQL port on he host the traffic is routed through external interface and may be affected by the firewall. Consider checking your firewall rules and allowing traffic from private subnet. If the error doesn't go away - please submit and bug report! + "# + ) + .trim() + .int_err() + .into()); + } + } + + let host_addr = self.container_runtime.get_runtime_host_addr(); + + let token_clb = move |token: &str| { + let url = format!("http://{host_addr}:{port}/?token={token}"); + on_started(&url); + }; + + let token_extractor = if inherit_stdio { + TokenExtractor::new( + contaniner.take_stderr().unwrap(), + tokio::io::stderr(), + Some(token_clb), + ) + } else { + TokenExtractor::new( + contaniner.take_stderr().unwrap(), + tokio::fs::File::create(&stderr_path).await.map_err(|err| { + CommandRunError::SubprocessError(SubprocessError::new( + vec![stderr_path, stdout_path], + err, + )) + })?, + Some(token_clb), + ) + }; + + Ok(NotebookContainer { + contaniner, + token_extractor, + container_runtime: self.container_runtime.clone(), + chown_image: self.jupyter_config.image.clone().unwrap(), + chown_dir: cwd, + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// NotebookContainer +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Wraps notebook container to perform post-exit permission fix-up +pub struct NotebookContainer { + contaniner: ContainerProcess, + token_extractor: TokenExtractor, + container_runtime: Arc, + chown_image: String, + chown_dir: PathBuf, +} + +impl NotebookContainer { + pub async fn wait(&mut self) -> std::io::Result { + self.contaniner.wait().await + } + + pub async fn terminate(mut self) -> Result<(), CommandRunError> { + self.contaniner.terminate().await.int_err()?; + self.token_extractor.handle.await.int_err()?; + + // Fix permissions + if self.container_runtime.config.runtime == ContainerRuntimeType::Docker { + cfg_if::cfg_if! 
{ + if #[cfg(unix)] { + self.container_runtime + .run_attached(&self.chown_image) + .random_container_name_with_prefix("kamu-jupyter-permissions-") + .shell_cmd(format!( + "chown -Rf {}:{} {}", + unsafe { libc::geteuid() }, + unsafe { libc::getegid() }, + "/opt/workdir" + )) + .user("root") + .volume((&self.chown_dir, "/opt/workdir")) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .await + .int_err()?; + } + } + } + + Ok(()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// TokenExtractor +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Proxies STDOUT of the notebook server to fetch the authorization token +struct TokenExtractor { + handle: tokio::task::JoinHandle<()>, +} + +impl TokenExtractor { + fn new(input: R, mut output: W, mut on_token: Option) -> Self + where + R: tokio::io::AsyncRead + Unpin + Send + 'static, + W: tokio::io::AsyncWrite + Unpin + Send + 'static, + Clb: FnOnce(&str) + Send + 'static, + { + use tokio::io::{AsyncBufReadExt, AsyncWriteExt}; + + let handle = tokio::spawn(async move { + let re = regex::Regex::new("token=([a-z0-9]+)").unwrap(); + let mut reader = tokio::io::BufReader::new(input); + let mut line = String::with_capacity(1024); + loop { + line.clear(); + if reader.read_line(&mut line).await.unwrap() == 0 { + output.flush().await.unwrap(); + break; + } + + output.write_all(line.as_bytes()).await.unwrap(); + + if let Some(capture) = re.captures(&line) { + if let Some(clb) = on_token.take() { + let token = capture.get(1).unwrap().as_str(); + clb(token); + } + } + } + }); + + Self { handle } + } +} diff --git a/src/app/cli/src/explore/notebook_server_impl.rs b/src/app/cli/src/explore/notebook_server_impl.rs deleted file mode 100644 index 075b3c837e..0000000000 --- a/src/app/cli/src/explore/notebook_server_impl.rs +++ /dev/null @@ -1,274 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use std::net::{IpAddr, Ipv4Addr}; -use std::path::{Path, PathBuf}; -use std::process::Stdio; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; -use std::time::Duration; - -use container_runtime::*; -use internal_error::*; - -use crate::config::JupyterConfig; -use crate::error::{CommandRunError, SubprocessError}; - -pub struct NotebookServerImpl { - container_runtime: Arc, - jupyter_config: Arc, -} - -impl NotebookServerImpl { - pub fn new( - container_runtime: Arc, - jupyter_config: Arc, - ) -> Self { - Self { - container_runtime, - jupyter_config, - } - } - - pub async fn ensure_images( - &self, - listener: &dyn PullImageListener, - ) -> Result<(), ImagePullError> { - self.container_runtime - .ensure_image( - self.jupyter_config.livy_image.as_ref().unwrap(), - Some(listener), - ) - .await?; - - self.container_runtime - .ensure_image(self.jupyter_config.image.as_ref().unwrap(), Some(listener)) - .await?; - - Ok(()) - } - - pub async fn run( - &self, - datasets_dir: &PathBuf, - run_info_dir: &Path, - address: Option, - port: Option, - environment_vars: Vec<(String, String)>, - inherit_stdio: bool, - on_started: StartedClb, - on_shutdown: ShutdownClb, - ) -> Result<(), CommandRunError> - where - StartedClb: FnOnce(&str) + Send + 'static, - ShutdownClb: FnOnce() + Send + 'static, - { - assert!( - address.is_none(), - "Exposing Notebook server on a host network interface is not yet supported" - ); - - let network = self - .container_runtime - .create_random_network_with_prefix("kamu-") - .await - .int_err()?; - let network_name = network.name(); - - let cwd = Path::new(".").canonicalize().unwrap(); - - let livy_stdout_path = run_info_dir.join("livy.out.txt"); - let livy_stderr_path = run_info_dir.join("livy.err.txt"); - let jupyter_stdout_path = run_info_dir.join("jupyter.out.txt"); - let jupyter_stderr_path = run_info_dir.join("jupyter.err.txt"); - - let mut livy = self - .container_runtime - .run_attached(self.jupyter_config.livy_image.as_ref().unwrap()) - .random_container_name_with_prefix("kamu-livy-") - .hostname("kamu-livy") - .network(network_name) - .user("root") - .work_dir("/opt/bitnami/spark/work-dir") - .volume((&datasets_dir, "/opt/bitnami/spark/work-dir")) - .entry_point("/opt/livy/bin/livy-server") - .stdout(if inherit_stdio { - Stdio::inherit() - } else { - Stdio::from(std::fs::File::create(&livy_stdout_path).int_err()?) - }) - .stderr(if inherit_stdio { - Stdio::inherit() - } else { - Stdio::from(std::fs::File::create(&livy_stderr_path).int_err()?) 
- }) - .spawn() - .map_err(|err| { - CommandRunError::SubprocessError(SubprocessError::new( - vec![livy_stderr_path, livy_stdout_path], - err, - )) - })?; - - let jupyter_port_in_container = port.unwrap_or(8080); - let mut jupyter = self - .container_runtime - .run_attached(self.jupyter_config.image.as_ref().unwrap()) - .random_container_name_with_prefix("kamu-jupyter-") - .network(network_name) - .user("root") - // Start jupyter under root which suits better for rootless podman - // See: https://github.com/jupyter/docker-stacks/pull/2039 - .environment_vars([("NB_USER", "root"), ("NB_UID", "0"), ("NB_GID", "0")]) - .work_dir("/opt/workdir") - .expose_port(jupyter_port_in_container) - .volume((&cwd, "/opt/workdir")) - .environment_vars(environment_vars) - .args([ - "jupyter".to_owned(), - "notebook".to_owned(), - "--allow-root".to_owned(), - "--ip".to_owned(), - address - .unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED)) - .to_string(), - "--port".to_owned(), - jupyter_port_in_container.to_string(), - // TODO: Remove show_banner option after Sparkmagic supports notebook >= 7.0.0 - // See: https://github.com/jupyter-incubator/sparkmagic/issues/885 - "--NotebookApp.show_banner=False".to_string(), - ]) - .stdout(if inherit_stdio { - Stdio::inherit() - } else { - Stdio::from(std::fs::File::create(&jupyter_stdout_path).int_err()?) - }) - .stderr(Stdio::piped()) - .spawn() - .int_err()?; - - let docker_host = self.container_runtime.get_runtime_host_addr(); - let jupyter_port = jupyter - .wait_for_host_socket(jupyter_port_in_container, Duration::from_secs(30)) - .await - .int_err()?; - - let token_clb = move |token: &str| { - let url = format!("http://{docker_host}:{jupyter_port}/?token={token}"); - on_started(&url); - }; - - let token_extractor = if inherit_stdio { - TokenExtractor::new( - jupyter.take_stderr().unwrap(), - tokio::io::stderr(), - Some(token_clb), - ) - } else { - TokenExtractor::new( - jupyter.take_stderr().unwrap(), - tokio::fs::File::create(&jupyter_stderr_path) - .await - .map_err(|err| { - CommandRunError::SubprocessError(SubprocessError::new( - vec![jupyter_stderr_path, jupyter_stdout_path], - err, - )) - })?, - Some(token_clb), - ) - }; - - let exit = Arc::new(AtomicBool::new(false)); - signal_hook::flag::register(libc::SIGINT, exit.clone()).int_err()?; - signal_hook::flag::register(libc::SIGTERM, exit.clone()).int_err()?; - - // TODO: Detect crashed processes - // Relying on shell to send signal to child processes - while !exit.load(Ordering::Relaxed) { - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - - on_shutdown(); - - jupyter.terminate().await.int_err()?; - livy.terminate().await.int_err()?; - network.free().await.int_err()?; - token_extractor.handle.await.int_err()?; - - // Fix permissions - if self.container_runtime.config.runtime == ContainerRuntimeType::Docker { - cfg_if::cfg_if! 
{ - if #[cfg(unix)] { - self.container_runtime - .run_attached(self.jupyter_config.image.as_ref().unwrap()) - .random_container_name_with_prefix("kamu-jupyter-permissions-") - .shell_cmd(format!( - "chown -Rf {}:{} {}", - unsafe { libc::geteuid() }, - unsafe { libc::getegid() }, - "/opt/workdir" - )) - .user("root") - .volume((cwd, "/opt/workdir")) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .status() - .await - .int_err()?; - } - } - } - - Ok(()) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// TokenExtractor -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct TokenExtractor { - handle: tokio::task::JoinHandle<()>, -} - -impl TokenExtractor { - fn new(input: R, mut output: W, mut on_token: Option) -> Self - where - R: tokio::io::AsyncRead + Unpin + Send + 'static, - W: tokio::io::AsyncWrite + Unpin + Send + 'static, - Clb: FnOnce(&str) + Send + 'static, - { - use tokio::io::{AsyncBufReadExt, AsyncWriteExt}; - - let handle = tokio::spawn(async move { - let re = regex::Regex::new("token=([a-z0-9]+)").unwrap(); - let mut reader = tokio::io::BufReader::new(input); - let mut line = String::with_capacity(1024); - loop { - line.clear(); - if reader.read_line(&mut line).await.unwrap() == 0 { - output.flush().await.unwrap(); - break; - } - - output.write_all(line.as_bytes()).await.unwrap(); - - if let Some(capture) = re.captures(&line) { - if let Some(clb) = on_token.take() { - let token = capture.get(1).unwrap().as_str(); - clb(token); - } - } - } - }); - - Self { handle } - } -} diff --git a/src/app/cli/src/explore/spark_livy_server_factory.rs b/src/app/cli/src/explore/spark_livy_server_factory.rs new file mode 100644 index 0000000000..9af456abbe --- /dev/null +++ b/src/app/cli/src/explore/spark_livy_server_factory.rs @@ -0,0 +1,153 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::net::{IpAddr, SocketAddr}; +use std::process::Stdio; +use std::sync::Arc; + +use container_runtime::*; +use internal_error::*; + +use crate::config::JupyterConfig; +use crate::{CommandRunError, SubprocessError, WorkspaceLayout}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[dill::component] +pub struct SparkLivyServerFactory { + container_runtime: Arc, + jupyter_config: Arc, + workspace_layout: Arc, +} + +impl SparkLivyServerFactory { + pub async fn ensure_image( + &self, + listener: Option<&dyn PullImageListener>, + ) -> Result<(), ImagePullError> { + self.container_runtime + .ensure_image(self.jupyter_config.livy_image.as_ref().unwrap(), listener) + .await?; + + Ok(()) + } + + pub async fn start( + &self, + address: Option, + port: Option, + inherit_stdio: bool, + network: Option<&str>, + ) -> Result { + const CONTAINER_PORT: u16 = 8998; + + let livy_stdout_path = self.workspace_layout.run_info_dir.join("livy.out.txt"); + let livy_stderr_path = self.workspace_layout.run_info_dir.join("livy.err.txt"); + + let container = self + .container_runtime + .run_attached(self.jupyter_config.livy_image.as_ref().unwrap()) + .random_container_name_with_prefix("kamu-livy-") + .hostname("kamu-livy") + .maybe(network.is_some(), |c| c.network(network.unwrap())) + .map_or( + address, + |c, address| { + c.map_port_with_address( + address.to_string(), + port.expect("Must specify port"), + CONTAINER_PORT, + ) + }, + |c| c.map_port(port.unwrap_or(0), CONTAINER_PORT), + ) + .user("root") + .work_dir("/opt/bitnami/spark/work-dir") + .volume(( + &self.workspace_layout.datasets_dir, + "/opt/bitnami/spark/work-dir", + )) + .entry_point("/opt/livy/bin/livy-server") + .stdout(if inherit_stdio { + Stdio::inherit() + } else { + Stdio::from(std::fs::File::create(&livy_stdout_path).int_err()?) + }) + .stderr(if inherit_stdio { + Stdio::inherit() + } else { + Stdio::from(std::fs::File::create(&livy_stderr_path).int_err()?) 
+ }) + .spawn() + .map_err(|err| { + CommandRunError::SubprocessError(SubprocessError::new( + vec![livy_stderr_path, livy_stdout_path], + err, + )) + })?; + + let host_port = container + .wait_for_host_port(CONTAINER_PORT, std::time::Duration::from_secs(30)) + .await + .int_err()?; + + let addr = format!( + "{}:{}", + self.container_runtime.get_runtime_host_addr(), + host_port + ) + .parse() + .int_err()?; + + Ok(LivyContainer { + container_runtime: self.container_runtime.clone(), + container, + addr, + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct LivyContainer { + container_runtime: Arc, + container: ContainerProcess, + addr: SocketAddr, +} + +impl LivyContainer { + pub fn local_addr(&self) -> &SocketAddr { + &self.addr + } + + pub async fn wait_for_socket( + &self, + timeout: std::time::Duration, + ) -> Result<(), WaitForResourceError> { + self.container_runtime + .wait_for_socket(self.addr.port(), timeout) + .await + } +} + +impl std::ops::Deref for LivyContainer { + type Target = ContainerProcess; + + fn deref(&self) -> &Self::Target { + &self.container + } +} + +impl std::ops::DerefMut for LivyContainer { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.container + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/app/cli/src/lib.rs b/src/app/cli/src/lib.rs index 7846629ff0..c0ce3f1dfe 100644 --- a/src/app/cli/src/lib.rs +++ b/src/app/cli/src/lib.rs @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +#![feature(assert_matches)] #![feature(box_patterns)] #![feature(exit_status_error)] #![feature(error_generic_member_access)] diff --git a/src/app/cli/src/services/config/models.rs b/src/app/cli/src/services/config/models.rs index 8f0975ec5c..ae38e60055 100644 --- a/src/app/cli/src/services/config/models.rs +++ b/src/app/cli/src/services/config/models.rs @@ -528,20 +528,34 @@ impl Default for IpfsConfig { #[derive(Debug, Clone, Merge, Serialize, Deserialize)] #[serde(deny_unknown_fields, rename_all = "camelCase")] pub struct FlightSqlConfig { + /// Whether clients can authenticate as 'anonymous' user + pub allow_anonymous: Option, + + /// Time after which FlightSQL client session will be forgotten and client + /// will have to re-authroize (for authenticated clients) + pub authed_session_expiration_timeout: Option, + + /// Time after which FlightSQL session context will be released to free the + /// resources (for authenticated clients) + pub authed_session_inactivity_timeout: Option, + /// Time after which FlightSQL client session will be forgotten and client - /// will have to re-authroize - pub session_expiration_timeout: Option, + /// will have to re-authroize (for anonymous clients) + pub anon_session_expiration_timeout: Option, /// Time after which FlightSQL session context will be released to free the - /// resources - pub session_inactivity_timeout: Option, + /// resources (for anonymous clients) + pub anon_session_inactivity_timeout: Option, } impl FlightSqlConfig { pub fn new() -> Self { Self { - session_expiration_timeout: None, - session_inactivity_timeout: None, + allow_anonymous: None, + authed_session_expiration_timeout: None, + authed_session_inactivity_timeout: None, + anon_session_expiration_timeout: None, + anon_session_inactivity_timeout: None, } } @@ -549,10 +563,24 @@ impl 
FlightSqlConfig { Self { ..Self::default() } } - pub fn to_system(&self) -> kamu_adapter_flight_sql::SessionCachingConfig { + pub fn to_session_auth_config(&self) -> kamu_adapter_flight_sql::SessionAuthConfig { + kamu_adapter_flight_sql::SessionAuthConfig { + allow_anonymous: self.allow_anonymous.unwrap(), + } + } + + pub fn to_session_caching_config(&self) -> kamu_adapter_flight_sql::SessionCachingConfig { kamu_adapter_flight_sql::SessionCachingConfig { - session_expiration_timeout: self.session_expiration_timeout.unwrap().into(), - session_inactivity_timeout: self.session_inactivity_timeout.unwrap().into(), + authed_session_expiration_timeout: self + .authed_session_expiration_timeout + .unwrap() + .into(), + authed_session_inactivity_timeout: self + .authed_session_inactivity_timeout + .unwrap() + .into(), + anon_session_expiration_timeout: self.anon_session_expiration_timeout.unwrap().into(), + anon_session_inactivity_timeout: self.anon_session_inactivity_timeout.unwrap().into(), } } } @@ -560,8 +588,19 @@ impl FlightSqlConfig { impl Default for FlightSqlConfig { fn default() -> Self { Self { - session_expiration_timeout: Some(DurationString::from_string("5m".to_owned()).unwrap()), - session_inactivity_timeout: Some(DurationString::from_string("5s".to_owned()).unwrap()), + allow_anonymous: Some(true), + authed_session_expiration_timeout: Some( + DurationString::from_string("30m".to_owned()).unwrap(), + ), + authed_session_inactivity_timeout: Some( + DurationString::from_string("5s".to_owned()).unwrap(), + ), + anon_session_expiration_timeout: Some( + DurationString::from_string("30m".to_owned()).unwrap(), + ), + anon_session_inactivity_timeout: Some( + DurationString::from_string("5s".to_owned()).unwrap(), + ), } } } diff --git a/src/app/cli/tests/tests/test_di_graph.rs b/src/app/cli/tests/tests/test_di_graph.rs index da654ebf80..99f2bae936 100644 --- a/src/app/cli/tests/tests/test_di_graph.rs +++ b/src/app/cli/tests/tests/test_di_graph.rs @@ -66,6 +66,7 @@ fn test_di_cli_graph_validates(tenancy_config: TenancyConfig) { cli_catalog_builder.add_value(CurrentAccountSubject::new_test()); cli_catalog_builder.add_value(JwtAuthenticationConfig::default()); cli_catalog_builder.add_value(GithubAuthenticationConfig::default()); + cli_catalog_builder.add_value(kamu_adapter_flight_sql::SessionId(String::new())); let validate_result = cli_catalog_builder.validate(); @@ -99,6 +100,7 @@ fn test_di_server_graph_validates(tenancy_config: TenancyConfig) { cli_catalog_builder.add_value(GithubAuthenticationConfig::default()); cli_catalog_builder.add_value(ServerUrlConfig::new_test(None)); cli_catalog_builder.add_value(AccessToken::new("some-test-token")); + cli_catalog_builder.add_value(kamu_adapter_flight_sql::SessionId(String::new())); // TODO: We should ensure this test covers parameters requested by commands and // types needed for GQL/HTTP adapter that are currently being constructed diff --git a/src/infra/core/src/utils/docker_images.rs b/src/infra/core/src/utils/docker_images.rs index ebf9936cea..c81b534ce3 100644 --- a/src/infra/core/src/utils/docker_images.rs +++ b/src/infra/core/src/utils/docker_images.rs @@ -13,7 +13,7 @@ pub const DATAFUSION: &str = "ghcr.io/kamu-data/engine-datafusion:0.8.1"; pub const RISINGWAVE: &str = "ghcr.io/kamu-data/engine-risingwave:0.2.0-risingwave_1.7.0-alpha"; pub const LIVY: &str = SPARK; -pub const JUPYTER: &str = "ghcr.io/kamu-data/jupyter:0.6.3"; +pub const JUPYTER: &str = "ghcr.io/kamu-data/jupyter:0.7.0"; // Test Images pub const HTTPD: &str = 
"docker.io/httpd:2.4"; diff --git a/src/utils/container-runtime/Cargo.toml b/src/utils/container-runtime/Cargo.toml index c9a6ee2eff..55405b7667 100644 --- a/src/utils/container-runtime/Cargo.toml +++ b/src/utils/container-runtime/Cargo.toml @@ -33,6 +33,7 @@ serde = { version = "1", features = ["derive"] } thiserror = { version = "2", default-features = false, features = ["std"] } tokio = { version = "1", default-features = false, features = [ "time", + "signal", "sync", "process", "parking_lot", diff --git a/src/utils/container-runtime/src/args.rs b/src/utils/container-runtime/src/args.rs index 55a435ff17..60088c91fc 100644 --- a/src/utils/container-runtime/src/args.rs +++ b/src/utils/container-runtime/src/args.rs @@ -32,6 +32,7 @@ pub struct RunArgs { pub tty: bool, pub user: Option, pub volumes: Vec, + pub extra_hosts: Vec, pub work_dir: Option, } @@ -61,6 +62,14 @@ pub enum VolumeAccess { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ExtraHostSpec { + pub source: String, + pub dest: String, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + impl Default for RunArgs { fn default() -> Self { Self { @@ -83,6 +92,7 @@ impl Default for RunArgs { tty: false, user: None, volumes: Vec::new(), + extra_hosts: Vec::new(), work_dir: None, } } @@ -115,3 +125,16 @@ where } } } + +impl From<(S1, S2)> for ExtraHostSpec +where + S1: Into, + S2: Into, +{ + fn from(val: (S1, S2)) -> Self { + ExtraHostSpec { + source: val.0.into(), + dest: val.1.into(), + } + } +} diff --git a/src/utils/container-runtime/src/container.rs b/src/utils/container-runtime/src/container.rs index d9c6ad73a1..8be535ea4e 100644 --- a/src/utils/container-runtime/src/container.rs +++ b/src/utils/container-runtime/src/container.rs @@ -129,7 +129,11 @@ impl ContainerRunCommand { } pub fn map_port(mut self, host: u16, container: u16) -> Self { - self.args.expose_port_map.push((host, container)); + if host != 0 { + self.args.expose_port_map.push((host, container)); + } else { + self.args.expose_ports.push(container); + } self } @@ -235,6 +239,11 @@ impl ContainerRunCommand { self } + pub fn extra_host(mut self, spec: impl Into) -> Self { + self.args.extra_hosts.push(spec.into()); + self + } + pub fn into_command(self) -> tokio::process::Command { let mut cmd = self.runtime.run_cmd(self.args); @@ -306,6 +315,48 @@ impl ContainerRunCommand { let mut container = self.spawn()?; container.wait().await } + + pub fn maybe(self, cond: bool, fun: impl FnOnce(Self) -> Self) -> Self { + if cond { + fun(self) + } else { + self + } + } + + pub fn maybe_or( + self, + cond: bool, + fun_if: impl FnOnce(Self) -> Self, + fun_else: impl FnOnce(Self) -> Self, + ) -> Self { + if cond { + fun_if(self) + } else { + fun_else(self) + } + } + + pub fn map(self, opt: Option, fun: impl FnOnce(Self, T) -> Self) -> Self { + if let Some(val) = opt { + fun(self, val) + } else { + self + } + } + + pub fn map_or( + self, + opt: Option, + fun_if: impl FnOnce(Self, T) -> Self, + fun_else: impl FnOnce(Self) -> Self, + ) -> Self { + if let Some(val) = opt { + fun_if(self, val) + } else { + fun_else(self) + } + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/container-runtime/src/lib.rs b/src/utils/container-runtime/src/lib.rs index 8c0f0342e8..44fc7cc8f3 100644 --- 
a/src/utils/container-runtime/src/lib.rs +++ b/src/utils/container-runtime/src/lib.rs @@ -17,6 +17,7 @@ mod errors; mod handles; mod listener; mod runtime; +pub mod signal; mod terminate; pub use args::*; diff --git a/src/utils/container-runtime/src/runtime.rs b/src/utils/container-runtime/src/runtime.rs index c1cea114a0..ba546aaea3 100644 --- a/src/utils/container-runtime/src/runtime.rs +++ b/src/utils/container-runtime/src/runtime.rs @@ -192,6 +192,10 @@ impl ContainerRuntime { cmd.arg(volume); }); + args.extra_hosts.into_iter().for_each(|h| { + cmd.arg("--add-host"); + cmd.arg(format!("{}:{}", h.source, h.dest)); + }); args.user.map(|v| cmd.arg(format!("--user={v}"))); args.work_dir .map(|v| cmd.arg(format!("--workdir={}", v.display()))); @@ -483,6 +487,11 @@ impl ContainerRuntime { } } + pub fn get_random_free_port(&self) -> Result { + let listener = std::net::TcpListener::bind("127.0.0.1:0")?; + Ok(listener.local_addr()?.port()) + } + pub fn format_host_path(path: PathBuf) -> String { if !cfg!(windows) { path.to_str().unwrap().to_owned() diff --git a/src/utils/container-runtime/src/signal.rs b/src/utils/container-runtime/src/signal.rs new file mode 100644 index 0000000000..4c7bdc3f99 --- /dev/null +++ b/src/utils/container-runtime/src/signal.rs @@ -0,0 +1,29 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +pub use tokio::signal::ctrl_c; + +#[cfg(unix)] +pub async fn terminate() -> std::io::Result<()> { + let mut sig = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?; + sig.recv().await; + Ok(()) +} + +#[cfg(not(unix))] +pub fn terminate() -> impl std::future::Future> { + std::future::pending() +} + +pub async fn graceful_stop() -> std::io::Result<()> { + tokio::select! { + res = tokio::signal::ctrl_c() => res, + res = terminate() => res, + } +} diff --git a/src/utils/observability/src/lib.rs b/src/utils/observability/src/lib.rs index 6106ac8dbd..c0a67cc056 100644 --- a/src/utils/observability/src/lib.rs +++ b/src/utils/observability/src/lib.rs @@ -11,6 +11,7 @@ pub mod axum; pub mod config; pub mod health; pub mod init; +pub mod tonic; pub mod tracing; #[cfg(feature = "prometheus")] diff --git a/src/utils/observability/src/tonic.rs b/src/utils/observability/src/tonic.rs new file mode 100644 index 0000000000..3182b7d352 --- /dev/null +++ b/src/utils/observability/src/tonic.rs @@ -0,0 +1,100 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub fn grpc_layer() -> tower_http::trace::TraceLayer< + tower_http::classify::SharedClassifier, + MakeSpan, + OnRequest, + OnResponse, +> { + tower_http::trace::TraceLayer::new_for_http() + .on_request(OnRequest) + .on_response(OnResponse) + .make_span_with(MakeSpan) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +pub struct OnRequest; + +impl tower_http::trace::OnRequest for OnRequest { + fn on_request(&mut self, request: &http::Request, _: &tracing::Span) { + tracing::info!( + uri = %request.uri(), + version = ?request.version(), + headers = ?request.headers(), + "GRPC request", + ); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +pub struct OnResponse; + +impl tower_http::trace::OnResponse for OnResponse { + fn on_response( + self, + response: &http::Response, + latency: std::time::Duration, + _span: &tracing::Span, + ) { + tracing::info!( + status = response.status().as_u16(), + headers = ?response.headers(), + latency = %Latency(latency), + "GRPC response" + ); + } +} + +struct Latency(std::time::Duration); + +impl std::fmt::Display for Latency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ms", self.0.as_millis()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct MakeSpan; + +impl tower_http::trace::MakeSpan for MakeSpan { + // TODO: Trace linking across requests + fn make_span(&mut self, request: &http::Request) -> tracing::Span { + let path = request.uri().path(); + let mut parts = path.split('/').filter(|x| !x.is_empty()); + let service = parts.next().unwrap_or_default(); + let method = parts.next().unwrap_or_default(); + + let span = crate::tracing::root_span!( + "Grpc::request", + %service, + %method, + "otel.name" = tracing::field::Empty, + ); + + #[cfg(feature = "opentelemetry")] + { + crate::tracing::include_otel_trace_id(&span); + + span.record("otel.name", format!("{service}::{method}")); + } + + span + } +} + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////