The project aims to remove the guesswork of selecting columns to be used in the ZORDER statement. It achieves this by analyzing the logged execution plan for each cluster provided and returns the top n columns that were used in filter/where clauses.
Cluster log delivery
- You must setup a default destination for the cluster log delivery. For example,
dbfs:/cluster-log-delivery/0630-191345-leap375
. See the below link for more information on how to setup a cluster log deliver on databricks.
pip install in your Databricks Notebook
%pip install auto_zorder
Note: If the cluster log delivery has not been active for very long then you may not see any results.
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
cluster_ids=['cluster_id_1', 'cluster_id_2'],
optimize_table='my_db.my_table'
)
print(optimize_cmd)
>>> 'OPTIMIZE my_db.my_table ZORDER BY (col1, col2, col3, col4, col5)'
# To run the OPTIMIZE Command
spark.sql(optimize_cmd)
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
cluster_ids=['cluster_id_1', 'cluster_id_2'],
optimize_table='my_db.my_table',
number_of_cols=2
)
print(optimize_cmd)
>>> 'OPTIMIZE my_db.my_table ZORDER BY (col1, col2)'
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
cluster_ids=['cluster_id_1'],
optimize_table='my_db.my_table',
save_analysis='my_db.my_analysis'
)
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
use_analysis='my_db.my_analysis',
optimize_table='my_db.my_table'
)
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
cluster_ids=['cluster_id_1', 'cluster_id_2'],
optimize_table='my_db.my_table',
use_add_cols=[('add_col1', 0), ('add_col2', 4)]
)
print(optimize_cmd)
>>> 'OPTIMIZE my_db.my_table ZORDER BY (add_col1, auto_col1, auto_col2, auto_col3, add_col2, auto_col4, auto_col5)'
from auto_zorder import auto_zorder
optimize_cmd = auto_zorder(
cluster_ids=['cluster_id_1', 'cluster_id_2'],
optimize_table='my_db.my_table',
exclude_cols=['col1']
)
print(optimize_cmd)
>>> 'OPTIMIZE my_db.my_table ZORDER BY (col2, col3, col4, col5, col6)'
Distributed under the MIT License. See LICENSE.txt
for more information.