Skip to content

Commit

Permalink
Merge pull request #14 from broadinstitute/search-index-improvments
Browse files Browse the repository at this point in the history
perf(breadbox): Improve speed of dimension query
  • Loading branch information
pgm authored Jul 29, 2024
2 parents 161b4f5 + 168e1df commit a2e2cc9
Show file tree
Hide file tree
Showing 7 changed files with 359 additions and 242 deletions.
23 changes: 13 additions & 10 deletions breadbox-client/breadbox_client/api/datasets/get_dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def _get_kwargs(
*,
limit: int,
include_referenced_by: Union[Unset, str] = "F",
prefix: Union[None, Unset, str] = UNSET,
prefix: Union[List[str], None, Unset] = UNSET,
substring: Union[List[str], None, Unset] = UNSET,
type_name: Union[None, Unset, str] = UNSET,
) -> Dict[str, Any]:
Expand All @@ -25,9 +25,12 @@ def _get_kwargs(

params["include_referenced_by"] = include_referenced_by

json_prefix: Union[None, Unset, str]
json_prefix: Union[List[str], None, Unset]
if isinstance(prefix, Unset):
json_prefix = UNSET
elif isinstance(prefix, list):
json_prefix = prefix

else:
json_prefix = prefix
params["prefix"] = json_prefix
Expand Down Expand Up @@ -120,7 +123,7 @@ def sync_detailed(
client: Union[AuthenticatedClient, Client],
limit: int,
include_referenced_by: Union[Unset, str] = "F",
prefix: Union[None, Unset, str] = UNSET,
prefix: Union[List[str], None, Unset] = UNSET,
substring: Union[List[str], None, Unset] = UNSET,
type_name: Union[None, Unset, str] = UNSET,
) -> Response[
Expand All @@ -134,7 +137,7 @@ def sync_detailed(
Args:
limit (int):
include_referenced_by (Union[Unset, str]): Default: 'F'.
prefix (Union[None, Unset, str]):
prefix (Union[List[str], None, Unset]):
substring (Union[List[str], None, Unset]):
type_name (Union[None, Unset, str]):
Expand Down Expand Up @@ -166,7 +169,7 @@ def sync(
client: Union[AuthenticatedClient, Client],
limit: int,
include_referenced_by: Union[Unset, str] = "F",
prefix: Union[None, Unset, str] = UNSET,
prefix: Union[List[str], None, Unset] = UNSET,
substring: Union[List[str], None, Unset] = UNSET,
type_name: Union[None, Unset, str] = UNSET,
) -> Optional[
Expand All @@ -180,7 +183,7 @@ def sync(
Args:
limit (int):
include_referenced_by (Union[Unset, str]): Default: 'F'.
prefix (Union[None, Unset, str]):
prefix (Union[List[str], None, Unset]):
substring (Union[List[str], None, Unset]):
type_name (Union[None, Unset, str]):
Expand All @@ -207,7 +210,7 @@ async def asyncio_detailed(
client: Union[AuthenticatedClient, Client],
limit: int,
include_referenced_by: Union[Unset, str] = "F",
prefix: Union[None, Unset, str] = UNSET,
prefix: Union[List[str], None, Unset] = UNSET,
substring: Union[List[str], None, Unset] = UNSET,
type_name: Union[None, Unset, str] = UNSET,
) -> Response[
Expand All @@ -221,7 +224,7 @@ async def asyncio_detailed(
Args:
limit (int):
include_referenced_by (Union[Unset, str]): Default: 'F'.
prefix (Union[None, Unset, str]):
prefix (Union[List[str], None, Unset]):
substring (Union[List[str], None, Unset]):
type_name (Union[None, Unset, str]):
Expand Down Expand Up @@ -251,7 +254,7 @@ async def asyncio(
client: Union[AuthenticatedClient, Client],
limit: int,
include_referenced_by: Union[Unset, str] = "F",
prefix: Union[None, Unset, str] = UNSET,
prefix: Union[List[str], None, Unset] = UNSET,
substring: Union[List[str], None, Unset] = UNSET,
type_name: Union[None, Unset, str] = UNSET,
) -> Optional[
Expand All @@ -265,7 +268,7 @@ async def asyncio(
Args:
limit (int):
include_referenced_by (Union[Unset, str]): Default: 'F'.
prefix (Union[None, Unset, str]):
prefix (Union[List[str], None, Unset]):
substring (Union[List[str], None, Unset]):
type_name (Union[None, Unset, str]):
Expand Down
7 changes: 5 additions & 2 deletions breadbox-client/latest-breadbox-api.json
Original file line number Diff line number Diff line change
Expand Up @@ -2206,7 +2206,7 @@
},
"info": {
"title": "Breadbox",
"version": "3.3.6"
"version": "3.3.3"
},
"openapi": "3.1.0",
"paths": {
Expand Down Expand Up @@ -3228,7 +3228,10 @@
"schema": {
"anyOf": [
{
"type": "string"
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""add indices for dim query
Revision ID: 2aa2d4c5c814
Revises: b92d0adae82b
Create Date: 2024-07-24 09:15:16.964601
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '2aa2d4c5c814'
down_revision = 'b92d0adae82b'
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('dimension_search_index', schema=None) as batch_op:
batch_op.create_index('idx_dim_search_index_perf_1', ['value'], unique=False)
batch_op.create_index('idx_dim_search_index_perf_2', ['type_name', 'value'], unique=False)
batch_op.create_index('idx_dim_search_index_perf_3', ['dimension_given_id', 'type_name', 'value'], unique=False)

# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('dimension_search_index', schema=None) as batch_op:
batch_op.drop_index('idx_dim_search_index_perf_3')
batch_op.drop_index('idx_dim_search_index_perf_2')
batch_op.drop_index('idx_dim_search_index_perf_1')

# ### end Alembic commands ###
7 changes: 5 additions & 2 deletions breadbox/breadbox/api/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def get_dataset_data(
def get_dimensions(
limit: int,
include_referenced_by: str = "F",
prefix: Optional[str] = None,
prefix: Annotated[Optional[List[str]], Query()] = None,
# curiously, when switching `substring` from str to list, I had to explictly
# annotate it so that fastapi would look for it as a query parameter and not
# look for it in the payload body.
Expand All @@ -434,10 +434,13 @@ def get_dimensions(
if substring is None:
substring = []

if prefix is None:
prefix = []

search_index_entries = dataset_crud.get_dataset_dimension_search_index_entries(
db=db,
user=user,
prefix=prefix,
prefixes=prefix,
dimension_type_name=type_name,
limit=limit,
substrings=substring,
Expand Down
88 changes: 44 additions & 44 deletions breadbox/breadbox/crud/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ def get_dataset_dimension_search_index_entries(
db: SessionWithUser,
user: str,
limit: int,
prefix: Optional[str],
prefixes: List[str],
substrings: List[str],
dimension_type_name: Optional[str],
include_referenced_by: bool,
Expand All @@ -857,21 +857,19 @@ def get_dataset_dimension_search_index_entries(
search_index_filter_clauses = []
outer = aliased(DimensionSearchIndex)

if prefix is not None:
search_index_filter_clauses.append(
outer.value.startswith(prefix, autoescape=True)
)

if dimension_type_name:
search_index_filter_clauses.append(outer.type_name == dimension_type_name)

if len(substrings) > 0:
predicate_per_substring = [
def filter_per_value(values, predicate_constructor):
if len(values) == 0:
return []

exists_clause_per_value = [
db.query(DimensionSearchIndex)
.join(Dimension)
.filter(
and_(
DimensionSearchIndex.value.contains(substring, autoescape=True),
predicate_constructor(DimensionSearchIndex, value),
*search_index_filter_clauses,
outer.type_name == DimensionSearchIndex.type_name,
outer.dimension_given_id == DimensionSearchIndex.dimension_given_id,
Expand All @@ -881,48 +879,46 @@ def get_dataset_dimension_search_index_entries(
DimensionSearchIndex.type_name, DimensionSearchIndex.dimension_given_id,
)
.exists()
for substring in substrings
for value in values
]

property_matches_at_least_one_substring = or_(
*[
outer.value.contains(substring, autoescape=True)
for substring in substrings
]
property_matches_at_least_one_predicate = or_(
*[predicate_constructor(outer, value) for value in values]
)

search_index_query = (
db.query(outer)
.join(Dimension)
.filter(
and_(property_matches_at_least_one_substring, *predicate_per_substring)
)
.order_by(outer.priority, outer.label)
.limit(limit)
.with_entities(
outer.type_name,
outer.dimension_given_id,
outer.label,
outer.property,
outer.value,
)
)
return [property_matches_at_least_one_predicate] + exists_clause_per_value

else:
search_index_query = (
db.query(outer)
.join(Dimension)
.filter(and_(True, *search_index_filter_clauses))
.order_by(outer.priority, outer.label)
.limit(limit)
.with_entities(
outer.type_name,
outer.dimension_given_id,
outer.label,
outer.property,
outer.value,
filters_for_prefixes = filter_per_value(
prefixes, lambda table, prefix: table.value.startswith(prefix, autoescape=True)
)

filters_for_substrings = filter_per_value(
substrings,
lambda table, substring: table.value.contains(substring, autoescape=True),
)

search_index_query = (
db.query(outer)
# .join(Dimension)
.filter(
and_(
True,
*(
search_index_filter_clauses
+ filters_for_substrings
+ filters_for_prefixes
),
)
)
.limit(limit)
.with_entities(
outer.type_name,
outer.dimension_given_id,
outer.label,
outer.property,
outer.value,
)
)

search_index_entries = pd.read_sql(
search_index_query.statement, search_index_query.session.connection()
Expand Down Expand Up @@ -958,6 +954,10 @@ def get_dataset_dimension_search_index_entries(
)
)

# sort at the end. This has been moved out of the sql query because we
# want the sort to happen after "limit" has been applied.
group_entries.sort(key=lambda x: x.label)

return group_entries


Expand Down
7 changes: 7 additions & 0 deletions breadbox/breadbox/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,13 @@ class DimensionSearchIndex(Base, UUIDMixin, GroupMixin):

__table_args__ = (
Index("idx_dimension_search_index_dimension_id", "dimension_id",),
# the following indices are for speeding up querying to search index, but
# not 100% confident this is the optimal combination of indices. However,
# testing the get dimensions endpoint suggests that these are "good enough"
# for now.
Index("idx_dim_search_index_perf_1", "value"),
Index("idx_dim_search_index_perf_2", "type_name", "value"),
Index("idx_dim_search_index_perf_3", "dimension_given_id", "type_name", "value")
)

dimension_id = Column(
Expand Down
Loading

0 comments on commit a2e2cc9

Please sign in to comment.