Skip to content

Commit

Permalink
Split filter command into 2 commands generating community reviewable TSV
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut committed Nov 5, 2024
1 parent e006326 commit 68d21d2
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 72 deletions.
158 changes: 107 additions & 51 deletions sources/bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ def export_tools_to_json(tools: List[Dict], output_fp: str) -> None:


def export_tools_to_tsv(
tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
tools: List[Dict], output_fp: str, format_list_col: bool = False, to_keep_columns: Optional[List[str]] = None
) -> None:
"""
Export tool metadata to TSV output file
Expand All @@ -563,42 +563,57 @@ def export_tools_to_tsv(
# the Galaxy tools need to be formatted for the add_instances_to_table to work
df["Galaxy tool ids"] = shared.format_list_column(df["Galaxy tool ids"])

# if add_usage_stats:
# df = add_usage_stats_for_all_server(df)
if to_keep_columns is not None:
df = df[to_keep_columns]

df.to_csv(output_fp, sep="\t", index=False)


def filter_tools(
tools: List[Dict],
ts_cat: List[str],
tool_status: Dict,
) -> tuple:
def filter_tools(tools: List[Dict], ts_cat: List[str]) -> list:
"""
Filter tools for specific ToolShed categories and add information if to keep or to exclude
Filter tools for specific ToolShed categories
:param tools: dictionary with tools and their metadata
:param ts_cat: list of ToolShed categories to keep in the extraction
:param tool_status: dictionary with tools and their 2 status: Keep and Deprecated
"""
ts_filtered_tools = []
filtered_tools = []
for tool in tools:
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = name in tool_status
keep = None
deprecated = None
if name in tool_status:
keep = tool_status[name][1]
deprecated = tool_status[name][2]
tool["Deprecated"] = deprecated
if keep: # only add tools that are manually marked as to keep
filtered_tools.append(tool)
tool["To keep"] = keep
ts_filtered_tools.append(tool)
return ts_filtered_tools, filtered_tools
filtered_tools.append(tool)
tool["Deprecated"] = None
tool["To keep"] = None
return filtered_tools


def curate_tools(
    tools: List[Dict],
    tool_status: Dict,
) -> tuple:
    """
    Select the tools marked as to keep in the community review and split them
    depending on whether they have a bio.tool id

    Each tool dictionary is updated in place: "Reviewed" is set to whether its
    "Galaxy wrapper id" is a key of tool_status and, for reviewed tools only,
    "Deprecated" is copied from the matching status entry.

    :param tools: list of dictionaries with tools and their metadata
    :param tool_status: dictionary with, for each Galaxy wrapper id, a dictionary holding the 2 status: "To keep" and "Deprecated"
    :return: tuple with 3 lists: tools to keep, tools to keep without bio.tool id, tools to keep with a bio.tool id
    """
    curated_tools = []
    tools_wo_biotools = []
    tools_with_biotools = []
    for tool in tools:
        name = tool["Galaxy wrapper id"]
        reviewed = name in tool_status
        tool["Reviewed"] = reviewed
        if not reviewed:
            # tools absent from the status table are never curated
            continue

        status = tool_status[name]
        keep = status["To keep"]
        tool["Deprecated"] = status["Deprecated"]
        # NOTE(review): if the status table is loaded with pandas, an empty
        # "To keep" cell is presumably NaN, which is truthy - confirm that rows
        # without a decision are really meant to be kept
        if not keep:  # only add tools that are manually marked as to keep
            continue

        curated_tools.append(tool)
        if tool["bio.tool id"] is None:
            tools_wo_biotools.append(tool)
        else:
            tools_with_biotools.append(tool)
    return curated_tools, tools_wo_biotools, tools_with_biotools


def reduce_ontology_terms(terms: List, ontology: Any) -> List:
Expand Down Expand Up @@ -737,35 +752,60 @@ def get_tools(repo_list: list, edam_ontology: dict) -> List[Dict]:
help="Run a small test case using only the repository: https://github.com/TGAC/earlham-galaxytools",
)

# Filter tools
filtertools = subparser.add_parser("filter", help="Filter tools")
# Filter tools based on ToolShed categories
filtertools = subparser.add_parser("filter", help="Filter tools based on ToolShed categories")
filtertools.add_argument(
"--all",
"-a",
required=True,
help="Filepath to JSON with all extracted tools, generated by extractools command",
)
filtertools.add_argument(
"--ts-filtered",
"--categories",
"-c",
help="Path to a file with ToolShed category to keep in the extraction (one per line)",
)
filtertools.add_argument(
"--filtered",
"-f",
required=True,
help="Filepath to JSON with tools filtered based on ToolShed category",
)
filtertools.add_argument(
"--tsv-filtered",
"-t",
required=True,
help="Filepath to TSV with tools filtered based on ToolShed category",
)
filtertools.add_argument(

# Curate tools categories
curatetools = subparser.add_parser("curate", help="Curate tools based on community review")
curatetools.add_argument(
"--filtered",
"-f",
required=True,
help="Filepath to TSV with tools filtered based on ToolShed category and manual curation",
help="Filepath to JSON with tools filtered based on ToolShed category",
)
filtertools.add_argument(
"--categories",
curatetools.add_argument(
"--curated",
"-c",
help="Path to a file with ToolShed category to keep in the extraction (one per line)",
required=True,
help="Filepath to TSV with curated tools",
)
filtertools.add_argument(
curatetools.add_argument(
"--wo-biotools",
required=True,
help="Filepath to TSV with tools not linked to bio.tools",
)
curatetools.add_argument(
"--w-biotools",
required=True,
help="Filepath to TSV with tools linked to bio.tools",
)
curatetools.add_argument(
"--status",
"-s",
help="Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not",
help="Path to a TSV file with tool status - at least 3 columns: IDs of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not",
)
args = parser.parse_args()

Expand All @@ -783,32 +823,48 @@ def get_tools(repo_list: list, edam_ontology: dict) -> List[Dict]:
edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load()
tools = get_tools(repo_list, edam_ontology)
export_tools_to_json(tools, args.all)
export_tools_to_tsv(tools, args.all_tsv, format_list_col=True, add_usage_stats=True)
export_tools_to_tsv(tools, args.all_tsv, format_list_col=True)

elif args.command == "filter":
with Path(args.all).open() as f:
tools = json.load(f)
# get categories and tools to exclude
categories = shared.read_file(args.categories)
# filter tool lists
filtered_tools = filter_tools(tools, categories)
if filtered_tools:
export_tools_to_json(filtered_tools, args.filtered)
export_tools_to_tsv(
filtered_tools,
args.tsv_filtered,
format_list_col=True,
to_keep_columns=["Galaxy wrapper id", "Description", "To keep", "Deprecated"],
)
else:
# if there are no ts filtered tools
print(f"No tools found for category {args.filtered}")

elif args.command == "curate":
with Path(args.filtered).open() as f:
tools = json.load(f)
try:
status = pd.read_csv(args.status, sep="\t", index_col=0, header=None).to_dict("index")
status = pd.read_csv(args.status, sep="\t", index_col=0).to_dict("index")
except Exception as ex:
print(f"Failed to load tool_status.tsv file with:\n{ex}")
print("Not assigning tool status for this community !")
status = {}

# filter tool lists
ts_filtered_tools, filtered_tools = filter_tools(tools, categories, status)

if ts_filtered_tools:

export_tools_to_tsv(ts_filtered_tools, args.ts_filtered, format_list_col=True)
# if there are no filtered tools return the ts filtered tools
if filtered_tools:
export_tools_to_tsv(filtered_tools, args.filtered, format_list_col=True)
else:
export_tools_to_tsv(ts_filtered_tools, args.filtered, format_list_col=True)

else:
# if there are no ts filtered tools
print(f"No tools found for category {args.filtered}")
curated_tools, tools_wo_biotools, tools_with_biotools = curate_tools(tools, status)
export_tools_to_json(curated_tools, args.curated)
export_tools_to_tsv(
tools_wo_biotools,
args.wo_biotools,
format_list_col=True,
to_keep_columns=["Galaxy wrapper id", "Source", "Galaxy wrapper source"],
)
export_tools_to_tsv(
tools_with_biotools,
args.w_biotools,
format_list_col=True,
to_keep_columns=["Galaxy wrapper id", "bio.tool name", "EDAM operation", "EDAM topic"],
)
59 changes: 38 additions & 21 deletions sources/bin/get_community_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,28 @@ if [[ ! -z $1 && $1 == "test" ]]; then
python sources/bin/extract_galaxy_tools.py \
filter \
--all "communities/all/resources/tools.json" \
--ts-filtered "communities/microgalaxy/resources/tools_filtered_by_ts_categories.tsv" \
--filtered "communities/microgalaxy/resources/tools.tsv" \
--categories "communities/microgalaxy/metadata/categories" \
--status "communities/microgalaxy/metadata/tool_status.tsv"
--tsv-filtered "communities/microgalaxy/resources/tools_filtered_by_ts_categories.tsv" \
--filtered "communities/microgalaxy/resources/tools_filtered_by_ts_categories.json"

python sources/bin/extract_galaxy_tools.py \
curate \
--filtered "communities/microgalaxy/resources/tools_filtered_by_ts_categories.json" \
--status "communities/microgalaxy/metadata/tool_status.tsv" \
--curated "communities/microgalaxy/resources/curated_tools.tsv" \
--wo-biotools "communities/microgalaxy/resources/curated_tools_wo_biotools.tsv" \
--w-biotools "communities/microgalaxy/resources/curated_tools_w_biotools.tsv"

python sources/bin/create_interactive_table.py \
--input "communities/microgalaxy/resources/tools.tsv" \
--input "communities/microgalaxy/resources/curated_tools.tsv" \
--remove-col "Reviewed" \
--remove-col "To keep" \
--filter-col "To keep" \
--template "sources/data/interactive_table_template.html" \
--output "communities/microgalaxy/resources/tools.html"

python sources/bin/create_wordcloud.py \
--input "communities/microgalaxy/resources/tools.tsv" \
--input "communities/microgalaxy/resources/curated_tools.tsv" \
--name-col "Galaxy wrapper id" \
--stat-col "No. of tool users (5 years) - all main servers" \
--wordcloud_mask "sources/data/usage_stats/wordcloud_mask.png" \
Expand All @@ -41,25 +48,35 @@ else
python sources/bin/extract_galaxy_tools.py \
filter \
--all "communities/all/resources/tools.json" \
--ts-filtered "communities/$community/resources/tools_filtered_by_ts_categories.tsv" \
--filtered "communities/$community/resources/tools.tsv" \
--categories "communities/$community/metadata/categories" \
--status "communities/$community/metadata/tool_status.tsv"
--tsv-filtered "communities/$community/resources/tools_filtered_by_ts_categories.tsv" \
--filtered "communities/$community/resources/tools_filtered_by_ts_categories.json"

if [[ -f "communities/$community/resources/tool_status.tsv" ]]; then

python sources/bin/extract_galaxy_tools.py \
curate \
--filtered "communities/$community/resources/tools_filtered_by_ts_categories.json" \
--status "communities/$community/metadata/tool_status.tsv" \
--curated "communities/$community/resources/curated_tools.tsv" \
--wo-biotools "communities/$community/resources/curated_tools_wo_biotools.tsv" \
--w-biotools "communities/$community/resources/curated_tools_w_biotools.tsv"

python sources/bin/create_interactive_table.py \
--input "communities/$community/resources/tools.tsv" \
--remove-col "Reviewed" \
--remove-col "To keep" \
--filter-col "To keep" \
--template "sources/data/interactive_table_template.html" \
--output "communities/$community/resources/tools.html"
python sources/bin/create_interactive_table.py \
--input "communities/$community/resources/curated_tools.tsv" \
--remove-col "Reviewed" \
--remove-col "To keep" \
--filter-col "To keep" \
--template "sources/data/interactive_table_template.html" \
--output "communities/$community/resources/tools.html"

python sources/bin/create_wordcloud.py \
--input "communities/$community/resources/tools.tsv" \
--name-col "Galaxy wrapper id" \
--stat-col "No. of tool users (5 years) - all main servers" \
--wordcloud_mask "sources/data/usage_stats/wordcloud_mask.png" \
--output "communities/$community/resources/tools_wordcloud.png"
python sources/bin/create_wordcloud.py \
--input "communities/$community/resources/curated_tools.tsv" \
--name-col "Galaxy wrapper id" \
--stat-col "No. of tool users (5 years) - all main servers" \
--wordcloud_mask "sources/data/usage_stats/wordcloud_mask.png" \
--output "communities/$community/resources/tools_wordcloud.png"
fi;
fi;
fi;
done
Expand Down

0 comments on commit 68d21d2

Please sign in to comment.