From 0cf3343bc76ba4a8e0604f1955f7d9f4ca14dbde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Mon, 3 Jun 2024 13:57:37 +0200 Subject: [PATCH] Remove tool duplication (#104) * Add to export script an option to avoid parsing extra repositories in conf * Use avoid extra repo option for all steps except 1st * Snake case argument names * Remove FROGS from extra repo --- bin/extract_all_tools.sh | 2 +- bin/extract_all_tools_stepwise.sh | 17 ++++++++++++--- bin/extract_all_tools_test.sh | 4 ++-- bin/extract_galaxy_tools.py | 35 ++++++++++++++++++++++--------- bin/get_community_tools.sh | 2 +- data/conf.yml | 1 - 6 files changed, 43 insertions(+), 18 deletions(-) diff --git a/bin/extract_all_tools.sh b/bin/extract_all_tools.sh index f4e03534..1926fac2 100755 --- a/bin/extract_all_tools.sh +++ b/bin/extract_all_tools.sh @@ -5,7 +5,7 @@ mkdir -p 'results/' python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \ - --all_tools 'results/all_tools.tsv' + --all-tools 'results/all_tools.tsv' python bin/create_interactive_table.py \ --table "results/all_tools.tsv" \ diff --git a/bin/extract_all_tools_stepwise.sh b/bin/extract_all_tools_stepwise.sh index 7b9cac75..88552a28 100755 --- a/bin/extract_all_tools_stepwise.sh +++ b/bin/extract_all_tools_stepwise.sh @@ -4,9 +4,20 @@ mkdir -p 'results/' output="results/${1}_tools.tsv" -python bin/extract_galaxy_tools.py \ +if [[ $1 =~ "01" ]]; then + python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \ - --all_tools $output \ - --planemorepository $1 + --all-tools $output \ + --planemo-repository-list $1 +else + python bin/extract_galaxy_tools.py \ + extractools \ + --api $GITHUB_API_KEY \ + --all-tools $output \ + --planemo-repository-list $1 \ + --avoid-extra-repositories +fi + + diff --git a/bin/extract_all_tools_test.sh b/bin/extract_all_tools_test.sh index 9960b9ea..22c4ac05 100755 --- a/bin/extract_all_tools_test.sh +++ b/bin/extract_all_tools_test.sh @@ -7,7 +7,7 @@ output="results/${1}_tools.tsv" python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \ - --all_tools $output \ - --planemorepository $1 \ + --all-tools $output \ + --planemo-repository-list $1 \ --test diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 95a19eb9..1eb7fc16 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -123,14 +123,14 @@ def get_string_content(cf: ContentFile) -> str: def get_tool_github_repositories( - g: Github, RepoSelection: Optional[str], run_test: bool, add_extra_repositories: bool = True + g: Github, repository_list: Optional[str], run_test: bool, add_extra_repositories: bool = True ) -> List[str]: """ Get list of tool GitHub repositories to parse :param g: GitHub instance - :param RepoSelection: The selection to use from the repository (needed to split the process for CI jobs) - :run_test: for testing only parse the repository + :param repository_list: The selection to use from the repository (needed to split the process for CI jobs) + :param run_test: for testing only parse the repository """ if run_test: @@ -140,8 +140,8 @@ def get_tool_github_repositories( repo_list: List[str] = [] for i in range(1, 5): repo_selection = f"repositories0{i}.list" - if RepoSelection: # only get these repositories - if RepoSelection == repo_selection: + if repository_list: # only get these repositories + if repository_list == repo_selection: repo_f = repo.get_contents(repo_selection) repo_l = get_string_content(repo_f).rstrip() repo_list.extend(repo_l.split("\n")) @@ -614,11 +614,21 @@ def filter_tools( # Extract tools extractools = subparser.add_parser("extractools", help="Extract tools") extractools.add_argument("--api", "-a", required=True, help="GitHub access token") - extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to TSV with all extracted tools") + extractools.add_argument("--all-tools", "-o", required=True, help="Filepath to TSV with all extracted tools") extractools.add_argument( - "--planemorepository", "-pr", required=False, help="Repository list to use from the planemo-monitor repository" + "--planemo-repository-list", + "-pr", + required=False, + help="Repository list to use from the planemo-monitor repository", + ) + extractools.add_argument( + "--avoid-extra-repositories", + "-e", + action="store_true", + default=False, + required=False, + help="Do not parse extra repositories in conf file", ) - extractools.add_argument( "--test", "-t", @@ -637,7 +647,7 @@ def filter_tools( help="Filepath to TSV with all extracted tools, generated by extractools command", ) filtertools.add_argument( - "--filtered_tools", + "--filtered-tools", "-f", required=True, help="Filepath to TSV with filtered tools", @@ -663,7 +673,12 @@ def filter_tools( # connect to GitHub g = Github(args.api) # get list of GitHub repositories to parse - repo_list = get_tool_github_repositories(g, args.planemorepository, args.test) + repo_list = get_tool_github_repositories( + g=g, + repository_list=args.planemo_repository_list, + run_test=args.test, + add_extra_repositories=not args.avoid_extra_repositories, + ) # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file tools: List[Dict] = [] for r in repo_list: diff --git a/bin/get_community_tools.sh b/bin/get_community_tools.sh index b3a6db3a..c0a2c610 100755 --- a/bin/get_community_tools.sh +++ b/bin/get_community_tools.sh @@ -12,7 +12,7 @@ for com_data_fp in data/communities/* ; do python bin/extract_galaxy_tools.py \ filtertools \ --tools "results/all_tools.tsv" \ - --filtered_tools "results/$community/tools.tsv" \ + --filtered-tools "results/$community/tools.tsv" \ --categories "data/communities/$community/categories" \ --exclude "data/communities/$community/tools_to_exclude" \ --keep "data/communities/$community/tools_to_keep" diff --git a/data/conf.yml b/data/conf.yml index dbf7e082..1e16fb9f 100644 --- a/data/conf.yml +++ b/data/conf.yml @@ -1,3 +1,2 @@ extra-repositories: - https://github.com/qiime2/galaxy-tools - - https://github.com/geraldinepascal/FROGS-wrappers