diff --git a/README.md b/README.md
index ffa4135..2810e01 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,38 @@ asreview data dedup synergy:van_de_schoot_2018 -o van_de_schoot_2018_dedup.csv
 Removed 104 records from dataset with 6189 records.
 ```
 
+### Add missing DOIs
+
+Add missing DOIs to a dataset. The tool looks up each title via the [Crossref API](https://www.crossref.org/) and therefore requires a column with titles in the dataset. The output file contains the original dataset with the retrieved DOIs added; records for which no DOI was found are left empty.
+
+```bash
+asreview data doi MY_DATASET.csv
+```
+
+Export the dataset with added DOIs to a file (`output.csv`):
+
+```bash
+asreview data doi MY_DATASET.csv -o output.csv
+```
+
+By default, the tool waits 750 milliseconds between requests and accepts a DOI only if the title returned by Crossref matches the original title with a similarity of at least 0.95. Both values can be adjusted; `--strict_similarity` enables a stricter title comparison and `--verbose` prints details for each record.
+
+```bash
+asreview data doi MY_DATASET.csv -o output.csv --delay 1000 --threshold 0.9 --strict_similarity --verbose
+```
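+
+The same functionality is available from Python. The snippet below is a minimal sketch of programmatic use (a hypothetical example, not part of the command line interface; it assumes `asreview>=1.1` and this extension are installed):
+
+```python
+from asreview import load_data
+
+from asreviewcontrib.datatools.doi import find_dois
+
+# find_dois fills the 'doi' column of the data object in place
+asdata = load_data("MY_DATASET.csv")
+find_dois(asdata, delay=750, similarity=0.95)
+asdata.to_file("output.csv")
+```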
 
 ### Data Vstack (Experimental)
 
@@ -186,7 +218,7 @@ Vertical stacking: combine as many datasets in the same file format as you want into a single file.
 
 ❗ Vstack is an experimental feature. We would love to hear your feedback. Please keep in mind that this feature can change in the future.
 
-Stack several datasets on top of each other: 
+Stack several datasets on top of each other:
 ```
 asreview data vstack output.csv MY_DATASET_1.csv MY_DATASET_2.csv MY_DATASET_3.csv
 ```
@@ -206,7 +238,7 @@ Compose is where datasets containing records with different labels (or no
 labels) can be assembled into a single dataset.
 
 ❗ Compose is an experimental feature. We would love to hear your feedback.
-Please keep in mind that this feature can change in the future. 
+Please keep in mind that this feature can change in the future.
 
 Overview of possible input files and corresponding properties, use at least
 one of the following arguments:
@@ -231,7 +263,7 @@ case of conflicts, use the `--conflict_resolve`/`-c` flag. This is set to
 | Resolve method | Action in case of conflict                                                               |
 |----------------|------------------------------------------------------------------------------------------|
 | `keep_one`     | Keep one label, using `--hierarchy` to determine which label to keep                      |
-| `keep_all`     | Keep conflicting records as duplicates in the composed dataset (ignoring `--hierarchy`)   |
+| `keep_all`     | Keep conflicting records as duplicates in the composed dataset (ignoring `--hierarchy`)  |
 | `abort`        | Abort                                                                                      |
 
 
diff --git a/asreviewcontrib/datatools/doi.py b/asreviewcontrib/datatools/doi.py
new file mode 100644
index 0000000..e50ab27
--- /dev/null
+++ b/asreviewcontrib/datatools/doi.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import re
+from difflib import SequenceMatcher
+from random import random
+from time import sleep
+from typing import Any
+from urllib.parse import quote
+
+import ftfy
+import pandas as pd
+import requests
+from asreview import ASReviewData
+from requests.exceptions import ConnectTimeout
+from requests.exceptions import HTTPError
+from tqdm import tqdm
+
+_SPACES_REGEX = re.compile(r'\s+')
+_SYMBOLS_REGEX = re.compile(r'[^ \w\d\-_]')
+_SEQ_MATCHER = SequenceMatcher()
+
+
+def _fetch_doi(
+    title: str,
+    authors: None | str = None,
+    verbose: bool = False,
+    ) -> None | dict[str, Any]:
+    # https://www.crossref.org/documentation/retrieve-metadata/xml-api/retrieving-dois-by-title/
+    # percent-encode the title so spaces and reserved characters are safe in the URL
+    title_query = quote(title, safe='')
+
+    if authors is None:
+        url = f"https://api.crossref.org/works?rows=1&query.title={title_query}" \
+            "&select=title,DOI"
+    else:
+        url = f"https://api.crossref.org/works?rows=1&query.title={title_query}" \
+            "&select=title,DOI,author" \
+            f"&query.bibliographic={quote(authors, safe='')}"
+
+    try:
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+
+    except ConnectTimeout as e:
+        if verbose:
+            tqdm.write(f'Timeout for {title}. Wait for 30s and try again.\n{e}')
+
+        raise e
+
+    except HTTPError:
+        if authors is None:
+            if verbose:
+                tqdm.write(f'Could not fetch DOI for {title}')
+
+            return None
+
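+        # the author-filtered query failed; retry once with a title-only
+        # query before giving up on this record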
+        url = f"https://api.crossref.org/works?rows=1&query.title={title_query}" \
+            "&select=title,DOI"
+
+        try:
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+
+        except ConnectTimeout as e:
+            if verbose:
+                tqdm.write(f'Timeout for {title}. Wait for 30s and try again.\n{e}')
+
+            raise e
+
+        except HTTPError:
+            if verbose:
+                tqdm.write(f'Could not fetch DOI for {title}')
+
+            return None
+
+    return response.json()
+
+
+def _confirm_doi_title(
+    title: str,
+    title_from_api: str,
+    data: dict[str, Any],
+    similarity: float,
+    strict_similarity: bool,
+    verbose: bool,
+    ) -> None | str:
+    clean_title = _SYMBOLS_REGEX.sub('', title.lower())
+    clean_title = _SPACES_REGEX.sub(' ', clean_title)
+
+    clean_title_from_api = _SYMBOLS_REGEX.sub('', title_from_api.lower())
+    clean_title_from_api = _SPACES_REGEX.sub(' ', clean_title_from_api)
+
+    _SEQ_MATCHER.set_seq1(clean_title)
+    _SEQ_MATCHER.set_seq2(clean_title_from_api)
+
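+    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio(),
+    # so clear non-matches are rejected early; the exact (slower) ratio() is
+    # only computed when strict_similarity is requested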
+    if _SEQ_MATCHER.real_quick_ratio() > similarity and \
+            _SEQ_MATCHER.quick_ratio() > similarity and \
+            (not strict_similarity or _SEQ_MATCHER.ratio() > similarity):
+
+        doi = data['message']['items'][0]['DOI']
+
+        if verbose:
+            tqdm.write(f'DOI found for {title}: {doi}')
+
+        return doi
+
+    if verbose:
+        tqdm.write(f'No DOI found for {title}')
+
+    return None
+
+
+def find_dois(
+    asdata: ASReviewData,
+    delay: int = 750,
+    similarity: float = 0.95,
+    strict_similarity: bool = False,
+    verbose: bool = False) -> None:
+    titles = asdata.df['title'].apply(ftfy.fix_text).str.strip()
+
+    if 'authors' in asdata.df.columns:
+        authors = asdata.df['authors'].apply(ftfy.fix_text).str.strip()
+    else:
+        authors = None
+
+    delay_in_seconds = delay / 1000
+    dois = []
+
+    for i, title in enumerate(tqdm(titles, desc="Finding DOIs")):
+        if authors is not None:
+            data = _fetch_doi(title, authors.iloc[i], verbose)
+        else:
+            data = _fetch_doi(title, None, verbose)
+
+        if data is None:
+            dois.append(None)
+            continue
+
+        try:
+            title_from_api = ftfy.fix_text(data['message']['items'][0]['title'][0])
+
+        except (KeyError, IndexError):
+            if verbose:
+                tqdm.write(f'No DOI found for {title}')
+
+            dois.append(None)
+            continue
+
+        doi = _confirm_doi_title(
+            title,
+            title_from_api,
+            data,
+            similarity,
+            strict_similarity,
+            verbose,
+        )
+
+        dois.append(doi)
+
+        # sleep for delay_in_seconds + random jitter to avoid overloading the API
+        sleep(delay_in_seconds + random())
+
+    # if a 'doi' column already exists, merge the dois, giving preference to the old ones
+    if 'doi' in asdata.df.columns:
+        asdata.df['doi'] = asdata.df['doi'].combine_first(
+            pd.Series(dois, index=asdata.df.index))
+    else:
+        asdata.df['doi'] = dois
diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 647bc6a..f3759d2 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -10,6 +10,7 @@
 from asreviewcontrib.datatools.convert import convert
 from asreviewcontrib.datatools.describe import _parse_arguments_describe
 from asreviewcontrib.datatools.describe import describe
+from asreviewcontrib.datatools.doi import find_dois
 from asreviewcontrib.datatools.sample import _parse_arguments_sample
 from asreviewcontrib.datatools.sample import sample
 from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
@@ -17,7 +18,7 @@
 from asreviewcontrib.datatools.stack import _parse_arguments_vstack
 from asreviewcontrib.datatools.stack import vstack
 
-DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
+DATATOOLS = ["describe", "dedup", "doi", "convert", "compose", "vstack", "snowball", "sample"]
 
 
 class DataEntryPoint(BaseEntryPoint):
@@ -89,6 +90,77 @@
                     f"Found {n_dup} duplicates in dataset with"
                     f" {initial_length} records."
                 )
+        if argv[0] == "doi":
+            doi_parser = argparse.ArgumentParser(prog="asreview data doi")
+            doi_parser.add_argument(
+                "input_path", type=str, help="The file path of the dataset."
+            )
+            doi_parser.add_argument(
+                "--output_path",
+                "-o",
+                default=None,
+                type=str,
+                help="The file path of the output dataset.",
+            )
+            doi_parser.add_argument(
+                "--delay",
+                default=750,
+                type=int,
+                help="Delay between requests in milliseconds. Default: 750.",
+            )
+            doi_parser.add_argument(
+                "--threshold",
+                default=0.95,
+                type=float,
+                help="Similarity threshold for matching titles. Default: 0.95.",
+            )
+            doi_parser.add_argument(
+                "--strict_similarity",
+                action='store_true',
+                help="Use a stricter similarity check when matching titles.",
+            )
+            doi_parser.add_argument(
+                "--verbose",
+                action='store_true',
+                help="Print verbose output.",
+            )
+
+            args_doi = doi_parser.parse_args(argv[1:])
+
+            # read data into an ASReview data object
+            asdata = load_data(args_doi.input_path)
+
+            if 'doi' in asdata.df.columns:
+                previous_dois = len(asdata.df) - asdata.df['doi'].isna().sum()
+                print(f"Dataset already contains DOIs for {previous_dois} entries. "
+                      "Adding missing DOIs.")
+
+            else:
+                print("Dataset does not contain DOIs. Adding DOIs.")
+                previous_dois = 0
+
+            find_dois(
+                asdata,
+                args_doi.delay,
+                args_doi.threshold,
+                args_doi.strict_similarity,
+                args_doi.verbose,
+            )
+
+            added_dois = len(asdata.df) - asdata.df['doi'].isna().sum() - previous_dois
+
+            if args_doi.output_path:
+                asdata.to_file(args_doi.output_path)
+                print(
+                    f"Added DOIs for {added_dois} records in dataset with"
+                    f" {len(asdata.df)} records."
+                )
+            else:
+                print(
+                    f"Found DOIs for {added_dois} records in dataset with"
+                    f" {len(asdata.df)} records."
+                )
+
         if argv[0] == "compose":
             args_compose_parser = _parse_arguments_compose()
             args_compose = args_compose_parser.parse_args(argv[1:])
diff --git a/pyproject.toml b/pyproject.toml
index 0034c41..f76b995 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11"
 ]
 license = {text = "MIT License"}
-dependencies = ["asreview>=1.1,<2", "pandas", "pyalex"]
+dependencies = ["asreview>=1.1,<2", "ftfy", "pandas", "pyalex", "requests", "tqdm"]
 dynamic = ["version"]
 requires-python = ">=3.8"
 