Add missing DOIs to dataset #53

Open · wants to merge 4 commits into base: main
25 changes: 22 additions & 3 deletions README.md
@@ -178,6 +178,25 @@ asreview data dedup synergy:van_de_schoot_2018 -o van_de_schoot_2018_dedup.csv
Removed 104 records from dataset with 6189 records.
```

### Add missing DOIs

Add missing DOIs to a dataset. The tool queries the [Crossref API](https://www.crossref.org/) with each record's title (and authors, when available) and therefore requires a column with titles in the dataset. The output contains the original dataset with the added DOIs, or `None` where no DOI was found.

```bash
asreview data doi MY_DATASET.csv
```

Export the dataset with added DOIs to a file (`output.csv`):

```bash
asreview data doi MY_DATASET.csv -o output.csv
```

By default, the tool waits 750 milliseconds between requests and uses a similarity threshold of 0.95 to decide whether a title returned by Crossref matches the queried title. Both values can be changed, a stricter similarity comparison can be enabled, and verbose output can be printed:

```bash
asreview data doi MY_DATASET.csv -o output.csv --delay 1000 --threshold 0.9 --strict_similarity --verbose
```
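
The threshold is applied after both the queried title and the title returned by Crossref are normalized (lowercased, symbols removed, whitespace collapsed) and compared with Python's `difflib.SequenceMatcher`. A minimal sketch of that comparison (the `clean` helper below is illustrative, not part of the package):

```python
import re
from difflib import SequenceMatcher


def clean(title: str) -> str:
    # mirror the tool's normalization: lowercase, drop symbols, collapse spaces
    title = re.sub(r'[^ \w\d\-_]', '', title.lower())
    return re.sub(r'\s+', ' ', title)


matcher = SequenceMatcher(None, clean("The ASReview Tool!"), clean("the asreview  tool"))
print(matcher.ratio() > 0.95)  # True: the titles count as a match
```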

### Data Vstack (Experimental)

@@ -186,7 +205,7 @@ Vertical stacking: combine as many datasets in the same file format as you want
❗ Vstack is an experimental feature. We would love to hear your feedback.
Please keep in mind that this feature can change in the future.

Stack several datasets on top of each other:
```
asreview data vstack output.csv MY_DATASET_1.csv MY_DATASET_2.csv MY_DATASET_3.csv
```
@@ -206,7 +225,7 @@ Compose is where datasets containing records with different labels (or no
labels) can be assembled into a single dataset.

❗ Compose is an experimental feature. We would love to hear your feedback.
Please keep in mind that this feature can change in the future.

Overview of possible input files and corresponding properties, use at least
one of the following arguments:
@@ -231,7 +250,7 @@ case of conflicts, use the `--conflict_resolve`/`-c` flag. This is set to
| Resolve method | Action in case of conflict |
|----------------|-----------------------------------------------------------------------------------------|
| `keep_one` | Keep one label, using `--hierarchy` to determine which label to keep |
| `keep_all`     | Keep conflicting records as duplicates in the composed dataset (ignoring `--hierarchy`)  |
| `abort` | Abort |


162 changes: 162 additions & 0 deletions asreviewcontrib/datatools/doi.py
@@ -0,0 +1,162 @@
# postponed evaluation so `None | str` annotations also work on Python 3.8/3.9
from __future__ import annotations

import re
from difflib import SequenceMatcher
from random import random
from time import sleep
from typing import Any
from urllib.parse import quote

import ftfy
import pandas as pd
import requests
from asreview import ASReviewData
from requests.exceptions import ConnectTimeout
from requests.exceptions import HTTPError
from tqdm import tqdm

_SPACES_REGEX = re.compile(r'\s+')
_SYMBOLS_REGEX = re.compile(r'[^ \w\d\-_]')
_SEQ_MATCHER = SequenceMatcher()


def _fetch_doi(
    title: str,
    authors: None | str = None,
    verbose: bool = False,
) -> None | dict[str, Any]:
    # https://www.crossref.org/documentation/retrieve-metadata/xml-api/retrieving-dois-by-title/
    # URL-encode the title as well, not only the authors
    quoted_title = quote(title, safe='')
    if authors is None:
        url = f"https://api.crossref.org/works?rows=1&query.title={quoted_title}" \
              "&select=title,DOI"
    else:
        url = f"https://api.crossref.org/works?rows=1&query.title={quoted_title}" \
              "&select=title,DOI,author" \
              f"&query.bibliographic={quote(authors, safe='')}"

    try:
        # requests.get must be inside the try block for ConnectTimeout to be caught
        response = requests.get(url)
        response.raise_for_status()

    except ConnectTimeout as e:
        if verbose:
            tqdm.write(f'Timeout for {title}.\n{e}')

        raise e

    except HTTPError:
        if authors is None:
            if verbose:
                tqdm.write(f'Could not fetch doi for {title}')

            return None

        # retry with a title-only query before giving up
        url = f"https://api.crossref.org/works?rows=1&query.title={quoted_title}" \
              "&select=title,DOI"

        try:
            response = requests.get(url)
            response.raise_for_status()

        except ConnectTimeout as e:
            if verbose:
                tqdm.write(f'Timeout for {title}.\n{e}')

            raise e

        except HTTPError:
            if verbose:
                tqdm.write(f'Could not fetch doi for {title}')

            return None

    return response.json()


def _confirm_doi_title(
    title: str,
    title_from_api: str,
    data: dict[str, Any],
    similarity: float,
    strict_similarity: bool,
    verbose: bool,
) -> None | str:
    # normalize both titles: lowercase, drop symbols, collapse whitespace
    clean_title = _SYMBOLS_REGEX.sub('', title.lower())
    clean_title = _SPACES_REGEX.sub(' ', clean_title)

    clean_title_from_api = _SYMBOLS_REGEX.sub('', title_from_api.lower())
    clean_title_from_api = _SPACES_REGEX.sub(' ', clean_title_from_api)

    _SEQ_MATCHER.set_seq1(clean_title)
    _SEQ_MATCHER.set_seq2(clean_title_from_api)

    # cheap upper bounds first; the exact ratio is only checked in strict mode
    if _SEQ_MATCHER.real_quick_ratio() > similarity and \
            _SEQ_MATCHER.quick_ratio() > similarity and \
            (not strict_similarity or _SEQ_MATCHER.ratio() > similarity):

        doi = data['message']['items'][0]['DOI']

        if verbose:
            tqdm.write(f'Doi found for {title}: {doi}')

        return doi

    if verbose:
        tqdm.write(f'No doi found for {title}')

    return None


def find_dois(
        asdata: ASReviewData,
        delay: int = 750,
        similarity: float = 0.95,
        strict_similarity: bool = False,
        verbose: bool = False) -> None:
    titles = asdata.df['title'].apply(ftfy.fix_text).str.strip()

    if 'authors' in asdata.df.columns:
        authors = asdata.df['authors'].apply(ftfy.fix_text).str.strip()
    else:
        authors = None

    delay_in_seconds = delay / 1000
    dois = []

    for i, title in enumerate(tqdm(titles, desc="Finding DOIs")):
        if authors is not None:
            # positional access; the dataframe index may not start at 0
            data = _fetch_doi(title, authors.iloc[i], verbose)
        else:
            data = _fetch_doi(title, None, verbose)

        if data is None:
            dois.append(None)
            continue

        try:
            title_from_api = ftfy.fix_text(data['message']['items'][0]['title'][0])

        except (IndexError, KeyError):
            # no items returned, or the item has no title field
            if verbose:
                tqdm.write(f'No doi found for {title}')

            dois.append(None)
            continue

        doi = _confirm_doi_title(
            title,
            title_from_api,
            data,
            similarity,
            strict_similarity,
            verbose,
        )

        dois.append(doi)

        # sleep for delay_in_seconds plus jitter to avoid hammering the API
        sleep(delay_in_seconds + random())

    # if a 'doi' column already exists, merge, preferring the existing values
    if 'doi' in asdata.df.columns:
        asdata.df['doi'] = asdata.df['doi'].combine_first(
            pd.Series(dois, index=asdata.df.index))
    else:
        asdata.df['doi'] = dois
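
For reference, the new `find_dois` helper can also be called directly from Python. A minimal sketch, assuming an ASReview-compatible dataset with a `title` column (the file names are placeholders):

```python
from asreview import load_data

from asreviewcontrib.datatools.doi import find_dois

# load any ASReview-compatible dataset with a 'title' column
asdata = load_data("MY_DATASET.csv")

# fill in missing DOIs in place; the parameters mirror the CLI flags
find_dois(asdata, delay=750, similarity=0.95, strict_similarity=False, verbose=True)

asdata.to_file("output_with_dois.csv")
```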
74 changes: 73 additions & 1 deletion asreviewcontrib/datatools/entrypoint.py
@@ -10,14 +10,15 @@
from asreviewcontrib.datatools.convert import convert
from asreviewcontrib.datatools.describe import _parse_arguments_describe
from asreviewcontrib.datatools.describe import describe
from asreviewcontrib.datatools.doi import find_dois
from asreviewcontrib.datatools.sample import _parse_arguments_sample
from asreviewcontrib.datatools.sample import sample
from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
from asreviewcontrib.datatools.snowball import snowball
from asreviewcontrib.datatools.stack import _parse_arguments_vstack
from asreviewcontrib.datatools.stack import vstack

DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
DATATOOLS = ["describe", "dedup", "doi", "convert", "compose", "vstack", "snowball", "sample"]


class DataEntryPoint(BaseEntryPoint):
@@ -89,6 +90,77 @@ def execute(self, argv):
f"Found {n_dup} duplicates in dataset with"
f" {initial_length} records."
)
if argv[0] == "doi":
doi_parser = argparse.ArgumentParser(prog="asreview data doi")
doi_parser.add_argument(
"input_path", type=str, help="The file path of the dataset."
)
doi_parser.add_argument(
"--output_path",
"-o",
default=None,
type=str,
help="The file path of the dataset.",
)
doi_parser.add_argument(
"--delay",
default=750,
type=int,
help="Delay between requests in milliseconds. Default: 750.",
)
doi_parser.add_argument(
"--threshold",
default=0.95,
type=float,
help="Similarity threshold for deduplication. Default: 0.95.",
)
doi_parser.add_argument(
"--strict_similarity",
action='store_true',
help="Use a more strict similarity for deduplication.",
)
doi_parser.add_argument(
"--verbose",
action='store_true',
help="Print verbose output.",
)

args_doi = doi_parser.parse_args(argv[1:])

# read data in ASReview data object
asdata = load_data(args_doi.input_path)

if 'doi' in asdata.df.columns:
previous_dois = len(asdata.df) - asdata.df['doi'].isna().sum()
print(f"Dataset already contains dois for {previous_dois} entries. "
"Adding missing dois.")

else:
print("Dataset does not contain dois. Adding dois.")
previous_dois = 0

find_dois(
asdata,
args_doi.delay,
args_doi.threshold,
args_doi.strict_similarity,
args_doi.verbose,
)

added_dois = len(asdata.df) - asdata.df['doi'].isna().sum() - previous_dois

if args_doi.output_path:
asdata.to_file(args_doi.output_path)
print(
f"Added doi for {added_dois} records in dataset with"
f" {len(asdata.df)} records."
)
else:
print(
f"Found doi for {added_dois} records in dataset with"
f" {len(asdata.df)} records."
)

if argv[0] == "compose":
args_compose_parser = _parse_arguments_compose()
args_compose = args_compose_parser.parse_args(argv[1:])
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ classifiers = [
"Programming Language :: Python :: 3.11"
]
license = {text = "MIT License"}
dependencies = ["asreview>=1.1,<2", "pandas", "pyalex"]
dependencies = ["asreview>=1.1,<2", "ftfy", "pandas", "pyalex", "tqdm"]
dynamic = ["version"]
requires-python = ">=3.8"
