From 1a6eb0c53a558404db98fc110907a49a89405c80 Mon Sep 17 00:00:00 2001
From: Sylvain Lesage
Date: Fri, 1 Apr 2022 18:39:23 +0200
Subject: [PATCH] remove "gated datasets unlock" logic (#189)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refactor: 💡 move gated datasets "unlock" code to models/

also: add two tests to ensure the gated datasets can be accessed

* test: 💍 adapt to new version of dummy_gated dataset

I changed severo/dummy_gated
(https://huggingface.co/datasets/severo/dummy_gated/commit/99194748bed3625a941aaf785740df02ca5762c9)
to a simpler dataset, without a Python script, to avoid unrelated errors.

Also in this commit: load HF_TOKEN from a secret in
https://github.com/huggingface/datasets-preview-backend/settings/secrets/actions
so that the unit tests can run.

* test: 💍 fix wrong hardcoded value

* chore: 🤖 ignore safety warning on ujson package

ujson is a dependency of lm-dataformat, and the latest lm-dataformat
release still depends on a vulnerable ujson version

* feat: 🎸 remove the "ask_access" logic for gated datasets

the new "app" tokens on moonlanding can read gated datasets without
having to accept the conditions first, as users must

BREAKING CHANGE: 🧨 HF_TOKEN must be an app token
---
 .github/workflows/quality.yml            |  2 +-
 .github/workflows/unit-tests.yml         |  1 +
 Makefile                                 |  2 +-
 README.md                                |  2 +-
 poetry.lock                              | 68 ++++++++++---------
 pyproject.toml                           |  2 +-
 src/datasets_preview_backend/config.py   |  2 +
 src/datasets_preview_backend/io/cache.py |  9 ---
 .../models/dataset.py                    |  1 +
 .../models/hf_dataset.py                 | 11 ---
 tests/models/test_dataset.py             | 11 +++
 tests/models/test_split.py               | 12 +++-
 12 files changed, 65 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml
index 94c45feacd..9fb202db47 100644
--- a/.github/workflows/quality.yml
+++ b/.github/workflows/quality.yml
@@ -35,5 +35,5 @@ jobs:
       - name: Run bandit
         run: poetry run bandit -r src
       - name: Run safety
-        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
         # ^^ safety exceptions: pillow, numpy
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index cc6182a5be..6b64dcfbe2 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -32,6 +32,7 @@ jobs:
           ROWS_MAX_NUMBER: 5
           MONGO_CACHE_DATABASE: datasets_preview_cache_test
           MONGO_QUEUE_DATABASE: datasets_preview_queue_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
       - uses: codecov/codecov-action@v2
         with:
diff --git a/Makefile b/Makefile
index 7abc378041..41ee6857e2 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ quality:
 	poetry run flake8 tests src
 	poetry run mypy tests src
 	poetry run bandit -r src
-	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
 	# ^^ safety exceptions: pillow, numpy
 
 # Format source code automatically
diff --git a/README.md b/README.md
index 5ed1c691c6..aa60bf19b2 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ Every `WORKER_SLEEP_SECONDS` (defaults to 5 seconds) when idle, the worker will
 - the memory (RAM + SWAP) on the machine is below `MAX_MEMORY_PCT` (defaults to 60%)
 - the number of started jobs for the same dataset is under `MAX_JOBS_PER_DATASET`
 
-Also specify `HF_TOKEN` with a User Access Token (see https://huggingface.co/settings/token, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
+Also specify `HF_TOKEN` with an App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
 
 Also specify `MAX_SIZE_FALLBACK` with the maximum size in bytes of the dataset to fallback in normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`.
 
diff --git a/poetry.lock b/poetry.lock
index dddbb9dca8..dcba2b344e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -446,13 +446,13 @@ xxhash = "*"
 
 apache-beam = ["apache-beam (>=2.26.0)"]
 audio = ["librosa"]
 benchmarks = ["numpy (==1.18.5)", "tensorflow (==2.3.0)", "torch (==1.6.0)", "transformers (==3.0.2)"]
-dev = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[server,s3] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)", "importlib-resources"]
+dev = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[s3,server] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)", "importlib-resources"]
 docs = ["s3fs"]
 quality = ["black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)"]
 s3 = ["fsspec", "boto3", "botocore", "s3fs"]
 tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)"]
 tensorflow_gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"]
(==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "importlib-resources"] +tests = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[s3,server] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "importlib-resources"] torch = ["torch"] vision = ["Pillow (>=6.2.1)"] @@ -621,7 +621,7 @@ python-versions = ">=3.7" [[package]] name = "fsspec" -version = "2022.2.0" +version = "2022.3.0" description = "File-system specification" category = "main" optional = false @@ -652,6 +652,7 @@ s3 = ["s3fs"] sftp = ["paramiko"] smb = ["smbprotocol"] ssh = ["paramiko"] +tqdm = ["tqdm"] [[package]] name = "function-parser" @@ -1022,7 +1023,7 @@ python-versions = ">=3.7,<3.11" [[package]] name = "lm-dataformat" -version = "0.0.19" +version = "0.0.20" description = "A utility for storing and reading files for LM training." 
category = "main" optional = false @@ -2471,7 +2472,7 @@ standard = ["websockets (>=9.1)", "httptools (>=0.2.0,<0.3.0)", "watchgod (>=0.6 [[package]] name = "watchdog" -version = "2.1.6" +version = "2.1.7" description = "Filesystem events monitoring" category = "main" optional = false @@ -2559,7 +2560,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "3.9.6" -content-hash = "7fd8e8d999cde4fc105a4a36f842cd31d59f55d1fbe175ad5bca912967b82eb6" +content-hash = "dd82aacae83a234b5bcafe7298a54b4df9b1f20494e4209b34f935b9a07383a7" [metadata.files] absl-py = [ @@ -3178,8 +3179,8 @@ frozenlist = [ {file = "frozenlist-1.3.0.tar.gz", hash = "sha256:ce6f2ba0edb7b0c1d8976565298ad2deba6f8064d2bebb6ffce2ca896eb35b0b"}, ] fsspec = [ - {file = "fsspec-2022.2.0-py3-none-any.whl", hash = "sha256:eb9c9d9aee49d23028deefffe53e87c55d3515512c63f57e893710301001449a"}, - {file = "fsspec-2022.2.0.tar.gz", hash = "sha256:20322c659538501f52f6caa73b08b2ff570b7e8ea30a86559721d090e473ad5c"}, + {file = "fsspec-2022.3.0-py3-none-any.whl", hash = "sha256:a53491b003210fce6911dd8f2d37e20c41a27ce52a655eef11b885d1578ed4cf"}, + {file = "fsspec-2022.3.0.tar.gz", hash = "sha256:fd582cc4aa0db5968bad9317cae513450eddd08b2193c4428d9349265a995523"}, ] function-parser = [ {file = "function_parser-0.0.3-py3-none-any.whl", hash = "sha256:c09e4ddb1d9c7783cf5ec7aac72d858f16565552135854844948a67861a15571"}, @@ -3375,8 +3376,8 @@ llvmlite = [ {file = "llvmlite-0.38.0.tar.gz", hash = "sha256:a99d166ccf3b116f3b9ed23b9b70ba2415640a9c978f3aaa13fad49c58f4965c"}, ] lm-dataformat = [ - {file = "lm_dataformat-0.0.19-py3-none-any.whl", hash = "sha256:d05bebb6e885bfd4861516f8eca6baa90487e9ffb81b790448d9609866ca2e1f"}, - {file = "lm_dataformat-0.0.19.tar.gz", hash = "sha256:04fed4405a0eaf9b18f59051476e6e9511759cf27818b5ed67694c5b6f2fe41a"}, + {file = "lm_dataformat-0.0.20-py3-none-any.whl", hash = "sha256:247468181c9c2fea33a663cdb2f6fea489ddf6741d216fe6b466e60f002705af"}, + {file = "lm_dataformat-0.0.20.tar.gz", hash = "sha256:0016165b34d8f004753ac265348c3525532e55088f6c9c160f3597e660207145"}, ] lxml = [ {file = "lxml-4.8.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b"}, @@ -4874,29 +4875,30 @@ uvicorn = [ {file = "uvicorn-0.14.0.tar.gz", hash = "sha256:45ad7dfaaa7d55cab4cd1e85e03f27e9d60bc067ddc59db52a2b0aeca8870292"}, ] watchdog = [ - {file = "watchdog-2.1.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9693f35162dc6208d10b10ddf0458cc09ad70c30ba689d9206e02cd836ce28a3"}, - {file = "watchdog-2.1.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aba5c812f8ee8a3ff3be51887ca2d55fb8e268439ed44110d3846e4229eb0e8b"}, - {file = "watchdog-2.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4ae38bf8ba6f39d5b83f78661273216e7db5b00f08be7592062cb1fc8b8ba542"}, - {file = "watchdog-2.1.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ad6f1796e37db2223d2a3f302f586f74c72c630b48a9872c1e7ae8e92e0ab669"}, - {file = "watchdog-2.1.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:922a69fa533cb0c793b483becaaa0845f655151e7256ec73630a1b2e9ebcb660"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b2fcf9402fde2672545b139694284dc3b665fd1be660d73eca6805197ef776a3"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3386b367e950a11b0568062b70cc026c6f645428a698d33d39e013aaeda4cc04"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:8f1c00aa35f504197561060ca4c21d3cc079ba29cf6dd2fe61024c70160c990b"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b52b88021b9541a60531142b0a451baca08d28b74a723d0c99b13c8c8d48d604"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8047da932432aa32c515ec1447ea79ce578d0559362ca3605f8e9568f844e3c6"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e92c2d33858c8f560671b448205a268096e17870dcf60a9bb3ac7bfbafb7f5f9"}, - {file = "watchdog-2.1.6-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b7d336912853d7b77f9b2c24eeed6a5065d0a0cc0d3b6a5a45ad6d1d05fb8cd8"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_aarch64.whl", hash = "sha256:cca7741c0fcc765568350cb139e92b7f9f3c9a08c4f32591d18ab0a6ac9e71b6"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_armv7l.whl", hash = "sha256:25fb5240b195d17de949588628fdf93032ebf163524ef08933db0ea1f99bd685"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_i686.whl", hash = "sha256:be9be735f827820a06340dff2ddea1fb7234561fa5e6300a62fe7f54d40546a0"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_ppc64.whl", hash = "sha256:d0d19fb2441947b58fbf91336638c2b9f4cc98e05e1045404d7a4cb7cddc7a65"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:3becdb380d8916c873ad512f1701f8a92ce79ec6978ffde92919fd18d41da7fb"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_s390x.whl", hash = "sha256:ae67501c95606072aafa865b6ed47343ac6484472a2f95490ba151f6347acfc2"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e0f30db709c939cabf64a6dc5babb276e6d823fd84464ab916f9b9ba5623ca15"}, - {file = "watchdog-2.1.6-py3-none-win32.whl", hash = "sha256:e02794ac791662a5eafc6ffeaf9bcc149035a0e48eb0a9d40a8feb4622605a3d"}, - {file = "watchdog-2.1.6-py3-none-win_amd64.whl", hash = "sha256:bd9ba4f332cf57b2c1f698be0728c020399ef3040577cde2939f2e045b39c1e5"}, - {file = "watchdog-2.1.6-py3-none-win_ia64.whl", hash = "sha256:a0f1c7edf116a12f7245be06120b1852275f9506a7d90227648b250755a03923"}, - {file = "watchdog-2.1.6.tar.gz", hash = "sha256:a36e75df6c767cbf46f61a91c70b3ba71811dfa0aca4a324d9407a06a8b7a2e7"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:177bae28ca723bc00846466016d34f8c1d6a621383b6caca86745918d55c7383"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d1cf7dfd747dec519486a98ef16097e6c480934ef115b16f18adb341df747a4"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7f14ce6adea2af1bba495acdde0e510aecaeb13b33f7bd2f6324e551b26688ca"}, + {file = "watchdog-2.1.7-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4d0e98ac2e8dd803a56f4e10438b33a2d40390a72750cff4939b4b274e7906fa"}, + {file = "watchdog-2.1.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:81982c7884aac75017a6ecc72f1a4fedbae04181a8665a34afce9539fc1b3fab"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0b4a1fe6201c6e5a1926f5767b8664b45f0fcb429b62564a41f490ff1ce1dc7a"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6e6ae29b72977f2e1ee3d0b760d7ee47896cb53e831cbeede3e64485e5633cc8"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b9777664848160449e5b4260e0b7bc1ae0f6f4992a8b285db4ec1ef119ffa0e2"}, + {file = "watchdog-2.1.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:19b36d436578eb437e029c6b838e732ed08054956366f6dd11875434a62d2b99"}, + {file = 
"watchdog-2.1.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b61acffaf5cd5d664af555c0850f9747cc5f2baf71e54bbac164c58398d6ca7b"}, + {file = "watchdog-2.1.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1e877c70245424b06c41ac258023ea4bd0c8e4ff15d7c1368f17cd0ae6e351dd"}, + {file = "watchdog-2.1.7-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d802d65262a560278cf1a65ef7cae4e2bc7ecfe19e5451349e4c67e23c9dc420"}, + {file = "watchdog-2.1.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b3750ee5399e6e9c69eae8b125092b871ee9e2fcbd657a92747aea28f9056a5c"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_aarch64.whl", hash = "sha256:ed6d9aad09a2a948572224663ab00f8975fae242aa540509737bb4507133fa2d"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_armv7l.whl", hash = "sha256:b26e13e8008dcaea6a909e91d39b629a39635d1a8a7239dd35327c74f4388601"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_i686.whl", hash = "sha256:0908bb50f6f7de54d5d31ec3da1654cb7287c6b87bce371954561e6de379d690"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_ppc64.whl", hash = "sha256:bdcbf75580bf4b960fb659bbccd00123d83119619195f42d721e002c1621602f"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:81a5861d0158a7e55fe149335fb2bbfa6f48cbcbd149b52dbe2cd9a544034bbd"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_s390x.whl", hash = "sha256:03b43d583df0f18782a0431b6e9e9965c5b3f7cf8ec36a00b930def67942c385"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ae934e34c11aa8296c18f70bf66ed60e9870fcdb4cc19129a04ca83ab23e7055"}, + {file = "watchdog-2.1.7-py3-none-win32.whl", hash = "sha256:49639865e3db4be032a96695c98ac09eed39bbb43fe876bb217da8f8101689a6"}, + {file = "watchdog-2.1.7-py3-none-win_amd64.whl", hash = "sha256:340b875aecf4b0e6672076a6f05cfce6686935559bb6d34cebedee04126a9566"}, + {file = "watchdog-2.1.7-py3-none-win_ia64.whl", hash = "sha256:351e09b6d9374d5bcb947e6ac47a608ec25b9d70583e9db00b2fcdb97b00b572"}, + {file = "watchdog-2.1.7.tar.gz", hash = "sha256:3fd47815353be9c44eebc94cc28fe26b2b0c5bd889dafc4a5a7cbdf924143480"}, ] werkzeug = [ {file = "Werkzeug-2.0.3-py3-none-any.whl", hash = "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8"}, diff --git a/pyproject.toml b/pyproject.toml index add117be99..3e18ea7d79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ function-parser = "^0.0.3" gdown = "^4.2.0" kenlm = { url = "https://github.com/kpu/kenlm/archive/master.zip" } kss = "^2.6.0" -lm-dataformat = "^0.0.19" +lm-dataformat = "^0.0.20" lxml = "^4.6.3" mongo-types = "0.15.1" mongoengine = "^0.23.1" diff --git a/src/datasets_preview_backend/config.py b/src/datasets_preview_backend/config.py index 5e8964ed4d..bba7fa3c68 100644 --- a/src/datasets_preview_backend/config.py +++ b/src/datasets_preview_backend/config.py @@ -8,6 +8,7 @@ DEFAULT_ASSETS_DIRECTORY, DEFAULT_DATASETS_ENABLE_PRIVATE, DEFAULT_DATASETS_REVISION, + DEFAULT_HF_TOKEN, DEFAULT_LOG_LEVEL, DEFAULT_MAX_AGE_LONG_SECONDS, DEFAULT_MAX_AGE_SHORT_SECONDS, @@ -36,6 +37,7 @@ d=os.environ, key="DATASETS_ENABLE_PRIVATE", default=DEFAULT_DATASETS_ENABLE_PRIVATE ) DATASETS_REVISION = get_str_value(d=os.environ, key="DATASETS_REVISION", default=DEFAULT_DATASETS_REVISION) +HF_TOKEN = get_str_or_none_value(d=os.environ, key="HF_TOKEN", default=DEFAULT_HF_TOKEN) LOG_LEVEL = get_str_value(d=os.environ, key="LOG_LEVEL", default=DEFAULT_LOG_LEVEL) MAX_AGE_LONG_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_LONG_SECONDS", 
 MAX_AGE_SHORT_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_SHORT_SECONDS", default=DEFAULT_MAX_AGE_SHORT_SECONDS)
diff --git a/src/datasets_preview_backend/io/cache.py b/src/datasets_preview_backend/io/cache.py
index 9da76c962d..73378f9bc5 100644
--- a/src/datasets_preview_backend/io/cache.py
+++ b/src/datasets_preview_backend/io/cache.py
@@ -43,7 +43,6 @@
     SplitFullName,
     get_dataset_split_full_names,
 )
-from datasets_preview_backend.models.hf_dataset import ask_access
 from datasets_preview_backend.models.split import Split, get_split
 from datasets_preview_backend.utils import orjson_dumps
 
@@ -359,10 +358,6 @@ def clean_database() -> None:
 
 
 def refresh_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split_full_names = get_dataset_split_full_names(dataset_name, hf_token)
         upsert_dataset(dataset_name, split_full_names)
@@ -420,10 +415,6 @@ def refresh_split(
     hf_token: Optional[str] = None,
     max_size_fallback: Optional[int] = None,
 ):
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split = get_split(
             dataset_name, config_name, split_name, hf_token=hf_token, max_size_fallback=max_size_fallback
diff --git a/src/datasets_preview_backend/models/dataset.py b/src/datasets_preview_backend/models/dataset.py
index 838b18ff5d..e116cb3cc1 100644
--- a/src/datasets_preview_backend/models/dataset.py
+++ b/src/datasets_preview_backend/models/dataset.py
@@ -18,6 +18,7 @@ class SplitFullName(TypedDict):
 
 def get_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
     logger.info(f"get dataset '{dataset_name}' split full names")
+
     try:
         guard_blocked_datasets(dataset_name)
         return [
diff --git a/src/datasets_preview_backend/models/hf_dataset.py b/src/datasets_preview_backend/models/hf_dataset.py
index b613951dfb..fd7c1b13fc 100644
--- a/src/datasets_preview_backend/models/hf_dataset.py
+++ b/src/datasets_preview_backend/models/hf_dataset.py
@@ -1,7 +1,6 @@
 import logging
 from typing import List, TypedDict, Union
 
-import requests
 from datasets import list_datasets
 
 logger = logging.getLogger(__name__)
@@ -32,15 +31,5 @@ def get_hf_datasets() -> List[HFDataset]:
     ]
 
 
-def ask_access(dataset_name: str, hf_token: str) -> None:
-    url = f"https://huggingface.co/datasets/{dataset_name}/ask-access"
-    headers = {"Authorization": f"Bearer {hf_token}"}
-    try:
-        requests.get(url, headers=headers)
-    except Exception as err:
-        logger.warning(f"error while asking access to dataset {dataset_name}: {err}")
-    # TODO: check if the access was granted: check if we were redirected to the dataset page, or to the login page
-
-
 def get_hf_dataset_names() -> List[str]:
     return [d["id"] for d in get_hf_datasets()]
diff --git a/tests/models/test_dataset.py b/tests/models/test_dataset.py
index f8b9c77648..fb19764e35 100644
--- a/tests/models/test_dataset.py
+++ b/tests/models/test_dataset.py
@@ -1,5 +1,6 @@
 import pytest
 
+from datasets_preview_backend.config import HF_TOKEN
 from datasets_preview_backend.exceptions import Status400Error
 from datasets_preview_backend.models.dataset import get_dataset_split_full_names
 
@@ -51,3 +52,13 @@ def test_splits_fallback() -> None:
     split_full_names = get_dataset_split_full_names("hda_nli_hindi")
     assert len(split_full_names) == 3
     assert {"dataset_name": "hda_nli_hindi", "config_name": "HDA nli hindi", "split_name": "train"} in split_full_names
"HDA nli hindi", "split_name": "train"} in split_full_names + + +def test_gated() -> None: + split_full_names = get_dataset_split_full_names("severo/dummy_gated", HF_TOKEN) + assert len(split_full_names) == 1 + assert { + "dataset_name": "severo/dummy_gated", + "config_name": "severo--embellishments", + "split_name": "train", + } in split_full_names diff --git a/tests/models/test_split.py b/tests/models/test_split.py index d8482266d4..7f0c5f723c 100644 --- a/tests/models/test_split.py +++ b/tests/models/test_split.py @@ -1,7 +1,7 @@ +from datasets_preview_backend.config import HF_TOKEN, ROWS_MAX_NUMBER from datasets_preview_backend.models.split import get_split # TODO: test fallback -# TODO: test token def test_get_split() -> None: @@ -12,3 +12,13 @@ def test_get_split() -> None: assert split["num_bytes"] == 7792803 assert split["num_examples"] == 14006 + + +def test_gated() -> None: + dataset_name = "severo/dummy_gated" + config_name = "severo--embellishments" + split_name = "train" + split = get_split(dataset_name, config_name, split_name, HF_TOKEN) + + assert len(split["rows"]) == ROWS_MAX_NUMBER + assert split["rows"][0]["year"] == "1855"