From 1a6eb0c53a558404db98fc110907a49a89405c80 Mon Sep 17 00:00:00 2001
From: Sylvain Lesage
Date: Fri, 1 Apr 2022 18:39:23 +0200
Subject: [PATCH] remove "gated datasets unlock" logic (#189)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refactor: 💡 move gated datasets "unlock" code to models/

also: add two tests to ensure the gated datasets can be accessed

* test: 💍 adapt to new version of dummy_gated dataset

I changed severo/dummy_gated
(https://huggingface.co/datasets/severo/dummy_gated/commit/99194748bed3625a941aaf785740df02ca5762c9)
to a simpler dataset, without a Python script, to avoid unrelated errors.

Also in this commit: load HF_TOKEN from a secret in
https://github.com/huggingface/datasets-preview-backend/settings/secrets/actions
so that the unit tests can run.

* test: 💍 fix wrong hardcoded value

* chore: 🤖 ignore safety warning on ujson package

ujson is a dependency of lm-dataformat, and the latest lm-dataformat
release still depends on a vulnerable ujson version

* feat: 🎸 remove the "ask_access" logic for gated datasets

the new "app" tokens on moonlanding can read gated datasets without
having to accept the conditions first, as users must

BREAKING CHANGE: 🧨 HF_TOKEN must be an app token
---
 .github/workflows/quality.yml            |  2 +-
 .github/workflows/unit-tests.yml         |  1 +
 Makefile                                 |  2 +-
 README.md                                |  2 +-
 poetry.lock                              | 68 ++++++++++---------
 pyproject.toml                           |  2 +-
 src/datasets_preview_backend/config.py   |  2 +
 src/datasets_preview_backend/io/cache.py |  9 ---
 .../models/dataset.py                    |  1 +
 .../models/hf_dataset.py                 | 11 ---
 tests/models/test_dataset.py             | 11 +++
 tests/models/test_split.py               | 12 +++-
 12 files changed, 65 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml
index 94c45feacd..9fb202db47 100644
--- a/.github/workflows/quality.yml
+++ b/.github/workflows/quality.yml
@@ -35,5 +35,5 @@ jobs:
       - name: Run bandit
         run: poetry run bandit -r src
       - name: Run safety
-        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
         # ^^ safety exceptions: pillow, numpy
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index cc6182a5be..6b64dcfbe2 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -32,6 +32,7 @@ jobs:
           ROWS_MAX_NUMBER: 5
           MONGO_CACHE_DATABASE: datasets_preview_cache_test
           MONGO_QUEUE_DATABASE: datasets_preview_queue_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
       - uses: codecov/codecov-action@v2
         with:
diff --git a/Makefile b/Makefile
index 7abc378041..41ee6857e2 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ quality:
 	poetry run flake8 tests src
 	poetry run mypy tests src
 	poetry run bandit -r src
-	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
 	# ^^ safety exceptions: pillow, numpy
 
 # Format source code automatically
diff --git a/README.md b/README.md
index 5ed1c691c6..aa60bf19b2 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ Every `WORKER_SLEEP_SECONDS` (defaults to 5 seconds) when idle, the worker will
 - the memory (RAM + SWAP) on the machine is below `MAX_MEMORY_PCT` (defaults to 60%)
 - the number of started jobs for the same dataset is under `MAX_JOBS_PER_DATASET`
 
-Also specify `HF_TOKEN` with a User Access Token (see https://huggingface.co/settings/token, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
+Also specify `HF_TOKEN` with an App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
 
 Also specify `MAX_SIZE_FALLBACK` with the maximum size in bytes of the dataset to fallback in normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`.
 
diff --git a/poetry.lock b/poetry.lock
index dddbb9dca8..dcba2b344e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -446,13 +446,13 @@ xxhash = "*"
 
 apache-beam = ["apache-beam (>=2.26.0)"]
 audio = ["librosa"]
 benchmarks = ["numpy (==1.18.5)", "tensorflow (==2.3.0)", "torch (==1.6.0)", "transformers (==3.0.2)"]
-dev = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[server,s3] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)", "importlib-resources"]
+dev = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[s3,server] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)", "importlib-resources"]
 docs = ["s3fs"]
 quality = ["black (>=22.0,<23.0)", "flake8 (>=3.8.3)", "isort (>=5.0.0)", "pyyaml (>=5.3.1)"]
 s3 = ["fsspec", "boto3", "botocore", "s3fs"]
 tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)"]
 tensorflow_gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"]
(==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "importlib-resources"] +tests = ["absl-py", "pytest", "pytest-datadir", "pytest-xdist", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "aiobotocore", "boto3", "botocore", "faiss-cpu (>=1.6.4)", "fsspec", "moto[s3,server] (==2.0.4)", "rarfile (>=4.0)", "s3fs (==2021.08.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "torch", "torchaudio", "soundfile", "transformers", "bs4", "conllu", "h5py", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "tldextract", "zstandard", "bert-score (>=0.3.6)", "rouge-score", "sacrebleu", "scipy", "seqeval", "scikit-learn", "jiwer", "sentencepiece", "torchmetrics (==0.6.0)", "mauve-text", "toml (>=0.10.1)", "requests-file (>=1.5.1)", "tldextract (>=3.1.0)", "texttable (>=1.6.3)", "Werkzeug (>=1.0.1)", "six (>=1.15.0,<1.16.0)", "Pillow (>=6.2.1)", "librosa", "wget (>=3.2)", "pytorch-nlp (==0.5.0)", "pytorch-lightning", "fastBPE (==0.1.0)", "fairseq", "importlib-resources"] torch = ["torch"] vision = ["Pillow (>=6.2.1)"] @@ -621,7 +621,7 @@ python-versions = ">=3.7" [[package]] name = "fsspec" -version = "2022.2.0" +version = "2022.3.0" description = "File-system specification" category = "main" optional = false @@ -652,6 +652,7 @@ s3 = ["s3fs"] sftp = ["paramiko"] smb = ["smbprotocol"] ssh = ["paramiko"] +tqdm = ["tqdm"] [[package]] name = "function-parser" @@ -1022,7 +1023,7 @@ python-versions = ">=3.7,<3.11" [[package]] name = "lm-dataformat" -version = "0.0.19" +version = "0.0.20" description = "A utility for storing and reading files for LM training." 
category = "main" optional = false @@ -2471,7 +2472,7 @@ standard = ["websockets (>=9.1)", "httptools (>=0.2.0,<0.3.0)", "watchgod (>=0.6 [[package]] name = "watchdog" -version = "2.1.6" +version = "2.1.7" description = "Filesystem events monitoring" category = "main" optional = false @@ -2559,7 +2560,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "3.9.6" -content-hash = "7fd8e8d999cde4fc105a4a36f842cd31d59f55d1fbe175ad5bca912967b82eb6" +content-hash = "dd82aacae83a234b5bcafe7298a54b4df9b1f20494e4209b34f935b9a07383a7" [metadata.files] absl-py = [ @@ -3178,8 +3179,8 @@ frozenlist = [ {file = "frozenlist-1.3.0.tar.gz", hash = "sha256:ce6f2ba0edb7b0c1d8976565298ad2deba6f8064d2bebb6ffce2ca896eb35b0b"}, ] fsspec = [ - {file = "fsspec-2022.2.0-py3-none-any.whl", hash = "sha256:eb9c9d9aee49d23028deefffe53e87c55d3515512c63f57e893710301001449a"}, - {file = "fsspec-2022.2.0.tar.gz", hash = "sha256:20322c659538501f52f6caa73b08b2ff570b7e8ea30a86559721d090e473ad5c"}, + {file = "fsspec-2022.3.0-py3-none-any.whl", hash = "sha256:a53491b003210fce6911dd8f2d37e20c41a27ce52a655eef11b885d1578ed4cf"}, + {file = "fsspec-2022.3.0.tar.gz", hash = "sha256:fd582cc4aa0db5968bad9317cae513450eddd08b2193c4428d9349265a995523"}, ] function-parser = [ {file = "function_parser-0.0.3-py3-none-any.whl", hash = "sha256:c09e4ddb1d9c7783cf5ec7aac72d858f16565552135854844948a67861a15571"}, @@ -3375,8 +3376,8 @@ llvmlite = [ {file = "llvmlite-0.38.0.tar.gz", hash = "sha256:a99d166ccf3b116f3b9ed23b9b70ba2415640a9c978f3aaa13fad49c58f4965c"}, ] lm-dataformat = [ - {file = "lm_dataformat-0.0.19-py3-none-any.whl", hash = "sha256:d05bebb6e885bfd4861516f8eca6baa90487e9ffb81b790448d9609866ca2e1f"}, - {file = "lm_dataformat-0.0.19.tar.gz", hash = "sha256:04fed4405a0eaf9b18f59051476e6e9511759cf27818b5ed67694c5b6f2fe41a"}, + {file = "lm_dataformat-0.0.20-py3-none-any.whl", hash = "sha256:247468181c9c2fea33a663cdb2f6fea489ddf6741d216fe6b466e60f002705af"}, + {file = "lm_dataformat-0.0.20.tar.gz", hash = "sha256:0016165b34d8f004753ac265348c3525532e55088f6c9c160f3597e660207145"}, ] lxml = [ {file = "lxml-4.8.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b"}, @@ -4874,29 +4875,30 @@ uvicorn = [ {file = "uvicorn-0.14.0.tar.gz", hash = "sha256:45ad7dfaaa7d55cab4cd1e85e03f27e9d60bc067ddc59db52a2b0aeca8870292"}, ] watchdog = [ - {file = "watchdog-2.1.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9693f35162dc6208d10b10ddf0458cc09ad70c30ba689d9206e02cd836ce28a3"}, - {file = "watchdog-2.1.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aba5c812f8ee8a3ff3be51887ca2d55fb8e268439ed44110d3846e4229eb0e8b"}, - {file = "watchdog-2.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4ae38bf8ba6f39d5b83f78661273216e7db5b00f08be7592062cb1fc8b8ba542"}, - {file = "watchdog-2.1.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ad6f1796e37db2223d2a3f302f586f74c72c630b48a9872c1e7ae8e92e0ab669"}, - {file = "watchdog-2.1.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:922a69fa533cb0c793b483becaaa0845f655151e7256ec73630a1b2e9ebcb660"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b2fcf9402fde2672545b139694284dc3b665fd1be660d73eca6805197ef776a3"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3386b367e950a11b0568062b70cc026c6f645428a698d33d39e013aaeda4cc04"}, - {file = "watchdog-2.1.6-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:8f1c00aa35f504197561060ca4c21d3cc079ba29cf6dd2fe61024c70160c990b"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b52b88021b9541a60531142b0a451baca08d28b74a723d0c99b13c8c8d48d604"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8047da932432aa32c515ec1447ea79ce578d0559362ca3605f8e9568f844e3c6"}, - {file = "watchdog-2.1.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e92c2d33858c8f560671b448205a268096e17870dcf60a9bb3ac7bfbafb7f5f9"}, - {file = "watchdog-2.1.6-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b7d336912853d7b77f9b2c24eeed6a5065d0a0cc0d3b6a5a45ad6d1d05fb8cd8"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_aarch64.whl", hash = "sha256:cca7741c0fcc765568350cb139e92b7f9f3c9a08c4f32591d18ab0a6ac9e71b6"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_armv7l.whl", hash = "sha256:25fb5240b195d17de949588628fdf93032ebf163524ef08933db0ea1f99bd685"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_i686.whl", hash = "sha256:be9be735f827820a06340dff2ddea1fb7234561fa5e6300a62fe7f54d40546a0"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_ppc64.whl", hash = "sha256:d0d19fb2441947b58fbf91336638c2b9f4cc98e05e1045404d7a4cb7cddc7a65"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:3becdb380d8916c873ad512f1701f8a92ce79ec6978ffde92919fd18d41da7fb"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_s390x.whl", hash = "sha256:ae67501c95606072aafa865b6ed47343ac6484472a2f95490ba151f6347acfc2"}, - {file = "watchdog-2.1.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e0f30db709c939cabf64a6dc5babb276e6d823fd84464ab916f9b9ba5623ca15"}, - {file = "watchdog-2.1.6-py3-none-win32.whl", hash = "sha256:e02794ac791662a5eafc6ffeaf9bcc149035a0e48eb0a9d40a8feb4622605a3d"}, - {file = "watchdog-2.1.6-py3-none-win_amd64.whl", hash = "sha256:bd9ba4f332cf57b2c1f698be0728c020399ef3040577cde2939f2e045b39c1e5"}, - {file = "watchdog-2.1.6-py3-none-win_ia64.whl", hash = "sha256:a0f1c7edf116a12f7245be06120b1852275f9506a7d90227648b250755a03923"}, - {file = "watchdog-2.1.6.tar.gz", hash = "sha256:a36e75df6c767cbf46f61a91c70b3ba71811dfa0aca4a324d9407a06a8b7a2e7"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:177bae28ca723bc00846466016d34f8c1d6a621383b6caca86745918d55c7383"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d1cf7dfd747dec519486a98ef16097e6c480934ef115b16f18adb341df747a4"}, + {file = "watchdog-2.1.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7f14ce6adea2af1bba495acdde0e510aecaeb13b33f7bd2f6324e551b26688ca"}, + {file = "watchdog-2.1.7-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4d0e98ac2e8dd803a56f4e10438b33a2d40390a72750cff4939b4b274e7906fa"}, + {file = "watchdog-2.1.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:81982c7884aac75017a6ecc72f1a4fedbae04181a8665a34afce9539fc1b3fab"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0b4a1fe6201c6e5a1926f5767b8664b45f0fcb429b62564a41f490ff1ce1dc7a"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6e6ae29b72977f2e1ee3d0b760d7ee47896cb53e831cbeede3e64485e5633cc8"}, + {file = "watchdog-2.1.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b9777664848160449e5b4260e0b7bc1ae0f6f4992a8b285db4ec1ef119ffa0e2"}, + {file = "watchdog-2.1.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:19b36d436578eb437e029c6b838e732ed08054956366f6dd11875434a62d2b99"}, + {file = 
"watchdog-2.1.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b61acffaf5cd5d664af555c0850f9747cc5f2baf71e54bbac164c58398d6ca7b"}, + {file = "watchdog-2.1.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1e877c70245424b06c41ac258023ea4bd0c8e4ff15d7c1368f17cd0ae6e351dd"}, + {file = "watchdog-2.1.7-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d802d65262a560278cf1a65ef7cae4e2bc7ecfe19e5451349e4c67e23c9dc420"}, + {file = "watchdog-2.1.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b3750ee5399e6e9c69eae8b125092b871ee9e2fcbd657a92747aea28f9056a5c"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_aarch64.whl", hash = "sha256:ed6d9aad09a2a948572224663ab00f8975fae242aa540509737bb4507133fa2d"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_armv7l.whl", hash = "sha256:b26e13e8008dcaea6a909e91d39b629a39635d1a8a7239dd35327c74f4388601"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_i686.whl", hash = "sha256:0908bb50f6f7de54d5d31ec3da1654cb7287c6b87bce371954561e6de379d690"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_ppc64.whl", hash = "sha256:bdcbf75580bf4b960fb659bbccd00123d83119619195f42d721e002c1621602f"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:81a5861d0158a7e55fe149335fb2bbfa6f48cbcbd149b52dbe2cd9a544034bbd"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_s390x.whl", hash = "sha256:03b43d583df0f18782a0431b6e9e9965c5b3f7cf8ec36a00b930def67942c385"}, + {file = "watchdog-2.1.7-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ae934e34c11aa8296c18f70bf66ed60e9870fcdb4cc19129a04ca83ab23e7055"}, + {file = "watchdog-2.1.7-py3-none-win32.whl", hash = "sha256:49639865e3db4be032a96695c98ac09eed39bbb43fe876bb217da8f8101689a6"}, + {file = "watchdog-2.1.7-py3-none-win_amd64.whl", hash = "sha256:340b875aecf4b0e6672076a6f05cfce6686935559bb6d34cebedee04126a9566"}, + {file = "watchdog-2.1.7-py3-none-win_ia64.whl", hash = "sha256:351e09b6d9374d5bcb947e6ac47a608ec25b9d70583e9db00b2fcdb97b00b572"}, + {file = "watchdog-2.1.7.tar.gz", hash = "sha256:3fd47815353be9c44eebc94cc28fe26b2b0c5bd889dafc4a5a7cbdf924143480"}, ] werkzeug = [ {file = "Werkzeug-2.0.3-py3-none-any.whl", hash = "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8"}, diff --git a/pyproject.toml b/pyproject.toml index add117be99..3e18ea7d79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ function-parser = "^0.0.3" gdown = "^4.2.0" kenlm = { url = "https://github.com/kpu/kenlm/archive/master.zip" } kss = "^2.6.0" -lm-dataformat = "^0.0.19" +lm-dataformat = "^0.0.20" lxml = "^4.6.3" mongo-types = "0.15.1" mongoengine = "^0.23.1" diff --git a/src/datasets_preview_backend/config.py b/src/datasets_preview_backend/config.py index 5e8964ed4d..bba7fa3c68 100644 --- a/src/datasets_preview_backend/config.py +++ b/src/datasets_preview_backend/config.py @@ -8,6 +8,7 @@ DEFAULT_ASSETS_DIRECTORY, DEFAULT_DATASETS_ENABLE_PRIVATE, DEFAULT_DATASETS_REVISION, + DEFAULT_HF_TOKEN, DEFAULT_LOG_LEVEL, DEFAULT_MAX_AGE_LONG_SECONDS, DEFAULT_MAX_AGE_SHORT_SECONDS, @@ -36,6 +37,7 @@ d=os.environ, key="DATASETS_ENABLE_PRIVATE", default=DEFAULT_DATASETS_ENABLE_PRIVATE ) DATASETS_REVISION = get_str_value(d=os.environ, key="DATASETS_REVISION", default=DEFAULT_DATASETS_REVISION) +HF_TOKEN = get_str_or_none_value(d=os.environ, key="HF_TOKEN", default=DEFAULT_HF_TOKEN) LOG_LEVEL = get_str_value(d=os.environ, key="LOG_LEVEL", default=DEFAULT_LOG_LEVEL) MAX_AGE_LONG_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_LONG_SECONDS", 
 MAX_AGE_SHORT_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_SHORT_SECONDS", default=DEFAULT_MAX_AGE_SHORT_SECONDS)
diff --git a/src/datasets_preview_backend/io/cache.py b/src/datasets_preview_backend/io/cache.py
index 9da76c962d..73378f9bc5 100644
--- a/src/datasets_preview_backend/io/cache.py
+++ b/src/datasets_preview_backend/io/cache.py
@@ -43,7 +43,6 @@
     SplitFullName,
     get_dataset_split_full_names,
 )
-from datasets_preview_backend.models.hf_dataset import ask_access
 from datasets_preview_backend.models.split import Split, get_split
 from datasets_preview_backend.utils import orjson_dumps
 
@@ -359,10 +358,6 @@ def clean_database() -> None:
 
 
 def refresh_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split_full_names = get_dataset_split_full_names(dataset_name, hf_token)
         upsert_dataset(dataset_name, split_full_names)
@@ -420,10 +415,6 @@ def refresh_split(
     hf_token: Optional[str] = None,
     max_size_fallback: Optional[int] = None,
 ):
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split = get_split(
             dataset_name, config_name, split_name, hf_token=hf_token, max_size_fallback=max_size_fallback
diff --git a/src/datasets_preview_backend/models/dataset.py b/src/datasets_preview_backend/models/dataset.py
index 838b18ff5d..e116cb3cc1 100644
--- a/src/datasets_preview_backend/models/dataset.py
+++ b/src/datasets_preview_backend/models/dataset.py
@@ -18,6 +18,7 @@ class SplitFullName(TypedDict):
 
 def get_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
     logger.info(f"get dataset '{dataset_name}' split full names")
+
     try:
         guard_blocked_datasets(dataset_name)
         return [
diff --git a/src/datasets_preview_backend/models/hf_dataset.py b/src/datasets_preview_backend/models/hf_dataset.py
index b613951dfb..fd7c1b13fc 100644
--- a/src/datasets_preview_backend/models/hf_dataset.py
+++ b/src/datasets_preview_backend/models/hf_dataset.py
@@ -1,7 +1,6 @@
 import logging
 from typing import List, TypedDict, Union
 
-import requests
 from datasets import list_datasets
 
 logger = logging.getLogger(__name__)
@@ -32,15 +31,5 @@ def get_hf_datasets() -> List[HFDataset]:
     ]
 
 
-def ask_access(dataset_name: str, hf_token: str) -> None:
-    url = f"https://huggingface.co/datasets/{dataset_name}/ask-access"
-    headers = {"Authorization": f"Bearer {hf_token}"}
-    try:
-        requests.get(url, headers=headers)
-    except Exception as err:
-        logger.warning(f"error while asking access to dataset {dataset_name}: {err}")
-    # TODO: check if the access was granted: check if we were redirected to the dataset page, or to the login page
-
-
 def get_hf_dataset_names() -> List[str]:
     return [d["id"] for d in get_hf_datasets()]
diff --git a/tests/models/test_dataset.py b/tests/models/test_dataset.py
index f8b9c77648..fb19764e35 100644
--- a/tests/models/test_dataset.py
+++ b/tests/models/test_dataset.py
@@ -1,5 +1,6 @@
 import pytest
 
+from datasets_preview_backend.config import HF_TOKEN
 from datasets_preview_backend.exceptions import Status400Error
 from datasets_preview_backend.models.dataset import get_dataset_split_full_names
 
@@ -51,3 +52,13 @@ def test_splits_fallback() -> None:
     split_full_names = get_dataset_split_full_names("hda_nli_hindi")
     assert len(split_full_names) == 3
     assert {"dataset_name": "hda_nli_hindi", "config_name": "HDA nli hindi", "split_name": "train"} in split_full_names
"HDA nli hindi", "split_name": "train"} in split_full_names + + +def test_gated() -> None: + split_full_names = get_dataset_split_full_names("severo/dummy_gated", HF_TOKEN) + assert len(split_full_names) == 1 + assert { + "dataset_name": "severo/dummy_gated", + "config_name": "severo--embellishments", + "split_name": "train", + } in split_full_names diff --git a/tests/models/test_split.py b/tests/models/test_split.py index d8482266d4..7f0c5f723c 100644 --- a/tests/models/test_split.py +++ b/tests/models/test_split.py @@ -1,7 +1,7 @@ +from datasets_preview_backend.config import HF_TOKEN, ROWS_MAX_NUMBER from datasets_preview_backend.models.split import get_split # TODO: test fallback -# TODO: test token def test_get_split() -> None: @@ -12,3 +12,13 @@ def test_get_split() -> None: assert split["num_bytes"] == 7792803 assert split["num_examples"] == 14006 + + +def test_gated() -> None: + dataset_name = "severo/dummy_gated" + config_name = "severo--embellishments" + split_name = "train" + split = get_split(dataset_name, config_name, split_name, HF_TOKEN) + + assert len(split["rows"]) == ROWS_MAX_NUMBER + assert split["rows"][0]["year"] == "1855"