gdrive.py

from pathlib import Path
import pickle
from typing import Dict, List, Optional

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import gdown

from .base import BaseExporter
from . import RegionalExporter

class GDriveExporter(BaseExporter):
    r"""
    An exporter to download data from Google Drive
    """

    dataset = "gdrive"  # we will only save the token here
    scopes = ["https://www.googleapis.com/auth/drive.metadata.readonly"]
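    # Only the read-only metadata scope is requested: the Drive API is used
    # below to list matching files, while the file contents are fetched by
    # gdown through each file's drive.google.com/uc URL rather than through
    # the API itself.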

    def __init__(self, data_folder: Path = Path("data")) -> None:
        super().__init__(data_folder)

        assert (self.output_folder / "credentials.json").exists(), (
            f"Enable the google drive API at this link: "
            f"https://developers.google.com/drive/api/v3/quickstart/python "
            f"to use this class. Save the credentials.json at {self.output_folder}"
        )

        # https://developers.google.com/drive/api/v3/quickstart/python
        creds = None
        token_path = self.output_folder / "token.pickle"
        if token_path.exists():
            with token_path.open("rb") as f:
                creds = pickle.load(f)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.output_folder / "credentials.json", self.scopes
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with token_path.open("wb") as token:
                pickle.dump(creds, token)

        self.service = build("drive", "v3", credentials=creds)

    def export(self, region_name: str, max_downloads: Optional[int] = None) -> None:
        r"""
        Download data from Google Drive. This is useful when downloading data exported
        by the regional exporter, as the file sizes can be large.

        :param region_name: The name of the downloaded region. The exporter will search
            for this string in the Google Drive files to filter which files to download.
        :param max_downloads: The maximum number of files to download. If None, all tiff
            files containing region_name are downloaded.
        """
        query = f'(fullText contains "{region_name}") and (mimeType = "image/tiff")'

        file_info: List[Dict] = []

        results = (
            self.service.files()
            .list(pageSize=10, q=query, fields="nextPageToken, files(id, name)")
            .execute()
        )
        items = results.get("files", [])
        file_info.extend(items)
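        # files().list returns at most pageSize results per call; keep
        # requesting pages with the returned nextPageToken until it is absent.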
        next_page = results.get("nextPageToken", None)
        while next_page is not None:
            results = (
                self.service.files()
                .list(
                    pageSize=10,
                    pageToken=next_page,
                    # https://stackoverflow.com/questions/47402545/
                    # google-drive-js-api-nextpagetoken-invalid
                    q=query,
                    fields="nextPageToken, files(id, name)",
                )
                .execute()
            )
            items = results.get("files", [])
            file_info.extend(items)
            next_page = results.get("nextPageToken", None)
print(f"Downloading {len(file_info)} files")
for idx, individual_file in enumerate(file_info):
if (max_downloads is not None) and (idx >= max_downloads):
return None
print(f"Downloading {individual_file['name']}")
url = f"https://drive.google.com/uc?id={individual_file['id']}"
download_path = (
self.raw_folder / RegionalExporter.dataset / individual_file["name"]
)
if download_path.exists():
print(f"File already exists! Skipping")
continue
gdown.download(url, str(download_path), quiet=False)
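

if __name__ == "__main__":
    # Usage sketch, not part of the original module: assumes a `data` folder
    # containing data/gdrive/credentials.json, and that "Togo" is a
    # hypothetical region name previously exported by the RegionalExporter.
    exporter = GDriveExporter(data_folder=Path("data"))
    exporter.export(region_name="Togo", max_downloads=5)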