removed deprecated --wildcards option and replaced example in README.md to use bzgrep instead; also changed positional argument to CONTENT_LOG_FILE while making UPDATED_SINCE a required option
bjoern-reetz committed Jan 27, 2022
1 parent c8852b5 commit 15dd3ba
Showing 2 changed files with 45 additions and 48 deletions.
29 changes: 16 additions & 13 deletions README.md
@@ -5,14 +5,17 @@ A reference implementation for processing the content.log files found at opendata.dwd.de
Example usage:

```
-CONTENT_LOG_URL=https://opendata.dwd.de/weather/nwp/content.log.bz2
-wget $CONTENT_LOG_URL
-bunzip2 content.log.bz2
-grep "/icon-d2/grib/../t_2m/.*_icosahedral_.*\.grib2\.bz2" content.log > reduced_content.log
-./get_updated_files.py -f reduced_content.log -b $CONTENT_LOG_URL 2022-01-27T03:00 > updated_files.txt
+CONTENT_LOG_URL="https://opendata.dwd.de/weather/nwp/content.log.bz2"
+PATTERN="/icon-d2/grib/03/t_2m/.*_icosahedral_"
+LAST_RUN_AT=$(date -ud 00:00 -Ihours)
+wget $CONTENT_LOG_URL -O content.log.bz2
+bzgrep $PATTERN content.log.bz2 | ./get_updated_files.py -b $CONTENT_LOG_URL -u $LAST_RUN_AT > updated_files.txt
+wget -i updated_files.txt
```

-The produced file `updated_files.txt` will hold hyperlinks to files that are updated since the given date-time according
+Running the program above will download all updated files into the current working directory. The produced file
+`updated_files.txt` will hold hyperlinks to files that are updated since the given date-time according
to the file's modification date found in content.log.

Also mind that there are multiple servers behind https://opendata.dwd.de which might not be exactly in sync with each
@@ -24,20 +24,20 @@ to process the data reference time that is contained in the filenames instead.

```
$ ./get_updated_files.py --help
-usage: get_updated_files.py [-h] [--content-log CONTENT_LOG_FILE] [--wildcards WILDCARDS_FILE] [--min-delta MIN_DELTA] [--version] UPDATED_SINCE
+usage: get_updated_files.py [-h] --updated-since UPDATED_SINCE [--url-base URL_BASE] [--min-delta MIN_DELTA] [--version] [CONTENT_LOG_FILE [CONTENT_LOG_FILE ...]]
Filters paths of a DWD Open Data content.log file for entries that have been updated.
positional arguments:
-  UPDATED_SINCE         Last time files were checked for updates
+  CONTENT_LOG_FILE      The decompressed content.log file (default: STDIN)
optional arguments:
  -h, --help            show this help message and exit
-  --content-log CONTENT_LOG_FILE, -f CONTENT_LOG_FILE
-                        The decompressed content.log file (default: content.log)
-  --wildcards WILDCARDS_FILE, -w WILDCARDS_FILE
-                        Filter results by a set of wildcards
+  --updated-since UPDATED_SINCE, -u UPDATED_SINCE
+                        last time files were checked for updates
+  --url-base URL_BASE, -b URL_BASE
+                        resolve the paths taken from content.log relative to the given base URL; put the URL of the content.log.bz2 here to end up with correct hyperlinks to DWD's Open Data
  --min-delta MIN_DELTA, -d MIN_DELTA
-                        Minimum number of seconds a file needs to be younger than UPDATED_SINCE (default: 60)
+                        minimum number of seconds a file needs to be younger than UPDATED_SINCE (default: 60)
  --version             show program's version number and exit
```
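
The filtering documented above comes down to a simple per-line rule: split each content.log record on `|` (the script's comments note the `path|size|changed_at` scheme), keep entries whose modification time is more than MIN_DELTA seconds newer than UPDATED_SINCE, and resolve the relative path against the base URL. Below is a minimal sketch of that rule; the path, size, and timestamp are made up purely for illustration:

```
from datetime import datetime, timezone
from urllib.parse import urljoin

# Hypothetical content.log entry and settings, purely for illustration.
line = "./icon-d2/grib/03/t_2m/some_icosahedral_file.grib2.bz2|12345|2022-01-27T03:41:00"
url_base = "https://opendata.dwd.de/weather/nwp/content.log.bz2"
updated_since = datetime.fromisoformat("2022-01-27T00:00+00:00")
min_delta = 60  # extra seconds to tolerate mirrors that are slightly out of sync

# Each line follows the scheme "path|size|changed_at"; content.log times are treated as UTC.
path, size, changed_at = line.strip().split("|")
changed_at = datetime.fromisoformat(f"{changed_at}+00:00")

if (changed_at - updated_since.astimezone(timezone.utc)).total_seconds() > min_delta:
    # Resolving the relative path against the content.log.bz2 URL yields the download link.
    print(urljoin(url_base, path))
```

The entry is printed as an absolute URL because its modification time lies more than 60 seconds after the given UPDATED_SINCE.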
64 changes: 29 additions & 35 deletions get_updated_files.py
@@ -2,56 +2,50 @@

import argparse
import datetime
-import fnmatch
+import sys
from urllib.parse import urljoin

__VERSION__ = "1.1.0"

arg_parser = argparse.ArgumentParser(description="Filters paths of a DWD Open Data content.log file "
                                                 "for entries that have been updated.")
-arg_parser.add_argument("updated_since", metavar="UPDATED_SINCE",
+arg_parser.add_argument("content_log_files",
+                        nargs="*",
+                        default=[sys.stdin], type=argparse.FileType("r"),
+                        help="The decompressed content.log file (default: STDIN)",
+                        metavar="CONTENT_LOG_FILE")
+arg_parser.add_argument("--updated-since", "-u",
                        type=datetime.datetime.fromisoformat,
-                        help="Last time files were checked for updates")
-arg_parser.add_argument("--content-log", "-f", metavar="CONTENT_LOG_FILE", default="content.log",
-                        type=argparse.FileType("r"),
-                        help="The decompressed content.log file (default: content.log)")
+                        required=True,
+                        help="last time files were checked for updates")
arg_parser.add_argument("--url-base", "-b",
-                        help="Resolve the paths taken from content.log relative to the given base URL; "
+                        help="resolve the paths taken from content.log relative to the given base URL; "
                             "put the URL of the content.log.bz2 here to end up with correct hyperlinks "
                             "to DWD's Open Data")
-arg_parser.add_argument("--min-delta", "-d", default=60,
-                        type=int,
-                        help="Minimum number of seconds a file needs to be younger than UPDATED_SINCE "
-                             "(default: 60)")
-arg_parser.add_argument("--wildcards", "-w", metavar="WILDCARDS_FILE",
-                        type=argparse.FileType("r"),
-                        help=argparse.SUPPRESS)
+arg_parser.add_argument("--min-delta", "-d",
+                        default=60, type=int,
+                        help="minimum number of seconds a file needs to be younger than UPDATED_SINCE (default: 60)")
arg_parser.add_argument('--version', action='version', version=f'%(prog)s {__VERSION__}')


def main():
    args = arg_parser.parse_args()
-    wildcards = "".join(args.wildcards.readlines()).split() if args.wildcards is not None else []
-
-    for line in args.content_log:
-        # skip all lines that do not match any specified wildcard
-        # wildcard = [] is falsy will skip the conditional block
-        # this is a DEPRECATED feature and the option is hidden in --help
-        # as a replacement, use grep in beforehand to reduce content.log to the relevant lines
-        if wildcards and not any(fnmatch.fnmatch(line, wildcard) for wildcard in wildcards):
-            continue
-        # each line is of the scheme "path|size|changed_at"
-        path, size, changed_at = line.strip().split("|")
-        changed_at = datetime.datetime.fromisoformat(changed_at)
-        # print paths of files that have been updated since UPDATED_SINCE
-        # but require an extra MIN_DELTA seconds
-        # because behind the scenes there are two separate servers answering to opendata.dwd.de
-        # which might not be exactly in sync with each other
-        if (changed_at - args.updated_since).total_seconds() > args.min_delta:
-            if args.url_base:
-                print(urljoin(args.url_base, path))
-            else:
-                print(path)
+    updated_since = args.updated_since.astimezone(datetime.timezone.utc)
+
+    for content_log_file in args.content_log_files:
+        for line in content_log_file:
+            # each line is of the scheme "path|size|changed_at"
+            path, size, changed_at = line.strip().split("|")
+            changed_at = datetime.datetime.fromisoformat(f"{changed_at}+00:00")
+            # print paths of files that have been updated since UPDATED_SINCE
+            # but require an extra MIN_DELTA seconds
+            # because behind the scenes there are two separate servers answering to opendata.dwd.de
+            # which might not be exactly in sync with each other
+            if (changed_at - updated_since).total_seconds() > args.min_delta:
+                if args.url_base:
+                    print(urljoin(args.url_base, path))
+                else:
+                    print(path)


if __name__ == "__main__":
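
As the README above notes, several servers answer behind https://opendata.dwd.de and may not be perfectly in sync, so a stricter approach is to key off the data reference time contained in the filenames rather than the modification date. The exact filename layout is not part of this commit, so the sketch below only assumes a YYYYMMDDHH run timestamp somewhere in the name; the regular expression, helper name, and example path are all hypothetical placeholders to adapt:

```
import re
from datetime import datetime, timezone
from typing import Optional

# Assumed filename layout: a 10-digit YYYYMMDDHH model-run timestamp delimited by underscores.
# The real DWD naming may differ, so treat this pattern as a placeholder.
RUN_TIMESTAMP = re.compile(r"_(\d{10})_")


def reference_time(path: str) -> Optional[datetime]:
    """Return the model run time encoded in a filename, or None if no timestamp is found."""
    match = RUN_TIMESTAMP.search(path)
    if match is None:
        return None
    return datetime.strptime(match.group(1), "%Y%m%d%H").replace(tzinfo=timezone.utc)


# Hypothetical path, for illustration only.
print(reference_time("./icon-d2/grib/03/t_2m/icon-d2_icosahedral_2022012703_000_t_2m.grib2.bz2"))
# -> 2022-01-27 03:00:00+00:00
```

The run time recovered this way can then be compared against the last processed model run instead of against UPDATED_SINCE.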
