From 4418c4a63accb45ca28743e2bd7f0b7c494cfc64 Mon Sep 17 00:00:00 2001 From: bpoullio Date: Fri, 20 Sep 2019 16:24:25 -0600 Subject: [PATCH] updated canvas data sync, added script to pull exams by course per semester --- .../python/pullexams_bycourse-README.txt | 22 ++ .../python/pullexams_bycourse.py | 272 +++++++++++++++ .../python/canvas_data_sync-README.txt | 10 +- .../python/canvas_data_sync.py | 328 ++++++------------ .../python/canvasfuncs/candata.py | 108 ++++++ .../python/canvasfuncs/hmacsig.py | 59 ++++ 6 files changed, 571 insertions(+), 228 deletions(-) create mode 100644 api/pull_course_quizzes/python/pullexams_bycourse-README.txt create mode 100755 api/pull_course_quizzes/python/pullexams_bycourse.py create mode 100644 canvas_data/sync_canvas_data/python/canvasfuncs/candata.py create mode 100644 canvas_data/sync_canvas_data/python/canvasfuncs/hmacsig.py diff --git a/api/pull_course_quizzes/python/pullexams_bycourse-README.txt b/api/pull_course_quizzes/python/pullexams_bycourse-README.txt new file mode 100644 index 0000000..444eb9f --- /dev/null +++ b/api/pull_course_quizzes/python/pullexams_bycourse-README.txt @@ -0,0 +1,22 @@ +# pullexams_bycourse.py +# +# Usage: python3 pullexams_bycourse.py [prod|test] <term> [<term> ...] +# +# Outputs: CSV of exam info with course info to cross-reference +# +# Args: Requires a target (test or prod) and any number of terms +# Note that terms must match the SIS ID for term in Canvas +# See: https://canvas.instructure.com/doc/api/enrollment_terms.html +# +# Outline: 1. Request and document all courses matching criteria specified +# 2. Request and document all quiz info for courses from step 1 +# 3. Check quiz due dates against current date to filter further +# 4. 
Write remaining available quizzes to file +# +# General advice: * Most replacement should happen between <> +# * When you see {} do not remove w/o removing matching .format +# * Careful changing things, infinite loops are possible +# +# Author: Brandon Poulliot +# +# Works as of 9/20/19 diff --git a/api/pull_course_quizzes/python/pullexams_bycourse.py b/api/pull_course_quizzes/python/pullexams_bycourse.py new file mode 100755 index 0000000..d81027e --- /dev/null +++ b/api/pull_course_quizzes/python/pullexams_bycourse.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pullexams_bycourse.py +# +# Usage: python3 pullexams_bycourse.py [prod|test] <term> [<term> ...] +# +# Outputs: CSV of exam info with course info to cross-reference +# +# Args: Requires a target (test or prod) and any number of terms +# Note that terms must match the SIS ID for term in Canvas +# See: https://canvas.instructure.com/doc/api/enrollment_terms.html +# +# Outline: 1. Request and document all courses matching criteria specified +# 2. Request and document all quiz info for courses from step 1 +# 3. Check quiz due dates against current date to filter further +# 4. 
Write remaining available quizzes to file +# +# General advice: * Most replacement should happen between <> +# * When you see {} do not remove w/o removing matching .format +# * Careful changing things, infinite loops are possible +# +# Author: Brandon Poulliot +# +# Works as of 9/20/19 + +# standard libraries +from datetime import datetime +import json +from os.path import join +from sys import argv, exit +import csv +import re + +# non-standard libraries +import requests + +############################################################################### +############################## CHANGE THESE ############################### +############################################################################### + +# set datetime info -- change format to suit +print('Starting exam pull now {:%Y-%m-%dT%H:%M:%S}'.format( + datetime.now())) + +# set a regex expression to verify your terms passed in -- see examples +# also https://regexr.com/ can help building regex to match +# MUST UNCOMMENT THESE TO VERIFY TERMS ARGS PASSED IN + +#term_regex = '^(SP|SU|FA)2[0-9]{3}$' # match format of SP2020 or FA2030 + # case sensitive -- thru 2999 + +#term_regex = '^2[0-9]{2}(1|4|7)$' # match 4-digit term starting w/ 2, middle + # digits are 0-9, last digit is 1, 4, or 7 + +#term_regex = '^2[0-9]{3}\/(SP|SU|FA)$' # match year '/' term format thru 2999 + # e.g., 2020/FA or 2031/SU thru 2999 + +# Canvas and auth info +canvas_domain = '{}.instructure.com' +token = '' +test_token = '' + +# set the output path for quiz file +prod_out = '' +test_out = '' + +# change the name of the quiz file to suit +quiz_fname = '-{}.csv' +quiz_headers = 'course_name,course_code,quiz_name,unlock_date,due_date,lock_date\n' + +# courses request parameters -- default: active w/ enrollments & not completed +account = 1 # subaccount ID +per_page = 100 # results per page, most cases limit is 100 +do_enrollments = 'true' # exclude courses w/o enrollments +do_published = 'true' # exclude unpublished courses 
+do_completed = 'false' # exclude completed courses +do_term = 'sis_term_id:{}' # search by semester -- use your SIS term ID +# if using, uncomment here and in params manifest (see line XXX) +#do_etype = '' # teacher, student, ta, observer, or designer +#do_blueprint = '' # t/f only include blueprint parents +#do_associated = '' # t/f only include blueprint children +#do_teachers = # int list of teacher user IDs to filter by +#do_subaccounts = '' # int list of subaccount IDs to filter by +#do_state = '' # created, claimed, available, completed, deleted, all +#do_search = '' # partial course name, code, or full ID +#do_include = '' # list of includes, see API docs +#do_sort = '' # by course_name, sis_course_id, teacher, account_name +#do_order = '' # sort 'asc' or 'desc' order +#do_filter = '' # by course or teacher, see API docs + + +############################################################################### +########################## DO NOT CHANGE THESE ############################ +############################################################################### + +# separate the args +args = [] +total_args = len(argv) + +i = 1 +while i < total_args: + print('Argument {}: {}'.format(i, argv[i])) + args.append(argv[i]) + i += 1 + +passed_args = len(args) + +if passed_args < 2: + print('''Not enough arguments supplied. 
\ + Syntax is: python3 pullexams_bycourse.py [prod|test] [2xxx].''') + exit('invalid arguments') + +# set environment variables based on first arg +if args[0] == 'prod': + target = 'prod' + env = '' + out_path = prod_out +elif args[0] == 'test': + target = 'test' + env = '.test' + token = test_token + out_path = test_out +else: + # will fail if no environment provided or not provided in correct order + target = None + print('Env arg invalid, exiting (should be prod/test), arg was: {}'.format( + args[0])) + exit('invalid argument') + +# set new quiz file path based on env out path and filename supplied +quizf = join(out_path, quiz_fname) + +# set request header info +headers = {'Authorization': 'Bearer {}'.format(token)} + +# set endpoint info +base_domain = 'https://{}/api/v1/{}'.format(canvas_domain.format(env), '{}') +course_uri = base_domain.format('accounts/{}/courses') +quiz_uri = base_domain.format('courses/{}/quizzes') + +# semesters check +terms = [] +i = 1 +while i < passed_args: + terms.append(args[i]) + i += 1 +print('Terms provided: {}'.format(terms)) + +for term in terms: + + # double-check quiz count + iq = 0 + + # verify that the terms provided are valid using regex + # comment out if term_regex is not set above or utilized + try: + term_regex + except NameError: + print('Terms not being verified, proceeding...') + else: + verify_term = bool(re.match(term_regex, term)) + if not verify_term: + print('Terms must be in {} format, please try again.'.format(term_regex)) + exit('invalid term format') + + # storage arrays + courses_a = [] + quiz_a = [] + quizzes_open = [] + + # params manifest, ensure all params specified are uncommented here too! 
+ params = { + 'with_enrollments': do_enrollments, + 'published': do_published, + 'completed': do_completed, + 'enrollment_term_id': do_term.format(term)#, + #'enrollment_type[]': do_etype, + #'blueprint': do_blueprint, + #'blueprint_associated': do_associated, + #'by_teacher[]': do_teachers, + #'by_subaccounts': do_subaccounts, + #'state[]': do_state, + #'search_term': do_search, + #'include[]': do_include, + #'sort': do_sort, + #'order': do_order, + #'search_by': do_filter + } + + + # get course IDs w/ criteria spec'd above + # default: published, not completed, has enrollments + pubcourse_r = requests.get(course_uri.format(account), headers=headers, + params=params, timeout=10) + # grab the json response + pubcourses = pubcourse_r.json() + + # for each course, add it to the courses array + ic = 0 + for course in pubcourses: + courses_a.append(course) + ic += 1 + # handle pagination, keep going until the last page + while pubcourse_r.links['current']['url'] != pubcourse_r.links['last']['url']: + pubcourse_r = requests.get(pubcourse_r.links['next']['url'], headers=headers, + params=params, timeout=10) + pubcourses = pubcourse_r.json() + + for course in pubcourses: + courses_a.append(course) + ic += 1 + print('Course count for {}: {}'.format(term, ic)) + print('Completed course manifest, pulling exams...') + + # send the biorobots to the roof, radiation limit 10s + for course in courses_a: + quiz_r = requests.get(quiz_uri.format(course['id']), headers=headers, + timeout=10) + # get the response of quizzes in spec'd course + quizzes = quiz_r.json() + + # add each quiz to the quizzes array + for quiz in quizzes: + quiz_a.append(quiz) + + while quiz_r.links['current']['url'] != quiz_r.links['last']['url']: + quiz_r = requests.get(quiz_r.links['next']['url'], headers=headers, + timeout=10) + quizzes = quiz_r.json() + for quiz in quizzes: + quiz_a.append(quiz) + + # check due date, lock date, unlock date (availability), add to quiz array + for quiz in quiz_a: + name = 
quiz['title'] + due = quiz['due_at'] + lock = quiz['lock_at'] + unlock = quiz['unlock_at'] + if due is not None: + due_date = datetime.strptime(due, '%Y-%m-%dT%H:%M:%SZ') + dt_check = datetime.utcnow() + available = due_date > dt_check + elif unlock is not None: + unlock_date = datetime.strptime(unlock, '%Y-%m-%dT%H:%M:%SZ') + available = unlock_date < dt_check + elif lock is not None: + lock_date = datetime.strptime(lock, '%Y-%m-%dT%H:%M:%SZ') + available = lock_date < dt_check + elif lock is None and unlock is None and due is None: + available = True + else: + available = False + if available: + row = '{},{},{}\n'.format(course['name'], course['sis_course_id'], + name, unlock, due, lock) + + quizzes_open.append(row) + iq += 1 + print('Quizzes added to manifest for {}: {}'.format(term, iq)) + print('Completed exams manifest, writing to file...') + + # open the quiz file, write each row, close it up + with open(quizf.format(term), 'w+') as qfile: + qfile.write(quiz_headers) + qrow = 0 + for row in quizzes_open: + qfile.write(row) + qrow += 1 + qfile.close() + print('Quizzes Written to File: {}'.format(qrow)) + diff --git a/canvas_data/sync_canvas_data/python/canvas_data_sync-README.txt b/canvas_data/sync_canvas_data/python/canvas_data_sync-README.txt index 915c3e0..23cf8de 100644 --- a/canvas_data/sync_canvas_data/python/canvas_data_sync-README.txt +++ b/canvas_data/sync_canvas_data/python/canvas_data_sync-README.txt @@ -1,4 +1,8 @@ -Words of WARNING: This will likely download a LOT of data to the output directory. Carefully consider if you have space (hundreds of GB most likely) to spare for the sync files as well as newly re-written files (e.g., data was added to "module_progression_fact" table, but not enough to warrant a whole new file) before they are removed. +Words of WARNING: This will likely download a LOT of data to the output +directory. 
Carefully consider if you have space (hundreds of GB most likely) +to spare for the sync files as well as newly re-written files (e.g., data was +added to "module_progression_fact" table, but not enough to warrant a whole new +file) before they are removed. # canvas_data_sync.py # Author: Brandon Poulliot @@ -47,7 +51,7 @@ Words of WARNING: This will likely download a LOT of data to the output director # Uses default python to write a JSON file with all info on # the first 100 data dumps after dump ID 345 # -# NOTES: + Working as of 3/4/19 +# NOTES: + Working as of 9/20/19 # + Left in "future-proofing" lines, do not uncomment until useful -# + Flat file extensions don't matter -- change at will +# + Flat file extensions don't matter -- change at will# + Schema/dump extenstions - keep JSON for syntax marks in text editors # + Schema/dump extenstions - keep JSON for syntax marks in text editors diff --git a/canvas_data/sync_canvas_data/python/canvas_data_sync.py b/canvas_data/sync_canvas_data/python/canvas_data_sync.py index 5e18e32..1df5ea3 100644 --- a/canvas_data/sync_canvas_data/python/canvas_data_sync.py +++ b/canvas_data/sync_canvas_data/python/canvas_data_sync.py @@ -8,8 +8,6 @@ # * Can provide latest Canvas Data schema in file # * Can provide more information on Canvas Data dumps in CSV file # -# Requirements: only non-standard library is REQUESTS -# # Script Map: # 1. Take in arguments and set API endpoint # 2. Create base-64-encoded HMAC-256 signature @@ -23,9 +21,8 @@ # 5. Remove the GZ archive files (options 3a and 3b) # 6. Remove files not present in most recent sync (option 3a) # -# Arguments: -# 1. endpoint - required and positional, must come directly after -# script invocation +# Arguments: 1. endpoint - required and positional, must come directly after +# script invocation # 2. 
-l (--limit) - optional, invoke using -l limit=# # Only used with dump option, specifies a limit to # the number of dump entries returned (default=50) @@ -33,22 +30,21 @@ # Only used with dump option, specifies the ID of # the dump to pull data after (i.e., ID > limit) # 4. -m (--method) - optional, invoke using -# -m (GET|DELETE|HEAD|OPTIONS|POST|PUT) -# Future-proofing this script when more methods -# become available (default=GET) +# -m (GET|DELETE|HEAD|OPTIONS|POST|PUT) +# Future-proofing this script when more methods +# become available (default=GET) # # Usage: Call from shell/cmd with preferred version and arguments -# Examples: -# 1. python3 canvas_data_sync.py sync -m GET +# Examples: 1. python3 canvas_data_sync.py sync -m GET # -# Uses python 3.x + GET method to sync all Canvas Data +# Uses python 3.x + GET method to sync all Canvas Data # # 2. python canvas_data_sync.py dump -l limit=100 -a after=345 # -# Uses default python to write a JSON file with all info on -# the first 100 data dumps after dump ID 345 -# -# NOTES: + Working as of 3/4/19 +# Uses default python to write a JSON file with all info on +# the first 100 data dumps after dump ID 345 +# +# NOTES: + Working as of 9/20/19 # + Left in "future-proofing" lines, do not uncomment until useful # + Flat file extensions don't matter -- change at will # + Schema/dump extenstions - keep JSON for syntax marks in text editors @@ -58,46 +54,48 @@ ############################################################################### # standard modules -import argparse -import base64 import gzip -import hashlib -import hmac import json import re import sys from datetime import datetime, timezone from os import listdir, remove from os.path import getsize, isfile, join -from urllib import parse # non-standard modules import requests +from canvasfuncs import hmacsig, candata ############################################################################### ################# User-Declared Variables -- CHANGE THESE! 
#################### ############################################################################### # set local timezone abbreviation to differentiate -local_timezone = '' +local_timezone = '' +params = '' # generate local timestamp (LT) for filenames dt_lt = datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone)) -# generate UTC timestamp for HMAC-256 signature -dt_now = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT') -# output dir *MUST* have trailing slash -# Example: /home/canvas/data-dumps/ -out_dir = '' +# output dir *MUST* have trailing slash followed by curly braces +# Example: /home/canvas/data-dumps/{} +out_dir = '' +schema_out = join(out_dir,'schema/') # your Canvas Data API key -- do NOT use secret here -api_key = '' +api_key = '' # now use your Canvas Data API secret! -cdata_secret = '' +cdata_secret = '' # filenames -- change as appropriate -schema_fname = '{}-canvas-data-schema.json'.format(dt_lt) -dump_fname = '{}-canvas_data_dumps.json'.format(dt_lt) -# set file extension for flat files -- recommend using default of blank -fext = '' +schema_fname = 'canvasdata-schema-{}.json' +dump_fname = '{}-canvasdata-dumps.json'.format(dt_lt) + +# schema notification settings +body = join(schema_out, 'schema_notify') +subj = 'Canvas Data Schema Changes {}'.format(dt_lt) +msg = '''New schema version {} for Canvas Data. 
\n\ + Please consult https://portal.inshosteddata.com/docs/api''' +whonotify = '' + # set block size for buffer as needed block_size = 8192 @@ -106,206 +104,85 @@ ################# API Call Information Gathering Section ###################### ############################################################################### -# init variables for API call parameters -raw_params = [] -params = '' - -# create argument parser to allow for command line arguments -parser = argparse.ArgumentParser(description='''Separate Canvas Data API call - components.''') - -# add arguments to parser, first is positional (must be 1st) and required -parser.add_argument('endpoint', - help='''Specify the endpoint of your API call: dump, sync, - latest, or schema.''') -parser.add_argument('-l', '--limit', - help='''Syntax is "limit=#", specifies how many records to - return. Only works with dump.''') -parser.add_argument('-a', '--after', - help='''Syntax is "after=#", specifies to pull only data - after dump number provided. Only works with dump.''') -# this one doesn't matter right now, only method available is GET -parser.add_argument('-m', '--method', default='GET', - help='''Future-proofing for possible new methods for - Canvas Data API. 
Currently, only method is GET.''') +# parse arguments +args = candata.parse(sys.argv) -# parse args from sys.argv into ParseResult object -args = parser.parse_args() - -# check that limit, after, and method all meet syntax requirements -if args.limit is not None: - limit_syntax = re.search('^limit\=\d+$', args.limit) - if limit_syntax is not None: - raw_params.append(args.limit) - -if args.after is not None: - after_syntax = re.search('^after\=\d+$', args.after) - if after_syntax is not None: - raw_params.append(args.after) - -# check that the HTTP method is acceptable +# check that method is correct syntax method_syntax = re.search('^GET$', args.method) -# Below for use only when more methods added to Canvas Data API + +# For use only when more methods added to Canvas Data API #method_syntax = re.search('^(GET|DELETE|HEAD|OPTIONS|POST|PUT)$', # args.method) # if the method is wrong, the call won't work, exit if method_syntax is None: -# Below for use only when more methods added to Canvas Data API -# print('''HTTP method is not valid, must be GET, DELETE, HEAD, OPTIONS, -# POST, or PUT. Exiting...''') print('HTTP method is not valid, must be GET. Exiting...') sys.exit('invalid method.') -# check the endpoint argument and set the API call URL accordingly -if args.endpoint.lower() == 'dump': - api_url = 'https://portal.inshosteddata.com/api/account/self/dump{}{}' -elif args.endpoint.lower() == 'sync': - api_url = 'https://portal.inshosteddata.com/api/account/self/file/sync' -elif args.endpoint.lower() == 'schema': - api_url = 'https://portal.inshosteddata.com/api/schema/latest' -elif args.endpoint.lower() == 'latest': - api_url = 'https://portal.inshosteddata.com/api/account/self/file/latest' -# TODO: Add byTable endpoint and args -# elif args.endpoint.lower() == 'bytable': - -else: - print('''Invalid argument, must be "dump", "latest", "sync", or "schema". 
- Exiting...''') - sys.exit('invalid request') - -# check if params set, sort alphabetically, join them, add to end of API call -if args.endpoint.lower() == 'dump': - if len(raw_params) > 0: - raw_params.sort() - params = '&'.join(raw_params) - call_url = api_url.format('?', params) - # remove curly braces -- although this doesn't seem to matter... - else: - call_url = api_url.strip('{}') -# if not using dump, don't add parameters -else: - call_url = api_url - -############################################################################### -###################### HMAC Signature Building Section ######################## -############################################################################### - -# break the call into components to build HMAC-256 signature -call_info = list(parse.urlparse(call_url)) +# print('''HTTP method is not valid, must be GET, DELETE, HEAD, OPTIONS, +# POST, or PUT. Exiting...''') -# set components for HMAC-256 signature -reqOpts = { - 'method' : args.method.upper(), - 'host' : call_info[1], - # intentionally blank - 'content_type' : '', - # intentionally blank - 'content_md5' : '', - 'path' : call_info[2], - 'parameters' : params, - 'req_timestamp' : dt_now, - 'api_secret' : cdata_secret - } +cdata_uri = 'https://portal.inshosteddata.com/api/{}' +ep_all = ['dump', 'sync', 'schema', 'latest'] -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# -############### DO NOT CHANGE ANYTHING IN THIS SUBSECTION ##################### -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!# +# check the endpoint argument +endpoints = [args.endpoint.lower()] -# build a bytes message by joining the signature components -message = bytes('\n'.join(str(x) for x in reqOpts.values()), 'utf-8') -# change the Canvas Data API secret to bytes -api_secb = bytes(reqOpts['api_secret'], 'utf-8') +if len([e for e in endpoints if e not in ep_all]) > 0: + print('''Invalid argument, must be "dump", "latest", "sync", 
or "schema". \ + Exiting...''') + sys.exit('invalid request') -# create an SHA-256 hashed HMAC object, then base 64 encode it -signed_msg = base64.b64encode(hmac.new(api_secb, message, - digestmod=hashlib.sha256).digest()) -# must be 'decoded'to utf-8 to get rid of byte marks (^,.,^) -signature = signed_msg.decode('utf-8') +# add a schema file to sync +if 'sync' in endpoints: + endpoints.append('schema') -# build auth headers from Canvas Data API key, HMAC-256 sig, and timestamp -auth_headers = { 'Authorization' : 'HMACAuth {}:{}'.format(api_key, signature), - 'Date' : '{}'.format(dt_now) } +# TODO: Add byTable endpoint and args ############################################################################### ############################# API Call Generation ############################# ############################################################################### # start the API call -print('Starting Canvas Data {} request...\nTimestamp: {}\n'.format(args.endpoint.lower(), dt_lt)) -start_call = requests.get(call_url, headers=auth_headers) -call_response = start_call.json() - -# initialize loop variables -fname = '' -file_path = '' -flat_file = '' -dl_url = '' -sync_files = [] - -############################################################################### -########################## CData SYNC Section ################################# -############################################################################### +for call in endpoints: + print('Starting Canvas Data {} request...\nTimestamp: {}\n'.format( + call, dt_lt)) + +# not terribly useful unless schema changes, writes schema to a file +if 'schema' in endpoints: + call_url = cdata_uri.format('schema/latest') + reqOpts = hmacsig.HMACopts(call_url, args.method, params, cdata_secret) + auth_headers = hmacsig.HMACsig(reqOpts, api_key) + start_call = requests.get(call_url, headers=auth_headers) + call_response = start_call.json() + file_path = join(schema_out, 
schema_fname.format(call_response['version'].replace('.', '-'))) + if not isfile(file_path): + old_schema = listdir(schema_out) + with open(file_path, 'w+') as schema_file: + call_json = json.dump(call_response, schema_file, indent=4) + schema_file.close() + for schema in old_schema: + remove(join(schema_out, schema)) + dt_complete = datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone)) + msg_detail = msg.format(call_response['version']) + print('Schema file written, check output directory.\n Completed: {}'.format(dt_complete)) + schema_notify = candata.notify(subj, body, msg_detail, whonotify) # main purpose of script -- syncs Canvas Data API files to output dir -if args.endpoint.lower() == 'sync': +if 'sync' in endpoints: + sync_files = [] + call_url = cdata_uri.format('account/self/file/sync') + reqOpts = hmacsig.HMACopts(call_url, args.method, params, cdata_secret) + auth_headers = hmacsig.HMACsig(reqOpts, api_key) + start_call = requests.get(call_url, headers=auth_headers) + call_response = start_call.json() # get filename and download path for each table table_manifest = call_response['files'] - for table in table_manifest: - fname = table['filename'] - file_path = join(out_dir, fname) - flat_fname = fname.split('.')[0] - print(fname) - # add extension if desired, makes no difference but you do you - flat_file = file_path.split('.')[0] + fext - dl_url = table['url'] - dl_file = requests.get(dl_url) - # is the file a full table or part of a table? - print('Partial table? 
{}'.format(table['partial'])) - if isfile(file_path): - remove(file_path) - print('Local file fragment removed: {}'.format(file_path)) - # delete any zero-length mishap files - if isfile(flat_file) and getsize(flat_file) == 0: - remove(flat_file) - # check if the flat file exists, if not, download it - # note that this will skip incomplete files more than 0 KB - if isfile(flat_file) and getsize(flat_file) > 0: - print('Skipping file: {} -- already exists.\n'.format(flat_fname)) - sync_files.append(flat_fname) - continue - # if the thousand other scenarios aren't true, let's write the file! - else: - with open(file_path, 'wb') as sync: - sync.write(dl_file.content) - sync.close() - print('Downloaded file: {}'.format(fname)) - # open gz file and dump contents into flat file block by block - with gzip.open(file_path, 'rb') as zipped, \ - open(flat_file, 'wb') as unzipped: - while True: - block = zipped.read(block_size) - if not block: - break - else: - unzipped.write(block) - unzipped.write(block) - # must explicitly close both files before further manipulation - unzipped.close() - zipped.close() - print('Unzipped file: {}\n'.format(flat_file)) - # after all is said and done, remove the GZ file - remove(file_path) - # add the downloaded file to a sync list - sync_files.append(flat_fname) + sync_files = candata.tablesync(table_manifest, out_dir, block_size) # catalog existing files -- to remove unnecessary files later (for sync) x_files = listdir(out_dir) # create a deletion manifest via list comprehension - del_manifest = [f for f in x_files + sync_files if f not in sync_files] - del_paths = [] - # add the output path to file names in deletion manifest - for file in del_manifest: - del_paths.append(join(out_dir, file)) + del_manifest = [f for f in x_files if f not in sync_files] # remove each file in the deletion manifest to complete the sync for file in del_manifest: fpath = join(out_dir, file) @@ -315,13 +192,16 @@ dt_complete = 
datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone)) print('Canvas Data synchronized.\nCompleted: {}'.format(dt_complete)) -############################################################################### -########################## CData LATEST Section ############################### -############################################################################### - # download latest dump (i.e., tables from last 24 hrs), not needed w/ sync -elif args.endpoint.lower() == 'latest': +if 'latest' in endpoints: + call_url = cdata_uri.format('account/self/file/latest') + reqOpts = hmacsig.HMACopts(call_url, args.method, params, cdata_secret) + auth_headers = hmacsig.HMACsig(reqOpts, api_key) + start_call = requests.get(call_url, headers=auth_headers) + call_response = start_call.json() table_list = call_response['artifactsByTable'] + + # TODO: match with sync to utilize candata.tablesync function for table in table_list: fname = table_list[table]['files'][0]['filename'] flat_fname = fname.split('.')[0] @@ -338,7 +218,7 @@ print('Skipping file: {} -- already exists.\n'.format(flat_fname)) continue else: - with open(file_path, 'wb') as latest: + with open(file_path, 'wb+') as latest: latest.write(dl_file.content) latest.close() print('Downloaded file: {}'.format(fname)) @@ -358,25 +238,23 @@ dt_complete = datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone)) print('Canvas Data latest dump downloaded.\nCompleted: {}'.format(dt_complete)) -############################################################################### -########################## CData INFO Section ################################# -############################################################################### +# if you need more information on daily dumps +if 'dump' in endpoints: + dump_uri = cdata_uri.format('account/self/dump{}{}') + + if args.limit is not None and args.after is not None: + call_url = paramcheck(args.limit, args.after, dump_uri) + else: + call_url = 
# NOTE: this span is a whitespace-collapsed git patch.  The Python it adds is
# restored below as runnable source, one section per file named in the patch.

# ---------------------------------------------------------------------------
# canvas_data/sync_canvas_data/python/canvas_data_sync.py  (tail of a hunk)
# The hunk starts before this span, so its diff lines are kept verbatim (with
# their +/- markers) as comments instead of being guessed at.
# ---------------------------------------------------------------------------
#   dump_uri.strip('{}')
# + reqOpts = hmacsig.HMACopts(call_url, args.method, params, cdata_secret)
# + auth_headers = hmacsig.HMACsig(reqOpts, api_key)
# + start_call = requests.get(call_url, headers=auth_headers)
# + call_response = start_call.json()
#
# - # if you need more information on daily dumps, writes a JSON file with info
# - elif args.endpoint.lower() == 'dump':
#   fname = dump_fname
#   file_path = join(out_dir, fname)
# - with open(file_path,'w') as dump_file:
# + with open(file_path,'w+') as dump_file:
#   call_json = json.dump(call_response, dump_file, indent=4)
#   dt_complete = datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone))
#   print('''Dump info file written, check output directory. \nCompleted: {}'''.format(dt_complete))
# -
# - # writes schema to a file, not terribly useful unless schema changes
# - elif args.endpoint.lower() == 'schema':
# - fname = schema_fname
# - file_path = join(out_dir, fname)
# - with open(file_path, 'w') as schema_file:
# - call_json = json.dump(call_response, schema_file, indent=4)
# - dt_complete = datetime.now().strftime('%m-%d-%Y_%H%M{}'.format(local_timezone))
# - print('Schema file written, check output directory.\n Completed: {}'.format(dt_complete))

# ---------------------------------------------------------------------------
# canvas_data/sync_canvas_data/python/canvasfuncs/candata.py  (new file)
# ---------------------------------------------------------------------------
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import argparse
import gzip
import re
import shutil
from os import listdir, remove
from os.path import getsize, isfile, join
from subprocess import call

# bytes requested per chunk while streaming a table download to disk
_DOWNLOAD_CHUNK = 1024 * 1024


def parse(argv=None):
    """Parse the command-line arguments of a Canvas Data API call.

    argv: list of argument strings to parse.  ``None`` (the default) makes
        argparse fall back to ``sys.argv[1:]``.

    Returns the ``argparse.Namespace`` holding ``endpoint``, ``limit``,
    ``after`` and ``method``.
    """
    # create argument parser to allow for command line arguments
    parser = argparse.ArgumentParser(description='''Separate Canvas Data API call
                                     components.''')

    # add arguments to parser, first is positional (must be 1st) and required
    parser.add_argument('endpoint',
                        help='''Specify the endpoint of your API call: dump,
                        sync, latest, or schema.''')
    parser.add_argument('-l', '--limit',
                        help='''Syntax is "limit=#", specifies how many records
                        to return. Only works with dump.''')
    parser.add_argument('-a', '--after',
                        help='''Syntax is "after=#", specifies to pull only data
                        after dump number provided. Only works with dump.''')
    # this one doesn't matter right now, only method available is GET
    parser.add_argument('-m', '--method', default='GET',
                        help='''Future-proofing for possible new methods for
                        Canvas Data API. Currently, only method is GET.''')
    # argv was previously accepted but ignored; hand it to argparse
    return parser.parse_args(argv)


def notify(subj, body, msg, whonotify):
    """Mail *msg* with mutt, using *body* as a scratch file.

    subj: subject line of the message.
    body: path of a temporary file; it is written, fed to mutt on stdin and
        always removed afterwards.
    msg: text of the message.
    whonotify: recipient address(es), either one whitespace-separated string
        or an iterable of addresses.
    """
    with open(body, 'w') as email:
        email.write(msg)

    if isinstance(whonotify, str):
        recipients = whonotify.split()
    else:
        recipients = list(whonotify)
    # an argument list (no shell) keeps quotes, '$' or ';' in the subject or
    # the addresses from being interpreted by a shell
    command = ['mutt', '-s', subj, '--'] + recipients

    try:
        with open(body) as email:
            try:
                call(command, stdin=email)
            except OSError as err:
                # e.g. mutt is not installed; the shell-based version only
                # printed an error and carried on, so do the same here
                print('Unable to run mutt: {}'.format(err))
    finally:
        remove(body)


def paramcheck(limit, after, endpoint):
    """Build the call URL from the optional ``limit``/``after`` parameters.

    limit: string of the form ``limit=<digits>`` or ``None``.
    after: string of the form ``after=<digits>`` or ``None``.
    endpoint: URI template with two ``{}`` placeholders, filled with ``?``
        and the query string.
        NOTE(review): the previous body read undefined module-level names
        (``args``, ``raw_params``, ``dump_uri``) and could only raise
        NameError; this version assumes ``endpoint`` carries the template
        that ``dump_uri`` stood for -- confirm against the caller.

    Values that are missing or malformed are left out.  The accepted ones
    are sorted before being joined with ``&``.  Without any accepted
    parameter the template is filled with empty strings, so the URL has no
    dangling ``?``.

    Returns the call URL as a string.
    """
    raw_params = []
    if limit is not None and re.fullmatch(r'limit=\d+', limit):
        raw_params.append(limit)
    if after is not None and re.fullmatch(r'after=\d+', after):
        raw_params.append(after)

    if not raw_params:
        return endpoint.format('', '')

    raw_params.sort()
    return endpoint.format('?', '&'.join(raw_params))


def _download(url, dest):
    """Stream the file at *url* into the local path *dest*."""
    # imported lazily: requests is third-party and only downloads need it
    import requests

    response = requests.get(url, stream=True)
    try:
        # fail loudly instead of saving an HTTP error page as a .gz file
        response.raise_for_status()
        with open(dest, 'wb') as gz_file:
            for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK):
                gz_file.write(chunk)
    finally:
        response.close()


def _gunzip(gz_path, flat_path, block_size):
    """Decompress *gz_path* into *flat_path*, *block_size* bytes at a time."""
    try:
        with gzip.open(gz_path, 'rb') as zipped, \
                open(flat_path, 'wb') as unzipped:
            # each block is written exactly once
            shutil.copyfileobj(zipped, unzipped, block_size)
    except BaseException:
        # a partial flat file bigger than 0 bytes would be skipped as
        # "already exists" on the next run, so drop it (Ctrl-C included)
        if isfile(flat_path):
            remove(flat_path)
        raise


def tablesync(table_manifest, out_dir, block_size):
    """Download and unzip every table file that is not already in *out_dir*.

    table_manifest: iterable of dicts providing the keys ``filename``,
        ``url`` and ``partial``.
    out_dir: directory receiving the files.
    block_size: number of bytes read per block while unzipping.

    For each entry the flat file name is ``filename`` up to its first ``.``.
    A leftover download of the same name is removed, a zero-length flat file
    is replaced, and a non-empty flat file is kept without downloading.

    Returns the list of flat file names that are present afterwards
    (skipped and freshly downloaded alike).
    """
    sync_files = []
    for table in table_manifest:
        fname = table['filename']
        file_path = join(out_dir, fname)
        flat_fname = fname.split('.')[0]
        # built from the bare file name so that a '.' inside out_dir
        # (e.g. './data') cannot truncate the path
        flat_file = join(out_dir, flat_fname)
        print(fname)
        # is the file a full table or part of a table?
        print('Partial table? {}'.format(table['partial']))

        if isfile(file_path):
            remove(file_path)
            print('Local file fragment removed: {}'.format(file_path))

        if isfile(flat_file):
            if getsize(flat_file) > 0:
                # decided before any download so skipped files cost nothing
                print('Skipping file: {} -- already exists.\n'.format(flat_fname))
                sync_files.append(flat_fname)
                continue
            # delete any zero-length mishap files
            remove(flat_file)

        _download(table['url'], file_path)
        print('Downloaded file: {}'.format(fname))
        _gunzip(file_path, flat_file, block_size)
        print('Unzipped file: {}\n'.format(flat_file))
        # after all is said and done, remove the GZ file
        remove(file_path)
        sync_files.append(flat_fname)
    return sync_files


# ---------------------------------------------------------------------------
# canvas_data/sync_canvas_data/python/canvasfuncs/hmacsig.py  (new file)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import base64
import hashlib
import hmac
from datetime import datetime, timezone
# urlparse is imported by name so it cannot shadow a function called "parse"
from urllib.parse import urlparse

# order in which the components are joined into the message that gets signed
_SIGNATURE_FIELDS = ('method', 'host', 'content_type', 'content_md5', 'path',
                     'parameters', 'req_timestamp', 'api_secret')


def HMACopts(call_url, method, params, cdata_secret):
    """Collect the components of the HMAC-256 signature for one API call.

    call_url: full URL of the call; its host and path are extracted.
    method: HTTP method, upper-cased for the signature.
    params: query string of the call; ``None`` is treated as no parameters.
    cdata_secret: Canvas Data API secret.

    Returns a dict with the keys ``method``, ``host``, ``content_type``,
    ``content_md5``, ``path``, ``parameters``, ``req_timestamp`` and
    ``api_secret``.
    """
    # generate UTC timestamp for HMAC-256 signature
    dt_now = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')

    # break the call into components to build HMAC-256 signature
    call_info = urlparse(call_url)

    return {
        'method': method.upper(),
        'host': call_info.netloc,
        # intentionally blank
        'content_type': '',
        # intentionally blank
        'content_md5': '',
        'path': call_info.path,
        # None would otherwise be signed as the literal text "None"
        'parameters': '' if params is None else params,
        'req_timestamp': dt_now,
        'api_secret': cdata_secret,
    }


def HMACsig(reqOpts, api_key):
    """Sign the request described by *reqOpts* and build the auth headers.

    reqOpts: dict as returned by ``HMACopts``.
    api_key: Canvas Data API key.

    The components are joined with newlines in the fixed order of
    ``_SIGNATURE_FIELDS`` (the order ``HMACopts`` inserts them), so the
    signature does not depend on how the dict was assembled.  The message is
    signed with HMAC-SHA256 keyed by ``reqOpts['api_secret']`` and base 64
    encoded.

    Returns a dict with the ``Authorization`` and ``Date`` headers.
    Raises KeyError when a signature component is missing from *reqOpts*.
    """
    # build a bytes message by joining the signature components
    message = bytes('\n'.join(str(reqOpts[field])
                              for field in _SIGNATURE_FIELDS), 'utf-8')
    # change the Canvas Data API secret to bytes
    api_secb = bytes(reqOpts['api_secret'], 'utf-8')

    # create an SHA-256 hashed HMAC object, then base 64 encode it
    signed_msg = base64.b64encode(hmac.new(api_secb, message,
                                           digestmod=hashlib.sha256).digest())
    # must be decoded to utf-8 to get a plain str instead of bytes
    signature = signed_msg.decode('utf-8')

    # build auth headers from Canvas Data API key, HMAC-256 sig, and timestamp
    return {
        'Authorization': 'HMACAuth {}:{}'.format(api_key, signature),
        'Date': '{}'.format(reqOpts['req_timestamp']),
    }