diff --git a/PhotoScraper/RPI_SIS_PhotoScraper.py b/PhotoScraper/RPI_SIS_PhotoScraper.py index c442e16..2760137 100644 --- a/PhotoScraper/RPI_SIS_PhotoScraper.py +++ b/PhotoScraper/RPI_SIS_PhotoScraper.py @@ -1,34 +1,53 @@ +import sys import json import time import imghdr import argparse -import getpass, requests, os, re +import getpass +import requests +import os from pathlib import Path from selenium.webdriver.support.ui import Select from selenium.common.exceptions import NoSuchElementException from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.keys import Keys from datetime import datetime ################################################################## # a few optional command line argument variables -parser = argparse.ArgumentParser(description='RPI SIS Photo and Registration Scraper') -parser.add_argument('--credentials_file', type=str, default="", - help='a file containing the user RIN and PIN') -parser.add_argument('--term_file', type=str, default="", - help='a file containing the term') -parser.add_argument('--crn_file', type=str, default="", - help='a file containing the crns of desired courses') -parser.add_argument('--headless', default=False, action="store_true", - help='run program without visual display') -parser.add_argument('--no_photos', default=False, action="store_true", - help='dont save photos') -parser.add_argument('--run_forever', default=False, action="store_true", - help='keep clicking to avoid auto-timeout') +parser = argparse.ArgumentParser(description="RPI SIS Photo and Registration Scraper") +parser.add_argument( + "--credentials_file", + type=str, + default="", + help="a file containing the user RIN and PIN", +) +parser.add_argument( + "--term_file", type=str, default="", help="a file containing the term" +) +parser.add_argument( + "--crn_file", + type=str, + default="", + help="a file containing the crns of desired courses", +) +parser.add_argument( + "--headless", + default=False, + action="store_true", + help="run program without visual display", +) +parser.add_argument( + "--no_photos", default=False, action="store_true", help="dont save photos" +) +parser.add_argument( + "--run_forever", + default=False, + action="store_true", + help="keep clicking to avoid auto-timeout", +) args = parser.parse_args() @@ -37,6 +56,7 @@ # Workaround for if pyopenssl is installed and we want weak keys try: from urllib3.contrib import pyopenssl + pyopenssl.extract_from_urllib3() except ImportError: pass @@ -46,79 +66,51 @@ # Login to SIS def login(): chrome_options = Options() - # read credentials from (optional) file - if len(args.credentials_file)>0 and os.path.isfile(args.credentials_file): - with open(str(args.credentials_file),'r') as f: - rin_id = f.readline().strip() - pin_id = f.readline().strip() - else: - rin_id = input("RIN: ") - pin_id = getpass.getpass("PIN: ") # By default we launch the display and allow visual debugging if args.headless: chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") - # Just setting the default ciphers (for this session) to be weak DES/SHA for SIS compatibility - # Be careful about navigating to any other sites... - requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DES-CBC3-SHA:AES128-SHA:'+requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS + # read credentials from (optional) file + if len(args.credentials_file) > 0 and os.path.isfile(args.credentials_file): + with open(str(args.credentials_file), "r") as f: + rcsid = f.readline().strip() + passwd = f.readline().strip() + else: + rcsid = input("RCS ID: ") + passwd = getpass.getpass("Password: ") + + print("Setting up selenium", file=sys.stderr) driver = webdriver.Chrome(options=chrome_options) + driver.implicitly_wait(10) # open SIS - driver.get('https://sis.rpi.edu/') + driver.get("https://sis.rpi.edu") - count = 0 - while True: - count += 1 - # Types in username & password in login page - try: - username_box = driver.find_element_by_id('username') - break - except: - if count > 10: - print ("ERROR: couldn't find username box") - exit(0) - # slight delay to allow page to load - print ("wait a little longer for the page to load") - time.sleep(1) - username_box.send_keys(rin_id) try: - password_box = driver.find_element_by_id('password') - except: - print ("ERROR: couldn't find password box") - exit(0) - - password_box.send_keys(pin_id) - - time.sleep(2) - - try: - # click login button - login_button = driver.find_element_by_name("submit") - except: - print ("ERROR: couldn't find submit button") - exit(0) - - print ("now we can click login button") - login_button.click() - - time.sleep(2) - - while True: - - time.sleep(3) - - if "Rensselaer Self-Service Information System" in driver.page_source: - print("success -- made it past duo page") - break - else: - print("please complete duo authentication") - - print ("Continuing with processing...") - success = True - - return driver, success + print("Logging in", file=sys.stderr) + driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]').click() + driver.find_element(By.ID, "username").send_keys(rcsid) + driver.find_element(By.ID, "password").send_keys(passwd) + driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]').click() + + if len(driver.find_elements(By.CSS_SELECTOR, "p.output--error")) > 0: + print("Incorrect password", file=sys.stderr) + return driver, False + + print("2FA: enter code on your phone", file=sys.stderr) + driver.implicitly_wait(25) + driver.find_element(By.ID, "dont-trust-browser-button").click() + driver.implicitly_wait(10) + driver.find_element( + By.CSS_SELECTOR, 'a[href="/rss/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"]' + ) + + return driver, True + except NoSuchElementException: + print("Failed to login", file=sys.stderr) + return driver, False ################################################################## @@ -126,18 +118,18 @@ def login(): def selectTerm(driver): # read term from (optional) file term = "" - if len(str(args.term_file))>0 and os.path.isfile(str(args.term_file)): - with open(str(args.term_file),'r') as f: + if len(str(args.term_file)) > 0 and os.path.isfile(str(args.term_file)): + with open(str(args.term_file), "r") as f: term = f.readline().strip() # click Instructors & Advisors Menu - driver.find_element_by_link_text('Instructor & Advisor Menu').click() + driver.find_element(By.LINK_TEXT, "Instructor & Advisor Menu").click() # click Select a Semester or Summer Session - driver.find_element_by_link_text('Select a Semester or Summer Session').click() + driver.find_element(By.LINK_TEXT, "Select a Semester or Summer Session").click() # grab the list of available terms - select_term = Select(driver.find_element_by_name('term')) + select_term = Select(driver.find_element(By.NAME, "term")) options_term = select_term.options # loop (querying live user) until we have a valid term @@ -147,7 +139,7 @@ def selectTerm(driver): if term == "": print("Here are the available Sessions/Terms:") for index in range(len(options_term)): - print("[{0}] {1}".format(index,options_term[index].text)) + print("[{0}] {1}".format(index, options_term[index].text)) term = input("Select a term ( or Exit to terminate ): ") # if user does not wish to continue, exit if term.lower() == "exit": @@ -165,7 +157,7 @@ def selectTerm(driver): term = "" # click submit button - driver.find_element_by_xpath("//input[@value='Submit']").click() + driver.find_element(By.XPATH, "//input[@value='Submit']").click() return term, True @@ -176,17 +168,17 @@ def saveImagesToFolder(term, class_list): if len(class_list) == 0: return - course_crn = class_list[0]['course_crn'] - course_prefix = class_list[0]['course_prefix'] - course_name = class_list[0]['course_name'] - course_section = class_list[0]['course_section'] - course_number = class_list[0]['course_number'] + course_crn = class_list[0]["course_crn"] + course_prefix = class_list[0]["course_prefix"] + course_name = class_list[0]["course_name"] + course_section = class_list[0]["course_section"] + course_number = class_list[0]["course_number"] - course_folder_name = "{}-{}-{}".format(course_prefix,course_number,course_section) + course_folder_name = "{}-{}-{}".format(course_prefix, course_number, course_section) # make term (month year) into month-year term_elements = term.split() - folder_term = term_elements[0]+"-"+term_elements[1] + folder_term = term_elements[0] + "-" + term_elements[1] # get path and create path if not already existed path = Path(folder_term, course_folder_name) @@ -197,30 +189,30 @@ def saveImagesToFolder(term, class_list): # loops through the class list of dictionaries of student info for i in range(len(class_list)): obj = {} - obj['full_name'] = class_list[i]['name'] - obj['first_name'] = class_list[i]['first_name'] - obj['middle_name'] = class_list[i]['middle_name'] - obj['last_name'] = class_list[i]['last_name'] - obj['degrees'] = class_list[i]['degrees'] + obj["full_name"] = class_list[i]["name"] + obj["first_name"] = class_list[i]["first_name"] + obj["middle_name"] = class_list[i]["middle_name"] + obj["last_name"] = class_list[i]["last_name"] + obj["degrees"] = class_list[i]["degrees"] - obj['rin'] = class_list[i]['rin'] + obj["rin"] = class_list[i]["rin"] if "rcs" not in class_list[i]: - obj['rcs'] = "" + obj["rcs"] = "" print(f"Warning: no RCS for {class_list[i]['name']}") else: - obj['rcs'] = class_list[i]['rcs'] + obj["rcs"] = class_list[i]["rcs"] if "email" not in class_list[i]: - obj['email'] = "" + obj["email"] = "" print(f"Warning: no email for {class_list[i]['name']}") else: - obj['email'] = class_list[i]['email'] + obj["email"] = class_list[i]["email"] - obj['course_crn'] = class_list[i]['course_crn'] - obj['course_prefix'] = class_list[i]['course_prefix'] - obj['course_name'] = class_list[i]['course_name'] - obj['course_section'] = class_list[i]['course_section'] - obj['course_number'] = class_list[i]['course_number'] - obj['term'] = class_list[i]['term'] + obj["course_crn"] = class_list[i]["course_crn"] + obj["course_prefix"] = class_list[i]["course_prefix"] + obj["course_name"] = class_list[i]["course_name"] + obj["course_section"] = class_list[i]["course_section"] + obj["course_number"] = class_list[i]["course_number"] + obj["term"] = class_list[i]["term"] jsonfile.append(obj) for k in class_list[i].keys(): @@ -231,10 +223,10 @@ def saveImagesToFolder(term, class_list): first_name = name_str[0] last_name = name_str[1] rcs_id = "error-{}-{}".format(first_name, last_name) - print ("NAMESTR="+name_str) - print ("first_name="+first_name) - print ("last_name="+last_name) - print ("rcs_id"+rcs_id) + print("NAMESTR=" + name_str) + print("first_name=" + first_name) + print("last_name=" + last_name) + print("rcs_id" + rcs_id) # if there is an email address, assign letters before "@rpi.edu" to rcs_id if k == "email": rcs_id = class_list[i].get(k)[:-8] @@ -249,18 +241,18 @@ def saveImagesToFolder(term, class_list): continue r = requests.get(img_url) - #Deduce the extension, build the output path - img_format = imghdr.what(None,r.content).lower() + # Deduce the extension, build the output path + img_format = imghdr.what(None, r.content).lower() img_name = rcs_id + "." + img_format filepath = path / img_name - #Actually write the file. We could skip the context manager and just use Image.save(filepath) - with open(str(filepath),'wb') as f: + # Actually write the file. We could skip the context manager and just use Image.save(filepath) + with open(str(filepath), "wb") as f: f.write(r.content) print("Saved photo for student rcs {}".format(rcs_id)) - term_string = term.replace(' ','_') - filename = "all_json/"+term_string+"_"+course_crn+".json" + term_string = term.replace(" ", "_") + filename = "all_json/" + term_string + "_" + course_crn + ".json" with open(filename, "w") as f: json.dump(jsonfile, f, indent=4, sort_keys=True) @@ -273,49 +265,54 @@ def getStudentInfoFromCourse(driver, term): class_list = [] # click Summary Class List & Electronic Warning System (EWS) - driver.find_element_by_link_text('Summary Class List & Electronic Warning System (EWS)').click() + driver.find_element( + By.LINK_TEXT, "Summary Class List & Electronic Warning System (EWS)" + ).click() try: - current_record = driver.find_element_by_partial_link_text('Current Record Set') - #print ("'Current Record Set' label found") + current_record = driver.find_element(By.PARTIAL_LINK_TEXT, "Current Record Set") + # print ("'Current Record Set' label found") try: - first = driver.find_element_by_link_text('Current Record Set: 1 - 200') - print ("1-200 found") - getStudentInfoFromCourseHelper(driver,term, class_list) - print ("1-200 finished") + first = driver.find_element( + By.PARTIAL_LINK_TEXT, "Current Record Set: 1 - 200" + ) + print("1-200 found") + getStudentInfoFromCourseHelper(driver, term, class_list) + print("1-200 finished") try: - second = driver.find_element_by_partial_link_text('201 -') - print ("201-?? found") + second = driver.find_element(By.PARTIAL_LINK_TEXT, "201 -") + print("201-?? found") second.click() - getStudentInfoFromCourseHelper(driver,term, class_list) + getStudentInfoFromCourseHelper(driver, term, class_list) driver.back() - print ("201-?? finished") + print("201-?? finished") except: - print ("ERROR IN CURRENT RECORD COUNTING -- SECOND") + print("ERROR IN CURRENT RECORD COUNTING -- SECOND") return 0 except: - print ("ERROR IN CURRENT RECORD COUNTING -- FIRST") + print("ERROR IN CURRENT RECORD COUNTING -- FIRST") return 0 except: - #print ("'Current Record Set' label not found") - getStudentInfoFromCourseHelper(driver,term, class_list) + # print ("'Current Record Set' label not found") + getStudentInfoFromCourseHelper(driver, term, class_list) driver.back() driver.back() if class_list == 0: - print ("Warning: this class size is 0") + print("Warning: this class size is 0") else: # Use the info collected and save the image with rcs id for term/course in current directory saveImagesToFolder(term, class_list) + ################################################################## -def addMajor(majors,degree,text): +def addMajor(majors, degree, text): majors.append(degree + " / " + text) return majors -def addConcentrationToLastMajor(majors,text): +def addConcentrationToLastMajor(majors, text): majors[-1] = majors[-1] + " / " + text return majors @@ -325,44 +322,58 @@ def addConcentrationToLastMajor(majors,text): def getStudentInfoFromCourseHelper(driver, term, class_list): # check if class is size 0 - if len(driver.find_elements_by_class_name('errortext')) == 1: + if len(driver.find_elements(By.CLASS_NAME, "errortext")) == 1: print("Error: Class size is 0!") return 0 - COURSENAMESTRING = driver.find_elements_by_class_name('datadisplaytable')[0].find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')[0].find_elements_by_tag_name('th')[0].text + COURSENAMESTRING = ( + driver.find_elements(By.CLASS_NAME, "datadisplaytable")[0] + .find_element(By.TAG_NAME, "tbody") + .find_elements(By.TAG_NAME, "tr")[0] + .find_elements(By.TAG_NAME, "th")[0] + .text + ) - COURSENAMESTRING_split = COURSENAMESTRING.split(' - ') + COURSENAMESTRING_split = COURSENAMESTRING.split(" - ") if len(COURSENAMESTRING_split) < 2: - print ("ERROR: course name formatting bug") + print("ERROR: course name formatting bug") return 0 COURSENAME = "" - for index in range(len(COURSENAMESTRING_split)-1): + for index in range(len(COURSENAMESTRING_split) - 1): if index > 0: COURSENAME = COURSENAME + " - " COURSENAME = COURSENAME + COURSENAMESTRING_split[index] - print ("COURSE NAME IS "+COURSENAME) + print("COURSE NAME IS " + COURSENAME) if len(COURSENAMESTRING_split[-1]) != 12: - print ("ERROR: course prefix / code bug") + print("ERROR: course prefix / code bug") return 0 COURSEPREFIX = COURSENAMESTRING_split[-1][0:4] COURSENUMBER = COURSENAMESTRING_split[-1][5:9] COURSESECTION = COURSENAMESTRING_split[-1][10:] - CRNSTRING = driver.find_elements_by_class_name('datadisplaytable')[0].find_element_by_tag_name('tbody').find_elements_by_tag_name('tr') + CRNSTRING = ( + driver.find_elements(By.CLASS_NAME, "datadisplaytable")[0] + .find_element(By.TAG_NAME, "tbody") + .find_elements(By.TAG_NAME, "tr") + ) CRNSTRING = str(CRNSTRING[1].text) if CRNSTRING[0:5] == "CRN: ": CRNSTRING = CRNSTRING[5:] else: - print ("ERROR: could not find CRN") + print("ERROR: could not find CRN") return 0 # find link for pic - student_list = driver.find_elements_by_class_name('datadisplaytable')[2].find_element_by_tag_name('tbody').find_elements_by_tag_name('tr') + student_list = ( + driver.find_elements(By.CLASS_NAME, "datadisplaytable")[2] + .find_element(By.TAG_NAME, "tbody") + .find_elements(By.TAG_NAME, "tr") + ) # find which column is the "Student Name" column, since it isn't always the same column number - student_headers = student_list[0].find_elements_by_tag_name('th') + student_headers = student_list[0].find_elements(By.TAG_NAME, "th") stu_col = -1 id_col = -1 for i in range(len(student_headers)): @@ -372,29 +383,33 @@ def getStudentInfoFromCourseHelper(driver, term, class_list): id_col = i if stu_col < 0: - print("Error: Could not find a column labeled \"Student Name\"!") + print('Error: Could not find a column labeled "Student Name"!') return 0 if id_col < 0: - print("Error: Could not find a column labeled \"ID\"!") + print('Error: Could not find a column labeled "ID"!') return 0 # NOTE: uncomment this line to help with debugging - #print("Student column: " + str(stu_col)) + # print("Student column: " + str(stu_col)) # loop through list of students to get image, name, and email # all info collected from for loop (img url, name, email) put into dict for s in range(1, len(student_list)): student_record = {} - student = driver.find_elements_by_class_name('datadisplaytable')[2].find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')[s] + student = ( + driver.find_elements(By.CLASS_NAME, "datadisplaytable")[2] + .find_element(By.TAG_NAME, "tbody") + .find_elements(By.TAG_NAME, "tr")[s] + ) # NOTE: uncomment these line to help with debugging - #print('Row Number: ' + str(s)) - #print('Row Length: ' + str(len(student.find_elements_by_tag_name('td')))) - #print('Cell Value: ' + student.find_elements_by_tag_name('td')[stu_col].text) + # print('Row Number: ' + str(s)) + # print('Row Length: ' + str(len(student.find_elements(By.TAG_NAME, 'td')))) + # print('Cell Value: ' + student.find_elements(By.TAG_NAME, 'td')[stu_col].text) - full_name_cell = student.find_elements_by_tag_name('td')[stu_col].text - full_name_cell_split = full_name_cell.split(', ') + full_name_cell = student.find_elements(By.TAG_NAME, "td")[stu_col].text + full_name_cell_split = full_name_cell.split(", ") # format of the full name appears to be one of: # Smith, John X. (with middle initial) # Smith, John (with no middle name/initial) @@ -404,14 +419,20 @@ def getStudentInfoFromCourseHelper(driver, term, class_list): first_name = full_name_cell_split[1] middle_name = "" first_name_length = len(first_name) - if first_name_length > 3 and first_name[first_name_length-3] == ' ' and first_name[first_name_length-1] == '.': - middle_name = first_name[first_name_length-2:] - first_name = first_name[0:first_name_length-3] + if ( + first_name_length > 3 + and first_name[first_name_length - 3] == " " + and first_name[first_name_length - 1] == "." + ): + middle_name = first_name[first_name_length - 2 :] + first_name = first_name[0 : first_name_length - 3] - id_rin = student.find_elements_by_tag_name('td')[id_col].text + id_rin = student.find_elements(By.TAG_NAME, "td")[id_col].text try: - student.find_elements_by_tag_name('td')[stu_col].find_element_by_class_name('fieldmediumtext').click() + student.find_elements(By.TAG_NAME, "td")[stu_col].find_element( + By.CLASS_NAME, "fieldmediumtext" + ).click() except: input() raise @@ -420,48 +441,58 @@ def getStudentInfoFromCourseHelper(driver, term, class_list): driver.get(img_url) # image, initalize to empty string - student_record['img url'] = "" - image_arr = driver.find_elements_by_tag_name('img') + student_record["img url"] = "" + image_arr = driver.find_elements(By.TAG_NAME, "img") - #do search through all tags for first non-header-layout tag - #have to skip 2 more tags because they are transparent images + # do search through all tags for first non-header-layout tag + # have to skip 2 more tags because they are transparent images for i in range(len(image_arr)): - if image_arr[i].get_attribute('NAME') != "web_tab_corner_right": - student_record['img url'] = image_arr[i+2].get_attribute('src') - #Uncomment this line to print the image URLs we are attempting, useful for debugging - #print("found non-match, +2 is " + student_record['img url']) + if image_arr[i].get_attribute("NAME") != "web_tab_corner_right": + student_record["img url"] = image_arr[i + 2].get_attribute("src") + # Uncomment this line to print the image URLs we are attempting, useful for debugging + # print("found non-match, +2 is " + student_record['img url']) break # name - info_name = driver.find_elements_by_class_name('plaintable')[4].find_element_by_tag_name('tbody').find_element_by_tag_name('tr').find_elements_by_tag_name('td')[1].text + info_name = ( + driver.find_elements(By.CLASS_NAME, "plaintable")[4] + .find_element(By.TAG_NAME, "tbody") + .find_element(By.TAG_NAME, "tr") + .find_elements(By.TAG_NAME, "td")[1] + .text + ) name = info_name[16:] - student_record['name'] = name - student_record['rin'] = id_rin + student_record["name"] = name + student_record["rin"] = id_rin - student_record['first_name'] = first_name - student_record['middle_name'] = middle_name - student_record['last_name'] = last_name + student_record["first_name"] = first_name + student_record["middle_name"] = middle_name + student_record["last_name"] = last_name - student_record['course_name'] = COURSENAME - student_record['course_number'] = COURSENUMBER - student_record['course_prefix'] = COURSEPREFIX - student_record['course_crn'] = CRNSTRING - student_record['course_section'] = COURSESECTION + student_record["course_name"] = COURSENAME + student_record["course_number"] = COURSENUMBER + student_record["course_prefix"] = COURSEPREFIX + student_record["course_crn"] = CRNSTRING + student_record["course_section"] = COURSESECTION - student_record['term'] = term + student_record["term"] = term - print("Gathering info for student: "+name) + print("Gathering info for student: " + name) # email address - driver.find_element_by_link_text('Student E-mail Address').click() - if len(driver.find_elements_by_class_name('datadisplaytable')) == 1: - emails = driver.find_element_by_class_name('datadisplaytable').find_element_by_tag_name('tbody').find_elements_by_tag_name('tr') + driver.find_element(By.LINK_TEXT, "Student E-mail Address").click() + if len(driver.find_elements(By.CLASS_NAME, "datadisplaytable")) == 1: + emails = ( + driver.find_element(By.CLASS_NAME, "datadisplaytable") + .find_element(By.TAG_NAME, "tbody") + .find_elements(By.TAG_NAME, "tr") + ) for i in range(len(emails)): if emails[i].text == "Campus Student Email Address": - email = emails[i+1].find_element_by_tag_name('td').text - student_record['email'] = email - student_record['rcs'] = email[0:len(email)-8] + email = emails[i + 1].find_element(By.TAG_NAME, "td").text + student_record["email"] = email + student_record["rcs"] = email[0 : len(email) - 8] break driver.back() @@ -469,28 +500,30 @@ def getStudentInfoFromCourseHelper(driver, term, class_list): majors = [] # undergraduate major - driver.find_element_by_link_text('Student Information').click() - if len(driver.find_elements_by_class_name('datadisplaytable')) >= 1: + driver.find_element(By.LINK_TEXT, "Student Information").click() + if len(driver.find_elements(By.CLASS_NAME, "datadisplaytable")) >= 1: - for table in driver.find_elements_by_class_name('datadisplaytable'): - stuff = table.find_element_by_tag_name('tbody').find_elements_by_tag_name('tr') + for table in driver.find_elements(By.CLASS_NAME, "datadisplaytable"): + stuff = table.find_element(By.TAG_NAME, "tbody").find_elements( + By.TAG_NAME, "tr" + ) for i in range(len(stuff)): if stuff[i].text == "Current Program": - degree = stuff[i+1].text + degree = stuff[i + 1].text if stuff[i].text[0:7] == "Major: ": - majors = addMajor(majors,degree,stuff[i].text[7:]) + majors = addMajor(majors, degree, stuff[i].text[7:]) if stuff[i].text[0:22] == "Major and Department: ": - majors = addMajor(majors,degree,stuff[i].text[22:]) + majors = addMajor(majors, degree, stuff[i].text[22:]) if stuff[i].text[0:21] == "Major Concentration: ": - majors = addConcentrationToLastMajor(majors,stuff[i].text[21:]) + majors = addConcentrationToLastMajor(majors, stuff[i].text[21:]) driver.back() - student_record['degrees'] = [] + student_record["degrees"] = [] for m in majors: - #print ("MAJOR"+m) - student_record['degrees'].append(m) + # print ("MAJOR"+m) + student_record["degrees"].append(m) class_list.append(student_record) driver.back() @@ -498,17 +531,17 @@ def getStudentInfoFromCourseHelper(driver, term, class_list): ################################################################## # Gets the info regarding each course of student images with their rcs id -def wasteTimeClicking(driver,seconds): +def wasteTimeClicking(driver, seconds): counter = 0 while counter < seconds: print("wasting time") # click Instructors & Advisors Menu - driver.find_element_by_link_text('Instructor & Advisor Menu').click() + driver.find_element(By.LINK_TEXT, "Instructor & Advisor Menu").click() time.sleep(5) # click Select a Semester or Summer Session - driver.find_element_by_link_text('Select a Semester or Summer Session').click() + driver.find_element(By.LINK_TEXT, "Select a Semester or Summer Session").click() time.sleep(55) counter += 60 @@ -516,11 +549,11 @@ def wasteTimeClicking(driver,seconds): ################################################################## # Gets the info regarding each course of student images with their rcs id -def loopOverCourses(driver,term): +def loopOverCourses(driver, term): # read crns from (optional) file crns = [] - if len(str(args.crn_file))>0 and os.path.isfile(str(args.crn_file)): - with open(str(args.crn_file),'r') as f: + if len(str(args.crn_file)) > 0 and os.path.isfile(str(args.crn_file)): + with open(str(args.crn_file), "r") as f: while True: crn = f.readline().strip() if len(crn) != 5: @@ -528,38 +561,40 @@ def loopOverCourses(driver,term): crns.append(crn) # make a directory to hold the registration directories - os.makedirs('all_json',exist_ok=True) + os.makedirs("all_json", exist_ok=True) # click Course Information- Select a CRN - driver.find_element_by_link_text('Course Information- Select a CRN').click() + driver.find_element(By.LINK_TEXT, "Course Information- Select a CRN").click() # if there was at least one crn in the file if crns: - driver.find_element_by_link_text('Enter Section Identifier (CRN) Directly').click() + driver.find_element( + By.LINK_TEXT, "Enter Section Identifier (CRN) Directly" + ).click() for crn in crns: - print ("Begin processing CRN "+crn) - crn_box = driver.find_element_by_name('CRN') + print("Begin processing CRN " + crn) + crn_box = driver.find_element(By.NAME, "CRN") crn_box.clear() crn_box.send_keys(crn) crn_box.send_keys(Keys.TAB) crn_box.send_keys(Keys.RETURN) getStudentInfoFromCourse(driver, term) - print ("Finished processing CRN "+crn) + print("Finished processing CRN " + crn) return # otherwise, query the user for which crns to scrape else: # check if there are any sections assigned for this term - if len(driver.find_elements_by_class_name('warningtext')) == 1: - print ("Error: No sections assigned for this term!") + if len(driver.find_elements(By.CLASS_NAME, "warningtext")) == 1: + print("Error: No sections assigned for this term!") return # iterate and ask if user wants images/names from this course - select_course = Select(driver.find_element_by_name('crn')) + select_course = Select(driver.find_element(By.NAME, "crn")) options_course = select_course.options for index in range(len(options_course)): - select_course = Select(driver.find_element_by_name('crn')) + select_course = Select(driver.find_element(By.NAME, "crn")) options_course = select_course.options course = options_course[index].text @@ -573,47 +608,66 @@ def loopOverCourses(driver,term): elif answer == "exit": return elif answer == "y": - print ("Getting student pictures... (this could take a few seconds per student)") + print( + "Getting student pictures... (this could take a few seconds per student)" + ) select_course.select_by_index(index) - driver.find_element_by_xpath("//input[@value='Submit']").click() + driver.find_element(By.XPATH, "//input[@value='Submit']").click() getStudentInfoFromCourse(driver, term) break else: print("Invalid answer! Try again!") -################################################################## -if __name__ == "__main__": - try: - driver, success = login() - - # if login is valid with correct User ID or PIN, continue the program by collecting data - if success: - - while True: - - # Get the term to use to save images - term, success = selectTerm(driver) +# Assumes SIS main page is open +def get_csci_crns(driver): + driver.find_element(By.LINK_TEXT, "SITE MAP").click() + driver.find_element(By.LINK_TEXT, "Class Search").click() - if success: - loopOverCourses(driver,term) + driver.find_element( + By.XPATH, "/html/body/div[3]/form/table/tbody/tr/td/select/option[2]" + ).click() + driver.find_element(By.CSS_SELECTOR, 'input[type="submit"][value="Submit"]').click() + driver.find_element(By.CSS_SELECTOR, 'option[value="CSCI"]').click() + driver.find_element( + By.CSS_SELECTOR, 'input[type="submit"][value="Section Search"]' + ).click() + return [ + crn_link.text + for crn_link in driver.find_elements( + By.CSS_SELECTOR, 'a[href^="/rss/bwckschd.p_disp_listcrse"]' + ) + ] - sttime = datetime.now().strftime('%Y%m%d %H:%M:%S') - with open("last_completed_run.txt", 'a') as logfile: - logfile.write(sttime + ' completed scrape\n') - if not args.run_forever: - print ("--------------------\nlets NOT run forever\n--------------------") - break - - # wait a number of hours before doing it all again - num_hours = 1 - wasteTimeClicking(driver,60*60*num_hours) - print ("----------------\nLETS RUN FOREVER\n----------------") +################################################################## +if __name__ == "__main__": + driver, success = login() - finally: - # ends the program - try: - driver.quit() - except: - pass #If we got an exception in login(), driver will not exist in this scope + # if login is valid with correct User ID or PIN, continue the program by collecting data + if not success: + driver.quit() + else: + crns = get_csci_crns(driver) + print(crns) + # while True: + # # Get the term to use to save images + # term, success = selectTerm(driver) + + # if success: + # loopOverCourses(driver, term) + + # sttime = datetime.now().strftime("%Y%m%d %H:%M:%S") + # with open("last_completed_run.txt", "a") as logfile: + # logfile.write(sttime + " completed scrape\n") + + # if not args.run_forever: + # print( + # "--------------------\nlets NOT run forever\n--------------------" + # ) + # break + + # # wait a number of hours before doing it all again + # num_hours = 1 + # wasteTimeClicking(driver, 60 * 60 * num_hours) + # print("----------------\nLETS RUN FOREVER\n----------------")