Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2430: Set flags based on limited access permissions #494

Merged
merged 19 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class APIUtils():
'muse': 4,
'met': 5,
'isac': 6,
'UofMichigan Backlist': 7,
'UofMichigan': 7,
'UofM': 7,
'UofSC': 8,
Expand Down
1 change: 1 addition & 0 deletions localstack/init-localstack-resources.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash

awslocal s3 mb s3://drb-files-local
awslocal s3 mb s3://drb-files-limited-local
awslocal s3 mb s3://ump-pdf-repository-local
9 changes: 4 additions & 5 deletions mappings/publisher_backlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,23 @@ def createMapping(self):
'title': ('Title', '{0}'),
'authors': ('Author(s)', '{0}'),
'dates': [('Pub Date', '{0}|publication_date')],
'publisher': ('Publisher (from Projects)', '{0}||'),
'publisher': ('Publisher (from Project)', '{0}||'),
'identifiers': [
('ISBN', '{0}|isbn'),
('OCLC', '{0}|oclc')
],
'rights': ('DRB Rights Classification', '{0}||||'),
'contributors': [('Contributors', '{0}|||contributor')],
'subjects': ('Subject 1', '{0}'),
'source': ('Project Name (from Projects)', '{0}'),
'source': ('Project Name (from Project)', '{0}'),
Apophenia marked this conversation as resolved.
Show resolved Hide resolved
'source_id': ('DRB_Record ID', '{0}'),
'publisher_project_source': ('Publisher (from Projects)', '{0}')
'publisher_project_source': ('Publisher (from Project)', '{0}')
}

def applyFormatting(self):
self.record.has_part = []
if self.record.source:
source_list = self.record.source[0].split(' ')
self.record.source = source_list[0]
self.record.source = self.record.source[0]

if self.record.publisher_project_source:
publisher_source = self.record.publisher_project_source[0]
Expand Down
63 changes: 43 additions & 20 deletions services/sources/publisher_backlist_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import requests
import urllib.parse
from enum import Enum
from typing import Optional
from model import Record, Work, Edition, Item
from sqlalchemy.orm import joinedload
Expand All @@ -19,16 +20,24 @@

BASE_URL = "https://api.airtable.com/v0/appBoLf4lMofecGPU/Publisher%20Backlists%20%26%20Collections%20%F0%9F%93%96?view=All%20Lists"

SOURCE_FIELD = "Project Name (from Project)"

class LimitedAccessPermissions(Enum):
    """Access-type labels that `parse_permissions` compares against.

    Each member's value is the exact label text; `parse_permissions`
    turns a label into `is_downloadable` / `requires_login` flags.
    """

    # Downloadable, no login required.
    FULL_ACCESS = 'Full access'
    # Not downloadable, no login required.
    PARTIAL_ACCESS = 'Partial access/read only/no download/no login'
    # Downloadable, login required.
    LIMITED_DOWNLOADABLE = 'Limited access/login for read & download'
    # Not downloadable, login required (also the fallback for unknown labels).
    LIMITED_WITHOUT_DOWNLOAD = 'Limited access/login for read/no download'

class PublisherBacklistService(SourceService):
def __init__(self):
self.s3_manager = S3Manager()
self.s3_manager.createS3Client()
self.title_prefix = 'titles/publisher_backlist'
self.file_bucket = os.environ['FILE_BUCKET']

self.limited_file_bucket = f'drb-files-limited-{os.environ.get("ENVIRONMENT", "qa")}'

self.drive_service = GoogleDriveService()

self.db_manager = DBManager()
self.db_manager.generateEngine()

Expand Down Expand Up @@ -149,17 +158,21 @@ def get_records(
for record in records:
try:
record_metadata = record.get('fields')

file_id = f'{self.drive_service.id_from_url(record_metadata.get("DRB_File Location"))}'
try:
file_id = f'{self.drive_service.id_from_url(record_metadata.get("DRB_File Location"))}'
except Exception:
logger.error(f'Could not extract a Drive identifier from {record_metadata.get("DRB_Record ID")}')
continue
file_name = self.drive_service.get_file_metadata(file_id).get('name')
file = self.drive_service.get_drive_file(file_id)

if not file:
logger.error(f'Failed to retrieve file for {record_metadata.get("DRB_Record ID")} from Google Drive')
continue

bucket = self.file_bucket # TODO: if record is limited access, upload to limited access bucket
s3_path = f'{self.title_prefix}/{record_metadata["Publisher (from Projects)"][0]}/{file_name}'
record_permissions = self.parse_permissions(record_metadata.get('Access type in DRB (from Access types)')[0])
bucket = self.file_bucket if not record_permissions['requires_login'] else self.limited_file_bucket
s3_path = f'{self.title_prefix}/{record_metadata[SOURCE_FIELD][0]}/{file_name}'
s3_response = self.s3_manager.putObjectInBucket(file.getvalue(), s3_path, bucket)

if not s3_response.get('ResponseMetadata').get('HTTPStatusCode') == 200:
Expand All @@ -172,7 +185,7 @@ def get_records(
publisher_backlist_record.applyMapping()

self.add_has_part_mapping(s3_url, publisher_backlist_record.record)
self.store_pdf_manifest(publisher_backlist_record.record)
self.store_pdf_manifest(publisher_backlist_record.record, requires_login=record_permissions['requires_login'])

mapped_records.append(publisher_backlist_record)
except Exception:
Expand Down Expand Up @@ -215,33 +228,32 @@ def get_publisher_backlist_records(self,

records_response = requests.get(url, headers=headers)
records_response_json = records_response.json()

publisher_backlist_records.extend(records_response_json.get('records', []))

while 'offset' in records_response_json:
next_page_url = url + f"&offset={records_response_json['offset']}"

records_response = requests.get(next_page_url, headers=headers)
records_response_json = records_response.json()

publisher_backlist_records.extend(records_response_json.get('records', []))

publisher_backlist_records.extend(records_response_json.get('records', []))
return publisher_backlist_records
def add_has_part_mapping(self, s3_url: str, record: Record):

def add_has_part_mapping(self, s3_url: str, record: Record, is_downloadable: bool=False, requires_login: bool=True):
item_no = '1'
media_tpye = 'application/pdf'
media_type = 'application/pdf'
Apophenia marked this conversation as resolved.
Show resolved Hide resolved
flags = {
'catalog': False,
'download': True,
'download': is_downloadable,
'reader': False,
'embed': False,
**({'nypl_login': True} if 'in_copyright' in record.rights else {})
'nypl_login': requires_login,
}

record.has_part.append('|'.join([item_no, s3_url, record.source, media_tpye, json.dumps(flags)]))
record.has_part.append('|'.join([item_no, s3_url, record.source, media_type, json.dumps(flags)]))

def store_pdf_manifest(self, record: Record):
def store_pdf_manifest(self, record: Record, requires_login: bool=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just confirming, but do we want to have a default value here, and does it make sense for it to be True? It seems like this function will be called with that param set in all cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll remove the default here - I wanted to consistently fall back to the least permissive option to reduce the likelihood of something being set to open-access because of malformed input data, but I agree that this is not such a case.

for link in record.has_part:
item_no, url, source, media_type, _ = link.split('|')

Expand All @@ -258,7 +270,7 @@ def store_pdf_manifest(self, record: Record):
'download': False,
'reader': True,
'embed': False,
**({'fulfill_limited_access': False} if 'in_copyright' in record.rights else {})
**({'fulfill_limited_access': False} if requires_login else {})
}

record.has_part.insert(0, '|'.join([item_no, manifest_url, source, 'application/webpub+json', json.dumps(manifest_flags)]))
Expand All @@ -283,3 +295,14 @@ def generate_manifest(record, source_url, manifest_url):
})

return manifest.toJson()

@staticmethod
def parse_permissions(permissions: str) -> dict:
    """Translate an access-type label into download/login flags.

    Args:
        permissions: The access-type label to interpret; it is compared
            for equality against the `LimitedAccessPermissions` values.

    Returns:
        A dict with two boolean keys, `is_downloadable` and
        `requires_login`.
    """
    if permissions == LimitedAccessPermissions.FULL_ACCESS.value:
        return {'is_downloadable': True, 'requires_login': False}
    if permissions == LimitedAccessPermissions.PARTIAL_ACCESS.value:
        return {'is_downloadable': False, 'requires_login': False}
    if permissions == LimitedAccessPermissions.LIMITED_DOWNLOADABLE.value:
        return {'is_downloadable': True, 'requires_login': True}

    # Anything else, including unrecognized or malformed labels, falls
    # back to the least permissive combination: no download, login required.
    return {'is_downloadable': False, 'requires_login': True}
2 changes: 1 addition & 1 deletion tests/unit/test_pub_backlist_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_applyFormatting_standard(self, test_mapping, testRecordStandard):
test_mapping.applyFormatting()

assert test_mapping.record.has_part == []
assert test_mapping.record.source == 'UofMichigan'
assert test_mapping.record.source == 'UofMichigan Backlist'
assert test_mapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
assert test_mapping.record.source_id == 'testSourceID'
assert test_mapping.record.publisher == ['testPublisher||']
Expand Down
Loading