Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: YouTube API to download English transcripts #1139

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
21 changes: 0 additions & 21 deletions cms/djangoapps/contentstore/tests/test_transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,27 +534,6 @@ def test_youtube_empty_text(self, mock_get):
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)

def test_youtube_good_result(self):
response = textwrap.dedent("""<?xml version="1.0" encoding="utf-8" ?>
<transcript>
<text start="0" dur="0.27"></text>
<text start="0.27" dur="2.45">Test text 1.</text>
<text start="2.72">Test text 2.</text>
<text start="5.43" dur="1.73">Test text 3.</text>
</transcript>
""")
expected_transcripts = {
'start': [270, 2720, 5430],
'end': [2720, 2720, 7160],
'text': ['Test text 1.', 'Test text 2.', 'Test text 3.']
}
youtube_id = 'good_youtube_id'
with patch('xmodule.video_module.transcripts_utils.requests.get') as mock_get:
mock_get.return_value = Mock(status_code=200, text=response, content=response.encode('utf-8'))
transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
self.assertEqual(transcripts, expected_transcripts)
mock_get.assert_called_with('http://video.google.com/timedtext', params={'lang': 'en', 'v': 'good_youtube_id'})


class TestTranscript(unittest.TestCase):
"""
Expand Down
65 changes: 32 additions & 33 deletions cms/djangoapps/contentstore/views/tests/test_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,9 @@
TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex

SRT_TRANSCRIPT_CONTENT = u"""0
00:00:10,500 --> 00:00:13,000
Elephant's Dream

1
00:00:15,000 --> 00:00:18,000
At the left we can see...
SRT_TRANSCRIPT_CONTENT = """0
00:00:00,260 --> 00:00:00,260
Hello, welcome to Open edX.

"""

Expand Down Expand Up @@ -166,6 +162,14 @@ def setUp(self):
self.bad_data_srt_file = self.create_transcript_file(content=self.contents['bad'], suffix='.srt')
self.bad_name_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.bad')
self.bom_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt', include_bom=True)
self.good_transcript_data = {
'transcript_srt':
'0\n00:00:00,260 --> 00:00:00,260\nHello, welcome to Open edX.'
}
self.bad_transcript_data = {
'srt':
'0\n00:00:00,260 --> 00:00:00,260\nHello, welcome to Open edX.'
}

# Setup a VEDA produced video and persist `edx_video_id` in VAL.
create_video({
Expand Down Expand Up @@ -206,7 +210,7 @@ def clean_temporary_transcripts(self):
self.bad_name_srt_file.close()
self.bom_srt_file.close()

def upload_transcript(self, locator, transcript_file, edx_video_id=None):
def upload_transcript(self, locator, transcript_data, edx_video_id=None):
"""
Uploads a transcript for a video
"""
Expand All @@ -217,8 +221,8 @@ def upload_transcript(self, locator, transcript_file, edx_video_id=None):
if edx_video_id is not None:
payload.update({'edx_video_id': edx_video_id})

if transcript_file:
payload.update({'transcript-file': transcript_file})
if transcript_data:
payload.update({'transcript-file': transcript_data})

upload_url = reverse('upload_transcripts')
response = self.client.post(upload_url, payload)
Expand Down Expand Up @@ -246,8 +250,8 @@ def test_transcript_upload_success(self, edx_video_id, include_bom):
modulestore().update_item(self.item, self.user.id)

# Upload a transcript
transcript_file = self.bom_srt_file if include_bom else self.good_srt_file
response = self.upload_transcript(self.video_usage_key, transcript_file, '')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(self.video_usage_key, transcript_data, '')

# Verify the response
self.assert_response(response, expected_status_code=200, expected_message='Success')
Expand All @@ -272,7 +276,8 @@ def test_transcript_upload_without_locator(self):
"""
Test that transcript upload validation fails if the video locator is missing
"""
response = self.upload_transcript(locator=None, transcript_file=self.good_srt_file, edx_video_id='')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(locator=None, transcript_data=transcript_data, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
Expand All @@ -283,7 +288,7 @@ def test_transcript_upload_without_file(self):
"""
Test that transcript upload validation fails if transcript file is missing
"""
response = self.upload_transcript(locator=self.video_usage_key, transcript_file=None, edx_video_id='')
response = self.upload_transcript(locator=self.video_usage_key, transcript_data=None, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
Expand All @@ -296,13 +301,13 @@ def test_transcript_upload_bad_format(self):
"""
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.bad_name_srt_file,
transcript_data=self.bad_transcript_data,
edx_video_id=''
)
self.assert_response(
response,
expected_status_code=400,
expected_message=u'This transcript file type is not supported.'
expected_message=u'There is a problem with this transcript file. Try to upload a different file.'
)

def test_transcript_upload_bad_content(self):
Expand All @@ -312,7 +317,7 @@ def test_transcript_upload_bad_content(self):
# Request to upload transcript for the video
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.bad_data_srt_file,
transcript_data=self.bad_transcript_data,
edx_video_id=''
)
self.assert_response(
Expand All @@ -328,7 +333,8 @@ def test_transcript_upload_unknown_category(self):
# non_video module setup - i.e. an item whose category is not 'video'.
usage_key = self.create_non_video_module()
# Request to upload transcript for the item
response = self.upload_transcript(locator=usage_key, transcript_file=self.good_srt_file, edx_video_id='')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(locator=usage_key, transcript_data=transcript_data, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
Expand All @@ -340,9 +346,10 @@ def test_transcript_upload_non_existent_item(self):
Test that transcript upload validation fails in case of invalid item's locator.
"""
# Request to upload transcript for the item
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(
locator='non_existent_locator',
transcript_file=self.good_srt_file,
transcript_data=transcript_data,
edx_video_id=''
)
self.assert_response(
Expand All @@ -351,32 +358,24 @@ def test_transcript_upload_non_existent_item(self):
expected_message=u'Cannot find item by locator.'
)

def test_transcript_upload_without_edx_video_id(self):
"""
Test that transcript upload validation fails if the `edx_video_id` is missing
"""
response = self.upload_transcript(locator=self.video_usage_key, transcript_file=self.good_srt_file)
self.assert_response(
response,
expected_status_code=400,
expected_message=u'Video ID is required.'
)

def test_transcript_upload_with_non_existant_edx_video_id(self):
"""
Test that transcript upload works as expected if `edx_video_id` set on
video descriptor is different from `edx_video_id` received in POST request.
"""
non_existant_edx_video_id = '1111-2222-3333-4444'

transcript_data = self.good_transcript_data["transcript_srt"]
# Upload with non-existent `edx_video_id`
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.good_srt_file,
transcript_data=transcript_data,
edx_video_id=non_existant_edx_video_id
)
# Verify the response
self.assert_response(response, expected_status_code=400, expected_message='Invalid Video ID')
self.assert_response(
response, expected_status_code=400,
expected_message="edx_video_id doesn't exist."
)

# Verify transcript does not exist for non-existent `edx_video_id`
self.assertIsNone(get_video_transcript_content(non_existant_edx_video_id, language_code=u'en'))
Expand Down
Loading