Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: YouTube API to download English transcripts #1139

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
21 changes: 0 additions & 21 deletions cms/djangoapps/contentstore/tests/test_transcripts_utils.py
Original file line number Diff line number Diff line change
@@ -534,27 +534,6 @@ def test_youtube_empty_text(self, mock_get):
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)

def test_youtube_good_result(self):
response = textwrap.dedent("""<?xml version="1.0" encoding="utf-8" ?>
<transcript>
<text start="0" dur="0.27"></text>
<text start="0.27" dur="2.45">Test text 1.</text>
<text start="2.72">Test text 2.</text>
<text start="5.43" dur="1.73">Test text 3.</text>
</transcript>
""")
expected_transcripts = {
'start': [270, 2720, 5430],
'end': [2720, 2720, 7160],
'text': ['Test text 1.', 'Test text 2.', 'Test text 3.']
}
youtube_id = 'good_youtube_id'
with patch('xmodule.video_module.transcripts_utils.requests.get') as mock_get:
mock_get.return_value = Mock(status_code=200, text=response, content=response.encode('utf-8'))
transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
self.assertEqual(transcripts, expected_transcripts)
mock_get.assert_called_with('http://video.google.com/timedtext', params={'lang': 'en', 'v': 'good_youtube_id'})


class TestTranscript(unittest.TestCase):
"""
65 changes: 32 additions & 33 deletions cms/djangoapps/contentstore/views/tests/test_transcripts.py
Original file line number Diff line number Diff line change
@@ -34,13 +34,9 @@
TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex

SRT_TRANSCRIPT_CONTENT = u"""0
00:00:10,500 --> 00:00:13,000
Elephant's Dream

1
00:00:15,000 --> 00:00:18,000
At the left we can see...
SRT_TRANSCRIPT_CONTENT = """0
00:00:00,260 --> 00:00:00,260
Hello, welcome to Open edX.

"""

@@ -166,6 +162,14 @@ def setUp(self):
self.bad_data_srt_file = self.create_transcript_file(content=self.contents['bad'], suffix='.srt')
self.bad_name_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.bad')
self.bom_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt', include_bom=True)
self.good_transcript_data = {
'transcript_srt':
'0\n00:00:00,260 --> 00:00:00,260\nHello, welcome to Open edX.'
}
self.bad_transcript_data = {
'srt':
'0\n00:00:00,260 --> 00:00:00,260\nHello, welcome to Open edX.'
}

# Setup a VEDA produced video and persist `edx_video_id` in VAL.
create_video({
@@ -206,7 +210,7 @@ def clean_temporary_transcripts(self):
self.bad_name_srt_file.close()
self.bom_srt_file.close()

def upload_transcript(self, locator, transcript_file, edx_video_id=None):
def upload_transcript(self, locator, transcript_data, edx_video_id=None):
"""
Uploads a transcript for a video
"""
@@ -217,8 +221,8 @@ def upload_transcript(self, locator, transcript_file, edx_video_id=None):
if edx_video_id is not None:
payload.update({'edx_video_id': edx_video_id})

if transcript_file:
payload.update({'transcript-file': transcript_file})
if transcript_data:
payload.update({'transcript-file': transcript_data})

upload_url = reverse('upload_transcripts')
response = self.client.post(upload_url, payload)
@@ -246,8 +250,8 @@ def test_transcript_upload_success(self, edx_video_id, include_bom):
modulestore().update_item(self.item, self.user.id)

# Upload a transcript
transcript_file = self.bom_srt_file if include_bom else self.good_srt_file
response = self.upload_transcript(self.video_usage_key, transcript_file, '')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(self.video_usage_key, transcript_data, '')

# Verify the response
self.assert_response(response, expected_status_code=200, expected_message='Success')
@@ -272,7 +276,8 @@ def test_transcript_upload_without_locator(self):
"""
Test that transcript upload validation fails if the video locator is missing
"""
response = self.upload_transcript(locator=None, transcript_file=self.good_srt_file, edx_video_id='')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(locator=None, transcript_data=transcript_data, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
@@ -283,7 +288,7 @@ def test_transcript_upload_without_file(self):
"""
Test that transcript upload validation fails if transcript file is missing
"""
response = self.upload_transcript(locator=self.video_usage_key, transcript_file=None, edx_video_id='')
response = self.upload_transcript(locator=self.video_usage_key, transcript_data=None, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
@@ -296,13 +301,13 @@ def test_transcript_upload_bad_format(self):
"""
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.bad_name_srt_file,
transcript_data=self.bad_transcript_data,
edx_video_id=''
)
self.assert_response(
response,
expected_status_code=400,
expected_message=u'This transcript file type is not supported.'
expected_message=u'There is a problem with this transcript file. Try to upload a different file.'
)

def test_transcript_upload_bad_content(self):
@@ -312,7 +317,7 @@ def test_transcript_upload_bad_content(self):
# Request to upload transcript for the video
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.bad_data_srt_file,
transcript_data=self.bad_transcript_data,
edx_video_id=''
)
self.assert_response(
@@ -328,7 +333,8 @@ def test_transcript_upload_unknown_category(self):
# non_video module setup - i.e. an item whose category is not 'video'.
usage_key = self.create_non_video_module()
# Request to upload transcript for the item
response = self.upload_transcript(locator=usage_key, transcript_file=self.good_srt_file, edx_video_id='')
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(locator=usage_key, transcript_data=transcript_data, edx_video_id='')
self.assert_response(
response,
expected_status_code=400,
@@ -340,9 +346,10 @@ def test_transcript_upload_non_existent_item(self):
Test that transcript upload validation fails in case of invalid item's locator.
"""
# Request to upload transcript for the item
transcript_data = self.good_transcript_data["transcript_srt"]
response = self.upload_transcript(
locator='non_existent_locator',
transcript_file=self.good_srt_file,
transcript_data=transcript_data,
edx_video_id=''
)
self.assert_response(
@@ -351,32 +358,24 @@ def test_transcript_upload_non_existent_item(self):
expected_message=u'Cannot find item by locator.'
)

def test_transcript_upload_without_edx_video_id(self):
"""
Test that transcript upload validation fails if the `edx_video_id` is missing
"""
response = self.upload_transcript(locator=self.video_usage_key, transcript_file=self.good_srt_file)
self.assert_response(
response,
expected_status_code=400,
expected_message=u'Video ID is required.'
)

def test_transcript_upload_with_non_existant_edx_video_id(self):
"""
Test that transcript upload works as expected if `edx_video_id` set on
video descriptor is different from `edx_video_id` received in POST request.
"""
non_existant_edx_video_id = '1111-2222-3333-4444'

transcript_data = self.good_transcript_data["transcript_srt"]
# Upload with non-existent `edx_video_id`
response = self.upload_transcript(
locator=self.video_usage_key,
transcript_file=self.good_srt_file,
transcript_data=transcript_data,
edx_video_id=non_existant_edx_video_id
)
# Verify the response
self.assert_response(response, expected_status_code=400, expected_message='Invalid Video ID')
self.assert_response(
response, expected_status_code=400,
expected_message="edx_video_id doesn't exist."
)

# Verify transcript does not exist for non-existent `edx_video_id`
self.assertIsNone(get_video_transcript_content(non_existant_edx_video_id, language_code=u'en'))
244 changes: 158 additions & 86 deletions cms/djangoapps/contentstore/views/transcripts_ajax.py

Large diffs are not rendered by default.

512 changes: 284 additions & 228 deletions cms/static/js/views/video/transcripts/message_manager.js

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -7,6 +7,9 @@
<%- gettext("Error.") %>
</p>
<div class="wrapper-transcripts-buttons">
<button class="action setting-download-youtube-transcript" type="button" name="setting-download-youtube-transcript" value="<%= gettext("Download YouTube Transcript") %>" data-tooltip="<%= gettext("Download YouTube Transcript") %>">
<%= gettext("Download YouTube Transcript") %>
</button>
<button class="action setting-upload" type="button" name="setting-upload" value="<%- gettext("Upload New Transcript") %>" data-tooltip="<%- gettext("Upload New Transcript") %>">
<%- gettext("Upload New Transcript") %>
</button>
9 changes: 0 additions & 9 deletions common/lib/xmodule/xmodule/video_module/transcripts_utils.py
Original file line number Diff line number Diff line change
@@ -191,15 +191,6 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
for element in xmltree:
if element.tag == "text":
start = float(element.get("start"))
duration = float(element.get("dur", 0)) # dur is not mandatory
text = element.text
end = start + duration

if text:
# Start and end should be ints representing the millisecond timestamp.
sub_starts.append(int(start * 1000))
sub_ends.append(int((end + 0.0001) * 1000))
sub_texts.append(text.replace('\n', ' '))

return {'start': sub_starts, 'end': sub_ends, 'text': sub_texts}