# get_transcript.py
# Downloads YouTube video transcripts (single video or whole playlist) and
# saves them as text files under transcripts/<year>/<project name>/.
import os
import sys
import json
import base64
from pathlib import Path
import requests
import google.auth
import google.auth.transport.requests
import google.oauth2.credentials
import googleapiclient.discovery
import inquirer
import pdb
from googleapiclient.discovery import build
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
# --- Configuration -----------------------------------------------------------
# API key for the YouTube Data API, read from the environment (the .env file
# is loaded by the load_dotenv() call among the imports).
YT_DATA_API_KEY = os.getenv('YT_DATA_API_KEY')
# Lowercase alias kept so that any code still reading this name keeps working.
# A key literal used to be hard-coded here; secrets must not live in source
# control, and that committed key should be treated as compromised and revoked.
yt_data_api_key = YT_DATA_API_KEY

# YouTube Data API v3 endpoints used by the helpers below.
yt_data_url = 'https://www.googleapis.com/youtube/v3/videos'
yt_playlists_url = 'https://www.googleapis.com/youtube/v3/playlists'
target_video_url = 'https://www.youtube.com/watch?v=I3sinNeqwZU'
YT_DATA_API_BASE_URL = 'https://www.googleapis.com/youtube/v3'
YT_PLAYLIST_ITEMS_URL = f'{YT_DATA_API_BASE_URL}/playlistItems'

# --- Service-account credentials ---------------------------------------------
# The service-account JSON is expected base64-encoded in GOOGLE_CREDENTIALS_BASE64.
google_credentials_base64 = os.getenv('GOOGLE_CREDENTIALS_BASE64')
if not google_credentials_base64:
    # Fail with an actionable message instead of the opaque TypeError that
    # base64.b64decode(None) would raise.
    raise RuntimeError(
        'GOOGLE_CREDENTIALS_BASE64 is not set; it must contain the '
        'base64-encoded service account JSON.'
    )
google_credentials_info = json.loads(base64.b64decode(google_credentials_base64))

# Imported explicitly: the file-level imports only pull in
# google.oauth2.credentials, so the service_account submodule would otherwise
# be available only if another library happened to import it.
import google.oauth2.service_account

# Build credentials from the service account info
credentials = google.oauth2.service_account.Credentials.from_service_account_info(
    google_credentials_info,
    scopes=['https://www.googleapis.com/auth/youtube.force-ssl'],
)

# Set up the YouTube Data API client
youtube = googleapiclient.discovery.build('youtube', 'v3', credentials=credentials)
def video_url_to_id(video_url):
    """Extract the YouTube video ID from a video URL.

    Handles ``watch?v=<id>`` URLs (with or without further query parameters
    or a ``#fragment``) as well as ``youtu.be/<id>`` short links.

    Args:
        video_url: URL string of a YouTube video.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If no video ID can be found in ``video_url``.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url)

    # Standard form: the ID is the value of the `v` query parameter. Parsing
    # the query properly keeps other parameters and fragments out of the ID.
    v_values = parse_qs(parsed.query).get('v')
    if v_values:
        return v_values[0]

    # Short-link form: https://youtu.be/<id>
    if parsed.hostname in ('youtu.be', 'www.youtu.be'):
        short_id = parsed.path.strip('/').split('/')[0]
        if short_id:
            return short_id

    # Fallback for inputs that are not well-formed URLs but still contain
    # `v=<id>`; mirrors the simple string splitting used previously.
    if 'v=' in video_url:
        return video_url.split('v=')[1].split('&')[0]

    raise ValueError(f'Could not find a video ID in URL: {video_url!r}')
def video_url_to_playlist_id(video_url):
    """Return the playlist ID embedded in a URL, or ``None`` if there is none.

    The ID is whatever follows the last ``list=`` in the URL, up to (but not
    including) the next ``&``.
    """
    _, marker, remainder = video_url.rpartition('list=')
    if not marker:
        # 'list=' does not occur anywhere in the URL.
        return None
    return remainder.partition('&')[0]
def get_transcript(video_id):
    """Fetch the English transcript of a video as one string.

    Args:
        video_id: ID of the video whose transcript is requested.

    Returns:
        A ``(success, payload)`` tuple. On success ``payload`` is the text of
        every transcript segment, each followed by a single space. On any
        exception ``success`` is ``False`` and ``payload`` is an error message
        that includes the video ID and the exception text.
    """
    print(f'Getting transcript for video_id: {video_id}')
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        # Each segment contributes its text plus a trailing space.
        return True, ''.join(segment['text'] + ' ' for segment in segments)
    except Exception as error:
        return False, f'\nUnable to get transcript for video_id: {video_id}\n{error}'
def save_transcript(video_id, playlist_id, transcript):
    """Write a transcript to ``transcripts/<year>/<project>/<video_id>_transcript.txt``.

    The year folder comes from the playlist title and the project folder from
    the video title (the part after ``': '``).

    Args:
        video_id: ID of the video the transcript belongs to.
        playlist_id: ID of the playlist whose title supplies the year folder.
        transcript: Transcript text. If falsy, the transcript is fetched with
            ``get_transcript(video_id)``.

    Returns:
        ``True`` if the file was written; ``False`` if the file already exists
        or the transcript could not be fetched.
    """
    # Fetch the video metadata once and reuse it.
    video_info = get_video_info(video_id)
    project_name = video_info_to_project_name(video_info)
    capstone_year = video_playlist_to_presentation_year(get_video_playlist(playlist_id))

    subfolder_path = Path('transcripts') / capstone_year / project_name
    file_path = subfolder_path / f'{video_id}_transcript.txt'

    # Check for an existing file first so no transcript is fetched needlessly.
    if file_path.exists():
        print(f'File already exists at {file_path} for video_id: {video_id}. Skipping...')
        return False

    if not transcript:
        success, transcript = get_transcript(video_id)
        if not success:
            return False

    # Create the folders only once there is something to write, so a failed
    # fetch does not leave empty directories behind.
    subfolder_path.mkdir(parents=True, exist_ok=True)
    with file_path.open('w', encoding='utf-8') as f:
        print(f'Writing transcript to {file_path}')
        f.write(transcript)
    return True
def get_video_playlist(video_playlist_id):
    """Request a playlist's ``snippet`` data from the playlists endpoint.

    Args:
        video_playlist_id: ID of the playlist to look up.

    Returns:
        The decoded JSON body of the response.
    """
    query = dict(key=YT_DATA_API_KEY, part='snippet', id=video_playlist_id)
    return requests.get(yt_playlists_url, params=query).json()
def video_playlist_to_presentation_year(video_playlist):
    """Return the first space-delimited word of the playlist title.

    The title of the first item in ``video_playlist['items']`` is expected to
    begin with the presentation year, so that leading word is returned as the
    year string.
    """
    first_item = video_playlist['items'][0]
    playlist_title = first_item['snippet']['title']
    leading_word, _, _ = playlist_title.partition(' ')
    return leading_word
def get_video_info(video_id):
    """Request a video's ``snippet`` data from the videos endpoint.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The decoded JSON body of the response.
    """
    api_response = requests.get(
        yt_data_url,
        params={'key': YT_DATA_API_KEY, 'part': 'snippet', 'id': video_id},
    )
    return api_response.json()
def video_info_to_project_name(video_info):
    """Return the project name portion of a video's title.

    Titles are expected to look like ``'<prefix>: <Project Name>'``. Everything
    after the first ``': '`` is returned, so project names that themselves
    contain ``': '`` are preserved in full.

    Args:
        video_info: Video response dict; the title is read from
            ``video_info['items'][0]['snippet']['title']``.

    Returns:
        The text after the first ``': '`` in the title, or the whole title
        when it contains no ``': '`` separator.
    """
    title = video_info['items'][0]['snippet']['title']
    _, separator, project_name = title.partition(': ')
    # Without a separator there is no prefix to strip; use the title as is
    # rather than failing with an IndexError.
    return project_name if separator else title
def video_info_to_title(video_info):
    """Return the video title reduced to letters, digits and underscores.

    Every character that is not alphabetic, a digit, or a space is dropped,
    trailing whitespace is stripped, and each remaining space becomes an
    underscore.
    """
    raw_title = video_info['items'][0]['snippet']['title']

    def is_allowed(char):
        # Keep letters, digits and plain spaces; drop everything else.
        return char == ' ' or char.isalpha() or char.isdigit()

    cleaned = ''.join(filter(is_allowed, raw_title)).rstrip()
    # Splitting on a single space and re-joining turns every space into '_'.
    return '_'.join(cleaned.split(' '))
def select_video_or_playlist():
    """Prompt the user to pick between 'Video' and 'Playlist'.

    Returns:
        The selected choice, read from the ``'video_or_playlist'`` answer.
    """
    question = inquirer.List(
        'video_or_playlist',
        message='Select video or playlist',
        choices=['Video', 'Playlist'],
    )
    return inquirer.prompt([question])['video_or_playlist']
def download_transcript(video_id, playlist_id):
    """Fetch a video's transcript and save it, exiting the program on failure.

    Args:
        video_id: ID of the video to fetch the transcript for.
        playlist_id: ID of the playlist the video belongs to; passed through
            to ``save_transcript``.

    Raises:
        SystemExit: If the transcript cannot be fetched. The error message
            returned by ``get_transcript`` is used as the exit message.
    """
    success, transcript = get_transcript(video_id)
    if not success:
        # On failure `transcript` holds the error message. sys.exit is used
        # rather than the builtin exit(), which is a helper added by the
        # `site` module for interactive sessions and is not always available.
        sys.exit(transcript)
    save_transcript(video_id, playlist_id, transcript)
def playlist_id_to_video_ids(playlist_id):
    """Return the IDs of all videos in a playlist.

    The playlistItems endpoint is paged, so this follows ``nextPageToken``
    until every page has been read instead of stopping after the first one.

    Args:
        playlist_id: ID of the playlist to list.

    Returns:
        A list of video ID strings in the order the API returns them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    params = {
        'key': YT_DATA_API_KEY,
        'part': 'snippet',
        'playlistId': playlist_id,
        'maxResults': 50,
    }
    video_ids = []
    while True:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(YT_PLAYLIST_ITEMS_URL, params=params, timeout=30)
        # Surface API errors directly rather than as a KeyError on 'items'.
        response.raise_for_status()
        page = response.json()
        video_ids.extend(
            item['snippet']['resourceId']['videoId'] for item in page['items']
        )
        next_page_token = page.get('nextPageToken')
        if not next_page_token:
            return video_ids
        params['pageToken'] = next_page_token
def app():
    """Run the interactive transcript downloader.

    Asks whether to process a single video or a whole playlist, reads the URL
    from standard input, and downloads the transcript(s). A playlist ID is
    required in both modes because the playlist title supplies the year folder
    used when saving.

    Raises:
        SystemExit: If the URL contains no playlist ID, if an unknown option
            is selected, or if a transcript download fails.
    """
    answer = select_video_or_playlist()
    if answer == 'Video':
        video_url = input('Enter video url: ')
        video_id = video_url_to_id(video_url)
        playlist_id = video_url_to_playlist_id(video_url)
        if playlist_id is None:
            # Without a playlist the capstone year cannot be determined, so
            # stop with a clear message instead of failing inside an API call.
            sys.exit(
                'No playlist ID (list=...) found in the video URL; it is '
                'needed to determine the capstone year. Exiting...'
            )
        video_playlist = get_video_playlist(playlist_id)
        print(f'video_playlist: {video_playlist}')
        print(f'Capstone year: {video_playlist_to_presentation_year(video_playlist)}')
        download_transcript(video_id, playlist_id)
    elif answer == 'Playlist':
        playlist_url = input('Enter playlist URL: ')
        playlist_id = video_url_to_playlist_id(playlist_url)
        if playlist_id is None:
            sys.exit('No playlist ID (list=...) found in the playlist URL. Exiting...')
        video_ids = playlist_id_to_video_ids(playlist_id)
        print(f'video_ids: {video_ids}')
        for video_id in video_ids:
            download_transcript(video_id, playlist_id)
    else:
        print('Invalid option selected. Exiting...')
        sys.exit()


if __name__ == '__main__':
    app()