-
Notifications
You must be signed in to change notification settings - Fork 50
/
video_fetcher.py
60 lines (50 loc) · 1.92 KB
/
video_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from os.path import dirname, abspath, exists, splitext, basename, join
from os import system
import pafy
import re
# Absolute path of the directory containing this module; the video paths
# built by the helpers in this file are rooted here.
ROOT = dirname(abspath(__file__))
def fetch_video(url):
    '''
    Returns the filename of the downloaded mp4, or None when the video has
    no usable stream.

    The video is looked up with pafy and a stream is chosen by get_stream.
    If the converted mp4 for that stream already exists on disk it is
    returned as-is; otherwise the stream is downloaded and converted.
    '''
    video = pafy.new(url)
    stream = get_stream(video)
    if stream is None:
        # Actually report the skip; this used to be a bare string expression
        # that was evaluated and silently thrown away.
        print('Skipping ' + url)
        return None

    filename = downloaded_filename(url, video.title, stream.extension)

    # First check if we cached it
    mp4_filename = converted_mp4_filename(filename)
    if exists(mp4_filename):
        return mp4_filename

    stream.download(filename)
    return convert_video_to_mp4(filename)
def get_stream(video):
    '''
    Gets the lowest resolution m4v stream whose dimensions are both >= 256.

    Candidates are the entries of video.videostreams whose extension is
    'm4v' and whose two dimensions are each at least 256. Among those, the
    stream with the smallest pixel area is returned (the first one wins on
    ties).

    Returns None, after printing a notice, when no stream qualifies.
    '''
    # my ffmpeg build doesn't convert webm's, so I'm just using m4v for now
    candidates = [stream for stream in video.videostreams
                  if stream.dimensions[0] >= 256
                  and stream.dimensions[1] >= 256
                  and str(stream.extension) == 'm4v']
    if not candidates:
        print('Video does not have a suffciently high resolution')
        return None

    # Choose from the filtered list itself. Looking the winner up again by
    # resolution across *all* streams could hand back a non-m4v stream that
    # happens to share the same resolution.
    return min(candidates, key=lambda s: s.dimensions[0] * s.dimensions[1])
def convert_video_to_mp4(filename):
    '''
    OpenCV seems to only like mp4, so run the download through ffmpeg.

    Invokes "ffmpeg -i <filename> -vcodec copy <target>", where target is
    converted_mp4_filename(filename).

    Returns the target path. The ffmpeg exit status is not inspected, so
    the path is returned whether or not the conversion succeeded.
    '''
    # Imported here so this function carries its own dependency.
    import subprocess

    target = converted_mp4_filename(filename)
    # Pass an argument list and skip the shell: paths containing spaces,
    # '&', ';' and similar characters are handed to ffmpeg verbatim instead
    # of being split or interpreted as shell syntax.
    command = ['ffmpeg', '-i', filename, '-vcodec', 'copy', target]
    try:
        subprocess.call(command)
    except OSError as error:
        # ffmpeg could not be launched at all (e.g. it is not installed).
        # The previous shell-based call did not raise in that situation, so
        # report the problem and keep returning the target path.
        print('Could not run ffmpeg: ' + str(error))
    return target
def converted_mp4_filename(filename):
    '''
    Returns the path of the mp4 that corresponds to filename.

    Only the base name of filename is used: its extension is replaced with
    '.mp4' and the result is placed under ROOT + '/data/videos/'.
    '''
    stem, _extension = splitext(basename(filename))
    mp4_name = stem + '.mp4'
    return ROOT + '/data/videos/' + mp4_name
def downloaded_filename(url, title, extension):
    '''
    Returns the path a stream is downloaded to before conversion.

    The file name is the video id taken from url, followed by the
    sanitized title, a dot and extension; it lives under
    ROOT + '/data/videos/'.
    '''
    video_id = get_video_id(url)
    clean_title = sanitized_video_title(title)
    name = video_id + clean_title + '.' + extension
    # ROOT is this module's directory, i.e. dirname(abspath(__file__)).
    return ROOT + '/data/videos/' + name
def sanitized_video_title(title):
    '''
    Returns title with every non-alphanumeric character removed.

    Characters are kept in their original order; an empty title yields an
    empty string.
    '''
    kept = [char for char in title if char.isalnum()]
    return ''.join(kept)
def get_video_id(url):
    '''
    Returns the value of the "v" query parameter of a YouTube watch url.

    Only the parameter's own value is returned: anything after it, such as
    further "&name=value" parameters or a "#fragment", is left out so the
    id can be safely reused in file names and urls.

    Raises ValueError when url carries no "v=" parameter.
    '''
    # Anchor on the start of the string or a '?' / '&' separator so that a
    # parameter merely ending in "v" (e.g. "rev=1") is not mistaken for it,
    # and stop at the next '&' or '#'.
    match = re.search(r'(?:^|[?&])v=([^&#]*)', url)
    if match is None:
        raise ValueError('No video id found in url: ' + url)
    return match.group(1)
def video_url(video_id):
    '''
    Returns the YouTube watch url for video_id.
    '''
    parts = ('https://www.youtube.com/watch?v=', video_id)
    return ''.join(parts)