-
Notifications
You must be signed in to change notification settings - Fork 50
/
video_id_fetcher.py
82 lines (75 loc) · 2.77 KB
/
video_id_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from collections import OrderedDict, defaultdict
from youtube_crawler import search_youtube
from queries import QUERIES_AND_NOUNS, EGG_QUERIES
from imagenet import get_noun_id
def invert_dictionary(d):
    '''
    Swaps the keys and values of a dictionary where each key is a
    string, and each value is a list of strings.

    Args:
        d: A dictionary whose values are lists.

    Returns:
        A defaultdict(list) where each key is an element that appeared in
        one of the value lists of `d`, and each value is the list of keys
        of `d` whose list contained that element (one entry per
        occurrence, in the iteration order of `d`).
    '''
    inverted = defaultdict(list)
    # `items()` is available on both Python 2 and Python 3 dictionaries,
    # whereas `iteritems()` only exists on Python 2.
    for key, elems in d.items():
        for elem in elems:
            inverted[elem].append(key)
    return inverted
# Ids of videos that either don't have eggs in them or only show tiny
# eggs in people's hands, at least when ms_between_frames is 10000.
BLACKLIST = [
    'Zp2kJ2cstmU',
    'pAWduxoCgVk',
    'PLDUqyS2AGA',
    'R4vDqlKMbrk',
    's9r-CxnCXkg',
    'ShnyBIm2GOQ',
    'wdasrVE5NOc',
    'lbzhyvH74w8',  # selective_search fails on this one
    'DKkNi7enlUk',
    'PN2gYHJNT3Y',
    'PUP7U5vTMM0',
    'zglsDdaBf4g',
    'yppgDL0Mn3g',
    '8ki3-ASg9c8',
    '2f7i-ndrx9g',
    'M74wTjym2mY',
    'u_3MYVAJTHM',
    'YjFttWU3xMw',
    'JrG332O4u_Y',
]
def get_egg_video_ids(count):
    '''
    Requests a total of `count` video ids from `search_youtube`, split
    across the queries in `EGG_QUERIES`, and drops blacklisted ids.

    Every query is asked for `count // len(EGG_QUERIES)` ids; the first
    query is additionally asked for whatever is left over, so that the
    requested amounts add up to `count`.

    Args:
        count: Total number of video ids to request (an integer).

    Returns:
        A list of the fetched video ids that are not in `BLACKLIST`.
        The filtering happens after fetching, so the list can be shorter
        than `count`. An empty list is returned when `EGG_QUERIES` is
        empty.
    '''
    if not EGG_QUERIES:
        # Nothing to search for; also avoids dividing by zero below.
        return []

    # divmod floors on both Python 2 and Python 3, so the per-query
    # amount stays an integer regardless of the interpreter version.
    videos_per_query, remainder = divmod(count, len(EGG_QUERIES))

    video_ids = []
    for query in EGG_QUERIES:
        fetch_this_many_video_ids = videos_per_query
        if remainder > 0:
            # The whole remainder goes to the first query only.
            fetch_this_many_video_ids += remainder
            remainder = 0
        # NOTE(review): assumes search_youtube(query, n) returns an
        # iterable of video id strings -- confirm in youtube_crawler.
        video_ids.extend(search_youtube(query, fetch_this_many_video_ids))

    # Build the lookup set once instead of scanning the list per id.
    blacklisted = set(BLACKLIST)
    return [video_id for video_id in video_ids
            if video_id not in blacklisted]
def get_noun_ids_and_video_ids(num_videos_per_noun):
    '''
    Requests video ids for every noun that appears in `QUERIES_AND_NOUNS`.

    `QUERIES_AND_NOUNS` is inverted so that each noun maps to the queries
    that mention it. For each noun, `num_videos_per_noun` ids are
    requested in total: every query is asked for
    `num_videos_per_noun // len(queries)` ids, and the first query is
    additionally asked for the leftover.

    Args:
        num_videos_per_noun: Total number of video ids to request for
            each noun (an integer).

    Returns:
        an OrderedDict whose keys are the noun ids (as returned by
        `get_noun_id`) in sorted order, and each value is the list of
        video ids that `search_youtube` returned for that noun's
        queries, as per the search queries in `QUERIES_AND_NOUNS`.
        Nouns that share a noun id have their video ids combined under
        that single key.
    '''
    d = defaultdict(list)
    # `items()` works on both Python 2 and Python 3 (`iteritems()` is
    # Python 2 only).
    for noun, queries in invert_dictionary(QUERIES_AND_NOUNS).items():
        # divmod floors on both Python 2 and Python 3, keeping the
        # per-query amount an integer.
        videos_per_query, remainder = divmod(num_videos_per_noun,
                                             len(queries))
        # NOTE(review): assumes get_noun_id is a pure lookup, so it is
        # resolved once per noun rather than once per query.
        noun_id = get_noun_id(noun)
        for query in queries:
            fetch_this_many_video_ids = videos_per_query
            if remainder > 0:
                # The whole remainder goes to the noun's first query only.
                fetch_this_many_video_ids += remainder
                remainder = 0
            d[noun_id].extend(search_youtube(query,
                                             fetch_this_many_video_ids))
    return OrderedDict(sorted(d.items()))
# TODO make sure there are no duplicate video_ids, and maybe have a
# (noun,video_id) blacklist if the noun isn't present in the video
# The blacklist can serve both purposes.