Skip to content

Commit

Permalink
remove emoji before run googletrans
Browse files Browse the repository at this point in the history
  • Loading branch information
avalanchesiqi committed Aug 10, 2018
1 parent f24e676 commit 46d9119
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
2 changes: 1 addition & 1 deletion data/video_ids.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
nXJQZSKJh4M
Trz_RKPGL9U
X_MJGbZ8Jwg
BMkWnhCc59Q
15 changes: 15 additions & 0 deletions youtube_insight/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,18 @@ def _parse_xml(xml_string):
json_return['totalSubscriber'] = total_subscriber

return json_return

# == == == == == == == == method to remove emoji in string == == == == == == == == #
@staticmethod
def _remove_emoji(text):
    """Remove emoji and pictographic symbols from text.

    Only symbol/emoji code-point blocks are stripped. Letters of every
    script (CJK, kana, Hangul, Arabic, fullwidth Latin, ...) are kept so
    the result can still be used for language detection.

    :param text: unicode string to clean
    :return: copy of ``text`` with every run of emoji characters removed
    """
    # NOTE: an earlier version used the single range U+24C2-U+1F251, which
    # also swallowed CJK ideographs, kana, Hangul and other letters lying
    # between those two code points. The ranges below are limited to
    # symbol blocks only.
    emoji_pattern = re.compile(
        u"["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        u"\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
        u"\U0001F100-\U0001F2FF"  # enclosed alphanumerics/ideographs, incl. flags (iOS)
        u"\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
        u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        u"\U000024C2"             # circled latin capital letter M
        u"\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji
        u"]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
10 changes: 8 additions & 2 deletions youtube_insight/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,20 @@ def crawl_metadata(self, video_id):
# use googletrans if defaultLanguage not available
if 'defaultLanguage' not in res_json['snippet']:
try:
# remove emoji as googletrans cannot handle emoji
res_json['snippet']['detectLanguage'] = self.translator.detect(
res_json['snippet']['title'] + res_json['snippet']['description']).lang
self._remove_emoji(
res_json['snippet']['title'] + res_json['snippet']['description'])).lang
except Exception:
# Google translator throws an exception after many detections, reset the translator
time.sleep(2 * random.random())
self.update_translator()
res_json['snippet']['detectLanguage'] = self.translator.detect(
res_json['snippet']['title'] + res_json['snippet']['description']).lang
self._remove_emoji(
res_json['snippet']['title'] + res_json['snippet']['description'])).lang
# remove duplicate relevant topic ids
if 'topicDetails' in res_json and 'relevantTopicIds' in res_json['topicDetails']:
res_json['topicDetails']['relevantTopicIds'] = list(set(res_json['topicDetails']['relevantTopicIds']))
return res_json
except Exception as e:
logging.error('--- Exception in metadata crawler:', str(e))
Expand Down

0 comments on commit 46d9119

Please sign in to comment.