# -*- coding: utf-8 -*-
"""
Created on Wed Jun 7 21:49:22 2017
@author: Shabaka
"""
# Exercise based on https://glowingpython.blogspot.in/2014/09/
# text-summarization-with-nl
# nltk - Natural Language Toolkit for natural language processing
# we use 2 functions from nltk:
# sent_tokenize - given a block of text, tokenize it into sentences
# word_tokenize - given a block of text, tokenize it into words
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import urllib.request
from bs4 import BeautifulSoup
#####################################################################
# we use defaultdict - if we try to get an item with a key that does
# not exist, it creates a default item and adds the key-value pair to
# the dictionary. The default item comes from the function passed to
# the defaultdict constructor, which creates a default object
#####################################################################
# instantiate default dict
from collections import defaultdict
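# Quick illustration of defaultdict(int) (assumption - a throwaway
# example, not part of the summarizer): a missing key yields 0 instead
# of raising KeyError, so counting needs no existence check.
_demo_counts = defaultdict(int)
_demo_counts['word'] += 1  # no KeyError; _demo_counts['word'] is now 1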
######################################################################
# check for punctuation
from string import punctuation
#######################################################################
# another requirement is a function that will return the n largest
# elements in a given list, using built-in Python functionality
######################################################################
from heapq import nlargest
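# Quick illustration of nlargest (assumption - a throwaway example,
# not part of the summarizer):
_demo_top2 = nlargest(2, [5, 1, 9, 3])  # -> [9, 5]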
########################################################
# Now we create the class FrequencySummarizer
# This class captures all the behaviours we need:
# - eliminating stopwords
# - finding the frequency of words in the text
# - assigning an importance score to words in the text
# - ranking sentences in the text based on those frequencies
nltk.download('stopwords')
nltk.download('punkt')
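# Quick illustration of the two tokenizers (assumption - a throwaway
# example, not part of the summarizer):
_demo_sents = sent_tokenize("NLTK is handy. It splits text.")
# -> ['NLTK is handy.', 'It splits text.']
_demo_words = word_tokenize(_demo_sents[0])
# -> ['NLTK', 'is', 'handy', '.']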
#########################################################################
class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        # indentation changes - we are inside the constructor
        # here we set up the behaviour
        # this is called each time an object of the FrequencySummarizer
        # class is created (instantiated)
        self._min_cut = min_cut  # 'self' refers to the instance itself
        self._max_cut = max_cut
        # we save the values of the two parameters passed in by assigning
        # them to two member variables - the 'self.' prefix marks them as
        # part of the instance - using an underscore as the first character
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # this is a set of all common words and punctuation symbols

    # indentation changes - we are out of the constructor here.
    # This is still the body of the class.
    # A variable defined here (outside a member function) but within the
    # class becomes STATIC: it belongs to the class, not to any specific
    # individual instance (object) of the class.
    def _compute_frequencies(self, word_sent):
        # computes the frequency of words in the text
        # being a member function, it takes the self argument and a list
        # of tokenized sentences from a piece of text
        # returns a dictionary where the keys are words and the values
        # are the frequencies of those words in the set of sentences
        freq = defaultdict(int)  # dictionary with extended functionality
        # we use a for loop to count the instances/freq of
        # words in our sentences and add them to our defaultdict
        for sentence in word_sent:
            # change in indentation - the following is in the for loop
            for word in sentence:  # fixed: iterate over this sentence's
                # words, not over the whole list of sentences
                if word not in self._stopwords:
                    freq[word] += 1
        # the two loops above look at every word in every sentence and
        # keep a running count (freq) of each non-stop word.
        # Frequency counting done. We now do two things to our dictionary:
        # 1. Normalize the frequencies by dividing each by the highest freq
        # 2. Filter out frequencies that are too high or too low
        # (1) makes the frequencies comparable - all between 0 and 1
        # (2) catches almost all stop words (normally very high frequency)
        max_freq = float(max(freq.values()))
        # this gives the maximum frequency
        for word in list(freq.keys()):  # list() lets us delete while looping
            # indent change - we are in the for loop
            freq[word] = freq[word] / max_freq
            # 1. this divides the word freq by the max freq
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                # inside the conditional
                del freq[word]
                # we use del to remove key-value pairs from a dict
        return freq
        # the member function is complete - returns the frequency dictionary
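    # Worked example of the normalization (assumption - illustrative
    # numbers only): if 'economy' occurs 4 times and the most frequent
    # word occurs 8 times, 'economy' scores 4/8 = 0.5, which survives
    # the default cuts (0.1 < 0.5 < 0.9).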
    def summarize(self, text, n):
        # we now define the next member function
        # this member function takes in self (as any other member func does)
        # it takes a raw text input (the article of interest)
        # n is the number of sentences we want returned
        sents = sent_tokenize(text)  # splits text into sentences
        assert n <= len(sents)
        # the assert is a way of making sure a condition holds true
        # else we get an exception - useful for sanity checks
        # here we assert that the summary is no longer than the whole article
        word_sent = [word_tokenize(s.lower()) for s in sents]
        # the line of code above first converts every sentence to lowercase
        # and then splits each sentence into words,
        # giving a list of word lists, one per sentence
        self._freq = self._compute_frequencies(word_sent)
        # make a call to the method (member func) _compute_frequencies,
        # passing in the list of tokenized sentences, and get
        # back a dictionary with all the frequencies
        ranking = defaultdict(int)
        # this creates an empty dictionary (the defaultdict variety)
        # this holds the ranking of the sentences in the text
        for i, sent in enumerate(word_sent):
            # indent - inside the for loop
            # we use a for loop and the built-in function enumerate.
            # If we have a list ['a', 'b', 'c'], enumerate outputs the
            # tuples (0, 'a'), (1, 'b'), (2, 'c')
            # enumerate eliminates the need for a counter variable
            # to keep track of which index of the list we are currently on
            # this requires that we have 2 loop variables
            for word in sent:
                # indent into second for loop
                if word in self._freq:  # fixed: 'in' (membership), not 'is'
                    ranking[i] += self._freq[word]
        # the above does the following:
        # for each word in each sentence,
        # we compute a rank for that sentence as the sum
        # of the frequencies of the words in that sentence
        # this of course excludes stop words, as specified earlier
        sents_idx = nlargest(n, ranking, key=ranking.get)
        # Here we want the n sentences with the highest ranking;
        # the 'nlargest' function does the selection. To tell it how to
        # rank the sentences, we pass in the ranking.get method as the key
        return [sents[j] for j in sents_idx]
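# Usage sketch (assumption - illustrative call; the real one is in the
# TEST section at the bottom of this file):
# fs = FrequencySummarizer()
# top_two = fs.summarize(article_text, 2)  # list of the 2 top sentences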
####################################################################
# Now we fetch an article from a web URL and summarize it, using
# urllib.request and BeautifulSoup
#####################################################################
#############################################################
# we define a function that takes the url of an article and
# returns the raw text of the article
def get_only_text_washingtonpost_url(url):
    # this func takes the URL as an argument and returns only
    # the raw text of the article at that URL.
    # it works specifically for Washington Post articles,
    # because we know the structure of those pages
    page = urllib.request.urlopen(url).read().decode('utf8')
    # we download the URL (urllib.request is the Python 3 module)
    soup = BeautifulSoup(page, 'html.parser')
    # initialize a BeautifulSoup object with the downloaded page,
    # naming a parser explicitly to avoid a warning
    text = ' '.join(map(str, soup.find_all('article')))
    # the above keeps everything between a pair of HTML tags that look
    # a certain way, e.g. <article> stuff </article>; str(p) preserves
    # the inner HTML so the paragraph pass below still finds <p> tags
    # this format is specific to the Washington Post
    soup2 = BeautifulSoup(text, 'html.parser')
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text
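# Quick illustration of the find_all + join idiom above (assumption -
# a throwaway example, not a real page):
_demo_soup = BeautifulSoup('<p>Hello</p><p>world</p>', 'html.parser')
_demo_text = ' '.join(p.text for p in _demo_soup.find_all('p'))
# -> 'Hello world'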
#######################################################################
# TEST
######################################################################
someUrl = 'https://www.washingtonpost.com/politics/?utm_term=.9897b28da9af'
textOfUrl = get_only_text_washingtonpost_url(someUrl)
# textOfUrl is a (title, text) tuple, so textOfUrl[1] is the article body
fs = FrequencySummarizer()
# we instantiate the FrequencySummarizer class and get an object of this class
summary = fs.summarize(textOfUrl[1], 1)
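# Show the result (assumption - print added so the script produces
# visible output when run)
print(summary)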