#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Code from https://github.com/gojomo/gensim/blob/f5b4e30dbc3ea71520b61e9d586cc946e2388ae0/gensim/test/test_doc2vec.py#L282
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import logging
from collections import namedtuple, defaultdict
from six.moves import zip as izip
import cPickle
import sys
import os
import numpy as np
import pandas as pd
class SentimentPhrase(object):
def __init__(self, words, tags, split, sentiment, sentence_id):
self.words = words
self.tags = tags
self.split = split
self.sentiment = sentiment
self.sentence_id = sentence_id
def __str__(self):
return '%s %s %s %s %s' % (self.words, self.tags, self.split, self.sentiment, self.sentence_id)
def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True):
"""
Read and return documents from the Stanford Sentiment Treebank
    corpus (Rotten Tomatoes reviews), from http://nlp.stanford.edu/sentiment/
Initialize the corpus from a given directory, where
http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
    has been expanded. It's not too big, so it is read entirely into memory.
"""
logging.info("loading corpus from %s" % dirname)
# many mangled chars in sentences (datasetSentences.txt)
chars_sst_mangled = ['à', 'á', 'â', 'ã', 'æ', 'ç', 'è', 'é', 'í',
'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'û', 'ü']
sentence_fixups = [(char.decode('utf-8').encode('latin1'), char) for char in chars_sst_mangled]
    # more junk, and the replacements needed for sentence-phrase consistency
sentence_fixups.extend([
('Â', ''),
('\xa0', ' '),
('-LRB-', '('),
('-RRB-', ')'),
])
# only this junk in phrases (dictionary.txt)
phrase_fixups = [('\xa0', ' ')]
# sentence_id and split are only positive for the full sentences
    # read sentences into a temporary {sentence -> (id, split)} dict, to correlate with dictionary.txt
vocab = defaultdict(float)
info_by_sentence = {}
with open(os.path.join(dirname, 'datasetSentences.txt'), 'r') as sentences:
with open(os.path.join(dirname, 'datasetSplit.txt'), 'r') as splits:
next(sentences) # legend
next(splits) # legend
for sentence_line, split_line in izip(sentences, splits):
(id, text) = sentence_line.split('\t')
id = int(id)
text = text.rstrip()
for junk, fix in sentence_fixups:
text = text.replace(junk, fix)
(id2, split_i) = split_line.split(',')
assert id == int(id2)
if text not in info_by_sentence: # discard duplicates
info_by_sentence[text] = (id, int(split_i))
else:
logging.info('Duplicates: %s' % text)
# read all phrase text
phrases = [None] * 239232 # known size of phrases
with open(os.path.join(dirname, 'dictionary.txt'), 'r') as phrase_lines:
for line in phrase_lines:
(text, id) = line.split('|')
for junk, fix in phrase_fixups:
text = text.replace(junk, fix)
phrases[int(id)] = text.rstrip() # for 1st pass just string
    # first pass over sentiment_labels.txt: collect the raw text of full train and
    # test sentences into two long strings, used below for substring membership checks
    test_str = ''
    train_str = ''
with open(os.path.join(dirname, 'sentiment_labels.txt'), 'r') as sentiments:
next(sentiments) # legend
for line in sentiments:
(id, sentiment) = line.split('|')
id = int(id)
sentiment = float(sentiment)
text = phrases[id]
(sentence_id, split_i) = info_by_sentence.get(text, (None, 0))
if split_i == 2: # test data
test_str += text
elif split_i == 1:
train_str += text
# add sentiment labels, correlate with sentences
with open(os.path.join(dirname, 'sentiment_labels.txt'), 'r') as sentiments:
next(sentiments) # legend
for line in sentiments:
(id, sentiment) = line.split('|')
id = int(id)
sentiment = float(sentiment)
text = phrases[id]
words = text.split()
if lowercase:
words = [word.lower() for word in words]
(sentence_id, split_i) = info_by_sentence.get(text, (None, 0))
if sentence_id is not None:
for w in words:
vocab[w]+=1
split = [None, 'train', 'test', 'dev'][split_i]
            # skip phrases that occur inside a test sentence, or that are not a
            # substring of any train sentence
            if sentence_id is None and (text in test_str or text not in train_str):
                phrases[id] = SentimentPhrase(words, [id], split, -1, sentence_id)  # sentiment -1 marks the phrase for removal below
else:
phrases[id] = SentimentPhrase(words, [id], split, sentiment, sentence_id)
logging.info("loaded corpus with %i sentences and %i phrases from %s",
len(info_by_sentence), len(phrases), dirname)
# counts don't match 8544, 2210, 1101 because 13 TRAIN and 1 DEV sentences are duplicates
# print len([phrase for phrase in phrases if phrase.split == 'train']) # == 8531 # 'train'
# print len([phrase for phrase in phrases if phrase.split == 'test']) # == 2210 # 'test'
# print len([phrase for phrase in phrases if phrase.split == 'dev']) # == 1100 # 'dev'
phrase0 = [phrase for phrase in phrases if phrase.sentence_id is None and phrase.sentiment != -1 ]
sentences = [phrase for phrase in phrases if phrase.sentence_id is not None ]
    print 'sentences %d phrases %d vocab %d' % (len(sentences), len(phrase0), len(vocab))
print 'Data example'
print phrase0[10]
print sentences[10]
return phrase0, sentences, vocab
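
# Illustrative usage of the loader above (kept commented out so the script's
# behaviour is unchanged; the directory path is an assumption, matching __main__):
# phrases, sentences, vocab = read_su_sentiment_rotten_tomatoes('./data/stanfordSentimentTreebank')
# train_sents = [s for s in sentences if s.split == 'train']
# dev_sents = [s for s in sentences if s.split == 'dev']
# test_sents = [s for s in sentences if s.split == 'test']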
def load_bin_vec(fname, vocab):
"""
    Load 300-dimensional word vectors from the Google (Mikolov) word2vec binary file,
    keeping only the words that appear in vocab.
"""
word_vecs = {}
with open(fname, "rb") as f:
        header = f.readline()  # first line of the binary file: "<vocab_size> <vector_dim>"
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size  # bytes per stored vector
for line in xrange(vocab_size):
word = []
while True:
ch = f.read(1)
if ch == ' ':
word = ''.join(word)
break
if ch != '\n':
word.append(ch)
if word in vocab:
word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
else:
f.read(binary_len)
return word_vecs
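
# load_bin_vec above assumes the GoogleNews-style word2vec binary layout: an ASCII
# header "<vocab_size> <dim>\n" followed, per entry, by a space-terminated word and
# <dim> raw float32 values. A typical call (the file name is an assumption):
# w2v = load_bin_vec('GoogleNews-vectors-negative300.bin', vocab)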
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
"""
For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance as the pre-trained ones.
"""
for word in vocab:
if word not in word_vecs and vocab[word] >= min_df:
word_vecs[word] = np.random.uniform(-0.25,0.25,k)
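
# Why 0.25: a Uniform(-0.25, 0.25) draw has variance 0.25**2 / 3 ~= 0.021 per
# component, which roughly matches the per-component variance of the pre-trained
# vectors this script expects (an approximation, not verified against any
# particular word2vec file). Illustrative check, kept commented out:
# assert abs(np.random.uniform(-0.25, 0.25, (10000, 300)).var() - 0.25**2 / 3) < 0.001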
def get_W(word_vecs, k=300):
"""
Get word matrix. W[i] is the vector for word indexed by i
"""
vocab_size = len(word_vecs)
word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')  # row 0 is reserved as the all-zero padding vector
i = 1
for word in word_vecs:
W[i] = word_vecs[word]
word_idx_map[word] = i
i += 1
return W, word_idx_map
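
# Sketch of how downstream training code might consume W and word_idx_map: each
# sentence is mapped to a list of row indices and padded with 0, which points at
# the all-zero row W[0]. get_idx_from_sent is a hypothetical helper, not defined
# or used in this file:
# def get_idx_from_sent(words, word_idx_map, max_l):
#     idx = [word_idx_map[w] for w in words if w in word_idx_map]
#     return idx + [0] * (max_l - len(idx))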
if __name__ == '__main__':
w2v_file = sys.argv[1]
print 'load corpus'
phrases, sentences, vocab = read_su_sentiment_rotten_tomatoes('./data/stanfordSentimentTreebank')
print 'load word2vec'
max_l = np.max([len(s.words) for s in sentences])
print "number of sentences: " + str(len(sentences))
print "vocab size: " + str(len(vocab))
print "max sentence length: " + str(max_l)
w2v = load_bin_vec(w2v_file, vocab)
print "%d words in w2v" % len(w2v)
add_unknown_words(w2v, vocab)
W, word_idx_map = get_W(w2v)
rand_vecs = {}
add_unknown_words(rand_vecs, vocab)
W2, _ = get_W(rand_vecs)
cPickle.dump([ phrases, sentences, W, W2, word_idx_map, vocab], open("sst1.p", "wb"))
x = cPickle.load(open("sst1.p","rb"))
phrases, sentences, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4], x[5]
print "dataset created!"