Showing 28 changed files with 385 additions and 0 deletions.
code/Sentiment_lstm.py (new file; path inferred from the entry script's imports)
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
# NOTE: this file targets Python 2 with Keras 1.x, pre-1.0 gensim and
# scikit-learn < 0.20; several APIs below were renamed or removed later.

import yaml
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 only: make implicit conversions use UTF-8

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18
import multiprocessing
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential, model_from_yaml
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation

np.random.seed(1337)  # for reproducibility
import jieba
import pandas as pd
sys.setrecursionlimit(1000000)

# Set parameters:
vocab_dim = 100        # word-vector dimensionality
maxlen = 100           # maximum sentence length, in tokens
n_iterations = 1       # ideally more...
n_exposures = 10       # minimum word frequency kept by word2vec
window_size = 7
batch_size = 32
n_epoch = 4
input_length = 100
cpu_count = multiprocessing.cpu_count()


# Load the training data.
def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)

    combined = np.concatenate((pos[0], neg[0]))
    y = np.concatenate((np.ones(len(pos), dtype=int),
                        np.zeros(len(neg), dtype=int)))

    return combined, y


# Tokenize each sentence and strip newline characters.
def tokenizer(text):
    '''Simple parser: cut each document with jieba after removing line breaks.'''
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text


# Build the word dictionary; return each word's index, each word's vector,
# and the index sequence corresponding to every sentence.
def create_dictionaries(model=None, combined=None):
    '''Does a number of jobs:
    1- creates a word-to-index mapping
    2- creates a word-to-vector mapping
    3- transforms the training and testing word lists into index sequences
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),  # model.wv in later gensim
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency >= n_exposures
        w2vec = {word: model[word] for word in w2indx.keys()}  # vector of every such word

        def parse_dataset(combined):
            '''Words become integers; out-of-vocabulary words become 0.'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Pad/truncate every index sequence to maxlen; low-frequency words keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'


# Train word2vec on the tokenized corpus, then build the dictionaries above.
def word2vec_train(combined):
    model = Word2Vec(size=vocab_dim,        # vector_size= in gensim >= 4
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations)     # epochs= in gensim >= 4
    model.build_vocab(combined)
    model.train(combined)  # later gensim requires total_examples= and epochs=
    model.save('lstm_data/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined


def get_data(index_dict, word_vectors, combined, y):
    n_symbols = len(index_dict) + 1  # +1 because index 0 is reserved for low-frequency words
    embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 (unknown words) stays all-zero
    for word, index in index_dict.items():  # fill rows 1..n with each word's vector
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test


## Define the network architecture and train it.
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    print 'Defining a Simple Keras Model...'
    model = Sequential()
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))
    model.add(LSTM(output_dim=50, activation='sigmoid',  # units= / recurrent_activation= in Keras 2
                   inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    print 'Compiling the Model...'
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print 'Train...'
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,  # epochs= in Keras 2
              verbose=1, validation_data=(x_test, y_test))

    print 'Evaluate...'
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    yaml_string = model.to_yaml()
    with open('lstm_data/lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm.h5')
    print 'Test score:', score


# Train the full pipeline and save the models.
def train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)
    print 'Tokenising...'
    combined = tokenizer(combined)
    print 'Training a Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)
    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)


# Turn a raw sentence into a padded index sequence using the saved word2vec model.
def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('lstm_data/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined


def lstm_predict(string):
    print 'loading model......'
    with open('lstm_data/lstm.yml', 'r') as f:
        yaml_string = yaml.load(f)  # undo the outer yaml.dump() done in train_lstm()
    model = model_from_yaml(yaml_string)

    print 'loading weights......'
    model.load_weights('lstm_data/lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)
    result = model.predict_classes(data)  # removed in later Keras; threshold predict() instead
    if result[0][0] == 1:
        print string, ' positive'
    else:
        print string, ' negative'


if __name__ == '__main__':
    # train()  # uncomment to (re)train; prediction below uses the saved models

    # Pick one test sentence; the original listed several alternatives that
    # overwrote each other, so all but the last are commented out here:
    # string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # string = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    # string = '酒店的环境非常好,价格也便宜,值得推荐'
    # string = '手机质量太差了,傻逼店家,赚黑心钱,以后再也不会买了'
    # string = '我是傻逼'
    # string = '你是傻逼'
    # string = '屏幕较差,拍照也很粗糙。'
    # string = '质量不错,是正品 ,安装师傅也很好,才要了83元材料费'
    string = '东西非常不错,安装师傅很负责人,装的也很漂亮,精致,谢谢安装师傅!'

    lstm_predict(string)
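
The file above runs as written only on Python 2 with Keras 1.x and pre-1.0 gensim. As a hedged reference, a minimal sketch of the same word2vec-to-Embedding handoff under current APIs (assuming Python 3, gensim >= 4 and tf.keras; `tokenized_sentences` is a placeholder for the jieba output of tokenizer()):

# Hypothetical modernization sketch, not part of this commit.
import numpy as np
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential

w2v = Word2Vec(vector_size=100, min_count=10, window=7)       # was: size=, iter=
w2v.build_vocab(tokenized_sentences)
w2v.train(tokenized_sentences,
          total_examples=w2v.corpus_count, epochs=5)          # now required arguments
word_index = {w: i + 1 for i, w in enumerate(w2v.wv.index_to_key)}  # was: model.vocab

emb_matrix = np.zeros((len(word_index) + 1, 100))             # row 0 stays zero for OOV
for w, i in word_index.items():
    emb_matrix[i] = w2v.wv[w]

net = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100,
              embeddings_initializer=Constant(emb_matrix),    # replaces weights=[...]
              mask_zero=True, input_length=100),
    LSTM(50),                                                 # units= replaces output_dim=
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# net.fit(x_train, y_train, batch_size=32, epochs=4)          # epochs= replaces nb_epoch=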
(binary file not shown)

code/Sentiment_svm.py (new file; path inferred from the entry script's imports)
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr  5 10:05:30 2016

@author: ldy
"""
# NOTE: targets Python 2 with pre-1.0 gensim and scikit-learn < 0.20.

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
from sklearn.externals import joblib  # standalone 'joblib' package in later sklearn
from sklearn.svm import SVC
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 only


# Load the data files and tokenize the sentences.
def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)

    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # Use 1 for positive sentiment, 0 for negative.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)

    np.save('svm_data/y_train.npy', y_train)
    np.save('svm_data/y_test.npy', y_test)
    return x_train, x_test


# Represent a sentence as the mean of its word vectors.
def buildWordVector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue  # skip out-of-vocabulary words
    if count != 0:
        vec /= count
    return vec


# Train word2vec and compute the sentence vectors.
def get_train_vecs(x_train, x_test):
    n_dim = 300
    # Initialize the model and build the vocabulary.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)  # vector_size= in gensim >= 4
    imdb_w2v.build_vocab(x_train)

    # Train the model over the training reviews (this may take several minutes).
    imdb_w2v.train(x_train)  # later gensim requires total_examples= and epochs=

    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    # train_vecs = scale(train_vecs)

    np.save('svm_data/train_vecs.npy', train_vecs)
    print train_vecs.shape
    # Continue training word2vec on the test reviews.
    imdb_w2v.train(x_test)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')
    # Build the test sentence vectors.
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    # test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy', test_vecs)
    print test_vecs.shape


def get_data():
    train_vecs = np.load('svm_data/train_vecs.npy')
    y_train = np.load('svm_data/y_train.npy')
    test_vecs = np.load('svm_data/test_vecs.npy')
    y_test = np.load('svm_data/y_test.npy')
    return train_vecs, y_train, test_vecs, y_test


## Train the SVM model and save it.
def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print clf.score(test_vecs, y_test)


## Build the sentence vector for a single sentence to be classified.
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    return train_vecs


#### Predict the sentiment of a single sentence.
def svm_predict(string):
    words = jieba.lcut(string)
    words_vecs = get_predict_vecs(words)
    clf = joblib.load('svm_data/svm_model/model.pkl')

    result = clf.predict(words_vecs)

    if int(result[0]) == 1:
        print string, ' positive'
    else:
        print string, ' negative'


if __name__ == '__main__':

    ## Load the data and save the feature vectors:
    # x_train, x_test = loadfile()        # tokenized sentences; labels saved to y_train.npy / y_test.npy
    # get_train_vecs(x_train, x_test)     # sentence vectors saved to train_vecs.npy / test_vecs.npy
    # train_vecs, y_train, test_vecs, y_test = get_data()  # reload training and test data
    # svm_train(train_vecs, y_train, test_vecs, y_test)    # train the SVM and save the model

    ## Classify the sentiment of an input sentence:
    string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # string = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    svm_predict(string)
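
A design note on buildWordVector(): each sentence is reduced to the unweighted mean of its 300-d word vectors, which yields a fixed-length input for the RBF SVC but discards word order entirely. A toy check of that behavior with a hypothetical two-word "model" (a plain dict also supports the imdb_w2v[word] lookup the function relies on):

# Toy sanity check for buildWordVector(); the 2-d vectors here are hypothetical.
import numpy as np
toy_w2v = {u'好': np.array([1.0, 0.0]), u'坏': np.array([0.0, 1.0])}
print buildWordVector([u'好', u'坏', u'冰箱'], 2, toy_w2v)  # [[0.5 0.5]]; OOV '冰箱' is skipped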
(5 binary files not shown)

lstm_data/lstm.yml (new file): the Keras architecture saved by train_lstm()
@@ -0,0 +1,16 @@
"class_name: Sequential\nconfig:\n- class_name: Embedding\n config:\n W_constraint:\ | ||
\ null\n W_regularizer: null\n activity_regularizer: null\n batch_input_shape:\ | ||
\ !!python/tuple [null, 100]\n dropout: 0.0\n init: uniform\n input_dim:\ | ||
\ 8310\n input_dtype: int32\n input_length: 100\n mask_zero: true\n \ | ||
\ name: embedding_1\n output_dim: 100\n trainable: true\n- class_name: LSTM\n\ | ||
\ config: {U_regularizer: null, W_regularizer: null, activation: sigmoid, b_regularizer:\ | ||
\ null,\n consume_less: cpu, dropout_U: 0.0, dropout_W: 0.0, forget_bias_init:\ | ||
\ one, go_backwards: false,\n init: glorot_uniform, inner_activation: hard_sigmoid,\ | ||
\ inner_init: orthogonal,\n input_dim: 100, input_length: null, name: lstm_1,\ | ||
\ output_dim: 50, return_sequences: false,\n stateful: false, trainable: true,\ | ||
\ unroll: false}\n- class_name: Dropout\n config: {name: dropout_1, p: 0.5, trainable:\ | ||
\ true}\n- class_name: Dense\n config: {W_constraint: null, W_regularizer: null,\ | ||
\ activation: linear, activity_regularizer: null,\n b_constraint: null, b_regularizer:\ | ||
\ null, bias: true, init: glorot_uniform, input_dim: null,\n name: dense_1, output_dim:\ | ||
\ 1, trainable: true}\n- class_name: Activation\n config: {activation: sigmoid,\ | ||
\ name: activation_1, trainable: true}\n" |
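
Note the double encoding visible above: train_lstm() passes the model.to_yaml() string through yaml.dump() a second time, so the file stores one quoted YAML string rather than plain YAML. A minimal standalone loader that mirrors what lstm_predict() does (Keras 1.x, as in this commit):

import yaml
from keras.models import model_from_yaml

with open('lstm_data/lstm.yml') as f:
    arch = yaml.load(f)            # outer load returns the inner YAML string
model = model_from_yaml(arch)      # second parse rebuilds the Sequential model
model.load_weights('lstm_data/lstm.h5')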
Entry script (new file; its file name is not shown in this extract)
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  8 19:57:29 2016

@author: ldy
"""

import sys
sys.path.append("code")
from Sentiment_svm import svm_predict
from Sentiment_lstm import lstm_predict

argvs_length = len(sys.argv)
if argvs_length != 3:
    print 'Wrong number of arguments!'  # expected: <script> svm|lstm <sentence>
    sys.exit(1)
argvs = sys.argv

sentence = argvs[-1]

if argvs[1] == 'svm':
    svm_predict(sentence)
elif argvs[1] == 'lstm':
    lstm_predict(sentence)
else:
    print 'Choose svm or lstm!'
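
Assuming the script is saved as main.py at the repository root (an assumed name, since the extract does not show it), it selects a classifier from the first argument and classifies the second:

# Hypothetical usage; 'main.py' is an assumed file name.
#   python main.py svm '酒店的环境非常好,价格也便宜,值得推荐'
#   python main.py lstm '酒店的环境非常好,价格也便宜,值得推荐'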
requirements.txt (new file)
@@ -0,0 +1,7 @@
keras
gensim
jieba
sklearn
h5py
numpy
pandas
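
The list is unpinned, but the APIs used in this commit (nb_epoch, sklearn.cross_validation, gensim's model.vocab and argument-less train(), Python 2 print statements) predate Keras 2, scikit-learn 0.20 and gensim 1.0. A pin set consistent with those calls, offered as an untested assumption rather than the repository's actual environment:

# Assumed pins for the Python 2 code above; not part of the commit.
keras<2.0
gensim<1.0
jieba
scikit-learn<0.20   # still provides sklearn.cross_validation (deprecated from 0.18)
h5py
numpy
pandas
xlrd                # required by pandas.read_excel for the .xls data files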
(17 binary files not shown)