Showing 28 changed files with 385 additions and 0 deletions.
code/Sentiment_lstm.py (new file; path inferred from the entry script's imports)
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
# NOTE: this file targets Python 2 with Keras 1.x, pre-1.0 gensim and
# scikit-learn < 0.20; several APIs below were renamed or removed later.

import yaml
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 only: make implicit conversions use UTF-8

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18
import multiprocessing
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential, model_from_yaml
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation

np.random.seed(1337)  # for reproducibility
import jieba
import pandas as pd
sys.setrecursionlimit(1000000)

# Set parameters:
vocab_dim = 100        # word-vector dimensionality
maxlen = 100           # maximum sentence length, in tokens
n_iterations = 1       # ideally more...
n_exposures = 10       # minimum word frequency kept by word2vec
window_size = 7
batch_size = 32
n_epoch = 4
input_length = 100
cpu_count = multiprocessing.cpu_count()


# Load the training data.
def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)

    combined = np.concatenate((pos[0], neg[0]))
    y = np.concatenate((np.ones(len(pos), dtype=int),
                        np.zeros(len(neg), dtype=int)))

    return combined, y


# Tokenize each sentence and strip newline characters.
def tokenizer(text):
    '''Simple parser: cut each document with jieba after removing line breaks.'''
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text


# Build the word dictionary; return each word's index, each word's vector,
# and the index sequence corresponding to every sentence.
def create_dictionaries(model=None, combined=None):
    '''Does a number of jobs:
    1- creates a word-to-index mapping
    2- creates a word-to-vector mapping
    3- transforms the training and testing word lists into index sequences
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),  # model.wv in later gensim
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency >= n_exposures
        w2vec = {word: model[word] for word in w2indx.keys()}  # vector of every such word

        def parse_dataset(combined):
            '''Words become integers; out-of-vocabulary words become 0.'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Pad/truncate every index sequence to maxlen; low-frequency words keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'


# Train word2vec on the tokenized corpus, then build the dictionaries above.
def word2vec_train(combined):
    model = Word2Vec(size=vocab_dim,        # vector_size= in gensim >= 4
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations)     # epochs= in gensim >= 4
    model.build_vocab(combined)
    model.train(combined)  # later gensim requires total_examples= and epochs=
    model.save('lstm_data/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined


def get_data(index_dict, word_vectors, combined, y):
    n_symbols = len(index_dict) + 1  # +1 because index 0 is reserved for low-frequency words
    embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 (unknown words) stays all-zero
    for word, index in index_dict.items():  # fill rows 1..n with each word's vector
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test


## Define the network architecture and train it.
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    print 'Defining a Simple Keras Model...'
    model = Sequential()
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))
    model.add(LSTM(output_dim=50, activation='sigmoid',  # units= / recurrent_activation= in Keras 2
                   inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    print 'Compiling the Model...'
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print 'Train...'
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,  # epochs= in Keras 2
              verbose=1, validation_data=(x_test, y_test))

    print 'Evaluate...'
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    yaml_string = model.to_yaml()
    with open('lstm_data/lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm.h5')
    print 'Test score:', score


# Train the full pipeline and save the models.
def train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)
    print 'Tokenising...'
    combined = tokenizer(combined)
    print 'Training a Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)
    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)


# Turn a raw sentence into a padded index sequence using the saved word2vec model.
def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('lstm_data/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined


def lstm_predict(string):
    print 'loading model......'
    with open('lstm_data/lstm.yml', 'r') as f:
        yaml_string = yaml.load(f)  # undo the outer yaml.dump() done in train_lstm()
    model = model_from_yaml(yaml_string)

    print 'loading weights......'
    model.load_weights('lstm_data/lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)
    result = model.predict_classes(data)  # removed in later Keras; threshold predict() instead
    if result[0][0] == 1:
        print string, ' positive'
    else:
        print string, ' negative'


if __name__ == '__main__':
    # train()  # uncomment to (re)train; prediction below uses the saved models

    # Pick one test sentence; the original listed several alternatives that
    # overwrote each other, so all but the last are commented out here:
    # string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # string = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    # string = '酒店的环境非常好,价格也便宜,值得推荐'
    # string = '手机质量太差了,傻逼店家,赚黑心钱,以后再也不会买了'
    # string = '我是傻逼'
    # string = '你是傻逼'
    # string = '屏幕较差,拍照也很粗糙。'
    # string = '质量不错,是正品 ,安装师傅也很好,才要了83元材料费'
    string = '东西非常不错,安装师傅很负责人,装的也很漂亮,精致,谢谢安装师傅!'

    lstm_predict(string)
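
The file above runs as written only on Python 2 with Keras 1.x and pre-1.0 gensim. As a hedged reference, a minimal sketch of the same word2vec-to-Embedding handoff under current APIs (assuming Python 3, gensim >= 4 and tf.keras; `tokenized_sentences` is a placeholder for the jieba output of tokenizer()):

# Hypothetical modernization sketch, not part of this commit.
import numpy as np
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential

w2v = Word2Vec(vector_size=100, min_count=10, window=7)       # was: size=, iter=
w2v.build_vocab(tokenized_sentences)
w2v.train(tokenized_sentences,
          total_examples=w2v.corpus_count, epochs=5)          # now required arguments
word_index = {w: i + 1 for i, w in enumerate(w2v.wv.index_to_key)}  # was: model.vocab

emb_matrix = np.zeros((len(word_index) + 1, 100))             # row 0 stays zero for OOV
for w, i in word_index.items():
    emb_matrix[i] = w2v.wv[w]

net = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100,
              embeddings_initializer=Constant(emb_matrix),    # replaces weights=[...]
              mask_zero=True, input_length=100),
    LSTM(50),                                                 # units= replaces output_dim=
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# net.fit(x_train, y_train, batch_size=32, epochs=4)          # epochs= replaces nb_epoch=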
(binary file not shown)

code/Sentiment_svm.py (new file; path inferred from the entry script's imports)
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr  5 10:05:30 2016

@author: ldy
"""
# NOTE: targets Python 2 with pre-1.0 gensim and scikit-learn < 0.20.

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
from sklearn.externals import joblib  # standalone 'joblib' package in later sklearn
from sklearn.svm import SVC
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 only


# Load the data files and tokenize the sentences.
def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)

    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # Use 1 for positive sentiment, 0 for negative.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)

    np.save('svm_data/y_train.npy', y_train)
    np.save('svm_data/y_test.npy', y_test)
    return x_train, x_test


# Represent a sentence as the mean of its word vectors.
def buildWordVector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue  # skip out-of-vocabulary words
    if count != 0:
        vec /= count
    return vec


# Train word2vec and compute the sentence vectors.
def get_train_vecs(x_train, x_test):
    n_dim = 300
    # Initialize the model and build the vocabulary.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)  # vector_size= in gensim >= 4
    imdb_w2v.build_vocab(x_train)

    # Train the model over the training reviews (this may take several minutes).
    imdb_w2v.train(x_train)  # later gensim requires total_examples= and epochs=

    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    # train_vecs = scale(train_vecs)

    np.save('svm_data/train_vecs.npy', train_vecs)
    print train_vecs.shape
    # Continue training word2vec on the test reviews.
    imdb_w2v.train(x_test)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')
    # Build the test sentence vectors.
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    # test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy', test_vecs)
    print test_vecs.shape


def get_data():
    train_vecs = np.load('svm_data/train_vecs.npy')
    y_train = np.load('svm_data/y_train.npy')
    test_vecs = np.load('svm_data/test_vecs.npy')
    y_test = np.load('svm_data/y_test.npy')
    return train_vecs, y_train, test_vecs, y_test


## Train the SVM model and save it.
def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print clf.score(test_vecs, y_test)


## Build the sentence vector for a single sentence to be classified.
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    return train_vecs


#### Predict the sentiment of a single sentence.
def svm_predict(string):
    words = jieba.lcut(string)
    words_vecs = get_predict_vecs(words)
    clf = joblib.load('svm_data/svm_model/model.pkl')

    result = clf.predict(words_vecs)

    if int(result[0]) == 1:
        print string, ' positive'
    else:
        print string, ' negative'


if __name__ == '__main__':

    ## Load the data and save the feature vectors:
    # x_train, x_test = loadfile()        # tokenized sentences; labels saved to y_train.npy / y_test.npy
    # get_train_vecs(x_train, x_test)     # sentence vectors saved to train_vecs.npy / test_vecs.npy
    # train_vecs, y_train, test_vecs, y_test = get_data()  # reload training and test data
    # svm_train(train_vecs, y_train, test_vecs, y_test)    # train the SVM and save the model

    ## Classify the sentiment of an input sentence:
    string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # string = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    svm_predict(string)
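
A design note on buildWordVector(): each sentence is reduced to the unweighted mean of its 300-d word vectors, which yields a fixed-length input for the RBF SVC but discards word order entirely. A toy check of that behavior with a hypothetical two-word "model" (a plain dict also supports the imdb_w2v[word] lookup the function relies on):

# Toy sanity check for buildWordVector(); the 2-d vectors here are hypothetical.
import numpy as np
toy_w2v = {u'好': np.array([1.0, 0.0]), u'坏': np.array([0.0, 1.0])}
print buildWordVector([u'好', u'坏', u'冰箱'], 2, toy_w2v)  # [[0.5 0.5]]; OOV '冰箱' is skipped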
(5 binary files not shown)

lstm_data/lstm.yml (new file): the Keras architecture saved by train_lstm()
@@ -0,0 +1,16 @@
"class_name: Sequential\nconfig:\n- class_name: Embedding\n config:\n W_constraint:\ | ||
\ null\n W_regularizer: null\n activity_regularizer: null\n batch_input_shape:\ | ||
\ !!python/tuple [null, 100]\n dropout: 0.0\n init: uniform\n input_dim:\ | ||
\ 8310\n input_dtype: int32\n input_length: 100\n mask_zero: true\n \ | ||
\ name: embedding_1\n output_dim: 100\n trainable: true\n- class_name: LSTM\n\ | ||
\ config: {U_regularizer: null, W_regularizer: null, activation: sigmoid, b_regularizer:\ | ||
\ null,\n consume_less: cpu, dropout_U: 0.0, dropout_W: 0.0, forget_bias_init:\ | ||
\ one, go_backwards: false,\n init: glorot_uniform, inner_activation: hard_sigmoid,\ | ||
\ inner_init: orthogonal,\n input_dim: 100, input_length: null, name: lstm_1,\ | ||
\ output_dim: 50, return_sequences: false,\n stateful: false, trainable: true,\ | ||
\ unroll: false}\n- class_name: Dropout\n config: {name: dropout_1, p: 0.5, trainable:\ | ||
\ true}\n- class_name: Dense\n config: {W_constraint: null, W_regularizer: null,\ | ||
\ activation: linear, activity_regularizer: null,\n b_constraint: null, b_regularizer:\ | ||
\ null, bias: true, init: glorot_uniform, input_dim: null,\n name: dense_1, output_dim:\ | ||
\ 1, trainable: true}\n- class_name: Activation\n config: {activation: sigmoid,\ | ||
\ name: activation_1, trainable: true}\n" |
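
Note the double encoding visible above: train_lstm() passes the model.to_yaml() string through yaml.dump() a second time, so the file stores one quoted YAML string rather than plain YAML. A minimal standalone loader that mirrors what lstm_predict() does (Keras 1.x, as in this commit):

import yaml
from keras.models import model_from_yaml

with open('lstm_data/lstm.yml') as f:
    arch = yaml.load(f)            # outer load returns the inner YAML string
model = model_from_yaml(arch)      # second parse rebuilds the Sequential model
model.load_weights('lstm_data/lstm.h5')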
Entry script (new file; its file name is not shown in this extract)
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  8 19:57:29 2016

@author: ldy
"""

import sys
sys.path.append("code")
from Sentiment_svm import svm_predict
from Sentiment_lstm import lstm_predict

argvs_length = len(sys.argv)
if argvs_length != 3:
    print 'Wrong number of arguments!'  # expected: <script> svm|lstm <sentence>
    sys.exit(1)
argvs = sys.argv

sentence = argvs[-1]

if argvs[1] == 'svm':
    svm_predict(sentence)
elif argvs[1] == 'lstm':
    lstm_predict(sentence)
else:
    print 'Choose svm or lstm!'
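
Assuming the script is saved as main.py at the repository root (an assumed name, since the extract does not show it), it selects a classifier from the first argument and classifies the second:

# Hypothetical usage; 'main.py' is an assumed file name.
#   python main.py svm '酒店的环境非常好,价格也便宜,值得推荐'
#   python main.py lstm '酒店的环境非常好,价格也便宜,值得推荐'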
requirements.txt (new file)
@@ -0,0 +1,7 @@
keras
gensim
jieba
sklearn
h5py
numpy
pandas
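
The list is unpinned, but the APIs used in this commit (nb_epoch, sklearn.cross_validation, gensim's model.vocab and argument-less train(), Python 2 print statements) predate Keras 2, scikit-learn 0.20 and gensim 1.0. A pin set consistent with those calls, offered as an untested assumption rather than the repository's actual environment:

# Assumed pins for the Python 2 code above; not part of the commit.
keras<2.0
gensim<1.0
jieba
scikit-learn<0.20   # still provides sklearn.cross_validation (deprecated from 0.18)
h5py
numpy
pandas
xlrd                # required by pandas.read_excel for the .xls data files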
(17 binary files not shown)