Skip to content

Commit

Permalink
rbert
Browse files Browse the repository at this point in the history
  • Loading branch information
coderJoon committed Nov 29, 2022
1 parent 4546689 commit 21819b3
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 32 deletions.
5 changes: 5 additions & 0 deletions inf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Run inference.py once per line of command_file.txt, passing the line's
# whitespace-separated tokens as individual CLI arguments.
# -r            : keep backslashes literal instead of treating them as escapes
# || [ -n ... ] : still process a final line that lacks a trailing newline
# $line is deliberately unquoted so each token becomes a separate argument.
while read -r line || [ -n "$line" ]
do
    python inference.py $line
done < command_file.txt
44 changes: 35 additions & 9 deletions inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import argparse
from tqdm import tqdm
from omegaconf import OmegaConf
from models import *
from models import auto_models,R_BERT
import datetime
from utils.metric import label_to_num
from pytz import timezone
Expand All @@ -27,15 +27,24 @@ def inference(model, tokenized_sent, device):
output_prob = []
for i, data in enumerate(dataloader): # tqdm
with torch.no_grad():
outputs = model(
if cfg.model.type == 'rbert':
outputs = model (data['sub_ids'].to(device),data['obj_ids'].to(device),
input_ids=data['input_ids'].to(device),
attention_mask=data['attention_mask'].to(device),
token_type_ids=data['token_type_ids'].to(device)
)
else:
outputs = model(
input_ids=data['input_ids'].to(device),
attention_mask=data['attention_mask'].to(device),
token_type_ids=data['token_type_ids'].to(device)
)
if cfg.model.type == 'CNN':
logits = outputs.get('logits')
elif cfg.model.type == 'base':
logits = outputs[0]
elif cfg.model.type == 'rbert':
logits = outputs.get('logits')
prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
logits = logits.detach().cpu().numpy()
result = np.argmax(logits, axis=-1)
Expand Down Expand Up @@ -66,8 +75,12 @@ def load_test_dataset(dataset_dir, tokenizer):
test_dataset = dataset.load_data(dataset_dir)
test_label = list(map(int,test_dataset['label'].values))
# tokenizing dataset
tokenized_test = dataset.tokenized_dataset(test_dataset, tokenizer)
return test_dataset['id'], tokenized_test, test_label
if cfg.model.type == 'rbert':
tokenized_test,sub_list,obj_list = dataset.tokenized_dataset(test_dataset, tokenizer, cfg.model.type,cfg.data.mode)
return test_dataset['id'], tokenized_test,sub_list,obj_list, test_label
else:
tokenized_test = dataset.tokenized_dataset(test_dataset, tokenizer, cfg.model.type,cfg.data.mode)
return test_dataset['id'], tokenized_test, test_label

def main(cfg):
"""
Expand All @@ -86,19 +99,27 @@ def main(cfg):
model = auto_models.CNN_Model(MODEL_NAME)
elif cfg.model.type == 'enitity':
model = auto_models.EntityModel(MODEL_NAME)
elif cfg.model.type =='rbert':
model = R_BERT.RBERT(MODEL_NAME)
best_state_dict= torch.load(cfg.test.model_dir)
model.load_state_dict(best_state_dict)
model.parameters
model.to(device)

## load test datset
test_dataset_dir = cfg.path.predict_path
test_id, test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
Re_test_dataset = RE_Dataset(test_dataset ,test_label)
if cfg.model.type == 'rbert':
test_id, test_dataset,sub_list,obj_list,test_label = load_test_dataset(test_dataset_dir, tokenizer)
Re_test_dataset = RBERT_Dataset(test_dataset,test_label,sub_list,obj_list)

else:
test_id, test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
Re_test_dataset = RE_Dataset(test_dataset ,test_label)

## predict answer
pred_answer, output_prob = inference(model, Re_test_dataset, device) # model에서 class 추론
pred_answer = num_to_label(pred_answer) # 숫자로 된 class를 원래 문자열 라벨로 변환.


## make csv file with predicted answer
#########################################################
Expand All @@ -108,17 +129,22 @@ def main(cfg):
output.to_csv(cfg.test.prediction, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장.
#### 필수!! ##############################################
print('---- Finish! ----')

val_process = Preprocess(cfg.path.dev_path)
dev_dataset = val_process.data
dev_label = label_to_num(dev_dataset['label'].values)
tokenized_dev = val_process.tokenized_dataset(dev_dataset, tokenizer)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
if cfg.model.type == 'rbert':
tokenized_dev, sub_mask, obj_mask = val_process.tokenized_dataset(dev_dataset, tokenizer, cfg.model.type , cfg.data.mode)
RE_dev_dataset = RBERT_Dataset(tokenized_dev, dev_label, sub_mask, obj_mask)
else:
tokenized_dev = val_process.tokenized_dataset(dev_dataset, tokenizer)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

_, output_prob = inference(model, RE_dev_dataset, device) # model에서 class 추론
result = [' '.join(map(lambda x: f'{x:.3f}', out)) for out in output_prob]
dev_dataset['output_prob'] = result
time = get_time()
dev_dataset.to_csv(f"EDA/output/{cfg.exp.exp_name}_{time}.csv", index=False)
dev_dataset.to_csv(f"./EDA/output/{cfg.exp.exp_name}_{time}.csv", index=False)
print('----csv generate Finish! ----')

def get_time():
Expand Down
89 changes: 78 additions & 11 deletions load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import torch
import tqdm
from utils import make_entity_ids


class RE_Dataset(torch.utils.data.Dataset):
Expand All @@ -19,6 +20,24 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.labels)

class RBERT_Dataset(torch.utils.data.Dataset):
    """Relation-extraction dataset for R-BERT.

    Wraps the tokenizer encodings and labels, and additionally carries the
    per-example subject/object entity-position id masks that R-BERT's
    entity-average pooling consumes.
    """

    def __init__(self, pair_dataset, labels, sub_ids, obj_ids):
        # pair_dataset: dict of equally-sized tensors from the tokenizer
        # labels/sub_ids/obj_ids: per-example sequences, indexable by position
        self.pair_dataset = pair_dataset
        self.labels = labels
        self.sub_ids = sub_ids
        self.obj_ids = obj_ids

    def __getitem__(self, idx):
        # Detached copies of the encoding tensors for this example...
        sample = {
            name: tensor[idx].clone().detach()
            for name, tensor in self.pair_dataset.items()
        }
        # ...plus the label and both entity-position masks as tensors.
        sample['labels'] = torch.tensor(self.labels[idx])
        sample['sub_ids'] = torch.tensor(self.sub_ids[idx])
        sample['obj_ids'] = torch.tensor(self.obj_ids[idx])
        return sample

    def __len__(self):
        return len(self.labels)

class Preprocess:
def __init__(self, path):
self.data = self.load_data(path)
Expand All @@ -38,15 +57,63 @@ def label_to_num(self, label):

return num_label

def tokenized_dataset(self, dataset, tokenizer):

tokenized_sentences = tokenizer(
list(dataset['sentence']),
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=256,
add_special_tokens=True,
)
def tokenized_dataset(self, dataset, tokenizer,type,test=False):
    """Tokenize the dataframe's sentences for the selected model type.

    Args:
        dataset: dataframe with a 'sentence' column; when ``test`` is truthy
            it must also provide 'subject_entity' and 'object_entity'.
        tokenizer: HuggingFace tokenizer instance.
        type: model-type string; 'rbert' additionally computes and returns
            entity-position id lists.
            NOTE(review): parameter shadows the builtin ``type``.
        test: when truthy, a query string built from the entity pair is
            tokenized together with the sentence as a sentence pair.
            (callers pass ``cfg.data.mode`` here — presumably a flag;
            TODO confirm its values)

    Returns:
        'rbert': (tokenized_sentences, sub_list, obj_list)
        otherwise: tokenized_sentences
    """
    if type == 'rbert':
        sub_list = []
        obj_list = []

        # Per-sentence subject/object token-position ids consumed by
        # R-BERT's entity-average pooling.
        for sent in dataset['sentence']:
            sub_id,obj_id = make_entity_ids.make_ent_ids(tokenizer,sent)
            sub_list.append(sub_id)
            obj_list.append(obj_id)
        if test:
            # Query half of the sentence pair: "<subj> 와(과) <obj> 의 관계는? : "
            tmp = []
            for e01,e02 in zip(dataset['subject_entity'],dataset['object_entity']):
                ex = f"{e01} 와(과) {e02} 의 관계는? : "
                tmp.append(ex)
            # NOTE(review): here the query is passed FIRST and the sentence
            # second, while the non-rbert branch below passes
            # (sentence, query) — confirm this asymmetry is intentional.
            tokenized_sentences = tokenizer(
                tmp,
                list(dataset['sentence']),
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=256,
                add_special_tokens=True,
            )
        else:
            # Single-sentence encoding, fixed-length padded to 256 tokens.
            tokenized_sentences = tokenizer(
                list(dataset['sentence']),
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=256,
                add_special_tokens=True,
            )
        return tokenized_sentences,sub_list,obj_list

    else:
        if test:
            tmp = []
            for e01,e02 in zip(dataset['subject_entity'],dataset['object_entity']):
                ex = f"{e01} 와(과) {e02} 의 관계는? : "
                tmp.append(ex)
            # Sentence-pair encoding: (sentence, query) order here.
            tokenized_sentences = tokenizer(
                list(dataset['sentence']),
                tmp,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=256,
                add_special_tokens=True,
            )
        else:
            tokenized_sentences = tokenizer(
                list(dataset['sentence']),
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=256,
                add_special_tokens=True,
            )

        return tokenized_sentences
    # NOTE(review): unreachable — duplicate of the return above; looks like
    # a diff/merge leftover and is safe to delete.
    return tokenized_sentences
74 changes: 74 additions & 0 deletions models/R_BERT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
from transformers import AutoModel,AutoConfig


class FCLayer(nn.Module):
    """Dropout -> (optional) Tanh -> Linear block, as used by R-BERT heads.

    Note the activation is applied *before* the linear projection, matching
    the reference R-BERT implementation.
    """

    def __init__(self, input_dim, output_dim, use_activation=True):
        super(FCLayer, self).__init__()
        # Submodule attribute names are part of the checkpoint state-dict
        # keys — do not rename.
        self.use_activation = use_activation
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)


class RBERT(nn.Module):
    """R-BERT relation classifier: [CLS] + averaged subject/object entity
    representations, each passed through an FC layer, concatenated and
    projected to the label space.
    """

    def __init__(self,MODEL_NAME):
        super(RBERT, self).__init__()
        self.MODEL_NAME = MODEL_NAME
        self.Backbone = AutoModel.from_pretrained(self.MODEL_NAME)
        self.model_config = AutoConfig.from_pretrained(self.MODEL_NAME)
        self.hidden_size = self.model_config.hidden_size
        self.num_labels = 30  # number of relation classes

        self.cls_fc_layer = FCLayer(self.hidden_size, self.hidden_size)     # [CLS] projection
        self.entity_fc_layer = FCLayer(self.hidden_size, self.hidden_size)  # shared by both entities
        self.label_classifier = FCLayer(
            self.hidden_size * 3,
            self.num_labels,  # was hard-coded 30; keep in sync with num_labels
            use_activation=False,
        )

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """
        Average the hidden-state vectors at the positions marked by e_mask.
        :param hidden_output: [batch_size, max_seq_len, dim]
        :param e_mask: [batch_size, max_seq_len]
               e.g. e_mask[0] == [0, 0, 0, 1, 1, 1, 0, 0, ... 0]
        :return: [batch_size, dim]
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, max_seq_len]
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1)  # [batch_size, 1]

        # [b, 1, max_seq_len] x [b, max_seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        # clamp(min=1) guards against an all-zero mask, which previously
        # produced a 0/0 NaN; such rows now average to zero instead.
        avg_vector = sum_vector.float() / length_tensor.float().clamp(min=1)
        return avg_vector

    def forward(self, e1_mask, e2_mask,**batch):
        """Classify the relation for a batch.

        :param e1_mask: subject-entity position mask (callers pass 'sub_ids')
        :param e2_mask: object-entity position mask (callers pass 'obj_ids')
        :param batch: tokenizer encodings (input_ids, token_type_ids,
                      attention_mask)
        :return: {'logits': [batch_size, num_labels]}
        """
        inputs = {'input_ids':batch.get('input_ids'),'token_type_ids':batch.get('token_type_ids'),'attention_mask':batch.get('attention_mask')}
        outputs = self.Backbone(**inputs)  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]

        # Average the hidden states over each entity's token span
        e1_h = self.entity_average(sequence_output, e1_mask)
        e2_h = self.entity_average(sequence_output, e2_mask)

        # Dropout -> tanh -> fc_layer (FC layer shared between e1 and e2)
        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.entity_fc_layer(e1_h)
        e2_h = self.entity_fc_layer(e2_h)

        # Concat [CLS | e1 | e2] -> classifier
        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)

        return {'logits':logits}
4 changes: 3 additions & 1 deletion models/auto_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(self,MODEL_NAME):
self.pooling_layers = nn.ModuleList([nn.MaxPool1d(256-i+1) for i in range(2,12)])
self.linear1 = nn.Linear(1000,500)
self.linear2 = nn.Linear(500,30)
self.dropout = nn.Dropout(p=0.5)

def forward(self,**batch):
inputs = {'input_ids':batch.get('input_ids'),'token_type_ids':batch.get('token_type_ids'),'attention_mask':batch.get('attention_mask')}
Expand All @@ -51,7 +52,8 @@ def forward(self,**batch):
y = torch.cat(tmp,axis=1).squeeze() # (Batch , 600)

y = self.linear1(y)
y = torch.relu(y)
y = torch.sigmoid(y)
y = self.dropout(y)
logits = self.linear2(y) # (Batch, 300)

return {'logits':logits}
Expand Down
5 changes: 5 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Run train.py once per line of command_file.txt, passing the line's
# whitespace-separated tokens as individual CLI arguments.
# -r            : keep backslashes literal instead of treating them as escapes
# || [ -n ... ] : still process a final line that lacks a trailing newline
# $line is deliberately unquoted so each token becomes a separate argument.
while read -r line || [ -n "$line" ]
do
    python train.py $line
done < command_file.txt
30 changes: 19 additions & 11 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from utils.augmentation import *
import random
from utils.metric import *
from models import *
from models import auto_models,R_BERT
from trainer import *
import yaml
from omegaconf import OmegaConf
Expand All @@ -45,11 +45,17 @@ def train():
dev_label = label_to_num(dev_dataset['label'].values)

print('Data Tokenizing...')
tokenized_train = train_preprocess.tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = dev_preprocess.tokenized_dataset(dev_dataset, tokenizer)

RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
print(f'Selected Tokenize Type: {cfg.model.type}')
if cfg.model.type == "rbert":
tokenized_train,train_sub_ids,train_obj_ids = train_preprocess.tokenized_dataset(train_dataset, tokenizer,type=cfg.model.type,test=cfg.data.mode)
tokenized_dev,dev_sub_ids,dev_obj_ids = dev_preprocess.tokenized_dataset(dev_dataset, tokenizer,type = cfg.model.type,test=cfg.data.mode)
RE_train_dataset = RBERT_Dataset(tokenized_train, train_label,train_sub_ids,train_obj_ids)
RE_dev_dataset = RBERT_Dataset(tokenized_dev, dev_label,dev_sub_ids,dev_obj_ids)
else:
tokenized_train = train_preprocess.tokenized_dataset(train_dataset, tokenizer,type=cfg.model.type,test=cfg.data.mode)
tokenized_dev = dev_preprocess.tokenized_dataset(dev_dataset, tokenizer,type = cfg.model.type,test=cfg.data.mode)
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Expand All @@ -60,6 +66,8 @@ def train():
model = auto_models.RE_Model(MODEL_NAME)
elif cfg.model.type == "entity":
model = auto_models.EntityModel(MODEL_NAME)
elif cfg.model.type == "rbert":
model = R_BERT.RBERT(MODEL_NAME)

model.parameters
model.to(device)
Expand Down Expand Up @@ -99,17 +107,17 @@ def train():
scheduler = cfg.train.scheduler,
compute_metrics=compute_metrics, # define metrics function
num_training_steps = 3 * len(train_dataset),
#callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.train.patience, early_stopping_threshold=0.0)],
callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.train.patience, early_stopping_threshold=0.0)],
model_type = cfg.model.type
)

# train model
wandb.watch(model)
trainer.train()
try:
model.save_pretrained(cfg.test.model_dir)
except:
torch.save(model.state_dict(),cfg.test.model_dir)
#try:
# model.save_pretrained(cfg.test.model_dir)
#except:
# torch.save(model.state_dict(),cfg.test.model_dir)

def main():
wandb_cfg = dict()
Expand Down
Loading

0 comments on commit 21819b3

Please sign in to comment.