https://drive.google.com/open?id=1XvsLropEW1TX9BlitafjH7bBYsBF6O3spm7-Bz0G9XQ
jupyter notebook text_norm_challenge.ipynb
python analysis/dataAnalysis.py
From words to sentences
print ("Hello from data preprocess")
import csv
db_file = 'data/kaggle_norm_competition/en_train.csv' # Here you should put the path to the file you want to change
outputFile = open ("outputTrainDST", 'a')
with open (db_file, 'r') as f:
reader = csv.reader(f)
counter = 0
pastPhrase = 0
reader.__next__()
phraseString = ''
for row in reader:
if pastPhrase == int (row[0]):
#We are still on the current phrase
phraseString += row[4] + ' '
else:
print (phraseString, file = outputFile)
phraseString = ""
phraseString += row[4] + ' '
counter += 1
if counter == 20000:
break
pastPhrase = int (row[0])
print (counter)
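The script above only extracts the normalized side (the 'after' column, row[4]) into outputTrainDST. The source side used later as linesTrainSRC needs the raw tokens from the 'before' column; a minimal companion sketch, assuming the same CSV layout (sentence_id, token_id, class, before, after) and the hypothetical output name outputTrainSRC:

import csv

db_file = 'data/kaggle_norm_competition/en_train.csv'
outputFile = open("outputTrainSRC", 'a')  # hypothetical name for the source side

with open(db_file, 'r') as f:
    reader = csv.reader(f)
    reader.__next__()  # skip the CSV header
    counter = 0
    pastPhrase = 0
    phraseString = ''
    for row in reader:
        if pastPhrase == int(row[0]):
            phraseString += row[3] + ' '  # row[3] is the raw 'before' token
        else:
            print(phraseString, file=outputFile)
            phraseString = row[3] + ' '
            counter += 1
            if counter == 20000:
                break
        pastPhrase = int(row[0])

outputFile.close()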
Phrases tokenization
for l in en de; do for f in data/kaggle_norm_competition/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; done; done
for l in en de; do for f in data/kaggle_norm_competition/*.$l; do perl tokenizer.perl -a -no-escape -l $l -q < $f > $f.atok; done; done
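The preprocess.py step below expects separate training and validation files (linesTrainSRC/linesTrainDST and linesValSRC/linesValDST), while the extraction script produces a single file per side. A minimal way to split them, assuming line-aligned files, hypothetical '.all' input names and a hold-out of the last 2,000 sentences:

# Hypothetical split: keep the last 2,000 line-aligned sentences for validation.
n_val = 2000

for side in ('SRC', 'DST'):
    with open('data/kaggle_norm_competition/linesTrain' + side + '.all') as f:
        lines = f.readlines()
    with open('data/kaggle_norm_competition/linesTrain' + side, 'w') as f:
        f.writelines(lines[:-n_val])
    with open('data/kaggle_norm_competition/linesVal' + side, 'w') as f:
        f.writelines(lines[-n_val:])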
python preprocess.py -train_src data/kaggle_norm_competition/linesTrainSRC -train_tgt data/kaggle_norm_competition/linesTrainDST -valid_src data/kaggle_norm_competition/linesValSRC -valid_tgt data/kaggle_norm_competition/linesValDST -save_data data/kaggle_norm_competition/train_kaggle2transformer.atok.low.pt
python train.py -data data/kaggle_norm_competition/train_kaggle2transformer.atok.low.pt -save_model trained -save_mode best -proj_share_weight
python translate.py -model trained.chkpt -vocab data/kaggle_norm_competition/train_kaggle2transformer.atok.low.pt -src data/kaggle_norm_competition/linesTest
After working with the classic Transformer, we decided to try some changes: the dataset already labels every token with a class, so we wanted to predict that class as well and see whether this extra information helps.
To do that, our idea is to add a second Transformer decoder that is trained to predict the word class.
The new Transformer model is:
class MyTransformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism. '''

    def __init__(
            self, n_src_vocab, n_tgt_vocab, n_cls_vocab, n_max_seq, n_layers=6, n_head=8,
            d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
            dropout=0.1, proj_share_weight=True, embs_share_weight=True):

        super(MyTransformer, self).__init__()
        self.encoder = Encoder(
            n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.decoder = Decoder(
            n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        # Second decoder: predicts the class of each token instead of its normalized form
        self.decoder_cls = Decoder(
            n_cls_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.cls_word_proj = Linear(d_model, n_cls_vocab, bias=False)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, ' \
            'the dimensions of all module outputs shall be the same.'

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def get_trainable_parameters(self):
        ''' Avoid updating the position encoding '''
        enc_freezed_param_ids = set(map(id, self.encoder.position_enc.parameters()))
        dec_freezed_param_ids = set(map(id, self.decoder.position_enc.parameters()))
        dec_freezed_param_ids_cls = set(map(id, self.decoder_cls.position_enc.parameters()))
        freezed_param_ids = enc_freezed_param_ids | dec_freezed_param_ids | dec_freezed_param_ids_cls
        return (p for p in self.parameters() if id(p) not in freezed_param_ids)
    def forward(self, src, tgt, cls):
        src_seq, src_pos = src
        tgt_seq, tgt_pos = tgt
        cls_seq, cls_pos = cls

        # Drop the last position: the decoder input is the gold sequence shifted right
        # (teacher forcing), so the gold used in the loss is tgt[:, 1:].
        tgt_seq = tgt_seq[:, :-1]
        tgt_pos = tgt_pos[:, :-1]
        cls_seq = cls_seq[:, :-1]
        cls_pos = cls_pos[:, :-1]

        enc_output, *_ = self.encoder(src_seq, src_pos)
        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
        seq_logit = self.tgt_word_proj(dec_output)

        # The class decoder attends to the same encoder output but projects onto the class vocabulary
        dec_cls_output, *_ = self.decoder_cls(cls_seq, cls_pos, src_seq, enc_output)
        seq_cls_logit = self.cls_word_proj(dec_cls_output)

        return seq_logit.view(-1, seq_logit.size(2)), seq_cls_logit.view(-1, seq_cls_logit.size(2))
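For reference, a minimal instantiation sketch; the sizes below are hypothetical placeholders (in practice n_src_vocab, n_tgt_vocab and n_cls_vocab come from the dictionaries built by preprocess.py):

# Hypothetical sizes; the real values come from the vocabularies saved by preprocess.py.
model = MyTransformer(
    n_src_vocab=32000,
    n_tgt_vocab=32000,
    n_cls_vocab=20,            # the Kaggle class column has a small, fixed label set
    n_max_seq=52,
    embs_share_weight=False)   # share embeddings only if src/tgt use one common vocabulary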
The training loop has to change as well, since the class decoder adds a second prediction (and a second term to the loss):
def train_epoch(model, training_data, crit, crit_cls, optimizer):
    ''' Epoch operation in training phase'''
    # Note: crit_cls (a criterion built for the class vocabulary) is our addition;
    # the original script only receives the word-level criterion crit.

    model.train()

    total_loss = 0
    n_total_words = 0
    n_total_correct = 0

    for batch in tqdm(
            training_data, mininterval=2,
            desc='  - (Training)   ', leave=False):

        # prepare data
        src, tgt, cls = batch
        gold = tgt[0][:, 1:]
        class_gold = cls[0][:, 1:]

        # forward
        optimizer.zero_grad()
        pred, cls_pred = model(src, tgt, cls)

        # backward
        loss, n_correct = get_performance(crit, pred, gold)
        cls_loss, _ = get_performance(crit_cls, cls_pred, class_gold)
        loss = loss + cls_loss  # equal weighting of the two terms is an assumption
        loss.backward()

        # update parameters
        optimizer.step()
        optimizer.update_learning_rate()

        # note keeping
        n_words = gold.data.ne(Constants.PAD).sum()
        n_total_words += n_words
        n_total_correct += n_correct
        total_loss += loss.data[0]

    return total_loss/n_total_words, n_total_correct/n_total_words
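train_epoch now needs a second criterion for the class vocabulary (crit_cls above is our addition). A minimal sketch for building both criteria, assuming the same PAD-ignoring cross-entropy that is used for the word vocabulary:

import torch
import torch.nn as nn
import transformer.Constants as Constants

def build_criterion(vocab_size):
    # Cross-entropy with zero weight on the PAD index, summed over tokens.
    weight = torch.ones(vocab_size)
    weight[Constants.PAD] = 0
    return nn.CrossEntropyLoss(weight, size_average=False)

# Hypothetical sizes; in practice they come from the preprocessed data dictionaries.
crit = build_criterion(32000)   # word-level criterion
crit_cls = build_criterion(20)  # class-level criterion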