After working with the classic Transformer, we decided to modify it so that it also classifies the words (information that we already have in the database), to see whether this extra information could help us.

To do that, our idea is to add a second Transformer decoder that is trained on the word classes.

Word classification trial


class MyTransformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism.

    One encoder feeds two decoders: `decoder` predicts the target words and
    `decoder_cls` predicts the word classes. Each decoder has its own linear
    output projection (`tgt_word_proj` and `cls_word_proj`).
    '''

    def __init__(
            self, n_src_vocab, n_tgt_vocab, n_cls_vocab, n_max_seq, n_layers=6, n_head=8,
            d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
            dropout=0.1, proj_share_weight=True, embs_share_weight=True):
        ''' Build the encoder, the two decoders and their output projections.

        n_src_vocab / n_tgt_vocab / n_cls_vocab are the vocabulary sizes handed
        to the encoder, the word decoder and the class decoder respectively;
        the remaining size arguments are forwarded unchanged to Encoder/Decoder.
        d_k and d_v are accepted but not used by this constructor.

        proj_share_weight ties `tgt_word_proj` to the word decoder embedding.
        embs_share_weight ties the encoder embedding to the word decoder
        embedding and requires n_src_vocab == n_tgt_vocab.
        '''
        super(MyTransformer, self).__init__()

        self.encoder = Encoder(
            n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)

        # Decoder and projection for the target words.
        self.decoder = Decoder(
            n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        # Decoder and projection for the word classes.
        self.decoder_cls = Decoder(
            n_cls_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.cls_word_proj = Linear(d_model, n_cls_vocab, bias=False)

        # The message text (including its run of spaces) is kept exactly as
        # the original line-continued literal produced it.
        assert d_model == d_word_vec, (
            'To facilitate the residual connections, '
            '     the dimensions of all module output shall be the same.')

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def get_trainable_parameters(self):
        ''' Avoid updating the position encoding.

        Returns a generator over every parameter of the model except those of
        the `position_enc` modules of the encoder and of both decoders.
        '''
        frozen_ids = set()
        for module in (self.encoder, self.decoder, self.decoder_cls):
            frozen_ids.update(map(id, module.position_enc.parameters()))
        return (p for p in self.parameters() if id(p) not in frozen_ids)

    def forward(self, src, tgt, cls):
        ''' Run the encoder once and decode both the words and the classes.

        src, tgt and cls are each a (sequence, position) pair.
        Returns (word_logits, class_logits); each is flattened over its first
        two dimensions, so it is 2-D with the vocabulary as the last dimension.
        '''
        src_seq, src_pos = src
        tgt_seq, tgt_pos = tgt
        cls_seq, cls_pos = cls

        # Drop the last position of every decoder input sequence. The training
        # loop uses `[:, 1:]` as the gold sequence, so the output at step t is
        # compared with the token at step t + 1 (next-token prediction).
        tgt_seq = tgt_seq[:, :-1]
        tgt_pos = tgt_pos[:, :-1]

        cls_seq = cls_seq[:, :-1]
        cls_pos = cls_pos[:, :-1]

        enc_output, *_ = self.encoder(src_seq, src_pos)

        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
        seq_logit = self.tgt_word_proj(dec_output)

        dec_cls_output, *_ = self.decoder_cls(cls_seq, cls_pos, src_seq, enc_output)
        # The class decoder must go through its own projection so that the
        # logits range over the class vocabulary, not the target vocabulary.
        seq_cls_logit = self.cls_word_proj(dec_cls_output)

        return (
            seq_logit.view(-1, seq_logit.size(2)),
            seq_cls_logit.view(-1, seq_cls_logit.size(2)),
        )


def train_epoch(model, training_data, crit, optimizer, cls_crit=None):
    ''' Epoch operation in training phase.

    model         -- called as model(src, tgt, cls); must return the pair
                     (word_logits, class_logits).
    training_data -- iterable of (src, tgt, cls) batches.
    crit          -- criterion passed to get_performance for the word loss.
    optimizer     -- must provide zero_grad(), step() and update_learning_rate().
    cls_crit      -- optional criterion for the word-class loss. When given,
                     the class loss is added to the word loss before the
                     backward pass; when None only the word loss is optimised.

    Returns (word loss per non-PAD gold token, correct count per non-PAD gold
    token), where the correct count is the one reported by get_performance.
    '''

    model.train()

    total_loss = 0
    n_total_words = 0
    n_total_correct = 0

    for batch in tqdm(
            training_data, mininterval=2,
            desc='  - (Training)   ', leave=False):

        # prepare data
        src, tgt, cls = batch

        # Gold sequences skip the first position; the model drops the last
        # position of its decoder inputs, so the two line up.
        gold = tgt[0][:, 1:]

        # forward: the model needs the class sequence too and returns both
        # the word logits and the class logits.
        optimizer.zero_grad()
        pred, cls_pred = model(src, tgt, cls)

        # backward
        word_loss, n_correct = get_performance(crit, pred, gold)
        loss = word_loss
        if cls_crit is not None:
            class_gold = cls[0][:, 1:]
            cls_loss, _ = get_performance(cls_crit, cls_pred, class_gold)
            loss = loss + cls_loss
        loss.backward()

        # update parameters
        optimizer.step()
        optimizer.update_learning_rate()

        # note keeping (word-level statistics only, so the returned values
        # keep the same meaning whether or not the class loss is enabled)
        n_words = gold.data.ne(Constants.PAD).sum()
        n_total_words += n_words
        n_total_correct += n_correct
        # NOTE(review): `.data[0]` is the pre-0.4 PyTorch way of reading a
        # scalar loss; newer releases need `.item()` -- confirm target version.
        total_loss += word_loss.data[0]

    return total_loss/n_total_words, n_total_correct/n_total_words

Text Normalization Challenge - English Language [Kaggle]

Project presentation

Jupyter Notebook

Data analysis with pandas

Running

Data Preprocessing and Tokenization

Training

Testing

The code

Word classification trial