Spaces:

czl
/

Seq2Seq

Build error

App Files Files Community

czl committed on Jul 28, 2023

Commit

0a56b6f

•

1 Parent(s): 8b2e68e

add demo

Browse files

Files changed (5) hide show

.gitattributes +2 -0
app.py +352 -0
requirements.txt +10 -0
vocab/idx2word.json +0 -0
vocab/word2idx.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/NormSeq2Seq-188M_epoch35.pt filter=lfs diff=lfs merge=lfs -text
+models/AttnSeq2Seq-188M_epoch35.pt filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import json
+import re
+import unicodedata
+from typing import Tuple
+import gradio as gr
+import torch
+import torch.nn as nn
def greet(name):
    """
    Build a greeting
    :param name: name to greet
    :return: the name wrapped as "Hello <name>!!"
    """
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
# read word2idx and idx2word from json file
# The encoding is given explicitly so the vocabulary is decoded the same way
# on every platform instead of depending on the locale default.
with open('vocab/word2idx.json', 'r', encoding='utf-8') as f:
    word2idx = json.load(f)
with open('vocab/idx2word.json', 'r', encoding='utf-8') as f:
    idx2word = json.load(f)
# use the GPU when torch reports one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def unicodetoascii(text):
    """
    Strip accents from a string
    The value is passed through str(), decomposed with NFKD, and every
    character in Unicode category 'Mn' (non-spacing mark) is dropped.
    Characters that have no decomposition are kept as they are.
    :param text: text to be converted
    :return: text without combining marks
    """
    decomposed = unicodedata.normalize('NFKD', str(text))
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)
def preprocess_text(text, fn=unicodetoascii):
    """
    Clean raw text before tokenisation
    :param text: text to be cleaned
    :param fn: callable applied to the text first (unicodetoascii by default)
    :return: lower-cased text with URLs, non-ASCII characters, '!'/'?' runs
             between word characters and repeated whitespace removed
    """
    text = fn(text)
    text = text.lower()
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'[^\x00-\x7F]+', "", text) # Remove non-ASCII characters
    # Lookarounds test the neighbouring word characters without consuming
    # them, so chained cases such as "a!b!c" are cleaned completely
    text = re.sub(r"(?<=\w)[!?]+(?=\w)", "", text) # Remove !? between words
    text = re.sub(r"\s\s+", r" ", text).strip() # Remove extra spaces
    return text
def tokenize(text):
    """
    Split text into tokens on runs of whitespace
    :param text: text to be tokenized
    :return: list of tokens (empty for empty or whitespace-only text)
    """
    tokens = text.split()
    return tokens
def lookup_words(idx2word, indices):
    """
    Translate indices back into words
    :param idx2word: index to word mapping, keyed by the index as a string
    :param indices: indices to be converted
    :return: list of words, in the order of the indices
    """
    words = []
    for index in indices:
        key = str(index)
        words.append(idx2word[key])
    return words
# Settings used to build the networks; the vocabulary size is taken from the
# loaded word2idx table.
# NOTE(review): 'teacher_forcing_ratio' and 'epochs' are not read anywhere in
# the visible part of this file - presumably kept from training; confirm.
params = dict(
    input_dim=len(word2idx),
    emb_dim=128,
    enc_hid_dim=256,
    dec_hid_dim=256,
    dropout=0.5,
    attn_dim=32,
    teacher_forcing_ratio=0.5,
    epochs=35,
)
class Encoder(nn.Module):
    """
    Single-layer bidirectional GRU encoder
    """
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super().__init__()
        # keep the layer sizes around for inspection
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim
        # lookup table from token index to embedding vector
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # bidirectional GRU, so its outputs are 2 * enc_hid_dim wide
        self.rnn = nn.GRU(input_size=emb_dim,
                          hidden_size=enc_hid_dim,
                          num_layers=1,
                          batch_first=False,
                          bidirectional=True)
        # maps the two concatenated final states to the decoder hidden size
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        # dropout applied to the embeddings
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode a batch of token indices
        :param src: token indices; batch_first=False, so the sequence axis
                    comes first
        :return: the GRU outputs and a hidden state of width dec_hid_dim
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, hidden = self.rnn(embedded)
        # join the last two entries of the final hidden state (the two
        # directions of the single layer) and project them through tanh
        both_directions = torch.cat((hidden[-2], hidden[-1]), dim=1)
        hidden = torch.tanh(self.fc(both_directions))
        return outputs, hidden
class Attention(nn.Module):
    """
    Luong attention
    Scores every encoder position against the current decoder hidden state
    and returns weights that sum to 1 over the source positions.
    """
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super(Attention, self).__init__()
        # dimension of encoding hidden layer
        self.enc_hid_dim = enc_hid_dim
        # dimension of decoding hidden layer
        self.dec_hid_dim = dec_hid_dim
        # width of [decoder hidden ; bidirectional encoder output]
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute attention weights
        :param decoder_hidden: decoder state, (batch, dec_hid_dim)
        :param encoder_outputs: encoder outputs, (src_len, batch, enc_hid_dim * 2)
        :return: weights of shape (batch, src_len); each row sums to 1
        """
        src_len = encoder_outputs.shape[0]
        # copy the decoder state once per source position
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        # (src_len, batch, feat) -> (batch, src_len, feat)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2)))
        attention = torch.sum(energy, dim=2)
        # torch.softmax is used because the file never imports
        # torch.nn.functional under the name F
        return torch.softmax(attention, dim=1)
class AttnDecoder(nn.Module):
    """
    GRU RNN Decoder with attention
    """
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attention: nn.Module,
                 dropout: float = 0):
        super(AttnDecoder, self).__init__()
        # dimension of output layer
        self.output_dim = output_dim
        # dimension of embedding layer
        self.emb_dim = emb_dim
        # dimension of encoding hidden layer
        self.enc_hid_dim = enc_hid_dim
        # dimension of decoding hidden layer
        self.dec_hid_dim = dec_hid_dim
        # attention layer; it must expose attn_in, which sizes the output layer
        self.attention = attention
        # create embedding layer use to train embedding representations of the corpus
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # use GRU for RNN
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        # dropout layer (the raw rate is not kept as a separate attribute)
        self.dropout = nn.Dropout(dropout)

    def encode_attention(self,
                              decoder_hidden: torch.Tensor,
                              encoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Attention-weighted sum of the encoder outputs
        :param decoder_hidden: decoder state handed to the attention layer
        :param encoder_outputs: encoder outputs, (src_len, batch, enc_hid_dim * 2)
        :return: weighted encoder representation, (1, batch, enc_hid_dim * 2)
        """
        a = self.attention(decoder_hidden, encoder_outputs)
        # (batch, src_len) -> (batch, 1, src_len) for the batched matmul
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
        return weighted_encoder_rep

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run one decoding step
        :param input: previous token indices, (batch,)
        :param decoder_hidden: previous decoder state, (batch, dec_hid_dim)
        :param encoder_outputs: encoder outputs, (src_len, batch, enc_hid_dim * 2)
        :return: scores over the vocabulary (batch, output_dim) and the new
                 decoder state (batch, dec_hid_dim)
        """
        # add a sequence axis of length 1
        input = input.unsqueeze(0)
        # apply dropout to embedding layer
        embedded = self.dropout(self.embedding(input))
        weighted_encoder = self.encode_attention(decoder_hidden, encoder_outputs)
        # generate an output and hidden layer from the rnn
        rnn_input = torch.cat((embedded, weighted_encoder), dim=2)
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder = weighted_encoder.squeeze(0)
        output = self.out(torch.cat((output, weighted_encoder, embedded), dim=1))
        return output, decoder_hidden.squeeze(0)
class Decoder(nn.Module):
    """
    GRU RNN Decoder without attention
    """
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super(Decoder, self).__init__()
        # dimension of output layer
        self.output_dim = output_dim
        # dimension of embedding layer
        self.emb_dim = emb_dim
        # dimension of encoding hidden layer
        self.enc_hid_dim = enc_hid_dim
        # dimension of decoding hidden layer
        self.dec_hid_dim = dec_hid_dim
        # create embedding layer use to train embedding representations of the corpus
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU RNN
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        # dropout layer (the raw rate is not kept as a separate attribute)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor
                                                        , torch.Tensor]:
        """
        Run one decoding step
        :param input: previous token indices, (batch,)
        :param decoder_hidden: previous decoder state, (batch, dec_hid_dim)
        :param encoder_outputs: encoder outputs, (src_len, batch, enc_hid_dim * 2)
        :return: scores over the vocabulary (batch, output_dim) and the new
                 decoder state (batch, dec_hid_dim)
        """
        # add a sequence axis of length 1
        input = input.unsqueeze(0)
        # apply dropout to embedding layer
        embedded = self.dropout(self.embedding(input))
        # the encoder output at the last source position is the fixed context
        context = encoder_outputs[-1,:,:]
        context = context.repeat(embedded.shape[0], 1, 1)
        embs_and_context = torch.cat((embedded, context), -1)
        # generate an output and hidden layer from the rnn
        output, decoder_hidden = self.rnn(embs_and_context, decoder_hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        context = context.squeeze(0)
        output = self.out(torch.cat((output, embedded, context), -1))
        return output, decoder_hidden.squeeze(0)
class Seq2Seq(nn.Module):
    """
    Seq-2-Seq model combining RNN encoder and RNN decoder
    """
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: torch.Tensor,
                trg: torch.Tensor,
                teacher_forcing_ratio: float = 0.5) -> torch.Tensor:
        """
        Decode trg step by step, optionally feeding back the ground truth
        :param src: source token indices, (batch_size, max_len)
        :param trg: target token indices, (batch_size, max_len)
        :param teacher_forcing_ratio: probability, per step, of feeding the
                                      target token instead of the prediction
        :return: scores of shape (trg max_len, batch_size, decoder.output_dim);
                 row 0 stays zero because decoding starts at step 1
        """
        # Imported here because the file's import block does not provide it.
        import random

        src = src.transpose(0, 1) # (max_len, batch_size)
        trg = trg.transpose(0, 1) # (max_len, batch_size)
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=self.device)
        encoder_outputs, hidden = self.encoder(src)
        # first input to the decoder is the <sos> token
        output = trg[0,:]
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            # index of the highest-scoring token for every batch element
            top1 = output.max(1)[1]
            output = trg[t] if teacher_force else top1
        return outputs
# Build the attention model and restore its trained weights.
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint written on a GPU machine still loads on a CPU-only host.
enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
attn = Attention(enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], attn_dim=params['attn_dim'])
dec = AttnDecoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], attention=attn, dropout=params['dropout'])
attn_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
attn_model.load_state_dict(torch.load('models/AttnSeq2Seq-188M_epoch35.pt', map_location=device))
attn_model.to(device)
# Build the plain (no attention) model the same way; enc and dec are rebound
# to fresh modules so the two models share no layers.
enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
dec = Decoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
norm_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
norm_model.load_state_dict(torch.load('models/NormSeq2Seq-188M_epoch35.pt', map_location=device))
norm_model.to(device)
# display name -> model
models_dict = {'AttentionSeq2Seq-188M': attn_model, 'NormalSeq2Seq-188M': norm_model}
def generate(models_str, sentence, max_len=12, word2idx=word2idx, idx2word=idx2word,
             device=device, tokenize=tokenize, preprocess_text=preprocess_text,
             lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate response
    :param models_str: key of the model to use in models_dict
    :param sentence: sentence
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize: callable splitting the cleaned sentence into tokens
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable turning indices back into words
    :param models_dict: mapping from model name to model
    :return: response
    """
    model = models_dict[models_str]
    model.eval()
    bos_idx = word2idx['<bos>']
    eos_idx = word2idx['<eos>']
    words = tokenize(preprocess_text(sentence))
    # out-of-vocabulary words fall back to the <unk> index
    indices = [word2idx[word] if word in word2idx else word2idx['<unk>'] for word in words]
    indices = [bos_idx] + indices + [eos_idx]
    # column vector: one sequence, batch size 1
    tokens = torch.tensor(indices, dtype=torch.long).unsqueeze(1).to(device)
    outputs = [bos_idx]
    # The whole inference pass, decoding loop included, runs without autograd
    # so no graph is recorded for the generated steps.
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(tokens)
        for _ in range(max_len):
            previous = torch.tensor([outputs[-1]], dtype=torch.long).to(device)
            output, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy decoding: keep the highest-scoring token
            top1 = output.max(1)[1].item()
            outputs.append(top1)
            if top1 == eos_idx:
                break
    response = lookup_words(idx2word, outputs)
    return ' '.join(response).replace('<bos>', '').replace('<eos>', '').strip()
# Gradio front end: a radio button selects the model by name, a two-line
# text box takes the input sentence, and the reply is shown in a text box.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Radio(list(models_dict), label="Model"),
        gr.Textbox(lines=2, label="Input Text"),
    ],
    outputs=gr.Textbox(label="Output Text"),
)
# start the app only when the file is run as a script
if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio
+numpy
+pandas
+requests
+spacy
+torch
+torchtext
+nltk
+sentence-transformers
+scipy

vocab/idx2word.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab/word2idx.json ADDED Viewed

The diff for this file is too large to render. See raw diff