czl committed on
Commit
0a56b6f
1 Parent(s): 8b2e68e
Files changed (5)
  1. .gitattributes +2 -0
  2. app.py +352 -0
  3. requirements.txt +10 -0
  4. vocab/idx2word.json +0 -0
  5. vocab/word2idx.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/NormSeq2Seq-188M_epoch35.pt filter=lfs diff=lfs merge=lfs -text
+ models/AttnSeq2Seq-188M_epoch35.pt filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,352 @@
+ import json
+ import random
+ import re
+ import unicodedata
+ from typing import Tuple
+
+ import gradio as gr
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ def greet(name):
+     return "Hello " + name + "!!"
+
+
+ # read word2idx and idx2word from the JSON files
+ with open('vocab/word2idx.json', 'r') as f:
+     word2idx = json.load(f)
+ with open('vocab/idx2word.json', 'r') as f:
+     idx2word = json.load(f)
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ def unicodetoascii(text):
+     """
+     Turn a Unicode string into plain ASCII
+
+     :param text: text to be converted
+     :return: text in ASCII format
+     """
+     normalized_text = unicodedata.normalize('NFKD', str(text))
+     ascii_text = ''.join(char for char in normalized_text if unicodedata.category(char) != 'Mn')
+     return ascii_text
+
+ def preprocess_text(text, fn=unicodetoascii):
+     text = fn(text)
+     text = text.lower()
+     text = re.sub(r'http\S+', '', text)  # remove URLs
+     text = re.sub(r'[^\x00-\x7F]+', "", text)  # remove non-ASCII characters
+     text = re.sub(r"(\w)[!?]+(\w)", r'\1\2', text)  # remove !? between words
+     text = re.sub(r"\s\s+", r" ", text).strip()  # remove extra spaces
+     return text
+
+ def tokenize(text):
+     """
+     Tokenize text
+     :param text: text to be tokenized
+     :return: list of tokens
+     """
+     return text.split()
+
+ def lookup_words(idx2word, indices):
+     """
+     Look up words from indices
+     :param idx2word: index-to-word mapping
+     :param indices: indices to be converted
+     :return: list of words
+     """
+     return [idx2word[str(idx)] for idx in indices]
+
+
+ params = {'input_dim': len(word2idx),
+           'emb_dim': 128,
+           'enc_hid_dim': 256,
+           'dec_hid_dim': 256,
+           'dropout': 0.5,
+           'attn_dim': 32,
+           'teacher_forcing_ratio': 0.5,
+           'epochs': 35}
+
+
+ class Encoder(nn.Module):
+     """
+     GRU RNN Encoder
+     """
+     def __init__(self,
+                  input_dim: int,
+                  emb_dim: int,
+                  enc_hid_dim: int,
+                  dec_hid_dim: int,
+                  dropout: float = 0):
+         super(Encoder, self).__init__()
+
+         # dimension of input (vocabulary size)
+         self.input_dim = input_dim
+         # dimension of embedding layer
+         self.emb_dim = emb_dim
+         # dimension of encoding hidden layer
+         self.enc_hid_dim = enc_hid_dim
+         # dimension of decoding hidden layer
+         self.dec_hid_dim = dec_hid_dim
+
+         # embedding layer used to train embedding representations of the corpus
+         self.embedding = nn.Embedding(input_dim, emb_dim)
+
+         # use a bidirectional GRU for the RNN
+         self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True, batch_first=False, num_layers=1)
+         self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
+         # dropout layer which helps produce a more generalisable model
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         # apply dropout to the embedded input
+         embedded = self.dropout(self.embedding(src))
+         # run the embeddings through the GRU
+         outputs, hidden = self.rnn(embedded)
+         # combine the final forward and backward hidden states into the decoder's initial hidden state
+         hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
+         return outputs, hidden
+
+
+ class Attention(nn.Module):
+     """
+     Luong attention
+     """
+     def __init__(self,
+                  enc_hid_dim: int,
+                  dec_hid_dim: int,
+                  attn_dim: int):
+         super(Attention, self).__init__()
+
+         # dimension of encoding hidden layer
+         self.enc_hid_dim = enc_hid_dim
+         # dimension of decoding hidden layer
+         self.dec_hid_dim = dec_hid_dim
+         self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
+
+         self.attn = nn.Linear(self.attn_in, attn_dim)
+
+     def forward(self,
+                 decoder_hidden: torch.Tensor,
+                 encoder_outputs: torch.Tensor) -> torch.Tensor:
+
+         src_len = encoder_outputs.shape[0]
+         repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
+         encoder_outputs = encoder_outputs.permute(1, 0, 2)
+         # Luong attention
+         energy = torch.tanh(self.attn(torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2)))
+         attention = torch.sum(energy, dim=2)
+
+         return F.softmax(attention, dim=1)
+
+
+ class AttnDecoder(nn.Module):
+     """
+     GRU RNN Decoder with attention
+     """
+     def __init__(self,
+                  output_dim: int,
+                  emb_dim: int,
+                  enc_hid_dim: int,
+                  dec_hid_dim: int,
+                  attention: nn.Module,
+                  dropout: float = 0):
+         super(AttnDecoder, self).__init__()
+
+         # dimension of output layer
+         self.output_dim = output_dim
+         # dimension of embedding layer
+         self.emb_dim = emb_dim
+         # dimension of encoding hidden layer
+         self.enc_hid_dim = enc_hid_dim
+         # dimension of decoding hidden layer
+         self.dec_hid_dim = dec_hid_dim
+         # dropout rate
+         self.dropout = dropout
+         # attention layer
+         self.attention = attention
+
+         # embedding layer used to train embedding representations of the corpus
+         self.embedding = nn.Embedding(output_dim, emb_dim)
+         # use a GRU for the RNN
+         self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
+         self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
+         self.dropout = nn.Dropout(dropout)
+
+     def encode_attention(self,
+                          decoder_hidden: torch.Tensor,
+                          encoder_outputs: torch.Tensor) -> torch.Tensor:
+
+         # attention weights over the source positions
+         a = self.attention(decoder_hidden, encoder_outputs)
+         a = a.unsqueeze(1)
+         encoder_outputs = encoder_outputs.permute(1, 0, 2)
+         # weighted sum of the encoder outputs
+         weighted_encoder_rep = torch.bmm(a, encoder_outputs)
+         weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
+         return weighted_encoder_rep
+
+     def forward(self,
+                 input: torch.Tensor,
+                 decoder_hidden: torch.Tensor,
+                 encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+
+         input = input.unsqueeze(0)
+         # apply dropout to the embedded input
+         embedded = self.dropout(self.embedding(input))
+         weighted_encoder = self.encode_attention(decoder_hidden, encoder_outputs)
+
+         # generate an output and hidden state from the RNN
+         rnn_input = torch.cat((embedded, weighted_encoder), dim=2)
+         output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
+
+         embedded = embedded.squeeze(0)
+         output = output.squeeze(0)
+         weighted_encoder = weighted_encoder.squeeze(0)
+         output = self.out(torch.cat((output, weighted_encoder, embedded), dim=1))
+         return output, decoder_hidden.squeeze(0)
+
+
+ class Decoder(nn.Module):
+     """
+     GRU RNN Decoder without attention
+     """
+     def __init__(self,
+                  output_dim: int,
+                  emb_dim: int,
+                  enc_hid_dim: int,
+                  dec_hid_dim: int,
+                  dropout: float = 0):
+         super(Decoder, self).__init__()
+
+         # dimension of output layer
+         self.output_dim = output_dim
+         # dimension of embedding layer
+         self.emb_dim = emb_dim
+         # dimension of encoding hidden layer
+         self.enc_hid_dim = enc_hid_dim
+         # dimension of decoding hidden layer
+         self.dec_hid_dim = dec_hid_dim
+         # dropout rate
+         self.dropout = dropout
+
+         # embedding layer used to train embedding representations of the corpus
+         self.embedding = nn.Embedding(output_dim, emb_dim)
+         # GRU RNN
+         self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
+         self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self,
+                 input: torch.Tensor,
+                 decoder_hidden: torch.Tensor,
+                 encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+
+         input = input.unsqueeze(0)
+         # apply dropout to the embedded input
+         embedded = self.dropout(self.embedding(input))
+         # use the last encoder output as a fixed context vector
+         context = encoder_outputs[-1, :, :]
+         context = context.repeat(embedded.shape[0], 1, 1)
+         embs_and_context = torch.cat((embedded, context), -1)
+         # generate an output and hidden state from the RNN
+         output, decoder_hidden = self.rnn(embs_and_context, decoder_hidden.unsqueeze(0))
+         embedded = embedded.squeeze(0)
+         output = output.squeeze(0)
+         context = context.squeeze(0)
+         output = self.out(torch.cat((output, embedded, context), -1))
+         return output, decoder_hidden.squeeze(0)
+
+
+ class Seq2Seq(nn.Module):
+     """
+     Seq2Seq model combining an RNN encoder and an RNN decoder
+     """
+     def __init__(self,
+                  encoder: nn.Module,
+                  decoder: nn.Module,
+                  device: torch.device):
+         super(Seq2Seq, self).__init__()
+
+         self.encoder = encoder
+         self.decoder = decoder
+         self.device = device
+
+     def forward(self,
+                 src: torch.Tensor,
+                 trg: torch.Tensor,
+                 teacher_forcing_ratio: float = 0.5) -> torch.Tensor:
+         src = src.transpose(0, 1)  # (max_len, batch_size)
+         trg = trg.transpose(0, 1)  # (max_len, batch_size)
+         batch_size = src.shape[1]
+         max_len = trg.shape[0]
+         trg_vocab_size = self.decoder.output_dim
+
+         outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
+         encoder_outputs, hidden = self.encoder(src)
+
+         # first input to the decoder is the <sos> token
+         output = trg[0, :]
+
+         for t in range(1, max_len):
+             output, hidden = self.decoder(output, hidden, encoder_outputs)
+             outputs[t] = output
+             # feed back either the ground-truth token (teacher forcing) or the model's own prediction
+             teacher_force = random.random() < teacher_forcing_ratio
+             top1 = output.max(1)[1]
+             output = trg[t] if teacher_force else top1
+
+         return outputs
+
+
+ enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
+ attn = Attention(enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], attn_dim=params['attn_dim'])
+ dec = AttnDecoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], attention=attn, dropout=params['dropout'])
+ attn_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
+ # map_location lets the checkpoints load on CPU-only machines as well
+ attn_model.load_state_dict(torch.load('models/AttnSeq2Seq-188M_epoch35.pt', map_location=device))
+ attn_model.to(device)
+
+ enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
+ dec = Decoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'], enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'], dropout=params['dropout'])
+ norm_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
+ norm_model.load_state_dict(torch.load('models/NormSeq2Seq-188M_epoch35.pt', map_location=device))
+ norm_model.to(device)
+
+ models_dict = {'AttentionSeq2Seq-188M': attn_model, 'NormalSeq2Seq-188M': norm_model}
+
+
+ def generate(models_str, sentence, max_len=12, word2idx=word2idx, idx2word=idx2word,
+              device=device, tokenize=tokenize, preprocess_text=preprocess_text,
+              lookup_words=lookup_words, models_dict=models_dict):
+     """
+     Generate a response
+     :param models_str: name of the model to use
+     :param sentence: input sentence
+     :param max_len: maximum length of the generated sequence
+     :param word2idx: word-to-index mapping
+     :param idx2word: index-to-word mapping
+     :return: response
+     """
+     model = models_dict[models_str]
+     model.eval()
+     sentence = preprocess_text(sentence)
+     tokens = tokenize(sentence)
+     # map tokens to indices, falling back to <unk> for out-of-vocabulary words
+     tokens = [word2idx[token] if token in word2idx else word2idx['<unk>'] for token in tokens]
+     tokens = [word2idx['<bos>']] + tokens + [word2idx['<eos>']]
+     tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(1).to(device)
+     outputs = [word2idx['<bos>']]
+     with torch.no_grad():
+         encoder_outputs, hidden = model.encoder(tokens)
+         # greedy decoding, one token at a time, until <eos> or max_len
+         for t in range(max_len):
+             output, hidden = model.decoder(torch.tensor([outputs[-1]], dtype=torch.long).to(device), hidden, encoder_outputs)
+             top1 = output.max(1)[1]
+             outputs.append(top1.item())
+             if top1.item() == word2idx['<eos>']:
+                 break
+     response = lookup_words(idx2word, outputs)
+     return ' '.join(response).replace('<bos>', '').replace('<eos>', '').strip()
+
+
+ demo = gr.Interface(fn=generate,
+                     inputs=[gr.Radio(list(models_dict.keys()), label="Model"),
+                             gr.Textbox(lines=2, label="Input Text")],
+                     outputs=gr.Textbox(label="Output Text"))
+
+
+ if __name__ == "__main__":
+     demo.launch()
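As a quick sanity check, the generate() function added above can also be exercised outside the Gradio UI. The snippet below is a minimal, hypothetical helper (not part of this commit); it assumes the repository is checked out with the checkpoints under models/ and the JSON files under vocab/, and the prompt string is only an illustration:

# local_test.py (hypothetical helper, not included in this commit)
# Importing app runs its module-level code: the vocab files and both checkpoints
# are loaded; demo.launch() stays behind the __main__ guard, so no server starts.
from app import generate, models_dict

if __name__ == "__main__":
    for model_name in models_dict:  # 'AttentionSeq2Seq-188M', 'NormalSeq2Seq-188M'
        reply = generate(model_name, "hello how are you", max_len=12)
        print(f"{model_name}: {reply}")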
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio
+ numpy
+ pandas
+ requests
+ spacy
+ torch
+ torchtext
+ nltk
+ sentence-transformers
+ scipy
vocab/idx2word.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/word2idx.json ADDED
The diff for this file is too large to render. See raw diff