Spaces:

NimaKL
/

spamd

Build error

App Files Files Community

NimaKL commited on Oct 3, 2022

Commit

00addfe

•

1 Parent(s): 2ea92ac

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -54

app.py CHANGED Viewed

@@ -9,66 +9,55 @@ with col1:
     st.title("Spamd: Turkish Spam Detector")
     st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers         technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
-    if st.button('Load Model', disabled=False):
-        with st.spinner('Wait for it...'):
-            import torch
-            import numpy as np
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
-            from transformers import AutoModel
-            model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
-            token_id = []
-            attention_masks = []
-            def preprocessing(input_text, tokenizer):
-              '''
               Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
                 - input_ids: list of token ids
                 - token_type_ids: list of token type ids
                 - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
-              '''
-              return tokenizer.encode_plus(
-                                    input_text,
-                                    add_special_tokens = True,
-                                    max_length = 32,
-                                    pad_to_max_length = True,
-                                    return_attention_mask = True,
-                                    return_tensors = 'pt'
-                               )
-            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-            #Used for printing the name if the variables. Removing it will not intrupt the project.
-            st.success("Model Loaded!")
-            def predict(new_sentence):
-                # We need Token IDs and Attention Mask for inference on the new sentence
-                test_ids = []
-                test_attention_mask = []
-                # Apply the tokenizer
-                encoding = preprocessing(new_sentence, tokenizer)
-                # Extract IDs and Attention Mask
-                test_ids.append(encoding['input_ids'])
-                test_attention_mask.append(encoding['attention_mask'])
-                test_ids = torch.cat(test_ids, dim = 0)
-                test_attention_mask = torch.cat(test_attention_mask, dim = 0)
-                # Forward pass, calculate logit predictions
                 with torch.no_grad():
                     output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
-                prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
-                pred = 'Predicted Class: '+ prediction
-                with col2:
-                    st.header(pred)
-                    text = st.text_input("Enter the text you'd like to analyze for spam.")
-                    if text or st.button('Analyze'):
-                        predict(text)

     st.title("Spamd: Turkish Spam Detector")
     st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers         technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
+if st.button('Load Model', disabled=False):
+    with st.spinner('Wait for it...'):
+    import torch
+    import numpy as np
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+    from transformers import AutoModel
+    model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
+    token_id = []
+    attention_masks = []
+    def preprocessing(input_text, tokenizer):
+    '''
               Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
                 - input_ids: list of token ids
                 - token_type_ids: list of token type ids
                 - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
+    '''
+        return tokenizer.encode_plus(
+        input_text,
+        add_special_tokens = True,
+        max_length = 32,
+        pad_to_max_length = True,
+        return_attention_mask = True,
+        return_tensors = 'pt'
+        )
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    st.success("Model Loaded!")
+        def predict(new_sentence):
+            # We need Token IDs and Attention Mask for inference on the new sentence
+            test_ids = []
+            test_attention_mask = []
+            # Apply the tokenizer
+            encoding = preprocessing(new_sentence, tokenizer)
+            # Extract IDs and Attention Mask
+            test_ids.append(encoding['input_ids'])
+            test_attention_mask.append(encoding['attention_mask'])
+            test_ids = torch.cat(test_ids, dim = 0)
+            test_attention_mask = torch.cat(test_attention_mask, dim = 0)
+            # Forward pass, calculate logit predictions
                 with torch.no_grad():
                     output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
+                    prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
+                    pred = 'Predicted Class: '+ prediction
+                    with col2:
+                        st.header(pred)
+                        text = st.text_input("Enter the text you'd like to analyze for spam.")
+                        if text or st.button('Analyze'):
+                            predict(text)