4
File size: 2,798 Bytes
e922fe3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2981a00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from langdetect import detect
from newspaper import Article
from PIL import Image
import streamlit as st
import requests
import torch


st.markdown("## Prediction of Fakeness by Given URL")
background = Image.open('logo.jpg')
st.image(background)

st.markdown(f"### Article URL")
text = st.text_area("Insert some url here", 
        value="https://en.globes.co.il/en/article-yandex-looks-to-expand-activities-in-israel-1001406519")

@st.cache(allow_output_mutation=True)
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.load_state_dict(torch.load('./model.pth', map_location='cpu'))

    model_name_translator = "facebook/wmt19-ru-en"
    tokenizer_translator = FSMTTokenizer.from_pretrained(model_name_translator)
    model_translator = FSMTForConditionalGeneration.from_pretrained(model_name_translator)
    model_translator.eval()
    return model, tokenizer, model_translator, tokenizer_translator

model, tokenizer, model_translator, tokenizer_translator = get_models_and_tokenizers()

article = Article(text)
article.download()
article.parse()
concated_text = article.title + '. ' + article.text
lang = detect(concated_text)

st.markdown(f"### Language detection")

if lang == 'ru':
    st.markdown(f"The language of this article is {lang.upper()} so we translated it!")
    with st.spinner('Waiting for translation'):
        input_ids = tokenizer_translator.encode(concated_text, 
            return_tensors="pt", max_length=512, truncation=True)
        outputs = model_translator.generate(input_ids)
        decoded = tokenizer_translator.decode(outputs[0], skip_special_tokens=True)
        st.markdown("### Translated Text")
        st.markdown(f"{decoded[:777]}")
        concated_text = decoded
else:
    st.markdown(f"The language of this article for sure:  {lang.upper()}!")

    st.markdown("### Extracted Text")
    st.markdown(f"{concated_text[:777]}")

tokens_info = tokenizer(concated_text, truncation=True, return_tensors="pt")
with torch.no_grad():
    raw_predictions = model(**tokens_info)
softmaxed = int(torch.nn.functional.softmax(raw_predictions.logits[0], dim=0)[1] * 100)
st.markdown("### Fakeness Prediction")
st.progress(softmaxed)
st.markdown(f"This is fake by **{softmaxed}%**!")
if (softmaxed > 70):
    st.error('We would not trust this text!')
elif (softmaxed > 40):
    st.warning('We are not sure about this text!')
else:
    st.success('We would trust this text!')