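# Streamlit app: classify Vietnamese news headlines into 8 topic categories
# using a fine-tuned multilingual DistilBERT model.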
import streamlit as st
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf

def convert_label_to_title(label):
    # Map a numeric class ID to its Vietnamese category name.
    convert_dict = {
        0: "SỨC KHỎE",    # HEALTH
        1: "GIÁO DỤC",    # EDUCATION
        2: "THỂ THAO",    # SPORTS
        3: "PHÁP LUẬT",   # LAW
        4: "KHOA HỌC",    # SCIENCE
        5: "DU LỊCH",     # TRAVEL
        6: "GIẢI TRÍ",    # ENTERTAINMENT
        7: "KINH DOANH"   # BUSINESS
    }
    return convert_dict[label]

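# Tokenize the input text, run it through the classifier, and return the predicted
# category name, the full softmax distribution, and the top-class probability.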
def predict_sentence(model, tokenizer, sentence):
    input_data = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)
    logits = model(input_data['input_ids'], attention_mask=input_data['attention_mask']).logits
    probabilities = tf.nn.softmax(logits, axis=1)
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    highest_probability = probabilities.numpy()[0, predicted_class]
    title = convert_label_to_title(predicted_class)
    return title, probabilities.numpy(), highest_probability

def load_model(checkpoint, num_class):
    # Build a sequence-classification head on top of the pretrained checkpoint
    # and load the matching tokenizer.
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_class)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

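# Base multilingual DistilBERT checkpoint with an 8-class head;
# the fine-tuned weights are restored from a local HDF5 file.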
checkpoint = 'distilbert-base-multilingual-cased'
model, tokenizer = load_model(checkpoint, 8)
model.load_weights('best_model_weights.h5')

text = st.text_area('Nhập tiêu đề vào đây')  # "Enter the headline here"

if text:
    title, probabilities, highest = predict_sentence(model, tokenizer, text)
    out = {
        'title': title,
        # Convert NumPy values to plain Python types so st.json can serialize them.
        'prob': probabilities.tolist(),
        'highest_prob': float(highest)
    }
    st.json(out)