# File size: 2,726 Bytes
# 393182f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import streamlit as st
from sentence_transformers import SentenceTransformer, util
import joblib
import numpy as np
import requests
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face Hub identifier of the fine-tuned sentence-embedding model.
model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"


@st.cache_resource
def _load_sentence_model(name):
    """Build the SentenceTransformer identified by *name*.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded model as a resource means it is constructed once per
    server process instead of on every rerun.
    """
    return SentenceTransformer(name)


model = _load_sentence_model(model_name)

# Location of the pickled XGBoost classifier on the Hugging Face Hub.
url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"


@st.cache_resource
def _load_xgb_model(url, path="xgboost_best_model.pkl"):
    """Download the pickled classifier from *url*, save it to *path*, load it.

    Cached as a Streamlit resource so the download and deserialization run
    once per server process rather than on every script rerun.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never written to disk and handed to joblib.
        requests.Timeout: if the server does not respond in time.
    """
    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    # SECURITY NOTE: joblib.load unpickles the file, which can execute
    # arbitrary code. This is only acceptable while the URL above points to a
    # trusted repository; consider pinning a revision or verifying a checksum.
    return joblib.load(path)


xgb_model = _load_xgb_model(url_xgb_model)

# Streamlit UI: page title followed by the introductory description, usage
# instructions and the reported F1-macro scores.
st.title("Paraphrase Detection with SentenceTransformer and XGBoost")

# Kept as a single markdown string; it is passed unchanged to st.write.
_intro_markdown = """
    This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
    followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
    and aims to identify if two sentences convey the same meaning.

    ### How to Use the Application
    - Enter two sentences in the input fields provided.
    - Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
    - The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.

    ### F1-Macro Scores
    - **Validation F1-Macro Score**: 79.1%
    - **Test F1-Macro Score**: 72.5%
    """
st.write(_intro_markdown)

# Interactive section: collect a sentence pair and classify it on demand.
st.header("Try It Out!")
sentence1 = st.text_input("Enter the first sentence:")
sentence2 = st.text_input("Enter the second sentence:")

if st.button("Check Paraphrase"):
    # Validate the stripped text so that whitespace-only input is rejected
    # instead of being embedded as if it were a real sentence.
    first_text = sentence1.strip()
    second_text = sentence2.strip()

    if first_text and second_text:
        with st.spinner("Processing..."):
            embedding1 = model.encode(first_text)
            embedding2 = model.encode(second_text)

            # Compute the cosine similarity between the two embeddings.
            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
            st.write(f"Cosine Similarity: {similarity:.4f}")

            # Use the XGBoost model to predict whether the pair is a
            # paraphrase; the similarity is passed as a 1x1 feature matrix.
            prediction = xgb_model.predict(np.array([[similarity]]))
            # Reduce the result to its single label so the branch below does
            # not depend on the truth value of an array comparison.
            predicted_label = np.asarray(prediction).ravel()[0]
            if predicted_label == 1:
                st.success("The sentences are likely paraphrases of each other.")
            else:
                st.warning("The sentences are not likely to be paraphrases.")
    else:
        st.error("Please enter both sentences to proceed.")

# Sidebar: short description of the model behind the app.
_about_model_text = (
    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
    "The training process focused on maximizing F1-macro scores for both validation and test sets."
)

st.sidebar.header("About the Model")
st.sidebar.write(_about_model_text)