|
import streamlit as st |
|
from sentence_transformers import SentenceTransformer, util |
|
import joblib |
|
import numpy as np |
|
import requests |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
# Hugging Face Hub id of the fine-tuned sentence-embedding model.
model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"


@st.cache_resource
def _load_sentence_model(name):
    """Return the SentenceTransformer identified by ``name``.

    Streamlit re-executes this script on every widget interaction; wrapping
    the constructor in ``st.cache_resource`` keeps a single model instance
    alive across reruns instead of rebuilding it each time.
    """
    return SentenceTransformer(name)


model = _load_sentence_model(model_name)
|
|
|
# Location of the pickled XGBoost classifier on the Hugging Face Hub.
url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"


@st.cache_resource
def _load_xgb_model(url, local_path="xgboost_best_model.pkl"):
    """Download the pickled classifier from ``url`` and load it with joblib.

    The result is cached with ``st.cache_resource`` so the download and the
    unpickling happen once per server process rather than on every script
    rerun.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            failed download is reported as such instead of surfacing later
            as an unreadable pickle file.
        requests.Timeout: if the server does not respond in time.
    """
    # Bound the wait so a stalled connection cannot hang the app forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    with open(local_path, "wb") as model_file:
        model_file.write(response.content)

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point ``url`` at a source you trust.
    return joblib.load(local_path)


xgb_model = _load_xgb_model(url_xgb_model)
|
|
|
|
|
# Page title shown at the top of the app.
st.title("Paraphrase Detection with SentenceTransformer and XGBoost for Indonesian Sentences")

# Introductory Markdown: what the app does, how to use it, and the reported scores.
intro_markdown = """
This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
and aims to identify if two sentences convey the same meaning.

### How to Use the Application
- Enter two sentences in the input fields provided in Bahasa Indonesia.
- Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
- The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.

### F1-Macro Scores
- **Validation F1-Macro Score**: 79.1%
- **Test F1-Macro Score**: 72.5%
"""
st.write(intro_markdown)
|
|
|
# Interactive section: one free-text field for each sentence of the pair.
st.header("Try It Out!")
sentence1 = st.text_input(label="Enter the first sentence:")
sentence2 = st.text_input(label="Enter the second sentence:")
|
|
|
# Run the paraphrase check only when the user presses the button.
if st.button("Check Paraphrase"):
    # Strip surrounding whitespace so blank-looking input ("   ") is treated
    # as missing instead of being encoded as a sentence.
    first_text = (sentence1 or "").strip()
    second_text = (sentence2 or "").strip()

    if first_text and second_text:
        with st.spinner("Processing..."):
            # Embed each sentence with the SentenceTransformer model.
            embedding1 = model.encode(first_text)
            embedding2 = model.encode(second_text)

            # cosine_similarity works on 2-D inputs and returns a 1x1 matrix
            # here; [0][0] extracts the scalar score.
            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
            st.write(f"Cosine Similarity: {similarity:.4f}")

            # The classifier is given the similarity score as its only
            # feature, as a single-row, single-column array.
            prediction = xgb_model.predict(np.array([[similarity]]))

            # Read the one predicted label explicitly rather than relying on
            # the truth value of a whole array.
            predicted_label = np.asarray(prediction).ravel()[0]
            if predicted_label == 1:
                st.success("The sentences are likely paraphrases of each other.")
            else:
                st.warning("The sentences are not likely to be paraphrases.")
    else:
        st.error("Please enter both sentences to proceed.")
|
|
|
# Sidebar: short background note on how the model was built.
sidebar = st.sidebar
about_text = (
    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
    "The training process focused on maximizing F1-macro scores for both validation and test sets."
)
sidebar.header("About the Model")
sidebar.write(about_text)