"""Streamlit app for Indonesian paraphrase detection.

Two sentences are embedded with a fine-tuned SentenceTransformer, their
cosine similarity is computed, and that single similarity value is passed to
an XGBoost classifier which makes the final paraphrase / not-paraphrase call.
"""

import joblib
import numpy as np
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"
url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"

# Local file the downloaded classifier is written to before being loaded.
XGB_MODEL_PATH = "xgboost_best_model.pkl"
# Upper bound on the classifier download so a stalled connection cannot hang the app.
DOWNLOAD_TIMEOUT_SECONDS = 60

APP_DESCRIPTION = """
This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text, followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs and aims to identify if two sentences convey the same meaning.

### How to Use the Application
- Enter two sentences in the input fields provided.
- Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
- The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.

### F1-Macro Scores
- **Validation F1-Macro Score**: 79.1%
- **Test F1-Macro Score**: 72.5%
"""


@st.cache_resource
def _load_sentence_model(name: str) -> SentenceTransformer:
    """Build the sentence encoder once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the model from being re-instantiated on each rerun.
    """
    return SentenceTransformer(name)


@st.cache_resource
def _load_xgb_model(url: str, path: str):
    """Download the pickled classifier to *path* and load it once per process.

    Args:
        url: HTTP(S) location of the pickled classifier.
        path: Local file the downloaded bytes are written to.

    Returns:
        The object stored in the pickle (used below through ``.predict``).

    Raises:
        requests.RequestException: If the download times out or the server
            answers with an error status.
    """
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    # Fail loudly instead of writing an HTTP error page to disk and
    # attempting to unpickle it.
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point `url` at an artifact you trust.
    return joblib.load(path)


model = _load_sentence_model(model_name)
xgb_model = _load_xgb_model(url_xgb_model, XGB_MODEL_PATH)

# Streamlit UI
st.title("Paraphrase Detection with SentenceTransformer and XGBoost")
st.write(APP_DESCRIPTION)

st.header("Try It Out!")
sentence1 = st.text_input("Enter the first sentence:")
sentence2 = st.text_input("Enter the second sentence:")

if st.button("Check Paraphrase"):
    # Treat whitespace-only input the same as an empty field.
    if sentence1.strip() and sentence2.strip():
        with st.spinner("Processing..."):
            embedding1 = model.encode(sentence1)
            embedding2 = model.encode(sentence2)

            # Compute the cosine similarity between the two embeddings.
            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
            st.write(f"Cosine Similarity: {similarity:.4f}")

            # Use the XGBoost model to predict whether the pair is a
            # paraphrase; the similarity is its only input feature.
            prediction = xgb_model.predict(np.array([[similarity]]))

            # predict() is given one row, so read that row's label explicitly
            # rather than relying on the truth value of a whole array.
            if prediction[0] == 1:
                st.success("The sentences are likely paraphrases of each other.")
            else:
                st.warning("The sentences are not likely to be paraphrases.")
    else:
        st.error("Please enter both sentences to proceed.")

st.sidebar.header("About the Model")
st.sidebar.write(
    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
    "The training process focused on maximizing F1-macro scores for both validation and test sets."
)