Spaces:

chukbert
/

indo-paraphrase-detection

Running

App Files Files Community

chukbert committed on 19 days ago

Commit

393182f

•

1 Parent(s): 2174f13

Create app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import streamlit as st
+from sentence_transformers import SentenceTransformer, util
+import joblib
+import numpy as np
+import requests
+from sklearn.metrics.pairwise import cosine_similarity
+model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"  # identifier of the sentence-embedding model handed to SentenceTransformer
+model = SentenceTransformer(model_name)  # runs at module top level; NOTE(review): presumably reloaded on every Streamlit rerun -- consider caching
+url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"
+response = requests.get(url_xgb_model)  # NOTE(review): no timeout and no HTTP status check before the body is used
+with open("xgboost_best_model.pkl", "wb") as f:
+    f.write(response.content)  # the response body is written as-is, whatever the request returned
+xgb_model = joblib.load("xgboost_best_model.pkl")  # NOTE(review): deserializes a downloaded pickle; only safe if the URL above is trusted
+# Streamlit UI: page title followed by an introductory description of the app
+st.title("Paraphrase Detection with SentenceTransformer and XGBoost")
+st.write(  # the text below uses Markdown headings, bullets and bold markers
+    """
+    This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
+    followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
+    and aims to identify if two sentences convey the same meaning.
+    ### How to Use the Application
+    - Enter two sentences in the input fields provided.
+    - Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
+    - The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.
+    ### F1-Macro Scores
+    - **Validation F1-Macro Score**: 79.1%
+    - **Test F1-Macro Score**: 72.5%
+    """
+)
+st.header("Try It Out!")  # interactive section: two text inputs and a button that triggers the check
+sentence1 = st.text_input("Enter the first sentence:")
+sentence2 = st.text_input("Enter the second sentence:")
+if st.button("Check Paraphrase"):
+    if sentence1 and sentence2:  # proceed only when both inputs are non-empty
+        with st.spinner("Processing..."):
+            embedding1 = model.encode(sentence1)
+            embedding2 = model.encode(sentence2)
+            # Compute the cosine similarity between the two sentence embeddings
+            similarity = cosine_similarity([embedding1], [embedding2])[0][0]  # single row vs. single row, so [0][0] picks the one score
+            st.write(f"Cosine Similarity: {similarity:.4f}")
+            # Use the XGBoost model to predict whether the pair is a paraphrase; the similarity score is the only feature passed
+            prediction = xgb_model.predict(np.array([[similarity]]))
+            if prediction == 1:  # a prediction of 1 is reported as "paraphrase"; anything else as "not paraphrase"
+                st.success("The sentences are likely paraphrases of each other.")
+            else:
+                st.warning("The sentences are not likely to be paraphrases.")
+    else:
+        st.error("Please enter both sentences to proceed.")
+st.sidebar.header("About the Model")  # sidebar section with a short description of the model
+st.sidebar.write(
+    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
+    "The training process focused on maximizing F1-macro scores for both validation and test sets."  # adjacent string literals are joined into a single message
+)