chukbert committed on
Commit
393182f
1 Parent(s): 2174f13

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ import joblib
4
+ import numpy as np
5
+ import requests
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
# Hugging Face Hub identifier of the fine-tuned sentence-embedding model.
model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"
# Remote location of the pickled XGBoost classifier.
url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"


@st.cache_resource
def _load_sentence_model(name):
    """Instantiate the SentenceTransformer identified by ``name``.

    Cached because Streamlit re-executes the script on every widget
    interaction; without the cache the model would be rebuilt on each click.
    """
    return SentenceTransformer(name)


@st.cache_resource
def _load_xgb_model(url, local_path="xgboost_best_model.pkl"):
    """Download the pickled classifier from ``url`` and load it with joblib.

    Args:
        url: HTTP(S) address of the pickle file.
        local_path: Where the downloaded bytes are written before loading.

    Returns:
        The object stored in the pickle.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
    """
    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of writing an error page to disk and
    # letting joblib choke on it with an unrelated unpickling error.
    response.raise_for_status()
    with open(local_path, "wb") as f:
        f.write(response.content)
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point url_xgb_model at a source you trust.
    return joblib.load(local_path)


model = _load_sentence_model(model_name)
xgb_model = _load_xgb_model(url_xgb_model)
16
+
17
# Page header: title followed by a markdown description of the app,
# usage instructions, and the reported F1-macro scores.
page_title = "Paraphrase Detection with SentenceTransformer and XGBoost"
intro_markdown = """
This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
and aims to identify if two sentences convey the same meaning.

### How to Use the Application
- Enter two sentences in the input fields provided.
- Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
- The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.

### F1-Macro Scores
- **Validation F1-Macro Score**: 79.1%
- **Test F1-Macro Score**: 72.5%
"""

st.title(page_title)
st.write(intro_markdown)
35
+
36
# Interactive section: collect two sentences and classify the pair on demand.
st.header("Try It Out!")
sentence1 = st.text_input("Enter the first sentence:")
sentence2 = st.text_input("Enter the second sentence:")

if st.button("Check Paraphrase"):
    # Whitespace-only input carries no sentence, so treat it as missing
    # rather than embedding an effectively empty string.
    first_text = sentence1.strip()
    second_text = sentence2.strip()

    if first_text and second_text:
        with st.spinner("Processing..."):
            embedding1 = model.encode(first_text)
            embedding2 = model.encode(second_text)

            # Compute the cosine similarity between the two embeddings.
            similarity = cosine_similarity([embedding1], [embedding2])[0][0]

            # The XGBoost classifier receives the similarity as its single
            # feature and decides whether the pair is a paraphrase.
            prediction = xgb_model.predict(np.array([[similarity]]))

        st.write(f"Cosine Similarity: {similarity:.4f}")
        # predict() returns one label per input row; test the single label
        # explicitly instead of relying on the truth value of an array.
        if prediction[0] == 1:
            st.success("The sentences are likely paraphrases of each other.")
        else:
            st.warning("The sentences are not likely to be paraphrases.")
    else:
        st.error("Please enter both sentences to proceed.")
58
+
59
# Sidebar: short description of the model behind the app.
about_text = (
    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
    "The training process focused on maximizing F1-macro scores for both validation and test sets."
)

st.sidebar.header("About the Model")
st.sidebar.write(about_text)