Spaces:

chukbert
/

indo-paraphrase-detection

Running

App Files Files Community

indo-paraphrase-detection / app.py

chukbert

Create app.py

393182f verified 19 days ago

raw

history blame

2.73 kB

	import streamlit as st
	from sentence_transformers import SentenceTransformer, util
	import joblib
	import numpy as np
	import requests
	from sklearn.metrics.pairwise import cosine_similarity

	# Hugging Face Hub id of the fine-tuned sentence-embedding model.
	model_name = "chukbert/paraphrase-multilingual-MiniLM-L12-v2-MSRP-Indo-finetuned-2-epoch"
	# Direct download URL of the pickled XGBoost classifier.
	url_xgb_model = "https://huggingface.co/chukbert/xgb-msrp-indo/resolve/main/xgboost_best_model.pkl"


	@st.cache_resource
	def _load_sentence_model(name):
	    """Build the SentenceTransformer for `name` once and reuse it.

	    Streamlit re-executes this script on every widget interaction; without
	    the cache the model would be re-instantiated on each rerun.
	    """
	    return SentenceTransformer(name)


	@st.cache_resource
	def _load_xgb_model(url, path="xgboost_best_model.pkl"):
	    """Download the pickled classifier from `url` to `path` and load it.

	    Raises:
	        requests.HTTPError: if the server answers with an error status, so
	            an HTML error page is never written to disk and unpickled.
	        requests.Timeout: if the server does not respond within the timeout.
	    """
	    response = requests.get(url, timeout=60)
	    response.raise_for_status()
	    with open(path, "wb") as f:
	        f.write(response.content)
	    # NOTE(review): joblib.load unpickles the file, which can execute
	    # arbitrary code. This is only acceptable while the URL above is trusted.
	    return joblib.load(path)


	model = _load_sentence_model(model_name)
	xgb_model = _load_xgb_model(url_xgb_model)

	# Streamlit UI: page title followed by the introductory text.
	st.title("Paraphrase Detection with SentenceTransformer and XGBoost")

	# Overview, usage steps and reported scores, held in a named variable so the
	# render call below stays short.
	intro_text = """
	This application uses a fine-tuned SentenceTransformer model for detecting paraphrases in Indonesian text,
	followed by an XGBoost classifier for final prediction. The model was trained on a dataset of sentence pairs
	and aims to identify if two sentences convey the same meaning.

	### How to Use the Application
	- Enter two sentences in the input fields provided.
	- Click the 'Check Paraphrase' button to check if the sentences are paraphrases of each other.
	- The application will provide the cosine similarity between the sentences and the final prediction by the XGBoost model.

	### F1-Macro Scores
	- Validation F1-Macro Score: 79.1%
	- Test F1-Macro Score: 72.5%
	"""
	st.write(intro_text)

	# Interactive section: two text inputs and a button that triggers scoring.
	st.header("Try It Out!")
	sentence1 = st.text_input("Enter the first sentence:")
	sentence2 = st.text_input("Enter the second sentence:")

	if st.button("Check Paraphrase"):
	    # Strip before validating so whitespace-only input counts as missing
	    # instead of being embedded and scored as if it were a sentence.
	    text1 = sentence1.strip()
	    text2 = sentence2.strip()
	    if text1 and text2:
	        with st.spinner("Processing..."):
	            embedding1 = model.encode(text1)
	            embedding2 = model.encode(text2)

	            # Compute cosine similarity; the inputs are wrapped in lists to
	            # form single-row matrices, and [0][0] extracts the one score.
	            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
	            st.write(f"Cosine Similarity: {similarity:.4f}")

	            # Use the XGBoost model to predict whether the pair is a
	            # paraphrase; the similarity score is its only feature.
	            prediction = xgb_model.predict(np.array([[similarity]]))
	            # Index the single prediction explicitly rather than relying on
	            # the truthiness of a one-element array comparison.
	            if prediction[0] == 1:
	                st.success("The sentences are likely paraphrases of each other.")
	            else:
	                st.warning("The sentences are not likely to be paraphrases.")
	    else:
	        st.error("Please enter both sentences to proceed.")

	# Sidebar: short description of the model behind the app.
	about_text = (
	    "This model is a fine-tuned version of 'paraphrase-multilingual-MiniLM-L12-v2' using Indonesian paraphrase datasets Microsoft Paraphrase Corpus, combined with an XGBoost classifier. "
	    "The training process focused on maximizing F1-macro scores for both validation and test sets."
	)
	st.sidebar.header("About the Model")
	st.sidebar.write(about_text)