import streamlit as st

# Page configuration is issued right after importing Streamlit, ahead of every
# other st.* call in this script; the statement order below is kept as found.
st.set_page_config(layout="wide")

from annotated_text import annotated_text, annotation
import fitz
import os
import chromadb
import uuid
from pathlib import Path
import os

st.title("Contracts Classification ")

import pandas as pd
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from setfit import SetFitModel

# Download from the 🤗 Hub
clause_model = SetFitModel.from_pretrained("scholarly360/setfit-contracts-clauses")

import spacy

# Load the English model from SpaCy
nlp = spacy.load("en_core_web_md")


def split_into_sentences_with_offsets(text):
    """
    Split a paragraph into sentences and return them with their character offsets.

    The text is run through the module-level SpaCy pipeline ``nlp`` and one
    entry is produced per sentence span reported by ``doc.sents``.

    :param text: The input text to be split into sentences.
    :return: A list of ``(sentence_text, start_char, end_char)`` tuples, where
        the offsets are the span's ``start_char`` / ``end_char`` within ``text``.
    """
    doc = nlp(text)
    return [(sent.text, sent.start_char, sent.end_char) for sent in doc.sents]


def util_upload_file_and_return_list_docs(uploaded_files):
    """
    Save uploaded files into the current working directory and open each with fitz.

    Every upload is written to ``<cwd>/<file name>`` (an existing file of the
    same name is overwritten) and then opened via ``fitz.open``. The opened
    documents are returned still open; this function never closes them.

    :param uploaded_files: Iterable of objects exposing ``.name`` and
        ``.getvalue()`` (returning the bytes to write). ``None`` is treated
        as an empty selection.
    :return: A tuple ``(list_docs, list_save_path)``: the opened fitz documents
        and the matching ``Path`` each one was saved to, in input order.
    """
    #util_del_cwd()
    list_docs = []
    list_save_path = []
    # A missing upload (None) behaves like "no files" instead of raising TypeError.
    for uploaded_file in uploaded_files or []:
        # The name is supplied by the uploader. Keep only its final component so
        # an absolute path or a "../" prefix cannot redirect the write outside
        # the working directory.
        safe_name = Path(uploaded_file.name).name
        save_path = Path(os.getcwd(), safe_name)
        with open(save_path, mode='wb') as w:
            w.write(uploaded_file.getvalue())
        #print('save_path:', save_path)
        list_docs.append(fitz.open(save_path))
        list_save_path.append(save_path)
    return list_docs, list_save_path


# NOTE(review): the definition that follows is cut off (and partly garbled) in
# the text available for this review, so it is left exactly as found on the
# line below rather than reconstructed. Restore its line breaks from the
# original file.
#### Helper Functions to Split using Rolling Window (recomm : use smaller rolling window ) def split_txt_file_synthetic_sentence_rolling(ctxt, sentence_size_in_chars, sliding_size_in_chars,debug=False): sliding_size_in_chars = sentence_size_in_chars - sliding_size_in_chars pos_start = 0 pos_end = len(ctxt) final_return = [] if(debug): print('pos_start : ',pos_start) print('pos_end : ',pos_end) if(pos_endpos_end): if(start