"""Streamlit app: rank uploaded documents by embedding similarity to a query."""

import os

import streamlit as st
import textract
import torch
import torch.nn.functional as F  # kept: may be used by embedding helpers defined elsewhere
from torch import Tensor


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pool each sequence down to the hidden state of its last real token.

    Handles both left- and right-padded batches.

    Args:
        last_hidden_states: (batch, seq_len, hidden) model outputs.
        attention_mask: (batch, seq_len) mask, 1 = real token, 0 = padding.

    Returns:
        (batch, hidden) tensor of last-token embeddings.
    """
    # If every row's final position is unmasked, the batch is left-padded,
    # so the last column already holds each sequence's final token.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    # Right-padded: index each row at its own last real token position.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[
        torch.arange(batch_size, device=last_hidden_states.device),
        sequence_lengths,
    ]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Format a query with its task instruction for instruction-tuned embedders."""
    return f'Instruct: {task_description}\nQuery: {query}'


st.title("Text Similarity Model")

task = 'Given a web search query, retrieve relevant passages that answer the query'

UPLOAD_DIR = "uploads"
# makedirs + exist_ok is race-free and idempotent, unlike exists()+mkdir.
os.makedirs(UPLOAD_DIR, exist_ok=True)


def save_upload(uploaded_file) -> str:
    """Persist a Streamlit UploadedFile under UPLOAD_DIR and return its path."""
    filepath = os.path.join(UPLOAD_DIR, uploaded_file.name)
    with open(filepath, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return filepath


def extract_text(doc_path: str) -> str:
    """Extract plain text from a file on disk via textract."""
    return textract.process(doc_path).decode('utf-8')


docs = st.sidebar.file_uploader(
    "Upload documents",
    accept_multiple_files=True,
    type=['txt', 'pdf', 'xlsx', 'docx'],
)
query = st.text_input("Enter search query")
click = st.button("Search")

if click and query:
    # file_uploader returns None/[] before any upload — guard instead of crashing.
    if not docs:
        st.warning("Please upload at least one document first.")
    else:
        doc_contents = []
        for doc in docs:
            # Persist each upload to disk, then extract its text.
            doc_path = save_upload(doc)
            doc_contents.append(extract_text(doc_path))

        # NOTE(review): get_embeddings / get_embedding / compute_similarity /
        # get_ranked_docs are not defined anywhere in this file — they must be
        # defined or imported elsewhere, otherwise this branch raises NameError.
        doc_embeddings = get_embeddings(doc_contents)
        query_embedding = get_embedding(query)
        scores = compute_similarity(query_embedding, doc_embeddings)
        ranked_docs = get_ranked_docs(scores)

        st.write("Most Relevant Documents")
        for doc, score in ranked_docs:
            # assumes get_ranked_docs yields (object with .name, score) pairs —
            # TODO confirm; doc_contents holds plain strings, which have no .name
            st.write(f"{doc.name} (score: {score:.2f})")