import streamlit as st
import torch
import torch.nn.functional as F
from torch import Tensor

import textract
import os
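
# Usage: launch this script with `streamlit run <path-to-this-file>`.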

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    # Pool each sequence to the hidden state of its last real (non-padding) token.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        # With left padding, the final position is always a real token.
        return last_hidden_states[:, -1]
    else:
        # With right padding, index each row at its last attended position.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

st.title("Text Similarity Model")

task = 'Given a web search query, retrieve relevant passages that answer the query'  
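
# --- Embedding helpers: a minimal sketch, since the search handler below calls
# get_embeddings/get_embedding without this file defining them. The model name
# is an assumption: last_token_pool and get_detailed_instruct match the
# intfloat/e5-mistral-7b-instruct usage example, so that model is used here;
# any encoder exposing last_hidden_state would work the same way.
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = 'intfloat/e5-mistral-7b-instruct'  # assumed; swap in a smaller encoder if needed

@st.cache_resource
def load_model():
    # Cache the tokenizer/model across Streamlit reruns.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.eval()
    return tokenizer, model

def get_embeddings(texts: list) -> Tensor:
    tokenizer, model = load_model()
    batch = tokenizer(texts, max_length=512, padding=True,
                      truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**batch)
    embeddings = last_token_pool(outputs.last_hidden_state, batch['attention_mask'])
    # L2-normalize so dot products downstream are cosine similarities.
    return F.normalize(embeddings, p=2, dim=1)

def get_embedding(query: str) -> Tensor:
    # Per the e5 recipe, queries carry the instruction prefix; documents do not.
    return get_embeddings([get_detailed_instruct(task, query)])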


UPLOAD_DIR = "uploads" 

os.makedirs(UPLOAD_DIR, exist_ok=True)

def save_upload(uploaded_file):
    # Persist the uploaded file to disk so textract can read it by path.
    filepath = os.path.join(UPLOAD_DIR, uploaded_file.name)
    with open(filepath, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return filepath

docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True, type=['txt','pdf','xlsx','docx']) 
query = st.text_input("Enter search query")
click = st.button("Search")




def extract_text(doc):
    # textract picks a parser from the file extension (txt, pdf, xlsx, docx).
    return textract.process(doc).decode('utf-8')
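
# --- Similarity and ranking helpers: minimal sketches, since the handler below
# calls compute_similarity and get_ranked_docs without this file defining them.
# Both follow directly from the normalized embeddings produced above.
def compute_similarity(query_embedding: Tensor, doc_embeddings: Tensor) -> Tensor:
    # Embeddings are L2-normalized, so the dot product is cosine similarity.
    return (query_embedding @ doc_embeddings.T).squeeze(0)

def get_ranked_docs(scores: Tensor, uploaded_docs) -> list:
    # Pair each uploaded file with its score and sort best-first.
    order = torch.argsort(scores, descending=True)
    return [(uploaded_docs[i], scores[i].item()) for i in order]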


if click and query and docs:
    doc_contents = []
    
    for doc in docs:
        # Extract text from each document
        doc_path = save_upload(doc)
        doc_text = extract_text(doc_path)  
        doc_contents.append(doc_text)

    doc_embeddings = get_embeddings(doc_contents)  
    query_embedding = get_embedding(query)
    
    scores = compute_similarity(query_embedding, doc_embeddings)
    ranked_docs = get_ranked_docs(scores, docs)
    
    st.write("Most Relevant Documents")
    for doc, score in ranked_docs:
        st.write(f"{doc.name} (score: {score:.2f})")