notabaka committed on
Commit
0c7ffdb
1 Parent(s): 24ec349
Files changed (2) hide show
  1. app.py +52 -43
  2. requirements.txt +1 -6
app.py CHANGED
@@ -1,48 +1,57 @@
1
- #document q&a app to run on hugging face space (not for automatic speech recognition)
2
-
3
  import streamlit as st
4
  import torch
5
- from transformers import AutoModelForCTC
6
- from transformers import AutoProcessor
7
- import annoy
8
- import numpy as np
9
-
10
- # Load text embeddings model (https://huggingface.co/Salesforce/SFR-Embedding-Mistral) using HF API key from environment variable "HF_KEY"
11
- embeddings_model = AutoModelForCTC.from_pretrained("Salesforce/SFR-Embedding-Mistral")
12
- processor = AutoProcessor.from_pretrained("Salesforce/SFR-Embedding-Mistral")
13
-
14
- # Use streamlit to select one or more files (documents like pdf, word or excel)
15
- uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)
16
-
17
- # Create an index for storing the embeddings
18
- index = annoy.AnnoyIndex(768, 'angular') # Assuming the embeddings have a dimension of 768
19
-
20
- # Implement code to embed text from selected files in vector database using the text embeddings model
21
- success = True # Assume success by default
22
-
23
- for file in uploaded_files:
24
- # Read the content of the file
25
- text = file.read().decode("utf-8")
26
-
27
- # Tokenize the text
28
- inputs = processor(text, return_tensors="pt", padding="max_length", truncation=True)
29
-
30
- # Get the embeddings
31
- with torch.no_grad():
32
- embeddings = embeddings_model(**inputs).last_hidden_state.mean(dim=1)
33
- # Add the embeddings to the index
34
- try:
35
- for i, emb in enumerate(embeddings.numpy()):
36
- index.add_item(i, emb)
37
- index.build(10) # 10 trees for building the index
38
- except Exception as e:
39
- success = False
40
- st.write(f"Failed to add embeddings to the index: {e}")
41
-
42
- if success:
43
- st.write("Embeddings added to the index successfully")
44
- else:
45
- st.write("Operation failed")
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
+ import torch.nn.functional as F
4
+ from torch import Tensor
5
+ from transformers import AutoTokenizer, AutoModel
6
+
7
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Select, for every sequence in the batch, the hidden state of its last attended token.

    Args:
        last_hidden_states: Hidden states indexed as [batch, position, ...].
        attention_mask: Mask indexed as [batch, position]; it is summed along
            dim 1 to find how many positions each sequence attends to.

    Returns:
        One hidden-state vector per batch row.
    """
    # If the final column of the mask is set for every row, the batch is
    # treated as left-padded and the last position is the answer for all rows.
    if attention_mask[:, -1].sum() == attention_mask.shape[0]:
        return last_hidden_states[:, -1]

    # Otherwise pick, row by row, the position just before the padding starts.
    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]
16
+
17
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line followed by a ``Query:`` line."""
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)
19
+
20
# Streamlit page: embed two user-entered queries and two fixed passages with
# the same model, then display the scaled cosine similarity of every
# query/passage pair.


@st.cache_resource
def _load_tokenizer_and_model():
    """Load the tokenizer and embedding model, reusing them across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, both ``from_pretrained`` calls would run again each time a query
    is edited.
    """
    tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-Mistral')
    model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-Mistral')
    return tokenizer, model


st.title("Text Similarity Model")

task = 'Given a web search query, retrieve relevant passages that answer the query'

query1 = st.text_input("Enter first query")
query2 = st.text_input("Enter second query")

# Nothing is computed until both inputs are non-empty.
if query1 and query2:
    # Only the queries get the instruction prefix; passages are embedded as-is.
    queries = [
        get_detailed_instruct(task, query1),
        get_detailed_instruct(task, query2),
    ]

    passages = [
        "To bake a delicious chocolate cake, you'll need the following ingredients: all-purpose flour, sugar, cocoa powder, baking powder, baking soda, salt, eggs, milk, vegetable oil, and vanilla extract. Start by preheating your oven to 350°F (175°C). In a mixing bowl, combine the dry ingredients (flour, sugar, cocoa powder, baking powder, baking soda, and salt). In a separate bowl, whisk together the wet ingredients (eggs, milk, vegetable oil, and vanilla extract). Gradually add the wet mixture to the dry ingredients, stirring until well combined. Pour the batter into a greased cake pan and bake for 30-35 minutes. Let it cool before frosting with your favorite chocolate frosting. Enjoy your homemade chocolate cake!",
        "The flu, or influenza, is an illness caused by influenza viruses. Common symptoms of the flu include a high fever, chills, cough, sore throat, runny or stuffy nose, body aches, headache, fatigue, and sometimes nausea and vomiting. These symptoms can come on suddenly and are usually more severe than the common cold. It's important to get plenty of rest, stay hydrated, and consult a healthcare professional if you suspect you have the flu. In some cases, antiviral medications can help alleviate symptoms and reduce the duration of the illness.",
    ]

    tokenizer, model = _load_tokenizer_and_model()

    # Get embeddings
    max_length = 4096
    input_texts = queries + passages
    batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph or activations
    # are retained for a backward pass that never happens.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # Normalize embeddings so the dot product below is a cosine similarity.
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Rows are the two queries, columns the two passages; scaled by 100.
        scores = (embeddings[:2] @ embeddings[2:].T) * 100

    st.write("Similarity scores:", scores.tolist())
54
 
55
 
56
 
57
+
requirements.txt CHANGED
@@ -1,7 +1,2 @@
1
- streamlit
2
  torch
3
- transformers
4
- librosa
5
- numpy
6
- soundfile
7
- annoy
 
 
1
  torch
2
+ transformers