tst
Browse files- app.py +47 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Document Q&A app to run on a Hugging Face Space (not for automatic speech recognition).
#
# Current scope: upload text files, embed each one with a text-embedding model
# and store the vectors in an in-memory FAISS index.

import streamlit as st
import torch
from transformers import AutoModelForCTC  # kept from the original import list; unused below
from transformers import AutoProcessor  # kept from the original import list; unused below
from transformers import AutoModel
from transformers import AutoTokenizer
import faiss
import numpy as np

# Text embeddings model (https://huggingface.co/Salesforce/SFR-Embedding-Mistral).
# NOTE(review): no Hugging Face token is passed to from_pretrained(); the
# "HF_KEY" environment variable mentioned in the original comment is not read.
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"

# Upper bound on tokens per document; longer texts are truncated.
# NOTE(review): 4096 follows the example on the model card -- confirm it fits
# the memory available on the target Space.
MAX_TOKENS = 4096


@st.cache_resource
def _load_embedding_model():
    """Load the tokenizer and encoder once and reuse them across Streamlit reruns.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is put in eval mode.
    """
    # AutoModel returns the bare encoder whose output exposes
    # ``last_hidden_state``; AutoModelForCTC is a speech-recognition head and
    # only exposes ``logits``.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.eval()
    return tokenizer, model


def _embed_text(text):
    """Return a ``(1, hidden_size)`` float32 NumPy array embedding ``text``."""
    # A single sequence needs no padding, so the mean below only averages
    # real tokens (padding to max_length would dilute it with pad positions).
    inputs = processor(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )
    with torch.no_grad():
        hidden = embeddings_model(**inputs).last_hidden_state
    # NOTE(review): the original code mean-pools over tokens; the model card
    # pools the last token instead -- confirm which is intended.
    pooled = hidden.mean(dim=1)
    # FAISS requires contiguous float32 input.
    return np.ascontiguousarray(pooled.float().cpu().numpy(), dtype=np.float32)


processor, embeddings_model = _load_embedding_model()

# Use streamlit to select one or more files.
# Only UTF-8 text can be embedded for now; binary formats (pdf, word, excel)
# are rejected below with a message instead of crashing the app.
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)

# Create an index for storing the embeddings. The dimension is taken from the
# model configuration rather than hard-coded, so it always matches the vectors.
index = faiss.IndexFlatL2(embeddings_model.config.hidden_size)

# Embed text from the selected files into the vector index.
success = True  # Assume success by default

for file in uploaded_files:
    # Read the content of the file
    try:
        text = file.read().decode("utf-8")
    except UnicodeDecodeError as e:
        success = False
        st.write(f"Could not read {file.name} as UTF-8 text: {e}")
        break

    # Embed the text and add the vector to the index
    try:
        index.add(_embed_text(text))
    except Exception as e:
        success = False  # Set success to False if an exception occurs
        st.write(f"Failed to add embeddings to the index: {e}")
        break

if not uploaded_files:
    # Nothing was uploaded yet, so there is nothing to report as a success.
    st.write("Upload one or more text files to build the index")
elif success:
    st.write("Embeddings added to the index successfully")
else:
    st.write("Operation failed")
|
45 |
+
|
46 |
+
|
47 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
torch
|
3 |
+
transformers
|
4 |
+
librosa
|
5 |
+
numpy
|
6 |
+
soundfile
|
7 |
+
faiss-cpu
|