# ASRtest / app.py  (Hugging Face Space file page header: notabaka, commit "tst" 011c6b2, 1.71 kB)
# Document Q&A app to run on a Hugging Face Space (not for automatic speech recognition).
import faiss
import numpy as np
import streamlit as st
import torch
from transformers import AutoModel
from transformers import AutoModelForCTC
from transformers import AutoProcessor
from transformers import AutoTokenizer
# Load the text-embedding model and its tokenizer
# (https://huggingface.co/Salesforce/SFR-Embedding-Mistral).
# The checkpoint is a Mistral-based text encoder, so it is loaded with
# AutoModel / AutoTokenizer. AutoModelForCTC is a speech-recognition (CTC) head
# whose output carries logits rather than `last_hidden_state`.
# NOTE(review): an earlier comment said the HF API key from the "HF_KEY"
# environment variable is used, but no token is passed to from_pretrained --
# confirm whether authentication is actually required.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# model is reloaded each time; consider caching the load.
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"
MAX_TOKENS = 4096  # truncation length used by the model card's usage example
embeddings_model = AutoModel.from_pretrained(MODEL_NAME)
processor = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use Streamlit to select one or more files. Contents are decoded as UTF-8
# below, so only plain-text documents can be embedded; binary formats (pdf,
# word, excel) are reported as a read failure instead of crashing the app.
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)

# Create an exact L2 index for storing the embeddings. The dimension is taken
# from the model config rather than hard-coded so it always matches the
# vectors the model produces.
index = faiss.IndexFlatL2(embeddings_model.config.hidden_size)

# Embed the text of every selected file and store it in the vector index.
success = True  # Flipped to False by the first file that cannot be processed
for file in uploaded_files or []:
    # Read the content of the file; non-UTF-8 bytes mean this is not a text file.
    try:
        text = file.read().decode("utf-8")
    except UnicodeDecodeError as e:
        success = False
        st.write(f"Failed to read {file.name} as UTF-8 text: {e}")
        break

    # Tokenize a single document. No padding is requested: with one sequence
    # there is nothing to align, and pad tokens would distort the pooling.
    inputs = processor(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )

    # Get the embedding. The model card pools the hidden state of the last
    # token; with an unpadded single sequence that is simply position -1.
    with torch.no_grad():
        hidden_states = embeddings_model(**inputs).last_hidden_state
    embeddings = hidden_states[:, -1, :]

    # Add the embedding to the index. FAISS expects a C-contiguous float32
    # array of shape (n_vectors, dim).
    try:
        vectors = np.ascontiguousarray(
            embeddings.float().cpu().numpy(), dtype=np.float32
        )
        index.add(vectors)
    except Exception as e:  # UI boundary: surface any FAISS/shape error to the user
        success = False
        st.write(f"Failed to add embeddings to the index: {e}")
        break

# Report the outcome only when there was something to process; otherwise an
# empty upload list would be announced as a successful indexing run.
if uploaded_files:
    if success:
        st.write("Embeddings added to the index successfully")
    else:
        st.write("Operation failed")