"""Document Q&A app for a Hugging Face Space (text embeddings, not speech recognition).

Uploaded files are decoded as UTF-8 text, embedded with the
Salesforce/SFR-Embedding-Mistral model and stored in an in-memory FAISS index.
"""

import os

import faiss
import numpy as np
import streamlit as st
import torch
from transformers import AutoModel
from transformers import AutoModelForCTC  # unused below; import kept in case other code relies on it
from transformers import AutoProcessor  # unused below; import kept in case other code relies on it
from transformers import AutoTokenizer

# Text embeddings model (https://huggingface.co/Salesforce/SFR-Embedding-Mistral).
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"

# Upper bound on tokens embedded per document; longer text is truncated.
# 4096 is the value used in the model card example.
MAX_TOKENS = 4096


@st.cache_resource
def load_embedding_model():
    """Load the embedding model and its tokenizer.

    Cached with ``st.cache_resource`` because Streamlit re-executes the script
    on every user interaction and the weights should not be reloaded each time.
    The Hugging Face access token is read from the ``HF_KEY`` environment
    variable; when it is unset, ``None`` is passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    hf_token = os.environ.get("HF_KEY")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
    # AutoModel (not AutoModelForCTC): a plain encoder output is needed here;
    # CTC heads are for speech recognition.
    model = AutoModel.from_pretrained(MODEL_NAME, token=hf_token)
    return model, tokenizer


def embed_text(text: str, model, tokenizer) -> np.ndarray:
    """Embed a single document.

    Args:
        text: Document content; only the first ``MAX_TOKENS`` tokens are used.
        model: Model returned by ``load_embedding_model``.
        tokenizer: Tokenizer returned by ``load_embedding_model``.

    Returns:
        A C-contiguous float32 array of shape ``(1, hidden_size)``, which is
        the layout ``faiss.Index.add`` expects.
    """
    # One sequence at a time, so no padding is requested: padding to
    # max_length would only add pad positions that distort the pooled vector.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    ).to(model.device)

    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Last-token pooling followed by L2 normalisation, as in the model card
    # example. With a single unpadded sequence the last token is index -1.
    pooled = hidden_states[:, -1]
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)

    # .float() guards against half-precision weights; FAISS requires float32.
    return np.ascontiguousarray(pooled.float().cpu().numpy(), dtype=np.float32)


# `processor` keeps its original module-level name; it is the text tokenizer.
embeddings_model, processor = load_embedding_model()

# Use streamlit to select one or more files.
# NOTE: content is decoded as UTF-8 below, so only plain-text files are
# supported; binary formats (pdf, word, excel) are reported as unreadable.
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)

# Create an index sized from the model itself instead of a hard-coded 768,
# so index.add() cannot fail on a dimension mismatch.
index = faiss.IndexFlatL2(embeddings_model.config.hidden_size)

# Embed the text of each selected file and store it in the index.
success = True  # Assume success by default
for file in uploaded_files or []:
    # Read the content of the file
    try:
        text = file.read().decode("utf-8")
    except UnicodeDecodeError as e:
        success = False
        st.write(f"Could not read {file.name} as UTF-8 text: {e}")
        break

    # Embed and add to the index. Broad catch is deliberate: this is the UI
    # boundary and any model/index failure should be shown, not crash the app.
    try:
        index.add(embed_text(text, embeddings_model, processor))
    except Exception as e:
        success = False  # Set success to False if an exception occurs
        st.write(f"Failed to add embeddings to the index: {e}")
        break

if not uploaded_files:
    # Nothing uploaded yet: do not claim that embeddings were added.
    st.write("Upload one or more text files to embed them")
elif success:
    st.write("Embeddings added to the index successfully")
else:
    st.write("Operation failed")