"""Document Q&A app to run on a Hugging Face Space (not for automatic speech recognition).

The script lets the user upload one or more documents through Streamlit,
embeds the text of each document with the Salesforce/SFR-Embedding-Mistral
model and stores one vector per document in an in-memory Annoy index.

The Hugging Face API key is read from the environment variable ``HF_KEY``.
"""
import os

import annoy
import numpy as np
import streamlit as st
import torch
from transformers import AutoModel
from transformers import AutoModelForCTC  # noqa: F401  (original import, kept for compatibility)
from transformers import AutoProcessor  # noqa: F401  (original import, kept for compatibility)
from transformers import AutoTokenizer

MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"
# Upper bound on tokens fed to the model per document; longer texts are truncated.
MAX_TOKENS = 4096
# Number of trees Annoy builds; more trees trade build time for recall.
N_TREES = 10


@st.cache_resource
def _load_model_and_tokenizer():
    """Load the embedding model and its tokenizer once per server process.

    Streamlit re-executes the whole script on every interaction, so the
    (large) model is cached instead of being reloaded on each rerun.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    token = os.environ.get("HF_KEY")  # None falls back to anonymous access
    # AutoModel (not AutoModelForCTC): a plain encoder output with
    # ``last_hidden_state`` is needed, not a speech-recognition CTC head.
    model = AutoModel.from_pretrained(MODEL_NAME, token=token)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=token)
    return model, tokenizer


# Load text embeddings model (https://huggingface.co/Salesforce/SFR-Embedding-Mistral)
embeddings_model, processor = _load_model_and_tokenizer()

# Use streamlit to select one or more files
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)

# Create an index for storing the embeddings. The vector length is taken from
# the model config so that it always matches what the model actually emits.
index = annoy.AnnoyIndex(embeddings_model.config.hidden_size, 'angular')

# Embed the text of every selected file and add it to the index
success = True  # Flipped to False as soon as any file fails
next_item_id = 0  # Running id so vectors of later files do not overwrite earlier ones
for file in uploaded_files or []:
    # Read the content of the file. Only UTF-8 text can be decoded here;
    # binary formats (pdf, word, excel) need a dedicated text extractor.
    try:
        text = file.read().decode("utf-8")
    except UnicodeDecodeError as e:
        success = False
        st.write(f"Failed to read {file.name} as UTF-8 text: {e}")
        continue

    # Tokenize the text. No padding: a single sequence needs none, and pad
    # positions would otherwise be averaged into the embedding below.
    inputs = processor(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )

    # Get the embeddings (mean over the token axis).
    # NOTE(review): the model card appears to pool the last token instead of
    # the mean - confirm which pooling is intended for retrieval quality.
    with torch.no_grad():
        embeddings = embeddings_model(**inputs).last_hidden_state.mean(dim=1)

    # Add the embeddings to the index
    try:
        for emb in embeddings.numpy():
            index.add_item(next_item_id, emb)
            next_item_id += 1
    except Exception as e:
        success = False
        st.write(f"Failed to add embeddings to the index: {e}")

# Build once, after all items were added: Annoy does not accept new items
# after build() has been called.
if next_item_id:
    try:
        index.build(N_TREES)
    except Exception as e:
        success = False
        st.write(f"Failed to add embeddings to the index: {e}")

if not success:
    st.write("Operation failed")
elif next_item_id:
    st.write("Embeddings added to the index successfully")