import os
import streamlit as st
import torch
from transformers import BitsAndBytesConfig
# Import necessary modules from llama-index and langchain
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
# ---------------------------
# Retrieve Hugging Face Token from Environment Variables
# ---------------------------
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    st.error("Missing Hugging Face token. Please set HF_TOKEN in your Space secrets.")
    st.stop()
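# Note: huggingface_hub picks up HF_TOKEN from the environment automatically, so the
# gated meta-llama download below should authenticate without passing the token explicitly.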
# ---------------------------
# Configure BitsAndBytes Quantization (only if GPU is available)
# ---------------------------
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
else:
    # If no GPU is available, disable bitsandbytes quantization
    quantization_config = None
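# Note: bnb_4bit_compute_dtype only takes effect when loading in 4-bit; with
# load_in_8bit=True it is ignored by transformers.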
# ---------------------------
# Configure your LLM and embeddings
# ---------------------------
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
# Prepare model_kwargs based on whether quantization is enabled
model_kwargs = {"torch_dtype": torch.float16}
if quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config
# Initialize the HuggingFaceLLM with the model settings (authentication uses the HF_TOKEN from the environment)
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    model_kwargs=model_kwargs,
)
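# Note: device_map="auto" relies on the accelerate package to place the model weights
# across the available GPU/CPU devices.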
# Set up the embedding model using Langchain's HuggingFaceEmbeddings
lc_embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(lc_embed_model)
# Apply global settings for llama-index
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024
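# Note: Settings.chunk_size controls how loaded documents are split into nodes before
# they are embedded and indexed.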
# ---------------------------
# Load documents from repository
# ---------------------------
DATA_DIR = "data"  # Ensure this folder exists in your repository and contains your documents
try:
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
except Exception as e:
    st.error(f"Error loading documents from '{DATA_DIR}': {e}")
    documents = []

if not documents:
    st.warning("No documents found in the data folder. Please add your documents and redeploy.")
    st.stop()
else:
    # Create the vector store index and query engine
    index = VectorStoreIndex.from_documents(documents)
    query_engine = index.as_query_engine()
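# Note: Streamlit re-runs this script on every interaction, so the model and index are
# rebuilt each time; wrapping the setup above in functions decorated with
# @st.cache_resource would avoid reloading them on every query.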
# ---------------------------
# Streamlit Interface
# ---------------------------
st.title("LlamaIndex Q&A Assistant")
user_query = st.text_input("Enter your question:")

if user_query:
    with st.spinner("Querying..."):
        response = query_engine.query(user_query)
    st.markdown("### Response:")
    st.write(str(response))  # Convert the llama-index Response object to its text