Spaces:

danishjameel003
/

CSSChatbot

Running

File size: 6,063 Bytes

import os
import torch
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from dotenv import load_dotenv

# Set Streamlit page configuration (browser tab title, icon, wide layout).
# This is the first st.* call executed in this file.
st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", layout="wide")

# Load environment variables via python-dotenv.
# NOTE(review): no environment variable is read directly in the visible code;
# presumably the imported libraries consume them — confirm.
load_dotenv()

# Cached Dolly-v2 text-generation pipeline
@st.cache_resource
def load_pipeline():
    """Build the transformers text-generation pipeline used by the app.

    The tokenizer and the causal language model are loaded for a single
    checkpoint id and wrapped in a ``pipeline`` that generates at most 50 new
    tokens and returns only the generated continuation (not the prompt).
    The function is decorated with ``st.cache_resource``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    # NOTE(review): confirm this checkpoint id is available on the model hub.
    checkpoint = "databricks/dolly-v2-1b"  # Smaller model for CPU

    # Tokenizer pads on the left, as configured for this generation setup.
    tok = AutoTokenizer.from_pretrained(
        checkpoint,
        padding_side="left",
        trust_remote_code=True,
    )

    # float32 weights for CPU; weights may be offloaded to the given folder.
    weight_options = {
        "torch_dtype": torch.float32,
        "device_map": "auto",
        "trust_remote_code": True,
        "offload_folder": "./offload_weights",
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint, **weight_options)

    # Short responses keep generation fast.
    generation_options = {
        "max_new_tokens": 50,
        "return_full_text": False,
        "device_map": "auto",
    }
    text_generator = pipeline(
        task="text-generation",
        model=causal_lm,
        tokenizer=tok,
        **generation_options,
    )
    return text_generator

# Initialize Dolly pipeline. This runs at module import time; load_pipeline is
# wrapped in st.cache_resource.
generate_text = load_pipeline()

# Create HuggingFace pipeline wrapper for LangChain
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

# Prompt templates:
#   prompt              - passes the user's instruction through unchanged.
#   prompt_with_context - instruction, a blank line, then "Input:" followed by
#                         the retrieved context.
prompt = PromptTemplate(input_variables=["instruction"], template="{instruction}")
prompt_with_context = PromptTemplate(input_variables=["instruction", "context"], template="{instruction}\n\nInput:\n{context}")

# Create LLM chains; both share the same underlying pipeline and are used by
# handle_question (with-context chain first, instruction-only as fallback).
llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

# Extract text from .txt files
def get_text_files_content(folder):
    """Concatenate the contents of every ``.txt`` file directly inside *folder*.

    Files are read as UTF-8 in sorted filename order so the result is
    deterministic (``os.listdir`` makes no ordering guarantee). Each file's
    content is followed by a single newline. Entries that end in ``.txt`` but
    are not regular files (e.g. directories) are skipped instead of raising.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        The combined text, or ``""`` when the folder has no ``.txt`` files.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    pieces = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder, filename)
        # A directory named "*.txt" would make open() fail; ignore it.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            pieces.append(file.read())
    # Join once instead of repeated string concatenation in the loop.
    return "".join(piece + "\n" for piece in pieces)

# Break the raw notes into small overlapping pieces
def get_chunks(raw_text):
    """Split *raw_text* into chunks with LangChain's ``CharacterTextSplitter``.

    The splitter is configured to split on newlines, with a chunk size of 512
    and an overlap of 50, both measured with ``len``.

    Args:
        raw_text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for *raw_text*.
    """
    from langchain.text_splitter import CharacterTextSplitter

    splitter_options = {
        "separator": "\n",
        "chunk_size": 512,  # Smaller chunks for faster processing
        "chunk_overlap": 50,  # Minimal overlap
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(raw_text)
    return pieces

# Build a FAISS index over the text chunks
def get_vectorstore(chunks):
    """Embed *chunks* and return a FAISS vectorstore built from them.

    Embeddings come from ``sentence-transformers/all-MiniLM-L6-v2`` with the
    device explicitly set to ``cpu``.

    Args:
        chunks: The texts to index.

    Returns:
        The vectorstore produced by ``FAISS.from_texts``.
    """
    embedder_options = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {'device': 'cpu'},  # Force CPU usage for embeddings
    }
    embedder = HuggingFaceEmbeddings(**embedder_options)
    index = FAISS.from_texts(texts=chunks, embedding=embedder)
    return index

# Answer a user question, using retrieved notes when available
def handle_question(question, vectorstore=None):
    """Run *question* through the LLM, with retrieved context when possible.

    When a truthy *vectorstore* is given, the single most similar document is
    fetched, its text is truncated to 512 characters, and the with-context
    chain is invoked. If there is no vectorstore, or the retrieved context is
    empty, the instruction-only chain is invoked instead.

    Args:
        question: The user's question.
        vectorstore: Optional store exposing ``similarity_search``.

    Returns:
        The result of the invoked chain's ``invoke`` call.
    """
    snippet = ""
    if vectorstore:
        hits = vectorstore.similarity_search(question, k=1)  # Retrieve fewer chunks
        joined = "\n".join(hit.page_content for hit in hits)
        snippet = joined[:512]  # Shorter context

    if snippet:
        payload = {"instruction": question, "context": snippet}
        return llm_context_chain.invoke(payload)

    # No usable context: fall back to the instruction-only chain.
    return llm_chain.invoke({"instruction": question})

def _list_subjects(content_type, data_folder, essay_folder):
    """Return the sorted subject names selectable for *content_type*."""
    if content_type == "Current Affairs":
        if not os.path.exists(data_folder):
            return []
        return sorted(
            entry for entry in os.listdir(data_folder)
            if os.path.isdir(os.path.join(data_folder, entry))
        )
    if content_type == "Essays":
        if not os.path.exists(essay_folder):
            return []
        suffix = ".txt"
        # Strip only the trailing extension; str.replace would also remove
        # ".txt" occurring in the middle of a filename.
        return sorted(
            entry[:-len(suffix)] for entry in os.listdir(essay_folder)
            if entry.endswith(suffix)
        )
    return []


def _load_subject_text(content_type, subject, data_folder, essay_folder):
    """Return the raw notes for *subject*, or "" when nothing can be read."""
    if not subject:
        return ""
    if content_type == "Current Affairs":
        return get_text_files_content(os.path.join(data_folder, subject))
    if content_type == "Essays":
        subject_file = os.path.join(essay_folder, subject + ".txt")
        if os.path.exists(subject_file):
            with open(subject_file, "r", encoding="utf-8") as file:
                return file.read()
    return ""


def main():
    """Render the Streamlit UI: pick a subject, preview its notes, ask questions.

    The sidebar selects a content type ("Current Affairs" folders under
    ``data`` or "Essays" ``.txt`` files under ``essays``) and a subject. The
    selected notes are previewed and indexed into a vectorstore kept in
    ``st.session_state.vectorstore``. The index is rebuilt whenever the
    selection changes, so answers always come from the notes currently shown,
    and it is cleared when the selection has no content.
    """
    st.title("Chat with Notes :books:")

    # Initialize session state
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None

    # Define folders for Current Affairs and Essays
    data_folder = "data"  # Current Affairs folders
    essay_folder = "essays"  # Essays folder

    # Sidebar for content selection
    content_type = st.sidebar.radio("Select Content Type:", ["Current Affairs", "Essays"])
    subjects = _list_subjects(content_type, data_folder, essay_folder)
    selected_subject = st.sidebar.selectbox("Select a Subject:", subjects)

    # Drop the cached index when the selection changes; otherwise questions
    # would keep being answered from the previously selected subject's notes.
    source_key = (content_type, selected_subject)
    if st.session_state.get("vectorstore_source") != source_key:
        st.session_state.vectorstore = None
        st.session_state.vectorstore_source = source_key

    # Process the selected subject
    raw_text = _load_subject_text(content_type, selected_subject, data_folder, essay_folder)

    # Display preview of notes and load vectorstore
    if raw_text.strip():
        st.subheader("Preview of Notes")
        st.text_area("Preview Content:", value=raw_text[:1000], height=300, disabled=True)  # Display shorter preview

        # Build the index once per selection.
        if st.session_state.vectorstore is None:
            text_chunks = get_chunks(raw_text)
            # Only index when there is at least one chunk to embed.
            if text_chunks:
                st.session_state.vectorstore = get_vectorstore(text_chunks)
    else:
        # Nothing to index for this selection; make sure no stale index is used.
        st.session_state.vectorstore = None
        st.warning("No content available for the selected subject.")

    # Chat interface
    st.subheader("Ask Your Question")
    question = st.text_input("Ask a question about your selected subject:")
    if question:
        if st.session_state.vectorstore is not None:
            response = handle_question(question, st.session_state.vectorstore)
            st.subheader("Answer:")
            st.write(response.get("text", "No response found."))
        else:
            st.warning("Please load the content for the selected subject before asking a question.")

if __name__ == '__main__':
    main()