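"""PDF-based RAG chatbot built with Streamlit and LangChain.

Uploads a PDF, chunks and indexes it with local MiniLM sentence embeddings,
and answers questions by sending retrieved context to a DeepSeek model on
the Hugging Face Inference API.

Run with: streamlit run <this_script>.py (adjust the filename to this script).
"""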
import os
import streamlit as st
import requests
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face access token, read from the environment; API calls fail with 401 if it is unset
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

# Serverless Inference API endpoint for the model (the model's web page URL does not accept inference requests)
HF_API_URL = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-llm-7b-base"

HEADERS = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}

pdfs_directory = "./pdfs/"

template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""



# Cache the embedding model so Streamlit does not reload it from disk on every rerun
@st.cache_resource
def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize an in-memory vector store backed by the cached embeddings
vector_store = InMemoryVectorStore(embedding=get_embeddings())



def upload_pdf(file):
    """Ensure the PDF directory exists, save the uploaded file, and return its path."""
    os.makedirs(pdfs_directory, exist_ok=True)
    file_path = os.path.join(pdfs_directory, file.name)
    with open(file_path, "wb") as f:
        f.write(file.getbuffer())
    return file_path



def load_pdf(file_path):
    """Load text from the PDF"""
    loader = PDFPlumberLoader(file_path)
    documents = loader.load()

    return documents



def split_text(documents):
    """Split text into smaller chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)


def index_docs(documents):
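    """Embed the chunked documents and add them to the in-memory vector store."""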
    vector_store.add_documents(documents)



def retrieve_docs(query):
    """Retrieve similar documents"""
    return vector_store.similarity_search(query)


def query_huggingface_api(prompt):
    """Send the prompt to the DeepSeek model via the Hugging Face Inference API."""
    # return_full_text=False keeps the echoed prompt out of the completion
    payload = {"inputs": prompt, "parameters": {"return_full_text": False}}
    response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
    if response.status_code == 200:
        # Text-generation responses arrive as a list of {"generated_text": ...} objects
        return response.json()[0]["generated_text"]
    return f"Error {response.status_code}: unable to process request."


def answer_question(question, documents):
    """Generate answer using DeepSeek R1 API"""
    context = "\n\n".join([doc.page_content for doc in documents])
    prompt = ChatPromptTemplate.from_template(template).format(question=question, context=context)
    return query_huggingface_api(prompt)


st.title("PDF-based RAG Chatbot")

uploaded_file = st.file_uploader(
    "Upload PDF", 
    type="pdf", 
    accept_multiple_files=False
)

if uploaded_file:
    # Streamlit reruns this script on every interaction, so the PDF is reloaded
    # and re-indexed each time; acceptable for a single-document demo
    file_path = upload_pdf(uploaded_file)
    documents = load_pdf(file_path)
    chunked_documents = split_text(documents)
    index_docs(chunked_documents)

    question = st.chat_input("Ask a question based on the document")

    if question:
        st.chat_message("user").write(question)
        related_documents = retrieve_docs(question)
        answer = answer_question(question, related_documents)
        st.chat_message("assistant").write(answer)