Manishkumaryadav committed on
Commit
bd1a52f
·
verified ·
1 Parent(s): a15e68f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdfplumber
3
+ import faiss
4
+ import torch
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import pipeline
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+
10
# Load the models once per server process.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so constructing the models directly at module level would
# re-instantiate them on each rerun. ``st.cache_resource`` keeps a single shared
# instance alive across reruns and sessions.
@st.cache_resource
def _load_embedding_model():
    """Return the shared sentence-embedding model used for chunks and queries."""
    return SentenceTransformer("all-MiniLM-L6-v2")


@st.cache_resource
def _load_qa_pipeline():
    """Return the shared question-answering pipeline."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


embedding_model = _load_embedding_model()
qa_pipeline = _load_qa_pipeline()
13
+
14
+ # Function to extract text from PDF
15
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (the caller in this
            file passes the object returned by ``st.file_uploader``).

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace stripped. Pages that yield no text contribute an empty
        line instead of raising.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # ``extract_text()`` may return None for pages without a text layer
        # (e.g. scanned images); fall back to "" so concatenation cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts).strip()
21
+
22
+ # Chunking text
23
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The full text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` with the given settings.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(text)
26
+
27
+ # Generate embeddings
28
def generate_embeddings(text_chunks):
    """Encode ``text_chunks`` with the module-level embedding model.

    Args:
        text_chunks: The texts to embed.

    Returns:
        The result of ``embedding_model.encode`` called with
        ``convert_to_numpy=True``.
    """
    vectors = embedding_model.encode(text_chunks, convert_to_numpy=True)
    return vectors
30
+
31
+ # Create FAISS index
32
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over ``embeddings``.

    Args:
        embeddings: A 2-D array-like with one row per chunk.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D. This happens, for
            example, when no text chunks were produced and there is nothing
            to index.
    """
    # FAISS operates on float32, C-contiguous data; coerce up front so inputs
    # of another dtype or memory layout are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_chunks, dim); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
37
+
38
+ # Retrieve relevant context
39
def retrieve_context(query, index, text_chunks, top_k=3):
    """Return the chunks closest to ``query``, joined by newlines.

    Args:
        query: The user's question.
        index: The FAISS index built over the embeddings of ``text_chunks``.
        text_chunks: The chunk texts, in the same order as the index rows.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks joined with newlines, or an empty string when
        there is nothing to retrieve.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with the id -1, which would silently select text_chunks[-1].
    k = min(top_k, len(text_chunks))
    if k <= 0:
        return ""

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)

    # Defensive filter in case the index holds fewer vectors than text_chunks.
    return "\n".join(
        text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)
    )
44
+
45
+ # Generate Answer
46
def answer_question(query, faiss_index, book_chunks):
    """Answer ``query`` using context retrieved from the book.

    Args:
        query: The user's question.
        faiss_index: Index passed through to ``retrieve_context``.
        book_chunks: Chunk texts passed through to ``retrieve_context``.

    Returns:
        The ``"answer"`` field of the question-answering pipeline's result.
    """
    passages = retrieve_context(query, faiss_index, book_chunks)
    return qa_pipeline(question=query, context=passages)["answer"]
50
+
51
+ # Streamlit UI
52
st.title("📖 Book-Based Question Answering System")
st.write("Upload a book (PDF) and ask any question!")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF book", type="pdf")

if uploaded_file:
    st.write("Processing book...")
    book_text = extract_text_from_pdf(uploaded_file)
    book_chunks = chunk_text(book_text)

    if not book_chunks:
        # A PDF without extractable text (e.g. scanned images) yields no
        # chunks; building an index from nothing would crash, so stop here
        # with a readable message instead.
        st.error("No extractable text was found in this PDF. Please upload a text-based PDF.")
    else:
        chunk_embeddings = generate_embeddings(book_chunks)
        faiss_index = create_faiss_index(chunk_embeddings)
        st.success(f"Book processed successfully! ({len(book_chunks)} chunks)")

        query = st.text_input("Ask a question based on the book:")
        if query:
            answer = answer_question(query, faiss_index, book_chunks)
            st.write(f"**Answer:** {answer}")