Manishkumaryadav committed on
Commit
a4f5b65
·
verified ·
1 Parent(s): bd1a52f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -69
app.py CHANGED
@@ -1,69 +1,69 @@
1
- import streamlit as st
2
- import pdfplumber
3
- import faiss
4
- import torch
5
- import numpy as np
6
- from sentence_transformers import SentenceTransformer
7
- from transformers import pipeline
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
-
10
- # Load embedding model
11
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
- qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
13
-
14
- # Function to extract text from PDF
15
- def extract_text_from_pdf(pdf_file):
16
- text = ""
17
- with pdfplumber.open(pdf_file) as pdf:
18
- for page in pdf.pages:
19
- text += page.extract_text() + "\n"
20
- return text.strip()
21
-
22
- # Chunking text
23
- def chunk_text(text, chunk_size=500, overlap=100):
24
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
25
- return splitter.split_text(text)
26
-
27
- # Generate embeddings
28
- def generate_embeddings(text_chunks):
29
- return embedding_model.encode(text_chunks, convert_to_numpy=True)
30
-
31
- # Create FAISS index
32
- def create_faiss_index(embeddings):
33
- dimension = embeddings.shape[1]
34
- index = faiss.IndexFlatL2(dimension)
35
- index.add(embeddings)
36
- return index
37
-
38
- # Retrieve relevant context
39
- def retrieve_context(query, index, text_chunks, top_k=3):
40
- query_embedding = embedding_model.encode([query], convert_to_numpy=True)
41
- distances, indices = index.search(query_embedding, top_k)
42
- retrieved_text = "\n".join([text_chunks[i] for i in indices[0]])
43
- return retrieved_text
44
-
45
- # Generate Answer
46
- def answer_question(query, faiss_index, book_chunks):
47
- context = retrieve_context(query, faiss_index, book_chunks)
48
- result = qa_pipeline(question=query, context=context)
49
- return result["answer"]
50
-
51
- # Streamlit UI
52
- st.title("📖 Book-Based Question Answering System")
53
- st.write("Upload a book (PDF) and ask any question!")
54
-
55
- # File uploader
56
- uploaded_file = st.file_uploader("Upload a PDF book", type="pdf")
57
-
58
- if uploaded_file:
59
- st.write("Processing book...")
60
- book_text = extract_text_from_pdf(uploaded_file)
61
- book_chunks = chunk_text(book_text)
62
- chunk_embeddings = generate_embeddings(book_chunks)
63
- faiss_index = create_faiss_index(chunk_embeddings)
64
- st.success(f"Book processed successfully! ({len(book_chunks)} chunks)")
65
-
66
- query = st.text_input("Ask a question based on the book:")
67
- if query:
68
- answer = answer_question(query, faiss_index, book_chunks)
69
- st.write(f"**Answer:** {answer}")
 
1
+ import streamlit as st
2
+ import pdfplumber
3
+ import faiss
4
+ import torch
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import pipeline
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+
10
+ # Load embedding model
11
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
+ qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
13
+
14
+ # Function to extract text from PDF
15
+ def extract_text_from_pdf(pdf_file):
16
+ text = ""
17
+ with pdfplumber.open(pdf_file) as pdf:
18
+ for page in pdf.pages:
19
+ text += page.extract_text() + "\n"
20
+ return text.strip()
21
+
22
+ # Chunking text
23
+ def chunk_text(text, chunk_size=500, overlap=100):
24
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
25
+ return splitter.split_text(text)
26
+
27
+ # Generate embeddings
28
+ def generate_embeddings(text_chunks):
29
+ return embedding_model.encode(text_chunks, convert_to_numpy=True)
30
+
31
+ # Create FAISS index
32
+ def create_faiss_index(embeddings):
33
+ dimension = embeddings.shape[1]
34
+ index = faiss.IndexFlatL2(dimension)
35
+ index.add(embeddings)
36
+ return index
37
+
38
+ # Retrieve relevant context (Increased context size)
39
+ def retrieve_context(query, index, text_chunks, top_k=7):
40
+ query_embedding = embedding_model.encode([query], convert_to_numpy=True)
41
+ distances, indices = index.search(query_embedding, top_k)
42
+ retrieved_text = "\n".join([text_chunks[i] for i in indices[0]])
43
+ return retrieved_text
44
+
45
+ # Generate Answer (Allow longer answers)
46
+ def answer_question(query, faiss_index, book_chunks):
47
+ context = retrieve_context(query, faiss_index, book_chunks)
48
+ result = qa_pipeline(question=query, context=context, max_answer_len=150)
49
+ return result["answer"] + "\n\n**Additional Context:** " + context[:400] + "..."
50
+
51
+ # Streamlit UI
52
+ st.title("📖 Book-Based Question Answering System")
53
+ st.write("Upload a book (PDF) and ask any question!")
54
+
55
+ # File uploader
56
+ uploaded_file = st.file_uploader("Upload a PDF book", type="pdf")
57
+
58
+ if uploaded_file:
59
+ st.write("Processing book...")
60
+ book_text = extract_text_from_pdf(uploaded_file)
61
+ book_chunks = chunk_text(book_text)
62
+ chunk_embeddings = generate_embeddings(book_chunks)
63
+ faiss_index = create_faiss_index(chunk_embeddings)
64
+ st.success(f"Book processed successfully! ({len(book_chunks)} chunks)")
65
+
66
+ query = st.text_input("Ask a question based on the book:")
67
+ if query:
68
+ answer = answer_question(query, faiss_index, book_chunks)
69
+ st.write(f"**Answer:** {answer}")