yashasgupta committed on
Commit
4e4a24d
·
verified ·
1 Parent(s): cfb6e62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -3
app.py CHANGED
@@ -4,6 +4,7 @@ from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTempla
4
  import os
5
  import nltk
6
  import io
 
7
  nltk.download("punkt")
8
 
9
  st.title(':blue[Langchain:] A Rag System on “Leave No Context Behind” Paper')
@@ -46,6 +47,13 @@ from langchain_google_genai import GoogleGenerativeAIEmbeddings
46
  from langchain_community.vectorstores import Chroma
47
  from langchain_core.runnables import RunnablePassthrough
48
 
 
 
 
 
 
 
 
49
 
50
 
51
  uploaded_file = st.file_uploader("Choose a pdf file",type = "pdf")
@@ -53,10 +61,11 @@ uploaded_file = st.file_uploader("Choose a pdf file",type = "pdf")
53
  if uploaded_file is not None:
54
 
55
  pdf_file = io.BytesIO(uploaded_file.read())
56
- pdf_loader = PDFMinerLoader(pdf_file)
57
- dat_nik = pdf_loader.load()
 
58
  text_splitter = NLTKTextSplitter(chunk_size = 500,chunk_overlap = 100)
59
- chunks = test_splitter.split_documents(dat_nik)
60
 
61
  embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
62
 
 
4
  import os
5
  import nltk
6
  import io
7
+ import fitz
8
  nltk.download("punkt")
9
 
10
  st.title(':blue[Langchain:] A Rag System on “Leave No Context Behind” Paper')
 
47
  from langchain_community.vectorstores import Chroma
48
  from langchain_core.runnables import RunnablePassthrough
49
 
50
+ def extract_text_from_pdf(pdf_file):
51
+ document = fitz.open(stream=pdf_file, filetype="pdf")
52
+ text = ""
53
+ for page_num in range(len(document)):
54
+ page = document.load_page(page_num)
55
+ text += page.get_text()
56
+ return text
57
 
58
 
59
# --- Streamlit flow: upload a PDF, extract its text, chunk it for RAG ---
uploaded_file = st.file_uploader("Choose a pdf file", type="pdf")

if uploaded_file is not None:
    # Wrap the raw upload bytes in a stream so PyMuPDF can open them.
    pdf_file = io.BytesIO(uploaded_file.read())
    text = extract_text_from_pdf(pdf_file)

    # Sentence-aware chunking; the 100-char overlap keeps context shared
    # between adjacent chunks.
    text_splitter = NLTKTextSplitter(chunk_size=500, chunk_overlap=100)
    # BUG FIX: the original called the undefined name `test_splitter`
    # (NameError) and passed a raw string to split_documents(), which
    # expects Document objects. split_text() is the correct API for a
    # plain string; `chunks` is now a list[str].
    # NOTE(review): if downstream code uses Chroma.from_documents, switch
    # it to from_texts (or wrap chunks in Documents) — verify the caller.
    chunks = text_splitter.split_text(text)

    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
71