chat-PDF-demo

Sleeping

JPLTedCas committed on Feb 3, 2024

Commit

f8312e2

verified ·

1 Parent(s): 29d9fe7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,7 +44,33 @@ def get_pdf_text(pdf_docs : list) -> str:
             text += page.extract_text()
     return text
 #def get_text_chunks(text:str) ->list:
 #    text_splitter = CharacterTextSplitter(
 #        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
@@ -167,10 +193,11 @@ def main():
             with st.spinner("Procesando"):
                 # get pdf text
                 raw_text = get_pdf_text(pdf_docs)
                 # get the text chunks
-                text_chunks = get_text_chunks(raw_text)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)

             text += page.extract_text()
     return text
+def get_pdf_pages(pdf_docs):
+    """
+    Extract text from a list of PDF documents.
+    Parameters
+    ----------
+    pdf_docs : list
+        List of PDF documents to extract text from.
+    Returns
+    -------
+    str
+        Extracted text from all the PDF documents.
+    """
+    pages = []
+    import tempfile
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        for pdf in pdf_docs:
+            pdf_path=os.path.join(tmpdirname,pdf.name)
+            with open(pdf_path, "wb") as f:
+               f.write(pdf.getbuffer())
+            pdf_loader = UnstructuredPDFLoader(pdf_path)
+            pdf_pages = pdf_loader.load_and_split()
+            pages=pages+pdf_pages
+    return pages
 #def get_text_chunks(text:str) ->list:
 #    text_splitter = CharacterTextSplitter(
 #        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
             with st.spinner("Procesando"):
                 # get pdf text
                 raw_text = get_pdf_text(pdf_docs)
+                pages = get_pdf_pages(pdf_docs)
                 # get the text chunks
+                #text_chunks = get_text_chunks(raw_text)
+                text_chunks = get_text_chunks(pages)
                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)