JPLTedCas committed on
Commit
f8312e2
·
verified ·
1 Parent(s): 29d9fe7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -3
app.py CHANGED
@@ -44,7 +44,33 @@ def get_pdf_text(pdf_docs : list) -> str:
44
  text += page.extract_text()
45
  return text
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
48
  #def get_text_chunks(text:str) ->list:
49
  # text_splitter = CharacterTextSplitter(
50
  # separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
@@ -167,10 +193,11 @@ def main():
167
  with st.spinner("Procesando"):
168
  # get pdf text
169
  raw_text = get_pdf_text(pdf_docs)
170
-
 
171
  # get the text chunks
172
- text_chunks = get_text_chunks(raw_text)
173
-
174
  # create vector store
175
  vectorstore = get_vectorstore(text_chunks)
176
 
 
44
  text += page.extract_text()
45
  return text
46
 
47
def get_pdf_pages(pdf_docs: list) -> list:
    """
    Load a list of uploaded PDF documents and split them into pages/chunks.

    Each upload is first written to a temporary directory, because
    ``UnstructuredPDFLoader`` is given a filesystem path rather than an
    in-memory buffer.

    Parameters
    ----------
    pdf_docs : list
        Uploaded PDF objects. Each one must expose a ``name`` attribute and
        a ``getbuffer()`` method (presumably Streamlit ``UploadedFile``
        objects -- confirm against the caller).

    Returns
    -------
    list
        The concatenation, in upload order, of whatever
        ``UnstructuredPDFLoader(path).load_and_split()`` returns for each
        PDF. Empty when ``pdf_docs`` is empty.
    """
    import tempfile

    pages = []

    with tempfile.TemporaryDirectory() as tmpdirname:
        for pdf in pdf_docs:
            # The name comes from the upload; keep only its final component
            # so the file can never be written outside the temp directory.
            safe_name = os.path.basename(pdf.name) or "upload.pdf"
            pdf_path = os.path.join(tmpdirname, safe_name)
            with open(pdf_path, "wb") as f:
                f.write(pdf.getbuffer())

            # Load while the temp directory still exists; it is deleted as
            # soon as the ``with`` block exits.
            pdf_loader = UnstructuredPDFLoader(pdf_path)
            pages.extend(pdf_loader.load_and_split())

    return pages
72
 
73
+
74
  #def get_text_chunks(text:str) ->list:
75
  # text_splitter = CharacterTextSplitter(
76
  # separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
 
193
  with st.spinner("Procesando"):
194
  # get pdf text
195
  raw_text = get_pdf_text(pdf_docs)
196
+ pages = get_pdf_pages(pdf_docs)
197
+
198
  # get the text chunks
199
+ #text_chunks = get_text_chunks(raw_text)
200
+ text_chunks = get_text_chunks(pages)
201
  # create vector store
202
  vectorstore = get_vectorstore(text_chunks)
203