Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -44,7 +44,33 @@ def get_pdf_text(pdf_docs : list) -> str:
|
|
44 |
text += page.extract_text()
|
45 |
return text
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
|
|
48 |
#def get_text_chunks(text:str) ->list:
|
49 |
# text_splitter = CharacterTextSplitter(
|
50 |
# separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
|
@@ -167,10 +193,11 @@ def main():
|
|
167 |
with st.spinner("Procesando"):
|
168 |
# get pdf text
|
169 |
raw_text = get_pdf_text(pdf_docs)
|
170 |
-
|
|
|
171 |
# get the text chunks
|
172 |
-
text_chunks = get_text_chunks(raw_text)
|
173 |
-
|
174 |
# create vector store
|
175 |
vectorstore = get_vectorstore(text_chunks)
|
176 |
|
|
|
44 |
text += page.extract_text()
|
45 |
return text
|
46 |
|
47 |
+
def get_pdf_pages(pdf_docs):
    """
    Load uploaded PDF documents and split them with UnstructuredPDFLoader.

    Each upload is written into a temporary directory so that
    ``UnstructuredPDFLoader`` (which is given a file path) can read it. The
    directory and its files are removed when the function returns.

    Parameters
    ----------
    pdf_docs : list
        Uploaded PDF file objects. Each must expose a ``name`` attribute and
        a ``getbuffer()`` method (presumably Streamlit ``UploadedFile``
        objects -- confirm against the caller).

    Returns
    -------
    list
        The items returned by ``UnstructuredPDFLoader.load_and_split()`` for
        every PDF, concatenated in input order. Empty when ``pdf_docs`` is
        empty.
    """
    import tempfile

    pages = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        for pdf in pdf_docs:
            # Keep only the final path component of the client-supplied name
            # so the file can never be written outside the temp directory.
            pdf_path = os.path.join(tmpdirname, os.path.basename(pdf.name))
            with open(pdf_path, "wb") as f:
                f.write(pdf.getbuffer())

            pdf_loader = UnstructuredPDFLoader(pdf_path)
            # extend() grows the list in place instead of rebuilding it for
            # every PDF as ``pages = pages + pdf_pages`` did.
            pages.extend(pdf_loader.load_and_split())
    return pages
|
72 |
|
73 |
+
|
74 |
#def get_text_chunks(text:str) ->list:
|
75 |
# text_splitter = CharacterTextSplitter(
|
76 |
# separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
|
|
|
193 |
with st.spinner("Procesando"):
|
194 |
# get pdf text
|
195 |
raw_text = get_pdf_text(pdf_docs)
|
196 |
+
pages = get_pdf_pages(pdf_docs)
|
197 |
+
|
198 |
# get the text chunks
|
199 |
+
#text_chunks = get_text_chunks(raw_text)
|
200 |
+
text_chunks = get_text_chunks(pages)
|
201 |
# create vector store
|
202 |
vectorstore = get_vectorstore(text_chunks)
|
203 |
|