chromadb / pages /8_🧹2_Read_PDF.py
aiXpert's picture
init
5a95a6c
raw
history blame
1.01 kB
import streamlit as st
from utils.st_def import st_logo, st_read_pdf
# Call the shared st_logo helper with this page's title strings
# (what it actually renders is defined in utils.st_def, not here).
st_logo(
    title="Welcome 👋 to Text Cleaning!",
    page_title="Text Cleaning",
)

# Run the project's PDF-reading helper.
# NOTE(review): the extraction code further down expects a 'pdfreader' entry
# in st.session_state — presumably this helper stores it; confirm in
# utils.st_def.
st_read_pdf()
#------------------------------------------------------------------------
import openai, PyPDF2, os, time, pandas as pd
# Extract the text of every page from the PDF reader stored in the session,
# strip unwanted whitespace characters, cache the per-page strings under
# st.session_state['page_text'], and display them.
if 'pdfreader' not in st.session_state:
    # Nothing has been stored under 'pdfreader' yet, so there is no document
    # to extract from; tell the user and do nothing else.
    st.error('Load PDF before continue ... ')
else:
    # NOTE(review): presumably a PyPDF2 PdfReader (it exposes `.pages` whose
    # items have `.extract_text()`); confirm where 'pdfreader' is stored.
    pdf_reader = st.session_state['pdfreader']

    # Deletion table for tabs, carriage returns and non-breaking spaces.
    # The earlier `.replace('\t\r', '')` only matched a tab immediately
    # followed by a carriage return, so a lone tab or a lone carriage return
    # was left in the text even though the intent was to drop both.
    unwanted_chars = str.maketrans('', '', '\t\r\xa0')

    with st.spinner('Loading files...'):
        # One cleaned string per page, in page order.
        # `or ''` is defensive: NOTE(review) some PDF library versions may
        # yield no text object for an empty page — confirm for the version
        # in use. With a normal string result it changes nothing.
        page_text = [
            (page.extract_text() or '').translate(unwanted_chars)
            for page in pdf_reader.pages
        ]

    # Keep the cleaned pages in the session under 'page_text', then show them.
    st.session_state['page_text'] = page_text
    st.write(page_text)