File size: 1,010 Bytes
5a95a6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import streamlit as st
from utils.st_def import st_logo, st_read_pdf

# Page setup via the project helpers from utils.st_def.
# NOTE(review): st_logo presumably renders the page header/title -- confirm
# in utils.st_def.
st_logo(
    title="Welcome 👋 to Text Cleaning!",
    page_title="Text Cleaning",
)
# NOTE(review): st_read_pdf presumably stores the loaded reader under
# st.session_state['pdfreader'], which the code below reads -- confirm.
st_read_pdf()
#------------------------------------------------------------------------
import openai, PyPDF2, os, time, pandas as pd

# Extract the text of every page from the PDF reader kept in the session,
# strip unwanted whitespace characters, cache the per-page strings under
# st.session_state['page_text'] and display them.
if 'pdfreader' not in st.session_state:
    st.error('Load PDF before continue ... ')
else:
    pr = st.session_state['pdfreader']
    # Deletion table for tab, carriage return and non-breaking space.
    # translate() removes each character wherever it occurs, whereas
    # str.replace('\t\r', '') would only match a tab immediately followed
    # by a carriage return and leave lone tabs / carriage returns in place.
    unwanted_chars = str.maketrans('', '', '\t\r\xa0')
    with st.spinner('Loading files...'):
        # One cleaned string per page, in page order (the whole pdf --> txt).
        page_text = [
            page.extract_text().translate(unwanted_chars)
            for page in pr.pages
        ]

    st.session_state['page_text'] = page_text
    st.write(page_text)