Spaces:

nnpy
/

TableOCR

Running

File size: 1,401 Bytes

decd62f
 
 
962aa5c
 
 
f103fd3
962aa5c
f489156
decd62f
 
 
962aa5c
 
 
 
 
 
 
 
 
 
 
 
 
decd62f

import io
import os
import shutil

import pandas as pd
import streamlit as st
import tabula

# Make a Java runtime available for tabula.
# Streamlit re-executes this script from the top on every widget interaction,
# so the download/install is only attempted when no `java` executable is on
# PATH yet; otherwise the package would be fetched and installed on every rerun.
if shutil.which("java") is None:
    # os.system returns the command's exit status; skip dpkg when the
    # download itself failed so we do not try to install a missing file.
    if os.system('wget https://download.oracle.com/java/17/latest/jdk-17_linux-x64_bin.deb') == 0:
        os.system('sudo dpkg -i jdk-17_linux-x64_bin.deb')

# JAVA_HOME names the JDK installation root (the directory that contains
# bin/), not the bin/ directory itself.
# NOTE(review): assumes the .deb installs under /usr/lib/jvm/jdk-17 - confirm
# against the installed package layout.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/jdk-17"

# CSS injected into the page: sets `display: none` on the listed class
# selectors (the viewer-badge elements, judging by the class names).
_HIDE_BADGE_CSS = """
    <style>
    .css-1jc7ptx, .e1ewe7hr3, .viewerBadge_container__1QSob,
    .styles_viewerBadge__1yB5_, .viewerBadge_link__1S137,
    .viewerBadge_text__1JaDK {
        display: none;
    }
    </style>
    """

# Page heading.
st.title("TableOCR")

# Raw HTML must be explicitly allowed for the <style> tag to be emitted.
st.markdown(_HIDE_BADGE_CSS, unsafe_allow_html=True)

# Collect the PDF and the page number in a form so extraction only runs on an
# explicit submit rather than on every widget change.
with st.form(key='my_form'):
    file = st.file_uploader("Upload a file", type="pdf", accept_multiple_files=False)
    page_no = st.number_input("Enter page number", min_value=1, value=1)
    submit_button = st.form_submit_button(label='Submit')

# Results are rendered outside the form because st.download_button cannot be
# placed inside st.form.
if submit_button and file is not None and page_no is not None:
    with st.spinner("Converting PDF page to image..."):
        tables = tabula.read_pdf(file, pages=page_no, multiple_tables=True)
        # Only the first table found on the page is used; fall back to an
        # empty frame when the page contains no tables.
        table_df = tables[0] if tables else pd.DataFrame()

        # Build the workbook in memory instead of writing a fixed
        # "output.xlsx" on disk: a relative <a href> to a local file is not
        # served by Streamlit, and a shared on-disk path would be overwritten
        # by concurrent sessions.
        excel_buffer = io.BytesIO()
        table_df.to_excel(excel_buffer, index=False)

    st.write("Scroll down to download the output file.")
    st.table(table_df)
    st.download_button(
        label="Click here to download the output file",
        data=excel_buffer.getvalue(),
        file_name="output.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )