import io
import os
import shutil

import pandas as pd
import streamlit as st
import tabula
# --- Java runtime bootstrap --------------------------------------------------
# The Oracle JDK package is fetched and installed at start-up so that a `java`
# executable is available on the host.
_JDK_DEB_URL = "https://download.oracle.com/java/17/latest/jdk-17_linux-x64_bin.deb"
_JDK_DEB_FILE = "jdk-17_linux-x64_bin.deb"
# JAVA_HOME conventionally names the JDK root directory, not its bin/ folder.
# NOTE(review): assumes the .deb installs under /usr/lib/jvm/jdk-17, as the
# previous JAVA_HOME value implied -- confirm on the target image.
_JDK_HOME = "/usr/lib/jvm/jdk-17"
_JDK_BIN = os.path.join(_JDK_HOME, "bin")


def _expose_jdk():
    """Point JAVA_HOME and PATH at the JDK directory, if it exists on disk."""
    if not os.path.isdir(_JDK_BIN):
        return
    os.environ["JAVA_HOME"] = _JDK_HOME
    search_path = os.environ.get("PATH", "")
    if _JDK_BIN not in search_path.split(os.pathsep):
        os.environ["PATH"] = _JDK_BIN + os.pathsep + search_path


_expose_jdk()
# Streamlit re-runs this script on every user interaction, so only download
# and install the JDK when no `java` executable can be found.
if shutil.which("java") is None:
    # `-O` pins the output name; without it repeated downloads pile up as
    # "*.deb.1", "*.deb.2", ... next to the script.
    if os.system(f"wget -O {_JDK_DEB_FILE} {_JDK_DEB_URL}") == 0:
        os.system(f"sudo dpkg -i {_JDK_DEB_FILE}")
        _expose_jdk()
# --- Page header -------------------------------------------------------------
st.title("TableOCR")

# Slot for raw HTML/CSS injected into the page; at present it holds nothing
# but a single newline, so only an empty markdown element is emitted.
_page_markup = "\n"
st.markdown(_page_markup, unsafe_allow_html=True)
# --- Upload form -------------------------------------------------------------
# Collect the PDF and the page to scan; nothing is processed until the user
# presses "Submit".
with st.form(key="my_form"):
    file = st.file_uploader("Upload a file", type="pdf", accept_multiple_files=False)
    page_no = st.number_input("Enter page number", min_value=1, value=1)
    submit_button = st.form_submit_button(label="Submit")

# --- Table extraction and download -------------------------------------------
# This section lives outside the form because Streamlit does not allow a
# download button inside st.form.
if submit_button and file is None:
    st.warning("Please upload a PDF file before submitting.")
elif submit_button:
    with st.spinner("Extracting tables from the PDF page..."):
        tables = tabula.read_pdf(file, pages=int(page_no), multiple_tables=True)

    if not tables:
        st.warning(f"No tables were found on page {int(page_no)}.")
    else:
        # Only the first table detected on the page is shown and exported.
        table_df = tables[0]
        st.write("Scroll down to download the output file.")
        st.table(table_df)

        # Build the workbook in memory: a fixed "output.xlsx" on the server's
        # disk would be shared (and overwritten) by every concurrent session,
        # and the browser could not reach it anyway.
        import io  # local import keeps this section self-contained

        workbook = io.BytesIO()
        table_df.to_excel(workbook, index=False)
        # Clicking the button delivers the file; Streamlit then re-runs the
        # script, which returns the page to the empty form.
        st.download_button(
            label="Click here to download the output file",
            data=workbook.getvalue(),
            file_name="output.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )