import io
import os

import streamlit as st
from img2table.document import PDF
from img2table.ocr import TesseractOCR


# Streamlit page: upload a PDF, choose a page, extract that page's tables
# with Tesseract OCR, display them, and offer the result as an Excel download.
st.title("Image to Table")

# The upload is always parsed with PDF() below, so only accept PDF files.
uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
page_number = st.number_input("Page number", value=1, min_value=1)
submit = st.button("Submit")

if submit and uploaded_file is not None:
    # The widget is 1-based for the user; convert to the 0-based index
    # passed in the `pages` list.
    pdf = PDF(uploaded_file, pages=[page_number - 1])
    ocr = TesseractOCR(lang="eng")

    pdf_tables = pdf.extract_tables(ocr)
    st.write(pdf_tables)

    # Build the workbook in memory instead of a fixed "output.xlsx" in the
    # working directory: a shared on-disk path lets concurrent sessions
    # overwrite or delete each other's file, and it is left behind if an
    # exception occurs before cleanup.
    xlsx_buffer = io.BytesIO()
    pdf.to_xlsx(xlsx_buffer, ocr=ocr)

    st.download_button(
        label="Download Excel",
        data=xlsx_buffer.getvalue(),
        file_name="output.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )