Spaces:
Running
Running
File size: 2,284 Bytes
549c0e5 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab 5ad10db d9a65ab a37c21d d9a65ab 5ad10db 549c0e5 022e85b 549c0e5 022e85b 549c0e5 5ad10db 549c0e5 284df8b bb404ed 549c0e5 5ad10db 549c0e5 5ad10db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import gradio as gr
import fitz # PyMuPDF
import pandas as pd
from transformers import pipeline
import base64
# Function to convert PDF to DataFrame
def pdf_to_dataframe(uploaded_file):
    """Extract the text of an uploaded PDF into a one-column DataFrame.

    Args:
        uploaded_file: The uploaded PDF. May be ``None``, a filesystem path
            (``str``), raw PDF bytes, or a file-like object. A file-like
            object exposing a string ``.name`` is opened from that path;
            otherwise its ``.read()`` result is parsed as PDF bytes.

    Returns:
        A DataFrame with a single ``"Text"`` column holding one row per line
        of extracted text, in page order. The frame is empty when
        ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        # Nothing was uploaded: return an empty frame with the expected
        # schema instead of failing on an unbound result.
        return pd.DataFrame(columns=["Text"])

    # Text is extracted with PyMuPDF; PDF bytes are a binary container and
    # must be parsed as a document before any text is available.
    if isinstance(uploaded_file, (bytes, bytearray)):
        doc = fitz.open(stream=bytes(uploaded_file), filetype="pdf")
    elif isinstance(uploaded_file, str):
        doc = fitz.open(uploaded_file)
    elif isinstance(getattr(uploaded_file, "name", None), str):
        # Temp-file wrappers carry the on-disk path in ``.name``.
        doc = fitz.open(uploaded_file.name)
    else:
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")

    with doc:
        page_texts = [page.get_text("text") for page in doc]

    # Join the pages, then split so every text line becomes its own row.
    lines = "\n".join(page_texts).split("\n")
    return pd.DataFrame({"Text": lines})
# Function to save DataFrame to Excel
def dataframe_to_excel(df, excel_path):
    """Write ``df`` to an Excel workbook at ``excel_path``.

    The DataFrame's index is left out of the written sheet.
    """
    export_options = {"index": False}
    df.to_excel(excel_path, **export_options)
# Main function
def main():
    """Build the Gradio PDF-to-Excel interface and launch it."""
    import tempfile
    from pathlib import Path

    def pdf_to_excel_function(pdf_file):
        """Convert the uploaded PDF to an Excel workbook and return its path."""
        # Convert PDF to DataFrame
        df = pdf_to_dataframe(pdf_file)
        # Each request writes into its own fresh directory so that
        # concurrent conversions cannot overwrite one another's output,
        # while the downloaded file keeps the name "output.xlsx".
        excel_path = str(Path(tempfile.mkdtemp()) / "output.xlsx")
        dataframe_to_excel(df, excel_path)
        return excel_path

    # Create the Gradio interface
    iface = gr.Interface(
        fn=pdf_to_excel_function,
        inputs=gr.File(label="Upload PDF File"),
        outputs=gr.File(label="Download Excel File"),
        title="PDF to Excel Converter",
        description="Convert a PDF file to an Excel file.",
    )
    # Launch the interface
    iface.launch()
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()