Spaces:

lodhrangpt
/

pdf_to_excel

Running

lodhrangpt committed on Nov 19, 2024

Commit

d9a65ab

verified ·

1 Parent(s): 022e85b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,30 +1,41 @@
 import gradio as gr
 import fitz  # PyMuPDF
 import pandas as pd
 # Function to convert PDF to DataFrame
-def pdf_to_dataframe(pdf_path):
     # Open the PDF document
-    doc = fitz.open(pdf_path)
-    # Initialize an empty list to store text blocks
-    text_blocks = []
-    # Iterate through each page in the PDF
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        text = page.get_text("text")
-        print(text)
-        text_blocks.append(text)
-    # Join all text blocks into a single string
-    full_text = "\n".join(text_blocks)
-    # Split the text into lines
-    lines = full_text.split('\n')
-    # Create a DataFrame from the lines
-    df = pd.DataFrame(lines, columns=['Text'])
     return df

 import gradio as gr
 import fitz  # PyMuPDF
 import pandas as pd
+from transformers import pipeline
+import base64
 # Function to convert PDF to DataFrame
+def pdf_to_dataframe(uploaded_file):
     # Open the PDF document
+    # doc = fitz.open(pdf_path)
+    # # Initialize an empty list to store text blocks
+    # text_blocks = []
+    # # Iterate through each page in the PDF
+    # for page_num in range(len(doc)):
+    #     page = doc.load_page(page_num)
+    #     text = page.get_text("text")
+    #     print(text)
+    #     text_blocks.append(text)
+    # # Join all text blocks into a single string
+    # full_text = "\n".join(text_blocks)
+    # # Split the text into lines
+    # lines = full_text.split('\n')
+    # # Create a DataFrame from the lines
+    if uploaded_file is not None:
+        ocr_pipeline = pipeline("text2text-generation", model="google/t5-v1_1-large")
+        extracted_text = ocr_pipeline(uploaded_file.read(), max_length=1024, do_sample=False)[0]["generated_text"]
+        lines = extracted_text.split("\n")
+        data = []
+       for line in lines:
+        data.append([line])
+       df = pd.DataFrame(data, columns=["Text"])
+    # df = pd.DataFrame(lines, columns=['Text'])
     return df