lodhrangpt committed on
Commit
d9a65ab
·
verified ·
1 Parent(s): 022e85b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -16
app.py CHANGED
@@ -1,30 +1,41 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
  import pandas as pd
 
 
4
 
5
  # Function to convert PDF to DataFrame
6
- def pdf_to_dataframe(pdf_path):
7
  # Open the PDF document
8
- doc = fitz.open(pdf_path)
9
 
10
- # Initialize an empty list to store text blocks
11
- text_blocks = []
12
 
13
- # Iterate through each page in the PDF
14
- for page_num in range(len(doc)):
15
- page = doc.load_page(page_num)
16
- text = page.get_text("text")
17
- print(text)
18
- text_blocks.append(text)
19
 
20
- # Join all text blocks into a single string
21
- full_text = "\n".join(text_blocks)
22
 
23
- # Split the text into lines
24
- lines = full_text.split('\n')
25
 
26
- # Create a DataFrame from the lines
27
- df = pd.DataFrame(lines, columns=['Text'])
 
 
 
 
 
 
 
 
 
28
 
29
  return df
30
 
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
  import pandas as pd
4
+ from transformers import pipeline
5
+ import base64
6
 
7
  # Function to convert PDF to DataFrame
8
+ def pdf_to_dataframe(uploaded_file):
9
  # Open the PDF document
10
+ # doc = fitz.open(pdf_path)
11
 
12
+ # # Initialize an empty list to store text blocks
13
+ # text_blocks = []
14
 
15
+ # # Iterate through each page in the PDF
16
+ # for page_num in range(len(doc)):
17
+ # page = doc.load_page(page_num)
18
+ # text = page.get_text("text")
19
+ # print(text)
20
+ # text_blocks.append(text)
21
 
22
+ # # Join all text blocks into a single string
23
+ # full_text = "\n".join(text_blocks)
24
 
25
+ # # Split the text into lines
26
+ # lines = full_text.split('\n')
27
 
28
+ # # Create a DataFrame from the lines
29
+ if uploaded_file is not None:
30
+ ocr_pipeline = pipeline("text2text-generation", model="google/t5-v1_1-large")
31
+ extracted_text = ocr_pipeline(uploaded_file.read(), max_length=1024, do_sample=False)[0]["generated_text"]
32
+ lines = extracted_text.split("\n")
33
+ data = []
34
+ for line in lines:
35
+ data.append([line])
36
+ df = pd.DataFrame(data, columns=["Text"])
37
+
38
+ # df = pd.DataFrame(lines, columns=['Text'])
39
 
40
  return df
41