Spaces:
Sleeping
Sleeping
File size: 4,036 Bytes
7dec78f c506d0d f598e4b 7dec78f f15272f 7dec78f f15272f 7dec78f f15272f 849e175 f15272f c506d0d 849e175 f15272f 7dec78f f15272f 7dec78f f598e4b 7dec78f f15272f 7dec78f f598e4b 7dec78f f15272f 7dec78f c506d0d 7dec78f 849e175 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import html
import json
import os # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
def _iter_layout_elements(container):
    """Yield every layout element in *container*, descending into figures."""
    for element in container:
        yield element
        # Embedded images usually sit inside an LTFigure rather than directly
        # on the page, so figures have to be walked recursively.
        if isinstance(element, LTFigure):
            yield from _iter_layout_elements(element)


def _render_output(text, tables, images, output_format):
    """Serialize the extracted data; return ``(payload_bytes, file_extension)``."""
    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": tables,  # Always empty until table extraction exists
            "images": images,  # List of dictionaries with filenames
        }
        return json.dumps(json_data).encode("utf-8"), ".json"

    if output_format == "Markdown":
        # Image files are written next to the output file, so bare filenames
        # work as relative links.
        image_links = "".join(
            f"![{image['filename']}]({image['filename']})\n" for image in images
        )
        markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n{image_links}"
        return markdown_text.encode("utf-8"), ".md"

    if output_format == "HTML":
        # Text from the PDF is untrusted: escape it so markup characters in
        # the document cannot break or inject into the generated page.
        html_tables = "<table></table>"  # Placeholder: no table data yet
        image_tags = "".join(
            f'<img src="{html.escape(image["filename"], quote=True)}">\n'
            for image in images
        )
        html_text = (
            f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n{html_tables}"
            f"\n\n<h2>Images</h2>\n{image_tags}"
        )
        return html_text.encode("utf-8"), ".html"

    # No (or an unknown) format selected: fall back to the plain text.
    return text.encode("utf-8"), ".txt"


def parse_pdf(pdf_file, output_format):
    """Extract text and images from a PDF and write them in the chosen format.

    Args:
        pdf_file: Path to the PDF, or an object exposing the path as ``.name``
            (the form some Gradio versions pass for file inputs).
        output_format: ``"JSON"``, ``"Markdown"`` or ``"HTML"``. Any other
            value (including ``None``) produces a plain-text file.

    Returns:
        A ``(text, output_path)`` tuple: the concatenated text of all
        horizontal text boxes, and the path of the generated output file.
        Extracted images are written into the same directory as that file.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        pdf_path = pdf_file
    else:
        # Tempfile-style wrappers carry the on-disk path in ``.name``.
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # A fresh directory per call keeps concurrent requests from overwriting
    # each other's ``extracted_image_N.jpg`` files.
    output_dir = tempfile.mkdtemp(prefix="pdf_parser_")

    text_parts = []
    tables = []  # Placeholder: table extraction is not implemented
    images = []  # One {"filename": ...} dict per extracted image

    with open(pdf_file if pdf_path is pdf_file else pdf_path, "rb") as file:
        # extract_pages is lazy, so it must be consumed while the file is open.
        for page in extract_pages(file):
            for element in _iter_layout_elements(page):
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, LTImage):
                    image_filename = f"extracted_image_{len(images)}.jpg"
                    image_path = os.path.join(output_dir, image_filename)
                    # PDFStream exposes its bytes through get_data(); it has
                    # no read() method.
                    # NOTE(review): the bytes are only a valid JPEG when the
                    # stream is DCT-encoded; other encodings yield raw pixel
                    # data under a .jpg name -- confirm whether conversion
                    # is wanted.
                    with open(image_path, "wb") as image_file:
                        image_file.write(element.stream.get_data())
                    images.append({"filename": image_filename})

    text = "".join(text_parts)

    download_data, extension = _render_output(text, tables, images, output_format)
    output_path = os.path.join(output_dir, f"parsed_output{extension}")
    with open(output_path, "wb") as output_file:
        output_file.write(download_data)

    # Return the converted document rather than the first image, which did
    # not exist for image-free PDFs and was never the requested output.
    return text, output_path
# Gradio UI definition: takes an uploaded file and an output-format choice,
# passes both to parse_pdf, and shows the extracted text alongside a
# downloadable file.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # A default selection ensures parse_pdf never receives None as the
        # output format when the user submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)
if __name__ == "__main__": |