Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -7,9 +7,21 @@ import io | |
| 7 | 
             
            from PIL import Image
         | 
| 8 | 
             
            import pandas as pd
         | 
| 9 | 
             
            import pdfplumber
         | 
| 10 | 
            -
            import tempfile | 
| 11 |  | 
| 12 | 
             
            def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 13 | 
             
                try:
         | 
| 14 | 
             
                    with open(pdf_file, 'rb') as file:
         | 
| 15 | 
             
                        text = ""
         | 
| @@ -53,7 +65,6 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()): | |
| 53 | 
             
                                        df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
         | 
| 54 | 
             
                                    tables.append(df)
         | 
| 55 |  | 
| 56 | 
            -
                        # Use a temporary file for the download
         | 
| 57 | 
             
                        with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
         | 
| 58 | 
             
                            if output_format == "JSON":
         | 
| 59 | 
             
                                json_data = {
         | 
| @@ -61,8 +72,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()): | |
| 61 | 
             
                                    "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
         | 
| 62 | 
             
                                    "images": images
         | 
| 63 | 
             
                                }
         | 
| 64 | 
            -
                                json.dump(json_data, tmp, indent=4)
         | 
| 65 | 
            -
                                download_path = tmp.name
         | 
| 66 | 
             
                            elif output_format == "Markdown":
         | 
| 67 | 
             
                                markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
         | 
| 68 | 
             
                                for i, table in enumerate(tables):
         | 
| @@ -73,8 +83,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()): | |
| 73 | 
             
                                for image in images:
         | 
| 74 | 
             
                                    image_path = os.path.join(os.getcwd(), image["filename"])
         | 
| 75 | 
             
                                    markdown_text += f'![Image]({image_path})\n'
         | 
| 76 | 
            -
                                tmp.write(markdown_text.encode('utf-8'))
         | 
| 77 | 
            -
                                download_path = tmp.name
         | 
| 78 | 
             
                            elif output_format == "HTML":
         | 
| 79 | 
             
                                html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
         | 
| 80 | 
             
                                for i, table in enumerate(tables):
         | 
| @@ -85,9 +94,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()): | |
| 85 | 
             
                                for image in images:
         | 
| 86 | 
             
                                    image_path = os.path.join(os.getcwd(), image["filename"])
         | 
| 87 | 
             
                                    html_text += f'<img src="{image_path}" alt="Image"><br>\n'
         | 
| 88 | 
            -
                                tmp.write(html_text.encode('utf-8'))
         | 
| 89 | 
            -
             | 
| 90 | 
            -
                            
         | 
| 91 | 
             
                        return text, download_path
         | 
| 92 |  | 
| 93 | 
             
                except Exception as main_e:
         | 
|  | |
| 7 | 
             
            from PIL import Image
         | 
| 8 | 
             
            import pandas as pd
         | 
| 9 | 
             
            import pdfplumber
         | 
| 10 | 
            +
            import tempfile
         | 
| 11 |  | 
| 12 | 
             
            def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
         | 
| 13 | 
            +
                """
         | 
| 14 | 
            +
                Parses a PDF file, extracts text, tables, and images, and formats the output.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                Args:
         | 
| 17 | 
            +
                    pdf_file: Path to the uploaded PDF file.
         | 
| 18 | 
            +
                    output_format: Desired output format ("JSON", "Markdown", or "HTML").
         | 
| 19 | 
            +
                    progress: Gradio Progress object for displaying progress.
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                Returns:
         | 
| 22 | 
            +
                    tuple: Extracted text and the path to a temporary file containing the output in the specified format.
         | 
| 23 | 
            +
                        Returns an empty string and None if there is an error.
         | 
| 24 | 
            +
                """
         | 
| 25 | 
             
                try:
         | 
| 26 | 
             
                    with open(pdf_file, 'rb') as file:
         | 
| 27 | 
             
                        text = ""
         | 
|  | |
| 65 | 
             
                                        df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
         | 
| 66 | 
             
                                    tables.append(df)
         | 
| 67 |  | 
|  | |
| 68 | 
             
                        with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
         | 
| 69 | 
             
                            if output_format == "JSON":
         | 
| 70 | 
             
                                json_data = {
         | 
|  | |
| 72 | 
             
                                    "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
         | 
| 73 | 
             
                                    "images": images
         | 
| 74 | 
             
                                }
         | 
| 75 | 
            +
                                json.dump(json_data, tmp, indent=4) 
         | 
|  | |
| 76 | 
             
                            elif output_format == "Markdown":
         | 
| 77 | 
             
                                markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
         | 
| 78 | 
             
                                for i, table in enumerate(tables):
         | 
|  | |
| 83 | 
             
                                for image in images:
         | 
| 84 | 
             
                                    image_path = os.path.join(os.getcwd(), image["filename"])
         | 
| 85 | 
             
                                    markdown_text += f'![Image]({image_path})\n'
         | 
| 86 | 
            +
                                tmp.write(markdown_text.encode('utf-8')) 
         | 
|  | |
| 87 | 
             
                            elif output_format == "HTML":
         | 
| 88 | 
             
                                html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
         | 
| 89 | 
             
                                for i, table in enumerate(tables):
         | 
|  | |
| 94 | 
             
                                for image in images:
         | 
| 95 | 
             
                                    image_path = os.path.join(os.getcwd(), image["filename"])
         | 
| 96 | 
             
                                    html_text += f'<img src="{image_path}" alt="Image"><br>\n'
         | 
| 97 | 
            +
                                tmp.write(html_text.encode('utf-8')) 
         | 
| 98 | 
            +
                            download_path = tmp.name
         | 
|  | |
| 99 | 
             
                        return text, download_path
         | 
| 100 |  | 
| 101 | 
             
                except Exception as main_e:
         |