File size: 4,140 Bytes
febd231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import gradio as gr
from pdf2image import convert_from_path,pdfinfo_from_path
import zipfile

def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))



DIRECTORY = "image_reference"
DIRECTORY_OUTPUT = "output"
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Check and create directories
for directory in DIRECTORIES:
    if not os.path.exists(directory):
        os.makedirs(directory)        
    else:
        pass


ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']
def get_image_files(directory):
    image_files = []
    for filename in os.listdir(directory):
        if filename.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
            filepath = os.path.join(directory, filename)
            image_files.append(filepath)
    return image_files


def clear_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                os.rmdir(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")



def extract_photos_from_pdf(file_pdf):    
    clear_directory(DIRECTORY)   
    clear_directory(DIRECTORY_OUTPUT)     
    try:
        pdf_path = file_pdf.name
        info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)

        total_pages = info["Pages"]  # Total number of pages in the PDF book
        batch_size = 100  # Number of pages to process in each batch

        for start_page in range(0, total_pages, batch_size):
            end_page = min(start_page + batch_size, total_pages)
            images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)
            for idx, image in enumerate(images, start=start_page):
                image.save(f'{DIRECTORY}/{idx+1}.png', 'PNG')

        images_pdf_list = get_image_files(DIRECTORY)
        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]         
        sorted_names = sorted(image_names, key=lambda x: int(x[1].split('.')[0]))               
        zip_folder(DIRECTORY, f'{DIRECTORY_OUTPUT}/all_photos.zip') 
        return (
            gr.Gallery.update(value=sorted_names, label=f"Detected {len(images_pdf_list)} Page{'' if len(images_pdf_list) == 1 else 's'}", show_label=True, visible=True),
            gr.File.update(value=f'{DIRECTORY_OUTPUT}/all_photos.zip',visible=True)
        )
    except:     
        return (
            gr.Gallery.update(value=[], label="Error", show_label=True, visible=True),
            gr.File.update(visible=False)
        )
         
with gr.Blocks() as demo:
    with gr.Tabs() as tabs:        

        with gr.TabItem("PDF",id=0):

            with gr.Row():
                with gr.Column():   
                    proegres = gr.Text(show_label=False,value="",visible=False)                        
                    file_pdf = gr.File(file_types=['.pdf'], label="Upload PDF *")           
                    btn = gr.Button("Extract Photos from PDF")  
                    

    with gr.Tabs(visible=True) as tabs_under:
   
        with gr.TabItem("Photos",id=0):

            with gr.Column():
                
                list_image = gr.Gallery(value=[], label=f"0 Page",visible=True, show_label=True, elem_id="gallery").style(columns=[3], object_fit="cover", height="auto")
                file_download = gr.File(file_types=['.zip'], label="Download File",visible=False) 


    examples = gr.Examples([["./1706.03762.pdf", None]], fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download], cache_examples=False)    
    btn.click(fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download])



                
demo.queue().launch()