import os
import gradio as gr
import subprocess

# Convert an ebook to plain text with Calibre's ebook-convert command-line tool
def convert_to_txt(input_file):
    output_txt = os.path.splitext(input_file)[0] + ".txt"
    if not os.path.exists(output_txt):
        subprocess.run(["ebook-convert", input_file, output_txt], check=True)
    return output_txt
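
# Example (hypothetical path): convert_to_txt("uploads/novel.epub") runs
# `ebook-convert uploads/novel.epub uploads/novel.txt` and returns
# "uploads/novel.txt", reusing the .txt if it already exists. Calibre's
# ebook-convert must be installed and on the PATH.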

# Process an uploaded book with BookNLP and return a zip of the results
def process_book(file):
    import shutil
    import spacy
    from booknlp.booknlp import BookNLP
    from spacy.cli import download
    
    # This will download any missing BookNLP model files from my Hugging Face backup
    import download_missing_booknlp_models 
    
    input_file = file.name
    output_dir = "output_dir/booknlp_output/"
    book_id = os.path.splitext(os.path.basename(input_file))[0]
    
    # Ensure the spaCy English model is available, downloading it on first run
    def ensure_spacy_model():
        try:
            spacy.load("en_core_web_sm")
        except OSError:
            download("en_core_web_sm")

    # Make sure the spaCy model is present, then initialize BookNLP
    ensure_spacy_model()
    model_params = {
        "pipeline": "entity,quote,supersense,event,coref", 
        "model": "big"
    }
    booknlp = BookNLP("en", model_params)
    
    # Check if the file is already a .txt file
    if not input_file.endswith(".txt"):
        input_file = convert_to_txt(input_file)

    # Start from a clean output directory for this run
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Run BookNLP
    booknlp.process(input_file, output_dir, book_id)
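    # Note: BookNLP typically writes files named {book_id}.tokens, .entities,
    # .quotes, .supersense, .event and .book into output_dir; all of them end
    # up in the zip returned below.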
    
    # Zip the output folder; write the archive next to it rather than inside it
    # so the zip does not end up containing (a partial copy of) itself
    zip_file = shutil.make_archive(os.path.join("output_dir", f"{book_id}_output"), 'zip', output_dir)
    return zip_file

# Gradio Interface
def gradio_interface():
    # Define supported file formats
    supported_formats = [
        '.azw', '.azw3', '.azw4', '.cbz', '.cbr', '.cb7', '.cbc', '.chm', 
        '.djvu', '.docx', '.epub', '.fb2', '.fbz', '.html', '.htmlz', '.lit', 
        '.lrf', '.mobi', '.odt', '.pdf', '.prc', '.pdb', '.pml', '.rb', 
        '.rtf', '.snb', '.tcr', '.txt', '.txtz'
    ]

    file_input = gr.File(file_types=supported_formats, label="Upload an ebook file (.azw, .epub, .pdf, .txt, etc.)")
    file_output = gr.File(label="Download the output files")
    
    # Show supported formats in the description
    description = f"Upload any of the supported formats: {', '.join(supported_formats)}. A .txt upload is passed to BookNLP directly; any other format is first converted to .txt with Calibre."

    gr.Interface(
        fn=process_book, 
        inputs=file_input, 
        outputs=file_output, 
        title="BookNLP Processor with Ebook Support",
        description=description,
        article="This interface is based on [BookNLP](https://github.com/booknlp/booknlp)."
    ).launch()

if __name__ == "__main__":
    gradio_interface()