import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF
import docx
import pptx
import openpyxl
import re
import nltk
from nltk.tokenize import sent_tokenize
import torch
from fastapi import FastAPI
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
from gtts import gTTS
import tempfile
import os
import easyocr
from fpdf import FPDF
import datetime

# Download required NLTK data ('punkt_tab' is the tokenizer resource name used
# by newer NLTK releases)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Initialize components
app = FastAPI()

# Load models (CPU optimized)
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # Force CPU usage
    torch_dtype=torch.float32
)
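# bart-large-cnn can only attend over roughly 1024 input tokens, so long
# documents are split into chunks (see chunk_text below) and summarized
# chunk by chunk before the partial summaries are stitched back together.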

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])  # English only for faster initialization
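# EasyOCR downloads its detection/recognition models the first time a Reader
# is constructed, so the initial startup can take a while.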

def clean_text(text: str) -> str:
    """Clean and normalize document text"""
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'•\s*|\d\.\s+', '', text)  # Remove bullets and numbering
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove brackets/parentheses
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)  # Remove page numbers
    return text.strip()

def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
    """Extract text from various document formats"""
    try:
        if file_extension == "pdf":
            with fitz.open(file_path) as doc:
                text = "\n".join(page.get_text("text") for page in doc)
                # Try OCR for scanned PDFs if text extraction fails
                if len(text.strip()) < 50:
                    # Rasterize only the first page and OCR it as a lightweight fallback
                    pix = doc[0].get_pixmap()
                    temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
                    temp_img.close()  # close the handle so the image can be written by name
                    pix.save(temp_img.name)
                    ocr_result = reader.readtext(temp_img.name, detail=0)
                    os.unlink(temp_img.name)
                    text = "\n".join(ocr_result) if ocr_result else text
                return clean_text(text), ""
            
        elif file_extension == "docx":
            doc = docx.Document(file_path)
            return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
            
        elif file_extension == "pptx":
            prs = pptx.Presentation(file_path)
            text = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        text.append(shape.text)
            return clean_text("\n".join(text)), ""
            
        elif file_extension == "xlsx":
            wb = openpyxl.load_workbook(file_path, read_only=True)
            text = []
            for sheet in wb.sheetnames:
                for row in wb[sheet].iter_rows(values_only=True):
                    text.append(" ".join(str(cell) for cell in row if cell is not None))
            return clean_text("\n".join(text)), ""
            
        elif file_extension in ["jpg", "jpeg", "png"]:
            ocr_result = reader.readtext(file_path, detail=0)
            return clean_text("\n".join(ocr_result)), ""
            
        return "", "Unsupported file format"
    except Exception as e:
        return "", f"Error reading {file_extension.upper()} file: {str(e)}"

def chunk_text(text: str, max_tokens: int = 768) -> list[str]:
    """Split text into manageable chunks for summarization"""
    try:
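        # Sentence-split first; chunk sizes below are counted in whitespace-
        # delimited words as a rough proxy for model tokens, and truncation=True
        # in the pipeline call catches any chunk that still runs long.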
        sentences = sent_tokenize(text)
    except Exception:
        # Fallback if sentence tokenization fails
        words = text.split()
        sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20)]
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

def generate_summary(text: str, length: str = "medium") -> str:
    """Generate summary with appropriate length parameters"""
    length_params = {
        "short": {"max_length": 80, "min_length": 30},
        "medium": {"max_length": 200, "min_length": 80},
        "long": {"max_length": 300, "min_length": 210}
    }
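    # max_length / min_length are per-chunk output lengths (in tokens) handed
    # straight to the summarization pipeline.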
    
    chunks = chunk_text(text)
    summaries = []
    
    for chunk in chunks:
        try:
            summary = summarizer(
                chunk,
                max_length=length_params[length]["max_length"],
                min_length=length_params[length]["min_length"],
                do_sample=False,
                truncation=True,
                no_repeat_ngram_size=2,
                num_beams=2,
                early_stopping=True
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            summaries.append(f"[Chunk error: {str(e)}]")
    
    # Combine and format the final summary
    final_summary = " ".join(summaries)
    # Capitalize sentence starts without lowercasing the rest
    # (str.capitalize would mangle acronyms and proper nouns)
    sentences = [s.strip() for s in final_summary.split(". ") if s.strip()]
    final_summary = ". ".join(s[0].upper() + s[1:] for s in sentences)
    return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"

def text_to_speech(text: str) -> str:
    """Convert text to speech and return temporary audio file path"""
    try:
        tts = gTTS(text)
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tts.save(temp_audio.name)
        return temp_audio.name
    except Exception as e:
        print(f"Error in text-to-speech: {e}")
        return ""

def create_pdf(summary: str, original_filename: str) -> str:
    """Create a PDF file from the summary text"""
    try:
        # Create PDF object
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        
        # Add title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
        pdf.set_font("Arial", size=12)
        
        # Add metadata
        pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)
        
        # Add summary content (the built-in PDF fonts only cover Latin-1, so
        # replace unsupported characters instead of letting FPDF raise)
        safe_summary = summary.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, txt=safe_summary)
        
        # Save to temporary file
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        pdf.output(temp_pdf.name)
        return temp_pdf.name
    except Exception as e:
        print(f"Error creating PDF: {e}")
        return ""

def summarize_document(file, summary_length: str, enable_tts: bool):
    """Main processing function for Gradio interface"""
    if file is None:
        return "Please upload a document first", "Ready", None, None
    
    file_path = file.name
    file_extension = file_path.split(".")[-1].lower()
    original_filename = os.path.basename(file_path)
    
    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "Error", None, None
    
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "Ready", None, None
    
    try:
        summary = generate_summary(text, summary_length)
        audio_path = text_to_speech(summary) if enable_tts else None
        pdf_path = create_pdf(summary, original_filename) if summary else None
        return summary, "Summary complete", audio_path, pdf_path
    except Exception as e:
        return f"Summarization error: {str(e)}", "Error", None, None

# Gradio Interface
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Advanced Document Summarizer")
    gr.Markdown("Upload a document to generate a summary with optional audio reading and PDF download")
    
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
                type="filepath"
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length"
            )
            tts_checkbox = gr.Checkbox(
                label="Enable Text-to-Speech",
                value=False
            )
            submit_btn = gr.Button("Generate Summary", variant="primary")
        
        with gr.Column():
            output = gr.Textbox(label="Summary", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Summary as PDF", visible=False)
    
    def toggle_audio_visibility(enable_tts):
        return gr.Audio(visible=enable_tts)
    
    def update_ui(summary, status, audio_path, pdf_path):
        return (
            summary,
            status,
            gr.Audio(visible=audio_path is not None, value=audio_path),
            gr.File(visible=pdf_path is not None, value=pdf_path)
        )
    
    tts_checkbox.change(
        fn=toggle_audio_visibility,
        inputs=tts_checkbox,
        outputs=audio_output
    )
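
    # Two-step wiring below: summarize_document fills the outputs first, then
    # update_ui toggles audio/PDF visibility based on whether files were produced.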
    
    submit_btn.click(
        fn=summarize_document,
        inputs=[file_input, length_radio, tts_checkbox],
        outputs=[output, status, audio_output, pdf_download]
    ).then(
        fn=update_ui,
        inputs=[output, status, audio_output, pdf_download],
        outputs=[output, status, audio_output, pdf_download]
    )

# FastAPI endpoints for files
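# Generated audio/PDF files live in the system temp directory and are served
# here by bare file name.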
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    if os.path.exists(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)

# Mount Gradio app to FastAPI under a sub-path so the routes above stay
# reachable and the root redirect below has somewhere to point (mounting at
# "/" would shadow it and make the redirect loop back onto itself)
app = gr.mount_gradio_app(app, demo, path="/gradio")

@app.get("/")
def redirect_to_interface():
    return RedirectResponse(url="/gradio")
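

# Minimal local-launch sketch, assuming uvicorn is installed; platforms that
# load `app` as an ASGI application can ignore this block.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)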