Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,115 +1,92 @@
|
|
1 |
-
from fastapi import FastAPI, UploadFile, File
|
2 |
-
from fastapi.responses import RedirectResponse
|
3 |
import gradio as gr
|
4 |
-
from transformers import pipeline
|
5 |
-
import tempfile
|
6 |
-
import os
|
7 |
-
from PIL import Image
|
8 |
import fitz # PyMuPDF
|
9 |
import docx
|
10 |
-
import
|
|
|
|
|
11 |
|
12 |
-
|
|
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
-
IMAGE_CAPTIONING_MODEL = "Salesforce/blip-image-captioning-base" # 300MB
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
model=SUMMARIZATION_MODEL,
|
23 |
-
device="cpu"
|
24 |
-
)
|
25 |
-
except Exception as e:
|
26 |
-
print(f"Error loading summarizer: {e}")
|
27 |
-
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6") # Fallback 250MB model
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
34 |
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
return
|
46 |
-
|
47 |
-
return "
|
48 |
-
except Exception as e:
|
49 |
-
return f"Error reading file: {str(e)}"
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
return
|
|
|
|
|
|
|
69 |
|
70 |
-
def process_image(image):
|
71 |
-
"""Handle image captioning and OCR"""
|
72 |
try:
|
73 |
-
|
74 |
-
|
75 |
-
# Get caption
|
76 |
-
caption = captioner(img)[0]['generated_text']
|
77 |
-
|
78 |
-
# Get OCR text
|
79 |
-
ocr_result = reader.readtext(img)
|
80 |
-
ocr_text = " ".join([res[1] for res in ocr_result])
|
81 |
-
|
82 |
-
return {
|
83 |
-
"caption": caption,
|
84 |
-
"ocr_text": ocr_text if ocr_text else "No readable text found"
|
85 |
-
}
|
86 |
except Exception as e:
|
87 |
-
return
|
88 |
|
89 |
# Gradio Interface
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
gr.Markdown("Get captions and extracted text from images")
|
101 |
-
img_input = gr.Image(type="filepath", label="Upload Image")
|
102 |
-
with gr.Accordion("Results", open=False):
|
103 |
-
caption_output = gr.Textbox(label="Image Caption")
|
104 |
-
ocr_output = gr.Textbox(label="Extracted Text")
|
105 |
-
img_button = gr.Button("Analyze")
|
106 |
-
|
107 |
-
doc_button.click(process_document, inputs=doc_input, outputs=doc_output)
|
108 |
-
img_button.click(process_image, inputs=img_input, outputs=[caption_output, ocr_output])
|
109 |
|
110 |
-
# Mount Gradio
|
111 |
app = gr.mount_gradio_app(app, demo, path="/")
|
112 |
|
|
|
113 |
@app.get("/")
|
114 |
def redirect_to_interface():
|
115 |
-
return RedirectResponse(url="/")
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import pipeline
|
|
|
|
|
|
|
3 |
import fitz # PyMuPDF
|
4 |
import docx
|
5 |
+
import pptx
|
6 |
+
import openpyxl
|
7 |
+
import os
|
8 |
|
9 |
+
from fastapi import FastAPI
|
10 |
+
from fastapi.responses import RedirectResponse
|
11 |
|
12 |
+
# Load your custom summarization model.
# The Hub repository id is kept in one named constant so the checkpoint can be
# swapped without touching the pipeline call.
SUMMARIZATION_MODEL_ID = "FeruzaBoynazarovaas/my_awesome_billsum_model"
pipe = pipeline(task="text2text-generation", model=SUMMARIZATION_MODEL_ID)
|
|
|
14 |
|
15 |
+
# Document text extraction function
|
16 |
+
def _read_pdf(path):
    """Return the text of every page of the PDF at *path*, newline-joined."""
    with fitz.open(path) as pdf:
        return "\n".join(page.get_text("text") for page in pdf)


def _read_docx(path):
    """Return the paragraphs of the DOCX at *path*, newline-joined."""
    return "\n".join(p.text for p in docx.Document(path).paragraphs)


def _read_pptx(path):
    """Return the text of every text-bearing shape, one shape per line."""
    lines = []
    for slide in pptx.Presentation(path).slides:
        for shape in slide.shapes:
            # Pictures, connectors, etc. expose no ``text`` attribute.
            if hasattr(shape, "text"):
                lines.append(shape.text + "\n")
    return "".join(lines)


def _read_xlsx(path):
    """Return every worksheet row as one line of space-separated cell values."""
    workbook = openpyxl.load_workbook(path)
    lines = []
    # ``worksheets`` skips chartsheets, which have no ``iter_rows``.
    for sheet in workbook.worksheets:
        for row in sheet.iter_rows(values_only=True):
            # Only skip genuinely empty cells; 0 and False are real data.
            cells = [str(cell) for cell in row if cell is not None]
            lines.append(" ".join(cells) + "\n")
    return "".join(lines)


# Lower-case extension -> (label used in error messages, reader function).
_READERS = {
    "pdf": ("PDF", _read_pdf),
    "docx": ("DOCX", _read_docx),
    "pptx": ("PPTX", _read_pptx),
    "xlsx": ("XLSX", _read_xlsx),
}


def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, PPTX or XLSX file.

    Args:
        file: The uploaded file, either an object exposing its on-disk path
            as ``.name`` or the path itself as a string.

    Returns:
        The extracted text on success. Failures are reported as a message
        instead of an exception:

        * ``"Error: no file uploaded"`` when *file* is ``None``;
        * ``"Error reading <FORMAT>: <details>"`` when parsing fails;
        * ``"Unsupported file format"`` for any other extension.
    """
    if file is None:
        return "Error: no file uploaded"

    # Accept both upload objects (path in ``.name``) and bare path strings.
    path = getattr(file, "name", file)
    ext = os.path.splitext(str(path))[1].lstrip(".").lower()

    entry = _READERS.get(ext)
    if entry is None:
        return "Unsupported file format"

    label, reader = entry
    try:
        return reader(path)
    except Exception as e:
        # The UI shows this string to the user, so any parser failure is
        # turned into a readable message rather than propagated.
        return f"Error reading {label}: {e}"
|
58 |
+
|
59 |
+
# Summarization logic
|
60 |
+
def summarize_document(file):
    """Summarize an uploaded document to roughly 20% of its word count.

    Args:
        file: The uploaded file as received from the Gradio ``File`` input;
            it is passed to ``extract_text`` unchanged.

    Returns:
        The generated summary, or a human-readable message when no file was
        given, the text could not be extracted, the document is empty, or
        the model call fails.
    """
    if file is None:
        return "Error: no file uploaded"

    text = extract_text(file)

    # Detect extraction failures by their exact message shape. A plain
    # substring test ('"Error" in text') would also match any document that
    # merely contains the word "Error" and return it unsummarized.
    if text == "Unsupported file format" or text.startswith("Error reading "):
        return text

    if not text.strip():
        return "Error: no text could be extracted from the document"

    word_count = len(text.split())
    max_summary_len = max(20, int(word_count * 0.2))
    min_summary_len = int(max_summary_len * 0.6)
    # NOTE(review): the lengths above are derived from a word count but are
    # presumably interpreted by the pipeline as token counts, and they are
    # unbounded for very long documents — confirm against the model's limits.

    try:
        summary = pipe(
            text,
            max_length=max_summary_len,
            min_length=min_summary_len,
            do_sample=False,
            # Clip over-long inputs to the model's maximum input length
            # instead of feeding the whole document to the model.
            truncation=True,
        )
        return summary[0]["generated_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
|
73 |
|
74 |
# Gradio Interface
|
75 |
+
# Upload widget restricted to the four supported document formats.
_document_upload = gr.File(
    label="Upload a document (PDF, DOCX, PPTX, XLSX)",
    file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
)

# Single text box that displays the generated summary (or an error message).
_summary_box = gr.Textbox(label="20% Summary")

# One-input / one-output UI wired to summarize_document.
demo = gr.Interface(
    fn=summarize_document,
    inputs=_document_upload,
    outputs=_summary_box,
    title="π Document Summarizer (20% Length)",
    description="Upload a document and get a concise summary generated by your custom Hugging Face model.",
)
|
82 |
+
|
83 |
+
# FastAPI setup
app = FastAPI()

# Mount Gradio at "/". The mount answers the site root itself, so no separate
# root route is required.
# NOTE: the former `@app.get("/")` handler that returned
# `RedirectResponse(url="/")` was dropped: it was registered after this mount
# (so the mount matched first and the handler was never reached), and had it
# been reached it would have redirected "/" to itself indefinitely.
app = gr.mount_gradio_app(app, demo, path="/")
|