ikraamkb committed on
Commit
3a44bf2
·
verified ·
1 Parent(s): c3071ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -95
app.py CHANGED
@@ -1,115 +1,92 @@
1
- from fastapi import FastAPI, UploadFile, File
2
- from fastapi.responses import RedirectResponse
3
  import gradio as gr
4
- from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
5
- import tempfile
6
- import os
7
- from PIL import Image
8
  import fitz # PyMuPDF
9
  import docx
10
- import easyocr
 
 
11
 
12
- app = FastAPI()
 
13
 
14
# Checkpoints used by the lightweight build (sizes are approximate downloads).
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"  # 500MB
IMAGE_CAPTIONING_MODEL = "Salesforce/blip-image-captioning-base"  # 300MB

# Summarizer: try the primary checkpoint first; if loading fails for any
# reason, report it and fall back to a smaller distilled checkpoint.
try:
    summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL, device="cpu")
except Exception as load_error:
    print(f"Error loading summarizer: {load_error}")
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")  # Fallback 250MB model

# Image captioning pipeline, also pinned to CPU.
captioner = pipeline("image-to-text", model=IMAGE_CAPTIONING_MODEL, device="cpu")

# English-only OCR reader.
reader = easyocr.Reader(['en'])
 
 
 
 
 
36
 
37
def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract plain text from a PDF or DOCX document.

    Args:
        file_path: Path of the document on disk.
        file_type: Extension identifying the format. Matching ignores case,
            surrounding whitespace and a leading dot, so "pdf", "PDF" and
            ".pdf" are equivalent.

    Returns:
        The extracted text (pages / paragraphs joined with newlines). On
        failure the function does not raise: it returns a message starting
        with "Error reading file:"; for any other format it returns the
        "Unsupported file format ..." message.
    """
    # Normalise so callers may pass the raw suffix from os.path.splitext.
    normalized = (file_type or "").strip().lower().lstrip(".")
    try:
        if normalized == "pdf":
            with fitz.open(file_path) as pdf:
                return "\n".join(page.get_text() for page in pdf)
        if normalized == "docx":
            document = docx.Document(file_path)
            return "\n".join(p.text for p in document.paragraphs)
    except Exception as e:
        # Callers expect a string in every case, so report instead of raising.
        return f"Error reading file: {str(e)}"
    return "Unsupported file format (only PDF/DOCX supported in lightweight version)"
50
 
51
def process_document(file):
    """Summarize an uploaded PDF or DOCX document.

    Args:
        file: Uploaded file object exposing ``.name`` (used for the
            extension) and ``.read()`` (used to copy the content).

    Returns:
        The summary text, or a human-readable message when the format is
        unsupported, the document cannot be read, no text is found, or any
        other step fails. The function never raises.
    """
    tmp_path = None
    try:
        file_ext = os.path.splitext(file.name)[1][1:].lower()
        if file_ext not in ["pdf", "docx"]:
            return "Lightweight version only supports PDF and DOCX"

        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
            # Record the path before writing so cleanup also covers a failed write.
            tmp_path = tmp.name
            tmp.write(file.read())

        text = extract_text_from_file(tmp_path, file_ext)
        # The extractor reports failures as text; do not summarize those.
        if text.startswith("Error reading file:"):
            return text
        if not text.strip():
            return "No extractable text found in document"

        # truncation=True keeps over-long documents within the model's input limit.
        result = summarizer(
            text,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:
        return f"Processing error: {str(e)}"
    finally:
        # Always remove the temporary copy, including on error paths.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
 
 
 
69
 
70
def process_image(image):
    """Caption an image and extract any readable text from it.

    Args:
        image: Path of the uploaded image (the UI passes a file path), or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(caption, ocr_text)`` tuple, one value per output textbox wired
        to this handler. On failure the first element carries the error
        message and the second is empty.
    """
    if image is None:
        return "No image provided", ""

    try:
        # Get caption
        with Image.open(image) as img:
            caption = captioner(img)[0]['generated_text']

        # Get OCR text; the reader is given the file path rather than the PIL object.
        ocr_result = reader.readtext(image)
        ocr_text = " ".join(res[1] for res in ocr_result)

        return caption, (ocr_text if ocr_text else "No readable text found")
    except Exception as e:
        return f"Error: {str(e)}", ""
88
 
89
  # Gradio Interface
90
# Two-tab UI: document summarization and image analysis.
with gr.Blocks(title="Lightweight Document & Image Analysis") as demo:
    gr.Markdown("## 📄 Lightweight Document & Image Analysis")

    with gr.Tab("Document Summarization"):
        gr.Markdown("Supports PDF and DOCX files (max 10MB)")
        doc_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx"])
        doc_output = gr.Textbox(label="Summary")
        doc_button = gr.Button("Summarize")

    with gr.Tab("Image Analysis"):
        gr.Markdown("Get captions and extracted text from images")
        img_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Accordion("Results", open=False):
            caption_output = gr.Textbox(label="Image Caption")
            ocr_output = gr.Textbox(label="Extracted Text")
        img_button = gr.Button("Analyze")

    # Event wiring must happen inside the Blocks context.
    doc_button.click(process_document, inputs=doc_input, outputs=doc_output)
    img_button.click(process_image, inputs=img_input, outputs=[caption_output, ocr_output])

# Mount Gradio app at the site root. No separate "/" redirect route is
# registered: the mount already serves "/", and a route redirecting "/" to
# "/" would be shadowed by the mount (and would loop if it were reached).
app = gr.mount_gradio_app(app, demo, path="/")
 
 
 
1
  import gradio as gr
2
+ from transformers import pipeline
 
 
 
3
  import fitz # PyMuPDF
4
  import docx
5
+ import pptx
6
+ import openpyxl
7
+ import os
8
 
9
+ from fastapi import FastAPI
10
+ from fastapi.responses import RedirectResponse
11
 
12
+ # Load your custom summarization model
13
+ pipe = pipeline("text2text-generation", model="FeruzaBoynazarovaas/my_awesome_billsum_model")
 
14
 
15
+ # Document text extraction function
16
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, PPTX or XLSX file.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as ``.name``.

    Returns:
        The extracted text. Failures are reported as text rather than
        raised: "Error reading <FORMAT>: ..." when a parser fails and
        "Unsupported file format" for any other extension.
    """
    # Accept both plain paths and upload wrappers.
    raw_path = file if isinstance(file, (str, os.PathLike)) else file.name
    path = os.fspath(raw_path)
    ext = os.path.splitext(path)[1][1:].lower()

    if ext == "pdf":
        try:
            with fitz.open(path) as doc:
                return "\n".join(page.get_text("text") for page in doc)
        except Exception as e:
            return f"Error reading PDF: {e}"

    if ext == "docx":
        try:
            document = docx.Document(path)
            return "\n".join(p.text for p in document.paragraphs)
        except Exception as e:
            return f"Error reading DOCX: {e}"

    if ext == "pptx":
        try:
            prs = pptx.Presentation(path)
            chunks = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, connectors, ...).
                    if hasattr(shape, "text"):
                        chunks.append(shape.text + "\n")
            return "".join(chunks)
        except Exception as e:
            return f"Error reading PPTX: {e}"

    if ext == "xlsx":
        try:
            wb = openpyxl.load_workbook(path)
            rows = []
            # `worksheets` skips chartsheets, which have no cell rows.
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    # Drop only empty cells; keep legitimate 0 / False values.
                    cells = [str(cell) for cell in row if cell is not None and cell != ""]
                    rows.append(" ".join(cells) + "\n")
            return "".join(rows)
        except Exception as e:
            return f"Error reading XLSX: {e}"

    return "Unsupported file format"
58
+
59
+ # Summarization logic
60
def summarize_document(file):
    """Summarize an uploaded document to roughly 20% of its word count.

    Args:
        file: The uploaded file as handed over by the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        The generated summary, or a message describing why no summary could
        be produced (no upload, unreadable/unsupported file, empty
        document, model failure). The function never raises.
    """
    if file is None:
        return "Error: no document was uploaded"

    text = extract_text(file)
    # Match only the extractor's own message prefixes. A substring test would
    # also reject real documents that merely contain the word "Error".
    if text.startswith(("Error reading", "Unsupported file format")):
        return text
    if not text.strip():
        return "Error: no text could be extracted from the document"

    word_count = len(text.split())
    max_summary_len = max(20, int(word_count * 0.2))
    min_summary_len = int(max_summary_len * 0.6)

    try:
        # truncation=True keeps over-long documents within the model's input limit.
        summary = pipe(
            text,
            max_length=max_summary_len,
            min_length=min_summary_len,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['generated_text']
    except Exception as e:
        return f"Error during summarization: {e}"
73
 
74
  # Gradio Interface
75
# Single-function UI: upload a document, receive its summary.
demo = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(
        label="Upload a document (PDF, DOCX, PPTX, XLSX)",
        file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
    ),
    outputs=gr.Textbox(label="20% Summary"),
    title="📄 Document Summarizer (20% Length)",
    description="Upload a document and get a concise summary generated by your custom Hugging Face model.",
)

# FastAPI setup
app = FastAPI()

# Mount Gradio at "/". No separate "/" redirect route is registered: the
# mount already serves "/", and a route redirecting "/" to "/" would be
# shadowed by the mount (and would loop if it were reached).
app = gr.mount_gradio_app(app, demo, path="/")