Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

ikraamkb committed on May 1

Commit

2695e4e

verified ·

1 Parent(s): 5bc19e6

Update qtAnswering/app.py

Browse files

Files changed (1) hide show

qtAnswering/app.py +72 -73

qtAnswering/app.py CHANGED Viewed

@@ -1,73 +1,72 @@
-### ✅ app.py — Document QA Backend (Cleaned)
-from fastapi import FastAPI
-from fastapi.responses import FileResponse, JSONResponse
-import fitz  # PyMuPDF
-import easyocr
-import openpyxl
-import pptx
-import docx
-from transformers import pipeline
-from gtts import gTTS
-import tempfile
-import os
-app = FastAPI()
-qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
-reader = easyocr.Reader(['en', 'fr'])
-def extract_text_from_pdf(pdf_file):
-    try:
-        with fitz.open(pdf_file) as doc:
-            return "\n".join(page.get_text("text") for page in doc)
-    except Exception as e:
-        return f"Error reading PDF: {e}"
-def extract_text_from_docx(docx_file):
-    doc = docx.Document(docx_file)
-    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
-def extract_text_from_pptx(pptx_file):
-    try:
-        prs = pptx.Presentation(pptx_file)
-        return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
-    except Exception as e:
-        return f"Error reading PPTX: {e}"
-def extract_text_from_xlsx(xlsx_file):
-    try:
-        wb = openpyxl.load_workbook(xlsx_file)
-        return "\n".join(" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True))
-    except Exception as e:
-        return f"Error reading XLSX: {e}"
-def answer_question_from_doc(file, question):
-    ext = file.filename.split(".")[-1].lower()
-    file_path = f"/tmp/{file.filename}"
-    with open(file_path, "wb") as f:
-        f.write(file.read())
-    if ext == "pdf":
-        context = extract_text_from_pdf(file_path)
-    elif ext == "docx":
-        context = extract_text_from_docx(file_path)
-    elif ext == "pptx":
-        context = extract_text_from_pptx(file_path)
-    elif ext == "xlsx":
-        context = extract_text_from_xlsx(file_path)
-    else:
-        return "Unsupported file format.", None
-    if not context.strip():
-        return "No text found in the document.", None
-    try:
-        result = qa_model({"question": question, "context": context})
-        answer = result["answer"]
-        tts = gTTS(answer)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
-            tts.save(tmp.name)
-            return answer, tmp.name
-    except Exception as e:
-        return f"Error generating answer: {e}", None

+from fastapi import FastAPI
+from fastapi.responses import FileResponse, JSONResponse
+import fitz  # PyMuPDF
+import easyocr
+import openpyxl
+import pptx
+import docx
+from transformers import pipeline
+from gtts import gTTS
+import tempfile
+import os
+app = FastAPI()
+qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+reader = easyocr.Reader(['en', 'fr'])
+def extract_text_from_pdf(pdf_file):
+    try:
+        with fitz.open(pdf_file) as doc:
+            return "\n".join(page.get_text("text") for page in doc)
+    except Exception as e:
+        return f"Error reading PDF: {e}"
+def extract_text_from_docx(docx_file):
+    doc = docx.Document(docx_file)
+    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+def extract_text_from_pptx(pptx_file):
+    try:
+        prs = pptx.Presentation(pptx_file)
+        return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
+    except Exception as e:
+        return f"Error reading PPTX: {e}"
+def extract_text_from_xlsx(xlsx_file):
+    try:
+        wb = openpyxl.load_workbook(xlsx_file)
+        return "\n".join(" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True))
+    except Exception as e:
+        return f"Error reading XLSX: {e}"
+def answer_question_from_doc(file, question):
+    ext = file.filename.split(".")[-1].lower()
+    file_path = f"/tmp/{file.filename}"
+    with open(file_path, "wb") as f:
+        f.write(file.read())
+    if ext == "pdf":
+        context = extract_text_from_pdf(file_path)
+    elif ext == "docx":
+        context = extract_text_from_docx(file_path)
+    elif ext == "pptx":
+        context = extract_text_from_pptx(file_path)
+    elif ext == "xlsx":
+        context = extract_text_from_xlsx(file_path)
+    else:
+        return "Unsupported file format.", None
+    if not context.strip():
+        return "No text found in the document.", None
+    try:
+        result = qa_model({"question": question, "context": context})
+        answer = result["answer"]
+        tts = gTTS(answer)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
+            tts.save(tmp.name)
+            return answer, tmp.name
+    except Exception as e:
+        return f"Error generating answer: {e}", None