ikraamkb committed on
Commit
2695e4e
·
verified ·
1 Parent(s): 5bc19e6

Update qtAnswering/app.py

Browse files
Files changed (1) hide show
  1. qtAnswering/app.py +72 -73
qtAnswering/app.py CHANGED
@@ -1,73 +1,72 @@
1
- ### app.py — Document QA Backend (Cleaned)
2
- from fastapi import FastAPI
3
- from fastapi.responses import FileResponse, JSONResponse
4
- import fitz # PyMuPDF
5
- import easyocr
6
- import openpyxl
7
- import pptx
8
- import docx
9
- from transformers import pipeline
10
- from gtts import gTTS
11
- import tempfile
12
- import os
13
-
14
# FastAPI application object for this backend.
app = FastAPI()

# Question-answering pipeline, built once at import time and reused by
# answer_question_from_doc.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader for English and French; no function in the visible code
# calls it.
reader = easyocr.Reader(['en', 'fr'])
18
-
19
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Value handed to ``fitz.open`` (a filesystem path in the
            visible caller).

    Returns:
        The page texts joined with newlines. If anything raises while the
        document is opened or read, the string ``"Error reading PDF: <exc>"``
        is returned instead of raising.
    """
    try:
        with fitz.open(pdf_file) as pdf:
            page_texts = []
            for page in pdf:
                page_texts.append(page.get_text("text"))
            return "\n".join(page_texts)
    except Exception as err:
        return f"Error reading PDF: {err}"
25
-
26
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraphs of a Word document, one per line.

    Only ``Document.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Value handed to ``docx.Document`` (a filesystem path in
            the visible caller).

    Returns:
        The paragraph texts joined with newlines. If anything raises while
        the document is opened or read, the string
        ``"Error reading DOCX: <exc>"`` is returned instead of raising, which
        matches the PDF, PPTX and XLSX extractors in this file.
    """
    try:
        document = docx.Document(docx_file)
        return "\n".join(
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        )
    except Exception as err:
        return f"Error reading DOCX: {err}"
29
-
30
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Args:
        pptx_file: Value handed to ``pptx.Presentation`` (a filesystem path
            in the visible caller).

    Returns:
        The ``text`` of each shape that has such an attribute, in slide and
        shape order, joined with newlines. If anything raises while the
        presentation is opened or read, the string
        ``"Error reading PPTX: <exc>"`` is returned instead of raising.
    """
    try:
        presentation = pptx.Presentation(pptx_file)
        shape_texts = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute are skipped.
                if hasattr(shape, "text"):
                    shape_texts.append(shape.text)
        return "\n".join(shape_texts)
    except Exception as err:
        return f"Error reading PPTX: {err}"
36
-
37
def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every sheet in an Excel workbook as text.

    Each row becomes one line with its cell values separated by single
    spaces; rows are emitted sheet by sheet in workbook order.

    Args:
        xlsx_file: Value handed to ``openpyxl.load_workbook`` (a filesystem
            path in the visible caller).

    Returns:
        The row lines joined with newlines. If anything raises while the
        workbook is opened or read, the string ``"Error reading XLSX: <exc>"``
        is returned instead of raising.
    """
    try:
        workbook = openpyxl.load_workbook(xlsx_file)
        lines = []
        for sheet_name in workbook.sheetnames:
            for row in workbook[sheet_name].iter_rows(values_only=True):
                # Skip only genuinely empty cells. A plain truthiness test
                # would also discard real data such as 0, 0.0 and False.
                values = [
                    str(cell) for cell in row if cell is not None and cell != ""
                ]
                lines.append(" ".join(values))
        return "\n".join(lines)
    except Exception as err:
        return f"Error reading XLSX: {err}"
43
-
44
def _remove_quietly(path):
    """Delete ``path``, ignoring the error if it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def answer_question_from_doc(file, question):
    """Answer ``question`` from an uploaded document and render the answer as speech.

    Args:
        file: Upload object exposing ``filename`` and a ``read()`` method
            whose result is written to disk as bytes.
            NOTE(review): if this is a FastAPI/Starlette ``UploadFile``, its
            ``read()`` is a coroutine and cannot be written directly --
            confirm against the calling route.
        question: Question passed to the question-answering pipeline.

    Returns:
        ``(answer, mp3_path)`` on success, where ``mp3_path`` is a temporary
        file that is left on disk for the caller. ``(message, None)`` when the
        format is unsupported, no text was extracted, or answering / speech
        synthesis failed.
    """
    # The client-supplied name is used only to derive the extension. It never
    # becomes part of a filesystem path, so "../" components or a name that
    # collides with another upload cannot touch files outside the temp file.
    filename = file.filename or ""
    ext = filename.rsplit(".", 1)[-1].lower()
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        # Rejected before the upload is read or written to disk.
        return "Unsupported file format.", None

    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as upload:
        upload.write(file.read())
        file_path = upload.name

    try:
        if ext == "pdf":
            context = extract_text_from_pdf(file_path)
        elif ext == "docx":
            context = extract_text_from_docx(file_path)
        elif ext == "pptx":
            context = extract_text_from_pptx(file_path)
        else:
            context = extract_text_from_xlsx(file_path)
    finally:
        # The on-disk copy is only needed while text is being extracted.
        _remove_quietly(file_path)

    # NOTE(review): the PDF/PPTX/XLSX extractors report failures as
    # "Error reading ..." text, which is indistinguishable from document text
    # here and is therefore passed to the model as context.
    if not context.strip():
        return "No text found in the document.", None

    audio_path = None
    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
        tts = gTTS(answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        # The handle is closed before gTTS writes to the path, so the save
        # does not depend on the platform allowing a second open of the file.
        tts.save(audio_path)
        return answer, audio_path
    except Exception as e:
        if audio_path is not None:
            # Do not leave an empty or partial MP3 behind on failure.
            _remove_quietly(audio_path)
        return f"Error generating answer: {e}", None
 
1
+ from fastapi import FastAPI
2
+ from fastapi.responses import FileResponse, JSONResponse
3
+ import fitz # PyMuPDF
4
+ import easyocr
5
+ import openpyxl
6
+ import pptx
7
+ import docx
8
+ from transformers import pipeline
9
+ from gtts import gTTS
10
+ import tempfile
11
+ import os
12
+
13
# FastAPI application object for this backend.
app = FastAPI()

# Question-answering pipeline, built once at import time and reused by
# answer_question_from_doc.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader for English and French; no function in the visible code
# calls it.
reader = easyocr.Reader(['en', 'fr'])
17
+
18
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Value handed to ``fitz.open`` (a filesystem path in the
            visible caller).

    Returns:
        The page texts joined with newlines. If anything raises while the
        document is opened or read, the string ``"Error reading PDF: <exc>"``
        is returned instead of raising.
    """
    try:
        with fitz.open(pdf_file) as pdf:
            page_texts = []
            for page in pdf:
                page_texts.append(page.get_text("text"))
            return "\n".join(page_texts)
    except Exception as err:
        return f"Error reading PDF: {err}"
24
+
25
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraphs of a Word document, one per line.

    Only ``Document.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Value handed to ``docx.Document`` (a filesystem path in
            the visible caller).

    Returns:
        The paragraph texts joined with newlines. If anything raises while
        the document is opened or read, the string
        ``"Error reading DOCX: <exc>"`` is returned instead of raising, which
        matches the PDF, PPTX and XLSX extractors in this file.
    """
    try:
        document = docx.Document(docx_file)
        return "\n".join(
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        )
    except Exception as err:
        return f"Error reading DOCX: {err}"
28
+
29
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Args:
        pptx_file: Value handed to ``pptx.Presentation`` (a filesystem path
            in the visible caller).

    Returns:
        The ``text`` of each shape that has such an attribute, in slide and
        shape order, joined with newlines. If anything raises while the
        presentation is opened or read, the string
        ``"Error reading PPTX: <exc>"`` is returned instead of raising.
    """
    try:
        presentation = pptx.Presentation(pptx_file)
        shape_texts = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute are skipped.
                if hasattr(shape, "text"):
                    shape_texts.append(shape.text)
        return "\n".join(shape_texts)
    except Exception as err:
        return f"Error reading PPTX: {err}"
35
+
36
def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every sheet in an Excel workbook as text.

    Each row becomes one line with its cell values separated by single
    spaces; rows are emitted sheet by sheet in workbook order.

    Args:
        xlsx_file: Value handed to ``openpyxl.load_workbook`` (a filesystem
            path in the visible caller).

    Returns:
        The row lines joined with newlines. If anything raises while the
        workbook is opened or read, the string ``"Error reading XLSX: <exc>"``
        is returned instead of raising.
    """
    try:
        workbook = openpyxl.load_workbook(xlsx_file)
        lines = []
        for sheet_name in workbook.sheetnames:
            for row in workbook[sheet_name].iter_rows(values_only=True):
                # Skip only genuinely empty cells. A plain truthiness test
                # would also discard real data such as 0, 0.0 and False.
                values = [
                    str(cell) for cell in row if cell is not None and cell != ""
                ]
                lines.append(" ".join(values))
        return "\n".join(lines)
    except Exception as err:
        return f"Error reading XLSX: {err}"
42
+
43
def _remove_quietly(path):
    """Delete ``path``, ignoring the error if it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def answer_question_from_doc(file, question):
    """Answer ``question`` from an uploaded document and render the answer as speech.

    Args:
        file: Upload object exposing ``filename`` and a ``read()`` method
            whose result is written to disk as bytes.
            NOTE(review): if this is a FastAPI/Starlette ``UploadFile``, its
            ``read()`` is a coroutine and cannot be written directly --
            confirm against the calling route.
        question: Question passed to the question-answering pipeline.

    Returns:
        ``(answer, mp3_path)`` on success, where ``mp3_path`` is a temporary
        file that is left on disk for the caller. ``(message, None)`` when the
        format is unsupported, no text was extracted, or answering / speech
        synthesis failed.
    """
    # The client-supplied name is used only to derive the extension. It never
    # becomes part of a filesystem path, so "../" components or a name that
    # collides with another upload cannot touch files outside the temp file.
    filename = file.filename or ""
    ext = filename.rsplit(".", 1)[-1].lower()
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        # Rejected before the upload is read or written to disk.
        return "Unsupported file format.", None

    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as upload:
        upload.write(file.read())
        file_path = upload.name

    try:
        if ext == "pdf":
            context = extract_text_from_pdf(file_path)
        elif ext == "docx":
            context = extract_text_from_docx(file_path)
        elif ext == "pptx":
            context = extract_text_from_pptx(file_path)
        else:
            context = extract_text_from_xlsx(file_path)
    finally:
        # The on-disk copy is only needed while text is being extracted.
        _remove_quietly(file_path)

    # NOTE(review): the PDF/PPTX/XLSX extractors report failures as
    # "Error reading ..." text, which is indistinguishable from document text
    # here and is therefore passed to the model as context.
    if not context.strip():
        return "No text found in the document.", None

    audio_path = None
    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
        tts = gTTS(answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        # The handle is closed before gTTS writes to the path, so the save
        # does not depend on the platform allowing a second open of the file.
        tts.save(audio_path)
        return answer, audio_path
    except Exception as e:
        if audio_path is not None:
            # Do not leave an empty or partial MP3 behind on failure.
            _remove_quietly(audio_path)
        return f"Error generating answer: {e}", None