Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -254,17 +254,6 @@ async def process_document(
|
|
254 |
)
|
255 |
source_1 = txt_content
|
256 |
|
257 |
-
# Source 2: PyPDF2
|
258 |
-
def extract_text_from_pdf(doc_path):
|
259 |
-
try:
|
260 |
-
reader = PdfReader(doc_path)
|
261 |
-
text = "\n".join(page.extract_text() for page in reader.pages[:end_pages] if page.extract_text())
|
262 |
-
return text
|
263 |
-
except Exception as e:
|
264 |
-
return str(e)
|
265 |
-
|
266 |
-
source_2 = extract_text_from_pdf(temp_path)
|
267 |
-
|
268 |
# Source 3: PDFMiner
|
269 |
def extract_text_pdfminer(pdf_path):
|
270 |
try:
|
@@ -302,46 +291,6 @@ async def process_document(
|
|
302 |
|
303 |
source_4 = extract_text_pymupdf(temp_path)
|
304 |
|
305 |
-
# Source 5: LayoutLMv3 for structured document understanding
|
306 |
-
def extract_text_layoutlm(pdf_path):
|
307 |
-
try:
|
308 |
-
# Initialize LayoutLMv3
|
309 |
-
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
|
310 |
-
model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
|
311 |
-
|
312 |
-
# Convert PDF to images
|
313 |
-
doc = fitz.open(pdf_path)
|
314 |
-
text_results = []
|
315 |
-
|
316 |
-
for page_num in range(min(end_pages, doc.page_count)):
|
317 |
-
page = doc[page_num]
|
318 |
-
pix = page.get_pixmap()
|
319 |
-
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
320 |
-
|
321 |
-
# Process image through LayoutLMv3
|
322 |
-
encoding = processor(img, return_tensors="pt")
|
323 |
-
with torch.no_grad():
|
324 |
-
outputs = model(**encoding)
|
325 |
-
|
326 |
-
# Extract text with layout information
|
327 |
-
text = page.get_text("dict")
|
328 |
-
blocks = text["blocks"]
|
329 |
-
structured_text = ""
|
330 |
-
for block in blocks:
|
331 |
-
if "lines" in block:
|
332 |
-
for line in block["lines"]:
|
333 |
-
if "spans" in line:
|
334 |
-
for span in line["spans"]:
|
335 |
-
structured_text += span["text"] + " "
|
336 |
-
text_results.append(structured_text)
|
337 |
-
|
338 |
-
doc.close()
|
339 |
-
return "\n".join(text_results)
|
340 |
-
except Exception as e:
|
341 |
-
return str(e)
|
342 |
-
|
343 |
-
source_5 = extract_text_layoutlm(temp_path)
|
344 |
-
|
345 |
# Clean up
|
346 |
os.remove(temp_path)
|
347 |
|
@@ -372,10 +321,8 @@ async def process_document(
|
|
372 |
|
373 |
validated_sources = validate_results({
|
374 |
'source_1': source_1,
|
375 |
-
'source_2': source_2,
|
376 |
'source_3': source_3,
|
377 |
-
'source_4': source_4
|
378 |
-
'source_5': source_5
|
379 |
})
|
380 |
|
381 |
return JSONResponse({
|
|
|
254 |
)
|
255 |
source_1 = txt_content
|
256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
# Source 3: PDFMiner
|
258 |
def extract_text_pdfminer(pdf_path):
|
259 |
try:
|
|
|
291 |
|
292 |
source_4 = extract_text_pymupdf(temp_path)
|
293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
# Clean up
|
295 |
os.remove(temp_path)
|
296 |
|
|
|
321 |
|
322 |
validated_sources = validate_results({
|
323 |
'source_1': source_1,
|
|
|
324 |
'source_3': source_3,
|
325 |
+
'source_4': source_4
|
|
|
326 |
})
|
327 |
|
328 |
return JSONResponse({
|