Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -254,17 +254,6 @@ async def process_document(
|
|
| 254 |
)
|
| 255 |
source_1 = txt_content
|
| 256 |
|
| 257 |
-
# Source 2: PyPDF2
|
| 258 |
-
def extract_text_from_pdf(doc_path):
|
| 259 |
-
try:
|
| 260 |
-
reader = PdfReader(doc_path)
|
| 261 |
-
text = "\n".join(page.extract_text() for page in reader.pages[:end_pages] if page.extract_text())
|
| 262 |
-
return text
|
| 263 |
-
except Exception as e:
|
| 264 |
-
return str(e)
|
| 265 |
-
|
| 266 |
-
source_2 = extract_text_from_pdf(temp_path)
|
| 267 |
-
|
| 268 |
# Source 3: PDFMiner
|
| 269 |
def extract_text_pdfminer(pdf_path):
|
| 270 |
try:
|
|
@@ -302,46 +291,6 @@ async def process_document(
|
|
| 302 |
|
| 303 |
source_4 = extract_text_pymupdf(temp_path)
|
| 304 |
|
| 305 |
-
# Source 5: LayoutLMv3 for structured document understanding
|
| 306 |
-
def extract_text_layoutlm(pdf_path):
|
| 307 |
-
try:
|
| 308 |
-
# Initialize LayoutLMv3
|
| 309 |
-
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
|
| 310 |
-
model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
|
| 311 |
-
|
| 312 |
-
# Convert PDF to images
|
| 313 |
-
doc = fitz.open(pdf_path)
|
| 314 |
-
text_results = []
|
| 315 |
-
|
| 316 |
-
for page_num in range(min(end_pages, doc.page_count)):
|
| 317 |
-
page = doc[page_num]
|
| 318 |
-
pix = page.get_pixmap()
|
| 319 |
-
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 320 |
-
|
| 321 |
-
# Process image through LayoutLMv3
|
| 322 |
-
encoding = processor(img, return_tensors="pt")
|
| 323 |
-
with torch.no_grad():
|
| 324 |
-
outputs = model(**encoding)
|
| 325 |
-
|
| 326 |
-
# Extract text with layout information
|
| 327 |
-
text = page.get_text("dict")
|
| 328 |
-
blocks = text["blocks"]
|
| 329 |
-
structured_text = ""
|
| 330 |
-
for block in blocks:
|
| 331 |
-
if "lines" in block:
|
| 332 |
-
for line in block["lines"]:
|
| 333 |
-
if "spans" in line:
|
| 334 |
-
for span in line["spans"]:
|
| 335 |
-
structured_text += span["text"] + " "
|
| 336 |
-
text_results.append(structured_text)
|
| 337 |
-
|
| 338 |
-
doc.close()
|
| 339 |
-
return "\n".join(text_results)
|
| 340 |
-
except Exception as e:
|
| 341 |
-
return str(e)
|
| 342 |
-
|
| 343 |
-
source_5 = extract_text_layoutlm(temp_path)
|
| 344 |
-
|
| 345 |
# Clean up
|
| 346 |
os.remove(temp_path)
|
| 347 |
|
|
@@ -372,10 +321,8 @@ async def process_document(
|
|
| 372 |
|
| 373 |
validated_sources = validate_results({
|
| 374 |
'source_1': source_1,
|
| 375 |
-
'source_2': source_2,
|
| 376 |
'source_3': source_3,
|
| 377 |
-
'source_4': source_4
|
| 378 |
-
'source_5': source_5
|
| 379 |
})
|
| 380 |
|
| 381 |
return JSONResponse({
|
|
|
|
| 254 |
)
|
| 255 |
source_1 = txt_content
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
# Source 3: PDFMiner
|
| 258 |
def extract_text_pdfminer(pdf_path):
|
| 259 |
try:
|
|
|
|
| 291 |
|
| 292 |
source_4 = extract_text_pymupdf(temp_path)
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
# Clean up
|
| 295 |
os.remove(temp_path)
|
| 296 |
|
|
|
|
| 321 |
|
| 322 |
validated_sources = validate_results({
|
| 323 |
'source_1': source_1,
|
|
|
|
| 324 |
'source_3': source_3,
|
| 325 |
+
'source_4': source_4
|
|
|
|
| 326 |
})
|
| 327 |
|
| 328 |
return JSONResponse({
|