Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -254,17 +254,6 @@ async def process_document( | |
| 254 | 
             
                    )
         | 
| 255 | 
             
                    source_1 = txt_content
         | 
| 256 |  | 
| 257 | 
            -
                    # Source 2: PyPDF2
         | 
| 258 | 
            -
                    def extract_text_from_pdf(doc_path):
         | 
| 259 | 
            -
                        try:
         | 
| 260 | 
            -
                            reader = PdfReader(doc_path)
         | 
| 261 | 
            -
                            text = "\n".join(page.extract_text() for page in reader.pages[:end_pages] if page.extract_text())
         | 
| 262 | 
            -
                            return text
         | 
| 263 | 
            -
                        except Exception as e:
         | 
| 264 | 
            -
                            return str(e)
         | 
| 265 | 
            -
             | 
| 266 | 
            -
                    source_2 = extract_text_from_pdf(temp_path)
         | 
| 267 | 
            -
             | 
| 268 | 
             
                    # Source 3: PDFMiner
         | 
| 269 | 
             
                    def extract_text_pdfminer(pdf_path):
         | 
| 270 | 
             
                        try:
         | 
| @@ -302,46 +291,6 @@ async def process_document( | |
| 302 |  | 
| 303 | 
             
                    source_4 = extract_text_pymupdf(temp_path)
         | 
| 304 |  | 
| 305 | 
            -
                    # Source 5: LayoutLMv3 for structured document understanding
         | 
| 306 | 
            -
                    def extract_text_layoutlm(pdf_path):
         | 
| 307 | 
            -
                        try:
         | 
| 308 | 
            -
                            # Initialize LayoutLMv3
         | 
| 309 | 
            -
                            processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
         | 
| 310 | 
            -
                            model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
         | 
| 311 | 
            -
                            
         | 
| 312 | 
            -
                            # Convert PDF to images
         | 
| 313 | 
            -
                            doc = fitz.open(pdf_path)
         | 
| 314 | 
            -
                            text_results = []
         | 
| 315 | 
            -
                            
         | 
| 316 | 
            -
                            for page_num in range(min(end_pages, doc.page_count)):
         | 
| 317 | 
            -
                                page = doc[page_num]
         | 
| 318 | 
            -
                                pix = page.get_pixmap()
         | 
| 319 | 
            -
                                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         | 
| 320 | 
            -
                                
         | 
| 321 | 
            -
                                # Process image through LayoutLMv3
         | 
| 322 | 
            -
                                encoding = processor(img, return_tensors="pt")
         | 
| 323 | 
            -
                                with torch.no_grad():
         | 
| 324 | 
            -
                                    outputs = model(**encoding)
         | 
| 325 | 
            -
                                
         | 
| 326 | 
            -
                                # Extract text with layout information
         | 
| 327 | 
            -
                                text = page.get_text("dict")
         | 
| 328 | 
            -
                                blocks = text["blocks"]
         | 
| 329 | 
            -
                                structured_text = ""
         | 
| 330 | 
            -
                                for block in blocks:
         | 
| 331 | 
            -
                                    if "lines" in block:
         | 
| 332 | 
            -
                                        for line in block["lines"]:
         | 
| 333 | 
            -
                                            if "spans" in line:
         | 
| 334 | 
            -
                                                for span in line["spans"]:
         | 
| 335 | 
            -
                                                    structured_text += span["text"] + " "
         | 
| 336 | 
            -
                                text_results.append(structured_text)
         | 
| 337 | 
            -
                            
         | 
| 338 | 
            -
                            doc.close()
         | 
| 339 | 
            -
                            return "\n".join(text_results)
         | 
| 340 | 
            -
                        except Exception as e:
         | 
| 341 | 
            -
                            return str(e)
         | 
| 342 | 
            -
             | 
| 343 | 
            -
                    source_5 = extract_text_layoutlm(temp_path)
         | 
| 344 | 
            -
             | 
| 345 | 
             
                    # Clean up
         | 
| 346 | 
             
                    os.remove(temp_path)
         | 
| 347 |  | 
| @@ -372,10 +321,8 @@ async def process_document( | |
| 372 |  | 
| 373 | 
             
                    validated_sources = validate_results({
         | 
| 374 | 
             
                        'source_1': source_1,
         | 
| 375 | 
            -
                        'source_2': source_2,
         | 
| 376 | 
             
                        'source_3': source_3,
         | 
| 377 | 
            -
                        'source_4': source_4 | 
| 378 | 
            -
                        'source_5': source_5
         | 
| 379 | 
             
                    })
         | 
| 380 |  | 
| 381 | 
             
                    return JSONResponse({
         | 
|  | |
| 254 | 
             
                    )
         | 
| 255 | 
             
                    source_1 = txt_content
         | 
| 256 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 257 | 
             
                    # Source 3: PDFMiner
         | 
| 258 | 
             
                    def extract_text_pdfminer(pdf_path):
         | 
| 259 | 
             
                        try:
         | 
|  | |
| 291 |  | 
| 292 | 
             
                    source_4 = extract_text_pymupdf(temp_path)
         | 
| 293 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 294 | 
             
                    # Clean up
         | 
| 295 | 
             
                    os.remove(temp_path)
         | 
| 296 |  | 
|  | |
| 321 |  | 
| 322 | 
             
                    validated_sources = validate_results({
         | 
| 323 | 
             
                        'source_1': source_1,
         | 
|  | |
| 324 | 
             
                        'source_3': source_3,
         | 
| 325 | 
            +
                        'source_4': source_4
         | 
|  | |
| 326 | 
             
                    })
         | 
| 327 |  | 
| 328 | 
             
                    return JSONResponse({
         |