dmitrynovikov2121 committed on
Commit
a1ad56a
·
verified ·
1 Parent(s): 59be4c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -54
app.py CHANGED
@@ -254,17 +254,6 @@ async def process_document(
254
  )
255
  source_1 = txt_content
256
 
257
- # Source 2: PyPDF2
258
- def extract_text_from_pdf(doc_path):
259
- try:
260
- reader = PdfReader(doc_path)
261
- text = "\n".join(page.extract_text() for page in reader.pages[:end_pages] if page.extract_text())
262
- return text
263
- except Exception as e:
264
- return str(e)
265
-
266
- source_2 = extract_text_from_pdf(temp_path)
267
-
268
  # Source 3: PDFMiner
269
  def extract_text_pdfminer(pdf_path):
270
  try:
@@ -302,46 +291,6 @@ async def process_document(
302
 
303
  source_4 = extract_text_pymupdf(temp_path)
304
 
305
- # Source 5: LayoutLMv3 for structured document understanding
306
- def extract_text_layoutlm(pdf_path):
307
- try:
308
- # Initialize LayoutLMv3
309
- processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
310
- model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
311
-
312
- # Convert PDF to images
313
- doc = fitz.open(pdf_path)
314
- text_results = []
315
-
316
- for page_num in range(min(end_pages, doc.page_count)):
317
- page = doc[page_num]
318
- pix = page.get_pixmap()
319
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
320
-
321
- # Process image through LayoutLMv3
322
- encoding = processor(img, return_tensors="pt")
323
- with torch.no_grad():
324
- outputs = model(**encoding)
325
-
326
- # Extract text with layout information
327
- text = page.get_text("dict")
328
- blocks = text["blocks"]
329
- structured_text = ""
330
- for block in blocks:
331
- if "lines" in block:
332
- for line in block["lines"]:
333
- if "spans" in line:
334
- for span in line["spans"]:
335
- structured_text += span["text"] + " "
336
- text_results.append(structured_text)
337
-
338
- doc.close()
339
- return "\n".join(text_results)
340
- except Exception as e:
341
- return str(e)
342
-
343
- source_5 = extract_text_layoutlm(temp_path)
344
-
345
  # Clean up
346
  os.remove(temp_path)
347
 
@@ -372,10 +321,8 @@ async def process_document(
372
 
373
  validated_sources = validate_results({
374
  'source_1': source_1,
375
- 'source_2': source_2,
376
  'source_3': source_3,
377
- 'source_4': source_4,
378
- 'source_5': source_5
379
  })
380
 
381
  return JSONResponse({
 
254
  )
255
  source_1 = txt_content
256
 
 
 
 
 
 
 
 
 
 
 
 
257
  # Source 3: PDFMiner
258
  def extract_text_pdfminer(pdf_path):
259
  try:
 
291
 
292
  source_4 = extract_text_pymupdf(temp_path)
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  # Clean up
295
  os.remove(temp_path)
296
 
 
321
 
322
  validated_sources = validate_results({
323
  'source_1': source_1,
 
324
  'source_3': source_3,
325
+ 'source_4': source_4
 
326
  })
327
 
328
  return JSONResponse({