sblumenf committed on
Commit
e17150e
·
verified ·
1 Parent(s): ce01472

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -13
app.py CHANGED
@@ -36,15 +36,15 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
36
  elif isinstance(element, (LTFigure, LTImage)):
37
  try:
38
  if hasattr(element, 'stream'):
39
- image_data = element.stream.read()
40
  image = Image.open(io.BytesIO(image_data))
41
  image_filename = f"extracted_image_{len(images)}.png"
42
  image.save(image_filename)
43
  images.append({"filename": image_filename})
44
  else:
45
  for child in element:
46
- if isinstance(child, LTImage):
47
- image_data = child.stream.read()
48
  image = Image.open(io.BytesIO(image_data))
49
  image_filename = f"extracted_image_{len(images)}.png"
50
  image.save(image_filename)
@@ -55,16 +55,19 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
55
  with pdfplumber.open(pdf_file) as pdf:
56
  for page_num, page in enumerate(pdf.pages):
57
  for table in page.extract_tables():
58
- if len(table) > 0 and len(set(table[0])) != len(table[0]):
59
- unique_columns = []
60
- for col in table[0]:
61
- if col in unique_columns:
62
- col = f"{col}_{unique_columns.count(col)}"
63
- unique_columns.append(col)
64
- df = pd.DataFrame(table[1:], columns=unique_columns)
65
- else:
66
- df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
67
- tables.append(df)
 
 
 
68
 
69
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
70
  if output_format == "JSON":
 
36
  elif isinstance(element, (LTFigure, LTImage)):
37
  try:
38
  if hasattr(element, 'stream'):
39
+ image_data = element.stream.get_rawdata()
40
  image = Image.open(io.BytesIO(image_data))
41
  image_filename = f"extracted_image_{len(images)}.png"
42
  image.save(image_filename)
43
  images.append({"filename": image_filename})
44
  else:
45
  for child in element:
46
+ if isinstance(child, LTImage) and hasattr(child, 'stream'):
47
+ image_data = child.stream.get_rawdata()
48
  image = Image.open(io.BytesIO(image_data))
49
  image_filename = f"extracted_image_{len(images)}.png"
50
  image.save(image_filename)
 
55
  with pdfplumber.open(pdf_file) as pdf:
56
  for page_num, page in enumerate(pdf.pages):
57
  for table in page.extract_tables():
58
+ try:
59
+ if len(table) > 0 and len(set(table[0])) != len(table[0]):
60
+ unique_columns = []
61
+ for col in table[0]:
62
+ if col in unique_columns:
63
+ col = f"{col}_{unique_columns.count(col)}"
64
+ unique_columns.append(col)
65
+ df = pd.DataFrame(table[1:], columns=unique_columns)
66
+ else:
67
+ df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
68
+ tables.append(df)
69
+ except Exception as e:
70
+ print(f"Error processing table: {e}")
71
 
72
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
73
  if output_format == "JSON":