Ali2206 committed on
Commit
c0b2cb7
·
verified ·
1 Parent(s): 55c02ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -22
app.py CHANGED
@@ -81,35 +81,45 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
81
 
82
  def extract_page(i):
83
  page = pdf.pages[i]
84
- # Try table extraction first
85
  text = ""
86
- tables = page.extract_tables()
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  if tables:
88
  for table in tables:
89
- # Mimic Excel/CSV: join non-None cells as strings
90
  table_text = "\n".join(
91
  " | ".join(str(cell) if cell is not None else "" for cell in row)
92
- for row in table
93
  )
94
  text += table_text + "\n\n"
95
  logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
96
  else:
97
- # Fall back to raw text
98
  text = page.extract_text() or ""
99
  logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
100
 
101
- # OCR if text is short or force_ocr is True
102
  if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
103
  try:
104
  logger.info("Attempting OCR for page %d", i + 1)
105
  pdfium_pdf = pdfium.PdfDocument(file_path)
106
  page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
107
- text = pytesseract.image_to_string(page_bitmap, lang="eng")
108
- logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
 
109
  pdfium_pdf.close()
110
  except Exception as e:
111
  logger.error("OCR failed for page %d: %s", i + 1, e)
112
- text = text or ""
113
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
114
 
115
  with ThreadPoolExecutor(max_workers=4) as executor:
@@ -124,6 +134,7 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
124
  text_chunks.sort(key=lambda x: x[0])
125
  extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
126
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
 
127
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
128
  logger.info("Text too short, forcing OCR for all pages")
129
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
@@ -136,25 +147,54 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
136
  try:
137
  file_h = file_hash(file_path)
138
  cache_key = f"{file_h}_{file_type}"
139
- if cache_key in cache:
140
- logger.info("Using cached extraction for %s", file_path)
141
- return cache[cache_key]
 
 
142
 
143
  if file_type == "pdf":
144
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, force_ocr=False))
145
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
146
  elif file_type == "csv":
147
- df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
148
- skip_blank_lines=False, on_bad_lines="skip")
149
- content = df.fillna("").astype(str).values.tolist()
150
- result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
 
 
 
 
151
  elif file_type in ["xls", "xlsx"]:
152
  try:
153
- df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
154
- except Exception:
155
- df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
156
- content = df.fillna("").astype(str).values.tolist()
157
- result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  else:
159
  result = json.dumps({"error": f"Unsupported file type: {file_type}"})
160
 
@@ -324,7 +364,7 @@ Patient Record Excerpt:
324
  chunk_response = ""
325
  raw_outputs = []
326
  for chunk_output in agent.run_gradio_chat(
327
- message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=1024, call_agent=False, conversation=[]
328
  ):
329
  if chunk_output is None:
330
  continue
 
81
 
82
  def extract_page(i):
83
  page = pdf.pages[i]
 
84
  text = ""
85
+ # Adjust table settings for complex layouts
86
+ table_settings = {
87
+ "vertical_strategy": "lines",
88
+ "horizontal_strategy": "lines",
89
+ "explicit_vertical_lines": [],
90
+ "explicit_horizontal_lines": [],
91
+ "snap_tolerance": 5,
92
+ "join_tolerance": 5,
93
+ "edge_min_length": 3,
94
+ "min_words_vertical": 3,
95
+ "min_words_horizontal": 1,
96
+ "intersection_tolerance": 5,
97
+ }
98
+ tables = page.extract_tables(table_settings=table_settings)
99
  if tables:
100
  for table in tables:
 
101
  table_text = "\n".join(
102
  " | ".join(str(cell) if cell is not None else "" for cell in row)
103
+ for row in table if any(cell is not None for cell in row)
104
  )
105
  text += table_text + "\n\n"
106
  logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
107
  else:
 
108
  text = page.extract_text() or ""
109
  logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
110
 
111
+ # Force OCR if text is short or force_ocr is True
112
  if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
113
  try:
114
  logger.info("Attempting OCR for page %d", i + 1)
115
  pdfium_pdf = pdfium.PdfDocument(file_path)
116
  page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
117
+ ocr_text = pytesseract.image_to_string(page_bitmap, lang="eng")
118
+ logger.debug("Page %d OCR text length: %d chars", i + 1, len(ocr_text))
119
+ text = ocr_text if ocr_text.strip() else text
120
  pdfium_pdf.close()
121
  except Exception as e:
122
  logger.error("OCR failed for page %d: %s", i + 1, e)
 
123
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
124
 
125
  with ThreadPoolExecutor(max_workers=4) as executor:
 
134
  text_chunks.sort(key=lambda x: x[0])
135
  extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
136
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
137
+ # Force OCR retry if text is too short
138
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
139
  logger.info("Text too short, forcing OCR for all pages")
140
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
 
147
  try:
148
  file_h = file_hash(file_path)
149
  cache_key = f"{file_h}_{file_type}"
150
+ # Bypass cache to force fresh extraction
151
+ logger.info("Forcing fresh extraction for %s", file_path)
152
+ # if cache_key in cache:
153
+ # logger.info("Using cached extraction for %s", file_path)
154
+ # return cache[cache_key]
155
 
156
  if file_type == "pdf":
157
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, force_ocr=False))
158
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
159
  elif file_type == "csv":
160
+ try:
161
+ df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
162
+ skip_blank_lines=False, on_bad_lines="skip")
163
+ content = df.fillna("").astype(str).values.tolist()
164
+ result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
165
+ logger.info("CSV processed, rows: %d", len(content))
166
+ except Exception as e:
167
+ logger.error("CSV processing failed: %s", e)
168
+ result = json.dumps({"error": f"CSV processing failed: {str(e)}"})
169
  elif file_type in ["xls", "xlsx"]:
170
  try:
171
+ # Try all sheets to maximize data
172
+ xl = pd.ExcelFile(file_path, engine="openpyxl")
173
+ content = []
174
+ for sheet_name in xl.sheet_names:
175
+ try:
176
+ df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
177
+ sheet_content = df.fillna("").astype(str).values.tolist()
178
+ content.extend(sheet_content)
179
+ logger.debug("Excel sheet %s processed, rows: %d", sheet_name, len(sheet_content))
180
+ except Exception as e:
181
+ logger.warning("Excel sheet %s failed: %s", sheet_name, e)
182
+ if not content:
183
+ logger.error("No valid data extracted from Excel")
184
+ result = json.dumps({"error": "No valid data extracted from Excel"})
185
+ else:
186
+ result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
187
+ logger.info("Excel processed, total rows: %d", len(content))
188
+ except Exception as e:
189
+ logger.error("Excel processing failed: %s", e)
190
+ try:
191
+ df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
192
+ content = df.fillna("").astype(str).values.tolist()
193
+ result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
194
+ logger.info("Excel processed with xlrd, rows: %d", len(content))
195
+ except Exception as e2:
196
+ logger.error("Excel processing failed with xlrd: %s", e2)
197
+ result = json.dumps({"error": f"Excel processing failed: {str(e)}"})
198
  else:
199
  result = json.dumps({"error": f"Unsupported file type: {file_type}"})
200
 
 
364
  chunk_response = ""
365
  raw_outputs = []
366
  for chunk_output in agent.run_gradio_chat(
367
+ message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=2048, call_agent=False, conversation=[]
368
  ):
369
  if chunk_output is None:
370
  continue