Ali2206 committed (verified)
Commit 62739d0 · 1 parent: aba9ae9

Update app.py

Files changed (1):
  1. app.py +17 -6
app.py CHANGED
@@ -61,6 +61,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("No pages found in PDF")
                 return ""
 
             batch_size = 10
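Note on this first hunk: the new logger.error call makes the empty-PDF early return visible in logs instead of silently yielding an empty string. A minimal sketch of the same guard in isolation (the logger setup and file path are illustrative, not from app.py):

```python
import logging

import pdfplumber

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def pdf_page_count(file_path: str) -> int:
    """Return the page count, logging an error for empty PDFs."""
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)
    if total_pages == 0:
        logger.error("No pages found in PDF")
    return total_pages

# Illustrative usage (path is hypothetical):
# print(pdf_page_count("records.pdf"))
```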
@@ -71,22 +72,28 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         def extract_batch(start: int, end: int) -> List[tuple]:
             results = []
             with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
+                for idx, page in enumerate(pdf.pages[start:end], start=start):
                     page_text = page.extract_text() or ""
-                    results.append((page_num, f"=== Page {page_num + 1} ===\n{page_text.strip()}"))
+                    results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
+                    logger.debug("Extracted page %d, text length: %d chars", idx + 1, len(page_text))
             return results
 
         with ThreadPoolExecutor(max_workers=6) as executor:
             futures = [executor.submit(extract_batch, start, end) for start, end in batches]
             for future in as_completed(futures):
                 for page_num, text in future.result():
-                    text_chunks[page_num] = text
+                    if page_num < len(text_chunks):
+                        text_chunks[page_num] = text
+                    else:
+                        logger.warning("Page number %d out of range for text_chunks (size %d)", page_num, len(text_chunks))
                 processed_pages += batch_size
                 if progress_callback:
                     progress_callback(min(processed_pages, total_pages), total_pages)
+                    logger.info("Processed %d/%d pages", min(processed_pages, total_pages), total_pages)
 
-        return "\n\n".join(filter(None, text_chunks))
+        extracted_text = "\n\n".join(filter(None, text_chunks))
+        logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+        return extracted_text
     except Exception as e:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
@@ -96,6 +103,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
+            logger.info("Using cached extraction for %s", file_path)
             return cache[cache_key]
 
         if file_type == "pdf":
@@ -117,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
+        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
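The two logger.info lines above bracket the cache round trip in convert_file_to_json: the key pairs a content hash with the file type, so the same file re-uploaded under a different name still hits the cache. A minimal sketch of the pattern, assuming a SHA-256 `file_hash` and a plain dict for `cache` (the diff does not show app.py's actual implementation of either):

```python
import hashlib
import json

cache: dict = {}  # stand-in for the app's cache object

def file_hash(file_path: str) -> str:
    """Hash file contents in blocks so large PDFs never load into memory at once."""
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(1 << 16), b""):
            h.update(block)
    return h.hexdigest()

def convert_file_to_json_cached(file_path: str, file_type: str) -> str:
    cache_key = f"{file_hash(file_path)}_{file_type}"
    if cache_key in cache:
        return cache[cache_key]  # hit: skip re-extraction entirely
    result = json.dumps({"file_type": file_type})  # placeholder for real extraction
    cache[cache_key] = result
    return result
```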
@@ -259,9 +268,11 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
         yield history, None, ""
+        logger.info("Extracted text length: %d chars", len(extracted))
 
         chunk_size = 6000
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+        logger.info("Created %d chunks", len(chunks))
         combined_response = ""
         batch_size = 2
 
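For reference, the chunking this hunk instruments is a plain fixed-width character slice; nothing prevents a chunk boundary from landing mid-sentence or mid-page-marker. A self-contained version of the same expression:

```python
def make_chunks(extracted: str, chunk_size: int = 6000) -> list:
    """Fixed-width character chunks; the range step gives full, non-overlapping coverage."""
    return [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]

chunks = make_chunks("x" * 15000)
print(len(chunks), [len(c) for c in chunks])  # 3 [6000, 6000, 3000]
```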
@@ -287,7 +298,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
                 elif isinstance(chunk_output, str) and chunk_output.strip():
-                    cleaned = clean_response(m.content)
+                    cleaned = clean_response(chunk_output)
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
             batch_responses.append(chunk_response)
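This last hunk fixes a copy-paste error: in the string branch, `m.content` referenced a variable from the message-object branch, so this path would raise NameError (or reuse a stale `m`) rather than clean `chunk_output` itself. A sketch of the corrected branching; the surrounding loop is not in the hunk, so the message-object branch and the `clean_response` stub here are assumptions:

```python
import re

def clean_response(text: str) -> str:
    """Stub for app.py's clean_response helper (assumed to normalize whitespace)."""
    return text.strip()

def append_cleaned(chunk_output, chunk_response: str) -> str:
    # Message-object branch: outputs exposing a .content attribute.
    if hasattr(chunk_output, "content"):
        cleaned = clean_response(chunk_output.content)
        if cleaned and re.search(r"###\s*\w+", cleaned):
            chunk_response += cleaned + "\n\n"
    # String branch: clean chunk_output itself -- the old code's m.content
    # referred to a variable that does not exist on this path.
    elif isinstance(chunk_output, str) and chunk_output.strip():
        cleaned = clean_response(chunk_output)
        if cleaned and re.search(r"###\s*\w+", cleaned):
            chunk_response += cleaned + "\n\n"
    return chunk_response

print(append_cleaned("### Summary\nStable vitals.", ""))
```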
 