Update app.py
app.py
CHANGED
@@ -61,6 +61,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("No pages found in PDF")
                 return ""
 
         batch_size = 10
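The batching scaffolding that extract_batch relies on is defined outside the hunks shown here. As a rough sketch of what that setup presumably looks like (the exact names in app.py may differ), the page range is split into (start, end) pairs and one slot is reserved per page so results can be written back in order:

# Sketch only: app.py builds these structures outside the hunks shown in this diff.
batch_size = 10
total_pages = 120  # illustrative; in app.py this comes from len(pdf.pages)

# (start, end) index pairs consumed by extract_batch(start, end)
batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]

# one slot per page so pages can be reassembled in order after the threads finish
text_chunks = [""] * total_pages
processed_pages = 0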
@@ -71,22 +72,28 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         def extract_batch(start: int, end: int) -> List[tuple]:
             results = []
             with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
+                for idx, page in enumerate(pdf.pages[start:end], start=start):
                     page_text = page.extract_text() or ""
-                    results.append((
+                    results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
+                    logger.debug("Extracted page %d, text length: %d chars", idx + 1, len(page_text))
             return results
 
         with ThreadPoolExecutor(max_workers=6) as executor:
             futures = [executor.submit(extract_batch, start, end) for start, end in batches]
             for future in as_completed(futures):
                 for page_num, text in future.result():
-
+                    if page_num < len(text_chunks):
+                        text_chunks[page_num] = text
+                    else:
+                        logger.warning("Page number %d out of range for text_chunks (size %d)", page_num, len(text_chunks))
                 processed_pages += batch_size
                 if progress_callback:
                     progress_callback(min(processed_pages, total_pages), total_pages)
+                    logger.info("Processed %d/%d pages", min(processed_pages, total_pages), total_pages)
 
-
+        extracted_text = "\n\n".join(filter(None, text_chunks))
+        logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+        return extracted_text
     except Exception as e:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
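Read end to end, this hunk makes each worker return (page_index, text) pairs and writes them into a pre-sized list, so pages come back in document order even though batches complete out of order. A condensed, self-contained sketch of that pattern (the function name and the simplified error handling are illustrative, not app.py's exact code):

# Minimal sketch of the batched extraction pattern above.
# Assumes pdfplumber is installed; progress reporting and logging are omitted.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

import pdfplumber


def extract_all_pages_sketch(file_path: str, batch_size: int = 10, max_workers: int = 6) -> str:
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)
    if total_pages == 0:
        return ""

    batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
    text_chunks: List[str] = [""] * total_pages

    def extract_batch(start: int, end: int) -> List[Tuple[int, str]]:
        # Each worker reopens the file rather than sharing one pdfplumber handle.
        results = []
        with pdfplumber.open(file_path) as pdf:
            for idx, page in enumerate(pdf.pages[start:end], start=start):
                page_text = page.extract_text() or ""
                results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
        return results

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_batch, s, e) for s, e in batches]
        for future in as_completed(futures):
            for page_num, text in future.result():
                text_chunks[page_num] = text  # index keeps output in page order

    return "\n\n".join(filter(None, text_chunks))

Reopening the PDF inside each worker is deliberate: a single pdfplumber handle is not safely shared across threads, which is why extract_batch takes index bounds instead of page objects.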
@@ -96,6 +103,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
+            logger.info("Using cached extraction for %s", file_path)
             return cache[cache_key]
 
         if file_type == "pdf":
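The cache key combines a content hash of the uploaded file with the file type, so re-uploading the same document skips extraction. file_hash itself is defined elsewhere in app.py and is not part of this diff; a typical implementation would stream the file through hashlib, for example:

import hashlib

# Hypothetical stand-in for app.py's file_hash(); the real helper may differ.
def file_hash(path: str, block_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(block_size), b""):
            h.update(block)
    return h.hexdigest()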
@@ -117,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
+        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
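Storing the serialized result under the same key closes the loop: the next upload of an identical file returns early from the lookup in the previous hunk. A minimal sketch of that lookup/store cycle, reusing the file_hash sketch above and assuming cache is a plain in-memory dict (the real store in app.py is not shown in this diff):

cache = {}  # {f"{file_hash}_{file_type}": serialized extraction result}

def convert_cached(file_path: str, file_type: str, extractor) -> str:
    cache_key = f"{file_hash(file_path)}_{file_type}"
    if cache_key in cache:
        return cache[cache_key]        # hit: skip extraction entirely
    result = extractor(file_path)      # stand-in for the file converters in app.py
    cache[cache_key] = result          # store for the next identical upload
    return result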
@@ -259,9 +268,11 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
         yield history, None, ""
+        logger.info("Extracted text length: %d chars", len(extracted))
 
         chunk_size = 6000
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+        logger.info("Created %d chunks", len(chunks))
         combined_response = ""
         batch_size = 2
 
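The extracted text is split into fixed 6,000-character windows with no overlap and then worked through in groups of batch_size = 2. A toy example of that slicing, with make_chunks as an illustrative name rather than a function from app.py:

def make_chunks(extracted: str, chunk_size: int = 6000, batch_size: int = 2):
    chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
    batches = [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]
    return chunks, batches

chunks, batches = make_chunks("x" * 15000)
print([len(c) for c in chunks])  # [6000, 6000, 3000]
print(len(batches))              # 2 -> [[chunk0, chunk1], [chunk2]]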
@@ -287,7 +298,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
                 elif isinstance(chunk_output, str) and chunk_output.strip():
-                    cleaned = clean_response(
+                    cleaned = clean_response(chunk_output)
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
             batch_responses.append(chunk_response)
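The fix in this hunk routes plain-string model output through the same clean_response path before the section-header check. clean_response itself is not part of this diff; a plausible stand-in that trims whitespace and stray code fences, followed by the header check used above, might look like:

import re

# Hypothetical stand-in for app.py's clean_response(); the real function may differ.
def clean_response(text: str) -> str:
    cleaned = text.strip()
    # drop stray markdown code fences occasionally emitted by the model
    cleaned = re.sub(r"^```\w*$", "", cleaned, flags=re.MULTILINE)
    return cleaned.strip()

chunk_output = "### Findings\n- elevated blood pressure noted on intake"
cleaned = clean_response(chunk_output)
if cleaned and re.search(r"###\s*\w+", cleaned):
    print(cleaned)  # kept: output contains a "### Section" heading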