Ali2206 committed on
Commit 5d37db7 · verified · 1 Parent(s): 8ce9243

Update app.py

Files changed (1)
  1. app.py +76 -54
app.py CHANGED
@@ -27,10 +27,11 @@ logger = logging.getLogger(__name__)
 
 # Constants
 MAX_TOKENS = 1800
-BATCH_SIZE = 2
-MAX_WORKERS = 4
-CHUNK_SIZE = 10  # For PDF processing
-MODEL_MAX_TOKENS = 131072  # Model's maximum token limit
+BATCH_SIZE = 1  # Reduced to minimize memory pressure
+MAX_WORKERS = 2
+CHUNK_SIZE = 5  # Smaller chunks for PDF processing
+MODEL_MAX_TOKENS = 131072
+MAX_TEXT_LENGTH = 500000  # Limit raw text length before tokenization
 
 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
@@ -79,45 +80,75 @@ def file_hash(path: str) -> str:
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
-def extract_pdf_page(page) -> str:
-    """Optimized single page extraction"""
+def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
+    """Extract and chunk a single page with token limit"""
     try:
         text = page.extract_text() or ""
-        return f"=== Page {page.page_number} ===\n{text.strip()}"
+        text = sanitize_utf8(text)
+        if len(text) > MAX_TEXT_LENGTH // 10:  # Per-page text limit
+            logger.warning(f"Page {page.page_number} text too long ({len(text)}). Truncating.")
+            text = text[:MAX_TEXT_LENGTH // 10]
+
+        tokens = tokenizer.encode(text, add_special_tokens=False)
+        if len(tokens) > max_tokens:
+            chunks = []
+            current_chunk = []
+            current_length = 0
+            for token in tokens:
+                if current_length + 1 > max_tokens:
+                    chunks.append(tokenizer.decode(current_chunk))
+                    current_chunk = [token]
+                    current_length = 1
+                else:
+                    current_chunk.append(token)
+                    current_length += 1
+            if current_chunk:
+                chunks.append(tokenizer.decode(current_chunk))
+            return [f"=== Page {page.page_number} ===\n{c}" for c in chunks]
+        return [f"=== Page {page.page_number} ===\n{text}"]
     except Exception as e:
         logger.warning(f"Error extracting page {page.page_number}: {str(e)}")
-        return ""
+        return []
 
-def extract_all_pages(file_path: str, progress_callback=None) -> str:
-    """Optimized PDF extraction with memory management"""
+def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
+    """Extract PDF pages with early token-based chunking"""
     try:
+        tokenizer = get_tokenizer()
         with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            if total_pages == 0:
-               return ""
+                return []
 
        results = []
+        total_tokens = 0
        for chunk_start in range(0, total_pages, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, total_pages)
 
            with pdfplumber.open(file_path) as pdf:
-                with ThreadPoolExecutor(max_workers=min(CHUNK_SIZE, 4)) as executor:
-                    futures = [executor.submit(extract_pdf_page, pdf.pages[i])
+                with ThreadPoolExecutor(max_workers=min(CHUNK_SIZE, 2)) as executor:
+                    futures = [executor.submit(extract_pdf_page, pdf.pages[i], tokenizer)
                               for i in range(chunk_start, chunk_end)]
 
                    for future in as_completed(futures):
-                        results.append(future.result())
-
+                        page_chunks = future.result()
+                        for chunk in page_chunks:
+                            chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
+                            if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
+                                logger.warning(f"Total tokens ({total_tokens + chunk_tokens}) exceed model limit ({MODEL_MAX_TOKENS}). Stopping.")
+                                return results
+                            results.append(chunk)
+                            total_tokens += chunk_tokens
+
            if progress_callback:
                progress_callback(min(chunk_end, total_pages), total_pages)
 
            del pdf
            gc.collect()
 
-        return "\n\n".join(filter(None, results))
+        return results
    except Exception as e:
        logger.error(f"PDF processing error: {e}")
-        return f"PDF processing error: {str(e)}"
+        return [f"PDF processing error: {str(e)}"]
 
 def excel_to_json(file_path: str) -> List[Dict]:
     """Optimized Excel processing with chunking"""
@@ -173,13 +204,13 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
     """Cached file processing with memory optimization"""
     try:
         if file_type == "pdf":
-            text = extract_all_pages(file_path)
+            chunks = extract_all_pages(file_path)
             return [{
                 "filename": os.path.basename(file_path),
-                "content": text,
+                "content": chunk,
                 "status": "initial",
                 "type": "pdf"
-            }]
+            } for chunk in chunks]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
         elif file_type == "csv":
@@ -191,9 +222,17 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
 def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
-    """Optimized tokenization and chunking with strict token limit enforcement"""
+    """Optimized tokenization and chunking with early validation"""
+    if len(text) > MAX_TEXT_LENGTH:
+        logger.warning(f"Text length ({len(text)}) exceeds limit ({MAX_TEXT_LENGTH}). Truncating.")
+        text = text[:MAX_TEXT_LENGTH]
+
     tokenizer = get_tokenizer()
     tokens = tokenizer.encode(text, add_special_tokens=False)
+    if len(tokens) > MODEL_MAX_TOKENS:
+        logger.error(f"Token count ({len(tokens)}) exceeds model limit ({MODEL_MAX_TOKENS}).")
+        return [text[:MAX_TEXT_LENGTH // 10]]  # Fallback to small chunk
+
     chunks = []
     current_chunk = []
     current_length = 0
@@ -210,21 +249,6 @@ def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
     if current_chunk:
         chunks.append(tokenizer.decode(current_chunk))
 
-    # Validate total tokens
-    total_tokens = sum(len(tokenizer.encode(chunk, add_special_tokens=False)) for chunk in chunks)
-    if total_tokens > MODEL_MAX_TOKENS:
-        logger.warning(f"Total tokens ({total_tokens}) exceed model limit ({MODEL_MAX_TOKENS}). Truncating.")
-        truncated_chunks = []
-        current_tokens = 0
-        for chunk in chunks:
-            chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
-            if current_tokens + chunk_tokens <= MODEL_MAX_TOKENS:
-                truncated_chunks.append(chunk)
-                current_tokens += chunk_tokens
-            else:
-                break
-        chunks = truncated_chunks
-
     return chunks
 
 def log_system_usage(tag=""):
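The post-hoc truncation loop removed here is superseded by the early guards added at the top of tokenize_and_chunk and by the running token budget kept in extract_all_pages. A minimal sketch of the early-guard idea, using the constants introduced in this commit:

MAX_TEXT_LENGTH = 500000
MODEL_MAX_TOKENS = 131072

def guard_input(text: str, tokenizer) -> str:
    # Truncate the raw text before tokenizing, then fall back to a small
    # slice if the token count would still exceed the model limit.
    if len(text) > MAX_TEXT_LENGTH:
        text = text[:MAX_TEXT_LENGTH]
    if len(tokenizer.encode(text, add_special_tokens=False)) > MODEL_MAX_TOKENS:
        text = text[:MAX_TEXT_LENGTH // 10]
    return text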
@@ -427,26 +451,22 @@ Patient Record Excerpt (Chunk {0} of {1}):
         history.append({"role": "assistant", "content": "✅ File processing complete"})
         yield history, None, ""
 
-        text_content = "\n".join(json.dumps(item, ensure_ascii=False) for item in extracted)
-        del extracted
-        gc.collect()
-
-        try:
-            chunks = tokenize_and_chunk(text_content)
-        except Exception as e:
-            logger.error(f"Tokenization error: {e}")
-            history.append({"role": "assistant", "content": f"❌ Error: Input too large to process. Please upload a smaller file."})
-            yield history, None, f"Error: Input too large to process."
+        if not extracted:
+            history.append({"role": "assistant", "content": "❌ No valid content extracted. Please upload a supported file."})
+            yield history, None, "No valid content extracted."
             return
-
-        del text_content
-        gc.collect()
-
+
         combined_response = ""
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
         seen_responses = set()
 
         try:
+            chunks = [item["content"] for item in extracted if "content" in item]
+            if not chunks:
+                history.append({"role": "assistant", "content": "❌ No processable content found in the file."})
+                yield history, None, "No processable content found."
+                return
+
             for batch_idx in range(0, len(chunks), BATCH_SIZE):
                 batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
 
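The old path re-serialized every extracted record with json.dumps and re-chunked the combined string; the new path reuses the per-chunk records already produced upstream. Sketch with hypothetical data:

extracted = [{"filename": "notes.pdf", "content": "=== Page 1 ===\n...", "status": "initial", "type": "pdf"}]
chunks = [item["content"] for item in extracted if "content" in item]  # one model pass per chunk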
 
@@ -511,10 +531,12 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         for chunk_output in future.result():
                             if isinstance(chunk_output, list):
                                 for msg in chunk_output:
-                                    if isinstance(msg, ChatMessage) and msg.content:
-                                        combined_response += clean_response(msg.content) + "\n"
-                                        history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                                        yield history, report_path, ""
+                                    if isinstance(msg, gr.ChatMessage) and msg.content:
+                                        cleaned_content = clean_response(msg.content)
+                                        if cleaned_content and cleaned_content != "No missed diagnoses identified.":
+                                            combined_response += cleaned_content + "\n"
+                                            history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                                            yield history, report_path, ""
                     except Exception as e:
                         logger.error(f"Detailed analysis error for chunk {batch_idx + chunk_idx + 1}: {e}")
                         history[-1] = {"role": "assistant", "content": f"Error in detailed analysis for chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
 