Ali2206 committed on
Commit 543491f · verified · 1 Parent(s): 58a777c

Update app.py

Files changed (1):
  app.py  +42 -28
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List, Tuple, Optional, Generator
+from typing import List, Dict, Optional, Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
@@ -16,13 +16,7 @@ import torch
 import gc
 from diskcache import Cache
 import time
-import pyarrow as pa
-import pyarrow.parquet as pq
-import pyarrow.csv as pc
-import numpy as np
-from functools import partial
-from itertools import islice
-import io
+from transformers import AutoTokenizer
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -56,6 +50,9 @@ from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
 cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
+# Initialize tokenizer for precise chunking
+tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
+
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
 
@@ -98,8 +95,8 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
 
-def excel_to_json(file_path: str) -> List[dict]:
-    """Convert Excel file to JSON data with proper error handling"""
+def excel_to_json(file_path: str) -> List[Dict]:
+    """Convert Excel file to JSON with optimized processing"""
     try:
         # First try with openpyxl (faster for xlsx)
         try:
@@ -108,38 +105,46 @@ def excel_to_json(file_path: str) -> List[dict]:
             # Fall back to xlrd if needed
             df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
 
-        # Convert to list of lists
-        content = df.fillna("").astype(str).values.tolist()
+        # Convert to list of lists with null handling
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "excel"
         }]
     except Exception as e:
         logger.error(f"Error processing Excel file: {e}")
         return [{"error": f"Error processing Excel file: {str(e)}"}]
 
-def csv_to_json(file_path: str) -> List[dict]:
-    """Convert CSV file to JSON data with proper error handling"""
+def csv_to_json(file_path: str) -> List[Dict]:
+    """Convert CSV file to JSON with optimized processing"""
     try:
-        df = pd.read_csv(
+        # Read CSV in chunks if large
+        chunks = []
+        for chunk in pd.read_csv(
             file_path,
             header=None,
             dtype=str,
             encoding_errors='replace',
-            on_bad_lines='skip'
-        )
-        content = df.fillna("").astype(str).values.tolist()
+            on_bad_lines='skip',
+            chunksize=10000
+        ):
+            chunks.append(chunk)
+
+        df = pd.concat(chunks) if chunks else pd.DataFrame()
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "csv"
         }]
     except Exception as e:
         logger.error(f"Error processing CSV file: {e}")
         return [{"error": f"Error processing CSV file: {str(e)}"}]
 
-def process_file(file_path: str, file_type: str) -> List[dict]:
+def process_file(file_path: str, file_type: str) -> List[Dict]:
     """Process file based on type and return JSON data"""
     try:
         if file_type == "pdf":
@@ -147,7 +152,8 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
             return [{
                 "filename": os.path.basename(file_path),
                 "content": text,
-                "status": "initial"
+                "status": "initial",
+                "type": "pdf"
             }]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
@@ -159,6 +165,15 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
+def tokenize_and_chunk(text: str, max_tokens: int = 1800) -> List[str]:
+    """Split text into chunks based on token count"""
+    tokens = tokenizer.encode(text)
+    chunks = []
+    for i in range(0, len(tokens), max_tokens):
+        chunk_tokens = tokens[i:i + max_tokens]
+        chunks.append(tokenizer.decode(chunk_tokens))
+    return chunks
+
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
@@ -308,14 +323,13 @@ Patient Record Excerpt (Chunk {0} of {1}):
         history.append({"role": "assistant", "content": "✅ File processing complete"})
         yield history, None, ""
 
-        # Convert extracted data to text
+        # Convert extracted data to JSON text
        text_content = "\n".join(json.dumps(item) for item in extracted)
 
-        # Process chunks in parallel with dynamic batching
-        chunk_size = 8000
-        chunks = [text_content[i:i+chunk_size] for i in range(0, len(text_content), chunk_size)]
+        # Tokenize and chunk the content properly
+        chunks = tokenize_and_chunk(text_content)
         combined_response = ""
-        batch_size = 4  # Optimal for most GPUs
+        batch_size = 2  # Reduced batch size to prevent token overflow
 
         try:
             for batch_idx in range(0, len(chunks), batch_size):
@@ -324,7 +338,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     prompt_template.format(
                         batch_idx + i + 1,
                         len(chunks),
-                        chunk=chunk[:6000]
+                        chunk=chunk[:1800]  # Conservative chunk size
                     )
                     for i, chunk in enumerate(batch_chunks)
                 ]
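
Note on the change: the commit replaces fixed 8,000-character slicing with token-based chunking. Below is a minimal standalone sketch of that behavior, using the same tokenizer checkpoint the diff loads; the sample text and printout are illustrative and not part of the commit.

from transformers import AutoTokenizer

# Same checkpoint app.py now loads at module level (assumes Hub access or a local copy).
tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")

def tokenize_and_chunk(text: str, max_tokens: int = 1800) -> list[str]:
    """Split text into chunks of at most max_tokens tokens, mirroring the helper added in this commit."""
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[i:i + max_tokens])
        for i in range(0, len(tokens), max_tokens)
    ]

if __name__ == "__main__":
    sample = "hemoglobin 13.2 g/dL; creatinine 1.1 mg/dL\n" * 400  # illustrative record text
    chunks = tokenize_and_chunk(sample)
    print(f"{len(chunks)} chunks produced")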
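
And a sketch of how the reduced batch size walks over those chunks. The prompt template here is a stand-in: only its first line, "Patient Record Excerpt (Chunk {0} of {1}):", appears in the hunk context; the rest is assumed for illustration.

# Illustrative stand-in for the real prompt_template defined elsewhere in app.py.
prompt_template = "Patient Record Excerpt (Chunk {0} of {1}):\n{chunk}"

chunks = [f"chunk {n} text" for n in range(5)]  # pretend output of tokenize_and_chunk
batch_size = 2  # value set by this commit to avoid token overflow

for batch_idx in range(0, len(chunks), batch_size):
    batch_chunks = chunks[batch_idx:batch_idx + batch_size]
    batch_prompts = [
        # Positional args fill {0} and {1}; the keyword fills {chunk}, truncated to 1800 characters.
        prompt_template.format(batch_idx + i + 1, len(chunks), chunk=chunk[:1800])
        for i, chunk in enumerate(batch_chunks)
    ]
    for prompt in batch_prompts:
        print(prompt.splitlines()[0])  # e.g. "Patient Record Excerpt (Chunk 1 of 5):"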