Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List,
+from typing import List, Dict, Optional, Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
@@ -16,13 +16,7 @@ import torch
 import gc
 from diskcache import Cache
 import time
-
-import pyarrow.parquet as pq
-import pyarrow.csv as pc
-import numpy as np
-from functools import partial
-from itertools import islice
-import io
+from transformers import AutoTokenizer
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -56,6 +50,9 @@ from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
 cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
+# Initialize tokenizer for precise chunking
+tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
+
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
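For context on the new module-level tokenizer: a minimal sketch of how it can be used to measure prompt length in tokens. count_tokens is an illustrative helper, not part of app.py, and the sketch assumes the model repo can be downloaded from the Hugging Face Hub.

    from transformers import AutoTokenizer

    # Mirrors the module-level initialization in app.py
    tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")

    def count_tokens(text: str) -> int:
        # encode() returns a list of token ids; its length is the token count
        return len(tokenizer.encode(text))

    print(count_tokens("Patient Record Excerpt (Chunk 1 of 4): ..."))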
@@ -98,8 +95,8 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
 
-def excel_to_json(file_path: str) -> List[dict]:
-    """Convert Excel file to JSON
+def excel_to_json(file_path: str) -> List[Dict]:
+    """Convert Excel file to JSON with optimized processing"""
     try:
         # First try with openpyxl (faster for xlsx)
         try:
@@ -108,38 +105,46 @@ def excel_to_json(file_path: str) -> List[dict]:
             # Fall back to xlrd if needed
             df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
 
-        # Convert to list of lists
-        content = df.
+        # Convert to list of lists with null handling
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "excel"
         }]
     except Exception as e:
         logger.error(f"Error processing Excel file: {e}")
         return [{"error": f"Error processing Excel file: {str(e)}"}]
 
-def csv_to_json(file_path: str) -> List[dict]:
-    """Convert CSV file to JSON
+def csv_to_json(file_path: str) -> List[Dict]:
+    """Convert CSV file to JSON with optimized processing"""
     try:
-        df = pd.read_csv(
+        # Read CSV in chunks if large
+        chunks = []
+        for chunk in pd.read_csv(
             file_path,
             header=None,
             dtype=str,
             encoding_errors='replace',
-            on_bad_lines='skip'
-        )
-        content = df.
+            on_bad_lines='skip',
+            chunksize=10000
+        ):
+            chunks.append(chunk)
+
+        df = pd.concat(chunks) if chunks else pd.DataFrame()
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "csv"
         }]
     except Exception as e:
         logger.error(f"Error processing CSV file: {e}")
         return [{"error": f"Error processing CSV file: {str(e)}"}]
 
-def process_file(file_path: str, file_type: str) -> List[dict]:
+def process_file(file_path: str, file_type: str) -> List[Dict]:
     """Process file based on type and return JSON data"""
     try:
         if file_type == "pdf":
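As a standalone illustration of the chunked-read pattern that csv_to_json now uses (the "sample.csv" path is a placeholder, and the chunksize mirrors the value in the diff):

    import pandas as pd

    parts = []
    for part in pd.read_csv(
        "sample.csv",             # placeholder path
        header=None,
        dtype=str,
        encoding_errors="replace",
        on_bad_lines="skip",
        chunksize=10000,          # stream the file 10,000 rows at a time
    ):
        parts.append(part)

    df = pd.concat(parts) if parts else pd.DataFrame()
    # NaN cells become empty strings before converting to a list of row lists
    rows = df.where(pd.notnull(df), "").astype(str).values.tolist()
    print(len(rows), "rows")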
@@ -147,7 +152,8 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
             return [{
                 "filename": os.path.basename(file_path),
                 "content": text,
-                "status": "initial"
+                "status": "initial",
+                "type": "pdf"
             }]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
@@ -159,6 +165,15 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
+def tokenize_and_chunk(text: str, max_tokens: int = 1800) -> List[str]:
+    """Split text into chunks based on token count"""
+    tokens = tokenizer.encode(text)
+    chunks = []
+    for i in range(0, len(tokens), max_tokens):
+        chunk_tokens = tokens[i:i + max_tokens]
+        chunks.append(tokenizer.decode(chunk_tokens))
+    return chunks
+
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
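A small usage sketch for the new tokenize_and_chunk helper, assuming the tokenizer and helper defined above are in scope; the sample text is a placeholder, and re-encoded chunk lengths can differ slightly from the slice size when the tokenizer adds special tokens.

    # Placeholder clinical-style text, repeated so it spans several chunks
    sample = "Patient presented with chest pain and shortness of breath. " * 400
    chunks = tokenize_and_chunk(sample, max_tokens=1800)
    print(len(chunks), "chunks produced")
    # Each chunk was decoded from at most 1800 token ids
    print([len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks[:3]])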
@@ -308,14 +323,13 @@ Patient Record Excerpt (Chunk {0} of {1}):
             history.append({"role": "assistant", "content": "✅ File processing complete"})
             yield history, None, ""
 
-            # Convert extracted data to text
+            # Convert extracted data to JSON text
             text_content = "\n".join(json.dumps(item) for item in extracted)
 
-            #
-
-            chunks = [text_content[i:i+chunk_size] for i in range(0, len(text_content), chunk_size)]
+            # Tokenize and chunk the content properly
+            chunks = tokenize_and_chunk(text_content)
             combined_response = ""
-            batch_size =
+            batch_size = 2  # Reduced batch size to prevent token overflow
 
             try:
                 for batch_idx in range(0, len(chunks), batch_size):
@@ -324,7 +338,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         prompt_template.format(
                             batch_idx + i + 1,
                             len(chunks),
-                            chunk=chunk[:
+                            chunk=chunk[:1800]  # Conservative chunk size
                         )
                         for i, chunk in enumerate(batch_chunks)
                     ]
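For reference, a standalone sketch of the batching shape used in the loop above. The template body and chunk list here are illustrative stand-ins, not the exact prompt or data from app.py.

    # Stand-in template with the same positional/keyword fields as the real prompt
    prompt_template = "Patient Record Excerpt (Chunk {0} of {1}):\n{chunk}"
    chunks = ["chunk-a", "chunk-b", "chunk-c"]   # placeholder chunks
    batch_size = 2

    for batch_idx in range(0, len(chunks), batch_size):
        batch_chunks = chunks[batch_idx:batch_idx + batch_size]
        batch_prompts = [
            prompt_template.format(
                batch_idx + i + 1,   # 1-based chunk number across all batches
                len(chunks),         # total number of chunks
                chunk=chunk[:1800],  # hard cap on characters fed into the prompt
            )
            for i, chunk in enumerate(batch_chunks)
        ]
        # each batch of prompts would then be sent to the model in turn
        print(batch_prompts)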