Update app.py
app.py
CHANGED
@@ -81,35 +81,45 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_

    def extract_page(i):
        page = pdf.pages[i]
-       # Try table extraction first
        text = ""
-       tables = page.extract_tables()
+       # Adjust table settings for complex layouts
+       table_settings = {
+           "vertical_strategy": "lines",
+           "horizontal_strategy": "lines",
+           "explicit_vertical_lines": [],
+           "explicit_horizontal_lines": [],
+           "snap_tolerance": 5,
+           "join_tolerance": 5,
+           "edge_min_length": 3,
+           "min_words_vertical": 3,
+           "min_words_horizontal": 1,
+           "intersection_tolerance": 5,
+       }
+       tables = page.extract_tables(table_settings=table_settings)
        if tables:
            for table in tables:
-               # Mimic Excel/CSV: join non-None cells as strings
                table_text = "\n".join(
                    " | ".join(str(cell) if cell is not None else "" for cell in row)
-                   for row in table
+                   for row in table if any(cell is not None for cell in row)
                )
                text += table_text + "\n\n"
            logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
        else:
-           # Fall back to raw text
            text = page.extract_text() or ""
            logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))

-       # OCR if text is short or force_ocr is True
+       # Force OCR if text is short or force_ocr is True
        if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
            try:
                logger.info("Attempting OCR for page %d", i + 1)
                pdfium_pdf = pdfium.PdfDocument(file_path)
                page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
-               text = pytesseract.image_to_string(page_bitmap, lang="eng")
-               logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
+               ocr_text = pytesseract.image_to_string(page_bitmap, lang="eng")
+               logger.debug("Page %d OCR text length: %d chars", i + 1, len(ocr_text))
+               text = ocr_text if ocr_text.strip() else text
                pdfium_pdf.close()
            except Exception as e:
                logger.error("OCR failed for page %d: %s", i + 1, e)
-               text = text or ""
        return (i, f"=== Page {i + 1} ===\n{text.strip()}")

    with ThreadPoolExecutor(max_workers=4) as executor:

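The hunk above switches table extraction to explicit line-based settings and adds a pytesseract fallback when the text layer is thin. For trying the same pattern outside the app, here is a minimal standalone sketch (not the committed code); it assumes pdfplumber, pypdfium2, and pytesseract are installed, and "sample.pdf" is a placeholder path:

# Minimal sketch: table-first extraction with an OCR fallback for one page.
# Assumes pdfplumber, pypdfium2, and pytesseract are installed; "sample.pdf" is a placeholder.
import pdfplumber
import pypdfium2 as pdfium
import pytesseract

TABLE_SETTINGS = {"vertical_strategy": "lines", "horizontal_strategy": "lines", "snap_tolerance": 5}

def extract_one_page(path: str, index: int) -> str:
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[index]
        tables = page.extract_tables(table_settings=TABLE_SETTINGS)
        if tables:
            # Join cells with " | ", one table row per line, blank line between tables
            text = "\n\n".join(
                "\n".join(" | ".join("" if c is None else str(c) for c in row) for row in t)
                for t in tables
            )
        else:
            text = page.extract_text() or ""
    if len(text.strip()) < 100:  # fall back to OCR when the text layer is thin
        doc = pdfium.PdfDocument(path)
        try:
            image = doc[index].render(scale=2).to_pil()
            ocr = pytesseract.image_to_string(image, lang="eng")
            text = ocr if ocr.strip() else text
        finally:
            doc.close()
    return text

print(extract_one_page("sample.pdf", 0))
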
@@ -124,6 +134,7 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
    text_chunks.sort(key=lambda x: x[0])
    extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
    logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+   # Force OCR retry if text is too short
    if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
        logger.info("Text too short, forcing OCR for all pages")
        return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)

@@ -136,25 +147,54 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
    try:
        file_h = file_hash(file_path)
        cache_key = f"{file_h}_{file_type}"
-       if cache_key in cache:
-           logger.info("Using cached extraction for %s", file_path)
-           return cache[cache_key]
+       # Bypass cache to force fresh extraction
+       logger.info("Forcing fresh extraction for %s", file_path)
+       # if cache_key in cache:
+       #     logger.info("Using cached extraction for %s", file_path)
+       #     return cache[cache_key]

        if file_type == "pdf":
            text = asyncio.run(extract_all_pages_async(file_path, progress_callback, force_ocr=False))
            result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
        elif file_type == "csv":
-
-
-
-
+           try:
+               df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
+                                skip_blank_lines=False, on_bad_lines="skip")
+               content = df.fillna("").astype(str).values.tolist()
+               result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+               logger.info("CSV processed, rows: %d", len(content))
+           except Exception as e:
+               logger.error("CSV processing failed: %s", e)
+               result = json.dumps({"error": f"CSV processing failed: {str(e)}"})
        elif file_type in ["xls", "xlsx"]:
            try:
-
-
-
-
-
+               # Try all sheets to maximize data
+               xl = pd.ExcelFile(file_path, engine="openpyxl")
+               content = []
+               for sheet_name in xl.sheet_names:
+                   try:
+                       df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
+                       sheet_content = df.fillna("").astype(str).values.tolist()
+                       content.extend(sheet_content)
+                       logger.debug("Excel sheet %s processed, rows: %d", sheet_name, len(sheet_content))
+                   except Exception as e:
+                       logger.warning("Excel sheet %s failed: %s", sheet_name, e)
+               if not content:
+                   logger.error("No valid data extracted from Excel")
+                   result = json.dumps({"error": "No valid data extracted from Excel"})
+               else:
+                   result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+                   logger.info("Excel processed, total rows: %d", len(content))
+           except Exception as e:
+               logger.error("Excel processing failed: %s", e)
+               try:
+                   df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
+                   content = df.fillna("").astype(str).values.tolist()
+                   result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+                   logger.info("Excel processed with xlrd, rows: %d", len(content))
+               except Exception as e2:
+                   logger.error("Excel processing failed with xlrd: %s", e2)
+                   result = json.dumps({"error": f"Excel processing failed: {str(e)}"})
        else:
            result = json.dumps({"error": f"Unsupported file type: {file_type}"})

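The spreadsheet branch above reads every sheet with openpyxl and falls back to xlrd on failure. A minimal standalone sketch of that pattern, assuming pandas with openpyxl (and xlrd for legacy .xls files) is installed and "data.xlsx" is a placeholder filename:

# Minimal sketch: read every sheet of a workbook as strings, with an xlrd fallback.
# Assumes pandas + openpyxl (and xlrd for legacy .xls); "data.xlsx" is a placeholder.
import json
import pandas as pd

def workbook_to_rows(path: str) -> str:
    rows = []
    try:
        xl = pd.ExcelFile(path, engine="openpyxl")
        for sheet_name in xl.sheet_names:
            df = pd.read_excel(path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
            rows.extend(df.fillna("").astype(str).values.tolist())
    except Exception:
        # openpyxl cannot read old binary .xls files; retry with xlrd
        df = pd.read_excel(path, engine="xlrd", header=None, dtype=str)
        rows = df.fillna("").astype(str).values.tolist()
    if not rows:
        return json.dumps({"error": "No valid data extracted from Excel"})
    return json.dumps({"filename": path, "rows": rows})

print(workbook_to_rows("data.xlsx"))
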
@@ -324,7 +364,7 @@ Patient Record Excerpt:
        chunk_response = ""
        raw_outputs = []
        for chunk_output in agent.run_gradio_chat(
-           message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=
+           message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=2048, call_agent=False, conversation=[]
        ):
            if chunk_output is None:
                continue