Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -81,10 +81,10 @@ def rule_based_parser(text):
|
|
81 |
"""Fallback parser for structured tables with pipe delimiters"""
|
82 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
83 |
|
84 |
-
# Find header line containing 'Date'
|
85 |
header_index = None
|
86 |
for i, line in enumerate(lines):
|
87 |
-
if re.search(r'
|
88 |
header_index = i
|
89 |
break
|
90 |
|
@@ -120,7 +120,10 @@ def rule_based_parser(text):
|
|
120 |
def process_file(file, is_scanned):
|
121 |
"""Main processing function"""
|
122 |
if not file:
|
123 |
-
return
|
|
|
|
|
|
|
124 |
|
125 |
file_path = file.name
|
126 |
file_ext = os.path.splitext(file_path)[1].lower()
|
@@ -131,13 +134,32 @@ def process_file(file, is_scanned):
|
|
131 |
elif file_ext == '.pdf':
|
132 |
text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
|
133 |
else:
|
134 |
-
return
|
|
|
|
|
|
|
135 |
|
136 |
parsed_data = parse_bank_statement(text)
|
137 |
df = pd.DataFrame(parsed_data["transactions"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
return df
|
|
|
139 |
except Exception as e:
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
# Gradio Interface
|
143 |
interface = gr.Interface(
|
|
|
81 |
"""Fallback parser for structured tables with pipe delimiters"""
|
82 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
83 |
|
84 |
+
# Find header line containing '|Date' (a pipe immediately followed by "Date", no space)
|
85 |
header_index = None
|
86 |
for i, line in enumerate(lines):
|
87 |
+
if re.search(r'\|Date', line): # Improved pattern to match "|Date"
|
88 |
header_index = i
|
89 |
break
|
90 |
|
|
|
120 |
def process_file(file, is_scanned):
|
121 |
"""Main processing function"""
|
122 |
if not file:
|
123 |
+
return pd.DataFrame(columns=[
|
124 |
+
"Date", "Description", "Amount", "Debit",
|
125 |
+
"Credit", "Closing Balance", "Category"
|
126 |
+
])
|
127 |
|
128 |
file_path = file.name
|
129 |
file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
134 |
elif file_ext == '.pdf':
|
135 |
text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
|
136 |
else:
|
137 |
+
return pd.DataFrame(columns=[
|
138 |
+
"Date", "Description", "Amount", "Debit",
|
139 |
+
"Credit", "Closing Balance", "Category"
|
140 |
+
])
|
141 |
|
142 |
parsed_data = parse_bank_statement(text)
|
143 |
df = pd.DataFrame(parsed_data["transactions"])
|
144 |
+
|
145 |
+
# Ensure all required columns exist
|
146 |
+
required_cols = ["date", "description", "amount", "debit",
|
147 |
+
"credit", "closing_balance", "category"]
|
148 |
+
for col in required_cols:
|
149 |
+
if col not in df.columns:
|
150 |
+
df[col] = ""
|
151 |
+
|
152 |
+
df.columns = ["Date", "Description", "Amount", "Debit",
|
153 |
+
"Credit", "Closing Balance", "Category"]
|
154 |
return df
|
155 |
+
|
156 |
except Exception as e:
|
157 |
+
print(f"Processing error: {str(e)}")
|
158 |
+
# Return empty DataFrame with correct columns on error
|
159 |
+
return pd.DataFrame(columns=[
|
160 |
+
"Date", "Description", "Amount", "Debit",
|
161 |
+
"Credit", "Closing Balance", "Category"
|
162 |
+
])
|
163 |
|
164 |
# Gradio Interface
|
165 |
interface = gr.Interface(
|