Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ import pytesseract
|
|
8 |
from pdf2image import convert_from_path
|
9 |
from huggingface_hub import InferenceClient
|
10 |
|
11 |
-
# Initialize Hugging Face Inference Client
|
12 |
hf_token = os.getenv("HF_TOKEN")
|
13 |
-
client = InferenceClient(model="
|
14 |
|
15 |
def extract_excel_data(file_path):
|
16 |
"""Extract text from Excel file"""
|
@@ -86,11 +86,23 @@ Rules:
|
|
86 |
1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
|
87 |
2. Convert negative balances to standard format (e.g., "-2421.72")
|
88 |
3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
|
|
|
89 |
"""
|
90 |
|
91 |
try:
|
92 |
# Call LLM via Hugging Face Inference API
|
93 |
-
response = client.text_generation(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
return json.loads(response)
|
95 |
except Exception as e:
|
96 |
print(f"LLM Error: {str(e)}")
|
@@ -104,7 +116,7 @@ def rule_based_parser(text):
|
|
104 |
# Find header line containing '| Date'
|
105 |
header_index = None
|
106 |
for i, line in enumerate(lines):
|
107 |
-
if re.search(r'\|Date', line):
|
108 |
header_index = i
|
109 |
break
|
110 |
|
@@ -115,7 +127,7 @@ def rule_based_parser(text):
|
|
115 |
transactions = []
|
116 |
|
117 |
for line in data_lines:
|
118 |
-
if not
|
119 |
continue
|
120 |
|
121 |
parts = [p.strip() for p in line.split('|') if p.strip()]
|
@@ -123,13 +135,14 @@ def rule_based_parser(text):
|
|
123 |
continue
|
124 |
|
125 |
try:
|
|
|
126 |
transactions.append({
|
127 |
"date": parts[0],
|
128 |
"description": parts[1],
|
129 |
-
"amount": parts[2],
|
130 |
-
"debit": parts[3],
|
131 |
-
"credit": parts[4],
|
132 |
-
"closing_balance": parts[5],
|
133 |
"category": parts[6]
|
134 |
})
|
135 |
except Exception as e:
|
@@ -137,6 +150,13 @@ def rule_based_parser(text):
|
|
137 |
|
138 |
return {"transactions": transactions}
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
def process_file(file, is_scanned):
|
141 |
"""Main processing function"""
|
142 |
if not file:
|
|
|
8 |
from pdf2image import convert_from_path
|
9 |
from huggingface_hub import InferenceClient
|
10 |
|
11 |
+
# Initialize Hugging Face Inference Client with a free model
|
12 |
hf_token = os.getenv("HF_TOKEN")
|
13 |
+
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
|
14 |
|
15 |
def extract_excel_data(file_path):
|
16 |
"""Extract text from Excel file"""
|
|
|
86 |
1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
|
87 |
2. Convert negative balances to standard format (e.g., "-2421.72")
|
88 |
3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
|
89 |
+
4. Only return valid JSON with no additional text
|
90 |
"""
|
91 |
|
92 |
try:
|
93 |
# Call LLM via Hugging Face Inference API
|
94 |
+
response = client.text_generation(
|
95 |
+
prompt,
|
96 |
+
max_new_tokens=2000,
|
97 |
+
temperature=0.1,
|
98 |
+
stop_sequences=["</s>"]
|
99 |
+
)
|
100 |
+
print(f"LLM Response: {response}")
|
101 |
+
|
102 |
+
# Extract JSON from response (remove non-JSON prefixes/suffixes)
|
103 |
+
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
104 |
+
if json_match:
|
105 |
+
return json.loads(json_match.group())
|
106 |
return json.loads(response)
|
107 |
except Exception as e:
|
108 |
print(f"LLM Error: {str(e)}")
|
|
|
116 |
# Find header line containing '|Date' or 'Date|' (case-insensitive, pipe directly adjacent to 'Date')
|
117 |
header_index = None
|
118 |
for i, line in enumerate(lines):
|
119 |
+
if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
|
120 |
header_index = i
|
121 |
break
|
122 |
|
|
|
127 |
transactions = []
|
128 |
|
129 |
for line in data_lines:
|
130 |
+
if not '|' in line:
|
131 |
continue
|
132 |
|
133 |
parts = [p.strip() for p in line.split('|') if p.strip()]
|
|
|
135 |
continue
|
136 |
|
137 |
try:
|
138 |
+
# Handle numeric values consistently
|
139 |
transactions.append({
|
140 |
"date": parts[0],
|
141 |
"description": parts[1],
|
142 |
+
"amount": format_number(parts[2]),
|
143 |
+
"debit": format_number(parts[3]),
|
144 |
+
"credit": format_number(parts[4]),
|
145 |
+
"closing_balance": format_number(parts[5]),
|
146 |
"category": parts[6]
|
147 |
})
|
148 |
except Exception as e:
|
|
|
150 |
|
151 |
return {"transactions": transactions}
|
152 |
|
153 |
+
def format_number(value):
    """Normalize a numeric string to a plain two-decimal representation.

    Surrounding whitespace and thousands separators (commas) are removed
    before the value is inspected.

    Args:
        value: Raw cell text, e.g. "1,234.5", " -2421.72 " or "-".

    Returns:
        * "0.00" when the cell is empty or holds only the "-" placeholder,
          matching the prompt rule in this file that numeric fields must
          contain valid numbers ("0.00" instead of "-").
        * The number formatted with exactly two decimals (e.g. "1234.50")
          when the cleaned text is an optionally negative decimal number.
        * The cleaned text unchanged otherwise, so unrecognised content is
          passed through rather than silently altered.
    """
    cleaned = value.strip().replace(',', '')

    # A lone dash (or nothing at all) marks an empty numeric cell.
    if cleaned in ('', '-'):
        return "0.00"

    # fullmatch anchors at both ends, so no explicit ^...$ is required.
    if re.fullmatch(r'-?\d+(\.\d+)?', cleaned):
        return f"{float(cleaned):.2f}"

    return cleaned
|
159 |
+
|
160 |
def process_file(file, is_scanned):
|
161 |
"""Main processing function"""
|
162 |
if not file:
|