Bhaskar2611 committed on
Commit
aca59c0
·
verified ·
1 Parent(s): 15c9ede

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -9
app.py CHANGED
@@ -8,9 +8,9 @@ import pytesseract
8
  from pdf2image import convert_from_path
9
  from huggingface_hub import InferenceClient
10
 
11
- # Initialize Hugging Face Inference Client
12
  hf_token = os.getenv("HF_TOKEN")
13
- client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=hf_token)
14
 
15
  def extract_excel_data(file_path):
16
  """Extract text from Excel file"""
@@ -86,11 +86,23 @@ Rules:
86
  1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
87
  2. Convert negative balances to standard format (e.g., "-2421.72")
88
  3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
 
89
  """
90
 
91
  try:
92
  # Call LLM via Hugging Face Inference API
93
- response = client.text_generation(prompt, max_new_tokens=1000, temperature=0.1)
 
 
 
 
 
 
 
 
 
 
 
94
  return json.loads(response)
95
  except Exception as e:
96
  print(f"LLM Error: {str(e)}")
@@ -104,7 +116,7 @@ def rule_based_parser(text):
104
  # Find header line containing '| Date'
105
  header_index = None
106
  for i, line in enumerate(lines):
107
- if re.search(r'\|Date', line): # Improved pattern to match "|Date"
108
  header_index = i
109
  break
110
 
@@ -115,7 +127,7 @@ def rule_based_parser(text):
115
  transactions = []
116
 
117
  for line in data_lines:
118
- if not line.startswith('|'):
119
  continue
120
 
121
  parts = [p.strip() for p in line.split('|') if p.strip()]
@@ -123,13 +135,14 @@ def rule_based_parser(text):
123
  continue
124
 
125
  try:
 
126
  transactions.append({
127
  "date": parts[0],
128
  "description": parts[1],
129
- "amount": parts[2],
130
- "debit": parts[3],
131
- "credit": parts[4],
132
- "closing_balance": parts[5],
133
  "category": parts[6]
134
  })
135
  except Exception as e:
@@ -137,6 +150,13 @@ def rule_based_parser(text):
137
 
138
  return {"transactions": transactions}
139
 
 
 
 
 
 
 
 
140
  def process_file(file, is_scanned):
141
  """Main processing function"""
142
  if not file:
 
8
  from pdf2image import convert_from_path
9
  from huggingface_hub import InferenceClient
10
 
11
+ # Initialize Hugging Face Inference Client with a free model
12
  hf_token = os.getenv("HF_TOKEN")
13
+ client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
14
 
15
  def extract_excel_data(file_path):
16
  """Extract text from Excel file"""
 
86
  1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
87
  2. Convert negative balances to standard format (e.g., "-2421.72")
88
  3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
89
+ 4. Only return valid JSON with no additional text
90
  """
91
 
92
  try:
93
  # Call LLM via Hugging Face Inference API
94
+ response = client.text_generation(
95
+ prompt,
96
+ max_new_tokens=2000,
97
+ temperature=0.1,
98
+ stop_sequences=["</s>"]
99
+ )
100
+ print(f"LLM Response: {response}")
101
+
102
+ # Extract JSON from response (remove non-JSON prefixes/suffixes)
103
+ json_match = re.search(r'\{.*\}', response, re.DOTALL)
104
+ if json_match:
105
+ return json.loads(json_match.group())
106
  return json.loads(response)
107
  except Exception as e:
108
  print(f"LLM Error: {str(e)}")
 
116
  # Find header line containing '| Date'
117
  header_index = None
118
  for i, line in enumerate(lines):
119
+ if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
120
  header_index = i
121
  break
122
 
 
127
  transactions = []
128
 
129
  for line in data_lines:
130
+ if not '|' in line:
131
  continue
132
 
133
  parts = [p.strip() for p in line.split('|') if p.strip()]
 
135
  continue
136
 
137
  try:
138
+ # Handle numeric values consistently
139
  transactions.append({
140
  "date": parts[0],
141
  "description": parts[1],
142
+ "amount": format_number(parts[2]),
143
+ "debit": format_number(parts[3]),
144
+ "credit": format_number(parts[4]),
145
+ "closing_balance": format_number(parts[5]),
146
  "category": parts[6]
147
  })
148
  except Exception as e:
 
150
 
151
  return {"transactions": transactions}
152
 
153
def format_number(value):
    """Normalize a numeric table cell to a two-decimal string.

    Thousands separators (commas) and surrounding whitespace are removed
    first. The cleaned text is then handled as follows:

    * A lone ``"-"`` becomes ``"0.00"``, matching the extraction rules
      used elsewhere in this file ('"0.00" instead of "-"').
    * Accounting-style negatives, ``"(123.45)"`` and ``"123.45-"``, are
      rewritten with a leading minus sign (``"-123.45"``).
    * Plain decimal numbers (optional leading ``-``, digits, optional
      fractional part) are rendered with exactly two decimals.

    Args:
        value: Cell text, as produced by splitting a table row.

    Returns:
        The two-decimal string for numeric input; otherwise the cleaned
        text (commas and surrounding whitespace removed) so non-numeric
        cells are passed through rather than dropped.
    """
    cleaned = value.replace(",", "").strip()

    # A bare dash stands for "no amount" and must still be a valid number.
    if cleaned == "-":
        return "0.00"

    # Fold the two common accounting notations for negatives into the
    # standard leading-minus form before testing for a number.
    if cleaned.startswith("(") and cleaned.endswith(")"):
        candidate = "-" + cleaned[1:-1].strip()
    elif cleaned.endswith("-") and not cleaned.startswith("-"):
        candidate = "-" + cleaned[:-1].strip()
    else:
        candidate = cleaned

    # fullmatch ensures the whole cell is a number, not just a prefix.
    if re.fullmatch(r"-?\d+(\.\d+)?", candidate):
        return f"{float(candidate):.2f}"

    # Not a recognisable number: return the cleaned text unchanged.
    return cleaned
159
+
160
  def process_file(file, is_scanned):
161
  """Main processing function"""
162
  if not file: