Spaces:

kkawamu1
/

Utility_Bill_Parser

Runtime error

App Files Files

xet

Community

Utility_Bill_Parser / app.py

kkawamu1

Fix package issues

376bc5c over 1 year ago

raw

history blame

4.31 kB

	import json
	import os
	from pathlib import Path

	import google.generativeai as genai
	import gradio as gr
	import pandas as pd
	from gradio_pdf import PDF
	from pdf2image import convert_from_path
	from pypdf import PdfReader

	# Configure the google.generativeai client with the API key read from the
	# GOOGLE_API_KEY environment variable (None is passed when it is unset).
	genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

	# Field names extracted from a bill, in display order. The same list supplies
	# the result table's column headers and the keys looked up in the model's
	# JSON answer.
	headers = [
	    # Bill identification
	    "DUE DATE",
	    "SERVICE ADDRESS",
	    "SERVICE PERIOD",
	    # Metered utilities: usage and spend
	    "ELECTRICITY USAGE (KWH)",
	    "ELECTRICITY SPEND ($)",
	    "GAS USAGE (THERMS)",
	    "GAS SPEND ($)",
	    "WATER USAGE (CCF)",
	    "WATER SPEND ($)",
	    # Other line items
	    "SEWER ($)",
	    "REFUSE ($)",
	    "STORM DRAIN ($)",
	    "UTILITY USERS TAX ($)",
	    # Totals
	    "TOTAL CURRENT CHARGES ($)",
	    "TOTAL AMOUNT DUE",
	]


	# Input side of the interface: a single PDF component labelled "Document".
	inputs = [PDF(label="Document")]

	# Output side: one table with a fixed column per entry in `headers`, every
	# column typed as a string, and a dynamic number of rows starting at one.
	outputs = [
	    gr.Dataframe(
	        row_count=(1, "dynamic"),
	        col_count=(len(headers), "fixed"),
	        label="Utility",
	        headers=headers,
	        datatype=["str"] * len(headers),
	    )
	]


	def get_content_between_curly_braces(text):
	    """Return the outermost ``{...}`` span found in ``text``.

	    The span runs from the first ``{`` through the last ``}`` (both included),
	    so nested or repeated brace pairs come back as a single piece.

	    Args:
	        text: The string to search.

	    Returns:
	        The substring including both braces, or None when ``text`` has no
	        ``{`` or has no ``}`` positioned after the first ``{``.
	    """
	    first_open = text.find("{")
	    if first_open == -1:
	        return None

	    last_close = text.rfind("}")
	    if last_close <= first_open:
	        return None

	    return text[first_open : last_close + 1]


	def _build_extraction_prompt(ocr_text):
	    """Compose the model prompt: instructions, OCR text, and a JSON schema built from ``headers``."""
	    # Generating the schema from `headers` keeps the requested keys identical
	    # to the table columns and guarantees the schema is well-formed JSON.
	    schema = {
	        "type": "object",
	        "properties": {name: {"type": "string"} for name in headers},
	    }
	    return (
	        f""" Please extract the following JSON object from the utility bill I give. Here is the noisy OCR extractio of the page {ocr_text}. Depending on the document, it may contain values for only a few keys such as SEWER. So, you have to be extra carefull."""
	        + " This JSON schema:\n"
	        + json.dumps(schema)
	        + "."
	    )


	def _normalize_key(key):
	    """Canonicalise a field name so matching ignores case and surrounding whitespace."""
	    return str(key).strip().upper()


	def _align_to_headers(extracted):
	    """Map the model's key/value pairs onto ``headers``, using None for fields it did not return."""
	    by_normalized = {_normalize_key(key): value for key, value in extracted.items()}
	    aligned = {}
	    for name in headers:
	        if name in extracted:
	            # An exact key match always wins.
	            aligned[name] = extracted[name]
	        else:
	            # Fall back to a case/whitespace-insensitive match, else None.
	            aligned[name] = by_normalized.get(_normalize_key(name))
	    return aligned


	def parse_utility_bill(filepath):
	    """Extract the fields listed in ``headers`` from the first page of a PDF bill.

	    The first page's extracted text and a rendered image of that page are sent
	    to a Gemini vision model together with a JSON schema derived from
	    ``headers``. The JSON object found in the reply is then aligned to
	    ``headers``.

	    Args:
	        filepath: Path of the PDF file to parse.

	    Returns:
	        A one-row ``pandas.DataFrame`` whose columns are ``headers``; fields
	        the model did not return are None.

	    Raises:
	        ValueError: If no file is given, the PDF has no pages, the first page
	            cannot be rendered, or the model reply contains no JSON object
	            (``json.JSONDecodeError``, a ``ValueError`` subclass, is raised
	            when the object found is not valid JSON).
	    """
	    if not filepath:
	        raise ValueError("No PDF file was provided.")

	    print("FOUND PDF!")
	    reader = PdfReader(filepath)
	    if len(reader.pages) == 0:
	        raise ValueError("The PDF has no pages to parse.")
	    text = reader.pages[0].extract_text()

	    # Only the first page is sent to the model, so rasterise just that page
	    # instead of the whole document.
	    images = convert_from_path(filepath, first_page=1, last_page=1)
	    if not images:
	        raise ValueError("The first page of the PDF could not be rendered as an image.")
	    image = images[0]

	    print("---------------------------------------------------------------")
	    print("We have the image at: ")
	    print(image)
	    print("Here is the text:")
	    print(text)
	    print("---------------------------------------------------------------")

	    # NOTE(review): confirm "gemini-pro-vision" is still served by the API;
	    # the model name is kept exactly as in the original code.
	    model = genai.GenerativeModel(
	        "gemini-pro-vision",
	    )
	    promt_text = _build_extraction_prompt(text)
	    print(f"PROMPT: {promt_text}")
	    response = model.generate_content(
	        [
	            promt_text,
	            image,
	        ],
	        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
	    )

	    json_response = get_content_between_curly_braces(response.text)
	    if json_response is None:
	        # Without this check json.loads(None) would fail with an opaque TypeError.
	        raise ValueError("The model response did not contain a JSON object.")
	    response_dict = json.loads(json_response)
	    print(response_dict)

	    rectified_dict = _align_to_headers(response_dict)
	    print(rectified_dict)

	    return pd.DataFrame([rectified_dict], columns=headers)


	# PDF file names handed to the interface as its selectable examples.
	example_pdfs = [
	    "utl-bill-sample.pdf",
	    "nem-2-utility-bill-sample.pdf",
	    "Sample_Utility_Bill.pdf",
	    "Water Bill Sample.pdf",
	    "canada.pdf",
	    "water.pdf",
	]

	# Wire the parser to the input/output components, then start the app.
	demo = gr.Interface(
	    fn=parse_utility_bill,
	    inputs=inputs,
	    outputs=outputs,
	    examples=example_pdfs,
	    title="🌏⚡💧🔥PDF Utitlity Bill Parser",
	)
	demo.launch()