Spaces:

kkawamu1
/

Utility_Bill_Parser

Runtime error

File size: 4,424 Bytes

b17d312

import json
import os

import google.generativeai as genai
import gradio as gr
import pandas as pd
from gradio_pdf import PDF
from pdf2image import convert_from_path
from pypdf import PdfReader
from pathlib import Path
# Directory that contains this script.
dir_ = Path(__file__).parent

# Hand the API key to google.generativeai; the value is None when the
# GOOGLE_API_KEY environment variable is not set.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Column names of the result table, in display order. The same strings are
# used as the keys looked up in the model's JSON reply.
headers = [
    # Bill identification
    "DUE DATE",
    "SERVICE ADDRESS",
    "SERVICE PERIOD",
    # Metered utilities: usage followed by spend
    "ELECTRICITY USAGE (KWH)",
    "ELECTRICITY SPEND ($)",
    "GAS USAGE (THERMS)",
    "GAS SPEND ($)",
    "WATER USAGE (CCF)",
    "WATER SPEND ($)",
    # Flat charges and taxes
    "SEWER ($)",
    "REFUSE ($)",
    "STORM DRAIN ($)",
    "UTILITY USERS TAX ($)",
    # Totals
    "TOTAL CURRENT CHARGES ($)",
    "TOTAL AMOUNT DUE",
]


# Single input component: the uploaded PDF document.
inputs = [PDF(label="Document")]

# Single output component: a table with one string-typed column per entry in
# `headers`. The column count and the datatype list are derived from `headers`
# so the three can never drift apart when a column is added or removed.
outputs = [
    gr.Dataframe(
        row_count=(1, "dynamic"),
        col_count=(len(headers), "fixed"),
        label="Utility",
        headers=headers,
        datatype=["str"] * len(headers),
    )
]


def get_content_between_curly_braces(text):
    """
    Return the span of a string running from its first "{" to its last "}".

    Args:
        text: The string to search.

    Returns:
        The substring including both braces, or None when there is no "{"
        or no "}" located after the first "{".
    """
    opening = text.find("{")
    if opening == -1:
        return None

    closing = text.rfind("}")
    if closing <= opening:
        # Covers both "no closing brace at all" (-1) and "closing brace
        # appears before the opening one".
        return None

    return text[opening : closing + 1]



def _build_prompt(ocr_text):
    """Compose the extraction prompt, deriving the JSON schema from `headers`."""
    # Generating the schema from `headers` keeps the keys requested from the
    # model identical to the keys looked up afterwards, and json.dumps
    # guarantees the schema itself is well-formed JSON.
    schema = {
        "type": "object",
        "properties": {column: {"type": "string"} for column in headers},
    }
    return (
        "Please extract the following JSON object from the utility bill I give. "
        f"Here is the noisy OCR extraction of the page {ocr_text}. "
        "Depending on the document, it may contain values for only a few keys "
        "such as SEWER. So, you have to be extra careful. "
        "This JSON schema:\n"
        f"{json.dumps(schema)}."
    )


def _parse_model_json(reply_text):
    """Extract and decode the JSON object embedded in the model's reply."""
    json_text = get_content_between_curly_braces(reply_text)
    if json_text is None:
        raise gr.Error("The model reply did not contain a JSON object.")
    try:
        return json.loads(json_text)
    except json.JSONDecodeError as err:
        raise gr.Error(f"The model reply was not valid JSON: {err}") from err


def parse_utility_bill(filepath):
    """
    Extract utility-bill fields from the first page of a PDF.

    The first page's extracted text and its rendered image are sent to the
    "gemini-pro-vision" model together with a JSON schema built from
    `headers`; the JSON object found in the reply is mapped onto those
    headers.

    Args:
        filepath: Path of the PDF file to parse.

    Returns:
        A one-row pandas DataFrame with one column per entry in `headers`;
        fields absent from the model's reply are None.

    Raises:
        gr.Error: If no file was given, the PDF has no pages, the first page
            cannot be rendered, or the model's reply holds no valid JSON
            object.
    """
    if not filepath:
        raise gr.Error("Please upload a PDF document first.")

    print("FOUND PDF!")
    reader = PdfReader(filepath)
    if len(reader.pages) == 0:
        raise gr.Error("The uploaded PDF has no pages.")
    text = reader.pages[0].extract_text()

    # Only the first page is sent to the model, so render just that page
    # instead of rasterising the whole document.
    images = convert_from_path(filepath, first_page=1, last_page=1)
    if not images:
        raise gr.Error("Could not render the first page of the PDF.")
    image = images[0]

    print("---------------------------------------------------------------")
    print("We have the image at: ")
    print(image)
    print("Here is the text:")
    print(text)
    print("---------------------------------------------------------------")

    model = genai.GenerativeModel("gemini-pro-vision")
    prompt_text = _build_prompt(text)
    print(f"PROMPT: {prompt_text}")
    response = model.generate_content(
        [prompt_text, image],
        generation_config={"max_output_tokens": 2048, "temperature": 0.0},
    )

    response_dict = _parse_model_json(response.text)
    print(response_dict)

    # Keep exactly the expected columns, in header order; keys the model
    # did not return become None and unexpected keys are dropped.
    rectified_dict = {column: response_dict.get(column) for column in headers}
    print(rectified_dict)

    return pd.DataFrame([rectified_dict])

# Sample PDFs offered as clickable examples. They are resolved against this
# script's directory (assumed to be where the files live — confirm) so they
# load regardless of the working directory the app is started from.
example_files = [
    str(dir_ / name)
    for name in (
        "utl-bill-sample.pdf",
        "nem-2-utility-bill-sample.pdf",
        "Sample_Utility_Bill.pdf",
        "Water Bill Sample.pdf",
        "canada.pdf",
        "water.pdf",
    )
]

# Build the web UI around parse_utility_bill and start serving it.
demo = gr.Interface(
    fn=parse_utility_bill,
    inputs=inputs,
    outputs=outputs,
    examples=example_files,
    title="🌏⚡💧🔥PDF Utility Bill Parser",
)
demo.launch()