File size: 3,083 Bytes
1aa2346
 
 
2469e02
1aa2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0e0fd8
f5d7472
 
1aa2346
2469e02
 
 
 
 
 
 
4bab2d8
2469e02
 
 
 
 
 
1aa2346
 
 
 
 
 
 
8b56e33
f5d7472
8b56e33
1aa2346
 
 
 
 
 
 
ffcecb5
1aa2346
8b56e33
05f325e
8b56e33
05f325e
8b56e33
 
1aa2346
 
 
8b56e33
 
1aa2346
 
 
 
84d6cd0
1aa2346
 
3e8ad7c
1aa2346
 
 
 
 
8b56e33
 
 
 
1aa2346
 
2469e02
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import string
import re
import pandas as pd

from transformers import pipeline

# Hugging Face checkpoint fine-tuned to label Canadian postal address components.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
# "simple" aggregation merges sub-word tokens into whole-word entity spans,
# so each result covers a complete word with one entity_group label.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text):
    """Return *text* with every ASCII punctuation character replaced by a space.

    Args:
        text: The input string.

    Returns:
        A copy of *text* where each character in ``string.punctuation``
        has become a single space; all other characters are unchanged.
    """
    # Map each punctuation code point to a space; translate() does one C-level pass.
    table = {ord(ch): ' ' for ch in string.punctuation}
    return text.translate(table)

def replace_multiple_spaces(text):
    """Collapse each run of contiguous whitespace in *text* into one space.

    Note: the pattern is ``\\s+``, so tabs and newlines (not only the
    space character) are also collapsed into a single space.

    Args:
        text: The input string.

    Returns:
        A copy of *text* with every maximal whitespace run replaced by
        a single space character.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def parse_postal_address_can(text):
    """Parse a Canadian postal address into its labeled components.

    The text is lower-cased and lightly normalized (dots removed,
    whitespace runs collapsed) before being run through the
    token-classification pipeline.

    Args:
        text: A free-form Canadian postal address.

    Returns:
        pandas.DataFrame with one row per detected entity and columns
        ``entity_group``, ``score``, ``word``, ``start``, ``end``.
        When nothing is detected the frame is empty but still carries
        the same columns, so the UI table renders consistently.
    """
    columns = ['entity_group', 'score', 'word', 'start', 'end']

    # Normalize to match the model's training data: lower-cased, dot-free text.
    # NOTE(review): replace_punctuation_with_space is intentionally NOT applied
    # here (it was tried and disabled) — confirm before re-enabling.
    text = text.lower()
    text = text.replace(".", "")
    text = replace_multiple_spaces(text)
    results = token_classifier(text)

    # One row per entity; score rendered as a 2-decimal string for display.
    data = [
        {
            'entity_group': result['entity_group'],
            'score': f"{result['score']:.2f}",
            'word': result['word'],
            'start': result['start'],
            'end': result['end'],
        }
        for result in results
    ]
    # Passing columns explicitly keeps the header even when data is empty.
    return pd.DataFrame(data, columns=columns)


#
# User interface
#
with gr.Blocks() as demo:

    # App title / version banner.
    gr.Markdown("""
        ## Canadian postal address parsing - version 0.02
    """)

    # render=False: the widgets are declared here but only rendered when
    # handed to the gr.Interface below.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # Clickable sample addresses shown under the input box.
    examples = [
        ["405-200 René Lévesque Blvd W,  Montreal, Quebec H2Z 1X4",],
        ["1 Sussex Dr, Ottawa, ON K1A 0A1",],
        ["5124 53 St, #205, Yellowknife, Northwest Territories, X1A 1V6",]
    ]

    # Wire the parser function to the pre-declared input/output widgets.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output,],
        examples=examples
    )

    # Collapsed help section: label set, training data, known limitations.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - Labels (address components):
                - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
            - Dataset trained on:
                - 15+ million Canadian postal addresses available at OpenAddresses.io
            - (Current) Limitations:
                - no label for person_name / company_name (no data to train on)
                - trained on **post-normalized** addresses from OpenAddresses.io,
                  hence missing un-normalized forms. E.g. "ST" (for street), but
                  no training data with "street", "str.", ...
            - Enhancements:
                - Additional de-normalization of training data
                - Addition of person / companies names to the training data
                - Post-processing of results
        """)


# Start the Gradio server (blocking call).
demo.launch()