File size: 3,083 Bytes
1aa2346
 
 
2469e02
1aa2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0e0fd8
f5d7472
 
1aa2346
2469e02
 
 
 
 
 
 
4bab2d8
2469e02
 
 
 
 
 
1aa2346
 
 
 
 
 
 
8b56e33
f5d7472
8b56e33
1aa2346
 
 
 
 
 
 
ffcecb5
1aa2346
8b56e33
05f325e
8b56e33
05f325e
8b56e33
 
1aa2346
 
 
8b56e33
 
1aa2346
 
 
 
84d6cd0
1aa2346
 
3e8ad7c
1aa2346
 
 
 
 
8b56e33
 
 
 
1aa2346
 
2469e02
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import string
import re
import pandas as pd

from transformers import pipeline

# Hugging Face checkpoint fine-tuned to label Canadian postal address components.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
# "simple" aggregation merges sub-word tokens into whole-word entity spans,
# so each result covers a complete word with one entity_group label.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text):
    """Return *text* with every ASCII punctuation character replaced by a space.

    Args:
        text: The input string.

    Returns:
        A copy of *text* where each character in ``string.punctuation``
        has become a single space; all other characters are unchanged.
    """
    # Map each punctuation code point to a space; translate() does one C-level pass.
    table = {ord(ch): ' ' for ch in string.punctuation}
    return text.translate(table)

def replace_multiple_spaces(text):
    """Collapse each run of contiguous whitespace in *text* into one space.

    Note: the pattern is ``\\s+``, so tabs and newlines (not only the
    space character) are also collapsed into a single space.

    Args:
        text: The input string.

    Returns:
        A copy of *text* with every maximal whitespace run replaced by
        a single space character.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def parse_postal_address_can(text):
    """Parse a Canadian postal address into its labeled components.

    The text is lower-cased and lightly normalized (dots removed,
    whitespace runs collapsed) before being run through the
    token-classification pipeline.

    Args:
        text: A free-form Canadian postal address.

    Returns:
        pandas.DataFrame with one row per detected entity and columns
        ``entity_group``, ``score``, ``word``, ``start``, ``end``.
        When nothing is detected the frame is empty but still carries
        the same columns, so the UI table renders consistently.
    """
    columns = ['entity_group', 'score', 'word', 'start', 'end']

    # Normalize to match the model's training data: lower-cased, dot-free text.
    # NOTE(review): replace_punctuation_with_space is intentionally NOT applied
    # here (it was tried and disabled) — confirm before re-enabling.
    text = text.lower()
    text = text.replace(".", "")
    text = replace_multiple_spaces(text)
    results = token_classifier(text)

    # One row per entity; score rendered as a 2-decimal string for display.
    data = [
        {
            'entity_group': result['entity_group'],
            'score': f"{result['score']:.2f}",
            'word': result['word'],
            'start': result['start'],
            'end': result['end'],
        }
        for result in results
    ]
    # Passing columns explicitly keeps the header even when data is empty.
    return pd.DataFrame(data, columns=columns)


#
# User interface
#
with gr.Blocks() as demo:

    # App title / version banner.
    gr.Markdown("""
        ## Canadian postal address parsing - version 0.02
    """)

    # render=False: the widgets are declared here but only rendered when
    # handed to the gr.Interface below.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # Clickable sample addresses shown under the input box.
    examples = [
        ["405-200 René Lévesque Blvd W,  Montreal, Quebec H2Z 1X4",],
        ["1 Sussex Dr, Ottawa, ON K1A 0A1",],
        ["5124 53 St, #205, Yellowknife, Northwest Territories, X1A 1V6",]
    ]

    # Wire the parser function to the pre-declared input/output widgets.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output,],
        examples=examples
    )

    # Collapsed help section: label set, training data, known limitations.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - Labels (address components):
                - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
            - Dataset trained on:
                - 15+ million Canadian postal addresses available at OpenAddresses.io
            - (Current) Limitations:
                - no label for person_name / company_name (no data to train on)
                - trained on **post-normalized** addresses from OpenAddresses.io,
                  hence missing un-normalized forms. E.g. "ST" (for street), but
                  no training data with "street", "str.", ...
            - Enhancements:
                - Additional de-normalization of training data
                - Addition of person / companies names to the training data
                - Post-processing of results
        """)


# Start the Gradio server (blocking call).
demo.launch()