File size: 2,623 Bytes
1aa2346
 
 
2469e02
1aa2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2469e02
 
 
 
 
 
 
4bab2d8
2469e02
 
 
 
 
 
1aa2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffcecb5
1aa2346
 
 
 
2469e02
1aa2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2469e02
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import string
import re
import pandas as pd

from transformers import pipeline

# Hugging Face Hub id of the checkpoint used to tag the components of a
# Canadian postal address.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# The pipeline is built once, at import time, and reused for every request.
# With the "simple" aggregation strategy each result describes a group of
# tokens (`entity_group`, `score`, `word`, `start`, `end`).
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text):
    """Return `text` with every ASCII punctuation character turned into a space.

    Each character listed in `string.punctuation` is replaced one-for-one by
    a single space, so the length of the string does not change.

    Args:
        text: The input string.

    Returns:
        A copy of the string with punctuation characters replaced by spaces.
    """
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(punctuation_to_space)

def replace_multiple_spaces(text):
    """Collapse each run of whitespace in a string into a single space.

    Any whitespace character counts (spaces, tabs, newlines, ...), so a lone
    tab or newline is also turned into a space. Leading and trailing
    whitespace is collapsed to one space, not stripped.

    Args:
        text: The input string.

    Returns:
        The string with every whitespace run replaced by a single space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Column order of the frame returned by parse_postal_address_can. Passing it
# explicitly keeps the headers even when there are no rows.
_ADDRESS_COLUMNS = ['entity_group', 'score', 'word', 'start', 'end']


def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The text is lower-cased, punctuation is replaced by spaces and whitespace
    runs are collapsed before it is handed to the token-classification
    pipeline. Blank input (None, empty, or nothing but whitespace and
    punctuation) is not sent to the model.

    Args:
        text: The raw postal address, as typed by the user.

    Returns:
        A pandas DataFrame with one row per entity group reported by the
        pipeline and the columns `entity_group`, `score` (a string with two
        decimals), `word`, `start` and `end`. `start` and `end` are the
        values reported by the pipeline for the normalized text. The frame
        is empty, but still has these columns, when nothing was found.
    """
    normalized = ''
    if text:
        normalized = replace_multiple_spaces(
            replace_punctuation_with_space(text.lower())
        )

    # Nothing left to classify: skip inference and return an empty table
    # that still carries its headers.
    if not normalized.strip():
        return pd.DataFrame(columns=_ADDRESS_COLUMNS)

    rows = [
        {
            'entity_group': entity['entity_group'],
            # Scores are rendered as text so the table shows two decimals.
            'score': f"{entity['score']:.2f}",
            'word': entity['word'],
            'start': entity['start'],
            'end': entity['end'],
        }
        for entity in token_classifier(normalized)
    ]
    return pd.DataFrame(rows, columns=_ADDRESS_COLUMNS)


#
# User interface
#
# Build the Gradio app: an address text box, a results table and a collapsed
# documentation panel, all inside a single Blocks container named `demo`.
with gr.Blocks() as demo:

    # Optional introductory blurb, currently disabled.
    #gr.Markdown("""
    #    - The provided Canadian postal address will be parsed into its components.
    #""")

    # The input and output components are created with render=False and then
    # handed to gr.Interface below.
    # NOTE(review): presumably this lets the Interface place them in its own
    # layout instead of rendering them here -- confirm against the Gradio docs.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # Wire the parser to the components: parse_postal_address_can receives the
    # text box value and the DataFrame it returns fills the output table.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output,]
    )

    # Panel (closed by default) describing the labels, the training data and
    # the known limitations of the model.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - **Labels (address components)**:
                - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
            - Dataset trained on:
                - Approx. 1 million Canadian postal addresses from OpenAddresses.io
            - (Current) Limitations:
                - no label for person_name / company_name (no data to train on)
                - trained on **post-normalized** addresses from OpenAddresses.io,
                  hence missing un-normalized forms. E.g. "ST" (for street), but
                  no training data with "street", "str.", ...
        """)


# Launch the app. This runs at module level, i.e. whenever the file is
# executed or imported.
demo.launch()