File size: 3,083 Bytes
1aa2346 2469e02 1aa2346 d0e0fd8 f5d7472 1aa2346 2469e02 4bab2d8 2469e02 1aa2346 8b56e33 f5d7472 8b56e33 1aa2346 ffcecb5 1aa2346 8b56e33 05f325e 8b56e33 05f325e 8b56e33 1aa2346 8b56e33 1aa2346 84d6cd0 1aa2346 3e8ad7c 1aa2346 8b56e33 1aa2346 2469e02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
import string
import re
import pandas as pd
from transformers import pipeline
# Checkpoint identifier handed to the pipeline as its `model` argument.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# Token-classification pipeline, built once at import time and reused by
# parse_postal_address_can() for every request.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space.

    The characters affected are exactly those listed in
    ``string.punctuation``; every other character is left untouched, so the
    result has the same length as the input.
    """
    space_for_punctuation = {ord(mark): ' ' for mark in string.punctuation}
    return text.translate(space_for_punctuation)
def replace_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Args:
        text: The input string.
    Returns:
        A copy of ``text`` in which each maximal run of whitespace
        characters (spaces, tabs, newlines, ...) has been replaced by a
        single space. Leading and trailing whitespace is collapsed to one
        space, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The address is lower-cased, periods are removed and whitespace runs are
    collapsed before it is handed to ``token_classifier``.

    Args:
        text: The raw address string. ``None``, an empty string, or a string
            made up only of periods and whitespace is treated as "nothing
            to parse".
    Returns:
        A ``pandas.DataFrame`` with one row per entity returned by the
        classifier and the columns ``entity_group``, ``score`` (formatted
        as a string with two decimals), ``word``, ``start`` and ``end``.
        ``start``/``end`` are passed through from the classifier, which is
        given the normalized text rather than the raw input. An empty frame
        with the same columns is returned when there is nothing to parse.
    """
    columns = ['entity_group', 'score', 'word', 'start', 'end']

    # Nothing would survive normalization: skip the model call and still
    # return a frame with the usual columns so the output schema is stable.
    if not text or not text.replace(".", "").strip():
        return pd.DataFrame(columns=columns)

    text = text.lower()
    # Only periods are dropped; the broader replace_punctuation_with_space()
    # helper is not applied here.
    text = text.replace(".", "")
    text = replace_multiple_spaces(text)
    results = token_classifier(text)

    # Format the output as a dataframe
    data = [
        {
            'entity_group': result['entity_group'],
            'score': f"{result['score']:.2f}",
            'word': result['word'],
            'start': result['start'],
            'end': result['end'],
        }
        for result in results
    ]
    return pd.DataFrame(data, columns=columns)
#
# User interface
#
with gr.Blocks() as demo:
    gr.Markdown("""
    ## Canadian postal address parsing - version 0.02
    """)

    # Both components are created with render=False and then handed to
    # gr.Interface below as its input and output.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # One inner list per example; each holds the value for the single input.
    examples = [
        ["405-200 René Lévesque Blvd W, Montreal, Quebec H2Z 1X4"],
        ["1 Sussex Dr, Ottawa, ON K1A 0A1"],
        ["5124 53 St, #205, Yellowknife, Northwest Territories, X1A 1V6"],
    ]

    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text],
        outputs=[output],
        examples=examples
    )

    # Collapsed-by-default notes about labels, training data and limitations.
    # Sub-items are indented so they render as nested lists under their
    # heading bullet.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Labels (address components):
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - 15+ million Canadian postal addresses available at OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io,
              hence missing un-normalized forms. E.g. "ST" (for street), but
              no training data with "street", "str.", ...
        - Enhancements:
            - Additional de-normalization of training data
            - Addition of person / companies names to the training data
            - Post-processing of results
        """)

demo.launch()