# File size: 2,623 Bytes
# 1aa2346 2469e02 1aa2346 2469e02 4bab2d8 2469e02 1aa2346 ffcecb5 1aa2346 2469e02 1aa2346 2469e02 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
import string
import re
import pandas as pd
from transformers import pipeline
# Model id of the checkpoint used to tag the components of a Canadian postal
# address (an uncased multilingual BERT fine-tune, going by its name).
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# The pipeline is built once, at import time, and reused for every request.
# With aggregation_strategy="simple" the results are grouped entities (each
# carrying an 'entity_group' key) rather than one record per word piece.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
#
# Parse a given Canadian postal address
#
# Translation table mapping every character of string.punctuation to a
# space. Built once at import time instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def replace_punctuation_with_space(text):
    """Replace each ASCII punctuation character in ``text`` with a space.

    The replacement is one-for-one, so the result has the same length as the
    input and runs of punctuation become runs of spaces. Only the characters
    in ``string.punctuation`` are affected; any other character is returned
    unchanged.

    Args:
        text: The input string.

    Returns:
        A new string with every ``string.punctuation`` character replaced by
        a single space.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)
def replace_multiple_spaces(text):
    """Squeeze each run of whitespace in ``text`` down to one space.

    Args:
        text: The input string.

    Returns:
        A copy of ``text`` in which every maximal run of whitespace
        characters (spaces, tabs, newlines, ...) is replaced by a single
        space. Whitespace at either end is reduced to one space, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The text is lower-cased, its punctuation is replaced by spaces and runs
    of whitespace are collapsed before it is passed to the token classifier.

    Args:
        text: The address to parse. ``None``, empty and blank input are
            accepted and produce an empty table.

    Returns:
        A pandas DataFrame with one row per entity returned by the
        classifier and the columns ``entity_group``, ``score`` (formatted as
        a string with two decimals), ``word``, ``start`` and ``end``.
        ``start`` and ``end`` are passed through as reported by the
        classifier, which is given the normalized text rather than the raw
        input. The columns are present even when there are no rows.
    """
    columns = ['entity_group', 'score', 'word', 'start', 'end']

    # Nothing to parse: skip the model call and return an empty table that
    # still has its headers.
    if text is None or not text.strip():
        return pd.DataFrame(columns=columns)

    text = text.lower()
    text = replace_punctuation_with_space(text)
    text = replace_multiple_spaces(text)

    # Input made only of punctuation is blank once normalized.
    if not text.strip():
        return pd.DataFrame(columns=columns)

    results = token_classifier(text)

    # Format the output as a dataframe
    rows = [
        {
            'entity_group': result['entity_group'],
            'score': f"{result['score']:.2f}",
            'word': result['word'],
            'start': result['start'],
            'end': result['end'],
        }
        for result in results
    ]
    # Passing the columns explicitly keeps the headers when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
#
# User interface
#
# Page layout: an address text box feeding a results table, followed by a
# collapsible documentation panel.
with gr.Blocks() as demo:

    #gr.Markdown("""
    # - The provided Canadian postal address will be parsed into its components.
    #""")

    # Both components are created with render=False so they are not placed
    # at this point in the layout; they are handed to gr.Interface below.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # Wire the parser to the input box and the output table.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output,]
    )

    # Notes on the labels, the training data and the known limitations,
    # collapsed by default.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - **Labels (address components)**:
          - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
          - Approx. 1 million Canadian postal addresses from OpenAddresses.io
        - (Current) Limitations:
          - no label for person_name / company_name (no data to train on)
          - trained on **post-normalized** addresses from OpenAddresses.io,
            hence missing un-normalized forms. E.g. "ST" (for street), but
            no training data with "street", "str.", ...
        """)

# Start the web server for the app.
demo.launch()