|
import gradio as gr |
|
import string |
|
import re |
|
|
|
from transformers import pipeline |
|
|
|
# Identifier of the fine-tuned model handed to the pipeline below.
model_checkpoint = (
    "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
)

# Token-classification pipeline built once at import time and reused for
# every request. aggregation_strategy="simple" is forwarded to the pipeline
# unchanged.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
|
|
|
|
|
|
|
|
|
# Translation table mapping every character of string.punctuation to a
# space. Built once at import time instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def replace_punctuation_with_space(text: str) -> str:
    """Return *text* with each ASCII punctuation character turned into a space.

    Only the characters listed in ``string.punctuation`` are affected; every
    other character (letters, digits, whitespace, non-ASCII symbols) is kept
    as is. The replacement is one-for-one, so the result has the same length
    as the input.

    Args:
        text: The input string.

    Returns:
        A copy of ``text`` in which punctuation characters are spaces.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)
|
|
|
# Matches one or more consecutive whitespace characters (spaces, tabs,
# newlines, ...). Compiled once so the pattern is not looked up on each call.
_WHITESPACE_RUN = re.compile(r"\s+")


def replace_multiple_spaces(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Despite the function name, this is not limited to space characters: any
    run of whitespace matched by ``\\s+`` (spaces, tabs, newlines, ...) is
    replaced, including a run made of a single non-space whitespace
    character. Leading and trailing whitespace is collapsed to one space,
    not removed.

    Args:
        text: The input string.

    Returns:
        The string with each whitespace run replaced by a single space.
    """
    return _WHITESPACE_RUN.sub(" ", text)
|
|
|
def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The address is normalized before classification: it is lower-cased,
    punctuation is replaced by spaces, and whitespace runs are collapsed to a
    single space. The normalized text is then passed to ``token_classifier``.

    Args:
        text: The raw address as typed by the user.

    Returns:
        Whatever ``token_classifier`` returns for the normalized text, or an
        empty list when there is nothing to classify (``None``, an empty
        string, or text made only of punctuation and whitespace).
    """
    # Nothing was entered: skip normalization and the model call entirely.
    if not text:
        return []

    normalized = replace_multiple_spaces(
        replace_punctuation_with_space(text.lower())
    )

    # Input that held only punctuation/whitespace normalizes to a lone space;
    # there is nothing for the model to label.
    if normalized.isspace():
        return []

    return token_classifier(normalized)
|
|
|
|
|
|
|
|
|
|
|
# Build the demo UI: a text-in / text-out interface around
# parse_postal_address_can, followed by a collapsible documentation panel.
with gr.Blocks() as demo:

    # Both textboxes are created with render=False and then handed to
    # gr.Interface below as its input and output components.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output_text = gr.Textbox(lines=10, render=False)

    # NOTE(review): newer Gradio releases appear to name this option
    # `flagging_mode` instead of `allow_flagging` — confirm against the
    # Gradio version this app is pinned to.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output_text,],
        allow_flagging="never"

    )

    # Documentation panel, collapsed by default (open=False).
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - **Labels (address components)**:
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - Approx. 1 million Canadian postal addresses from OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io,
              hence missing un-normalized forms. E.g. "ST" (for street), but
              no training data with "street", "str.", ...
        """)


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()