"""Gradio demo: parse Canadian postal addresses with a token classifier."""

import re
import string

import gradio as gr
import pandas as pd
from transformers import pipeline

model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
token_classifier = pipeline(
    "token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple"
)

# Built once at import time instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)
_WHITESPACE_RUN = re.compile(r'\s+')

# Column order of the dataframe returned by parse_postal_address_can().
_RESULT_COLUMNS = ('entity_group', 'score', 'word', 'start', 'end')


#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text: str) -> str:
    """Replace every ASCII punctuation character with a single space.

    Args:
        text: The input string.

    Returns:
        The string with each character of ``string.punctuation`` replaced
        by a space (the length of the string is unchanged).
    """
    return text.translate(_PUNCTUATION_TO_SPACE)


def replace_multiple_spaces(text: str) -> str:
    """Replace each run of whitespace with a single space.

    The pattern is ``\\s+``, so tabs and newlines are matched as well, and
    a single non-space whitespace character is also turned into a space.

    Args:
        text: The input string.

    Returns:
        The string with every whitespace run replaced by one space.
    """
    return _WHITESPACE_RUN.sub(' ', text)


def parse_postal_address_can(text: str) -> pd.DataFrame:
    """Parse the given Canadian address into its components.

    The text is lower-cased, periods are removed and whitespace runs are
    collapsed before it is passed to the token-classification pipeline.

    Args:
        text: The postal address to parse.

    Returns:
        A dataframe with one row per entity returned by the pipeline and
        the columns ``entity_group``, ``score`` (formatted as a string with
        two decimals), ``word``, ``start`` and ``end``. ``start`` / ``end``
        are the values reported by the pipeline, which is called on the
        normalized text, not on the raw input. The dataframe has the same
        columns but no rows when the input is empty or nothing is returned.
    """
    # Empty / whitespace-only (or missing) input: nothing to classify.
    if not text or not text.strip():
        return pd.DataFrame(columns=list(_RESULT_COLUMNS))

    text = text.lower()
    text = text.replace(".", "")
    # NOTE: replace_punctuation_with_space() is not applied here (the call
    # was disabled); only periods are removed from the input.
    text = replace_multiple_spaces(text)

    results = token_classifier(text)

    # Format the output as a dataframe
    data = [
        {
            'entity_group': result['entity_group'],
            'score': f"{result['score']:.2f}",
            'word': result['word'],
            'start': result['start'],
            'end': result['end']
        }
        for result in results
    ]
    # Explicit columns keep the table header even when no entity is found.
    return pd.DataFrame(data, columns=list(_RESULT_COLUMNS))


#
# User interface
#
with gr.Blocks() as demo:

    gr.Markdown("""
    ## Canadian postal address parsing
    - version 0.02
    """)

    # Components are created with render=False so that gr.Interface below
    # is the one that places them in the layout.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    examples = [
        ["405-200 René Lévesque Blvd W, Montreal, Quebec H2Z 1X4",],
        ["1 Sussex Dr, Ottawa, ON K1A 0A1",],
        ["5124 53 St, #205, Yellowknife, Northwest Territories, X1A 1V6",]
    ]

    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output,],
        examples=examples
    )

    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Labels (address components):
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - 15+ million Canadian postal addresses available at OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io,
              hence missing un-normalized forms. E.g. "ST" (for street), but no training data with "street", "str.", ...
        - Enhancements:
            - Additional de-normalization of training data
            - Addition of person / companies names to the training data
            - Post-processing of results
        """)

# Launched at module level, as in the original script.
demo.launch()