"""Gradio demo that parses a Canadian postal address into labelled components.

The text is normalised (lower-cased, punctuation removed, whitespace
collapsed) and then passed to a fine-tuned multilingual BERT
token-classification pipeline.
"""

import re
import string

import gradio as gr
from transformers import pipeline

model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
# Loaded once at import time so every request reuses the same pipeline.
token_classifier = pipeline(
    "token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

# Built once instead of on every call: maps each ASCII punctuation character
# to a single space.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)
# Matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r"\s+")


#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text: str) -> str:
    """Replace every ASCII punctuation character with a single space.

    Args:
        text: The input string.

    Returns:
        A string of the same length in which each character found in
        ``string.punctuation`` has been replaced by a space.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)


def replace_multiple_spaces(text: str) -> str:
    """Collapse each run of whitespace into a single space.

    The pattern is ``\\s+``, so tabs and newlines (e.g. from a multi-line
    text box) are collapsed as well, not only literal spaces. Leading and
    trailing whitespace is reduced to one space, not removed.

    Args:
        text: The input string.

    Returns:
        The string with every whitespace run replaced by one space.
    """
    return _WHITESPACE_RUN.sub(" ", text)


def parse_postal_address_can(text: str) -> list:
    """Parse the given Canadian address into its components.

    The text is lower-cased, stripped of punctuation and whitespace-normalised
    before being handed to the token-classification pipeline.

    Args:
        text: The raw address as typed by the user.

    Returns:
        The output of the token-classification pipeline for the normalised
        text, or an empty list when the input is ``None``, empty, or contains
        nothing but punctuation and whitespace.
    """
    # Guard against a missing value (e.g. a cleared text box).
    if not text:
        return []

    text = text.lower()
    text = replace_punctuation_with_space(text)
    text = replace_multiple_spaces(text)

    # Nothing left to classify: skip the model call entirely.
    if not text.strip():
        return []

    return token_classifier(text)


#
# User interface
#
with gr.Blocks() as demo:

    # gr.Markdown("""
    #     - The provided Canadian postal address will be parsed into its components.
    # """)

    # Components are created with render=False so that gr.Interface places
    # them in its own layout.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False,
    )
    output_text = gr.Textbox(lines=10, render=False)

    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text],
        outputs=[output_text],
        # NOTE(review): newer Gradio releases appear to rename this argument
        # to `flagging_mode` -- confirm against the pinned Gradio version.
        allow_flagging="never",
        # clear_btn=None
    )

    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - **Labels (address components)**:
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - Approx. 1 million Canadian postal addresses from OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io, hence missing un-normalized forms. E.g. "ST" (for street), but no training data with "street", "str.", ...
        """)

if __name__ == "__main__":
    demo.launch()