import gradio as gr
import string
import re

from transformers import pipeline

# Identifier of the fine-tuned model that is handed to the pipeline factory.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# Module-level classifier, built once at import time and reused for every
# request. aggregation_strategy="simple" asks the pipeline to merge tokens
# into entity groups (see the transformers token-classification docs).
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

#
# Parse a given Canadian postal address
#
def replace_punctuation_with_space(text):
    """Return ``text`` with every ASCII punctuation character turned into a space.

    Each character listed in ``string.punctuation`` is replaced one-for-one
    by a single space, so the length of the string is unchanged. All other
    characters are left untouched.

    Args:
        text: The input string.

    Returns:
        A new string with punctuation replaced by spaces.
    """
    # Translation table keyed by code point, the form ``str.translate`` expects.
    space_for_punctuation = {ord(symbol): ' ' for symbol in string.punctuation}
    return text.translate(space_for_punctuation)

def replace_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Any sequence of one or more whitespace characters (spaces, tabs,
    newlines, ...) is replaced by a single space character. This means a
    lone tab or newline also becomes a space. Leading and trailing
    whitespace is collapsed to one space, not stripped.

    Args:
        text: The input string.

    Returns:
        The string with each whitespace run replaced by a single space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def parse_postal_address_can(text):
    """Run the token classifier on a normalized Canadian postal address.

    Before classification the input is lower-cased, its ASCII punctuation
    is replaced by spaces, and whitespace runs are collapsed to one space.

    Args:
        text: The raw address string.

    Returns:
        Whatever ``token_classifier`` returns for the normalized text,
        passed through unchanged.
    """
    normalized = replace_multiple_spaces(
        replace_punctuation_with_space(text.lower())
    )
    return token_classifier(normalized)


#
# User interface
#
# Build the Gradio page: an input/output form wired to
# parse_postal_address_can, followed by a collapsible documentation panel.
with gr.Blocks() as demo:

    #gr.Markdown("""
    #    - The provided Canadian postal address will be parsed into its components.
    #""")

    # Both textboxes are created with render=False and then handed to
    # gr.Interface below.
    # NOTE(review): presumably this lets the Interface place them in its own
    # layout instead of rendering them where they are created; confirm
    # against the Gradio documentation for `render`.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output_text = gr.Textbox(lines=10, render=False)

    # The form calls parse_postal_address_can with the content of input_text
    # and shows the returned value in output_text. Flagging is set to "never".
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output_text,],
        allow_flagging="never"
        #clear_btn=None
    )

    # Accordion titled "Documentation", created with open=False, holding
    # static Markdown notes about the labels, training data and limitations.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - **Labels (address components)**:
                - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
            - Dataset trained on:
                - Approx. 1 million Canadian postal addresses from OpenAddresses.io
            - (Current) Limitations:
                - no label for person_name / company_name (no data to train on)
                - trained on **post-normalized** addresses from OpenAddresses.io,
                  hence missing un-normalized forms. E.g. "ST" (for street), but
                  no training data with "street", "str.", ...
        """)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()