Spaces:

Didier
/

Postal_address_canada_parsing

Running

App Files Files Community

Didier committed on Aug 13, 2024

Commit

1aa2346

verified ·

1 Parent(s): 53950e0

Create app.py

Browse files

Files changed (1) hide show

app.py +78 -0

app.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import gradio as gr
+import string
+import re
+from transformers import pipeline
+# Model identifier handed to the token-classification pipeline below.
+model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"
+# The pipeline is created once at module import and reused by
+# parse_postal_address_can() for every request.
+# NOTE(review): aggregation_strategy="simple" presumably groups sub-word
+# tokens into whole entities -- confirm against the transformers docs.
+token_classifier = pipeline(
+    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
+)
+#
+# Parse a given Canadian postal address
+#
+def replace_punctuation_with_space(text):
+    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
+    return text.translate(translator)
+def replace_multiple_spaces(text):
+    """Replaces multiple contiguous spaces in a string with a single space.
+    Args:
+        text: The input string.
+    Returns:
+        The string with multiple contiguous spaces replaced by a single space.
+    """
+    return re.sub(r'\s+', ' ', text)
+def parse_postal_address_can(text):
+    """Parse the given Canadian address into its components.
+    """
+    text = text.lower()
+    text = replace_punctuation_with_space(text)
+    text = replace_multiple_spaces(text)
+    return token_classifier(text)
+#
+# User interface
+#
+# Page layout: an Interface wired to parse_postal_address_can, followed by a
+# collapsed "Documentation" accordion.
+with gr.Blocks() as demo:
+    #gr.Markdown("""
+    #    - The provided Canadian postal address will be parsed into its components.
+    #""")
+    # Both text boxes are created with render=False and then handed to
+    # gr.Interface below.
+    # NOTE(review): presumably this defers their rendering to the Interface --
+    # confirm against the Gradio docs.
+    input_text = gr.Textbox(
+        lines=5,
+        placeholder="Enter Canadian postal address to parse",
+        label="Canadian postal address",
+        render=False
+    )
+    output_text = gr.Textbox(lines=10, render=False)
+    # The parser's return value is shown in the output text box.
+    gr.Interface(
+        fn=parse_postal_address_can,
+        inputs=[input_text,],
+        outputs=[output_text,],
+        allow_flagging="never"
+        #clear_btn=None
+    )
+    # Reference notes for users; the accordion starts closed (open=False).
+    with gr.Accordion("Documentation", open=False):
+        gr.Markdown("""
+            - **Labels (address components)**:
+                - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
+            - Dataset trained on:
+                - Approx. 1 million Canadian postal addresses from OpenAddresses.io
+            - (Current) Limitations:
+                - no label for person_name / company_name (no data to train on)
+                - trained on **post-normalized** addresses from OpenAddresses.io,
+                  hence missing un-normalized forms. E.g. "ST" (for street), but
+                  no training data with "street", "str.", ...
+        """)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()