Didier committed on
Commit
1aa2346
·
verified ·
1 Parent(s): 53950e0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import string
3
+ import re
4
+
5
+ from transformers import pipeline
6
+
7
# Model checkpoint identifier handed to the pipeline below.
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# Token-classification pipeline created once at import time and reused by
# every call to the parsing function.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
11
+
12
+ #
13
+ # Parse a given Canadian postal address
14
+ #
15
# Translation table mapping every ASCII punctuation character to a space.
# Built once at import time instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def replace_punctuation_with_space(text):
    """Replace each ASCII punctuation character in a string with a space.

    Every character listed in ``string.punctuation`` is mapped to exactly one
    space, so the result has the same length as the input. All other
    characters (letters, digits, whitespace, non-ASCII symbols) are kept.

    Args:
        text: The input string.

    Returns:
        A copy of the string with ASCII punctuation replaced by spaces.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)
18
+
19
# Matches a run of one or more whitespace characters (space, tab, newline, ...).
_WHITESPACE_RUN = re.compile(r"\s+")


def replace_multiple_spaces(text):
    """Collapse every run of whitespace in a string into a single space.

    Despite the function name, the pattern matches any whitespace, not only
    the space character: tabs and newlines are replaced as well, including
    a lone tab or newline. Leading and trailing whitespace is reduced to one
    space each; it is not stripped.

    Args:
        text: The input string.

    Returns:
        The string with each run of whitespace replaced by a single space.
    """
    return _WHITESPACE_RUN.sub(" ", text)
29
+
30
def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The text is lower-cased, ASCII punctuation is replaced by spaces and runs
    of whitespace are collapsed before the result is handed to the
    module-level ``token_classifier`` pipeline.

    Args:
        text: The raw postal address; it may span several lines.

    Returns:
        Whatever ``token_classifier`` returns for the normalized text, or an
        empty list when there is nothing to parse (``None``, an empty string,
        or text made only of whitespace and/or punctuation).
    """
    # Nothing to classify: skip normalization and the model call entirely.
    if not text or not text.strip():
        return []

    normalized = replace_multiple_spaces(
        replace_punctuation_with_space(text.lower())
    )

    # Input made only of punctuation collapses to a single space.
    if not normalized.strip():
        return []

    return token_classifier(normalized)
37
+
38
+
39
+ #
40
+ # User interface
41
+ #
42
# Top-level Gradio layout: an input/output interface around
# parse_postal_address_can, followed by a collapsible documentation panel.
with gr.Blocks() as demo:

    #gr.Markdown("""
    # - The provided Canadian postal address will be parsed into its components.
    #""")

    # Both textboxes are created with render=False and then passed to the
    # Interface below. NOTE(review): presumably so the Interface places them
    # in its own layout rather than where they are constructed; confirm
    # against the Gradio documentation.
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter Canadian postal address to parse",
        label="Canadian postal address",
        render=False
    )
    output_text = gr.Textbox(lines=10, render=False)

    # Wire the parsing function between the two textboxes; flagging is
    # turned off.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text,],
        outputs=[output_text,],
        allow_flagging="never"
        #clear_btn=None
    )

    # Collapsed-by-default panel describing the labels, the training data
    # and the known limitations.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - **Labels (address components)**:
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - Approx. 1 million Canadian postal addresses from OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io,
              hence missing un-normalized forms. E.g. "ST" (for street), but
              no training data with "street", "str.", ...
        """)

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()