ky2k committed on
Commit 479b3e7 · 1 Parent(s): bcbd51f

Upload app.py

Files changed (1)
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
import subprocess

# Install hunspell and its dependencies; the pip wheels are completely broken,
# so build the Python bindings against the system libhunspell instead.
subprocess.call(['sudo', 'apt', 'install', '-y', 'hunspell', 'hunspell-uk', 'libhunspell-dev'])
subprocess.call(['sudo', 'pip', 'install', 'hunspell'])

# Import hunspell (built above against the system library)
import hunspell

# Main imports
import gradio as gr
import re
import stanza
import spacy
import pandas as pd

def create_settlement_and_country_lists():
    settlement_list = []
    country_list = []

    # Read Ukrainian settlement names from CSV file
    df_settlements = pd.read_csv("assets/locations/ukrainian_settlement_mames.csv", encoding="utf-8")
    ukrainian_settlements = df_settlements["Назва об'єкта українською мовою"].values.tolist()
    settlement_list.extend(ukrainian_settlements)

    # Read European settlement names from CSV file
    df_eu_settlements = pd.read_csv("assets/locations/european_cities.csv", encoding="utf-8")
    european_settlements = df_eu_settlements["City"].values.tolist()
    settlement_list.extend(european_settlements)

    # Convert the settlement list to lowercase
    settlement_list = [word.lower() for word in settlement_list]

    # Read country names from the text file, one lowercased name per line
    with open("assets/locations/countries.txt", "r", encoding="utf-8") as country_file:
        country_list = [line.strip().lower() for line in country_file]

    return settlement_list, country_list

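# Data contract for the matchers below: settlement_list mixes lowercased Ukrainian
# and European city names, and country_list holds lowercased country names, so all
# membership checks against them are done on word.lower().
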
# Call the function to create the settlement and country lists
# (both are already lowercased inside the function)
settlement_list, country_list = create_settlement_and_country_lists()

# Hunspell spellchecker over the bundled Ukrainian dictionary
spellchecker = hunspell.HunSpell('assets/dictionaries/uk_UA.dic', 'assets/dictionaries/uk_UA.aff')
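
# The spellchecker is initialized but never consulted below; a minimal usage
# sketch of the pyhunspell API (hypothetical wiring, not part of the original flow):
#
#   if not spellchecker.spell(word):              # spell() is False for unknown words
#       suggestions = spellchecker.suggest(word)  # ranked correction candidates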

# Initialize the Stanza Ukrainian pipeline; the lemma processor is required
# because get_base_form_stanza() reads word.lemma further down.
stanza.download('uk')
nlp_stanza = stanza.Pipeline('uk', processors='tokenize,pos,lemma,ner')

# Load the SpaCy Ukrainian NER model
nlp_spacy = spacy.load("uk_ner_web_trf_base")
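
# For reference: a Stanza call like nlp_stanza(text) returns a Document whose
# .sentences[i].words carry text/upos/lemma annotations and whose .ents hold the
# recognized entity spans used below.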

def process_text_with_stanza(text):
    doc = nlp_stanza(text)
    return format_output(process_text(doc))

def process_text_with_spacy(text):
    doc = nlp_spacy(text)
    return format_output(process_text_spacy(doc))

def format_output(matches):
    formatted_matches = []
    for match in matches:
        location_type = match[0]
        entity = match[1]
        formatted_matches.append(f"{location_type}: {entity}")
    return "\n".join(formatted_matches) if formatted_matches else notify_no_result()

def notify_no_result():
    return "No locations found in the text."

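# Example: format_output([('Starting Point', 'Київ'), ('Destination', 'Житомир')])
# returns "Starting Point: Київ\nDestination: Житомир"; an empty match list falls
# back to notify_no_result().
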
def process_text(doc):
    # Note: the {pos:...}/{ner:...} annotations in the original patterns are not
    # valid `re` syntax (they would be matched literally and never fire); the
    # patterns below match a preposition followed by a word, and entity filtering
    # is handled by the LOC checks underneath.
    starting_point_patterns = [r'\b(?:з|із|із-за|від|от|од)\s+(\w+)']
    destination_patterns = [r'\b(?:до|в|у|ув|к)\s+(\w+)']

    starting_point_matches = []
    for pattern in starting_point_patterns:
        starting_point_matches.extend(re.findall(pattern, doc.text))

    destination_matches = []
    for pattern in destination_patterns:
        destination_matches.extend(re.findall(pattern, doc.text))

    loc_entities = [ent.text for ent in doc.ents if ent.type == 'LOC']
    if len(loc_entities) == 2 and not starting_point_matches and not destination_matches:
        starting_point = loc_entities[0]
        destination = loc_entities[1]
        return [
            ('Starting Point', starting_point, get_base_form_regex(starting_point, settlement_list, country_list, doc)),
            ('Destination', destination, get_base_form_regex(destination, settlement_list, country_list, doc))
        ]

    if len(loc_entities) == 1 and not starting_point_matches and not destination_matches:
        return [('Unknown', loc_entities[0], get_base_form_regex(loc_entities[0], settlement_list, country_list, doc))]

    treated_matches = [
        (word, 'Starting Point', get_base_form_regex(word, settlement_list, country_list, doc))
        for word in starting_point_matches
    ] + [
        (word, 'Destination', get_base_form_regex(word, settlement_list, country_list, doc))
        for word in destination_matches
    ]

    formatted_matches = []
    for word, location_type, lemma_results in treated_matches:
        # get_base_form_regex returns a (base_form_regex, base_form) tuple
        formatted_lemma = lemma_results[1].capitalize().strip('\n')
        formatted_matches.append((location_type, formatted_lemma, word))

    return formatted_matches

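# For an input such as "Автобус з Києва до Житомира" ("Bus from Kyiv to Zhytomyr"),
# the patterns above capture "Києва" as a starting point and "Житомира" as a
# destination, so the function should yield one 'Starting Point' and one
# 'Destination' entry, assuming the lemma lookup resolves the inflected forms.
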
def process_text_spacy(doc):
    # As in process_text(), plain-`re` versions of the patterns, covering both
    # "preposition + word" and "word + preposition" orders. Base forms come from
    # get_base_form_spacy (defined below), since the Stanza helper walks
    # doc.sentences/words, which a spaCy Doc does not expose.
    starting_point_patterns = [
        r'\b(?:з|із|із-за|від|от|од)\s+(\w+)',
        r'(\w+)\s+(?:з|із|із-за|від|от|од)\b'
    ]
    destination_patterns = [
        r'\b(?:до|в|у|ув|к)\s+(\w+)',
        r'(\w+)\s+(?:до|в|у|ув|к)\b'
    ]

    starting_point_matches = []
    for pattern in starting_point_patterns:
        starting_point_matches.extend(re.findall(pattern, doc.text))

    destination_matches = []
    for pattern in destination_patterns:
        destination_matches.extend(re.findall(pattern, doc.text))

    loc_entities = [ent.text for ent in doc.ents if ent.label_ == 'LOC']
    if len(loc_entities) == 2 and not starting_point_matches and not destination_matches:
        starting_point = loc_entities[0]
        destination = loc_entities[1]
        return [
            ('Starting Point', starting_point, get_base_form_spacy(starting_point, settlement_list, country_list, doc)),
            ('Destination', destination, get_base_form_spacy(destination, settlement_list, country_list, doc))
        ]

    if len(loc_entities) == 1 and not starting_point_matches and not destination_matches:
        return [('Unknown', loc_entities[0], get_base_form_spacy(loc_entities[0], settlement_list, country_list, doc))]

    treated_matches = [
        (word, 'Starting Point', get_base_form_spacy(word, settlement_list, country_list, doc))
        for word in starting_point_matches
    ] + [
        (word, 'Destination', get_base_form_spacy(word, settlement_list, country_list, doc))
        for word in destination_matches
    ]

    formatted_matches = []
    for word, location_type, lemma in treated_matches:
        # get_base_form_spacy returns the base form as a plain string
        formatted_lemma = lemma.capitalize().strip('\n')
        formatted_matches.append((location_type, formatted_lemma, word))

    return formatted_matches

def get_base_form_stanza(word, settlement_list, country_list, doc):
    token = None
    base_form = ""
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.text.lower() == word.lower():
                token = wrd
                break
        if token is not None:
            break  # stop at the first matching word across sentences

    if token is not None:
        if token.upos == 'PROPN' and token.text.lower() not in settlement_list and token.text.lower() not in country_list:
            base_form = token.lemma
        else:
            base_form = token.text

    return base_form

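# spaCy counterpart of the helper above, added so the spaCy path never touches the
# Stanza-specific API; a minimal sketch mirroring the same settlement/country
# filtering logic. spaCy tokens expose pos_ and lemma_ directly.
def get_base_form_spacy(word, settlement_list, country_list, doc):
    for token in doc:
        if token.text.lower() == word.lower():
            # Lemmatize unlisted proper nouns; keep known place names verbatim
            if token.pos_ == 'PROPN' and token.text.lower() not in settlement_list \
                    and token.text.lower() not in country_list:
                return token.lemma_
            return token.text
    return ""
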
def get_base_form_regex(word, settlement_list, country_list, doc):
    base_form = ""
    base_form_regex = ""

    # Known settlements/countries are already in base form; anything else goes
    # through the Stanza lemmatizer.
    if word.lower() in settlement_list or word.lower() in country_list:
        base_form = word.lower()
    else:
        base_form = get_base_form_stanza(word, settlement_list, country_list, doc)

    if base_form:
        base_form_regex = base_form

    return base_form_regex, base_form

# gr.Interface expects a single callable, so wrap both pipelines in one function
# that returns one output per model.
def process_text_with_both(text):
    return process_text_with_stanza(text), process_text_with_spacy(text)

iface = gr.Interface(
    fn=process_text_with_both,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=[gr.Textbox(label="Stanza"), gr.Textbox(label="SpaCy")],
    title="Text Processing Demo",
    description="A demo to process text and extract locations using Stanza and SpaCy.",
    examples=[
        ["Автобус з Києва до Житомира"],              # "Bus from Kyiv to Zhytomyr"
        ["Автобус з Києва в Бердичів"],               # "Bus from Kyiv to Berdychiv"
        ["Поїздка з Варшави до Івано-Франківська"],   # "Trip from Warsaw to Ivano-Frankivsk"
    ]
)

iface.launch()