import subprocess

# Install hunspell and its system dependencies first; the prebuilt pip
# wheels are broken, so the library must be built against the apt packages.
# '-y' keeps apt from blocking on a confirmation prompt.
subprocess.call(['sudo', 'apt', 'install', '-y', 'hunspell', 'hunspell-uk', 'libhunspell-dev'])
subprocess.call(['sudo', 'pip', 'install', 'hunspell'])

# Import hunspell only after it has been installed above
import hunspell

# Main imports
import gradio as gr
import re
import stanza
import spacy
import pandas as pd
def create_settlement_and_country_lists():
    settlement_list = []
    country_list = []
    # Read Ukrainian settlement names from CSV file
    df_settlements = pd.read_csv("assets/locations/ukrainian_settlement_mames.csv", encoding="utf-8")
    ukrainian_settlements = df_settlements["Назва об'єкта українською мовою"].values.tolist()
    settlement_list.extend(ukrainian_settlements)
    # Read European settlement names from CSV file
    df_eu_settlements = pd.read_csv("assets/locations/european_cities.csv", encoding="utf-8")
    european_settlements = df_eu_settlements["City"].values.tolist()
    settlement_list.extend(european_settlements)
    # Convert settlement list to lowercase
    settlement_list = [word.lower() for word in settlement_list]
    # Read country names from text file
    with open("assets/locations/countries.txt", "r", encoding="utf-8") as country_file:
        country_list = [line.strip().lower() for line in country_file]
    return settlement_list, country_list
# Call the function to create settlement and country lists
settlement_list, country_list = create_settlement_and_country_lists()
spellchecker = hunspell.HunSpell('assets/dictionaries/uk_UA.dic', 'assets/dictionaries/uk_UA.aff')
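
# The Hunspell checker is initialized above but not yet wired into the
# pipeline below. A minimal sketch of how it could normalize noisy input,
# using only the documented pyhunspell calls .spell() and .suggest(); the
# helper name is illustrative:
def suggest_correction(word):
    # Return the word itself if it is spelled correctly, otherwise the
    # first Hunspell suggestion (or the original word when none exist)
    if spellchecker.spell(word):
        return word
    suggestions = spellchecker.suggest(word)
    return suggestions[0] if suggestions else word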
# Initialize Stanza NLP
stanza.download('uk')
# The lemma processor is required because get_base_form_stanza reads token.lemma
nlp_stanza = stanza.Pipeline('uk', processors='tokenize,pos,lemma,ner')

# Load the spaCy NER model
nlp_spacy = spacy.load("uk_ner_web_trf_base")
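
# Illustrative sanity check, not called by the app: list the LOC entities
# each model finds for a sample sentence. Exact spans depend on the model
# versions that happen to be installed.
def demo_loc_entities(text):
    stanza_locs = [ent.text for ent in nlp_stanza(text).ents if ent.type == 'LOC']
    spacy_locs = [ent.text for ent in nlp_spacy(text).ents if ent.label_ == 'LOC']
    print("Stanza LOC:", stanza_locs)
    print("spaCy LOC:", spacy_locs)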
def process_text_with_stanza(text):
    doc = nlp_stanza(text)
    return format_output(process_text(doc))

def process_text_with_spacy(text):
    doc = nlp_spacy(text)
    return format_output(process_text_spacy(doc))
def format_output(matches):
    formatted_matches = []
    for match in matches:
        location_type = match[0]
        entity = match[1]
        formatted_matches.append(f"{location_type}: {entity}")
    return "\n".join(formatted_matches) if formatted_matches else notify_no_result()

def notify_no_result():
    return "No locations found in the text."
def process_text(doc):
    # Preposition heuristics: a capitalized word after "з/із/від/..." is a
    # starting point, one after "до/в/у/..." is a destination.
    starting_point_patterns = [r"\b(?:з|із|із-за|від|от|од)\s+(?P<loc>[А-ЯІЇЄҐ][\w'-]+)"]
    destination_patterns = [r"\b(?:до|в|у|ув|к)\s+(?P<loc>[А-ЯІЇЄҐ][\w'-]+)"]
    starting_point_matches = []
    for pattern in starting_point_patterns:
        starting_point_matches.extend(m.group('loc') for m in re.finditer(pattern, doc.text))
    destination_matches = []
    for pattern in destination_patterns:
        destination_matches.extend(m.group('loc') for m in re.finditer(pattern, doc.text))
    loc_entities = [ent.text for ent in doc.ents if ent.type == 'LOC']
    # Exactly two LOC entities and no preposition cues: assume "from A to B" order
    if len(loc_entities) == 2 and not starting_point_matches and not destination_matches:
        starting_point = loc_entities[0]
        destination = loc_entities[1]
        return [
            ('Starting Point', get_base_form_regex(starting_point, settlement_list, country_list, doc)[1].capitalize()),
            ('Destination', get_base_form_regex(destination, settlement_list, country_list, doc)[1].capitalize())
        ]
    # A single LOC entity with no cues: its role cannot be determined
    if len(loc_entities) == 1 and not starting_point_matches and not destination_matches:
        return [('Unknown', get_base_form_regex(loc_entities[0], settlement_list, country_list, doc)[1].capitalize())]
    treated_matches = [
        ('Starting Point', get_base_form_regex(word, settlement_list, country_list, doc))
        for word in starting_point_matches
    ] + [
        ('Destination', get_base_form_regex(word, settlement_list, country_list, doc))
        for word in destination_matches
    ]
    formatted_matches = []
    for location_type, lemma_results in treated_matches:
        # lemma_results is the (base_form_regex, base_form) pair returned below
        formatted_lemma = lemma_results[1].capitalize().strip('\n')
        formatted_matches.append((location_type, formatted_lemma))
    return formatted_matches
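
# Quick illustration of the starting-point regex above; re.finditer yields
# one match per "preposition + capitalized word" pair:
#   >>> [m.group('loc') for m in re.finditer(
#   ...     r"\b(?:з|із|із-за|від|от|од)\s+(?P<loc>[А-ЯІЇЄҐ][\w'-]+)",
#   ...     "Автобус з Києва до Житомира")]
#   ['Києва']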
def process_text_spacy(doc):
    # Same heuristics as process_text, but both word orders are checked
    starting_point_patterns = [
        r"\b(?:з|із|із-за|від|от|од)\s+(?P<loc>[А-ЯІЇЄҐ][\w'-]+)",
        r"(?P<loc>[А-ЯІЇЄҐ][\w'-]+)\s+(?:з|із|із-за|від|от|од)\b"
    ]
    destination_patterns = [
        r"\b(?:до|в|у|ув|к)\s+(?P<loc>[А-ЯІЇЄҐ][\w'-]+)",
        r"(?P<loc>[А-ЯІЇЄҐ][\w'-]+)\s+(?:до|в|у|ув|к)\b"
    ]
    starting_point_matches = []
    for pattern in starting_point_patterns:
        starting_point_matches.extend(m.group('loc') for m in re.finditer(pattern, doc.text))
    destination_matches = []
    for pattern in destination_patterns:
        destination_matches.extend(m.group('loc') for m in re.finditer(pattern, doc.text))
    loc_entities = [ent.text for ent in doc.ents if ent.label_ == 'LOC']
    if len(loc_entities) == 2 and not starting_point_matches and not destination_matches:
        starting_point = loc_entities[0]
        destination = loc_entities[1]
        return [
            ('Starting Point', get_base_form_spacy(starting_point, settlement_list, country_list, doc).capitalize()),
            ('Destination', get_base_form_spacy(destination, settlement_list, country_list, doc).capitalize())
        ]
    if len(loc_entities) == 1 and not starting_point_matches and not destination_matches:
        return [('Unknown', get_base_form_spacy(loc_entities[0], settlement_list, country_list, doc).capitalize())]
    treated_matches = [
        ('Starting Point', get_base_form_spacy(word, settlement_list, country_list, doc))
        for word in starting_point_matches
    ] + [
        ('Destination', get_base_form_spacy(word, settlement_list, country_list, doc))
        for word in destination_matches
    ]
    formatted_matches = []
    for location_type, base_form in treated_matches:
        # get_base_form_spacy returns a plain string
        formatted_lemma = base_form.capitalize().strip('\n')
        formatted_matches.append((location_type, formatted_lemma))
    return formatted_matches
def get_base_form_stanza(word, settlement_list, country_list, doc):
    # Unknown proper nouns are reduced to their lemma; anything else as-is
    token = None
    base_form = ""
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.text.lower() == word.lower():
                token = wrd
                break
        if token is not None:
            break
    if token is not None:
        if token.upos == 'PROPN' and token.text.lower() not in settlement_list and token.text.lower() not in country_list:
            base_form = token.lemma
        else:
            base_form = token.text
    return base_form

def get_base_form_spacy(word, settlement_list, country_list, doc):
    # spaCy counterpart of get_base_form_stanza (token.pos_ / token.lemma_)
    base_form = ""
    for token in doc:
        if token.text.lower() == word.lower():
            if token.pos_ == 'PROPN' and token.text.lower() not in settlement_list and token.text.lower() not in country_list:
                base_form = token.lemma_
            else:
                base_form = token.text
            break
    return base_form
def get_base_form_regex(word, settlement_list, country_list, doc):
    # Known settlements/countries are kept as listed; everything else is lemmatized
    base_form = ""
    base_form_regex = ""
    if word.lower() in settlement_list or word.lower() in country_list:
        base_form = word.lower()
    else:
        base_form = get_base_form_stanza(word, settlement_list, country_list, doc)
    if base_form:
        base_form_regex = base_form
    return base_form_regex, base_form
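
# Example of the lookup order: for an inflected form such as 'Києва' that is
# not in the lists, the Stanza lemma is used, so the function is expected to
# return something like ('Київ', 'Київ'); for a listed name such as 'варшава'
# it returns the lowercase listed form ('варшава', 'варшава').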
def process_text_with_both(text):
    # gr.Interface expects a single callable, so both pipelines are run here
    # and returned as one value per output box
    return process_text_with_stanza(text), process_text_with_spacy(text)

iface = gr.Interface(
    fn=process_text_with_both,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=[gr.Textbox(label="Stanza"), gr.Textbox(label="spaCy")],
    title="Text Processing Demo",
    description="A demo to process text and extract locations using Stanza and spaCy.",
    examples=[
        ["Автобус з Києва до Житомира"],
        ["Автобус з Києва в Бердичів"],
        ["Поїздка з Варшави до Івано-Франківська"],
    ]
)
iface.launch()