File size: 7,615 Bytes
8c1fbe5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import os
import pickle
import re
import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm
from Utility.utils import load_json_from_path
class Visualizer:
    """Draw nearest-neighbour graphs of languages under one of three distance measures.

    The lookups are read once from a cache directory; ``visualize`` then renders
    the closest languages around a chosen language as a spring-layout graph.
    """

    # The labels accepted by ``visualize`` as ``distance_type``.
    PHYSICAL_DISTANCE = "Physical Distance between Language Centroids on the Globe"
    TREE_DISTANCE = "Distance to the Lowest Common Ancestor in the Language Family Tree"
    PHONEME_DISTANCE = "Angular Distance between the Frequencies of Phonemes"

    # Added to the largest candidate distance when at least as many neighbours are
    # requested as exist, so that the strict ``d < thresh`` test still admits all of them.
    _THRESHOLD_MARGIN = 1e-9

    def __init__(self, cache_root="."):
        """Load the distance lookups and the language names from ``cache_root``.

        Args:
            cache_root: Directory containing ``lang_1_to_lang_2_to_tree_dist.json``,
                ``lang_1_to_lang_2_to_map_dist.json``, ``asp_dict.pkl`` and
                ``iso_to_fullname.json``.
        """
        tree_lookup_path = os.path.join(cache_root, "lang_1_to_lang_2_to_tree_dist.json")
        self.tree_dist = load_json_from_path(tree_lookup_path)

        map_lookup_path = os.path.join(cache_root, "lang_1_to_lang_2_to_map_dist.json")
        self.map_dist = load_json_from_path(map_lookup_path)
        self._scale_to_unit_maximum(self.map_dist)

        asp_dict_path = os.path.join(cache_root, "asp_dict.pkl")
        # NOTE(security): unpickling can execute arbitrary code, so cache_root must
        # only ever point at trusted files.
        with open(asp_dict_path, 'rb') as dictfile:
            asp_sim = pickle.load(dictfile)
        self.asp_dist = self._similarities_to_distances(asp_sim)

        self.iso_codes_to_names = load_json_from_path(os.path.join(cache_root, "iso_to_fullname.json"))
        for code in self.iso_codes_to_names:
            # Drop parenthesised remarks from the display names.
            self.iso_codes_to_names[code] = re.sub(r"\(.*?\)", "", self.iso_codes_to_names[code])

        # The figure handed out by the previous visualize() call, if any.
        self._figure = None

    @staticmethod
    def _scale_to_unit_maximum(nested_distances):
        """Divide every value of a two-level dict by the largest value, in place.

        Nothing is changed when the largest value is not positive, which avoids a
        division by zero on empty or all-zero lookups.
        """
        largest = max(
            (value for row in nested_distances.values() for value in row.values()),
            default=0.0,
        )
        if largest <= 0:
            return
        for row in nested_distances.values():
            for key in row:
                row[key] = row[key] / largest

    @staticmethod
    def _similarities_to_distances(asp_sim):
        """Turn similarity rows into ``1 - similarity`` distances, one entry per unordered pair.

        ``asp_sim`` maps each language to a sequence that is indexed by the position
        of the other language in the key order. The measure is symmetric, so a pair
        is stored only under the language that comes first; self-pairs are left out.
        """
        lang_list = list(asp_sim.keys())
        asp_dist = dict()
        for position, lang_1 in enumerate(lang_list):
            row = asp_sim[lang_1]
            asp_dist[lang_1] = {
                lang_2: 1 - row[index]
                for index, lang_2 in enumerate(lang_list)
                if index > position
            }
        return asp_dist

    def _named_distances(self, distance_measure):
        """Return ``(name_1, name_2, distance)`` triples for all pairs whose codes have a name."""
        names = self.iso_codes_to_names
        distances = list()
        for lang_1, row in distance_measure.items():
            if lang_1 not in names:
                continue
            for lang_2, distance in row.items():
                if lang_2 not in names:
                    continue
                distances.append((names[lang_1], names[lang_2], distance))
        return distances

    @staticmethod
    def _normalize_distances(distances):
        """Min-max scale the distance of every triple into the range 0 to 1.

        Raises:
            ValueError: If ``distances`` is empty.
        """
        if not distances:
            raise ValueError("There are no distances between named languages to visualize.")
        min_dist = min(d for _, _, d in distances)
        max_dist = max(d for _, _, d in distances)
        span = max_dist - min_dist
        if span == 0:
            # All distances are identical, there is nothing to scale.
            return [(entity1, entity2, 0.0) for entity1, entity2, _ in distances]
        return [(entity1, entity2, (d - min_dist) / span) for entity1, entity2, d in distances]

    @classmethod
    def _neighbor_threshold(cls, normalized_distances, neighbor, num_neighbors):
        """Return the distance below which a language counts as one of the nearest neighbours.

        The threshold is the distance at rank ``num_neighbors`` (zero-based) among all
        pairs that involve ``neighbor``. If fewer candidates than that exist, a value
        just above the largest candidate distance is returned so that all are kept.

        Raises:
            ValueError: If no pair involves ``neighbor``.
        """
        ranked = sorted(
            d for entity1, entity2, d in normalized_distances
            if entity1 != entity2 and neighbor in (entity1, entity2)
        )
        if not ranked:
            raise ValueError(f"No distances are available for the language {neighbor!r}.")
        # A slider may deliver a float, and a negative count must not index from the end.
        rank = max(int(num_neighbors), 0)
        if rank < len(ranked):
            return ranked[rank]
        return ranked[-1] + cls._THRESHOLD_MARGIN

    def visualize(self, distance_type, neighbor, num_neighbors):
        """Plot the languages that are closest to ``neighbor``.

        Args:
            distance_type: One of ``PHYSICAL_DISTANCE``, ``TREE_DISTANCE`` or
                ``PHONEME_DISTANCE``.
            neighbor: Display name of the language in the centre of the graph.
            num_neighbors: How many nearest languages to show. Languages tied with
                the first excluded one are excluded as well.

        Returns:
            The matplotlib figure containing the graph.

        Raises:
            ValueError: If ``distance_type`` is unknown, if there are no distances
                at all, or if none of them involves ``neighbor``.
        """
        measures = {
            self.TREE_DISTANCE: self.tree_dist,
            self.PHONEME_DISTANCE: self.asp_dist,
            self.PHYSICAL_DISTANCE: self.map_dist,
        }
        if distance_type not in measures:
            raise ValueError(f"Unknown distance type {distance_type!r}, expected one of {list(measures)}.")

        normalized_distances = self._normalize_distances(self._named_distances(measures[distance_type]))
        thresh = self._neighbor_threshold(tqdm(normalized_distances), neighbor, num_neighbors)

        # Release the figure of the previous call, otherwise a long-running app
        # accumulates one open figure per request.
        if self._figure is not None:
            plt.close(self._figure)
        fig = plt.figure(figsize=(12, 12))
        self._figure = fig

        G = nx.Graph()
        # Make sure the selected language has a position even without any neighbours.
        G.add_node(neighbor)

        neighbors = set()
        for entity1, entity2, d in tqdm(normalized_distances):
            if d < thresh and neighbor in (entity1, entity2) and entity1 != entity2:
                neighbors.add(entity1)
                neighbors.add(entity2)
                spring_tension = (thresh - d) * 10  # for vis purposes
                G.add_edge(entity1, entity2, weight=spring_tension, distance=d)

        for entity1, entity2, d in tqdm(normalized_distances):
            if entity1 == entity2 or neighbor in (entity1, entity2):
                # Edges of the selected language exist already, keep their stronger tension.
                continue
            if entity1 in neighbors and entity2 in neighbors:
                spring_tension = thresh - d
                G.add_edge(entity1, entity2, weight=spring_tension, distance=d)

        pos = nx.spring_layout(G, weight="weight")  # Positions for all nodes
        nx.draw_networkx_nodes(G, pos, node_size=1, alpha=0.01)

        # Edges between the neighbours themselves shape the layout, but only the
        # edges of the selected language are drawn.
        edges_connected_to_specific_node = [(u, v) for u, v in G.edges() if neighbor in (u, v)]
        if edges_connected_to_specific_node:
            nx.draw_networkx_edges(G, pos, edgelist=edges_connected_to_specific_node, edge_color='orange', alpha=0.4, width=3)
            # The label is the normalized distance scaled by ten.
            edge_labels = {
                (u, v): round(data['distance'] * 10, 2)
                for u, v, data in G.edges(data=True)
                if neighbor in (u, v)
            }
            nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color="red", alpha=0.4)

        nx.draw_networkx_labels(G, pos, font_size=14, font_family='sans-serif', font_color='green')
        nx.draw_networkx_labels(G, pos, labels={neighbor: neighbor}, font_size=14, font_family='sans-serif', font_color='red')
        plt.title(f'Graph of {distance_type}')
        plt.subplots_adjust(left=0, right=1, top=0.9, bottom=0)
        return fig
if __name__ == '__main__':
vis = Visualizer(cache_root=".")
text_selection = [f"{vis.iso_codes_to_names[iso_code]}" for iso_code in vis.iso_codes_to_names]
iface = gr.Interface(fn=vis.visualize,
inputs=[gr.Dropdown(["Physical Distance between Language Centroids on the Globe",
"Distance to the Lowest Common Ancestor in the Language Family Tree",
"Angular Distance between the Frequencies of Phonemes"],
value='Physical Distance between Language Centroids on the Globe',
label="Select the Type of Distance"),
label="Select the second Language (type on your keyboard to find it quickly)"),
gr.Slider(minimum=0, maximum=100, step=1,
label="How many Nearest Neighbors should be displayed?")
outputs=[gr.Plot(label="", show_label=False, format="png", container=True)],
description="<br><br> This demo allows you to find the nearest neighbors of a language from the ISO 639-3 list according to several distance measurement functions. "
"For more information, check out our paper: and our text-to-speech tool, in which we make use of "
"this technique: <br><br>",