# Hugging Face Spaces demo: nearest-neighbor visualization of distances between languages.
| import json | |
| import os | |
| import pickle | |
| import re | |
| import gradio as gr | |
| import matplotlib.pyplot as plt | |
| import networkx as nx | |
| from tqdm import tqdm | |
def load_json_from_path(path):
    """Load and return the JSON object stored in the UTF-8 file at *path*."""
    with open(path, "r", encoding="utf8") as f:
        # json.load streams straight from the file object; no need to read()
        # the whole text into a string first.
        return json.load(f)
class Visualizer:
    """Interactive visualizer for pairwise distances between languages.

    Loads three precomputed distance lookups from *cache_root* (language
    family-tree distance, physical distance between language centroids, and
    angular distance between phoneme frequency vectors), min-max normalizes
    each of them to [0, 1], and draws the nearest neighbors of a chosen
    language as a networkx spring-layout graph.
    """

    def __init__(self, cache_root="."):
        """Load and normalize all distance caches found under *cache_root*.

        Populates ``iso_codes_to_names``, ``tree_distances``,
        ``map_distances`` and ``asp_distances``.
        """
        self.iso_codes_to_names = load_json_from_path(os.path.join(cache_root, "iso_to_fullname.json"))
        for code in self.iso_codes_to_names:
            # Strip parenthesized qualifiers from the display name.
            # Raw string: "\(" in a plain string is an invalid escape sequence.
            self.iso_codes_to_names[code] = re.sub(r"\(.*?\)", "", self.iso_codes_to_names[code])

        tree_dist = load_json_from_path(os.path.join(cache_root, "lang_1_to_lang_2_to_tree_dist.json"))
        self.tree_distances = self._min_max_normalize(self._named_pairs(tree_dist))

        map_dist = load_json_from_path(os.path.join(cache_root, "lang_1_to_lang_2_to_map_dist.json"))
        self.map_distances = self._min_max_normalize(self._named_pairs(map_dist))

        # NOTE(security): pickle is only acceptable here because the cache is a
        # trusted local artifact — never load untrusted pickle data.
        with open(os.path.join(cache_root, "asp_dict.pkl"), "rb") as dictfile:
            asp_sim = pickle.load(dictfile)
        lang_list = list(asp_sim.keys())
        asp_dist = dict()
        seen_langs = set()
        for lang_1 in lang_list:
            if lang_1 in seen_langs:
                continue
            seen_langs.add(lang_1)
            asp_dist[lang_1] = dict()
            for index, lang_2 in enumerate(lang_list):
                if lang_2 not in seen_langs:  # the measure is symmetric, so one triangle suffices
                    # asp_sim holds similarities; convert to distances.
                    asp_dist[lang_1][lang_2] = 1 - asp_sim[lang_1][index]
        self.asp_distances = self._min_max_normalize(self._named_pairs(asp_dist))

    def _named_pairs(self, dist_lookup):
        """Flatten a nested ``{lang_1: {lang_2: distance}}`` lookup into a list
        of ``(name_1, name_2, distance)`` triples, skipping self-pairs and ISO
        codes that have no known full name."""
        triples = list()
        for lang_1 in dist_lookup:
            if lang_1 not in self.iso_codes_to_names:
                continue
            for lang_2 in dist_lookup[lang_1]:
                if lang_2 not in self.iso_codes_to_names:
                    continue
                if lang_1 != lang_2:
                    triples.append((self.iso_codes_to_names[lang_1],
                                    self.iso_codes_to_names[lang_2],
                                    dist_lookup[lang_1][lang_2]))
        return triples

    @staticmethod
    def _min_max_normalize(triples):
        """Rescale the distance component of ``(name_1, name_2, d)`` triples
        to [0, 1]. A zero span (all distances equal) maps every distance to
        0.0 instead of raising ZeroDivisionError."""
        min_dist = min(d for _, _, d in triples)
        max_dist = max(d for _, _, d in triples)
        span = (max_dist - min_dist) or 1
        return [(entity1, entity2, (d - min_dist) / span) for entity1, entity2, d in triples]

    def visualize(self, distance_type, neighbor, num_neighbors):
        """Draw *neighbor* and its *num_neighbors* nearest languages.

        Args:
            distance_type: one of the three distance-measure labels asserted below.
            neighbor: full language name whose neighborhood is drawn.
            num_neighbors: how many nearest languages to include.

        Returns:
            The matplotlib figure containing the spring-layout graph.
        """
        plt.close("all")  # drop figures from previous calls instead of leaking one per call
        plt.figure(figsize=(12, 12))
        assert distance_type in ["Physical Distance between Language Centroids on the Globe",
                                 "Distance to the Lowest Common Ancestor in the Language Family Tree",
                                 "Angular Distance between the Frequencies of Phonemes"]
        if distance_type == "Distance to the Lowest Common Ancestor in the Language Family Tree":
            normalized_distances = self.tree_distances
        elif distance_type == "Angular Distance between the Frequencies of Phonemes":
            normalized_distances = self.asp_distances
        elif distance_type == "Physical Distance between Language Centroids on the Globe":
            normalized_distances = self.map_distances
        G = nx.Graph()
        d_dist = list()
        for entity1, entity2, d in tqdm(normalized_distances):
            if neighbor == entity2 or neighbor == entity1:
                d_dist.append(d)
        # Clamp so a slider value beyond the number of available pairs cannot
        # raise an IndexError.
        ranked = sorted(d_dist)
        thresh = ranked[min(num_neighbors, len(ranked) - 1)]
        neighbors = set()
        for entity1, entity2, d in tqdm(normalized_distances):
            if d <= thresh and (neighbor == entity2 or neighbor == entity1) and len(neighbors) < num_neighbors + 1:
                neighbors.add(entity1)
                neighbors.add(entity2)
                spring_tension = (thresh - d) * 100  # exaggerated for vis purposes
                G.add_edge(entity1, entity2, weight=spring_tension)
        neighbors.remove(neighbor)
        for entity1, entity2, d in tqdm(normalized_distances):
            if entity2 in neighbors and entity1 in neighbors:
                spring_tension = thresh - d
                G.add_edge(entity1, entity2, weight=spring_tension)
        pos = nx.spring_layout(G, weight="weight")  # positions for all nodes
        edges = G.edges(data=True)
        nx.draw_networkx_nodes(G, pos, node_size=1, alpha=0.01)
        edges_connected_to_specific_node = [(u, v) for u, v in G.edges() if u == neighbor or v == neighbor]
        nx.draw_networkx_edges(G, pos, edgelist=edges_connected_to_specific_node, edge_color='orange', alpha=0.4, width=3)
        for u, v, d in edges:
            if u == neighbor or v == neighbor:
                # Reverse the *100 exaggeration before labeling the edge.
                nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): round((thresh - (d['weight'] / 100)) * 10, 2)}, font_color="red", alpha=0.4)
        nx.draw_networkx_labels(G, pos, font_size=14, font_family='sans-serif', font_color='green')
        nx.draw_networkx_labels(G, pos, labels={neighbor: neighbor}, font_size=14, font_family='sans-serif', font_color='red')
        plt.title(f'Graph of {distance_type}')
        plt.subplots_adjust(left=0, right=1, top=0.9, bottom=0)
        plt.tight_layout()
        return plt.gcf()
if __name__ == '__main__':
    vis = Visualizer(cache_root="Preprocessing/multilinguality")
    # Every full language name known to the visualizer, offered in the dropdown.
    language_names = list(vis.iso_codes_to_names.values())
    distance_choices = ["Physical Distance between Language Centroids on the Globe",
                        "Distance to the Lowest Common Ancestor in the Language Family Tree",
                        "Angular Distance between the Frequencies of Phonemes"]
    distance_input = gr.Dropdown(distance_choices,
                                 type="value",
                                 value='Physical Distance between Language Centroids on the Globe',
                                 label="Select the Type of Distance")
    language_input = gr.Dropdown(language_names,
                                 type="value",
                                 value="German",
                                 label="Select the second Language (type on your keyboard to find it quickly)")
    neighbor_input = gr.Slider(minimum=0, maximum=100, step=1,
                               value=12,
                               label="How many Nearest Neighbors should be displayed?")
    iface = gr.Interface(fn=vis.visualize,
                         inputs=[distance_input, language_input, neighbor_input],
                         outputs=[gr.Plot(label="", show_label=False, format="png", container=True)],
                         description="<br><br> This demo allows you to find the nearest neighbors of a language from the ISO 639-3 list according to several distance measurement functions. "
                                     "For more information, check out our paper: https://arxiv.org/abs/2406.06403 and our text-to-speech tool, in which we make use of "
                                     "this technique: https://github.com/DigitalPhonetics/IMS-Toucan <br><br>",
                         fill_width=True,
                         allow_flagging="never")
    iface.launch()