File size: 2,126 Bytes
35b22df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""
=====
Roget
=====

Build a directed graph of 1022 categories and 5075 cross-references as defined
in the 1879 version of Roget's Thesaurus. This example is described in Section
1.2 of

    Donald E. Knuth, "The Stanford GraphBase: A Platform for Combinatorial
    Computing", ACM Press, New York, 1993.
    http://www-cs-faculty.stanford.edu/~knuth/sgb.html

Note that one of the 5075 cross references is a self loop yet it is included in
the graph built here because the standard networkx `DiGraph` class allows self
loops.  (cf. 400pungency:400 401 403 405).

The data file can be found at:

- https://github.com/networkx/networkx/blob/main/examples/graph/roget_dat.txt.gz
"""

import gzip
import re
import sys

import matplotlib.pyplot as plt
import networkx as nx


def roget_graph():
    """Return the thesaurus graph from the roget.dat example in
    the Stanford Graph Base.
    """
    # open file roget_dat.txt.gz
    fh = gzip.open("roget_dat.txt.gz", "r")

    G = nx.DiGraph()

    for line in fh.readlines():
        line = line.decode()
        if line.startswith("*"):  # skip comments
            continue
        if line.startswith(" "):  # this is a continuation line, append
            line = oldline + line
        if line.endswith("\\\n"):  # continuation line, buffer, goto next
            oldline = line.strip("\\\n")
            continue

        (headname, tails) = line.split(":")

        # head
        numfind = re.compile(r"^\d+")  # re to find the number of this word
        head = numfind.findall(headname)[0]  # get the number

        G.add_node(head)

        for tail in tails.split():
            if head == tail:
                print("skipping self loop", head, tail, file=sys.stderr)
            G.add_edge(head, tail)

    return G


G = roget_graph()
print("Loaded roget_dat.txt containing 1022 categories.")
print(G)
UG = G.to_undirected()
print(nx.number_connected_components(UG), "connected components")

options = {
    "node_color": "black",
    "node_size": 1,
    "edge_color": "gray",
    "linewidths": 0,
    "width": 0.1,
}
nx.draw_circular(UG, **options)
plt.show()