File size: 1,259 Bytes
2c2081e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbfaf91
2c2081e
66fcc65
 
2c2081e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""Test gen_cmat.

king
In [21]: len(de)
Out[21]: 51

In [22]: len(en)
Out[22]: 51

In [23]: len(" ".join(en))
Out[23]: 11208

In [24]: len(" ".join(de))
Out[24]: 13532

In [25]: %time en_vec = model_s.encode(en)
CPU times: user 22 s, sys: 436 ms, total: 22.4 s
Wall time: 22.4 s

In [26]: %time de_vec = model_s.encode(de)
CPU times: user 22.8 s, sys: 311 ms, total: 23.1 s
Wall time: 23.1 s

en1 = loadparas("data/sternstunden04-en.txt")
de1 = loadparas("data/sternstunden04-de.txt")

len(en1)  # 30

len(" ".join(en1))  # 29718
len(" ".join(de1))  # 31478
"""
from cmat2aset310 import cmat2aset
from aset2pairs import aset2pairs
from st_mlbee.gen_cmat import gen_cmat
from st_mlbee.loadtext import loadparas

# Module-level test data: the -en and -de sternstunden04 texts are loaded
# once at import time, and the matrix returned by gen_cmat for them is shared
# by every test in this module.
paras1, paras2 = (
    loadparas(path)
    for path in (
        "data/sternstunden04-en.txt",
        "data/sternstunden04-de.txt",
    )
)
cmat = gen_cmat(paras1, paras2)


def test_gen_cmat_sternstunden04():
    """Check the shape of the matrix gen_cmat built for sternstunden04."""
    # Note the order: the first axis follows paras2 and the second follows
    # paras1, i.e. the reverse of the argument order passed to gen_cmat.
    expected_shape = (len(paras2), len(paras1))

    assert cmat.shape == expected_shape


def test_aset2pairs():
    """Check the third aligned pair produced from the sternstunden04 matrix."""
    pairs = aset2pairs(paras1, paras2, cmat2aset(cmat))

    # Only the entry at index 2 is inspected.
    third = pairs[2]

    # Both text fields of that entry must mention the same keyword ...
    assert "Marseillaise" in third[0]
    assert "Marseillaise" in third[1]
    # ... and its third field (presumably the alignment score -- confirm
    # against aset2pairs) must exceed the threshold.
    assert third[2] > 0.95