File size: 976 Bytes
0841c28
 
7b40c73
c149479
 
 
 
 
 
 
 
 
 
0841c28
911bee9
c149479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
set -ex

mkdir -p cache
cd cache
if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
    git clone https://github.com/acl-org/acl-anthology
else
    cd acl-anthology
    git pull
    cd ..
fi
cd acl-anthology/bin

pip install --no-cache-dir -r ./requirements.txt

python -c '
import json
from anthology import Anthology

anthology = Anthology(importdir="../data")
pops = ["xml_booktitle", "xml_title", "xml_url", "xml_abstract"]
papers = []
for paper in anthology.papers.values():
    p = paper.as_dict()
    if "xml_abstract" in p:
        p["abstract"] = paper.get_abstract(form="latex")
    for popkey in pops:
        if popkey in p:
            p.pop(popkey)
    if "author" in p:
        p["author"] = [a[0].as_dict() for a in p["author"]]
    if "editor" in p:
        p["editor"] = [a[0].as_dict() for a in p["editor"]]
    papers.append(p)

with open("../../aclanthology.json", "wt", encoding="utf8") as fout:
    json.dump(papers, fout, ensure_ascii=False)
'