File size: 1,691 Bytes
fa0f216 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import os
def _load_form_author_ids(forms_path):
    """Parse ``forms.txt`` into a mapping from form name to integer author id.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    split on single spaces; the first field is used as the form name and the
    second field as the author id.

    Args:
        forms_path: Path of the ``forms.txt`` file to read.

    Returns:
        A dict mapping each form name to its author id as an ``int``.

    Raises:
        AssertionError: If a form is listed more than once with different
            author ids.
        ValueError: If a data line has fewer than two fields or its second
            field is not an integer.
    """
    form_to_id = {}
    with open(forms_path, 'r') as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            form, author_field, *_ = line.split(" ")
            # The stored values are ints, so the consistency check must compare
            # ints as well; comparing against the raw string could never match.
            assert form not in form_to_id or form_to_id[form] == int(author_field), \
                f"conflicting author ids for form {form}"
            form_to_id[form] = int(author_field)
    return form_to_id


def test_split(iam_path=r"C:\Users\bramv\Documents\Werk\Research\Unimore\datasets\IAM"):
    """Print author counts of the IAM splits and their overlap with the HTG splits.

    For each original split file the set of distinct author ids is computed by
    mapping every listed entry to its form (the entry with its last
    ``-``-separated component removed) and looking the form up in
    ``forms.txt``. For each HTG split file the author id is taken from the
    first comma-separated field of every line. The function prints the number
    of distinct authors per split and, for every HTG split, the size of its
    intersection with each original split.

    Args:
        iam_path: Directory containing the split files, ``forms.txt`` and the
            HTG split files. Defaults to the path the script was written for.

    Raises:
        FileNotFoundError: If one of the expected files is missing.
        KeyError: If a split file references a form absent from ``forms.txt``.
    """
    original_set_names = ["trainset.txt", "validationset1.txt", "validationset2.txt", "testset.txt"]
    original_set_ids = []

    print("ORIGINAL IAM")
    print("---------------------")

    # forms.txt does not depend on the split being processed, so read it once
    # instead of re-parsing it for every split file.
    form_to_id = _load_form_author_ids(os.path.join(iam_path, "forms.txt"))

    for set_name in original_set_names:
        with open(os.path.join(iam_path, set_name), 'r') as f:
            # Drop the last "-"-separated component of each entry to get the
            # form name; blank lines are ignored.
            set_form_ids = ["-".join(line.rstrip().split("-")[:-1]) for line in f if line.strip()]

        set_authors = {form_to_id[form] for form in set_form_ids}
        original_set_ids.append(set_authors)
        print(f"{set_name} count: {len(set_authors)}")

    htg_set_names = ["gan.iam.tr_va.gt.filter27", "gan.iam.test.gt.filter27"]

    print("\n\nHTG IAM")
    print("---------------------")

    for set_name in htg_set_names:
        with open(os.path.join(iam_path, set_name), 'r') as f:
            # The author id is the first comma-separated field of each line.
            set_authors = {int(line.split(",")[0]) for line in f if line.strip()}

        print(f"{set_name} count: {len(set_authors)}")

        # Report how many authors of this HTG split also occur in each of the
        # original splits.
        for name, original_set in zip(original_set_names, original_set_ids):
            intr = set_authors.intersection(original_set)
            print(f"\t intersection with {name}: {len(intr)}")
# Run the split comparison only when this file is executed as a script.
if __name__ == "__main__":
    test_split()
|