# vatrpp/data/show_dataset.py
import os
import pickle
import random
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np

from data.dataset import get_transform


def summarize_dataset(data: dict):
    """Print the number of authors and images in the 'train' and 'test' splits."""
    print(f"Training authors: {len(data['train'].keys())} \t Testing authors: {len(data['test'].keys())}")
    training_images = sum(len(data['train'][k]) for k in data['train'].keys())
    testing_images = sum(len(data['test'][k]) for k in data['test'].keys())
    print(f"Training images: {training_images} \t Testing images: {testing_images}")


def compare_data(path_a: str, path_b: str):
    """Visually compare the same training authors drawn from two pickled datasets."""
    with open(path_a, 'rb') as f:
        data_a = pickle.load(f)
    summarize_dataset(data_a)

    with open(path_b, 'rb') as f:
        data_b = pickle.load(f)
    summarize_dataset(data_b)

    # Normalise author keys to int so the two datasets can be matched.
    training_a = {int(k): v for k, v in data_a['train'].items()}
    training_b = {int(k): v for k, v in data_b['train'].items()}

    while True:
        author = random.choice(list(training_a.keys()))
        if author in training_b:
            author_images_a = [np.array(im_dict["img"]) for im_dict in training_a[author]]
            author_images_b = [np.array(im_dict["img"]) for im_dict in training_b[author]]
            # Labels are collected but not displayed here.
            labels_a = [str(im_dict["label"]) for im_dict in training_a[author]]
            labels_b = [str(im_dict["label"]) for im_dict in training_b[author]]
            # Show the first 10 images of the author from each dataset; press any key to continue.
            vis_a = np.hstack(author_images_a[:10])
            vis_b = np.hstack(author_images_b[:10])
            cv2.imshow("Author a", vis_a)
            cv2.imshow("Author b", vis_b)
            cv2.waitKey(0)
        else:
            print(f"Author: {author} not found in second dataset")


def show_dataset(path: str, samples: int = 10):
    """Browse the training split of a pickled dataset in OpenCV windows."""
    with open(path, 'rb') as f:
        data = pickle.load(f)
    summarize_dataset(data)
    training = data['train']

    # First, show every image of author '013' one at a time (keys are strings here).
    author = training['013']
    author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in author]
    for img in author_images:
        cv2.imshow('image', img)
        cv2.waitKey(0)

    # Then show the first `samples` images of every author side by side.
    for author in list(training.keys()):
        author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in training[author]]
        labels = [str(im_dict["label"]) for im_dict in training[author]]
        vis = np.hstack(author_images[:samples])
        print(f"Author: {author}")
        cv2.destroyAllWindows()
        cv2.imshow("vis", vis)
        cv2.waitKey(0)
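

# Usage sketch (the path mirrors the commented-out call in __main__ below and
# is an assumption about where the pickle lives relative to the working directory):
#
#     show_dataset("../files/IAM-32.pickle", samples=10)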


def test_transform(path: str):
    """Check that `get_transform` round-trips images: transform, invert, and diff."""
    with open(path, 'rb') as f:
        data = pickle.load(f)
    summarize_dataset(data)
    training = data['train']
    transform = get_transform(grayscale=True)

    for author_id in training.keys():
        author = training[author_id]
        for image_dict in author:
            original_image = image_dict['img'].convert('L')
            transformed_image = transform(original_image).detach().numpy()
            # Invert the normalisation: map [-1, 1] back to [0, 255].
            restored_image = (((transformed_image + 1) / 2) * 255).astype(np.uint8)
            restored_image = np.squeeze(restored_image)
            original_image = np.array(original_image)
            # Highlight the pixels that differ after the round trip.
            wrong_pixels = (original_image != restored_image).astype(np.uint8) * 255
            combined = np.hstack((restored_image, original_image, wrong_pixels))
            cv2.imshow("original", original_image)
            cv2.imshow("restored", restored_image)
            cv2.imshow("combined", combined)
            # Compare the intensity histograms of the original and restored images.
            f, ax = plt.subplots(1, 2)
            ax[0].hist(original_image.flatten())
            ax[1].hist(restored_image.flatten())
            plt.show()
            cv2.waitKey(0)
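

# Minimal sketch of the kind of transform `test_transform` assumes `get_transform`
# returns: the inverse mapping used above, (t + 1) / 2 * 255, only round-trips if
# the transform scales pixels to [-1, 1]. The real implementation lives in
# data/dataset.py and may differ; `_example_transform` is purely illustrative.
def _example_transform(grayscale: bool = True):
    from torchvision import transforms  # local import: only needed for this sketch
    ops = [transforms.Grayscale(num_output_channels=1)] if grayscale else []
    ops += [
        transforms.ToTensor(),                          # PIL [0, 255] -> float tensor in [0, 1]
        transforms.Normalize(mean=(0.5,), std=(0.5,)),  # [0, 1] -> [-1, 1]
    ]
    return transforms.Compose(ops)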


def dump_words():
    """Dump every training image whose label contains a full stop to a debug folder."""
    data_path = "../files/IAM-32.pickle"
    p_mark = 'point'
    p = '.'
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    training = data['train']

    # Recreate the target folder from scratch.
    target_folder = f"../saved_images/debug/{p_mark}"
    if os.path.exists(target_folder):
        shutil.rmtree(target_folder)
    os.makedirs(target_folder)

    count = 0
    for author in list(training.keys()):
        author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in training[author]]
        labels = [str(im_dict["label"]) for im_dict in training[author]]
        for img, label in zip(author_images, labels):
            if p in label:
                cv2.imwrite(os.path.join(target_folder, f"{count}.png"), img)
                count += 1


if __name__ == "__main__":
    test_transform("../files/IAM-32.pickle")
    # show_dataset("../files/IAM-32.pickle")
    # compare_data(r"../files/IAM-32.pickle", r"../files/_IAM-32.pickle")
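
# Note: the relative paths above ("../files/IAM-32.pickle", "../saved_images/...")
# are resolved against the current working directory, so run the script from a
# directory where those paths exist; `python -m data.show_dataset` from the
# repository root is one plausible invocation (an assumption, not documented here).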