# LPX55's picture
# chore: resample-prep
# e174112 verified
import functools
import logging
import os
import random
import shutil
from datetime import datetime

import cv2
# Logging setup: INFO-level records, each prefixed with a timestamp and the
# level name, are written to the file named by log_file.
log_file = "sample_images.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
)
@functools.lru_cache(maxsize=1)
def _load_face_cascade():
    """Load OpenCV's bundled frontal-face Haar cascade, once per process.

    The classifier is parsed from an XML file, so building it again for every
    image is wasted work; the cached instance is reused by every call.
    """
    return cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


def detect_faces(image_path):
    """Report whether at least one frontal face is detected in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the Haar cascade finds one or more faces. False if no face is
        found or if ``cv2.imread`` cannot read the file as an image.
    """
    # Read the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # Unreadable or unsupported file: treat it as "no face" rather than failing.
        return False

    # Detect faces in the image
    faces = _load_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    # Return True if at least one face is detected
    return len(faces) > 0
def _link_or_copy(source_path, target_path):
    """Materialize source_path at target_path, preferring a hard link.

    Returns:
        True if target_path was created by this call, False if it already
        existed and was left untouched.
    """
    try:
        os.link(source_path, target_path)
    except FileExistsError:
        # Produced by an earlier run; keep it so re-running does not crash.
        return False
    except OSError:
        # Hard link not possible (for example, the target is on another
        # filesystem): fall back to a regular copy.
        shutil.copy2(source_path, target_path)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror input_folder into output_folder, keeping only images with a face.

    The directory tree under input_folder is recreated under output_folder.
    Every file with an image extension for which ``detect_faces`` returns True
    is hard-linked (or copied when linking is not possible) into the matching
    output subfolder. Progress is printed once per directory and a summary is
    written to the log.

    Args:
        input_folder: Root directory to walk.
        output_folder: Root of the mirrored tree; created if missing.
        sample_rate: Currently unused. Kept for interface compatibility;
            every image with a detected face is selected.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Initialize counters and start time
    total_files = 0
    sampled_files = 0
    start_time = datetime.now()
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # Walk through the input folder structure
    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # The total counts every file seen, not only images.
        total_files += len(files)

        # Select the images in this directory that contain a face
        sampled_files_this_batch = [
            file
            for file in files
            if file.lower().endswith(image_extensions)
            and detect_faces(os.path.join(root, file))
        ]
        sampled_files += len(sampled_files_this_batch)

        for file in sampled_files_this_batch:
            input_file_path = os.path.join(root, file)
            output_file_path = os.path.join(output_subfolder, file)
            # Log the action
            if _link_or_copy(input_file_path, output_file_path):
                logging.info(f"Sampled and copied {input_file_path} to {output_file_path}")
            else:
                logging.info(f"Skipped {input_file_path}: {output_file_path} already exists")

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
if __name__ == "__main__":
    # Script entry point: build the face-filtered copy of the evaluation set.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder=input_folder, output_folder=output_folder)