File size: 2,778 Bytes

e174112

import logging
import os
import random
import shutil
from datetime import datetime

import cv2

# Route INFO-and-above records to a log file (path is relative to the
# current working directory), one timestamped line per record.
log_file = "sample_images.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Lazily-built Haar cascade shared by every detect_faces() call.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the shared frontal-face Haar cascade, loading it on first use.

    Constructing the classifier loads the cascade XML from disk, so it is
    done once and reused instead of being repeated for every image.
    """
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def detect_faces(image_path):
    """Report whether at least one frontal face is detected in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the Haar cascade detects one or more faces; False if no
        face is detected or the image could not be read.
    """
    # Detection runs on a single-channel image, so read it as grayscale.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        return False

    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    # One or more detections is enough to accept the image.
    return len(faces) > 0

# Extensions (lower-case) treated as candidate images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


def _link_or_copy(src, dst):
    """Place ``src`` at ``dst`` via a hard link, copying if linking fails.

    Returns:
        True when ``dst`` was created, False when it already existed and
        was therefore left untouched.
    """
    try:
        os.link(src, dst)
    except FileExistsError:
        # Left over from a previous run; do not clobber it.
        return False
    except OSError:
        # Hard links cannot span filesystems and are unsupported on some
        # of them; fall back to a regular copy that keeps metadata.
        shutil.copy2(src, dst)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror ``input_folder`` into ``output_folder``, keeping face images.

    The directory tree under ``input_folder`` is recreated under
    ``output_folder``. Every file with an image extension for which
    ``detect_faces`` returns True is hard-linked (or, if linking is not
    possible, copied) into the matching output directory. Destinations
    that already exist are skipped, so the function can be re-run safely.
    Cumulative progress is printed after each directory and a summary is
    written to the log at the end.

    Args:
        input_folder: Root directory to scan recursively.
        output_folder: Root directory that receives the selected images;
            created if it does not exist.
        sample_rate: Currently unused; accepted so that existing callers
            passing it keep working.
    """
    os.makedirs(output_folder, exist_ok=True)
    output_root = os.path.abspath(output_folder)

    # Cumulative counters across all visited directories.
    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, dirs, files in os.walk(input_folder):
        # If the output tree lives inside the input tree, do not walk into
        # it: that would re-process files this run has just produced.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_root
        ]

        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Every file counts toward the total, not just image files.
        total_files += len(files)

        # Run detection on the whole directory before linking anything.
        sampled_files_this_batch = [
            file for file in files
            if file.lower().endswith(_IMAGE_EXTENSIONS)
            and detect_faces(os.path.join(root, file))
        ]
        sampled_files += len(sampled_files_this_batch)

        for file in sampled_files_this_batch:
            input_file_path = os.path.join(root, file)
            output_file_path = os.path.join(output_subfolder, file)
            if _link_or_copy(input_file_path, output_file_path):
                logging.info("Sampled and copied %s to %s",
                             input_file_path, output_file_path)
            else:
                logging.warning("Skipped %s: %s already exists",
                                input_file_path, output_file_path)

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info("Total time taken: %s", total_time)
    logging.info("Sampled %s out of %s files.", sampled_files, total_files)

if __name__ == "__main__":
    # Script entry point: filter the evaluation set into a sibling folder.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder, output_folder)