import functools
import logging
import os
import random
import shutil
from datetime import datetime

import cv2

# All INFO-and-above records from this script are written to this file.
log_file = "sample_images.log"

logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

@functools.lru_cache(maxsize=1)
def _load_face_cascade():
    """Build the frontal-face Haar cascade once and reuse it on later calls."""
    return cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )


def detect_faces(image_path):
    """Return True when at least one frontal face is detected in the image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        False when ``cv2.imread`` cannot load the file (it returns ``None``)
        or when the cascade reports no detections; True otherwise.
    """
    # The cascade works on intensity only, so load the image as grayscale.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False

    # The classifier is cached: parsing the cascade XML for every image is
    # wasted work when this function is called once per file.
    faces = _load_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )
    return len(faces) > 0


# File-name suffixes (compared case-insensitively) treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


def _link_or_copy(src, dst):
    """Hard-link *src* at *dst*, copying when a link cannot be created.

    Returns False, leaving *dst* untouched, when *dst* already exists;
    True when the link or the copy was made.
    """
    try:
        os.link(src, dst)
    except FileExistsError:
        return False
    except OSError:
        # e.g. the destination is on another filesystem, or the filesystem
        # has no hard-link support.
        shutil.copy2(src, dst)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2, *, detector=None):
    """Mirror *input_folder* into *output_folder*, keeping images with faces.

    The directory tree below *input_folder* is walked; every sub-directory
    is re-created under *output_folder*, and each image file accepted by
    the detector is hard-linked (or copied, if linking fails) to the
    matching location. A progress line is printed after each directory,
    and totals are written to the log at the end.

    Args:
        input_folder: Root directory to scan.
        output_folder: Root directory that receives the selected images;
            created if missing.
        sample_rate: Not used by the current implementation; every image
            accepted by the detector is kept. Retained so existing callers
            keep working.
        detector: Optional callable taking a file path and returning a
            truthy value for images to keep. Defaults to ``detect_faces``.

    Returns:
        None.
    """
    if detector is None:
        detector = detect_faces

    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Every file counts towards the total, not only images.
        total_files += len(files)

        for file in files:
            if not file.lower().endswith(_IMAGE_EXTENSIONS):
                continue

            input_file_path = os.path.join(root, file)
            if not detector(input_file_path):
                continue

            sampled_files += 1
            output_file_path = os.path.join(output_subfolder, file)

            if _link_or_copy(input_file_path, output_file_path):
                logging.info(
                    "Sampled and copied %s to %s",
                    input_file_path, output_file_path,
                )
            else:
                # A previous run already produced this file; keep it.
                logging.warning(
                    "Skipped %s: %s already exists",
                    input_file_path, output_file_path,
                )

        # Running totals after each directory.
        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    end_time = datetime.now()
    total_time = end_time - start_time
    logging.info("Total time taken: %s", total_time)
    logging.info("Sampled %s out of %s files.", sampled_files, total_files)


# Script entry point: filter the evaluation set into a sibling folder.
if __name__ == "__main__":
    input_folder = "EvalSet"
    output_folder = "resampledEvalSet"
    sample_images(input_folder, output_folder)