# Base Framework
import torch
import torch.nn as nn

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# For the Pre-Trained Vision Model
import torchvision.models as models

# For Image Pre-Processing
from transformers import ViTImageProcessor
from transformers import set_seed

# For Data Loaders
from torch.utils.data import Dataset, DataLoader

# For GPU
from accelerate import Accelerator

# General Libraries
import os
import PIL.Image
from glob import glob
import gradio as gr

# Constants
SEED = 42
BATCH_SIZE = 1
MODEL_TRANSFORMER = 'google/vit-base-patch16-224'
MODEL = "IMAGENET1K_V1"  # torchvision weight tag for swin_t
CLIP_SIZE = 224
data_path = 'employees'
model_path_1 = 'models/Swin_tiny_celebA_imagenetWTS_loss0232.pt'
model_path_2 = 'models/Swin_tiny_celebA_imagenetWTS_loss0209.pt'

# Processor that resizes and normalises images to the ViT input format
image_processor = ViTImageProcessor.from_pretrained(MODEL_TRANSFORMER)

# Create Dataset - Pictures
class CustomDataset(Dataset):
    def __init__(self, image_paths, image_processor):
        self.image_paths = image_paths  # Store paths instead of images to keep memory low
        self.image_processor = image_processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load lazily and convert to the model's expected pixel format
        image = PIL.Image.open(self.image_paths[idx]).convert("RGB")
        pixel_values = self.image_processor(image, return_tensors='pt')['pixel_values'][0]
        return {'pixel_values': pixel_values}

# Model Class
class SwinTEmbedding(nn.Module):
    def __init__(self, model_name="DEFAULT"):
        super().__init__()
        self.model_name = model_name
        # Load the pre-trained Swin-T model
        self.base_model = models.swin_t(weights=self.model_name)
        # Keep only the feature-extraction backbone (drops pooling and classifier head)
        self.base_model_backbone = list(self.base_model.children())[0]

    def forward(self, x):
        x = self.base_model_backbone(x)    # Feature extraction
        x = torch.flatten(x, start_dim=1)  # Flatten the feature map into one embedding per image
        return x

# Production inference: score the webcam capture against every stored image
def prod_function(reconstructed_model, prod_dl, webcam_img):
    # Initialize accelerator (handles device placement for the model and dataloader)
    accelerator = Accelerator()

    # Set the seed for reproducibility
    set_seed(SEED)

    # Pre-process the webcam capture into a single-image batch
    webcam_img = image_processor(webcam_img, return_tensors='pt')['pixel_values'][0]
    webcam_img = torch.unsqueeze(webcam_img, 0)

    # Similarity measure (higher means more alike)
    criterion = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

    # There is no specific order to remember; we just unpack the objects in the same order we gave them to the prepare method.
    accelerated_model, accelerated_criterion, accelerated_prod_dl = accelerator.prepare(reconstructed_model, criterion, prod_dl)
    # `prepare` does not move plain tensors, so place the webcam batch on the device explicitly
    accelerated_webcam_img = webcam_img.to(accelerator.device)

    # Switch to inference mode
    accelerated_model.eval()

    # Find the embedding of the webcam capture to be evaluated
    with torch.no_grad():
        webcam_emb = torch.flatten(accelerated_model(accelerated_webcam_img), start_dim=1)

    # Score every stored image against the webcam embedding
    prod_predictions = []
    for batch in accelerated_prod_dl:
        with torch.no_grad():
            image_emb = torch.flatten(accelerated_model(batch['pixel_values']), start_dim=1)
        prod_predictions.append(accelerated_criterion(image_emb, webcam_emb))
    return prod_predictions

# Function to find the similarity scores and identify the person
def face_recognition(webcam_img):
    # Read the stored employee images from the directory
    image_paths = glob(os.path.join(data_path, '*.jpg'))

    # Wrap the stored images in a dataloader
    prod_ds = CustomDataset(image_paths=image_paths, image_processor=image_processor)
    prod_dl = DataLoader(prod_ds, batch_size=BATCH_SIZE)

    # Load the fine-tuned embedding model
    recon_model = SwinTEmbedding(model_name=MODEL)
    recon_model.load_state_dict(torch.load(model_path_2, weights_only=True, map_location=torch.device('cpu')))

    # Make predictions
    prediction = prod_function(recon_model, prod_dl, webcam_img)

    # Collect the per-image scores into a single tensor
    prediction = torch.cat(prediction, 0)

    # Map each stored image to its similarity score
    similarity_score = dict(zip(image_paths, prediction.tolist()))

    # Identify the person as the stored image with the highest similarity
    idx = prediction.argmax(-1)
    person_name = os.path.splitext(os.path.basename(image_paths[idx]))[0]

    return person_name, similarity_score

# Function to read the about.md file
def load_about_md():
    with open("about.md", "r") as file:
        return file.read()

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Face Recognition app 2.0")

    # About the App
    with gr.Tab("About the App"):
        gr.Markdown(load_about_md())

    # Face recognition Tab
    with gr.Tab("Face recognition"):
        with gr.Row():
            with gr.Column(scale=1, variant="panel"):
                with gr.Row(height=350, variant="panel"):
                    # Webcam input for image capture
                    webcam_input = gr.Image(sources=["webcam"], type="pil", label="Face Capture")
                with gr.Row(variant="panel"):
                    # Submit the captured image
                    image_button = gr.Button("Submit")
                    # Show the recognised name
                    recognition_output = gr.Textbox(label="Face Recognised as:")
            with gr.Column(scale=1, variant="panel"):
                with gr.Row():
                    # Display the per-image similarity scores
                    similarity_score_output = gr.Textbox(label="Face Similarity Score:")

        # Wire the button to the recognition function
        image_button.click(face_recognition, inputs=webcam_input, outputs=[recognition_output, similarity_score_output])

if __name__ == "__main__":
    demo.launch()
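
# -----------------------------------------------------------------------------
# Optional headless smoke test: a minimal sketch of how the recognition
# pipeline could be exercised without the Gradio UI. It assumes at least one
# employee JPEG exists under `employees/`; uncomment and run in place of
# `demo.launch()` above.
# -----------------------------------------------------------------------------
# sample_path = glob(os.path.join(data_path, '*.jpg'))[0]  # first stored face
# sample_img = PIL.Image.open(sample_path).convert("RGB")
# name, scores = face_recognition(sample_img)
# print("Recognised as:", name)
# print("Similarity scores:", scores)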