# Base Framework
import torch
import torch.nn as nn
# For data transformation
from torchvision import transforms
import torchvision.models as models
from torchvision.transforms import v2
# For image preprocessing
import transformers
from transformers import ViTImageProcessor
from transformers import set_seed
# For Data Loaders
import datasets
from torch.utils.data import Dataset, DataLoader
# For GPU
from accelerate import Accelerator, notebook_launcher
# General Libraries
import os
import PIL
from glob import glob
import pandas as pd
import numpy as np
import gradio as gr

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
# Constants
SEED = 42
BATCH_SIZE = 1
MODEL_TRANSFORMER = 'google/vit-base-patch16-224'
MODEL = "IMAGENET1K_V1"
CLIP_SIZE = 224
data_path = 'employees'
#model_path = 'models'
model_path_1 = 'models/Swin_tiny_celebA_imagenetWTS_loss0232.pt'
model_path_2 = 'models/Swin_tiny_celebA_imagenetWTS_loss0209.pt'
# Image processor that resizes and normalises inputs to the 224x224 ViT format
# (attn_implementation/torch_dtype are model-level kwargs with no effect on an image processor)
image_processor = ViTImageProcessor.from_pretrained(MODEL_TRANSFORMER)
# Create Dataset - Pictures
class CustomDataset(Dataset):
    def __init__(self, image_paths, image_processor):
        self.image_paths = image_paths  # Store paths instead of images
        self.image_processor = image_processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load the image lazily and preprocess it into a pixel-value tensor
        image_path = self.image_paths[idx]
        image = PIL.Image.open(image_path).convert("RGB")
        pixel_values = self.image_processor(image, return_tensors='pt')['pixel_values'][0]
        return {'pixel_values': pixel_values}
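# A minimal sanity check (a sketch, not part of the original; the path is
# hypothetical): each item is a dict whose 'pixel_values' entry is one
# preprocessed image of shape (3, 224, 224).
#
#   ds = CustomDataset(image_paths=['employees/alice.jpg'], image_processor=image_processor)
#   print(ds[0]['pixel_values'].shape)  # torch.Size([3, 224, 224])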
# Model Class
class SwinTEmbedding(nn.Module):
    def __init__(self, model_name="DEFAULT"):
        super().__init__()
        self.model_name = model_name
        # Load the pre-trained model
        self.base_model = models.swin_t(weights=self.model_name)
        # Drop the classifier head by keeping only the feature backbone
        self.base_model_backbone = list(self.base_model.children())[0]

    def forward(self, x):
        x = self.base_model_backbone(x)    # Feature extraction
        x = torch.flatten(x, start_dim=1)  # Flatten output to 1D embeddings
        return x
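# Quick shape check (a sketch, not part of the original): for a 224x224 input,
# torchvision's swin_t backbone emits a channels-last (1, 7, 7, 768) feature
# map, so the flattened embedding is 7 * 7 * 768 = 37632-dimensional.
#
#   model = SwinTEmbedding(model_name="IMAGENET1K_V1")
#   emb = model(torch.rand(1, 3, 224, 224))
#   print(emb.shape)  # torch.Size([1, 37632])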
# Inference function
def prod_function(reconstructed_model, prod_dl, webcam_img):
    # Initialize accelerator
    accelerator = Accelerator()
    # The seed needs to be set before we instantiate the model, as it determines the random head.
    set_seed(SEED)
    # Preprocess the webcam capture into a (1, 3, 224, 224) pixel-value tensor
    webcam_img = image_processor(webcam_img, return_tensors='pt')['pixel_values'][0]
    webcam_img = torch.unsqueeze(webcam_img, 0)
    # Cosine similarity as the scoring function
    criterion = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    # There is no specific order to remember; we just unpack the objects in the same order we gave them to prepare().
    accelerated_model, accelerated_criterion, accelerated_prod_dl, accelerated_webcam_img = accelerator.prepare(
        reconstructed_model, criterion, prod_dl, webcam_img
    )
    accelerated_model.eval()
    # Embedding of the webcam capture to be evaluated
    with torch.no_grad():
        webcam_emb = torch.flatten(accelerated_model(accelerated_webcam_img), start_dim=1)
    # Similarity of every stored image against the webcam embedding
    prod_predictions = []
    for batch in accelerated_prod_dl:
        with torch.no_grad():
            image_emb = torch.flatten(accelerated_model(batch['pixel_values']), start_dim=1)
            prod_predictions.append(accelerated_criterion(image_emb, webcam_emb))
    return prod_predictions
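# Note (follows from BATCH_SIZE = 1 above): prod_predictions is a list with one
# cosine-similarity tensor of shape (1,) per stored image; concatenating them
# gives a single (num_images,) score vector with values in [-1, 1]:
#
#   scores = torch.cat(prod_predictions, 0)  # higher = more similar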
# Function to find the similarity score
def face_recognition(webcam_img):
    # Read the stored employee images from the data directory
    image_paths = glob(os.path.join(data_path, '*.jpg'))
    # Wrap the images in a DataLoader
    prod_ds = CustomDataset(image_paths=image_paths, image_processor=image_processor)
    prod_dl = DataLoader(prod_ds, batch_size=BATCH_SIZE)
    # Load the fine-tuned model weights
    recon_model = SwinTEmbedding(model_name=MODEL)
    recon_model.load_state_dict(torch.load(model_path_2, weights_only=True, map_location=torch.device('cpu')))
    # Score the webcam capture against every stored image
    prediction = prod_function(recon_model, prod_dl, webcam_img)
    prediction = torch.cat(prediction, 0)
    # Map each image path to its similarity score
    similarity_score = dict(zip(image_paths, prediction.tolist()))
    # Identify the person as the best-matching image's file stem
    idx = prediction.argmax(-1).item()
    person_name = os.path.splitext(os.path.basename(image_paths[idx]))[0]
    return person_name, similarity_score
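# Example usage outside Gradio (a sketch, not part of the original; 'test.jpg'
# is a hypothetical file):
#
#   img = PIL.Image.open('test.jpg').convert("RGB")
#   name, scores = face_recognition(img)
#   print(name)    # file stem of the best-matching employee photo
#   print(scores)  # {image_path: cosine similarity, ...}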
# Function to read the about.md file
def load_about_md():
    with open("about.md", "r") as file:
        about_content = file.read()
    return about_content
with gr.Blocks() as demo:
    gr.Markdown("# Face Recognition app 2.0")
    # About the App
    with gr.Tab("About the App"):
        gr.Markdown(load_about_md())
    # Face recognition Tab
    with gr.Tab("Face recognition"):
        with gr.Row():
            with gr.Column(scale=0.9, variant="panel"):
                with gr.Row(height=350, variant="panel"):
                    # Webcam input for image capture
                    webcam_input = gr.Image(sources=["webcam"], type="pil", label="Face Capture")
                with gr.Row(variant="panel"):
                    # Submit the captured image
                    image_button = gr.Button("Submit")
                    # Display the recognised person's name
                    recognition_output = gr.Textbox(label="Face Recognised as:")
            with gr.Column(scale=1, variant="panel"):
                with gr.Row():
                    # Display the per-image similarity scores
                    similarity_score_output = gr.Textbox(label="Face Similarity Score:")
    # Wire the button to the recognition function
    image_button.click(face_recognition, inputs=webcam_input, outputs=[recognition_output, similarity_score_output])

if __name__ == "__main__":
    demo.launch()