# Base Framework
import torch
import torch.nn as nn

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# For the Pre-Trained Vision Model
import torchvision.models as models

# For Image Pre-Processing
from transformers import ViTImageProcessor
from transformers import set_seed

# For Data Loaders
from torch.utils.data import Dataset, DataLoader

# For GPU
from accelerate import Accelerator

# General Libraries
import os
import PIL.Image
from glob import glob
import gradio as gr

# Constants
SEED = 42
BATCH_SIZE = 1
MODEL_TRANSFORMER = 'google/vit-base-patch16-224'
MODEL = "IMAGENET1K_V1"  # torchvision weight tag for swin_t
CLIP_SIZE = 224
data_path = 'employees'
model_path_1 = 'models/Swin_tiny_celebA_imagenetWTS_loss0232.pt'
model_path_2 = 'models/Swin_tiny_celebA_imagenetWTS_loss0209.pt'

# Processor that resizes and normalises images to the ViT input format
image_processor = ViTImageProcessor.from_pretrained(MODEL_TRANSFORMER)

# Create Dataset - Pictures
class CustomDataset(Dataset):
    def __init__(self, image_paths, image_processor):
        self.image_paths = image_paths  # Store paths instead of images to keep memory low
        self.image_processor = image_processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load lazily and convert to the model's expected pixel format
        image = PIL.Image.open(self.image_paths[idx]).convert("RGB")
        pixel_values = self.image_processor(image, return_tensors='pt')['pixel_values'][0]
        return {'pixel_values': pixel_values}

# Model Class
class SwinTEmbedding(nn.Module):
    def __init__(self, model_name="DEFAULT"):
        super().__init__()
        self.model_name = model_name
        # Load the pre-trained Swin-T model
        self.base_model = models.swin_t(weights=self.model_name)
        # Keep only the feature-extraction backbone (drops pooling and classifier head)
        self.base_model_backbone = list(self.base_model.children())[0]

    def forward(self, x):
        x = self.base_model_backbone(x)    # Feature extraction
        x = torch.flatten(x, start_dim=1)  # Flatten the feature map into one embedding per image
        return x

# Production inference: score the webcam capture against every stored image
def prod_function(reconstructed_model, prod_dl, webcam_img):
    # Initialize accelerator (handles device placement for the model and dataloader)
    accelerator = Accelerator()

    # Set the seed for reproducibility
    set_seed(SEED)

    # Pre-process the webcam capture into a single-image batch
    webcam_img = image_processor(webcam_img, return_tensors='pt')['pixel_values'][0]
    webcam_img = torch.unsqueeze(webcam_img, 0)

    # Similarity measure (higher means more alike)
    criterion = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

    # There is no specific order to remember; we just unpack the objects in the same order we gave them to the prepare method.
    accelerated_model, accelerated_criterion, accelerated_prod_dl = accelerator.prepare(reconstructed_model, criterion, prod_dl)
    # `prepare` does not move plain tensors, so place the webcam batch on the device explicitly
    accelerated_webcam_img = webcam_img.to(accelerator.device)

    # Switch to inference mode
    accelerated_model.eval()

    # Find the embedding of the webcam capture to be evaluated
    with torch.no_grad():
        webcam_emb = torch.flatten(accelerated_model(accelerated_webcam_img), start_dim=1)

    # Score every stored image against the webcam embedding
    prod_predictions = []
    for batch in accelerated_prod_dl:
        with torch.no_grad():
            image_emb = torch.flatten(accelerated_model(batch['pixel_values']), start_dim=1)
        prod_predictions.append(accelerated_criterion(image_emb, webcam_emb))
    return prod_predictions

# Function to find the similarity scores and identify the person
def face_recognition(webcam_img):
    # Read the stored employee images from the directory
    image_paths = glob(os.path.join(data_path, '*.jpg'))

    # Wrap the stored images in a dataloader
    prod_ds = CustomDataset(image_paths=image_paths, image_processor=image_processor)
    prod_dl = DataLoader(prod_ds, batch_size=BATCH_SIZE)

    # Load the fine-tuned embedding model
    recon_model = SwinTEmbedding(model_name=MODEL)
    recon_model.load_state_dict(torch.load(model_path_2, weights_only=True, map_location=torch.device('cpu')))

    # Make predictions
    prediction = prod_function(recon_model, prod_dl, webcam_img)

    # Collect the per-image scores into a single tensor
    prediction = torch.cat(prediction, 0)

    # Map each stored image to its similarity score
    similarity_score = dict(zip(image_paths, prediction.tolist()))

    # Identify the person as the stored image with the highest similarity
    idx = prediction.argmax(-1)
    person_name = os.path.splitext(os.path.basename(image_paths[idx]))[0]

    return person_name, similarity_score

# Function to read the about.md file
def load_about_md():
    with open("about.md", "r") as file:
        return file.read()

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Face Recognition app 2.0")

    # About the App
    with gr.Tab("About the App"):
        gr.Markdown(load_about_md())

    # Face recognition Tab
    with gr.Tab("Face recognition"):
        with gr.Row():
            with gr.Column(scale=1, variant="panel"):
                with gr.Row(height=350, variant="panel"):
                    # Webcam input for image capture
                    webcam_input = gr.Image(sources=["webcam"], type="pil", label="Face Capture")
                with gr.Row(variant="panel"):
                    # Submit the captured image
                    image_button = gr.Button("Submit")
                    # Show the recognised name
                    recognition_output = gr.Textbox(label="Face Recognised as:")
            with gr.Column(scale=1, variant="panel"):
                with gr.Row():
                    # Display the per-image similarity scores
                    similarity_score_output = gr.Textbox(label="Face Similarity Score:")

        # Wire the button to the recognition function
        image_button.click(face_recognition, inputs=webcam_input, outputs=[recognition_output, similarity_score_output])

if __name__ == "__main__":
    demo.launch()
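
# -----------------------------------------------------------------------------
# Optional headless smoke test: a minimal sketch of how the recognition
# pipeline could be exercised without the Gradio UI. It assumes at least one
# employee JPEG exists under `employees/`; uncomment and run in place of
# `demo.launch()` above.
# -----------------------------------------------------------------------------
# sample_path = glob(os.path.join(data_path, '*.jpg'))[0]  # first stored face
# sample_img = PIL.Image.open(sample_path).convert("RGB")
# name, scores = face_recognition(sample_img)
# print("Recognised as:", name)
# print("Similarity scores:", scores)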