Spaces:

PraneshJs
/

fakevideodetect

Running

App Files Files Community

PraneshJs committed on Aug 13

Commit

9ce33d8

verified ·

1 Parent(s): c715f39

Update inference_2.py

Browse files

Files changed (1) hide show

inference_2.py +74 -69

inference_2.py CHANGED Viewed

@@ -1,28 +1,24 @@
 import os
 import cv2
 import torch
 import numpy as np
 from onnx2pytorch import ConvertModel
 from models.TMC import ETMC
 from models import image
-import onnx
-# -------------------
-# Load ONNX -> PyTorch model for image modality
-# -------------------
-onnx_model_path = 'checkpoints/efficientnet.onnx'
-onnx_model = onnx.load(onnx_model_path)
-img_model = ConvertModel(onnx_model)
-img_model.eval()
-# -------------------
-# Set random seed for reproducibility
-# -------------------
 torch.manual_seed(42)
-# -------------------
-# Audio model configuration
-# -------------------
 audio_args = {
     'nb_samp': 64600,
     'first_conv': 1024,
@@ -32,14 +28,17 @@ audio_args = {
     'nb_fc_node': 1024,
     'gru_node': 1024,
     'nb_gru_layer': 3,
-    'nb_classes': 2
 }
-# -------------------
 # Load Audio Model
-# -------------------
 def load_audio_model():
-    spec_model = image.RawNet(audio_args)
     ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
     spec_model.load_state_dict(ckpt['spec_encoder'], strict=True)
     spec_model.eval()
@@ -47,76 +46,82 @@ def load_audio_model():
 spec_model = load_audio_model()
-# -------------------
-# Preprocessing Functions
-# -------------------
 def preprocess_img(face):
     face = face / 255.0
     face = cv2.resize(face, (256, 256))
-    face_tensor = torch.unsqueeze(torch.Tensor(face), dim=0)
-    return face_tensor
 def preprocess_audio(audio_file):
-    audio_tensor = torch.unsqueeze(torch.Tensor(audio_file), dim=0)
-    return audio_tensor
 def preprocess_video(input_video, n_frames=3):
-    cap = cv2.VideoCapture(input_video)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    sample = np.linspace(0, total_frames-1, n_frames).astype(int)
     frames = []
-    for i in range(total_frames):
-        success = cap.grab()
-        if i in sample:
-            success, frame = cap.retrieve()
             if not success:
                 continue
             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frame = preprocess_img(frame)
             frames.append(frame)
-    cap.release()
     return frames
-# -------------------
-# Prediction Functions
-# -------------------
 def deepfakes_image_predict(input_image):
     face = preprocess_img(input_image)
-    with torch.no_grad():
-        preds = img_model.forward(face).cpu().numpy().squeeze()
-    if preds[0] > 0.5:
-        score = round(preds[0] * 100, 3)
-        return f"The image is REAL. Confidence score: {score}%"
     else:
-        score = round(preds[1] * 100, 3)
-        return f"The image is FAKE. Confidence score: {score}%"
 def deepfakes_video_predict(input_video):
-    frames = preprocess_video(input_video)
-    real_scores, fake_scores = [], []
-    with torch.no_grad():
-        for frame in frames:
-            preds = img_model.forward(frame).cpu().numpy().squeeze()
-            real_scores.append(preds[0])
-            fake_scores.append(preds[1])
-    real_mean = np.mean(real_scores)
-    fake_mean = np.mean(fake_scores)
     if real_mean > 0.5:
-        return f"The video is REAL. Confidence score: {round(real_mean*100, 3)}%"
-    else:
-        return f"The video is FAKE. Confidence score: {round(fake_mean*100, 3)}%"
-def deepfakes_spec_predict(input_audio):
-    audio_tensor = preprocess_audio(input_audio)
-    with torch.no_grad():
-        preds = spec_model.forward(audio_tensor).cpu().numpy().squeeze()
-    if preds[0] > 0.5:
-        return "The audio is REAL."
     else:
-        return "The audio is FAKE."

 import os
 import cv2
+import onnx
 import torch
 import numpy as np
+from types import SimpleNamespace
 from onnx2pytorch import ConvertModel
 from models.TMC import ETMC
 from models import image
# -----------------------------
# Load ONNX -> PyTorch safely
# -----------------------------
# Read the serialized EfficientNet graph from disk.
onnx_model = onnx.load('checkpoints/efficientnet.onnx')
# ConvertModel has no `strict` keyword (its options are batch_dim,
# experimental, debug and enable_pruning), so passing strict=False raises
# TypeError as soon as this module is imported. Convert with defaults.
pytorch_model = ConvertModel(onnx_model)
# Seed torch's global RNG with a fixed value so runs are repeatable.
torch.manual_seed(42)
+# -----------------------------
+# Audio model arguments
+# -----------------------------
 audio_args = {
     'nb_samp': 64600,
     'first_conv': 1024,
     'nb_fc_node': 1024,
     'gru_node': 1024,
     'nb_gru_layer': 3,
+    'nb_classes': 2,
+    'device': 'cpu'
 }
+audio_args_obj = SimpleNamespace(**audio_args)
+# -----------------------------
 # Load Audio Model
+# -----------------------------
 def load_audio_model():
+    spec_model = image.RawNet(audio_args_obj)
     ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
     spec_model.load_state_dict(ckpt['spec_encoder'], strict=True)
     spec_model.eval()
 spec_model = load_audio_model()
+# -----------------------------
+# Load Image Model
+# -----------------------------
def load_image_model():
    """Load the RGB encoder weights into the converted ONNX model.

    Reads ``checkpoints/model.pth`` onto the CPU, copies its
    ``'rgb_encoder'`` entry into the module-level ``pytorch_model`` with
    strict key matching, and switches that model to evaluation mode.

    Returns:
        The ``pytorch_model`` object itself, now holding the checkpoint
        weights.
    """
    checkpoint = torch.load('checkpoints/model.pth', map_location='cpu')
    encoder_weights = checkpoint['rgb_encoder']
    pytorch_model.load_state_dict(encoder_weights, strict=True)
    pytorch_model.eval()
    return pytorch_model


img_model = load_image_model()
+# -----------------------------
+# Preprocessing functions
+# -----------------------------
 def preprocess_img(face):
     face = face / 255.0
     face = cv2.resize(face, (256, 256))
+    face_pt = torch.unsqueeze(torch.Tensor(face), dim=0)
+    return face_pt
 def preprocess_audio(audio_file):
+    audio_pt = torch.unsqueeze(torch.Tensor(audio_file), dim=0)
+    return audio_pt
 def preprocess_video(input_video, n_frames=3):
+    v_cap = cv2.VideoCapture(input_video)
+    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    sample = np.linspace(0, v_len-1, n_frames).astype(int)
     frames = []
+    for j in range(v_len):
+        success = v_cap.grab()
+        if j in sample:
+            success, frame = v_cap.retrieve()
             if not success:
                 continue
             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frame = preprocess_img(frame)
             frames.append(frame)
+    v_cap.release()
     return frames
+# -----------------------------
+# Inference functions
+# -----------------------------
def deepfakes_spec_predict(input_audio):
    """Classify an audio clip as real or fake with the audio model.

    Args:
        input_audio: Audio samples accepted by ``preprocess_audio``.

    Returns:
        ``"The audio is REAL."`` or ``"The audio is FAKE."``.
    """
    audio = preprocess_audio(input_audio)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        spec_out = spec_model.forward(audio)
    # NOTE(review): exp() suggests the model emits log-probabilities - confirm.
    scores = np.exp(spec_out.cpu().detach().numpy().squeeze())
    # argmax yields a class index, not a probability: index 0 is reported as
    # FAKE and any other index as REAL.
    predicted_class = int(np.argmax(scores))
    if predicted_class != 0:
        return "The audio is REAL."
    return "The audio is FAKE."
 def deepfakes_image_predict(input_image):
     face = preprocess_img(input_image)
+    img_grads = img_model.forward(face).cpu().detach().numpy().squeeze()
+    if img_grads[0] > 0.5:
+        text2 = f"The image is REAL. Confidence: {img_grads[0]*100:.3f}%"
     else:
+        text2 = f"The image is FAKE. Confidence: {img_grads[1]*100:.3f}%"
+    return text2
 def deepfakes_video_predict(input_video):
+    video_frames = preprocess_video(input_video)
+    real_list, fake_list = [], []
+    for face in video_frames:
+        img_grads = img_model.forward(face).cpu().detach().numpy().squeeze()
+        real_list.append(img_grads[0])
+        fake_list.append(img_grads[1])
+    real_mean = np.mean(real_list)
+    fake_mean = np.mean(fake_list)
     if real_mean > 0.5:
+        text2 = f"The video is REAL. Confidence: {real_mean*100:.3f}%"
     else:
+        text2 = f"The video is FAKE. Confidence: {fake_mean*100:.3f}%"
+    return text2