Commit 979a0f3
Parent(s): ab2d783
demo v1

Files changed:
- .gitignore +1 -0
- app.py +137 -0
- requirements.txt +6 -0
- src/__pycache__/islr_model.cpython-39.pyc +0 -0
- src/__pycache__/keypoints_utils.cpython-39.pyc +0 -0
- src/__pycache__/predict.cpython-39.pyc +0 -0
- src/islr/__init__.py +1 -0
- src/islr/__pycache__/__init__.cpython-39.pyc +0 -0
- src/islr/__pycache__/islr_model.cpython-39.pyc +0 -0
- src/islr/islr_model.py +13 -0
- src/islr/save_dummy_model.py +12 -0
- src/pose/__init__.py +1 -0
- src/pose/__pycache__/__init__.cpython-39.pyc +0 -0
- src/pose/__pycache__/keypoints_utils.cpython-39.pyc +0 -0
- src/pose/keypoints_utils.py +15 -0
- src/predict.py +47 -0
- src/simple_demo.py +24 -0
- videos/wlasl/book.mp4 +0 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+Demo/flagged/
app.py
ADDED
@@ -0,0 +1,137 @@
+import gradio as gr
+import torch
+from src.predict import predict_from_video
+from src.islr.islr_model import DummyISLRModel
+
+
+#device = 'cpu'
+#device = 'cuda'
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+# Simulated dictionary with model paths
+dataset_models = {
+    "PERU": {"path": "models/demo_model.pt", "num_classes": 100},
+    "WLASL": {"path": "models/demo_model.pt", "num_classes": 100},
+}
+
+# Dictionary of example video paths and labels per dataset
+dataset_examples = {
+    "PERU": [
+        {"label": "📘 **Glosa: `libro`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "🏠 **Glosa: `casa`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "📘 **Glosa: `libro2`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "🏠 **Glosa: `casa2`**", "path": "videos/wlasl/book.mp4"},
+    ],
+    "WLASL": [
+        {"label": "📙 **Glosa: `read`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "🏫 **Glosa: `school`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "📙 **Glosa: `read2`**", "path": "videos/wlasl/book.mp4"},
+        {"label": "🏫 **Glosa: `school2`**", "path": "videos/wlasl/book.mp4"},
+    ]
+}
+
+# === Load the model for the selected dataset ===
+def load_model_and_examples(dataset):
+    model_path = dataset_models.get(dataset)['path']
+    num_classes = dataset_models.get(dataset)['num_classes']
+    model = DummyISLRModel(num_classes=num_classes)
+    model.load_state_dict(torch.load(model_path, map_location=device))
+    model.eval()
+    print(f"Model {dataset} Loaded!")
+    examples = dataset_examples.get(dataset, [{"label": "", "path": ""}, {"label": "", "path": ""}])
+    return (
+        model,
+        gr.update(visible=True),
+        gr.update(value=examples[0]["path"]),
+        examples[0]["path"],
+        gr.update(value=examples[0]["label"]),
+        gr.update(value=examples[1]["path"]),
+        examples[1]["path"],
+        gr.update(value=examples[1]["label"]),
+        gr.update(value=examples[2]["path"]),
+        examples[2]["path"],
+        gr.update(value=examples[2]["label"]),
+        gr.update(value=examples[3]["path"]),
+        examples[3]["path"],
+        gr.update(value=examples[3]["label"]),
+        gr.update(interactive=True)  # enable the classify button
+    )
+
+# === Use the model stored in the State ===
+def classify_video_with_model(video, model):
+    top1, top5_df = predict_from_video(video, model=model)  # make sure the model is passed through to `predict_from_video`
+    return f"Top-1: {top1}", top5_df
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 ISLR Demo con Mediapipe y 100 Clases")
+    gr.Markdown("Sube un video o usa la webcam. El modelo clasificará la seña y mostrará las 5 clases más probables.")
+
+    # === Dataset selector
+    gr.Markdown("## 📁 Filtrar por Language")
+    dataset_selector = gr.Dropdown(choices=list(dataset_examples.keys()), value=None, label="Selecciona el lenguaje")
+
+
+    # === Model state ===
+    current_model = gr.State()
+    video_path_1 = gr.State()
+    video_path_2 = gr.State()
+    video_path_3 = gr.State()
+    video_path_4 = gr.State()
+
+    # === Video input + prediction output
+    with gr.Row():
+        video_input = gr.Video(sources=["upload", "webcam"], label="🎥 Video de entrada", width=300, height=400)
+        with gr.Column():
+            output_text = gr.Text(label="Predicción Top-1")
+            output_table = gr.Label(num_top_classes=5)
+            button_classify = gr.Button("🔍 Clasificar", interactive=False)
+
+    button_classify.click(
+        fn=classify_video_with_model,
+        inputs=[video_input, current_model],
+        outputs=[output_text, output_table]
+    )
+
+
+
+
+    # === Dynamic examples container
+    examples_output = gr.Column(visible=True)
+
+    with examples_output:
+        with gr.Row():
+            with gr.Column(scale=1, min_width=100):
+                m1 = gr.Markdown("📘 **Glosa: **")
+                v1 = gr.Video(interactive=False, width=160, height=120)
+                b1 = gr.Button("Usar", scale=0)
+            with gr.Column(scale=1, min_width=100):
+                m2 = gr.Markdown("🏠 **Glosa: **")
+                v2 = gr.Video(interactive=False, width=160, height=120)
+                b2 = gr.Button("Usar", scale=0)
+            with gr.Column(scale=1, min_width=100):
+                m3 = gr.Markdown("🏠 **Glosa: **")
+                v3 = gr.Video(interactive=False, width=160, height=120)
+                b3 = gr.Button("Usar", scale=0)
+            with gr.Column(scale=1, min_width=100):
+                m4 = gr.Markdown("🏠 **Glosa: **")
+                v4 = gr.Video(interactive=False, width=160, height=120)
+                b4 = gr.Button("Usar", scale=0)
+
+    b1.click(fn=lambda path: path, inputs=video_path_1, outputs=video_input)
+    b2.click(fn=lambda path: path, inputs=video_path_2, outputs=video_input)
+    b3.click(fn=lambda path: path, inputs=video_path_3, outputs=video_input)
+    b4.click(fn=lambda path: path, inputs=video_path_4, outputs=video_input)
+
+    gr.Markdown("## 📁 Ejemplos de videos")
+    # === When the dataset changes, load the model and its examples
+    dataset_selector.change(
+        fn=load_model_and_examples,
+        inputs=dataset_selector,
+        outputs=[current_model, examples_output, v1, video_path_1, m1, v2, video_path_2, m2, v3, video_path_3, m3, v4, video_path_4, m4,
+                 button_classify
+        ]
+    )
+
+if __name__ == "__main__":
+    demo.launch(server_port=8080)
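For reference, a minimal sketch of what the wiring above amounts to, run outside of Gradio; it assumes models/demo_model.pt and the committed example clip exist on disk, and only uses functions introduced in this commit:

# Sketch: load the demo checkpoint and classify one example clip directly.
import torch
from src.islr.islr_model import DummyISLRModel
from src.predict import predict_from_video

model = DummyISLRModel(num_classes=100)
model.load_state_dict(torch.load("models/demo_model.pt", map_location="cpu"))
model.eval()

top1, confidences = predict_from_video("videos/wlasl/book.mp4", model=model)
print(top1)          # e.g. "Clase 42"
print(confidences)   # dict of the 5 most probable labels, as consumed by gr.Label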
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+gradio
+torch
+mediapipe
+opencv-python
+numpy
+pandas
src/__pycache__/islr_model.cpython-39.pyc
ADDED
Binary file (807 Bytes).
src/__pycache__/keypoints_utils.cpython-39.pyc
ADDED
Binary file (1.23 kB).
src/__pycache__/predict.cpython-39.pyc
ADDED
Binary file (1.75 kB).
src/islr/__init__.py
ADDED
@@ -0,0 +1 @@
+from . import *
src/islr/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (181 Bytes).
src/islr/__pycache__/islr_model.cpython-39.pyc
ADDED
Binary file (808 Bytes).
src/islr/islr_model.py
ADDED
@@ -0,0 +1,13 @@
+import torch.nn as nn
+
+class DummyISLRModel(nn.Module):
+    def __init__(self, input_dim=225, num_classes=100):
+        super().__init__()
+        self.fc = nn.Sequential(
+            nn.Linear(input_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, num_classes)
+        )
+
+    def forward(self, x):
+        return self.fc(x)
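A quick shape check for the dummy model; the 225-dimensional input matches the 33 pose + 21 left-hand + 21 right-hand landmarks, 3 coordinates each, produced by keypoints_utils.py:

import torch
from src.islr.islr_model import DummyISLRModel

model = DummyISLRModel(input_dim=225, num_classes=100)
x = torch.randn(1, 225)   # one averaged keypoint vector, as built in predict.py
print(model(x).shape)     # torch.Size([1, 100]), one logit per class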
src/islr/save_dummy_model.py
ADDED
@@ -0,0 +1,12 @@
+import torch
+from islr_model import DummyISLRModel
+
+model = DummyISLRModel(num_classes=100)
+for param in model.parameters():
+    if param.dim() > 1:
+        torch.nn.init.xavier_uniform_(param)
+    else:
+        torch.nn.init.zeros_(param)
+
+torch.save(model.state_dict(), "demo_model.pt")
+print("✅ Modelo guardado como demo_model.pt")
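Note that this script writes demo_model.pt to the current directory, while app.py loads the checkpoint from models/demo_model.pt. A small, assumed variant that saves straight into models/ (the target directory is an assumption based on app.py):

import os
import torch
from islr_model import DummyISLRModel

os.makedirs("models", exist_ok=True)   # assumed location expected by app.py
model = DummyISLRModel(num_classes=100)
torch.save(model.state_dict(), os.path.join("models", "demo_model.pt"))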
src/pose/__init__.py
ADDED
@@ -0,0 +1 @@
+from . import *
src/pose/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (181 Bytes).
src/pose/__pycache__/keypoints_utils.cpython-39.pyc
ADDED
Binary file (1.23 kB).
src/pose/keypoints_utils.py
ADDED
@@ -0,0 +1,15 @@
+import mediapipe as mp
+import cv2
+import numpy as np
+
+mp_holistic = mp.solutions.holistic
+
+def extract_keypoints_from_frame(frame):
+    with mp_holistic.Holistic(static_image_mode=True) as holistic:
+        results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        if results.pose_landmarks and results.left_hand_landmarks and results.right_hand_landmarks:
+            pose = np.array([[p.x, p.y, p.z] for p in results.pose_landmarks.landmark])
+            left = np.array([[p.x, p.y, p.z] for p in results.left_hand_landmarks.landmark])
+            right = np.array([[p.x, p.y, p.z] for p in results.right_hand_landmarks.landmark])
+            return np.concatenate([pose, left, right], axis=0).flatten()
+    return np.random.random(33*3 + 21*3*2)  # fallback
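A minimal usage sketch; the image path here is hypothetical, any BGR frame loaded with OpenCV works:

import cv2
from src.pose.keypoints_utils import extract_keypoints_from_frame

frame = cv2.imread("some_frame.jpg")             # hypothetical test image
keypoints = extract_keypoints_from_frame(frame)
print(keypoints.shape)                           # (225,): (33 + 21 + 21) landmarks * 3 coords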
src/predict.py
ADDED
@@ -0,0 +1,47 @@
+import cv2
+import torch
+import numpy as np
+import pandas as pd
+from .islr.islr_model import DummyISLRModel
+from .pose.keypoints_utils import extract_keypoints_from_frame
+
+#model = DummyISLRModel(num_classes=100)
+#model.load_state_dict(torch.load("demo_model.pt", map_location='cpu'))
+#model.eval()
+
+LABELS = [f"Clase {i}" for i in range(100)]
+
+def predict_from_video(video_path, model=None):
+    cap = cv2.VideoCapture(video_path)
+    keypoints = []
+
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        keypoint = extract_keypoints_from_frame(frame)
+        keypoints.append(keypoint)
+
+    cap.release()
+    if not keypoints:
+        return "No keypoints detected", pd.DataFrame()
+
+    x = torch.tensor(np.mean(keypoints, axis=0)).float().unsqueeze(0)
+    #print("x:")
+    #print(x)
+    with torch.no_grad():
+        logits = model(x)
+        probs = torch.softmax(logits, dim=1).numpy()[0]
+
+    #print("probs:")
+    #print(probs)
+    top5_idx = probs.argsort()[-5:][::-1]
+
+    top5_labels = [LABELS[i] for i in top5_idx]
+    top5_probs = [probs[i] for i in top5_idx]
+    confidences = {LABELS[i]: float(probs[i]) for i in top5_idx}
+
+    #print("confidences:")
+    #print(confidences)
+    #df = pd.DataFrame({"label": top5_labels, "value": top5_probs})
+    return top5_labels[0], confidences  # df
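For clarity, the top-5 selection above works as follows (toy probabilities, not real model output):

import numpy as np

probs = np.array([0.05, 0.40, 0.10, 0.25, 0.20])
top5_idx = probs.argsort()[-5:][::-1]   # indices of the 5 largest values, descending
print(top5_idx)                         # [1 3 4 2 0]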
src/simple_demo.py
ADDED
@@ -0,0 +1,24 @@
+import gradio as gr
+from predict import predict_from_video
+
+def classify_video(video):
+    top1, top5_df = predict_from_video(video)
+    return f"Top-1: {top1}", top5_df
+
+demo = gr.Interface(
+    fn=classify_video,
+    inputs=gr.Video(sources=["upload", "webcam"], label="🎥 Video (webcam o archivo)"),
+    outputs=[
+        gr.Text(label="Predicción principal"),
+        gr.Label(num_top_classes=5),
+    ],
+    #outputs="text",
+    title="🧠 ISLR Demo con Mediapipe y 100 Clases",
+    description="Clasificador de lenguaje de señas aislado. Muestra las Top-5 clases más probables.",
+    #examples=["/home/va0831/slr/SLR_2024/Gloss/SignLanguageRecognition/J7tP98oDxqE_000000_000066-msasl-book.mp4",
+    #          "/home/va0831/slr/SLR_2024/Gloss/SignLanguageRecognition/J7tP98oDxqE_000000_000066-msasl-book.mp4"],
+    #example_labels = ["book","house"],
+)
+
+if __name__ == "__main__":
+    demo.launch(server_port=9090)
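One caveat: classify_video calls predict_from_video(video) without a model, and the module-level model in predict.py is commented out, so inference in this standalone demo raises an error as committed. A hedged sketch of one way to wire it up, following app.py's imports and checkpoint path (both assumptions here, and it presumes running from the repository root):

import torch
from src.predict import predict_from_video
from src.islr.islr_model import DummyISLRModel

model = DummyISLRModel(num_classes=100)
model.load_state_dict(torch.load("models/demo_model.pt", map_location="cpu"))
model.eval()

def classify_video(video):
    top1, top5 = predict_from_video(video, model=model)   # pass the loaded model explicitly
    return f"Top-1: {top1}", top5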
videos/wlasl/book.mp4
ADDED
Binary file (52.5 kB).