Seonghyeon Go
add spaces decorator
e836611
raw
history blame
7.96 kB
import html
import os
import re
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from huggingface_hub import hf_hub_download

from inference import inference
def download_models_from_hub():
    """
    Download model checkpoints from the Hugging Face Model Hub.

    Checkpoints are stored in a ``checkpoints`` directory relative to the
    current working directory. A checkpoint file that already exists there is
    reused and not downloaded again.

    Returns:
        dict: Maps each short model name (``"main"``, ``"backup"``) to the
        local checkpoint path as a string. A model whose download failed is
        left out of the mapping, so callers can detect a missing model with
        ``result.get(name)``.
    """
    model_dir = Path("checkpoints")
    model_dir.mkdir(exist_ok=True)

    # Original checkpoint filenames on HF Hub
    models = {
        "main": "EmbeddingModel_MERT_768-epoch=0073-val_loss=0.1058-val_acc=0.9585-val_f1=0.9366-val_precision=0.9936-val_recall=0.8857.ckpt",
        "backup": "step=007000-val_loss=0.1831-val_acc=0.9278.ckpt",
    }

    downloaded_models = {}
    for model_name, filename in models.items():
        local_path = model_dir / filename
        if local_path.exists():
            print(f"✅ {model_name} model already exists locally")
        else:
            print(f"📥 Downloading {model_name} model from Hugging Face Hub...")
            try:
                hf_hub_download(
                    repo_id="mippia/FST-checkpoints",
                    filename=filename,
                    local_dir=str(model_dir),
                    local_dir_use_symlinks=False,
                )
            except Exception as e:
                # Startup boundary: report the failure and carry on with the
                # remaining checkpoints instead of aborting the whole app.
                print(f"⚠️ Failed to download {model_name} model: {e}")
                continue
            print(f"✅ {model_name} model downloaded successfully!")
        downloaded_models[model_name] = str(local_path)

    return downloaded_models
# Matches "ai" only as a standalone token (case-insensitive), so words that
# merely contain those letters ("certainty", "failed", "main") do not count
# as an AI verdict.
_AI_TOKEN_RE = re.compile(r"(?<![a-z])ai(?![a-z])", re.IGNORECASE)


@spaces.GPU
def detect_ai_audio(audio_file):
    """
    Detect whether the uploaded audio file was generated by AI.

    Args:
        audio_file: Value supplied by the audio input component (configured
            with ``type="filepath"``), or ``None`` when nothing was uploaded.

    Returns:
        str: An HTML snippet showing the verdict, a prompt to upload a file,
        or an error message if processing raised an exception.
    """
    if audio_file is None:
        return """
        <div style="text-align: center; padding: 20px; border-radius: 10px; background: linear-gradient(135deg, #ff6b6b22, #ff6b6b11);">
        <div style="font-size: 18px; color: #ff6b6b;">⚠️ Please upload an audio file</div>
        </div>
        """

    try:
        result = inference(audio_file)

        # The verdict is derived from the textual form of whatever
        # `inference` returns; its exact return type is not visible here.
        result_text = str(result)
        lowered = result_text.lower()
        is_ai = (
            _AI_TOKEN_RE.search(result_text) is not None
            or "artificial" in lowered
            or "fake" in lowered
        )

        # Format result with better styling
        if is_ai:
            status = "AI Generated"
            color = "#ff6b6b"
            confidence = "High confidence this audio was generated by AI"
        else:
            status = "Human Generated"
            color = "#51cf66"
            confidence = "High confidence this audio was created by humans"

        # Escape the raw output so it cannot inject markup into the page.
        safe_result = html.escape(result_text)
        formatted_result = f"""
        <div style="text-align: center; padding: 25px; border-radius: 15px; background: linear-gradient(135deg, {color}22, {color}11); border: 2px solid {color}33;">
        <div style="font-size: 28px; font-weight: bold; color: {color}; margin-bottom: 10px;">{status}</div>
        <div style="font-size: 16px; color: #666; margin-bottom: 8px;">{confidence}</div>
        <div style="font-size: 14px; color: #888;">Raw output: {safe_result}</div>
        </div>
        """
        return formatted_result
    except Exception as e:
        # UI boundary: show the failure to the user instead of raising.
        # The message is escaped because it may echo file names or other
        # text that is not safe to embed in HTML as-is.
        safe_error = html.escape(str(e))
        error_result = f"""
        <div style="text-align: center; padding: 20px; border-radius: 10px; background: linear-gradient(135deg, #ff6b6b22, #ff6b6b11);">
        <div style="font-size: 20px; font-weight: bold; color: #ff6b6b; margin-bottom: 8px;">Error</div>
        <div style="font-size: 14px; color: #666;">Failed to process audio: {safe_error}</div>
        </div>
        """
        return error_result
# Custom CSS for modern design.
# This stylesheet is handed to gr.Interface through its `css` argument. The
# .upload-container, .output-container and .main-container selectors match the
# `elem_classes` values used when the interface is constructed in this file.
custom_css = """
/* Global background gradient */
.gradio-container {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
min-height: 100vh;
}
/* Main container styling */
.main-container {
background: rgba(255, 255, 255, 0.95) !important;
backdrop-filter: blur(10px) !important;
border-radius: 20px !important;
box-shadow: 0 20px 40px rgba(0,0,0,0.1) !important;
margin: 20px !important;
padding: 30px !important;
}
/* Title styling */
h1 {
background: linear-gradient(135deg, #667eea, #764ba2) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
text-align: center !important;
font-size: 3em !important;
font-weight: 800 !important;
margin-bottom: 10px !important;
}
/* Description text */
.gradio-markdown p {
text-align: center !important;
font-size: 1.2em !important;
color: #555 !important;
margin-bottom: 30px !important;
}
/* Audio upload component */
.upload-container {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
border-radius: 15px !important;
padding: 20px !important;
border: none !important;
box-shadow: 0 10px 30px rgba(240, 147, 251, 0.3) !important;
transition: all 0.3s ease !important;
}
.upload-container:hover {
transform: translateY(-5px) !important;
box-shadow: 0 15px 40px rgba(240, 147, 251, 0.4) !important;
}
/* Output container */
.output-container {
background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%) !important;
border-radius: 15px !important;
padding: 20px !important;
border: none !important;
box-shadow: 0 10px 30px rgba(168, 237, 234, 0.3) !important;
min-height: 150px !important;
}
/* Button styling */
.gr-button {
background: linear-gradient(135deg, #667eea, #764ba2) !important;
border: none !important;
border-radius: 25px !important;
padding: 12px 30px !important;
font-weight: 600 !important;
color: white !important;
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4) !important;
transition: all 0.3s ease !important;
}
.gr-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
}
/* Animation */
@keyframes fadeInUp {
from {
opacity: 0;
transform: translateY(30px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
.gradio-container > div {
animation: fadeInUp 0.8s ease-out !important;
}
/* Responsive design */
@media (max-width: 768px) {
h1 {
font-size: 2em !important;
}
.main-container {
margin: 10px !important;
padding: 20px !important;
}
}
"""
# Initialize the app
print("🚀 Starting FST AI Audio Detection App...")
print("📦 Initializing models...")

# Download models at startup. This runs at import time, so the checkpoints are
# fetched before the interface below is built.
models = download_models_from_hub()

# Check if main model is available: a missing or empty "main" entry only
# produces a warning; startup continues either way.
if models.get("main"):
    print("✅ Main model ready for inference")
else:
    print("⚠️ Warning: Main model not available, app may not work properly")
# Create Gradio interface: a single audio input routed through
# detect_ai_audio, whose HTML return value is rendered in the output panel.
demo = gr.Interface(
    fn=detect_ai_audio,
    # type="filepath" makes the callback receive a path rather than raw samples.
    inputs=gr.Audio(
        type="filepath",
        label="Upload Audio File",
        elem_classes=["upload-container"],
    ),
    outputs=gr.HTML(
        label="Detection Result",
        elem_classes=["output-container"],
    ),
    title="AI Audio Detector",
    description="""
    <div style="text-align: center; font-size: 1.2em; color: #555; margin: 20px 0;">
    <p><strong>Advanced AI technology</strong> to accurately detect whether uploaded audio was generated by AI!</p>
    <p>Supported formats: MP3, WAV, M4A, FLAC and various audio formats</p>
    <p>Powered by Fusion Segment Transformer (FST) - ISMIR 2025</p>
    <p style="font-size: 0.9em; color: #777;">🔬 Research-grade accuracy with MERT-768 backbone</p>
    </div>
    """,
    # Example clips offered in the UI; the paths are relative, so the files
    # are presumably shipped next to this script (not verifiable from here).
    examples=[
        ["example-ncs-light it up(human).mp3"],
        ["example-Strumming Heartbeats(suno v4).mp3"],
    ],
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="purple",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
    ),
    elem_classes=["main-container"],
)
if __name__ == "__main__":
    # Launch settings kept together in one mapping so they are easy to scan;
    # they are forwarded to demo.launch unchanged.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": True,  # request a public share link
        "show_api": False,
        "show_error": True,
    }
    demo.launch(**launch_options)