# English Accent Detector — Streamlit app (REM Waste interview challenge)
import streamlit as st | |
import requests | |
import tempfile | |
import os | |
import subprocess | |
import speech_recognition as sr | |
from pydub import AudioSegment | |
import re | |
from typing import Dict, Tuple | |
import time | |
# --- Streamlit page setup -------------------------------------------------
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="English Accent Detector | REM Waste",
    page_icon="π€",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS injected once so buttons and metric cards match the app branding.
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }
    .stButton > button {
        width: 100%;
        border-radius: 10px;
        border: none;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: bold;
        padding: 0.75rem;
    }
    .metric-container {
        background: #f0f2f6;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
class AccentDetector:
    """Streamlined accent detection for English speech analysis.

    Pipeline: download a video (direct URL, Loom, or YouTube), extract a
    mono 16 kHz WAV track, transcribe it with Google Speech Recognition,
    then score the transcript against per-accent keyword/vocabulary/phrase
    tables using whole-word matching.
    """

    def __init__(self):
        # Marker tables per accent. Keywords and phrases carry more weight
        # than vocabulary (see analyze_patterns).
        # NOTE(review): Canadian 'about'/'house'/'out' are very common English
        # words and will over-trigger even with whole-word matching — consider
        # pruning; data kept as-is to preserve scoring behavior.
        self.accent_patterns = {
            'American': {
                'keywords': ['gonna', 'wanna', 'gotta', 'kinda', 'sorta', 'yeah', 'awesome', 'dude'],
                'vocabulary': ['elevator', 'apartment', 'garbage', 'vacation', 'cookie', 'candy', 'mom', 'color'],
                'phrases': ['you know', 'like totally', 'for sure', 'right now']
            },
            'British': {
                'keywords': ['brilliant', 'lovely', 'quite', 'rather', 'chap', 'bloody', 'bloke', 'cheers'],
                'vocabulary': ['lift', 'flat', 'rubbish', 'holiday', 'biscuit', 'queue', 'mum', 'colour'],
                'phrases': ['i say', 'good heavens', 'how do you do', 'spot on']
            },
            'Australian': {
                'keywords': ['mate', 'bloody', 'crikey', 'reckon', 'fair dinkum', 'bonkers', 'ripper'],
                'vocabulary': ['arvo', 'brekkie', 'servo', 'bottle-o', 'mozzie', 'barbie', 'ute'],
                'phrases': ['no worries', 'good on ya', 'she\'ll be right', 'too right']
            },
            'Canadian': {
                'keywords': ['eh', 'about', 'house', 'out', 'sorry', 'hoser', 'beauty'],
                'vocabulary': ['toque', 'hydro', 'washroom', 'parkade', 'chesterfield', 'serviette'],
                'phrases': ['you bet', 'take off', 'give\'r', 'double double']
            },
            'South African': {
                'keywords': ['ag', 'man', 'hey', 'lekker', 'eish', 'shame', 'howzit'],
                'vocabulary': ['robot', 'bakkie', 'boerewors', 'biltong', 'braai', 'veld'],
                'phrases': ['just now', 'now now', 'is it', 'sharp sharp']
            }
        }

    @staticmethod
    def _contains_word(text: str, term: str) -> bool:
        """Return True when *term* occurs in *text* as a whole word/phrase.

        BUG FIX: the original used plain substring tests, so e.g. 'eh'
        matched inside 'behind' and 'mate' inside 'estimate', inflating
        scores. Word boundaries eliminate those false positives.
        """
        return re.search(r'\b' + re.escape(term) + r'\b', text) is not None

    def download_video(self, url: str) -> str:
        """Download *url* and return a local media file path.

        YouTube links go through yt-dlp (audio-only, converted to WAV);
        Loom links are scraped for their MP4 stream URL; anything else is
        treated as a directly downloadable video.

        Raises:
            Exception: wrapping any download/extraction failure.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            # YouTube support (including Shorts)
            if 'youtube.com' in url or 'youtu.be' in url:
                return self._download_youtube_audio(url)
            # Loom support (fallback: extract the stream URL from page HTML)
            if 'loom.com' in url:
                url = self._resolve_loom_stream(url, headers)
            # Direct (or resolved Loom) download, streamed to a temp file.
            response = requests.get(url, headers=headers, stream=True, timeout=60)
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_file.write(chunk)
                return temp_file.name
        except Exception as e:
            raise Exception(f"Download failed: {str(e)}")

    def _download_youtube_audio(self, url: str) -> str:
        """Fetch best audio from a YouTube URL via yt-dlp; return a WAV path."""
        try:
            import yt_dlp
        except ImportError:
            raise Exception("yt-dlp is required for YouTube downloads. Please install with 'pip install yt-dlp'.")
        import shutil
        # Scratch directory; yt-dlp picks the filename inside it.
        tmpdir = tempfile.mkdtemp()
        ydl_opts = {
            'format': 'bestaudio[ext=m4a]/bestaudio/best',
            'outtmpl': f'{tmpdir}/%(id)s.%(ext)s',
            'quiet': True,
            'noplaylist': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            'overwrites': True,
        }
        # BUG FIX: ffmpeg_location was unconditionally hard-coded to the
        # macOS Homebrew path, breaking every other platform. Prefer the
        # PATH copy; only pin the Homebrew binary when it actually exists.
        if not shutil.which('ffmpeg') and os.path.exists('/opt/homebrew/bin/ffmpeg'):
            ydl_opts['ffmpeg_location'] = '/opt/homebrew/bin/ffmpeg'
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(url, download=True)
            # Find the resulting .wav file.
            for f in os.listdir(tmpdir):
                if f.endswith('.wav'):
                    # Copy to a persistent temp file so the audio survives
                    # the scratch-directory cleanup below.
                    final_temp = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
                    final_temp.close()
                    shutil.copyfile(os.path.join(tmpdir, f), final_temp.name)
                    return final_temp.name
            raise Exception("yt-dlp did not produce a valid audio file. Try another video or update yt-dlp/ffmpeg.")
        except Exception as e:
            raise Exception(f"yt-dlp failed: {str(e)}. Try updating yt-dlp and ffmpeg.")
        finally:
            # BUG FIX: the mkdtemp() directory used to leak on every call.
            shutil.rmtree(tmpdir, ignore_errors=True)

    def _resolve_loom_stream(self, url: str, headers: dict) -> str:
        """Scrape a Loom page and return its raw MP4 stream URL."""
        resp = requests.get(url, headers=headers, timeout=30)
        if resp.status_code != 200:
            raise Exception("Failed to fetch Loom page")
        html = resp.text
        match = re.search(r'src="([^"]+\.mp4)"', html)
        if match:
            return match.group(1)
        # BUG FIX: this fallback pattern has no capture group, so the
        # original match.group(1) raised IndexError whenever it was the
        # one that matched; use the whole match instead.
        match = re.search(r'https://cdn\.loom\.com/sessions/[^"\s]+\.mp4', html)
        if match:
            return match.group(0)
        raise Exception("Could not extract Loom video stream URL from page HTML")

    def extract_audio_simple(self, video_path: str) -> str:
        """Convert *video_path* (mp3/wav/mp4/...) to a mono 16 kHz WAV.

        The result is capped at 2 minutes and written alongside the input;
        returns the exported .wav path.
        """
        try:
            ext = os.path.splitext(video_path)[1].lower()
            audio_path = video_path.rsplit('.', 1)[0] + '.wav'
            # If already wav, use the dedicated loader; otherwise let
            # pydub/ffmpeg sniff the container.
            if ext == '.wav':
                audio = AudioSegment.from_wav(video_path)
            else:
                audio = AudioSegment.from_file(video_path)
            # Speech recognition works best on 16 kHz mono.
            audio = audio.set_frame_rate(16000).set_channels(1)
            # Cap at 120 s (pydub lengths are in milliseconds).
            if len(audio) > 120000:
                audio = audio[:120000]
            audio.export(audio_path, format="wav")
            return audio_path
        except Exception as e:
            raise Exception(f"Audio extraction failed: {str(e)}")

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe *audio_path* and return the lower-cased text.

        Raises:
            Exception: when speech is unintelligible, the recognition
                service errors, or any other failure occurs.
        """
        try:
            r = sr.Recognizer()
            r.energy_threshold = 300
            r.dynamic_energy_threshold = True
            with sr.AudioFile(audio_path) as source:
                # Short calibration pass so background noise is ignored.
                r.adjust_for_ambient_noise(source, duration=0.5)
                audio_data = r.record(source)
            # Google Web Speech API endpoint; requires network access.
            text = r.recognize_google(audio_data, language='en-US')
            return text.lower()
        except sr.UnknownValueError:
            raise Exception("Could not understand the audio clearly")
        except sr.RequestError as e:
            raise Exception(f"Speech recognition service error: {str(e)}")
        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}")

    def analyze_patterns(self, text: str) -> Dict[str, float]:
        """Score *text* (a lower-cased transcript) against every accent.

        Returns a dict mapping accent name to a confidence in [5.0, 95.0].
        Matching is whole-word (see _contains_word); accents with no
        explicit markers fall back to _get_base_score.
        """
        scores: Dict[str, float] = {}
        word_count = max(len(text.split()), 1)
        # (pattern group, weight): keywords/phrases outweigh vocabulary.
        weighted_groups = (('keywords', 20.0), ('vocabulary', 15.0), ('phrases', 25.0))
        for accent, patterns in self.accent_patterns.items():
            score = 0.0
            total_matches = 0
            for group, weight in weighted_groups:
                for term in patterns[group]:
                    if self._contains_word(text, term):
                        score += weight
                        total_matches += 1
            if total_matches > 0:
                # Scale by marker density; cap below 100 to stay honest.
                score = min(score * (total_matches / word_count) * 50, 95.0)
            else:
                score = self._get_base_score(text, accent)
            scores[accent] = round(max(score, 5.0), 1)
        return scores

    def _get_base_score(self, text: str, accent: str) -> float:
        """Fallback prior used when no explicit markers were found."""
        # Rough priors reflecting expected prevalence among English speakers.
        base_scores = {
            'American': 30.0,
            'British': 20.0,
            'Australian': 15.0,
            'Canadian': 18.0,
            'South African': 12.0
        }
        score = base_scores.get(accent, 15.0)
        # Spelling adjustments: regional spellings spoken aloud (rare) or
        # present in transcripts still hint at the dialect.
        if accent == 'British':
            if any(self._contains_word(text, word) for word in ['colour', 'favour', 'centre', 'theatre']):
                score += 25.0
        elif accent == 'American':
            if any(self._contains_word(text, word) for word in ['color', 'favor', 'center', 'theater']):
                score += 25.0
        return min(score, 45.0)

    def classify_accent(self, scores: Dict[str, float]) -> Tuple[str, float, str]:
        """Pick the top-scoring accent and build a human-readable summary.

        Returns (accent, confidence, explanation); ("Unknown", 0.0, ...)
        when *scores* is empty.
        """
        if not scores:
            return "Unknown", 0.0, "No speech detected"
        # Get top result
        top_accent = max(scores.items(), key=lambda x: x[1])
        accent, confidence = top_accent
        # Generate explanation banded by confidence.
        if confidence < 25:
            explanation = "Low confidence - speech patterns are not strongly distinctive"
        elif confidence < 50:
            explanation = f"Moderate confidence in {accent} accent based on some linguistic markers"
        elif confidence < 75:
            explanation = f"Good confidence in {accent} accent with clear characteristic patterns"
        else:
            explanation = f"High confidence in {accent} accent with strong linguistic evidence"
        return accent, confidence, explanation
# Factory for the detector used by the UI layer.
def get_detector() -> "AccentDetector":
    """Build and return a fresh AccentDetector instance."""
    detector = AccentDetector()
    return detector
def main():
    """Render the Streamlit UI and drive the full analysis pipeline.

    Flow: collect a video URL, then on button press run
    download -> extract audio -> transcribe -> score accents, showing
    progress and results along the way. Temporary media files are always
    removed in the ``finally`` block.
    """
    # Header
    st.title("π€ English Accent Detection Tool")
    st.markdown("**AI-powered accent analysis for English speech | Built for REM Waste**")
    # Description
    with st.expander("βΉοΈ How it works", expanded=False):
        st.markdown("""
        1. **Input**: Paste a public video URL (MP4, Loom, YouTube, etc.)
        2. **Processing**: Extract audio β Transcribe speech β Analyze patterns
        3. **Output**: Accent classification + confidence score + explanation
        **Supported Accents**: American, British, Australian, Canadian, South African
        """)
    # Input section
    st.subheader("πΉ Video Input")
    # Sample URLs for testing
    with st.expander("π§ͺ Test with sample videos"):
        st.markdown("""
        **Sample URLs for testing:**
        - `https://sample-videos.com/zip/10/mp4/SampleVideo_1280x720_1mb.mp4`
        - `https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4`
        - Or any public Loom/YouTube video URL
        """)
    video_url = st.text_input(
        "Enter video URL:",
        placeholder="https://example.com/video.mp4",
        help="Must be a publicly accessible video URL"
    )
    # Process button
    if st.button("π Analyze Accent", type="primary"):
        # Cheap validation before any network work.
        if not video_url.strip():
            st.error("β οΈ Please enter a video URL")
            return
        if not video_url.startswith(('http://', 'https://')):
            st.error("β οΈ Please enter a valid URL starting with http:// or https://")
            return
        # Initialize detector and progress tracking
        detector = get_detector()
        temp_files = []  # paths deleted in the finally block below
        try:
            # Progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()
            # Step 1: Download video
            status_text.text("π₯ Downloading video...")
            progress_bar.progress(20)
            video_path = detector.download_video(video_url)
            temp_files.append(video_path)
            # Step 2: Extract audio
            status_text.text("π΅ Extracting audio...")
            progress_bar.progress(50)
            audio_path = detector.extract_audio_simple(video_path)
            temp_files.append(audio_path)
            # Step 3: Transcribe
            status_text.text("π€ Transcribing speech...")
            progress_bar.progress(75)
            transcript = detector.transcribe_audio(audio_path)
            # Step 4: Analyze
            status_text.text("π Analyzing accent patterns...")
            progress_bar.progress(90)
            scores = detector.analyze_patterns(transcript)
            accent, confidence, explanation = detector.classify_accent(scores)
            # Complete
            progress_bar.progress(100)
            status_text.text("β Analysis complete!")
            time.sleep(0.5)  # brief pause so the completed state is visible
            # Clear progress indicators
            progress_bar.empty()
            status_text.empty()
            # Display results
            st.success("π **Analysis Complete!**")
            # Main metrics rendered as three styled cards (CSS at top of file).
            col1, col2, col3 = st.columns(3)
            with col1:
                st.markdown(f"""
                <div class="metric-container">
                    <h3>π£οΈ Detected Accent</h3>
                    <h2 style="color: #667eea;">{accent}</h2>
                </div>
                """, unsafe_allow_html=True)
            with col2:
                st.markdown(f"""
                <div class="metric-container">
                    <h3>π― Confidence</h3>
                    <h2 style="color: #764ba2;">{confidence}%</h2>
                </div>
                """, unsafe_allow_html=True)
            with col3:
                # Get transcript length for quality indicator
                word_count = len(transcript.split())
                quality = "High" if word_count > 50 else "Medium" if word_count > 20 else "Low"
                st.markdown(f"""
                <div class="metric-container">
                    <h3>π Data Quality</h3>
                    <h2 style="color: #28a745;">{quality}</h2>
                    <small>{word_count} words</small>
                </div>
                """, unsafe_allow_html=True)
            st.markdown("---")
            # Explanation
            st.subheader("π Analysis Summary")
            st.info(explanation)
            # Transcript
            st.subheader("π Transcribed Speech")
            st.text_area(
                "Full transcript:",
                transcript,
                height=120,
                help="This is what the AI heard from the video"
            )
            # Detailed scores, highest first.
            st.subheader("π All Accent Scores")
            # Create a more visual representation
            for accent_name, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
                # Create progress bar for each accent
                col_name, col_bar, col_score = st.columns([2, 6, 1])
                with col_name:
                    st.write(f"**{accent_name}**")
                with col_bar:
                    st.progress(score / 100)  # st.progress expects 0.0-1.0
                with col_score:
                    st.write(f"{score}%")
            # Additional insights keyed to the same confidence bands as above.
            if confidence > 60:
                st.success(f"π― **Strong Detection**: The {accent} accent markers are clearly present in the speech.")
            elif confidence > 40:
                st.warning(f"β οΈ **Moderate Detection**: Some {accent} patterns detected, but results may vary with longer audio.")
            else:
                st.info("π‘ **Tip**: Longer speech samples (30+ seconds) generally provide more accurate results.")
        except Exception as e:
            # All pipeline stages raise plain Exception with a stage-tagged message.
            st.error(f"β **Processing Error**: {str(e)}")
            st.info("""
            **Troubleshooting Tips:**
            - Ensure the video URL is publicly accessible
            - Try a different video format or shorter video
            - Make sure the video contains clear English speech
            - Check your internet connection
            """)
        finally:
            # Cleanup temp files; best-effort, failures are non-fatal.
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except:
                    pass
    # Footer information
    st.markdown("---")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("""
        **π§ Technical Details**
        - Audio processing: Up to 2 minutes
        - Speech recognition: Google API
        - Analysis: Pattern matching + linguistics
        - Processing time: ~30-90 seconds
        """)
    with col2:
        st.markdown("""
        **π Requirements**
        - Public video URLs only
        - Clear English speech preferred
        - Supports MP4, MOV, AVI formats
        - Works with Loom, YouTube, direct links
        """)
    # API information
    # NOTE(review): this example references a process_video method that is
    # not defined in this file — confirm it exists in the packaged API.
    with st.expander("π API Usage"):
        st.code("""
        # Python API usage example
        from accent_detector import AccentDetector
        detector = AccentDetector()
        result = detector.process_video("https://your-video.com/file.mp4")
        print(f"Accent: {result['accent']}")
        print(f"Confidence: {result['confidence']}%")
        """, language="python")
    # About section
    with st.expander("βΉοΈ About This Tool"):
        st.markdown("""
        **Built for REM Waste Interview Challenge**
        This accent detection tool analyzes English speech patterns to classify regional accents.
        It's designed for hiring automation systems that need to evaluate spoken English proficiency.
        **Algorithm Overview:**
        - Extracts audio from video files
        - Transcribes speech using Google Speech Recognition
        - Analyzes linguistic patterns, vocabulary, and pronunciation markers
        - Provides confidence scores based on pattern strength
        **Accuracy Notes:**
        - Best results with 30+ seconds of clear speech
        - Confidence scores reflect pattern strength, not absolute accuracy
        - Designed for screening purposes, not definitive classification
        **Privacy & Ethics:**
        - No audio/video data is stored permanently
        - Temporary files are automatically deleted
        - Tool is intended for voluntary language assessment only
        """)
# Script entry point: launch the Streamlit UI when run directly.
if __name__ == "__main__":
    main()