"""agent_decoder/streamlit_app.py: English Accent Detection Tool (Streamlit)."""
import streamlit as st
import requests
import tempfile
import os
import speech_recognition as sr
from pydub import AudioSegment
import re
from typing import Dict, Tuple
import time
# Configure Streamlit page
st.set_page_config(
page_title="English Accent Detector | REM Waste",
    page_icon="🎤",
layout="wide",
initial_sidebar_state="collapsed"
)
# Custom CSS for better styling
st.markdown("""
<style>
.main > div {
padding-top: 2rem;
}
.stButton > button {
width: 100%;
border-radius: 10px;
border: none;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
font-weight: bold;
padding: 0.75rem;
}
.metric-container {
background: #f0f2f6;
padding: 1rem;
border-radius: 10px;
text-align: center;
}
</style>
""", unsafe_allow_html=True)
class AccentDetector:
"""Streamlined accent detection for English speech analysis"""
def __init__(self):
self.accent_patterns = {
'American': {
'keywords': ['gonna', 'wanna', 'gotta', 'kinda', 'sorta', 'yeah', 'awesome', 'dude'],
'vocabulary': ['elevator', 'apartment', 'garbage', 'vacation', 'cookie', 'candy', 'mom', 'color'],
'phrases': ['you know', 'like totally', 'for sure', 'right now']
},
'British': {
'keywords': ['brilliant', 'lovely', 'quite', 'rather', 'chap', 'bloody', 'bloke', 'cheers'],
'vocabulary': ['lift', 'flat', 'rubbish', 'holiday', 'biscuit', 'queue', 'mum', 'colour'],
'phrases': ['i say', 'good heavens', 'how do you do', 'spot on']
},
'Australian': {
'keywords': ['mate', 'bloody', 'crikey', 'reckon', 'fair dinkum', 'bonkers', 'ripper'],
'vocabulary': ['arvo', 'brekkie', 'servo', 'bottle-o', 'mozzie', 'barbie', 'ute'],
'phrases': ['no worries', 'good on ya', 'she\'ll be right', 'too right']
},
'Canadian': {
'keywords': ['eh', 'about', 'house', 'out', 'sorry', 'hoser', 'beauty'],
'vocabulary': ['toque', 'hydro', 'washroom', 'parkade', 'chesterfield', 'serviette'],
'phrases': ['you bet', 'take off', 'give\'r', 'double double']
},
'South African': {
'keywords': ['ag', 'man', 'hey', 'lekker', 'eish', 'shame', 'howzit'],
'vocabulary': ['robot', 'bakkie', 'boerewors', 'biltong', 'braai', 'veld'],
'phrases': ['just now', 'now now', 'is it', 'sharp sharp']
}
}
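        # Note: these are purely lexical cues (words and phrases), not acoustic
        # features, so results depend entirely on the transcript. Very common words
        # such as 'about' or 'out' (meant as Canadian-raising cues) will match in
        # almost any English text and can inflate that accent's score.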
    def download_video(self, url: str) -> str:
        """Download a video/audio file to a temp path, with Loom and YouTube support.

        Deliberately not cached with st.cache_data: the temp file is deleted after
        each analysis, so a cached path would point at a file that no longer exists.
        """
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
# YouTube support (including Shorts)
if 'youtube.com' in url or 'youtu.be' in url:
try:
import yt_dlp
except ImportError:
raise Exception("yt-dlp is required for YouTube downloads. Please install with 'pip install yt-dlp'.")
# Use yt-dlp to download best audio to a temp directory, let yt-dlp pick the filename
tmpdir = tempfile.mkdtemp()
ydl_opts = {
'format': 'bestaudio[ext=m4a]/bestaudio/best',
'outtmpl': f'{tmpdir}/%(id)s.%(ext)s',
'quiet': True,
'noplaylist': True,
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'wav',
'preferredquality': '192',
}],
                    # Requires ffmpeg on PATH; avoid a hardcoded ffmpeg_location
                    # (e.g. /opt/homebrew/bin) so this also works on Linux hosts.
                    'overwrites': True,
}
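                # The FFmpegExtractAudio postprocessor should leave a .wav file in
                # tmpdir; we scan for it below instead of guessing the exact name.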
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=True)
# Find the resulting .wav file
for f in os.listdir(tmpdir):
if f.endswith('.wav'):
# Move the file to a permanent temp location so it persists after this function
final_temp = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
final_temp.close()
with open(os.path.join(tmpdir, f), 'rb') as src, open(final_temp.name, 'wb') as dst:
dst.write(src.read())
return final_temp.name
raise Exception("yt-dlp did not produce a valid audio file. Try another video or update yt-dlp/ffmpeg.")
except Exception as e:
raise Exception(f"yt-dlp failed: {str(e)}. Try updating yt-dlp and ffmpeg.")
# Loom support (fallback: try to extract video from page HTML)
if 'loom.com' in url:
resp = requests.get(url, headers=headers, timeout=30)
if resp.status_code != 200:
raise Exception("Failed to fetch Loom page")
html = resp.text
                match = re.search(r'src="([^"]+\.mp4)"', html)
                if not match:
                    # Capture group added so match.group(1) works for both patterns
                    match = re.search(r'(https://cdn\.loom\.com/sessions/[^"\s]+\.mp4)', html)
                if not match:
                    raise Exception("Could not extract Loom video stream URL from page HTML")
                url = match.group(1)
# Download video (Loom or direct)
response = requests.get(url, headers=headers, stream=True, timeout=60)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
temp_file.write(chunk)
return temp_file.name
except Exception as e:
raise Exception(f"Download failed: {str(e)}")
def extract_audio_simple(self, video_path: str) -> str:
"""Robust audio extraction: handles mp3, wav, mp4, etc."""
try:
ext = os.path.splitext(video_path)[1].lower()
audio_path = video_path.rsplit('.', 1)[0] + '.wav'
# If already wav, use pydub directly
if ext == '.wav':
audio = AudioSegment.from_wav(video_path)
else:
audio = AudioSegment.from_file(video_path)
audio = audio.set_frame_rate(16000).set_channels(1)
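            # Keep at most the first 2 minutes (120,000 ms) to bound transcription time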
if len(audio) > 120000:
audio = audio[:120000]
audio.export(audio_path, format="wav")
return audio_path
except Exception as e:
raise Exception(f"Audio extraction failed: {str(e)}")
def transcribe_audio(self, audio_path: str) -> str:
"""Transcribe with error handling"""
try:
r = sr.Recognizer()
r.energy_threshold = 300
r.dynamic_energy_threshold = True
with sr.AudioFile(audio_path) as source:
r.adjust_for_ambient_noise(source, duration=0.5)
audio_data = r.record(source)
# Try Google Speech Recognition
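            # (network call to Google's free Web Speech API; it fails offline and
            # can reject long clips, one reason audio is capped at 2 minutes upstream)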
text = r.recognize_google(audio_data, language='en-US')
return text.lower()
except sr.UnknownValueError:
raise Exception("Could not understand the audio clearly")
except sr.RequestError as e:
raise Exception(f"Speech recognition service error: {str(e)}")
except Exception as e:
raise Exception(f"Transcription failed: {str(e)}")
    def analyze_patterns(self, text: str) -> Dict[str, float]:
        """Pattern analysis with whole-word matching"""
        scores = {}
        words = text.split()
        word_count = max(len(words), 1)

        def contains(term: str) -> bool:
            # Match whole words/phrases only, so 'eh' does not match inside 'behind'
            return re.search(r'\b' + re.escape(term) + r'\b', text) is not None

        for accent, patterns in self.accent_patterns.items():
            score = 0.0
            total_matches = 0
            # Keywords (high weight)
            for keyword in patterns['keywords']:
                if contains(keyword):
                    score += 20.0
                    total_matches += 1
            # Vocabulary (medium weight)
            for vocab in patterns['vocabulary']:
                if contains(vocab):
                    score += 15.0
                    total_matches += 1
            # Phrases (high weight)
            for phrase in patterns['phrases']:
                if contains(phrase):
                    score += 25.0
                    total_matches += 1
            # Scale by match density and cap at 95; otherwise fall back to a base score
            if total_matches > 0:
                score = min(score * (total_matches / word_count) * 50, 95.0)
            else:
                score = self._get_base_score(text, accent)
            scores[accent] = round(max(score, 5.0), 1)
        return scores
def _get_base_score(self, text: str, accent: str) -> float:
"""Base scoring for general patterns"""
base_scores = {
'American': 30.0,
'British': 20.0,
'Australian': 15.0,
'Canadian': 18.0,
'South African': 12.0
}
score = base_scores.get(accent, 15.0)
# Spelling adjustments
if accent == 'British':
if any(word in text for word in ['colour', 'favour', 'centre', 'theatre']):
score += 25.0
elif accent == 'American':
if any(word in text for word in ['color', 'favor', 'center', 'theater']):
score += 25.0
return min(score, 45.0)
def classify_accent(self, scores: Dict[str, float]) -> Tuple[str, float, str]:
"""Classify and explain results"""
if not scores:
return "Unknown", 0.0, "No speech detected"
# Get top result
top_accent = max(scores.items(), key=lambda x: x[1])
accent, confidence = top_accent
# Generate explanation
if confidence < 25:
explanation = "Low confidence - speech patterns are not strongly distinctive"
elif confidence < 50:
explanation = f"Moderate confidence in {accent} accent based on some linguistic markers"
elif confidence < 75:
explanation = f"Good confidence in {accent} accent with clear characteristic patterns"
else:
explanation = f"High confidence in {accent} accent with strong linguistic evidence"
return accent, confidence, explanation
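    # Minimal usage sketch (hypothetical transcript):
    #   detector = AccentDetector()
    #   scores = detector.analyze_patterns("it was a lovely holiday, brilliant weather")
    #   accent, confidence, explanation = detector.classify_accent(scores)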
# Initialize detector
@st.cache_resource
def get_detector():
return AccentDetector()
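# st.cache_resource keeps one AccentDetector instance across reruns and sessions;
# that is safe here because the detector only holds read-only pattern data.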
def main():
# Header
    st.title("🎤 English Accent Detection Tool")
st.markdown("**AI-powered accent analysis for English speech | Built for REM Waste**")
# Description
    with st.expander("ℹ️ How it works", expanded=False):
        st.markdown("""
        1. **Input**: Paste a public video URL (MP4, Loom, YouTube, etc.)
        2. **Processing**: Extract audio → Transcribe speech → Analyze patterns
        3. **Output**: Accent classification + confidence score + explanation

        **Supported Accents**: American, British, Australian, Canadian, South African
        """)
# Input section
    st.subheader("📹 Video Input")
# Sample URLs for testing
    with st.expander("🧪 Test with sample videos"):
st.markdown("""
**Sample URLs for testing:**
- `https://sample-videos.com/zip/10/mp4/SampleVideo_1280x720_1mb.mp4`
- `https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4`
- Or any public Loom/YouTube video URL
""")
video_url = st.text_input(
"Enter video URL:",
placeholder="https://example.com/video.mp4",
help="Must be a publicly accessible video URL"
)
# Process button
    if st.button("🚀 Analyze Accent", type="primary"):
        if not video_url.strip():
            st.error("⚠️ Please enter a video URL")
            return
        if not video_url.startswith(('http://', 'https://')):
            st.error("⚠️ Please enter a valid URL starting with http:// or https://")
return
# Initialize detector and progress tracking
detector = get_detector()
temp_files = []
try:
# Progress bar
progress_bar = st.progress(0)
status_text = st.empty()
# Step 1: Download video
            status_text.text("📥 Downloading video...")
progress_bar.progress(20)
video_path = detector.download_video(video_url)
temp_files.append(video_path)
# Step 2: Extract audio
            status_text.text("🎵 Extracting audio...")
progress_bar.progress(50)
audio_path = detector.extract_audio_simple(video_path)
temp_files.append(audio_path)
# Step 3: Transcribe
            status_text.text("🎤 Transcribing speech...")
progress_bar.progress(75)
transcript = detector.transcribe_audio(audio_path)
# Step 4: Analyze
            status_text.text("🔍 Analyzing accent patterns...")
progress_bar.progress(90)
scores = detector.analyze_patterns(transcript)
accent, confidence, explanation = detector.classify_accent(scores)
# Complete
progress_bar.progress(100)
            status_text.text("✅ Analysis complete!")
time.sleep(0.5)
# Clear progress indicators
progress_bar.empty()
status_text.empty()
# Display results
            st.success("🎉 **Analysis Complete!**")
# Main metrics
col1, col2, col3 = st.columns(3)
with col1:
st.markdown(f"""
<div class="metric-container">
                <h3>🗣️ Detected Accent</h3>
<h2 style="color: #667eea;">{accent}</h2>
</div>
""", unsafe_allow_html=True)
with col2:
st.markdown(f"""
<div class="metric-container">
                <h3>🎯 Confidence</h3>
<h2 style="color: #764ba2;">{confidence}%</h2>
</div>
""", unsafe_allow_html=True)
with col3:
# Get transcript length for quality indicator
word_count = len(transcript.split())
quality = "High" if word_count > 50 else "Medium" if word_count > 20 else "Low"
st.markdown(f"""
<div class="metric-container">
                <h3>📊 Data Quality</h3>
<h2 style="color: #28a745;">{quality}</h2>
<small>{word_count} words</small>
</div>
""", unsafe_allow_html=True)
st.markdown("---")
# Explanation
            st.subheader("📝 Analysis Summary")
st.info(explanation)
# Transcript
            st.subheader("📄 Transcribed Speech")
st.text_area(
"Full transcript:",
transcript,
height=120,
help="This is what the AI heard from the video"
)
# Detailed scores
            st.subheader("📊 All Accent Scores")
# Create a more visual representation
for accent_name, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
# Create progress bar for each accent
col_name, col_bar, col_score = st.columns([2, 6, 1])
with col_name:
st.write(f"**{accent_name}**")
with col_bar:
st.progress(score / 100)
with col_score:
st.write(f"{score}%")
# Additional insights
            if confidence > 60:
                st.success(f"🎯 **Strong Detection**: The {accent} accent markers are clearly present in the speech.")
            elif confidence > 40:
                st.warning(f"⚠️ **Moderate Detection**: Some {accent} patterns detected, but results may vary with longer audio.")
            else:
                st.info("💡 **Tip**: Longer speech samples (30+ seconds) generally provide more accurate results.")
except Exception as e:
            st.error(f"❌ **Processing Error**: {str(e)}")
st.info("""
**Troubleshooting Tips:**
- Ensure the video URL is publicly accessible
- Try a different video format or shorter video
- Make sure the video contains clear English speech
- Check your internet connection
""")
finally:
# Cleanup temp files
for temp_file in temp_files:
try:
if os.path.exists(temp_file):
os.remove(temp_file)
                except OSError:
                    # Best-effort cleanup; ignore files that are already gone
                    pass
# Footer information
st.markdown("---")
col1, col2 = st.columns(2)
with col1:
st.markdown("""
        **🔧 Technical Details**
- Audio processing: Up to 2 minutes
- Speech recognition: Google API
- Analysis: Pattern matching + linguistics
- Processing time: ~30-90 seconds
""")
with col2:
st.markdown("""
        **📋 Requirements**
- Public video URLs only
- Clear English speech preferred
- Supports MP4, MOV, AVI formats
- Works with Loom, YouTube, direct links
""")
# API information
    with st.expander("🔗 API Usage"):
        st.code("""
        # Example: driving this module's AccentDetector directly
        # (assumes the file is importable as streamlit_app)
        from streamlit_app import AccentDetector

        detector = AccentDetector()
        video_path = detector.download_video("https://your-video.com/file.mp4")
        audio_path = detector.extract_audio_simple(video_path)
        transcript = detector.transcribe_audio(audio_path)
        scores = detector.analyze_patterns(transcript)
        accent, confidence, explanation = detector.classify_accent(scores)
        print(f"Accent: {accent}")
        print(f"Confidence: {confidence}%")
        """, language="python")
# About section
    with st.expander("ℹ️ About This Tool"):
st.markdown("""
**Built for REM Waste Interview Challenge**
This accent detection tool analyzes English speech patterns to classify regional accents.
It's designed for hiring automation systems that need to evaluate spoken English proficiency.
**Algorithm Overview:**
- Extracts audio from video files
- Transcribes speech using Google Speech Recognition
- Analyzes linguistic patterns, vocabulary, and pronunciation markers
- Provides confidence scores based on pattern strength
**Accuracy Notes:**
- Best results with 30+ seconds of clear speech
- Confidence scores reflect pattern strength, not absolute accuracy
- Designed for screening purposes, not definitive classification
**Privacy & Ethics:**
- No audio/video data is stored permanently
- Temporary files are automatically deleted
- Tool is intended for voluntary language assessment only
""")
if __name__ == "__main__":
main()