BosonAI_Hackathon / tools /step047_emotion_auto_batch.py
github-actions[bot]
Deploy snapshot for HF Space (LFS pointers, heavy tests removed)
09eaf7c
# -*- coding: utf-8 -*-
"""
tools/step047_emotion_auto_batch.py
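Batch tuner that applies the rate-safe, extra-obvious emotion DSP from step045
(`auto_tune_emotion`) to every WAV file under `<folder>/wavs`, overwriting each
file in place.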
"""
from __future__ import annotations
import os, glob
from typing import Optional, Tuple, List
import numpy as np
import soundfile as sf
from loguru import logger
from .step045_emotion import auto_tune_emotion
def _downmix_mono(y: np.ndarray) -> np.ndarray:
    """Return ``y`` as a float32 array, averaging over axis 1 when it is 2-D.

    Arrays of any other rank are returned as-is apart from the float32
    conversion.
    """
    samples = np.asarray(y, dtype=np.float32)
    mono = samples.mean(axis=1) if samples.ndim == 2 else samples
    return mono.astype(np.float32, copy=False)
def _xfade(a: np.ndarray, b: np.ndarray, xfade_samples: int) -> np.ndarray:
    """Join ``a`` and ``b`` with a linear crossfade of up to ``xfade_samples``.

    The last samples of ``a`` are faded out while the first samples of ``b``
    are faded in, and the two are summed; the overlap is clamped to the
    length of the shorter input. With a non-positive ``xfade_samples`` or an
    empty ``a`` the inputs are simply concatenated; with an empty ``b`` the
    float32 view of ``a`` is returned unchanged.
    """
    lead = np.asarray(a, dtype=np.float32)
    follow = np.asarray(b, dtype=np.float32)

    nothing_to_blend = xfade_samples <= 0 or len(lead) == 0
    if nothing_to_blend:
        return np.concatenate([lead, follow]).astype(np.float32, copy=False)
    if len(follow) == 0:
        return lead

    overlap = min(int(xfade_samples), len(lead), len(follow))
    ramp_down = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
    ramp_up = 1.0 - ramp_down

    # Part of ``a`` that plays untouched before the blended region starts.
    if overlap < len(lead):
        untouched = lead[:-overlap]
    else:
        untouched = np.zeros(0, dtype=np.float32)

    blended = lead[-overlap:] * ramp_down + follow[:overlap] * ramp_up
    pieces = (untouched, blended, follow[overlap:])
    return np.concatenate(pieces).astype(np.float32, copy=False)
def _segment_indices(n: int, sr: int, win_s: float, hop_s: float) -> List[Tuple[int,int]]:
    """Split ``n`` samples into ``(start, stop)`` windows.

    Windows are ``win_s`` seconds long and start every ``hop_s`` seconds,
    both converted to samples at rate ``sr``. The final window is cut off at
    ``n``, and no further window is started once one has reached ``n``.
    A non-positive window or hop (in samples) yields the single span
    ``(0, n)``.
    """
    win_len = int(round(win_s * sr))
    hop_len = int(round(hop_s * sr))
    if win_len <= 0 or hop_len <= 0:
        return [(0, n)]

    spans = []
    for start in range(0, n, hop_len):
        stop = min(n, start + win_len)
        spans.append((start, stop))
        if stop >= n:
            # This window already covers the end of the signal.
            break
    return spans
def _safe_write(path: str, y: np.ndarray, sr: int):
    """Write ``y`` to ``path`` at sample rate ``sr``, scaling it down if it peaks above 1.0.

    The signal is converted to float32 first. When its absolute peak (plus a
    tiny epsilon) exceeds 1.0 the whole signal is divided by that peak before
    being handed to ``sf.write``; otherwise it is written unchanged.
    """
    audio = np.asarray(y, dtype=np.float32)
    loudest = float(np.max(np.abs(audio)) + 1e-8)
    would_clip = loudest > 1.0
    if would_clip:
        audio = (audio / loudest).astype(np.float32)
    sf.write(path, audio, sr)
def _parse_auto_preset(emotion: str) -> Optional[str]:
    """Extract the preset name from an ``auto`` / ``auto-<preset>`` emotion string.

    Matching ignores case and surrounding whitespace. A bare ``"auto"`` or an
    ``"auto-"`` with nothing after the dash maps to ``"happy"``. Anything that
    is not an auto mode (including an empty or ``None`` value) gives ``None``.
    """
    if not emotion:
        return None

    normalized = emotion.strip().lower()
    prefix, _dash, preset = normalized.partition("-")
    if prefix != "auto":
        return None

    # Covers plain "auto" (no dash, so ``preset`` is empty) and "auto-<name>".
    preset = preset.strip()
    return preset if preset else "happy"
def auto_tune_emotion_all_wavs_under_folder(
    folder: str,
    emotion: str = "auto-angry",
    strength: float = 0.85,
    lang_hint: str = "en",
    win_s: float = 10.0,
    hop_s: float = 9.0,
    xfade_ms: int = 28,
    latency_budget_s: float = 1.0,
    min_confidence: float = 0.40,
    max_iters: int = 6,
    exaggerate: bool = True,
) -> tuple[bool, str]:
    """Auto-tune every ``*.wav`` in ``<folder>/wavs`` toward an emotion preset, in place.

    Each file is read, downmixed to mono, cut into windows of ``win_s``
    seconds starting every ``hop_s`` seconds, and each window is passed to
    ``auto_tune_emotion``. The tuned windows are stitched back together with
    a linear crossfade of ``xfade_ms`` milliseconds and the result overwrites
    the original file.

    Args:
        folder: Directory that contains a ``wavs`` sub-directory.
        emotion: ``"auto"`` or ``"auto-<preset>"``; any other value is rejected.
        strength: Forwarded to ``auto_tune_emotion``.
        lang_hint: Forwarded to ``auto_tune_emotion`` as ``lang``.
        win_s: Window length in seconds.
        hop_s: Distance between window starts in seconds.
        xfade_ms: Crossfade length at each seam, in milliseconds.
        latency_budget_s: Forwarded to ``auto_tune_emotion``.
        min_confidence: Forwarded to ``auto_tune_emotion``.
        max_iters: Forwarded to ``auto_tune_emotion``.
        exaggerate: Forwarded to ``auto_tune_emotion``.

    Returns:
        ``(False, reason)`` when ``emotion`` is not an auto mode, the ``wavs``
        directory is missing, or it holds no ``*.wav`` files. Otherwise
        ``(True, summary)``; files that are empty or that raise while being
        processed are logged and skipped, and are not counted in the summary.
    """
    target = _parse_auto_preset(emotion)
    if target is None:
        return False, f"Emotion '{emotion}' is not an auto-* mode"

    wav_dir = os.path.join(folder, "wavs")
    if not os.path.isdir(wav_dir):
        return False, f"No wavs dir: {wav_dir}"

    paths = sorted(glob.glob(os.path.join(wav_dir, "*.wav")))
    if not paths:
        return False, f"No wav files in {wav_dir}"

    processed = 0
    for p in paths:
        # Best effort per file: one bad file must not abort the whole batch.
        try:
            y, sr = sf.read(p, dtype="float32", always_2d=False)
            y = _downmix_mono(y)
            n = len(y)
            if n == 0:
                logger.warning(f"[EmotionAutoBatch] Empty file skipped: {p}")
                continue

            spans = _segment_indices(n, sr, win_s, hop_s)
            xfade = max(0, int(round(xfade_ms * 1e-3 * sr)))

            out = np.zeros(0, dtype=np.float32)
            last_v, last_a, last_cf = 0.0, 0.0, 0.0
            prev_end = 0  # input index where the previous window stopped
            for (i0, i1) in spans:
                seg = y[i0:i1]
                tuned, meta = auto_tune_emotion(
                    seg, sr,
                    target_preset=target,
                    strength=strength,
                    lang=lang_hint,
                    sentence_times=None,
                    latency_budget_s=latency_budget_s,
                    min_confidence=min_confidence,
                    max_iters=max_iters,
                    exaggerate=exaggerate,
                )
                tuned = np.asarray(tuned, dtype=np.float32)

                final = meta.get("final", {}) or {}
                v = float(final.get("valence", 0.0) or 0.0)
                a = float(final.get("arousal", 0.0) or 0.0)
                cf = float(final.get("confidence", 0.0) or 0.0)
                logger.debug(
                    f"[EmotionAutoBatch] {os.path.basename(p)} [{i0/sr:.2f}-{i1/sr:.2f}s] "
                    f"target={target}{' EXAG' if exaggerate else ''} → "
                    f"v={v:+.2f} a={a:+.2f} conf={cf:.2f}"
                )
                last_v, last_a, last_cf = v, a, cf

                if len(out):
                    # Consecutive windows share (prev_end - i0) input samples,
                    # but the seam only blends `xfade` of them. Anything beyond
                    # that is already present in `out`; appending it again
                    # would repeat audio at every seam and lengthen the file.
                    redundant = max(0, prev_end - i0) - xfade
                    if redundant > 0 and len(seg) > 0:
                        # Scale in case the tuner returned a different number
                        # of samples than it was given.
                        scale = len(tuned) / len(seg)
                        drop = min(len(tuned), int(round(redundant * scale)))
                        tuned = tuned[drop:]
                    out = _xfade(out, tuned, xfade)
                else:
                    out = tuned
                prev_end = i1

            _safe_write(p, out, sr)
            processed += 1
            logger.info(
                f"[EmotionAutoBatch] Auto-tuned {target} ({strength:.2f}) "
                f"{'[EXAG]' if exaggerate else ''} → "
                f"{os.path.basename(p)} | final: v={last_v:+.2f} a={last_a:+.2f} conf={last_cf:.2f}"
            )
        except Exception as e:
            logger.exception(f"[EmotionAutoBatch] Failed '{p}': {e}")

    return True, f"Auto-tuned {processed} file(s) to {target} ({strength:.2f}) with rate clamped."