File size: 1,518 Bytes
427ce39
 
 
6dfc92e
427ce39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Configuration settings for the Whisper Transcription Space

# --- Models ------------------------------------------------------------
# Model identifiers (presumably Hugging Face Hub repo IDs — verify against
# the loading code).
# NOTE(review): the distil-large-v3 model card appears to describe an
# English-only checkpoint — confirm this is intended given the multilingual
# entries in SUPPORTED_LANGUAGES.
WHISPER_MODEL = "distil-whisper/distil-large-v3"
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"

# --- Audio processing --------------------------------------------------
AUDIO_SAMPLE_RATE = 16_000  # presumably Hz
AUDIO_CHANNELS = 1  # single channel (mono)
MAX_AUDIO_DURATION = 10 * 60  # seconds, i.e. 10 minutes

# --- Transcription defaults --------------------------------------------
DEFAULT_BEAM_SIZE = 5
DEFAULT_LANGUAGE = None  # None means "auto-detect"
DEFAULT_TRANSLATE = False

# --- Speaker diarization -----------------------------------------------
MAX_SPEAKERS = 20
DEFAULT_NUM_SPEAKERS = None  # None means "auto-detect"

# --- Segment grouping (both values are in seconds) ---------------------
MAX_SEGMENT_GAP = 1.0
MAX_SEGMENT_DURATION = 30.0

# --- Flash attention ---------------------------------------------------
FLASH_ATTENTION_ENABLED = True
TORCH_DTYPE = "float16"  # dtype is stored by name, as a string

# --- ZeroGPU -----------------------------------------------------------
GPU_MEMORY_FRACTION = 0.8
CUDA_DEVICE = "cuda:0"

# --- Gradio interface --------------------------------------------------
GRADIO_THEME = "soft"
GRADIO_DEBUG = GRADIO_SHARE = False  # both flags are off by default

# --- Environment -------------------------------------------------------
# Holds the *name* of an environment variable, not a token value.
HF_TOKEN_ENV_VAR = "HF_TOKEN"

# Audio file extensions listed as supported; each entry is lowercase and
# includes the leading dot.
SUPPORTED_AUDIO_FORMATS = [
    ".mp3",
    ".wav",
    ".m4a",
    ".flac",
    ".ogg",
    ".aac",
    ".wma",
    ".opus",
    ".webm",
]

# Language code -> human-readable display name.
# The special "auto" key is labelled "Auto-detect"; the remaining keys are
# two-letter codes (they look like ISO 639-1 — confirm against the consumer).
# Insertion order is preserved as written.
SUPPORTED_LANGUAGES = {
    "auto": "Auto-detect",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    "ar": "Arabic",
    "hi": "Hindi",
    "tr": "Turkish",
    "pl": "Polish",
    "nl": "Dutch",
    "sv": "Swedish",
    "da": "Danish",
    "no": "Norwegian",
    "fi": "Finnish",
}