marquesafonso committed on
Commit
fc6dd1b
·
1 Parent(s): fc4371c

add validation for other video formats; double cache size; adapt to device_type change in transcriber

Browse files
main.py CHANGED
@@ -16,10 +16,6 @@ from fastapi.security import HTTPBasic
16
  from pydantic import BaseModel, field_validator
17
  from cachetools import TTLCache
18
 
19
- ## TODO: add word level highlighting option. WIP (Avoid caption char overflow by using a max chars heuristic in transcriber)
20
- ## TODO: prevent double for submission in process_video/
21
- ## TODO: add more video format options
22
- ## TODO: Add Box + Word highlighting mode options
23
  ## TODO: improve UI
24
 
25
  app = FastAPI()
@@ -27,9 +23,9 @@ security = HTTPBasic()
27
  static_dir = os.path.join(os.path.dirname(__file__), 'static')
28
  app.mount("/static", StaticFiles(directory=static_dir), name="static")
29
  templates = Jinja2Templates(directory=static_dir)
30
- cache = TTLCache(maxsize=1024, ttl=600)
31
 
32
- class MP4Video(BaseModel):
33
  video_file: UploadFile
34
 
35
  @property
@@ -41,7 +37,11 @@ class MP4Video(BaseModel):
41
 
42
  @field_validator('video_file')
43
  def validate_video_file(cls, v):
44
- if not v.filename.endswith('.mp4'):
 
 
 
 
45
  raise HTTPException(status_code=500, detail='Invalid video file type. Please upload an MP4 file.')
46
  return v
47
 
@@ -67,24 +67,26 @@ async def get_temp_dir():
67
  HTTPException(status_code=500, detail=str(e))
68
 
69
  @app.post("/transcribe/")
70
- async def transcribe_api(video_file: MP4Video = Depends(),
71
  task: str = Form("transcribe"),
72
  model_version: str = Form("deepdml/faster-whisper-large-v3-turbo-ct2"),
73
  max_words_per_line: int = Form(6),
 
74
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)):
75
  try:
76
  video_path = os.path.join(temp_dir.name, video_file.filename)
77
  with open(video_path, 'wb') as f:
78
  shutil.copyfileobj(video_file.file, f)
79
 
80
- transcription_text, transcription_json = transcriber(video_path, max_words_per_line, task, model_version)
81
 
82
  uid = str(uuid4())
83
  cache[uid] = {
84
  "video_path": video_path,
85
  "transcription_text": transcription_text,
86
  "transcription_json": transcription_json,
87
- "temp_dir_path": temp_dir.name}
 
88
  return RedirectResponse(url=f"/process_settings/?uid={uid}", status_code=303)
89
 
90
  except Exception as e:
@@ -100,7 +102,8 @@ async def process_settings(request: Request, uid: str):
100
  "transcription_text": data["transcription_text"],
101
  "transcription_json": data["transcription_json"],
102
  "video_path": data["video_path"],
103
- "temp_dir_path": data["temp_dir_path"]
 
104
  })
105
 
106
  @app.post("/process_video/")
@@ -114,11 +117,11 @@ async def process_video_api(video_path: str = Form(...),
114
  text_color: Optional[str] = Form("white"),
115
  highlight_mode: Optional[bool] = Form(False),
116
  highlight_color: Optional[str] = Form("LightBlue"),
117
- caption_mode: Optional[str] = Form("desktop"),
118
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)
119
  ):
120
  try:
121
- output_path = process_video(video_path, srt_string, srt_json, fontsize, font, bg_color, text_color, highlight_mode, highlight_color, caption_mode, temp_dir.name)
122
  with open(os.path.join(temp_dir.name, f"{video_path.split('.')[0]}.srt"), 'w+') as temp_srt_file:
123
  logging.info("Processing the video...")
124
  temp_srt_file.write(srt_string)
 
16
  from pydantic import BaseModel, field_validator
17
  from cachetools import TTLCache
18
 
 
 
 
 
19
  ## TODO: improve UI
20
 
21
  app = FastAPI()
 
23
  static_dir = os.path.join(os.path.dirname(__file__), 'static')
24
  app.mount("/static", StaticFiles(directory=static_dir), name="static")
25
  templates = Jinja2Templates(directory=static_dir)
26
+ cache = TTLCache(maxsize=2048, ttl=600)
27
 
28
+ class Video(BaseModel):
29
  video_file: UploadFile
30
 
31
  @property
 
37
 
38
  @field_validator('video_file')
39
  def validate_video_file(cls, v):
40
+ video_extensions = ('.webm', '.mkv', '.flv', '.vob', '.ogv', '.ogg', '.rrc', '.gifv',
41
+ '.mng', '.mov', '.avi', '.qt', '.wmv', '.yuv', '.rm', '.asf', '.amv', '.mp4',
42
+ '.m4p', '.m4v', '.mpg', '.mp2', '.mpeg', '.mpe', '.mpv', '.m4v', '.svi', '.3gp',
43
+ '.3g2', '.mxf', '.roq', '.nsv', '.flv', '.f4v', '.f4p', '.f4a', '.f4b', '.mod')
44
+ if not v.filename.endswith(video_extensions):
45
  raise HTTPException(status_code=500, detail='Invalid video file type. Please upload an MP4 file.')
46
  return v
47
 
 
67
  HTTPException(status_code=500, detail=str(e))
68
 
69
  @app.post("/transcribe/")
70
+ async def transcribe_api(video_file: Video = Depends(),
71
  task: str = Form("transcribe"),
72
  model_version: str = Form("deepdml/faster-whisper-large-v3-turbo-ct2"),
73
  max_words_per_line: int = Form(6),
74
+ device_type: str = Form("desktop"),
75
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)):
76
  try:
77
  video_path = os.path.join(temp_dir.name, video_file.filename)
78
  with open(video_path, 'wb') as f:
79
  shutil.copyfileobj(video_file.file, f)
80
 
81
+ transcription_text, transcription_json = transcriber(video_path, max_words_per_line, task, model_version, device_type)
82
 
83
  uid = str(uuid4())
84
  cache[uid] = {
85
  "video_path": video_path,
86
  "transcription_text": transcription_text,
87
  "transcription_json": transcription_json,
88
+ "temp_dir_path": temp_dir.name,
89
+ "device_type": device_type}
90
  return RedirectResponse(url=f"/process_settings/?uid={uid}", status_code=303)
91
 
92
  except Exception as e:
 
102
  "transcription_text": data["transcription_text"],
103
  "transcription_json": data["transcription_json"],
104
  "video_path": data["video_path"],
105
+ "temp_dir_path": data["temp_dir_path"],
106
+ "device_type": data["device_type"]
107
  })
108
 
109
  @app.post("/process_video/")
 
117
  text_color: Optional[str] = Form("white"),
118
  highlight_mode: Optional[bool] = Form(False),
119
  highlight_color: Optional[str] = Form("LightBlue"),
120
+ device_type: Optional[str] = Form("desktop"),
121
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)
122
  ):
123
  try:
124
+ output_path = process_video(video_path, srt_string, srt_json, fontsize, font, bg_color, text_color, highlight_mode, highlight_color, device_type, temp_dir.name)
125
  with open(os.path.join(temp_dir.name, f"{video_path.split('.')[0]}.srt"), 'w+') as temp_srt_file:
126
  logging.info("Processing the video...")
127
  temp_srt_file.write(srt_string)
static/process_settings.html CHANGED
@@ -165,17 +165,12 @@
165
  <select id="highlight_color" name="highlight_color">
166
  <option>Loading colors...</option>
167
  </select>
168
-
169
- <label for="caption_mode">Caption mode</label>
170
- <select name="caption_mode">
171
- <option value="desktop">Desktop</option>
172
- <option value="mobile">Mobile</option>
173
- </select>
174
  </div>
175
  </div>
176
 
177
  <input type="hidden" name="video_path" value="{{ video_path }}">
178
  <input type="hidden" name="temp_dir_path" value="{{ temp_dir_path }}">
 
179
  <input type="submit" name="submitButton" value="Submit">
180
  </form>
181
 
 
165
  <select id="highlight_color" name="highlight_color">
166
  <option>Loading colors...</option>
167
  </select>
 
 
 
 
 
 
168
  </div>
169
  </div>
170
 
171
  <input type="hidden" name="video_path" value="{{ video_path }}">
172
  <input type="hidden" name="temp_dir_path" value="{{ temp_dir_path }}">
173
+ <input type="hidden" name="device_type" value="{{ device_type }}">
174
  <input type="submit" name="submitButton" value="Submit">
175
  </form>
176
 
static/transcribe_video.html CHANGED
@@ -8,6 +8,8 @@
8
  form { background: white; padding: 2rem; border-radius: 10px; max-width: 600px; margin: auto; }
9
  label, select, input { display: block; width: 100%; margin-bottom: 1rem; }
10
  input[type="submit"] { background: #4CAF50; color: white; padding: 0.8rem; border: none; cursor: pointer; }
 
 
11
  </style>
12
  </head>
13
  <body>
@@ -32,6 +34,12 @@
32
 
33
  <label for="max_words_per_line">Max words per line</label>
34
  <input type="number" name="max_words_per_line" id="max_words_per_line" value="6">
 
 
 
 
 
 
35
 
36
  <div id="loading" style="display:none; text-align: center; margin-top: 10px; margin-bottom: 10px; font-weight: bold;">
37
  <i class="fas fa-spinner fa-spin"></i> Processing, please wait...
 
8
  form { background: white; padding: 2rem; border-radius: 10px; max-width: 600px; margin: auto; }
9
  label, select, input { display: block; width: 100%; margin-bottom: 1rem; }
10
  input[type="submit"] { background: #4CAF50; color: white; padding: 0.8rem; border: none; cursor: pointer; }
11
+ .radio-container { display: flex; gap: 2rem; margin-bottom: 1rem;}
12
+ .radio-option { display: flex; flex-direction: column; align-items: flex-start;}
13
  </style>
14
  </head>
15
  <body>
 
34
 
35
  <label for="max_words_per_line">Max words per line</label>
36
  <input type="number" name="max_words_per_line" id="max_words_per_line" value="6">
37
+
38
+ <label for="device_type">Device Type</label>
39
+ <select name="device_type">
40
+ <option value="desktop">Desktop</option>
41
+ <option value="mobile">Mobile</option>
42
+ </select>
43
 
44
  <div id="loading" style="display:none; text-align: center; margin-top: 10px; margin-bottom: 10px; font-weight: bold;">
45
  <i class="fas fa-spinner fa-spin"></i> Processing, please wait...
utils/process_video.py CHANGED
@@ -10,12 +10,12 @@ def process_video(invideo_file: str,
10
  text_color:str,
11
  highlight_mode: bool,
12
  highlight_color: str,
13
- caption_mode:str,
14
  temp_dir: str
15
  ):
16
  invideo_path_parts = os.path.normpath(invideo_file).split(os.path.sep)
17
  VIDEO_NAME = os.path.basename(invideo_file)
18
  OUTVIDEO_PATH = os.path.join(os.path.normpath('/'.join(invideo_path_parts[:-1])), f"result_{VIDEO_NAME}")
19
  logging.info("Subtitling...")
20
- subtitler(invideo_file, srt_string, srt_json, OUTVIDEO_PATH, fontsize, font, bg_color, text_color, highlight_mode, highlight_color, caption_mode, temp_dir)
21
  return OUTVIDEO_PATH
 
10
  text_color:str,
11
  highlight_mode: bool,
12
  highlight_color: str,
13
+ device_type:str,
14
  temp_dir: str
15
  ):
16
  invideo_path_parts = os.path.normpath(invideo_file).split(os.path.sep)
17
  VIDEO_NAME = os.path.basename(invideo_file)
18
  OUTVIDEO_PATH = os.path.join(os.path.normpath('/'.join(invideo_path_parts[:-1])), f"result_{VIDEO_NAME}")
19
  logging.info("Subtitling...")
20
+ subtitler(invideo_file, srt_string, srt_json, OUTVIDEO_PATH, fontsize, font, bg_color, text_color, highlight_mode, highlight_color, device_type, temp_dir)
21
  return OUTVIDEO_PATH
utils/subtitler.py CHANGED
@@ -18,11 +18,11 @@ def parse_srt(srt_string):
18
  i += 1
19
  return subtitles
20
 
21
- def filter_caption_width(caption_mode:str):
22
- if caption_mode == 'desktop':
23
  caption_width_ratio = 0.5
24
  caption_height_ratio = 0.8
25
- elif caption_mode == 'mobile':
26
  caption_width_ratio = 0.2
27
  caption_height_ratio = 0.7
28
  return caption_width_ratio, caption_height_ratio
@@ -38,7 +38,7 @@ def subtitler(video_file: str,
38
  text_color: str,
39
  highlight_mode: bool,
40
  highlight_color: str,
41
- caption_mode: str,
42
  temp_dir: str
43
  ):
44
  """Add subtitles to a video, with optional word-level highlighting."""
@@ -49,7 +49,7 @@ def subtitler(video_file: str,
49
 
50
  subtitle_clips = []
51
 
52
- caption_width_ratio, caption_height_ratio = filter_caption_width(caption_mode)
53
  subtitle_y_position = clip.h * caption_height_ratio
54
  if highlight_mode:
55
  srt_data = json.loads(json.dumps(eval(srt_json)))
 
18
  i += 1
19
  return subtitles
20
 
21
+ def filter_caption_width(device_type:str):
22
+ if device_type == 'desktop':
23
  caption_width_ratio = 0.5
24
  caption_height_ratio = 0.8
25
+ elif device_type == 'mobile':
26
  caption_width_ratio = 0.2
27
  caption_height_ratio = 0.7
28
  return caption_width_ratio, caption_height_ratio
 
38
  text_color: str,
39
  highlight_mode: bool,
40
  highlight_color: str,
41
+ device_type: str,
42
  temp_dir: str
43
  ):
44
  """Add subtitles to a video, with optional word-level highlighting."""
 
49
 
50
  subtitle_clips = []
51
 
52
+ caption_width_ratio, caption_height_ratio = filter_caption_width(device_type)
53
  subtitle_y_position = clip.h * caption_height_ratio
54
  if highlight_mode:
55
  srt_data = json.loads(json.dumps(eval(srt_json)))
utils/transcriber.py CHANGED
@@ -5,7 +5,8 @@ from dotenv import load_dotenv
5
  def transcriber(invideo_file:str,
6
  max_words_per_line:int,
7
  task:str,
8
- model_version:str
 
9
  ):
10
  load_dotenv()
11
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -17,6 +18,7 @@ def transcriber(invideo_file:str,
17
  max_words_per_line=max_words_per_line,
18
  task=task,
19
  model_version=model_version,
 
20
  api_name="/predict"
21
  )
22
  return result[0], result[3]
 
5
  def transcriber(invideo_file:str,
6
  max_words_per_line:int,
7
  task:str,
8
+ model_version:str,
9
+ device_type:str
10
  ):
11
  load_dotenv()
12
  HF_TOKEN = os.getenv("HF_TOKEN")
 
18
  max_words_per_line=max_words_per_line,
19
  task=task,
20
  model_version=model_version,
21
+ device_type=device_type,
22
  api_name="/predict"
23
  )
24
  return result[0], result[3]