Spaces:

ImparkAI
/

EduBert-Impark

Sleeping

App Files Files Community

eraydikyologlu committed on Aug 5

Commit

ece1c42

1 Parent(s): 943f366

Video dosyalarının paralel işlenmesi için yeni bir işlev eklendi. Thread havuzu kullanılarak video transkripsiyonu optimize edildi ve hata yönetimi geliştirildi. Ayrıca, işlem süresi ve sonuç sayısı hakkında bilgilendirici çıktılar eklendi.

Browse files

Files changed (1) hide show

main-videopluskazanim.py +77 -35

main-videopluskazanim.py CHANGED Viewed

@@ -35,6 +35,8 @@ from typing import List, Dict, Optional
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 import functools
 import kazanim_id_konu_isim_dict_list as kazanimlar
 import logging
 import whisper
@@ -137,6 +139,50 @@ logging.basicConfig(stream=sys.stdout,
                     level=logging.INFO,
                     format="%(asctime)s  %(levelname)s  %(message)s")
 @app.post("/predict", response_model=PredictResponse)
 async def predict(req: PredictRequest):
     t0 = time.time()
@@ -177,9 +223,9 @@ async def predict(req: PredictRequest):
 async def transcribe_videos(files: List[UploadFile] = File(...),
                           model_name: str = "small",
                           language: str = "Turkish"):
-    """Video dosyalarını metne çevir"""
     t0 = time.time()
-    print(f"new whisper request /model = {model_name} / n = {len(files)}")
     try:
         if not files:
@@ -188,44 +234,40 @@ async def transcribe_videos(files: List[UploadFile] = File(...),
         # Whisper modelini yükle
         model = load_whisper_model(model_name)
-        results = []
-        for file in files:
-            if not file.filename.lower().endswith(('.mp4', '.wav', '.mp3', '.m4a', '.flac')):
-                continue
-            # Geçici dosya oluştur
-            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
-                content = await file.read()
-                temp_file.write(content)
-                temp_file_path = temp_file.name
-            try:
-                # Whisper ile transkript et
-                result = model.transcribe(temp_file_path, language=language.lower(), verbose=False)
-                text = result['text'].strip()
-                results.append(VideoResult(
-                    id=file.filename,
-                    text=text
-                ))
-            except Exception as e:
-                print(f"Video işleme hatası ({file.filename}): {e}")
-                results.append(VideoResult(
-                    id=file.filename,
-                    text=""
-                ))
-            finally:
-                # Geçici dosyayı temizle
-                if os.path.exists(temp_file_path):
-                    os.unlink(temp_file_path)
         dt = time.time() - t0
-        print(f"✅ Whisper done | took {dt:.2f}s")
-        print(f"Tamamlandı: {len(results)} video transkript edildi")
-        return WhisperResponse(model=model_name, results=results)
     except Exception as e:
         print(f"Whisper Hatası: {e}")

 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 import functools
+import asyncio
+import concurrent.futures
 import kazanim_id_konu_isim_dict_list as kazanimlar
 import logging
 import whisper
                     level=logging.INFO,
                     format="%(asctime)s  %(levelname)s  %(message)s")
+# Thread pool for parallel processing - HF Space friendly
+# The worker count is kept low so the pool is not too aggressive for an HF Space
+executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
+async def process_single_video(file: UploadFile, model, language: str) -> VideoResult:
+    """Tek bir video dosyasını işle - paralel kullanım için"""
+    if not file.filename.lower().endswith(('.mp4', '.wav', '.mp3', '.m4a', '.flac')):
+        return VideoResult(id=file.filename, text="")
+    # Geçici dosya oluştur
+    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
+        content = await file.read()
+        temp_file.write(content)
+        temp_file_path = temp_file.name
+    def transcribe_sync():
+        """Senkron transcription - thread pool'da çalışacak - HF Space optimized"""
+        try:
+            # HF Space için daha conservative ayarlar
+            result = model.transcribe(
+                temp_file_path,
+                language=language.lower(),
+                verbose=False,
+                fp16=False,  # HF Space'te daha stabil
+                temperature=0.0  # Deterministic output
+            )
+            return result['text'].strip()
+        except Exception as e:
+            print(f"Video işleme hatası ({file.filename}): {e}")
+            return ""
+        finally:
+            # Geçici dosyayı temizle
+            if os.path.exists(temp_file_path):
+                try:
+                    os.unlink(temp_file_path)
+                except:
+                    pass  # HF Space'te silme hatası olabilir
+    # Thread pool'da transcription çalıştır
+    loop = asyncio.get_event_loop()
+    text = await loop.run_in_executor(executor, transcribe_sync)
+    return VideoResult(id=file.filename, text=text)
 @app.post("/predict", response_model=PredictResponse)
 async def predict(req: PredictRequest):
     t0 = time.time()
 async def transcribe_videos(files: List[UploadFile] = File(...),
                           model_name: str = "small",
                           language: str = "Turkish"):
+    """Video dosyalarını metne çevir - PARALEL İŞLEME"""
     t0 = time.time()
+    print(f"🚀 new whisper request /model = {model_name} / n = {len(files)} - PARALEL İŞLEME BAŞLIYOR")
     try:
         if not files:
         # Whisper modelini yükle
         model = load_whisper_model(model_name)
+        # HF SPACE İÇİN CHUNK'LI PARALEL İŞLEME
+        chunk_size = 16  # HF Space için güvenli chunk boyutu
+        final_results = []
+        print(f"📡 {len(files)} dosya {chunk_size}'lı chunk'larda paralel işlenecek...")
+        # Dosyaları chunk'lara böl ve her chunk'ı paralel işle
+        for i in range(0, len(files), chunk_size):
+            chunk = files[i:i + chunk_size]
+            print(f"🔄 Chunk {i//chunk_size + 1}: {len(chunk)} dosya işleniyor...")
+            # Bu chunk'ı paralel işle
+            tasks = [process_single_video(file, model, language) for file in chunk]
+            chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
+            # Exception'ları handle et
+            for j, result in enumerate(chunk_results):
+                if isinstance(result, Exception):
+                    print(f"❌ Dosya {chunk[j].filename} işlenirken hata: {result}")
+                    final_results.append(VideoResult(id=chunk[j].filename, text=""))
+                else:
+                    final_results.append(result)
+            print(f"✅ Chunk {i//chunk_size + 1} tamamlandı!")
+            # Memory'yi rahatlatmak için küçük bir bekleme (HF Space için)
+            if i + chunk_size < len(files):
+                await asyncio.sleep(0.1)
         dt = time.time() - t0
+        print(f"✅ Whisper PARALEL done | took {dt:.2f}s")
+        print(f"🎯 Tamamlandı: {len(final_results)} video PARALEL olarak transkript edildi")
+        return WhisperResponse(model=model_name, results=final_results)
     except Exception as e:
         print(f"Whisper Hatası: {e}")