# main.py - Hugging Face Spaces API: ders_id -> model mapping -> batch inference -> kazanım ID
#
# Requirements (requirements.txt):
#   fastapi transformers torch pydantic uvicorn tensorflow openai-whisper
#
# Directory layout within the Space repo:
#   - main.py (this file)
#   - model_mapping.json
#   - kazanim_id_konu_isim_dict_list.py
#
# 📌 Endpoints:
#   POST /predict {"model_name": "eraydikyologlu/bert_ayt_matematik", "inputs": ["soru1", "soru2", ...]}
#     → {"model": "...", "results": [{"label": "LABEL_0", "score": 0.97}, ...]}
import os
import sys
import time
import asyncio
import logging
import tempfile
import functools

logger = logging.getLogger("uvicorn")
logger.setLevel(logging.INFO)
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")

# Redirect the Hugging Face cache to a writable directory
# (must be set before transformers/torch are imported)
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/.cache/huggingface"
os.environ["TRANSFORMERS_VERBOSITY"] = "info"
os.environ["HF_HUB_DISABLE_BIN_TO_SAFETENSORS_CONVERSION"] = "1"

# Caches for Whisper and everything else
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
os.environ["TORCH_HOME"] = "/tmp/.cache/torch"

# Dedicated cache directory for Whisper models
whisper_cache_dir = "/tmp/.cache/whisper"
os.makedirs(whisper_cache_dir, exist_ok=True)
os.environ["WHISPER_CACHE_DIR"] = whisper_cache_dir

# Silence TensorFlow logging if it happens to be installed
try:
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
except ImportError:
    pass

from fastapi import FastAPI, HTTPException, UploadFile, File
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import whisper

# Worker pools (concurrent.futures, threading) were removed - processing is
# strictly sequential; asyncio remains only for the inter-file sleep below
import kazanim_id_konu_isim_dict_list as kazanimlar
app = FastAPI(title="Edu-BERT Multi-Model API")

# Hugging Face Spaces' free tier is CPU-only, so device is usually -1 (CPU)
device = 0 if torch.cuda.is_available() else -1
print(f"🧠 torch: {torch.__version__}, cuda available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🚀 CUDA device name: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ CUDA not available, using CPU.")
# ---------- Pydantic Schemas ---------- #
class PredictRequest(BaseModel):
    model_name: str = Field(..., description="Model name (e.g. eraydikyologlu/bert_ayt_matematik)")
    inputs: List[str] = Field(..., description="List of question texts")

class WhisperRequest(BaseModel):
    model_name: str = Field(default="small", description="Whisper model name (tiny, base, small, medium, large)")
    language: str = Field(default="tr", description="Language")
    batch_size: int = Field(default=8, description="Batch size")

class QuestionResult(BaseModel):
    label: str
    score: float

class VideoResult(BaseModel):
    id: str
    text: str

class PredictResponse(BaseModel):
    model: str
    results: List[QuestionResult]

class WhisperResponse(BaseModel):
    model: str
    results: List[VideoResult]
# ---------- Helpers ---------- #
@functools.lru_cache(maxsize=4)  # cache loaded pipelines so repeat requests skip reloading; health_check reads cache_info()
def load_pipeline(model_name: str):
    """Load a text-classification pipeline - minimal approach"""
    try:
        print(f"Loading model: {model_name}")
        #base_tok = "umutarpayy/tyt_turkce_bert"
        #model_name = "eraydikyologlu/tyt_turkce_bert_pt"
        # Load with default parameters only, matching the known-working local setup
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
        print(f"Model loaded successfully: {model_name}")
        return classifier
    except Exception as e:
        print(f"Model load error ({model_name}): {e}")
        raise HTTPException(status_code=500, detail=f"Model load error: {str(e)}")
@functools.lru_cache(maxsize=2)  # cache loaded Whisper models; health_check reads cache_info()
def load_whisper_model(model_name: str):
    """Load an openai-whisper model - OLD STABLE VERSION"""
    try:
        print(f"Loading openai-whisper model: {model_name}")
        # CPU/GPU device selection
        whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Device: {whisper_device}")
        # Load the openai-whisper model; download_root pins the cache explicitly
        # (openai-whisper does not read the WHISPER_CACHE_DIR env var itself)
        model = whisper.load_model(model_name, device=whisper_device, download_root=whisper_cache_dir)
        print(f"✅ openai-whisper model loaded successfully: {model_name} on {whisper_device}")
        return model
    except Exception as e:
        print(f"openai-whisper model load error ({model_name}): {e}")
        raise HTTPException(status_code=500, detail=f"openai-whisper model load error: {str(e)}")
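# Note on checkpoint choice: openai-whisper ships "tiny", "base", "small",
# "medium" and "large" checkpoints (plus ".en" English-only variants). Larger
# checkpoints are more accurate but markedly slower, which matters on the
# CPU-only Spaces hardware; "small" (the default here) is a middle ground.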
# NO WORKERS - direct sequential processing with openai-whisper
async def prepare_video_file(file: UploadFile) -> tuple[str, str]:
    """Save the uploaded video to a temporary location, with validation"""
    if not file.filename.lower().endswith(('.mp4', '.wav', '.mp3', '.m4a', '.flac')):
        return file.filename, ""
    # Create a temporary file
    original_ext = os.path.splitext(file.filename)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=original_ext) as temp_file:
        content = await file.read()
        temp_file.write(content)
        temp_file_path = temp_file.name
    # FILE VALIDATION: size check
    file_size = len(content)
    if file_size < 1000:  # anything under ~1 KB is treated as corrupt
        print(f"❌ {file.filename}: file too small ({file_size} bytes)")
        os.unlink(temp_file_path)
        return file.filename, ""
    print(f"✅ {file.filename}: file is valid ({file_size} bytes)")
    return file.filename, temp_file_path
def process_single_video_sync(file_path: str, filename: str, model, language: str) -> VideoResult:
    """Process a single video file - DIRECT SEQUENTIAL (no workers) - OLD STABLE VERSION"""
    try:
        print(f"🔄 {filename}: transcribing with openai-whisper...")
        # Transcription via the old, stable openai-whisper API
        result = model.transcribe(file_path, language=language)
        text = result['text'].strip()
        # Log a snippet of the model output (for debugging)
        preview = text[:150] + "..." if len(text) > 150 else text
        print(f"📝 {filename}: {preview}")
        return VideoResult(id=filename, text=text)
    except Exception as e:
        print(f"❌ Video processing error ({filename}): {e}")
        return VideoResult(id=filename, text="")
    finally:
        # Clean up the temporary file
        if os.path.exists(file_path):
            try:
                os.unlink(file_path)
            except OSError:
                pass
@app.post("/predict", response_model=PredictResponse)
async def predict(req: PredictRequest):
    """Main endpoint - inference by model_name"""
    t0 = time.time()
    print(f"new request /model = {req.model_name} / n = {len(req.inputs)}")
    try:
        if not req.inputs:
            raise HTTPException(status_code=400, detail="inputs must not be empty")
        # Load the pipeline
        classifier = load_pipeline(req.model_name)
        # Batched inference
        outputs = classifier(req.inputs, truncation=True, padding=True, batch_size=8)
        dt = time.time() - t0
        print(f"✅ done | took {dt:.2f}s")
        results = []
        for out in outputs:
            label = out["label"]
            score = float(out["score"])
            results.append(QuestionResult(
                label=label,
                score=score
            ))
        print(f"Finished: {len(results)} results")
        return PredictResponse(model=req.model_name, results=results)
    except HTTPException:
        raise  # keep 4xx responses intact instead of collapsing them into 500
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@app.post("/whisper-single", response_model=WhisperResponse)
async def transcribe_single_video(file: UploadFile = File(...),
                                  model_name: str = "small",
                                  language: str = "tr",
                                  ustkurumid: Optional[str] = None,
                                  testid: Optional[str] = None,
                                  soruno: Optional[str] = None):
    """SINGLE VIDEO processing - one file at a time, no batching bugs"""
    t0 = time.time()
    print(f"🎯 SINGLE VIDEO processing starting: {file.filename}")
    print(f"📊 Request info: ustkurumid={ustkurumid}, testid={testid}, soruno={soruno}")
    try:
        if not file.filename.lower().endswith(('.mp4', '.wav', '.mp3', '.m4a', '.flac')):
            raise HTTPException(status_code=400, detail="Unsupported file format")
        # Load the openai-whisper model
        model = load_whisper_model(model_name)
        # Prepare the file
        print(f"📁 Preparing file: {file.filename}")
        file_name, file_path = await prepare_video_file(file)
        if not file_path:
            return WhisperResponse(model=model_name, results=[VideoResult(id=file_name, text="")])
        print(f"🚀 Transcription starting: {file_name}")
        # SINGLE VIDEO transcription with openai-whisper - OLD STABLE VERSION
        result = model.transcribe(file_path, language=language)
        text = result['text'].strip()
        # Log a snippet of the model output
        preview = text[:150] + "..." if len(text) > 150 else text
        print(f"📝 {file_name}: {preview}")
        # Clean up the temporary file
        try:
            os.unlink(file_path)
        except OSError:
            pass
        dt = time.time() - t0
        print(f"✅ SINGLE VIDEO finished | took {dt:.2f}s")
        return WhisperResponse(model=model_name, results=[VideoResult(id=file_name, text=text)])
    except HTTPException:
        raise
    except Exception as e:
        print(f"❌ Single video error: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Single video error: {str(e)}")
@app.post("/whisper", response_model=WhisperResponse)
async def transcribe_videos(files: List[UploadFile] = File(...),
                            model_name: str = "small",
                            language: str = "tr",
                            ustkurumid: Optional[str] = None,
                            testid: Optional[str] = None,
                            soruno: Optional[str] = None):
    """Transcribe video files to text - ONE BY ONE (avoids the batching bugs)"""
    t0 = time.time()
    print(f"🎯 whisper request /model = {model_name} / n = {len(files)} - SEQUENTIAL PROCESSING")
    print(f"📊 Batch request info: ustkurumid={ustkurumid}, testid={testid}, soruno={soruno}")
    try:
        if not files:
            raise HTTPException(status_code=400, detail="Video file list must not be empty")
        # Load the openai-whisper model
        model = load_whisper_model(model_name)
        # 🎯 SEQUENTIAL, ONE-BY-ONE PROCESSING
        final_results = []
        print(f"📝 {len(files)} files will be processed one by one, in order...")
        # Process each file in turn
        for i, file in enumerate(files):
            print(f"🔄 Video {i+1}/{len(files)}: processing {file.filename}...")
            try:
                # Prepare the file
                file_name, file_path = await prepare_video_file(file)
                if not file_path:
                    final_results.append(VideoResult(id=file_name, text=""))
                    continue
                # ONE-BY-ONE processing - DIRECT SEQUENTIAL
                result = process_single_video_sync(file_path, file_name, model, language)
                final_results.append(result)
                # Log the outcome
                if result.text.strip():
                    print(f"✅ {file.filename}: success!")
                else:
                    print(f"❌ {file.filename}: empty result!")
            except Exception as e:
                print(f"❌ {file.filename}: processing error: {e}")
                final_results.append(VideoResult(id=file.filename, text=""))
            # Short pause between files (for stability)
            if i < len(files) - 1:
                await asyncio.sleep(0.5)
        dt = time.time() - t0
        # Final statistics
        total_success = len([r for r in final_results if r.text.strip()])
        total_failed = len(final_results) - total_success
        success_rate = (total_success / len(final_results) * 100) if final_results else 0
        print(f"✅ Whisper SEQUENTIAL done | took {dt:.2f}s")
        print(f"🎯 RESULT: {len(final_results)} videos | ✅{total_success} ok | ❌{total_failed} failed | 📊{success_rate:.1f}% success rate")
        print(f"⚡ SEQUENTIAL THROUGHPUT: {len(final_results)/dt:.1f} videos/second")
        return WhisperResponse(model=model_name, results=final_results)
    except HTTPException:
        raise
    except Exception as e:
        print(f"Whisper error: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Whisper error: {str(e)}")
@app.get("/")
def root():
    return {"status": "ok", "message": "Edu-BERT API is running"}
@app.get("/health")
def health_check():
    """Health-check endpoint"""
    try:
        # Determine which hardware we are running on
        if device == -1:
            device_info = "CPU"
        else:
            gpu_name = torch.cuda.get_device_name(0)
            device_info = f"GPU: {gpu_name}"
        bert_models = load_pipeline.cache_info().currsize if hasattr(load_pipeline, 'cache_info') else 0
        whisper_models = load_whisper_model.cache_info().currsize if hasattr(load_whisper_model, 'cache_info') else 0
        return {
            "status": "healthy",
            "device": device_info,
            "bert_models_loaded": bert_models,
            "whisper_models_loaded": whisper_models,
            "endpoints": ["/predict", "/whisper", "/whisper-single", "/health"],
            "processing_mode": "SEQUENTIAL - openai-whisper OLD STABLE VERSION"
        }
    except Exception as e:
        return {"status": "error", "message": f"Health check error: {str(e)}"}
# Local debug (optional)
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
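
# Minimal client sketch (illustrative, not part of the Space): exercises
# /predict and /whisper-single against a locally running instance on the
# default Spaces port 7860. The model name and "lecture.mp4" are placeholders.
#
# import requests
#
# BASE = "http://localhost:7860"
#
# # Text classification: batch of questions against a named Hub model
# r = requests.post(f"{BASE}/predict", json={
#     "model_name": "eraydikyologlu/bert_ayt_matematik",
#     "inputs": ["soru 1 metni", "soru 2 metni"],
# })
# for item in r.json()["results"]:
#     print(item["label"], item["score"])
#
# # Transcription: single video upload, Turkish, "small" checkpoint
# with open("lecture.mp4", "rb") as f:
#     r = requests.post(f"{BASE}/whisper-single",
#                       params={"model_name": "small", "language": "tr"},
#                       files={"file": ("lecture.mp4", f, "video/mp4")})
# print(r.json()["results"][0]["text"])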