import torch
import requests
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pydantic import BaseModel
from fastapi import FastAPI


class URLPayload(BaseModel):
    url: str


app = FastAPI()


def process_audio(url: str):
    # Download the audio file to local storage.
    response = requests.get(url)
    response.raise_for_status()
    with open("/data/audio.mp3", mode="wb") as file:
        file.write(response.content)

    # Fall back to CPU (and full precision) when no GPU is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if device == "cuda" else torch.float32

    # NOTE: the model is reloaded on every request here; a production service
    # would load it once at startup and reuse it.
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,  # Whisper caps generation at 448 tokens per chunk; 8192 would be rejected
        chunk_length_s=30,   # split long audio into 30-second chunks
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

    whisper_result = pipe("/data/audio.mp3", generate_kwargs={"language": "polish"})
    return whisper_result


@app.post("/process/")
async def process_audio_endpoint(payload: URLPayload):
    result = process_audio(payload.url)
    return result
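
# A minimal way to exercise the endpoint, assuming the file is saved as
# main.py and served with uvicorn (both the module name and the example URL
# are placeholders; adjust them to your project):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/process/ \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/audio.mp3"}'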