Spaces:

rbcurzon
/

speech-to-text

Running

App Files Files Community

speech-to-text / app.py

rbcurzon

Update app.py

83733e7 verified 8 months ago

raw

history blame

2.34 kB

	# -*- coding: utf-8 -*-
	"""main.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/17Umb-Po_5pESiRv3-dcDRyootgqBjjWM
	"""

	import torch
	from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

	# Run inference on the first CUDA GPU when one is visible, otherwise on CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Build the speech-recognition pipeline once at import time; `model_id`
	# names the Whisper checkpoint that gets loaded.
	model_id = "rbcurzon/whisper-small-fil"
	pipe = pipeline(task="automatic-speech-recognition", model=model_id, device=device)

	"""FastAPI"""

	import os
	import io
	from fastapi import FastAPI, WebSocket, UploadFile, File
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from google import genai
	from google.genai import types

	# The API key is read from the GENAI_API_KEY environment variable; never
	# hard-code or share it.
	client = genai.Client(api_key=os.getenv("GENAI_API_KEY"))

	def translate(text: str, srcLang: str, tgtLang: str):
	    """Translate ``text`` from ``srcLang`` to ``tgtLang`` using the GenAI client.

	    Args:
	        text: The text to translate.
	        srcLang: Name of the language ``text`` is written in; inserted
	            verbatim into the prompt.
	        tgtLang: Name of the language to translate into; inserted verbatim
	            into the prompt.

	    Returns:
	        The ``text`` attribute of the model response, or ``""`` when
	        ``text`` is empty or whitespace-only (no request is sent then).
	    """
	    # Nothing to translate: skip the API round trip instead of asking the
	    # model to translate an empty prompt.
	    if not text or not text.strip():
	        return ""

	    sys_instruct = "You are a professional translator. Do not give explanation."
	    response = client.models.generate_content(
	        model="gemini-2.0-flash",
	        config=types.GenerateContentConfig(system_instruction=sys_instruct),
	        contents=f"Translate the following from {srcLang} to {tgtLang}. Return nothing but the {tgtLang} translation: {text} ",
	    )
	    # Debug aid: dumps the raw API response object to stdout.
	    print(response)
	    return response.text

	import shutil
	from pathlib import Path
	from tempfile import NamedTemporaryFile
	from typing import Annotated

	import aiofiles
	from fastapi import File, Form, HTTPException, UploadFile

	# def save_upload_file_tmp(upload_file: UploadFile) -> Path:


	# ASGI application object; the title and description are passed straight
	# to the FastAPI constructor as application metadata.
	app = FastAPI(
	    description="Process and transcribe audio in real-time using Whisper",
	    title="Real-Time Audio Processor",
	)

	@app.post("/test/")
	async def test(file: UploadFile = File(...),
	               srcLang: str = Form(...),
	               tgtLang: str = Form(...)):
	    """Transcribe an uploaded audio file and translate the transcript.

	    Args:
	        file: Uploaded audio, sent as the multipart form field ``file``.
	        srcLang: Source-language name, forwarded to ``translate``.
	        tgtLang: Target-language name, forwarded to ``translate``.

	    Returns:
	        A dict with ``transcribed_text`` (the ``text`` entry of the pipeline
	        result) and ``translated_text`` (the value returned by ``translate``).

	    Raises:
	        HTTPException: With status 400 when the uploaded file is empty.
	    """
	    # Keep the upload in memory only. The pipeline is fed the raw bytes, so
	    # nothing needs to be written to disk; the client-supplied filename is
	    # untrusted and must never be used as a server-side path.
	    content = await file.read()
	    if not content:
	        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

	    # NOTE(review): pipe() and translate() are synchronous calls made from
	    # an async handler, so the handler does not yield while they run.
	    # The transcription language is fixed to "tagalog" regardless of srcLang.
	    result = pipe(
	        content,
	        max_new_tokens=256,
	        chunk_length_s=30,
	        batch_size=8,
	        generate_kwargs={"task": "transcribe", "language": "tagalog"},
	    )
	    transcript = result["text"]
	    translated = translate(transcript, srcLang=srcLang, tgtLang=tgtLang)
	    return {"transcribed_text": transcript, "translated_text": translated}