# ViDove / pipeline.py
import openai
from pytube import YouTube
import argparse
import os
import io
import whisper
import ffmpeg
from srt2ass import srt2ass
parser = argparse.ArgumentParser()
parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
parser.add_argument("--local_path", help="local video path here", default=None, type=str, required=False)
parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str, required=False) # New argument
parser.add_argument("--download", help="download path", default='./downloads', type=str, required=False)
parser.add_argument("--output_dir", help="translate result path", default='./results', type=str, required=False)
parser.add_argument("--video_name", help="video name", default='placeholder', type=str, required=False)
parser.add_argument("--model_name", help="model name only support text-davinci-003 and gpt-3.5-turbo", type=str, required=False, default="gpt-3.5-turbo")
args = parser.parse_args()
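# Example invocations (illustrative only; the YouTube id and file paths below are placeholders,
# and OPENAI_API_KEY is assumed to be set in the environment):
#   python pipeline.py --link "https://www.youtube.com/watch?v=<video_id>"
#   python pipeline.py --local_path ./downloads/clip.mp4 --video_name clip
#   python pipeline.py --srt_file ./results/clip_en.srt --video_name clip --model_name gpt-3.5-turbo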
# The input must be a YouTube link, a local video file, or an SRT file.
if args.link is None and args.local_path is None and args.srt_file is None:
    print("need video source or srt file")
    exit()
openai.api_key = os.getenv("OPENAI_API_KEY")
DOWNLOAD_PATH = args.download
RESULT_PATH = args.output_dir
VIDEO_NAME = args.video_name
n_threshold = 1000 # max chunk size in characters (rough stand-in for the model token limit; overridden below)
# model_name = "text-davinci-003" # replace this with our own fine-tuned model
model_name = args.model_name
# model_name = "davinci"
# get source audio
if args.link is not None and args.local_path is None:
    # Download audio from YouTube
    video_link = args.link
    try:
        video = YouTube(video_link)
        audio = video.streams.filter(only_audio=True, file_extension='mp4').first()
        audio.download(DOWNLOAD_PATH)
        print('Download Completed!')
    except Exception as e:
        print("Connection Error")
        print(e)
        exit()  # without the download there is no audio to transcribe
    audio_path = '{}/{}'.format(DOWNLOAD_PATH, audio.default_filename)
    audio_file = open(audio_path, "rb")
    VIDEO_NAME = audio.default_filename.split('.')[0]
elif args.local_path is not None:
    # Read from a local file
    audio_file = open(args.local_path, "rb")
    audio_path = args.local_path
# Instead of using the script_en variable directly, we'll use script_input
srt_file_en = args.srt_file
if srt_file_en is not None:
    with open(srt_file_en, 'r') as f:
        script_input = f.read()
else:
    # use Whisper to perform speech-to-text and save the result as <video name>_en.srt under RESULT_PATH
    srt_file_en = "{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME)
    if not os.path.exists(srt_file_en):
        # use OpenAI API for transcription
        # transcript = openai.Audio.transcribe("whisper-1", audio_file)

        # use local whisper model
        model = whisper.load_model("base")  # base model on the local machine (a larger model may be used on our server)
        transcript = model.transcribe(audio_path)

        # Write SRT file
        from whisper.utils import WriteSRT
        with open(srt_file_en, 'w', encoding="utf-8") as srt:
            writer = WriteSRT(RESULT_PATH)
            writer.write_result(transcript, srt)

    # read the transcript back; it is split into chunks below (OpenAI prompt limit: about 5000 tokens)
    with open(srt_file_en, 'r') as f:
        script_en = f.read()
    script_input = script_en

    assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
    print('ASS subtitle saved as: ' + assSub_en)
# Split the video script by sentences and create chunks within the size limit
n_threshold = 4096  # max characters per chunk (rough proxy for the model's token limit)
script_split = script_input.split('.')

script_arr = []
script = ""
for sentence in script_split:
    if len(script) + len(sentence) + 1 <= n_threshold:
        script += sentence + '.'
    else:
        script_arr.append(script.strip())
        script = sentence + '.'
if script.strip():
    script_arr.append(script.strip())
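# Worked example of the chunking above (numbers are illustrative): with n_threshold = 4096,
# a transcript of roughly 10,000 characters ends up in about three chunks, and each chunk
# closes on a sentence boundary so no sentence is split across two translation requests.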
# Translate each chunk and append the result
for s in script_arr:
    # chat model (gpt-3.5-turbo)
    if model_name == "gpt-3.5-turbo":
        # print(s + "\n")
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that translates English to Chinese and has a decent background in starcraft2."},
                {"role": "user", "content": 'Translate the following English text to Chinese: "{}"'.format(s)}
            ],
            temperature=0.1
        )
        with open(f"{RESULT_PATH}/{VIDEO_NAME}_zh.srt", 'a+') as f:
            f.write(response['choices'][0]['message']['content'].strip())

    # legacy completion model (text-davinci-003)
    if model_name == "text-davinci-003":
        prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
        # print(prompt)
        response = openai.Completion.create(
            model=model_name,
            prompt=prompt,
            temperature=0.1,
            max_tokens=2000,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )
        with open(f"{RESULT_PATH}/{VIDEO_NAME}_zh.srt", 'a+') as f:
            f.write(response['choices'][0]['text'].strip())

assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
print('ASS subtitle saved as: ' + assSub_zh)
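# For a typical run without --srt_file, RESULT_PATH is expected to end up holding
# <video_name>_en.srt, the chunk-by-chunk appended <video_name>_zh.srt, and the ASS
# subtitles whose paths are returned by srt2ass.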