import torch
from diffusers import TextToVideoSDPipeline, DiffusionPipeline
from diffusers.utils import export_to_video
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import PIL.Image  # import the submodule explicitly so PIL.Image.open is available
from io import BytesIO
from gtts import gTTS
import time
from pydub import AudioSegment
import nltk
from together import Together
import base64
# NOTE: this repo hosts GGUF weights; recent transformers versions load GGUF
# checkpoints only when a gguf_file argument is passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("ParisNeo/LLama-3.2-3B-Lollms-Finetuned-GGUF")
model0 = AutoModelForCausalLM.from_pretrained("ParisNeo/LLama-3.2-3B-Lollms-Finetuned-GGUF", ignore_mismatched_sizes=True)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model0 = model0.to(device)

# Chat with the locally loaded Llama model, carrying the running chat history
def chat_with_llama(user_input, chat_history):
    # Build the prompt from the system line and the running history
    prompt = "You are a helpful, respectful and honest general-purpose assistant.\n"
    for user_content, assist_content in chat_history:
        prompt += f"user: {user_content}\n"
        prompt += f"assistant: {assist_content}\n"
    prompt += f"user: {user_input}\nassistant:"
    # Tokenize and generate a response on the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # max_new_tokens bounds the reply; do_sample=True makes temperature take effect
    output = model0.generate(inputs["input_ids"], max_new_tokens=1024, do_sample=True,
                             temperature=0.7, max_time=10.0, repetition_penalty=1.0)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract the assistant's reply and append the turn to the history
    assistant_reply = response.split("assistant:")[-1].split('user:')[0].strip()
    chat_history.append((user_input, assistant_reply))
    return assistant_reply, chat_history
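
# A minimal sanity check for the local chat path (a sketch, assuming the model
# above loaded successfully); uncomment to try it outside the UI:
# history = []
# reply, history = chat_with_llama("Hello, who are you?", history)
# print(reply)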
api_key = 'YOUR API KEY HERE'
client = Together(api_key=api_key)
def chat_api(user_input, chat_history):
    messages = []
    for user_content, assist_content in chat_history:
        messages += [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assist_content}
        ]
    messages += [{"role": "user", "content": user_input}]
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
    )
    reply = response.choices[0].message.content
    chat_history.append((user_input, reply))
    return reply, chat_history
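
# The same flow through the Together API (a sketch, assuming a valid api_key above):
# history = []
# reply, history = chat_api("Summarize attention in one sentence.", history)
# print(reply)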
def tti_api(prompt, num_steps=25, width=512, heights=512):
    response = client.images.generate(
        prompt=prompt,
        model="black-forest-labs/FLUX.1-dev",
        width=width,
        height=heights,
        steps=num_steps,
        n=1,
        response_format="b64_json"
    )
    # Decode the base64 payload into raw image bytes
    image_data = base64.b64decode(response.data[0].b64_json)
    return image_data
prompt = 'A nice black lexus 570 car running on the snowy road.'
image = tti_api(prompt, num_steps=25)
image = PIL.Image.open(BytesIO(image))
image.save('result.png')
image.show()
def ttv(prompt, num_steps=50):
    # Load the text-to-video model from Hugging Face
    model_id = "damo-vilab/text-to-video-ms-1.7b"  # ModelScope Text-to-Video model
    #model_id = "guoyww/animatediff-motion-adapter-v1-5-2"  # ModelScope Text-to-Video
    pipe = TextToVideoSDPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
    pipe.to("cuda:0")  # the fp16 pipeline needs a GPU; this uses the first one
    # Generate video frames
    print("Generating video... This may take some time.")
    with torch.no_grad():
        video_frames = pipe(prompt, num_frames=32, height=256, width=256, num_inference_steps=num_steps).frames[0]
    # Save the generated video
    video_path = export_to_video(video_frames, output_video_path="output_video.mp4")
    return video_path
test_video = ttv('An awesome lexus 570 car running on the snowy road, high quality', num_steps=50)

# Ensure the sentence tokenizer is downloaded (if not already)
nltk.download('punkt')

# Convert text to speech with gTTS and generate matching SRT subtitle content
def tts(text):
    # Initialize the Google TTS engine with language (e.g., 'en' for English)
    tts_engine = gTTS(text=text, lang='en', slow=False)
    # Save to an audio file
    audio_path = "output.mp3"
    tts_engine.save(audio_path)
    # Load the audio file with pydub to get the duration
    audio = AudioSegment.from_mp3(audio_path)
    duration_ms = len(audio)  # Duration in milliseconds
    # Split the text into sentences using NLTK
    sentences = nltk.sent_tokenize(text)
    # Estimate the duration per sentence
    chunk_duration_ms = duration_ms // len(sentences)
    # Generate SRT content
    srt_content = ""
    start_time = 0  # Start time of the first subtitle, in milliseconds
    for idx, sentence in enumerate(sentences):
        end_time = start_time + chunk_duration_ms
        # SRT timestamps use the form HH:MM:SS,mmm
        start_time_formatted = time.strftime('%H:%M:%S', time.gmtime(start_time / 1000)) + ',' + f'{start_time % 1000:03d}'
        end_time_formatted = time.strftime('%H:%M:%S', time.gmtime(end_time / 1000)) + ',' + f'{end_time % 1000:03d}'
        srt_content += f"{idx + 1}\n"
        srt_content += f"{start_time_formatted} --> {end_time_formatted}\n"
        srt_content += f"{sentence}\n\n"
        start_time = end_time  # Update start time for the next subtitle
    return audio_path, srt_content
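
# Usage sketch: write the subtitles next to the audio so players can pick them
# up (the "output.srt" filename is an assumption, not fixed anywhere above):
# audio_path, srt = tts("Hello there. This is a subtitle timing test.")
# with open("output.srt", "w", encoding="utf-8") as f:
#     f.write(srt)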
def tti(prompt, num_steps=50, width=512, heights=512):
    # Load the pre-trained Stable Diffusion pipeline from Hugging Face
    pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
    #pipe.load_lora_weights("FradigmaDangerYT/dalle-e-mini")
    # Move the pipeline to GPU (use "cuda:0" for the first GPU, "cuda:1" for the second)
    device0 = torch.device("cuda:0")
    pipe.to(device0)
    # Generate an image from the prompt
    image = pipe(prompt, num_inference_steps=num_steps, width=width, height=heights).images[0]
    return image
prompt = 'A nice black lexus 570 car running on the snowy road.'
image = tti(prompt, num_steps=25, width=320, heights=240)
image.save('result.png')
image.show()
# If a previous demo is still running, shut it down before relaunching
try:
    demo.close()
except NameError:
    pass
with gr.Blocks() as demo:
    gr.Markdown("""
    # Gradio based Text-to-Any Project
    """)
    with gr.Tab(label="Llama-Chat"):
        radios0 = gr.Radio(['use api', 'use loaded model'], value="use api", show_label=False)
        gptDialog = gr.Chatbot(label="Llama-Chat", max_height=512, min_height=512,
                               autoscroll=True)
        with gr.Row(equal_height=True):
            prompt0 = gr.Textbox(label='Prompt Input', lines=1, scale=9, max_lines=2,
                                 autofocus=True, autoscroll=True, placeholder='Type your message here...')
            with gr.Column(scale=1):
                generate_btn0 = gr.Button('generate')
                clear_btn0 = gr.Button('clear')
    with gr.Tab(label="Text-to-Image/Video"):
        with gr.Row():
            radios1 = gr.Radio(['use api', 'use loaded model'], value="use api", show_label=False)
            steps = gr.Slider(value=50, minimum=20, maximum=100, step=1, label='num_steps')
            width = gr.Slider(value=1024, minimum=240, maximum=1792, step=16, label='width')
            heights = gr.Slider(value=512, minimum=160, maximum=1792, step=16, label='heights')
        with gr.Row():
            outputImg = gr.Image(type='pil', height=512, width=512, label="Output Image", interactive=False)
            outputVideo = gr.Video(width=512, height=512, label="Output Video", interactive=False)
        with gr.Row(equal_height=True):
            prompt1 = gr.Textbox(label='Prompt Input', lines=1, scale=9, max_lines=2,
                                 autofocus=True, autoscroll=True, placeholder='Type your message here...')
            with gr.Column(scale=1):
                generate_btn1 = gr.Button('generate image')
                generate_btn11 = gr.Button('generate video')
    with gr.Tab(label="Text-to-Speech"):
        outputAudio = gr.Audio(label="Audio Output", interactive=False)
        outputSrt = gr.Textbox(label='Script Output', lines=10, max_lines=10, placeholder='Script output here')
        with gr.Row(equal_height=False):
            prompt2 = gr.Textbox(label='Prompt Input', lines=5, scale=9, max_lines=5,
                                 autofocus=True, autoscroll=True, placeholder='Type your message here...')
            with gr.Column(scale=1):
                generate_btn2 = gr.Button('generate')
                clear_btn2 = gr.Button('clear')
    with gr.Tab(label='About'):
        pass
    def generate_txt(prompt, check, history):
        if check == 'use api':
            response, history = chat_api(prompt, history)
            if response is None:
                gr.Warning('Can not reach api.')
        else:
            response, history = chat_with_llama(prompt, history)
            if response is None:
                gr.Warning('Failed to load model.')
        return '', history
    def clear_chat():
        history = []
        gr.Info('Cleaned successfully!')
        return history
    def generate_img(prompt, check, num_steps, width, heights):
        if check == 'use api':
            # Check the API response before trying to decode it
            image_bytes = tti_api(prompt, num_steps=num_steps, width=width, heights=heights)
            if not image_bytes:
                gr.Warning('Can not reach api')
                return None
            image = PIL.Image.open(BytesIO(image_bytes))
            gr.Info('Generated Image Successfully!')
        else:
            image = tti(prompt, num_steps=num_steps, width=width, heights=heights)
            gr.Info('Generated Image Successfully!')
        return image
    def generate_video(prompt, num_steps):
        video = ttv(prompt, num_steps)
        gr.Info('Generated Video Successfully!')
        return video

    def generate_speech(prompt):
        audio, script = tts(prompt)
        gr.Info('Generated Speech Successfully!')
        return audio, script

    def clear_speech():
        gr.Info('Cleaned Successfully!')
        return None, ''
    prompt0.submit(generate_txt, [prompt0, radios0, gptDialog], [prompt0, gptDialog])
    # pass the sliders too: generate_img needs steps, width, and heights
    prompt1.submit(generate_img, [prompt1, radios1, steps, width, heights], [outputImg])
    # generate button click events
    generate_btn0.click(generate_txt, [prompt0, radios0, gptDialog], [prompt0, gptDialog])
    generate_btn1.click(generate_img, [prompt1, radios1, steps, width, heights], [outputImg])
    generate_btn11.click(generate_video, [prompt1, steps], [outputVideo])
    generate_btn2.click(generate_speech, [prompt2], [outputAudio, outputSrt])
    # clear button click events
    clear_btn0.click(clear_chat, [], [gptDialog])
    clear_btn2.click(clear_speech, [], [outputAudio, outputSrt])

demo.launch(share=True)