# NOTE: removed page-scrape artifact ("Spaces: Sleeping" status banner)
# that preceded the code and would have broken parsing.
# --- Imports ---------------------------------------------------------------
# Standard library
import os
import re
import json
import shutil

# Third-party
import pyttsx3
from pydub import AudioSegment
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from dotenv import load_dotenv

# Load environment variables from a local .env file (keys, config).
load_dotenv()

# Initialize the model and tokenizer used to draft the podcast dialogue.
# distilgpt2 is a small, CPU-friendly GPT-2 checkpoint.
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Prompt steering the conversation generator toward an interview format.
system_prompt = """Generate a conversation between Sascha and Marina based on the article content provided.
Sascha is the article writer, and Marina is the interviewer. Make it engaging and emotional, with natural pauses (like "uh")
to make it sound conversational. This is for a podcast called "The Machine Learning Engineer"."""

# Map each podcast speaker to a TTS backend. Both currently use the
# offline pyttsx3 engine, but the mapping leaves room for per-speaker
# engines later.
speaker_voice_map = {
    "Sascha": "pyttsx3",  # Sascha will use pyttsx3 for offline TTS
    "Marina": "pyttsx3"   # Marina uses pyttsx3 for offline TTS
}

# Shared pyttsx3 engine used for every synthesized turn.
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # speech rate (words per minute)
engine.setProperty('volume', 0.9)  # output volume, range 0.0 to 1.0
# Pyttsx3 TTS function for offline TTS | |
def synthesize_speech_pyttsx3(text, speaker, index):
    """Render one conversation turn to disk with the offline pyttsx3 engine.

    The clip is written to audio-files/{index}_{speaker}.mp3 so that
    natural_sort_key() can later reassemble the clips in turn order.

    NOTE(review): pyttsx3's save_to_file writes the platform's native
    audio format (often WAV/AIFF) regardless of the .mp3 extension —
    confirm pydub can decode the result on the target OS.

    Args:
        text: the sentence(s) to speak.
        speaker: speaker name, embedded in the output filename.
        index: turn index, used as the natural-sort prefix.
    """
    filename = f"audio-files/{index}_{speaker}.mp3"
    engine.save_to_file(text, filename)
    engine.runAndWait()
    # Bug fix: the original printed a literal "(unknown)" placeholder
    # instead of the actual output path.
    print(f'Audio content written to file "{filename}"')
# Function to synthesize speech based on the speaker | |
def synthesize_speech(text, speaker, index):
    """Dispatch TTS synthesis for a single conversation turn.

    All speakers currently route to the offline pyttsx3 backend; this
    indirection exists so other engines can be plugged in later.
    """
    return synthesize_speech_pyttsx3(text, speaker, index)
# Function to sort filenames naturally | |
def natural_sort_key(filename): | |
return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)] | |
# Function to merge audio files | |
def merge_audios(audio_folder, output_file):
    """Concatenate every .mp3/.wav clip in *audio_folder* into one mp3.

    Clips are joined in natural sort order (1, 2, ... 10) so files named
    by turn index come out in conversation order.

    Args:
        audio_folder: directory containing the per-turn audio clips.
        output_file: path of the merged mp3 to write.
    """
    combined = AudioSegment.empty()
    # Idiom: str.endswith accepts a tuple — one call instead of an `or`
    # chain of two endswith() tests.
    audio_files = sorted(
        (f for f in os.listdir(audio_folder) if f.endswith((".mp3", ".wav"))),
        key=natural_sort_key,
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        print(f"Processing: {audio_path}")
        combined += AudioSegment.from_file(audio_path)
    combined.export(output_file, format="mp3")
    print(f"Merged audio saved as {output_file}")
# Function to generate conversation using distilgpt2 | |
def generate_conversation(article):
    """Draft a Sascha/Marina dialogue about *article* using distilgpt2.

    Prepends the system prompt, generates a continuation, then parses
    the text into turns split on the "Sascha:" / "Marina:" tags.

    NOTE(review): temperature has no effect unless do_sample=True is
    passed to generate(); as written decoding is greedy.

    Args:
        article: plain-text article to base the conversation on.

    Returns:
        List of {"speaker": ..., "text": ...} dicts, one per turn.
    """
    input_text = f"{system_prompt}\n\n{article}\n\nSascha: "
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=1024, num_return_sequences=1, temperature=1.0)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Split on the speaker tags; element 0 is the pre-dialogue preamble,
    # so drop it.
    lines = re.split(r'(Sascha:|Marina:)', generated_text)[1:]
    # Bug fix: the original indexed lines[i + 1] and raised IndexError
    # whenever the split had odd length (model output ending on a bare
    # speaker tag). zip() pairs tags with their text and drops a
    # trailing unmatched tag.
    conversation = [
        {"speaker": speaker.strip(), "text": text.strip()}
        for speaker, text in zip(lines[::2], lines[1::2])
    ]
    print(json.dumps(conversation, indent=4))
    return conversation
# Function to generate the podcast audio | |
def generate_audio(conversation):
    """Synthesize every conversation turn and merge them into podcast.mp3.

    Recreates the audio-files/ scratch directory on each run, renders
    one clip per turn, then concatenates the clips in turn order.

    Args:
        conversation: list of {"speaker": ..., "text": ...} turn dicts.

    Returns:
        Path of the merged podcast file.
    """
    scratch_dir = 'audio-files'
    # Start from a clean scratch directory so stale clips never leak in.
    if os.path.exists(scratch_dir):
        shutil.rmtree(scratch_dir)
    os.makedirs(scratch_dir, exist_ok=True)

    for turn_index, turn in enumerate(conversation):
        synthesize_speech(turn['text'], turn['speaker'], turn_index)

    output_file = "podcast.mp3"
    merge_audios(scratch_dir, output_file)
    return output_file
# Read the article from the file | |
# --- Script entry ----------------------------------------------------------
# Read the source article, draft the dialogue, and render the podcast.
# Fix: specify the encoding explicitly — the default text encoding is
# platform-dependent and can mangle non-ASCII article content.
with open('function-calling.txt', 'r', encoding='utf-8') as file:
    article = file.read()

conversation = generate_conversation(article)
generate_audio(conversation)