Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

App Files Files Community

UST_Deep_learning_asm1 / app.py

IrisDeng

Update app.py

f1d429a verified 3 days ago

raw

history blame

2.63 kB

	import streamlit as st
	from PIL import Image
	from transformers import pipeline
	from gtts import gTTS
	import torch

	st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")


	@st.cache_resource(show_spinner=False)
	def _load_caption_pipeline():
	    """Build the image-captioning pipeline; cached so it is created once."""
	    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


	@st.cache_resource(show_spinner=False)
	def _load_story_pipeline():
	    """Build the text-generation pipeline; cached so it is created once."""
	    return pipeline("text-generation", model="Qwen/Qwen2-1.5B")


	# Streamlit re-executes this script from the top on every user interaction.
	# Going through the cached loaders means the models really are loaded once
	# per server process instead of being rebuilt on each rerun.
	caption_pipeline = _load_caption_pipeline()
	story_pipeline = _load_story_pipeline()


	def extract_image_caption(image_data):
	    """Return the caption text produced for an image.

	    Args:
	        image_data: Anything ``PIL.Image.open`` accepts (the app passes the
	            uploaded file object).

	    Returns:
	        The ``'generated_text'`` field of the first captioning result.
	    """
	    picture = Image.open(image_data)
	    first_result = caption_pipeline(picture)[0]
	    return first_result['generated_text']

	def compose_story_from_caption(caption_detail, max_new_tokens=400):
	    """Generate a children's story from an image caption.

	    Args:
	        caption_detail: Caption text describing the image; it is embedded
	            in the prompt after "Image Details:".
	        max_new_tokens: Upper bound on the number of tokens generated after
	            the prompt. The default is a rough budget for the ~300-word
	            ceiling requested in the prompt -- NOTE(review): tune for the
	            deployed model and hardware.

	    Returns:
	        The generated story with surrounding whitespace removed. The prompt
	        itself is not included.
	    """
	    prompt_text = (
	        "You are a talented and imaginative storyteller for children aged 3 to 10. "
	        "Using the details derived from the image below, craft a captivating tale that goes beyond merely describing the scene. "
	        "Let your creativity shine by introducing engaging characters, adventurous journeys, and delightful surprises. "
	        "Your story should be vivid, original, and between 100 and 300 words in length.\n\n"
	        f"Image Details: {caption_detail}\n\nStory:"
	    )
	    story_results = story_pipeline(
	        prompt_text,
	        num_return_sequences=1,
	        # Bound generation explicitly instead of relying on whatever default
	        # length the model's generation config happens to carry.
	        max_new_tokens=max_new_tokens,
	        # Ask the pipeline for the continuation only. Splitting the full text
	        # on "Story:" cuts in the wrong place when the caption contains it.
	        return_full_text=False,
	    )
	    return story_results[0]['generated_text'].strip()

	def convert_text_to_audio(text_content, audio_path="output.mp3"):
	    """Synthesize English speech for ``text_content`` and save it as a file.

	    Args:
	        text_content: Text to be spoken.
	        audio_path: Destination file path for the saved audio.

	    Returns:
	        ``audio_path``, unchanged, so callers can hand it to a player.
	    """
	    gTTS(text=text_content, lang="en").save(audio_path)
	    return audio_path

	def run_app():
	    """Render the page and run the image -> caption -> story -> audio flow."""
	    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
	    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

	    upload = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])
	    if upload is None:
	        # Nothing to process until the user picks a file.
	        return

	    preview = Image.open(upload)
	    st.image(preview, caption="Uploaded Image", use_container_width=True)

	    # Step 1: describe the picture.
	    with st.spinner("Generating caption for the image..."):
	        caption = extract_image_caption(upload)
	    st.write("Generated Caption:", caption)

	    # Step 2: turn the description into a story.
	    with st.spinner("Composing story..."):
	        story = compose_story_from_caption(caption)
	    st.write("Story:")
	    st.write(story)

	    # Step 3: narrate the story and offer it for playback.
	    with st.spinner("Converting text to audio..."):
	        narration_path = convert_text_to_audio(story)
	    st.audio(narration_path, format="audio/mp3")

	# Start the UI only when this file is executed as the main script.
	if __name__ == "__main__":
	    run_app()