"""Streamlit app that turns an uploaded image into a narrated children's story.

Flow: uploaded image -> BLIP caption -> Qwen2 story -> gTTS MP3 played in the page.
"""

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")

# Use the first GPU (device=0) when CUDA is available, otherwise the CPU (device=-1).
device_id = 0 if torch.cuda.is_available() else -1


@st.cache_resource
def _load_caption_generator():
    """Build the image-captioning pipeline once and reuse it across reruns."""
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device_id,
    )


@st.cache_resource
def _load_story_generator():
    """Build the text-generation pipeline once and reuse it across reruns."""
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2-1.5B",
        device=device_id,
    )


def generate_caption(image_file):
    """Return a text caption for the image in *image_file*.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object such as a Streamlit upload).

    Returns:
        The ``generated_text`` field of the first captioning result.
    """
    image = Image.open(image_file)
    # The pipeline is cached, so the model is not reloaded on every call.
    caption_results = _load_caption_generator()(image)
    return caption_results[0]["generated_text"]


def generate_story(caption):
    """Generate a children's story inspired by an image caption.

    Args:
        caption: Text describing the image; it is embedded in the prompt.

    Returns:
        The generated story only (the instruction prompt is not included),
        with surrounding whitespace stripped. May be empty if the model
        produces no continuation.
    """
    prompt = (
        "You are a creative children's story writer. Based on the following image details, "
        "please write an imaginative story for children aged 3-10. Do not simply rephrase the image details; "
        "instead, expand creatively by adding fun characters, adventures, and unexpected twists. "
        "The story must be at least 100 words long.\n\n"
        f"Image Details: {caption}\n\nStory:"
    )
    result = _load_story_generator()(
        prompt,
        # Budget applies to newly generated tokens only; ``max_length`` would
        # also count the prompt and shrink the room left for the story.
        max_new_tokens=300,
        num_return_sequences=1,
        # Without this the pipeline returns prompt + continuation, so the
        # instructions would be shown and read aloud as part of the story.
        return_full_text=False,
    )
    return result[0]["generated_text"].strip()


# The section below is example code for generating an illustration; it is
# commented out. To use it, uncomment it and make sure the diffusers
# dependencies are installed.
# @st.cache_resource
# def load_image_generator():
#     from diffusers import DiffusionPipeline
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     torch_dtype = torch.float16 if device == "cuda" else torch.float32
#     pipe = DiffusionPipeline.from_pretrained(
#         "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch_dtype
#     )
#     pipe = pipe.to(device)
#     return pipe
#
# def generate_illustration(prompt):
#     pipe = load_image_generator()
#     image_result = pipe(prompt)
#     generated_image = image_result.images[0]
#     return generated_image


def text_to_speech(text, output_file="output.mp3"):
    """Synthesize English speech for *text* and save it as an MP3 file.

    Args:
        text: The text to speak.
        output_file: Destination path for the MP3 file.

    Returns:
        The path the audio was written to (``output_file``).
    """
    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file


def main():
    """Render the page: upload an image, then show its caption, story and audio."""
    # NOTE(review): the original literal was split across several lines
    # (a syntax error) and its markup appears to have been lost; an <h1>
    # heading is assumed here -- confirm the intended tag and styling.
    st.markdown("<h1>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_file is None:
        # Nothing to process until the user picks a file.
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_container_width=True)

    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)

    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Story:**")
    st.write(story)

    # To generate an illustration as well, uncomment the code below.
    # with st.spinner("Generating illustration..."):
    #     illustration = generate_illustration(story[:200])
    # st.write("### Story Illustrations:")
    # st.image(illustration, caption="Story Illustrations", use_container_width=True)

    if not story:
        # gTTS cannot synthesize empty text, so skip the audio step.
        st.warning("The model returned an empty story, so there is no audio to play.")
        return

    with st.spinner("Converting to voice..."):
        audio_file = text_to_speech(story)
    st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()