import streamlit as st from PIL import Image from transformers import pipeline from gtts import gTTS import torch st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜") # 判断是否有可用的 GPU,如果有则使用 GPU(device=0),否则使用 CPU(device=-1) device_id = 0 if torch.cuda.is_available() else -1 def generate_caption(image_file): image = Image.open(image_file) # 使用 GPU 进行图像描述生成,如果可用 caption_generator = pipeline( "image-to-text", model="Salesforce/blip-image-captioning-base", device=device_id ) caption_results = caption_generator(image) caption = caption_results[0]['generated_text'] return caption def generate_story(caption): # 使用 GPU 进行文本生成操作 story_generator = pipeline( "text-generation", model="Qwen/Qwen2-1.5B", device=device_id ) messages = ( "Please based on following image caption: " + caption + ", generate a complete fairy tale story for children with at least 100 words and max 300 words" ) result = story_generator(messages, max_length=300, num_return_sequences=1) story = result[0]['generated_text'] return story # 以下部分为生成插图示例代码,已注释。如果需要使用 GPU,请取消注释并确保 diffusers 相关依赖已经安装 # @st.cache_resource # def load_image_generator(): # from diffusers import DiffusionPipeline # device = "cuda" if torch.cuda.is_available() else "cpu" # torch_dtype = torch.float16 if device == "cuda" else torch.float32 # pipe = DiffusionPipeline.from_pretrained( # "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch_dtype # ) # pipe = pipe.to(device) # return pipe # # def generate_illustration(prompt): # pipe = load_image_generator() # image_result = pipe(prompt) # generated_image = image_result.images[0] # return generated_image def text_to_speech(text, output_file="output.mp3"): tts = gTTS(text=text, lang="en") tts.save(output_file) return output_file def main(): st.markdown("