from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

def generate_caption(image):
    # Accept either a path / file-like object or an already-opened PIL image.
    if not isinstance(image, Image.Image):
        image = Image.open(image)
    # BLIP expects 3-channel RGB input.
    image = image.convert("RGB")
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(image, return_tensors="pt")
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption

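# A minimal usage sketch (assumes a local file "example.jpg" exists; the
# first call downloads the BLIP weights from the Hugging Face Hub):
#
#   caption = generate_caption("example.jpg")
#   print(caption)  # e.g. "a cat sitting on a windowsill"
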
from transformers import pipeline


def generate_story(caption):
    # GPT-2 is an English-only model, so the prompt is written in English.
    generator = pipeline("text-generation", model="gpt2")
    prompt = (
        f"Here is a description of a picture: '{caption}'. "
        "Based on this description, write a complete fairy tale of at least 100 words."
    )
    result = generator(prompt, max_length=300, num_return_sequences=1)
    story = result[0]['generated_text']

    # If the story is too short, generate more text and append it. A bounded
    # loop replaces the original unbounded recursion, which could run forever
    # if the model kept producing short outputs.
    attempts = 0
    while len(story.split()) < 100 and attempts < 3:
        extra = generator(prompt, max_length=300, num_return_sequences=1)
        story += " " + extra[0]['generated_text']
        attempts += 1
    return story

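# Example (BLIP captions are short English phrases, so any such string works):
#
#   story = generate_story("a cat sitting on a windowsill")
#   print(story)
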
from gtts import gTTS


def text_to_speech(text, output_file="output.mp3"):
    # The story is generated in English, so use the English voice.
    tts = gTTS(text=text, lang='en')
    tts.save(output_file)
    return output_file

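# Example (writes an MP3 into the working directory; requires a network
# connection, since gTTS calls Google's text-to-speech endpoint):
#
#   audio_path = text_to_speech("Once upon a time...", "story.mp3")
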
import streamlit as st


def main():
    st.title("Children's Story Generator")
    st.write("Upload a picture and we will generate a fun story from it, "
             "then read it aloud to you!")

    uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg"])
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded image", use_container_width=True)

        # Pass the already-opened PIL image rather than the uploaded file,
        # whose read pointer was consumed by Image.open above.
        caption = generate_caption(image)
        st.write("Image caption:", caption)

        story = generate_story(caption)
        st.write("Generated story:", story)

        audio_file = text_to_speech(story)
        st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()
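
# Launch the app with Streamlit (assuming this file is saved as app.py):
#
#   streamlit run app.py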