|
import requests |
|
import torch |
|
import os |
|
import io |
|
from PIL import Image |
|
import soundfile as sf |
|
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig |
|
from urllib.request import urlopen |
|
|
|
|
|
|
|
# Checkpoint identifier handed to every `from_pretrained` call below.
model_path = "microsoft/Phi-4-multimodal-instruct"

# The processor turns prompt text plus image/audio inputs into model tensors.
# trust_remote_code=True lets Transformers run the custom code shipped with
# the checkpoint repository.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # NOTE(review): presumably requires the flash-attn package and a GPU that
    # supports it; confirm the fallback value (likely "eager") for other GPUs.
    _attn_implementation='flash_attention_2',
).cuda()  # kept from the original; likely redundant with device_map="cuda" -- TODO confirm

# Generation defaults stored alongside the checkpoint; passed to every
# `model.generate` call in this script.
generation_config = GenerationConfig.from_pretrained(model_path)
|
|
|
|
|
# Chat-template markers used to assemble each prompt below as
# `{user_prompt}<content>{prompt_suffix}{assistant_prompt}`.
user_prompt = '<|user|>'            # opens the user turn
assistant_prompt = '<|assistant|>'  # placed last so the model continues from here
prompt_suffix = '<|end|>'           # closes the user turn
|
|
|
|
|
# ---------------------------------------------------------------------------
# Image demo: download one picture, ask the model what it shows, and print
# the decoded answer.
# ---------------------------------------------------------------------------
print("\n--- IMAGE PROCESSING ---")
image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
# `<|image_1|>` marks where the first image is referenced in the prompt.
prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Fetch the whole body, fail loudly on an HTTP error, and release the
# connection before decoding. The timeout keeps a stalled server from
# hanging the script indefinitely.
with requests.get(image_url, timeout=60) as image_response:
    image_response.raise_for_status()
    image = Image.open(io.BytesIO(image_response.content))

# Put the inputs on whatever device the model lives on instead of a
# hard-coded 'cuda:0'.
inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)
# Drop the first `input_ids.shape[1]` positions so only the tokens that
# follow the prompt are decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
|
|
|
|
|
# ---------------------------------------------------------------------------
# Audio demo: download one recording, ask the model to transcribe it and
# translate it to French, and print the decoded answer.
# ---------------------------------------------------------------------------
print("\n--- AUDIO PROCESSING ---")
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
# `<|audio_1|>` marks where the first audio clip is referenced in the prompt.
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Read the full payload inside a `with` block so the HTTP response is always
# closed. The timeout keeps a stalled server from hanging the script.
with urlopen(audio_url, timeout=60) as audio_response:
    audio_bytes = audio_response.read()
# soundfile decodes from the in-memory buffer and returns (samples, sample rate).
audio, samplerate = sf.read(io.BytesIO(audio_bytes))

# The processor takes a list of (samples, sample rate) pairs. Inputs go to
# whatever device the model lives on instead of a hard-coded 'cuda:0'.
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)
# Drop the first `input_ids.shape[1]` positions so only the tokens that
# follow the prompt are decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
|
|