Segizu committed
Commit 2ad5729 · 1 Parent(s): a9e34c3
Files changed (2)
  1. main.py +77 -0
  2. requirements.txt +10 -0
main.py ADDED
@@ -0,0 +1,77 @@
+ import requests
+ import torch
+ import os
+ import io
+ from PIL import Image
+ import soundfile as sf
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+ from urllib.request import urlopen
+
+
+ # Define model path
+ model_path = "microsoft/Phi-4-multimodal-instruct"
+
+ # Load model and processor
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     device_map="cuda",
+     torch_dtype="auto",
+     trust_remote_code=True,
+     # if you do not use Ampere or later GPUs, change attention to "eager"
+     _attn_implementation='flash_attention_2',
+ ).cuda()
+
+ # Load generation config
+ generation_config = GenerationConfig.from_pretrained(model_path)
+
+ # Define prompt structure
+ user_prompt = '<|user|>'
+ assistant_prompt = '<|assistant|>'
+ prompt_suffix = '<|end|>'
+
+ # Part 1: Image Processing
+ print("\n--- IMAGE PROCESSING ---")
+ image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
+ prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
+ print(f'>>> Prompt\n{prompt}')
+
+ # Download and open image
+ image = Image.open(requests.get(image_url, stream=True).raw)
+ inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
+
+ # Generate response
+ generate_ids = model.generate(
+     **inputs,
+     max_new_tokens=1000,
+     generation_config=generation_config,
+ )
+ generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+ response = processor.batch_decode(
+     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )[0]
+ print(f'>>> Response\n{response}')
+
+ # Part 2: Audio Processing
+ print("\n--- AUDIO PROCESSING ---")
+ audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
+ speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
+ prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
+ print(f'>>> Prompt\n{prompt}')
+
+ # Download and open audio file
+ audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
+
+ # Process with the model
+ inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+
+ generate_ids = model.generate(
+     **inputs,
+     max_new_tokens=1000,
+     generation_config=generation_config,
+ )
+ generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+ response = processor.batch_decode(
+     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )[0]
+ print(f'>>> Response\n{response}')
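Note: as the in-code comment says, flash_attention_2 assumes an Ampere-or-later GPU (and the flash_attn wheel pinned in requirements.txt). A minimal sketch of the fallback that comment suggests, assuming everything else in main.py stays the same, is to load the same checkpoint with eager attention instead:

    from transformers import AutoModelForCausalLM

    # Fallback sketch for pre-Ampere GPUs: same checkpoint, but with
    # "eager" attention in place of 'flash_attention_2'.
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4-multimodal-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
        _attn_implementation="eager",
    ).cuda()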
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ flash_attn==2.7.4.post1
+ torch==2.6.0
+ transformers==4.48.2
+ accelerate==1.3.0
+ soundfile==0.13.1
+ pillow==11.1.0
+ scipy==1.15.2
+ torchvision==0.21.0
+ backoff==2.2.1
+ peft==0.13.2
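To reproduce the environment, the pins can be installed with pip install -r requirements.txt; flash_attn typically compiles against the local CUDA toolchain and expects torch to already be present, so installing torch==2.6.0 first tends to be the safer order.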