---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
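
Because the weights are random, the checkpoint is only meant for smoke-testing code paths, not for producing sensible outputs. If you just want to confirm the configuration is tiny before loading any weights, a minimal sketch like the one below is enough (the expected values mirror the sizes set by the creation script further down):

```python
# A minimal sketch: fetch only the config and check the tiny dimensions.
# Expected values follow the creation script in this model card.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "yujiepan/phi-4-multimodal-tiny-random",
    trust_remote_code=True,
)
print(config.hidden_size)          # 16
print(config.num_hidden_layers)    # 2
print(config.num_attention_heads)  # 2
```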

### Example usage:

```python
import io
import os
from urllib.request import urlopen

import torch

import requests
import soundfile as sf
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Define model path
model_id = "yujiepan/phi-4-multimodal-tiny-random"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation='flash_attention_2',
).cuda()

# Load generation config
generation_config = GenerationConfig.from_pretrained(model_id)

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'

# Part 1: Image Processing
print("\n--- IMAGE PROCESSING ---")
image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open image
image = Image.open(requests.get(image_url, stream=True).raw)
inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')

# Generate response
generate_ids = model.generate(
    **inputs,
    max_new_tokens=8,
    generation_config=generation_config,
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')

# Part 2: Audio Processing
print("\n--- AUDIO PROCESSING ---")
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open audio file
audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))

# Process with the model
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=8,
    generation_config=generation_config,
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
```
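
Note that, because the checkpoint is randomly initialized, the decoded responses above will be gibberish; the script is only meant to exercise the image and audio code paths end to end.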

### Code to create this repo:

```python
import json
import shutil
import sys
from pathlib import Path

import torch

from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    pipeline,
    set_seed,
)

source_model_id = "microsoft/Phi-4-multimodal-instruct"
save_folder = "/tmp/yujiepan/phi-4-multimodal-tiny-random"
Path(save_folder).mkdir(parents=True, exist_ok=True)
AutoTokenizer.from_pretrained(source_model_id).save_pretrained(save_folder)

# preprocessor config
for json_file in ['preprocessor_config.json', 'processor_config.json', 'config.json']:
    with open(hf_hub_download(source_model_id, json_file), 'r') as f:
        config = json.load(f)
    auto_map = config.get('auto_map', {})
    for key, value in auto_map.items():
        if '.' in value:
            auto_map[key] = f'{source_model_id}--{value}'
    with open(f'{save_folder}/{json_file}', 'w') as f:
        json.dump(config, f, indent=2)

# model config
with open(f'{save_folder}/config.json', 'r') as f:
    config = json.load(f)

config['hidden_size'] = 16
config['intermediate_size'] = 32
config['num_attention_heads'] = 2
config['num_hidden_layers'] = 2
config['num_key_value_heads'] = 1

config['audio_processor']['config']['num_blocks'] = 2
config['audio_processor']['config']['attention_dim'] = 16
config['audio_processor']['config']['attention_heads'] = 2
config['audio_processor']['config']['nemo_conv_settings']['conv_channels'] = 16
config['audio_processor']['config']['depthwise_seperable_out_channel'] = 16
config['audio_processor']['config']['ext_pw_out_channel'] = 16
config['audio_processor']['config']['linear_units'] = 24

config['vision_lora']['r'] = 8
config['vision_lora']['lora_alpha'] = 16
config['speech_lora']['r'] = 8
config['speech_lora']['lora_alpha'] = 16

config['rope_scaling']['long_factor'] = [1.0] * 3
config['rope_scaling']['short_factor'] = [1.0] * 3

with open(f'{save_folder}/config.json', 'w') as f:
    json.dump(config, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)

Path(save_folder, 'phi4mm').mkdir(exist_ok=True)
for python_files in ['modeling_phi4mm.py', 'configuration_phi4mm.py', 'speech_conformer_encoder.py', 'vision_siglip_navit.py', 'processing_phi4mm.py']:
    with open(hf_hub_download(source_model_id, python_files), 'r') as f:
        codes = f.read()
    with open(f'{save_folder}/phi4mm/{python_files}', 'w') as f:
        f.write(codes)
with open(Path(save_folder, 'phi4mm/vision_siglip_navit.py'), 'r') as f:
    codes = f.read()
codes = codes.replace('def get_siglip_vision_model', '# modified for tiny-random\ndef get_siglip_vision_model')
codes = codes.replace('"hidden_size": 1152,', '"hidden_size": 16,')
codes = codes.replace('"intermediate_size": 4304,', '"intermediate_size": 32,')
codes = codes.replace('"num_attention_heads": 16,', '"num_attention_heads": 2,')
codes = codes.replace('"num_hidden_layers": 27,', '"num_hidden_layers": 2,')
with open(Path(save_folder, 'phi4mm/vision_siglip_navit.py'), 'w') as f:
    f.write(codes)

sys.path.append(str(Path(save_folder)))
from phi4mm.modeling_phi4mm import Phi4MMForCausalLM
print(Phi4MMForCausalLM)  # ensure imported
model = Phi4MMForCausalLM(config).to(torch.bfloat16)

set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.5)
        print(name, p.shape)

model.save_pretrained(Path(save_folder))
shutil.rmtree(Path(save_folder, 'phi4mm'))
generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)
generation_config.save_pretrained(save_folder)
```
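
A quick sanity check of the generated folder (not part of the original script) is to reload it and count parameters; depending on the saved config, loading may expect the same CUDA/flash-attention setup as the usage example above:

```python
# A minimal sketch, assuming the repo was just created by the script above.
import torch
from transformers import AutoModelForCausalLM

save_folder = "/tmp/yujiepan/phi-4-multimodal-tiny-random"

# trust_remote_code pulls the custom Phi4MM modeling code referenced in config.json.
reloaded = AutoModelForCausalLM.from_pretrained(
    save_folder,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Print the total parameter count to confirm the checkpoint really is tiny.
print(sum(p.numel() for p in reloaded.parameters()))
```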