riddhimanrana
/

fastvlm-0.5b-captions

Image-Text-to-Text

text-generation

Model card Files Files and versions

fastvlm-0.5b-captions / predict.py

riddhimanrana's picture

Create predict.py

e3a071a verified 3 months ago

history blame contribute delete

3.68 kB

	# Credit for this script goes to @ariG23498, who opened a PR on the apple/ml-fastvlm repo to add this model to Hugging Face Transformers. The Apple team still needs to convert the weights before it is officially available there.

	import os
	import argparse

	import torch
	from PIL import Image

	from llava.conversation import conv_templates
	from llava.mm_utils import tokenizer_image_token, process_images
	from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

	from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPImageProcessor

	def predict(args):
	    """Generate a text response for one image and print it to stdout.

	    The tokenizer, model and image processor are loaded from
	    ``riddhimanrana/<model_id>``, where ``<model_id>`` is the last path
	    component of ``args.model_path``. While the function runs, a
	    ``generation_config.json`` found in the local model folder is renamed
	    to ``.generation_config.json`` so generation parameters come from
	    ``args``; the file is renamed back on exit, even when an error occurs.

	    Args:
	        args: Namespace providing ``model_path``, ``image_file``,
	            ``prompt``, ``conv_mode``, ``temperature``, ``top_p`` and
	            ``num_beams``.

	    Raises:
	        ValueError: If ``args.image_file`` is ``None``.
	    """
	    # Fail fast, before loading the model or touching the model folder.
	    if args.image_file is None:
	        raise ValueError("image_file is required: pass --image-file <path>")

	    # Strip trailing slashes so "path/to/model/" still yields "model"
	    # instead of an empty id.
	    model_id = args.model_path.rstrip("/").split("/")[-1]
	    print(f"{model_id=}")
	    repo_id = f"riddhimanrana/{model_id}"

	    # Remove generation config from model folder
	    # to read generation parameters from args
	    model_path = os.path.expanduser(args.model_path)
	    config_file = os.path.join(model_path, 'generation_config.json')
	    generation_config = None
	    if os.path.exists(config_file):
	        generation_config = os.path.join(model_path, '.generation_config.json')
	        os.rename(config_file, generation_config)

	    try:
	        tokenizer = AutoTokenizer.from_pretrained(repo_id)
	        model = AutoModelForCausalLM.from_pretrained(
	            repo_id, torch_dtype=torch.float16, device_map="cuda")
	        image_processor = CLIPImageProcessor.from_pretrained(repo_id)

	        # Construct prompt: the image placeholder goes in front of the
	        # user's question, optionally wrapped in start/end tokens.
	        qs = args.prompt
	        if model.config.mm_use_im_start_end:
	            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
	        else:
	            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
	        conv = conv_templates[args.conv_mode].copy()
	        conv.append_message(conv.roles[0], qs)
	        # A None message leaves the second role's turn open for generation.
	        conv.append_message(conv.roles[1], None)
	        prompt = conv.get_prompt()

	        # Set the pad token id for generation
	        model.generation_config.pad_token_id = tokenizer.pad_token_id

	        # Tokenize prompt
	        input_ids = tokenizer_image_token(
	            prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
	        ).unsqueeze(0).to(torch.device("cuda"))

	        # Load and preprocess image; the context manager closes the
	        # underlying file once the RGB copy has been made.
	        with Image.open(args.image_file) as raw_image:
	            image = raw_image.convert('RGB')
	        image_tensor = process_images([image], image_processor, model.config)[0]

	        # Run inference
	        with torch.inference_mode():
	            output_ids = model.generate(
	                input_ids,
	                images=image_tensor.unsqueeze(0).half(),
	                image_sizes=[image.size],
	                # Sample only when a positive temperature is requested.
	                do_sample=args.temperature > 0,
	                temperature=args.temperature,
	                top_p=args.top_p,
	                num_beams=args.num_beams,
	                max_new_tokens=256,
	                use_cache=True)

	        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
	        print(outputs)
	    finally:
	        # Restore generation config, even if loading or generation failed,
	        # so the model folder is never left in a modified state.
	        if generation_config is not None:
	            os.rename(generation_config, config_file)


	if __name__ == "__main__":
	    # Command-line entry point: parse the options and hand them to predict().
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--model-path", type=str, default="./llava-v1.5-0.5b")
	    # Accepted for CLI compatibility; predict() does not read it.
	    parser.add_argument("--model-base", type=str, default=None)
	    # predict() cannot run without an image, so let argparse report a
	    # usage error instead of failing later when the image is opened.
	    parser.add_argument("--image-file", type=str, required=True, help="location of image file")
	    parser.add_argument("--prompt", type=str, default="Describe the image.", help="Prompt for VLM.")
	    parser.add_argument("--conv-mode", type=str, default="qwen_2")
	    # A temperature of 0 makes predict() disable sampling.
	    parser.add_argument("--temperature", type=float, default=0.0)
	    parser.add_argument("--top_p", type=float, default=None)
	    parser.add_argument("--num_beams", type=int, default=1)
	    args = parser.parse_args()

	    predict(args)