Does not run

#10 opened by sdyy

from transformers import pipeline

captioner = pipeline(model="deepseek-ai/Janus-1.3B")
captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")

Please note that authentication is recommended but still optional to access public models or datasets.
config.json: 100% 1.45k/1.45k [00:00<00:00, 74.6kB/s]

KeyError Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/transformers/models/auto/configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1037 try:
-> 1038 config_class = CONFIG_MAPPING[config_dict["model_type"]]
1039 except KeyError:

(3 intermediate frames hidden)
KeyError: 'multi_modality'

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/transformers/models/auto/configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1038 config_class = CONFIG_MAPPING[config_dict["model_type"]]
1039 except KeyError:
-> 1040 raise ValueError(
1041 f"The checkpoint you are trying to load has model type {config_dict['model_type']} "
1042 "but Transformers does not recognize this architecture. This could be because of an "

ValueError: The checkpoint you are trying to load has model type multi_modality but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.
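The error means this Transformers build has no config class registered for the multi_modality model type. Upgrading transformers is the first thing to try, per the message; if the type is still unknown, below is a hedged sketch of the other common workaround. It assumes the checkpoint repo ships its own modeling code (if it does not, the model has to be loaded with the vendor's own package as described on the model card), and it bypasses the pipeline helper in favour of the Auto classes:

from transformers import AutoConfig, AutoModelForCausalLM

model_id = "deepseek-ai/Janus-1.3B"

# trust_remote_code lets Transformers import custom config/model classes
# shipped inside the checkpoint repo instead of looking "multi_modality"
# up in its own registry. This only helps if the repo actually provides
# that code; otherwise the same ValueError will persist.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)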

from PIL import Image
from transformers import pipeline

vqa_pipeline = pipeline("visual-question-answering")

image = Image.open("/content/demo (1).jpeg")
question = "Is there a dog?"

vqa_pipeline(image, question, top_k=1)

No model was supplied, defaulted to dandelin/vilt-b32-finetuned-vqa and revision d0a1f6a (https://huggingface.co/dandelin/vilt-b32-finetuned-vqa).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
[{'score': 0.9991255402565002, 'answer': 'yes'}]
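To address the "No model was supplied" warning, the checkpoint and revision can be pinned explicitly; a sketch using the exact default reported in the log above:

from transformers import pipeline

# Pin the model and revision instead of relying on the task default,
# as the warning recommends for production use. The revision hash is
# the one printed in the log above.
vqa_pipeline = pipeline(
    "visual-question-answering",
    model="dandelin/vilt-b32-finetuned-vqa",
    revision="d0a1f6a",
)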

from transformers import pipeline

captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")

config.json: 100% 4.34k/4.34k [00:00<00:00, 166kB/s]
pytorch_model.bin: 100% 982M/982M [00:13<00:00, 121MB/s]
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
"architectures": [
"ViTModel"
],
"attention_probs_dropout_prob": 0.0,
"encoder_stride": 16,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"image_size": 224,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"model_type": "vit",
"num_attention_heads": 12,
"num_channels": 3,
"num_hidden_layers": 12,
"patch_size": 16,
"qkv_bias": true,
"transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
"activation_function": "gelu_new",
"add_cross_attention": true,
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50256,
"decoder_start_token_id": 50256,
"embd_pdrop": 0.1,
"eos_token_id": 50256,
"initializer_range": 0.02,
"is_decoder": true,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_inner": null,
"n_layer": 12,
"n_positions": 1024,
"pad_token_id": 50256,
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 50
}
},
"transformers_version": "4.47.1",
"use_cache": true,
"vocab_size": 50257
}

tokenizer_config.json: 100% 236/236 [00:00<00:00, 10.4kB/s]
vocab.json: 100% 798k/798k [00:00<00:00, 12.9MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 7.40MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 13.1MB/s]
special_tokens_map.json: 100% 120/120 [00:00<00:00, 6.53kB/s]
preprocessor_config.json: 100% 211/211 [00:00<00:00, 8.35kB/s]
Device set to use cpu
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
We strongly recommend passing in an attention_mask since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
You may ignore this warning if your pad_token_id (50256) is identical to the bos_token_id (50256), eos_token_id (50256), or the sep_token_id (None), and your input is not padded.
[{'generated_text': 'two birds are standing next to each other '}]
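The "Device set to use cpu" line and the pad/eos warning are both pipeline defaults; here is a hedged sketch that runs the same captioner on the Colab GPU and bounds generation length (the warning itself is harmless for single, unpadded inputs, as the log notes):

from transformers import pipeline

# device=0 places the model on the first CUDA device (the T4 in Colab).
# generate_kwargs is forwarded to model.generate(); max_new_tokens just
# caps the caption length.
captioner = pipeline(model="ydshieh/vit-gpt2-coco-en", device=0)
captioner(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    generate_kwargs={"max_new_tokens": 30},
)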

from transformers import pipeline

captioner = pipeline(model="impactframes/Janus-1.3B")
captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")

(3 intermediate frames hidden)
KeyError: 'multi_modality'

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/transformers/models/auto/configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1038 config_class = CONFIG_MAPPING[config_dict["model_type"]]
1039 except KeyError:
-> 1040 raise ValueError(
1041 f"The checkpoint you are trying to load has model type {config_dict['model_type']} "
1042 "but Transformers does not recognize this architecture. This could be because of an "

ValueError: The checkpoint you are trying to load has model type multi_modality but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.
config.json: 100% 1.45k/1.45k [00:00<00:00, 20.6kB/s]

from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image

# prepare image + question

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "How many cats are there?"

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# prepare inputs

encoding = processor(image, text, return_tensors="pt")

# forward pass

outputs = model(**encoding)
logits = outputs.logits
idx = logits.argmax(-1).item()
print("Predicted answer:", model.config.id2label[idx])

Predicted answer: 2
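The same ViLT snippet can run on a Colab T4 by moving both the model and the processor outputs to the GPU; a sketch:

import requests
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "How many cats are there?"

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to("cuda")

# The model and the tensors it consumes must live on the same device.
encoding = processor(image, text, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model(**encoding)
idx = outputs.logits.argmax(-1).item()
print("Predicted answer:", model.config.id2label[idx])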

[attached image: 000000039769.jpg]

How can I run this on a Colab T4, without flash-attn?
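For reference, a minimal flash-attn-free setup sketch for a 16 GB T4, assuming the checkpoint can be loaded at all (see the multi_modality error above) and that its remote code honours the standard from_pretrained arguments:

import torch
from transformers import AutoModelForCausalLM

model_id = "deepseek-ai/Janus-1.3B"  # assumption: loadable via trust_remote_code

# Half precision keeps a 1.3B model well inside the T4's 16 GB, and
# attn_implementation="eager" (or "sdpa") selects a plain PyTorch
# attention path, so flash-attn never has to be installed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
).to("cuda")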
