Puuje commited on
Commit
ce2f576
·
verified ·
1 Parent(s): bd6fccd

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,27 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - visual-question-answering
4
+ license: apache-2.0
5
+ widget:
6
+ - text: "What's the animal doing?"
7
+ src: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg"
8
+ - text: "What is on top of the building?"
9
+ src: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg"
10
+ ---
11
+
12
+ # Vision-and-Language Transformer (ViLT), fine-tuned on VQAv2
13
+
14
+ Vision-and-Language Transformer (ViLT) model fine-tuned on [VQAv2](https://visualqa.org/). It was introduced in the paper [ViLT: Vision-and-Language Transformer
15
+ Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Kim et al. and first released in [this repository](https://github.com/dandelin/ViLT).
16
+
17
+ Disclaimer: The team releasing ViLT did not write a model card for this model so this model card has been written by the Hugging Face team.
18
+
19
+ ## Intended uses & limitations
20
+
21
+ You can use the raw model for visual question answering.
22
+
23
+ ### How to use
24
+
25
+ Here is how to use this model in PyTorch:
26
+
27
+ ```python
28
+ from transformers import ViltProcessor, ViltForQuestionAnswering
29
+ import requests
30
+ from PIL import Image
31
+
32
+ # prepare image + question
33
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
34
+ image = Image.open(requests.get(url, stream=True).raw)
35
+ text = "How many cats are there?"
36
+
37
+ processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
38
+ model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
39
+
40
+ # prepare inputs
41
+ encoding = processor(image, text, return_tensors="pt")
42
+
43
+ # forward pass
44
+ outputs = model(**encoding)
45
+ logits = outputs.logits
46
+ idx = logits.argmax(-1).item()
47
+ print("Predicted answer:", model.config.id2label[idx])
48
+ ```
49
+
50
+ ## Training data
51
+
52
+ (to do)
53
+
54
+ ## Training procedure
55
+
56
+ ### Preprocessing
57
+
58
+ (to do)
59
+
60
+ ### Pretraining
61
+
62
+ (to do)
63
+
64
+ ## Evaluation results
65
+
66
+ (to do)
67
+
68
+ ### BibTeX entry and citation info
69
+
70
+ ```bibtex
71
+ @misc{kim2021vilt,
72
+ title={ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision},
73
+ author={Wonjae Kim and Bokyung Son and Ildoo Kim},
74
+ year={2021},
75
+ eprint={2102.03334},
76
+ archivePrefix={arXiv},
77
+ primaryClass={stat.ML}
78
+ }
79
+ ```
config.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_resize": true,
4
+ "feature_extractor_type": "ViltFeatureExtractor",
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_std": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "resample": 3,
16
+ "size": 384,
17
+ "size_divisor": 32
18
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d5f3409947b0369487ece7c5868f0040ceb67d25735dbb4ac5e99e03bab3a19
3
+ size 470435927
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 40, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff