File size: 4,294 Bytes
497ac96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
{
  "builder_name": "common_voice_11_0",
  "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n",
  "config_name": "ur",
  "dataset_size": 57641379,
  "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 16413 validated hours of speech  in 100 languages, but more voices and languages are always added.",
  "download_checksums": {
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/n_shards.json": {
      "num_bytes": 12179,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/train/ur_train_0.tar": {
      "num_bytes": 110970880,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/dev/ur_dev_0.tar": {
      "num_bytes": 84695040,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/test/ur_test_0.tar": {
      "num_bytes": 84951040,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/other/ur_other_0.tar": {
      "num_bytes": 992716800,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/other/ur_other_1.tar": {
      "num_bytes": 874895360,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/other/ur_other_2.tar": {
      "num_bytes": 130252800,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/audio/ur/invalidated/ur_invalidated_0.tar": {
      "num_bytes": 91883520,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/transcript/ur/train.tsv": {
      "num_bytes": 1039872,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/transcript/ur/dev.tsv": {
      "num_bytes": 817949,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/transcript/ur/test.tsv": {
      "num_bytes": 806965,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/transcript/ur/other.tsv": {
      "num_bytes": 21175312,
      "checksum": null
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/main/transcript/ur/invalidated.tsv": {
      "num_bytes": 858871,
      "checksum": null
    }
  },
  "download_size": 2395076588,
  "features": {
    "audio": {
      "sampling_rate": 48000,
      "_type": "Audio"
    },
    "sentence": {
      "dtype": "string",
      "_type": "Value"
    }
  },
  "homepage": "https://commonvoice.mozilla.org/en/datasets",
  "license": "https://creativecommons.org/publicdomain/zero/1.0/",
  "size_in_bytes": 2452717967,
  "splits": {
    "train": {
      "name": "train",
      "num_bytes": 2410889,
      "num_examples": 4129,
      "dataset_name": "common_voice_11_0"
    },
    "validation": {
      "name": "validation",
      "num_bytes": 1901452,
      "num_examples": 3303,
      "dataset_name": "common_voice_11_0"
    },
    "test": {
      "name": "test",
      "num_bytes": 1896640,
      "num_examples": 3302,
      "dataset_name": "common_voice_11_0"
    },
    "other": {
      "name": "other",
      "num_bytes": 49446711,
      "num_examples": 85123,
      "dataset_name": "common_voice_11_0"
    },
    "invalidated": {
      "name": "invalidated",
      "num_bytes": 1985687,
      "num_examples": 3275,
      "dataset_name": "common_voice_11_0"
    }
  },
  "version": {
    "version_str": "11.0.0",
    "major": 11,
    "minor": 0,
    "patch": 0
  }
}