guipenedo HF staff committed on
Commit
9a7091f
·
unverified ·
1 Parent(s): baa687b

Georgian tokenizer and South Azerbaijani

Browse files
Files changed (2) hide show
  1. data/Kartvelian.json +21 -3
  2. data/Turkic.json +40 -2
data/Kartvelian.json CHANGED
@@ -29,16 +29,34 @@
29
  "iso_3_code": "kat",
30
  "children": [],
31
  "family": "Kartvelian",
32
- "tokenizers": {},
 
 
 
 
 
 
 
 
33
  "node_i": "4608",
34
- "native_tokenizers": [],
 
 
35
  "scripts": [
36
  "Geor"
37
  ]
38
  }
39
  ],
40
  "family": "Kartvelian",
41
- "tokenizers": {},
 
 
 
 
 
 
 
 
42
  "node_i": "4606",
43
  "native_tokenizers": [],
44
  "scripts": []
 
29
  "iso_3_code": "kat",
30
  "children": [],
31
  "family": "Kartvelian",
32
+ "tokenizers": {
33
+ "Geor": {
34
+ "full_object": "GeorgianTokenizer()",
35
+ "original_lang_name": "georgian",
36
+ "original_lang_code": "kat",
37
+ "script": "Geor",
38
+ "class_name": "GeorgianTokenizer"
39
+ }
40
+ },
41
  "node_i": "4608",
42
+ "native_tokenizers": [
43
+ "Geor"
44
+ ],
45
  "scripts": [
46
  "Geor"
47
  ]
48
  }
49
  ],
50
  "family": "Kartvelian",
51
+ "tokenizers": {
52
+ "Geor": {
53
+ "full_object": "GeorgianTokenizer()",
54
+ "original_lang_name": "georgian",
55
+ "original_lang_code": "kat",
56
+ "script": "Geor",
57
+ "class_name": "GeorgianTokenizer"
58
+ }
59
+ },
60
  "node_i": "4606",
61
  "native_tokenizers": [],
62
  "scripts": []
data/Turkic.json CHANGED
@@ -372,9 +372,19 @@
372
  "iso_3_code": "azb",
373
  "children": [],
374
  "family": "Turkic",
375
- "tokenizers": {},
 
 
 
 
 
 
 
 
376
  "node_i": "10583",
377
- "native_tokenizers": [],
 
 
378
  "scripts": [
379
  "Arab"
380
  ]
@@ -407,6 +417,13 @@
407
  ],
408
  "family": "Turkic",
409
  "tokenizers": {
 
 
 
 
 
 
 
410
  "Latn": {
411
  "full_object": "SpaCyTokenizer(\"az\")",
412
  "original_lang_name": "azerbaijani",
@@ -525,6 +542,13 @@
525
  "children": [],
526
  "family": "Turkic",
527
  "tokenizers": {
 
 
 
 
 
 
 
528
  "Latn": {
529
  "full_object": "SpaCyTokenizer(\"tr\")",
530
  "original_lang_name": "turkish",
@@ -544,6 +568,13 @@
544
  ],
545
  "family": "Turkic",
546
  "tokenizers": {
 
 
 
 
 
 
 
547
  "Latn": {
548
  "full_object": "SpaCyTokenizer(\"tr\")",
549
  "original_lang_name": "turkish",
@@ -559,6 +590,13 @@
559
  ],
560
  "family": "Turkic",
561
  "tokenizers": {
 
 
 
 
 
 
 
562
  "Latn": {
563
  "full_object": "SpaCyTokenizer(\"tr\")",
564
  "original_lang_name": "turkish",
 
372
  "iso_3_code": "azb",
373
  "children": [],
374
  "family": "Turkic",
375
+ "tokenizers": {
376
+ "Arab": {
377
+ "full_object": "SpaCyTokenizer(\"fa\")",
378
+ "original_lang_name": "persian",
379
+ "original_lang_code": "azb",
380
+ "script": "Arab",
381
+ "class_name": "SpaCyTokenizer"
382
+ }
383
+ },
384
  "node_i": "10583",
385
+ "native_tokenizers": [
386
+ "Arab"
387
+ ],
388
  "scripts": [
389
  "Arab"
390
  ]
 
417
  ],
418
  "family": "Turkic",
419
  "tokenizers": {
420
+ "Arab": {
421
+ "full_object": "SpaCyTokenizer(\"fa\")",
422
+ "original_lang_name": "persian",
423
+ "original_lang_code": "azb",
424
+ "script": "Arab",
425
+ "class_name": "SpaCyTokenizer"
426
+ },
427
  "Latn": {
428
  "full_object": "SpaCyTokenizer(\"az\")",
429
  "original_lang_name": "azerbaijani",
 
542
  "children": [],
543
  "family": "Turkic",
544
  "tokenizers": {
545
+ "Arab": {
546
+ "full_object": "SpaCyTokenizer(\"fa\")",
547
+ "original_lang_name": "persian",
548
+ "original_lang_code": "azb",
549
+ "script": "Arab",
550
+ "class_name": "SpaCyTokenizer"
551
+ },
552
  "Latn": {
553
  "full_object": "SpaCyTokenizer(\"tr\")",
554
  "original_lang_name": "turkish",
 
568
  ],
569
  "family": "Turkic",
570
  "tokenizers": {
571
+ "Arab": {
572
+ "full_object": "SpaCyTokenizer(\"fa\")",
573
+ "original_lang_name": "persian",
574
+ "original_lang_code": "azb",
575
+ "script": "Arab",
576
+ "class_name": "SpaCyTokenizer"
577
+ },
578
  "Latn": {
579
  "full_object": "SpaCyTokenizer(\"tr\")",
580
  "original_lang_name": "turkish",
 
590
  ],
591
  "family": "Turkic",
592
  "tokenizers": {
593
+ "Arab": {
594
+ "full_object": "SpaCyTokenizer(\"fa\")",
595
+ "original_lang_name": "persian",
596
+ "original_lang_code": "azb",
597
+ "script": "Arab",
598
+ "class_name": "SpaCyTokenizer"
599
+ },
600
  "Latn": {
601
  "full_object": "SpaCyTokenizer(\"tr\")",
602
  "original_lang_name": "turkish",