Georgian tokenizer and South Azerbaijani
Browse files- data/Kartvelian.json +21 -3
- data/Turkic.json +40 -2
data/Kartvelian.json
CHANGED
@@ -29,16 +29,34 @@
|
|
29 |
"iso_3_code": "kat",
|
30 |
"children": [],
|
31 |
"family": "Kartvelian",
|
32 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
"node_i": "4608",
|
34 |
-
"native_tokenizers": [
|
|
|
|
|
35 |
"scripts": [
|
36 |
"Geor"
|
37 |
]
|
38 |
}
|
39 |
],
|
40 |
"family": "Kartvelian",
|
41 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
"node_i": "4606",
|
43 |
"native_tokenizers": [],
|
44 |
"scripts": []
|
|
|
29 |
"iso_3_code": "kat",
|
30 |
"children": [],
|
31 |
"family": "Kartvelian",
|
32 |
+
"tokenizers": {
|
33 |
+
"Geor": {
|
34 |
+
"full_object": "GeorgianTokenizer()",
|
35 |
+
"original_lang_name": "georgian",
|
36 |
+
"original_lang_code": "kat",
|
37 |
+
"script": "Geor",
|
38 |
+
"class_name": "GeorgianTokenizer"
|
39 |
+
}
|
40 |
+
},
|
41 |
"node_i": "4608",
|
42 |
+
"native_tokenizers": [
|
43 |
+
"Geor"
|
44 |
+
],
|
45 |
"scripts": [
|
46 |
"Geor"
|
47 |
]
|
48 |
}
|
49 |
],
|
50 |
"family": "Kartvelian",
|
51 |
+
"tokenizers": {
|
52 |
+
"Geor": {
|
53 |
+
"full_object": "GeorgianTokenizer()",
|
54 |
+
"original_lang_name": "georgian",
|
55 |
+
"original_lang_code": "kat",
|
56 |
+
"script": "Geor",
|
57 |
+
"class_name": "GeorgianTokenizer"
|
58 |
+
}
|
59 |
+
},
|
60 |
"node_i": "4606",
|
61 |
"native_tokenizers": [],
|
62 |
"scripts": []
|
data/Turkic.json
CHANGED
@@ -372,9 +372,19 @@
|
|
372 |
"iso_3_code": "azb",
|
373 |
"children": [],
|
374 |
"family": "Turkic",
|
375 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
"node_i": "10583",
|
377 |
-
"native_tokenizers": [
|
|
|
|
|
378 |
"scripts": [
|
379 |
"Arab"
|
380 |
]
|
@@ -407,6 +417,13 @@
|
|
407 |
],
|
408 |
"family": "Turkic",
|
409 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
"Latn": {
|
411 |
"full_object": "SpaCyTokenizer(\"az\")",
|
412 |
"original_lang_name": "azerbaijani",
|
@@ -525,6 +542,13 @@
|
|
525 |
"children": [],
|
526 |
"family": "Turkic",
|
527 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
"Latn": {
|
529 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
530 |
"original_lang_name": "turkish",
|
@@ -544,6 +568,13 @@
|
|
544 |
],
|
545 |
"family": "Turkic",
|
546 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
547 |
"Latn": {
|
548 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
549 |
"original_lang_name": "turkish",
|
@@ -559,6 +590,13 @@
|
|
559 |
],
|
560 |
"family": "Turkic",
|
561 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
562 |
"Latn": {
|
563 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
564 |
"original_lang_name": "turkish",
|
|
|
372 |
"iso_3_code": "azb",
|
373 |
"children": [],
|
374 |
"family": "Turkic",
|
375 |
+
"tokenizers": {
|
376 |
+
"Arab": {
|
377 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
378 |
+
"original_lang_name": "persian",
|
379 |
+
"original_lang_code": "azb",
|
380 |
+
"script": "Arab",
|
381 |
+
"class_name": "SpaCyTokenizer"
|
382 |
+
}
|
383 |
+
},
|
384 |
"node_i": "10583",
|
385 |
+
"native_tokenizers": [
|
386 |
+
"Arab"
|
387 |
+
],
|
388 |
"scripts": [
|
389 |
"Arab"
|
390 |
]
|
|
|
417 |
],
|
418 |
"family": "Turkic",
|
419 |
"tokenizers": {
|
420 |
+
"Arab": {
|
421 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
422 |
+
"original_lang_name": "persian",
|
423 |
+
"original_lang_code": "azb",
|
424 |
+
"script": "Arab",
|
425 |
+
"class_name": "SpaCyTokenizer"
|
426 |
+
},
|
427 |
"Latn": {
|
428 |
"full_object": "SpaCyTokenizer(\"az\")",
|
429 |
"original_lang_name": "azerbaijani",
|
|
|
542 |
"children": [],
|
543 |
"family": "Turkic",
|
544 |
"tokenizers": {
|
545 |
+
"Arab": {
|
546 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
547 |
+
"original_lang_name": "persian",
|
548 |
+
"original_lang_code": "azb",
|
549 |
+
"script": "Arab",
|
550 |
+
"class_name": "SpaCyTokenizer"
|
551 |
+
},
|
552 |
"Latn": {
|
553 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
554 |
"original_lang_name": "turkish",
|
|
|
568 |
],
|
569 |
"family": "Turkic",
|
570 |
"tokenizers": {
|
571 |
+
"Arab": {
|
572 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
573 |
+
"original_lang_name": "persian",
|
574 |
+
"original_lang_code": "azb",
|
575 |
+
"script": "Arab",
|
576 |
+
"class_name": "SpaCyTokenizer"
|
577 |
+
},
|
578 |
"Latn": {
|
579 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
580 |
"original_lang_name": "turkish",
|
|
|
590 |
],
|
591 |
"family": "Turkic",
|
592 |
"tokenizers": {
|
593 |
+
"Arab": {
|
594 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
595 |
+
"original_lang_name": "persian",
|
596 |
+
"original_lang_code": "azb",
|
597 |
+
"script": "Arab",
|
598 |
+
"class_name": "SpaCyTokenizer"
|
599 |
+
},
|
600 |
"Latn": {
|
601 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
602 |
"original_lang_name": "turkish",
|