art-manuh committed on
Commit
cb24b6c
·
verified ·
1 Parent(s): d469dc8

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +9 -13
  2. app.py +30 -63
  3. requirements.txt +5 -1
  4. swahili.csv +117 -0
  5. swahili_dataset.csv +117 -0
  6. tokenizer.pickle +3 -0
  7. toxic.h5 +3 -0
  8. train.ipynb +240 -0
README.md CHANGED
@@ -1,13 +1,9 @@
1
- ---
2
- title: Maliza Uhalifu Mtandaoni
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- license: unknown
11
- ---
12
-
13
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
1
+ # Hate Speech Classification System
2
+
3
+ The model scores a text comment against five labels (toxic, severe_toxic, obscene threat, insult, identity_hate) and returns a True/False flag for each.
4
+
5
+ sdk: gradio
6
+ sdk_version: 3.36.1
7
+ app_file: app.py
8
+
9
+ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
 
 
 
 
app.py CHANGED
@@ -1,63 +1,30 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
- ],
59
- )
60
-
61
-
62
- if __name__ == "__main__":
63
- demo.launch()
 
1
+ import tensorflow as tf
2
+ import gradio as gr
3
+ from gradio.components import input
4
+ import pandas as pd
5
+ import pickle
6
+ from keras.utils import pad_sequences
7
+
8
# Sequence length passed to pad_sequences as `maxlen` when scoring a comment.
max_len = 200

# Load the pickled tokenizer and the trained Keras model from disk.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so tokenizer.pickle must only ever come from a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
model = tf.keras.models.load_model('toxic.h5')
14
+
15
+
16
# Output labels, matched positionally against the model's output columns.
# NOTE(review): "obscene threat" is a single label here (and a single column
# in the CSV header). If toxic.h5 emits separate "obscene" and "threat"
# columns, every label after index 2 is misaligned and the last model column
# is never reported -- confirm against the model's output shape.
arr = ["toxic", "severe_toxic", "obscene threat", "insult", "identity_hate"]


def score_comment(comment):
    """Classify one comment and return a per-label True/False report.

    Args:
        comment: Raw text to score.

    Returns:
        A string with one ``label: True`` / ``label: False`` line per entry
        in ``arr``; a label is True when its predicted score exceeds 0.5.
    """
    # The tokenizer expects a batch, so wrap the single comment in a list.
    sequences = tokenizer.texts_to_sequences([comment])
    inp = pad_sequences(sequences, maxlen=max_len)
    # Only one comment was submitted, so only the first prediction row is used.
    scores = model.predict(inp)[0]
    return ''.join(
        '{}: {}\n'.format(label, score > 0.5)
        for label, score in zip(arr, scores)
    )
25
+
26
# Single-textbox UI: the comment goes in, the per-label report comes out as
# plain text. gr.Textbox is used instead of the deprecated gr.inputs.Textbox
# namespace, which no longer exists in Gradio 4.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)

interface.launch(debug=True, share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1 +1,5 @@
1
- huggingface_hub==0.22.2
 
 
 
 
 
1
+ gradio==3.36.1
2
+ keras==2.13.1
3
+ pandas==2.0.3
4
+ Pillow==10.0.0
5
+ tensorflow==2.13.0
swahili.csv ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ comment,toxic,severe_toxic,obscene threat,insult,identity_hate
2
+ Huyu msee ni malaya,1,0,0,1,0
3
+ Zakayo ni mbwa,0,0,0,1,0
4
+ Chira ni shoga,1,1,0,1,0
5
+ Akinirushia mamdomo mi ntamuua,1,1,1,0,0
6
+ Haga sana,0,0,0,1,0
7
+ Kuma ya mamako,1,1,0,1,0
8
+ Wewe ni mjinga sana!,1,0,0,1,0
9
+ Nataka nikuchape vibaya!,0,1,0,0,0
10
+ Huo ni upuuzi mtupu!,1,0,0,0,0
11
+ Wewe ni takataka!,0,0,0,0,1
12
+ Nitakuua!,0,0,1,0,0
13
+ Unakera sana!,1,0,0,1,0
14
+ Unazungumza kama mpumbavu!,1,0,0,0,0
15
+ Wewe ni mchafu!,0,0,0,1,0
16
+ Nitaharibu maisha yako!,0,0,1,0,0
17
+ Hiyo ni ujinga wa hali ya juu!,1,0,0,0,0
18
+ Wewe ni nani kuzungumza hivyo?,0,0,0,1,0
19
+ Mimi nitakumaliza!,0,1,1,0,0
20
+ Una sura mbaya sana!,0,0,0,1,0
21
+ Usinieleze mambo yako ya kijinga!,1,0,0,0,0
22
+ Wewe ni kichekesho!,0,0,0,1,0
23
+ Nitakupiga hadi utoe damu!,0,1,0,0,0
24
+ Wewe ni taka taka!,1,0,0,1,0
25
+ Ondoka hapa!,1,0,0,0,0
26
+ Nitakuvunja miguu!,0,1,1,0,0
27
+ Wewe ni mpuuzi tu!,1,0,0,1,0
28
+ Nitakuchoma moto!,0,1,1,0,0
29
+ Unafanya mambo ya kipumbavu!,1,0,0,0,0
30
+ Nakuona ni kama mavi ya mbwa!,0,0,0,1,0
31
+ Usiniletee upuuzi wako!,1,0,0,0,0
32
+ Utakufa maskini!,0,0,0,1,0
33
+ Wewe ni bure kabisa!,1,0,0,0,0
34
+ Nitakufanya ujutie kuzaliwa kwako!,0,1,1,0,0
35
+ Wewe ni kama paka wa mtaani!,0,0,0,1,0
36
+ Nitaharibu uso wako!,0,1,1,0,0
37
+ Unafikiri wewe ni nani?,1,0,0,0,0
38
+ Hakuna mtu anayekupenda!,1,0,0,0,0
39
+ Unapaswa kufa!,0,0,1,0,0
40
+ Wewe ni mnyama tu!,1,0,0,0,0
41
+ Wewe ni mjinga na kipumbavu!,1,0,0,1,0
42
+ Nitakupiga vibaya hadi ulie!,0,1,0,0,0
43
+ Usiniletee upumbavu wako!,1,0,0,0,0
44
+ Wewe ni bure kama taka!,1,0,0,1,0
45
+ Wewe ni kipande cha mbwa!,1,0,0,1,0
46
+ Nitaharibu kila kitu kwako!,0,1,1,0,0
47
+ Wewe ni mfano wa upumbavu!,1,0,0,0,0
48
+ Unakera kama mbu!,1,0,0,1,0
49
+ Wewe ni shetani!,1,0,0,0,0
50
+ Nitakutoa roho!,0,1,1,0,0
51
+ Hakuna mtu anataka kukuona!,1,0,0,0,0
52
+ Wewe ni mbwa mwitu!,1,0,0,0,0
53
+ Nitakupiga ngumi!,0,1,0,0,0
54
+ Wewe ni sumu!,1,0,0,0,0
55
+ Nitachoma nyumba yako!,0,1,1,0,0
56
+ Wewe ni mfano wa ujinga!,1,0,0,0,0
57
+ Wewe ni kama kinyesi!,1,0,0,1,0
58
+ Nitaharibu maisha yako!,0,1,1,0,0
59
+ Wewe ni bure kabisa!,1,0,0,1,0
60
+ Wewe ni kama panya!,1,0,0,0,0
61
+ Nakuona kama panya!,1,0,0,1,0
62
+ Wewe ni kipande cha mavi!,1,0,0,1,0
63
+ Wewe ni chizi!,1,0,0,0,0
64
+ Wewe ni mshenzi!,1,0,0,1,0
65
+ Nitakupiga teke!,0,1,0,0,0
66
+ Wewe ni bure kama takataka!,1,0,0,1,0
67
+ Wewe ni kama kuku wa mtaani!,1,0,0,0,0
68
+ Nitakukata kichwa!,0,1,1,0,0
69
+ Wewe ni bure kabisa kama takataka!,1,0,0,1,0
70
+ Wewe ni mwehu!,1,0,0,0,0
71
+ Nitakutupa jalalani!,0,1,1,0,0
72
+ Wewe ni kipande cha taka!,1,0,0,1,0
73
+ Wewe ni kama mbwa mwitu!,1,0,0,0,0
74
+ Wewe ni mfano wa pumbavu!,1,0,0,0,0
75
+ Nitakupiga teke hadi ufe!,0,1,0,0,0
76
+ Wewe ni mfano wa upuuzi!,1,0,0,0,0
77
+ Wewe ni kipande cha taka!,1,0,0,1,0
78
+ Wewe ni bure kama kinyesi!,1,0,0,1,0
79
+ Wewe ni mnyama!,1,0,0,0,0
80
+ Nitakupiga hadi utoe machozi!,0,1,0,0,0
81
+ Wewe ni kichekesho cha dunia!,1,0,0,1,0
82
+ Wewe ni takataka ya mtaa!,1,0,0,1,0
83
+ Nitakupiga na risasi!,0,1,1,0,0
84
+ Wewe ni mfano wa upumbavu!,1,0,0,0,0
85
+ Wewe ni kama mbwa!,1,0,0,0,0
86
+ Nitaharibu uso wako!,0,1,1,0,0
87
+ Wewe ni kama paka wa mtaani!,1,0,0,0,0
88
+ Nitakufanya ujutie kuzaliwa kwako!,0,1,1,0,0
89
+ Wewe ni kipande cha mbwa!,1,0,0,1,0
90
+ Nitakupiga vibaya hadi ulie!,0,1,0,0,0
91
+ Wewe ni kipande cha takataka!,1,0,0,1,0
92
+ Wewe ni sumu kwa jamii!,1,0,0,0,0
93
+ Nitakuchoma moto!,0,1,1,0,0
94
+ Wewe ni kichekesho cha mtaa!,1,0,0,1,0
95
+ Wewe ni kama kinyesi cha mbwa!,1,0,0,1,0
96
+ Nitakuvunja miguu!,0,1,1,0,0
97
+ Wewe ni takataka kabisa!,1,0,0,1,0
98
+ Wewe ni kama mavi ya mbwa!,1,0,0,1,0
99
+ Nitakupiga hadi utoe damu!,0,1,0,0,0
100
+ Ukiniletea mamdomo mi ntakumada,1,1,1,1,1
101
+ Madem wa Kenya ni malaya,1,1,1,1,1
102
+ Uskii jamaa anatupa haga,1,1,0,1,0
103
+ Sura kama punda,1,0,0,1,0
104
+ Manze ngeus ni nyangumi,1,1,0,1,1
105
+ Si unakaa poa aki,0,0,0,0,0
106
+ Huyo chali ni mpoa,0,0,0,0,0
107
+ Ngeus amenidrive crazy,0,0,0,0,0
108
+ Johnte ni fala achana tu na yeye,1,0,1,1,1
109
+ Nairobii kumejaa malaya,1,1,1,1,1
110
+ Ngeus wake wake ni hoe,1,1,0,1,1
111
+ Dem wake ni whore,1,1,1,1,1
112
+ Manze fuck.,0,1,0,0,0
113
+ Jamaa anapenda nginyo sana,1,1,0,1,0
114
+ Jamaa anastream gwati sana,1,0,1,1,1
115
+ Njoti yake ni kama maziwa mala,1,1,0,1,1
116
+ Niaje bro,0,0,0,0,0
117
+ Kamaa ako na ufala sana,1,0,0,1,0
swahili_dataset.csv ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ comment,toxic,severe_toxic,obscene threat,insult,identity_hate
2
+ Huyu msee ni malaya,1,0,0,1,0
3
+ Zakayo ni mbwa,0,0,0,1,0
4
+ Chira ni shoga,1,1,0,1,0
5
+ Akinirushia mamdomo mi ntamuua,1,1,1,0,0
6
+ Haga sana,0,0,0,1,0
7
+ Kuma ya mamako,1,1,0,1,0
8
+ Wewe ni mjinga sana!,1,0,0,1,0
9
+ Nataka nikuchape vibaya!,0,1,0,0,0
10
+ Huo ni upuuzi mtupu!,1,0,0,0,0
11
+ Wewe ni takataka!,0,0,0,0,1
12
+ Nitakuua!,0,0,1,0,0
13
+ Unakera sana!,1,0,0,1,0
14
+
15
+ Unazungumza kama mpumbavu!,1,0,0,0,0
16
+ Wewe ni mchafu!,0,0,0,1,0
17
+ Nitaharibu maisha yako!,0,0,1,0,0
18
+ Hiyo ni ujinga wa hali ya juu!,1,0,0,0,0
19
+ Wewe ni nani kuzungumza hivyo?,0,0,0,1,0
20
+ Mimi nitakumaliza!,0,1,1,0,0
21
+ Una sura mbaya sana!,0,0,0,1,0
22
+ Usinieleze mambo yako ya kijinga!,1,0,0,0,0
23
+ Wewe ni kichekesho!,0,0,0,1,0
24
+ Nitakupiga hadi utoe damu!,0,1,0,0,0
25
+ Wewe ni taka taka!,1,0,0,1,0
26
+ Ondoka hapa!,1,0,0,0,0
27
+ Nitakuvunja miguu!,0,1,1,0,0
28
+ Wewe ni mpuuzi tu!,1,0,0,1,0
29
+ Nitakuchoma moto!,0,1,1,0,0
30
+ Unafanya mambo ya kipumbavu!,1,0,0,0,0
31
+ Nakuona ni kama mavi ya mbwa!,0,0,0,1,0
32
+ Usiniletee upuuzi wako!,1,0,0,0,0
33
+ Utakufa maskini!,0,0,0,1,0
34
+ Wewe ni bure kabisa!,1,0,0,0,0
35
+ Nitakufanya ujutie kuzaliwa kwako!,0,1,1,0,0
36
+ Wewe ni kama paka wa mtaani!,0,0,0,1,0
37
+ Nitaharibu uso wako!,0,1,1,0,0
38
+ Unafikiri wewe ni nani?,1,0,0,0,0
39
+ Hakuna mtu anayekupenda!,1,0,0,0,0
40
+ Unapaswa kufa!,0,0,1,0,0
41
+ Wewe ni mnyama tu!,1,0,0,0,0
42
+ Wewe ni mjinga na kipumbavu!,1,0,0,1,0
43
+ Nitakupiga vibaya hadi ulie!,0,1,0,0,0
44
+ Usiniletee upumbavu wako!,1,0,0,0,0
45
+ Wewe ni bure kama taka!,1,0,0,1,0
46
+ Wewe ni kipande cha mbwa!,1,0,0,1,0
47
+ Nitaharibu kila kitu kwako!,0,1,1,0,0
48
+ Wewe ni mfano wa upumbavu!,1,0,0,0,0
49
+ Unakera kama mbu!,1,0,0,1,0
50
+ Wewe ni shetani!,1,0,0,0,0
51
+ Nitakutoa roho!,0,1,1,0,0
52
+ Hakuna mtu anataka kukuona!,1,0,0,0,0
53
+ Wewe ni mbwa mwitu!,1,0,0,0,0
54
+ Nitakupiga ngumi!,0,1,0,0,0
55
+ Wewe ni sumu!,1,0,0,0,0
56
+ Nitachoma nyumba yako!,0,1,1,0,0
57
+ Wewe ni mfano wa ujinga!,1,0,0,0,0
58
+ Wewe ni kama kinyesi!,1,0,0,1,0
59
+ Nitaharibu maisha yako!,0,1,1,0,0
60
+ Wewe ni bure kabisa!,1,0,0,1,0
61
+ Wewe ni kama panya!,1,0,0,0,0
62
+ Nakuona kama panya!,1,0,0,1,0
63
+ Wewe ni kipande cha mavi!,1,0,0,1,0
64
+ Wewe ni chizi!,1,0,0,0,0
65
+ Wewe ni mshenzi!,1,0,0,1,0
66
+ Nitakupiga teke!,0,1,0,0,0
67
+ Wewe ni bure kama takataka!,1,0,0,1,0
68
+ Wewe ni kama kuku wa mtaani!,1,0,0,0,0
69
+ Nitakukata kichwa!,0,1,1,0,0
70
+ Wewe ni bure kabisa kama takataka!,1,0,0,1,0
71
+ Wewe ni mwehu!,1,0,0,0,0
72
+ Nitakutupa jalalani!,0,1,1,0,0
73
+ Wewe ni kipande cha taka!,1,0,0,1,0
74
+ Wewe ni kama mbwa mwitu!,1,0,0,0,0
75
+ Wewe ni mfano wa pumbavu!,1,0,0,0,0
76
+ Nitakupiga teke hadi ufe!,0,1,0,0,0
77
+ Wewe ni mfano wa upuuzi!,1,0,0,0,0
78
+ Wewe ni kipande cha taka!,1,0,0,1,0
79
+ Wewe ni bure kama kinyesi!,1,0,0,1,0
80
+ Wewe ni mnyama!,1,0,0,0,0
81
+ Nitakupiga hadi utoe machozi!,0,1,0,0,0
82
+ Wewe ni kichekesho cha dunia!,1,0,0,1,0
83
+ Wewe ni takataka ya mtaa!,1,0,0,1,0
84
+ Nitakupiga na risasi!,0,1,1,0,0
85
+ Wewe ni mfano wa upumbavu!,1,0,0,0,0
86
+ Wewe ni kama mbwa!,1,0,0,0,0
87
+ Nitaharibu uso wako!,0,1,1,0,0
88
+ Wewe ni kama paka wa mtaani!,1,0,0,0,0
89
+ Nitakufanya ujutie kuzaliwa kwako!,0,1,1,0,0
90
+ Wewe ni kipande cha mbwa!,1,0,0,1,0
91
+ Nitakupiga vibaya hadi ulie!,0,1,0,0,0
92
+ Wewe ni kipande cha takataka!,1,0,0,1,0
93
+ Wewe ni sumu kwa jamii!,1,0,0,0,0
94
+ Nitakuchoma moto!,0,1,1,0,0
95
+ Wewe ni kichekesho cha mtaa!,1,0,0,1,0
96
+ Wewe ni kama kinyesi cha mbwa!,1,0,0,1,0
97
+ Nitakuvunja miguu!,0,1,1,0,0
98
+ Wewe ni takataka kabisa!,1,0,0,1,0
99
+ Wewe ni kama mavi ya mbwa!,1,0,0,1,0
100
+ Nitakupiga hadi utoe damu!,0,1,0,0,0
101
+ Ukiniletea mamdomo mi ntakumada,1,1,1,1,1
102
+ Madem wa Kenya ni malaya,1,1,1,1,1
103
+ Uskii jamaa anatupa haga,1,1,0,1,0
104
+ Sura kama punda,1,0,0,1,0
105
+ Manze ngeus ni nyangumi,1,1,0,1,1
106
+ Si unakaa poa aki,0,0,0,0,0
107
+ Huyo chali ni mpoa,0,0,0,0,0
108
+ Ngeus amenidrive crazy,0,0,0,0,0
109
+ Johnte ni fala achana tu na yeye,1,0,1,1,1
110
+ Nairobii kumejaa malaya,1,1,1,1,1
111
+ Ngeus wake wake ni hoe,1,1,0,1,1
112
+ Dem wake ni whore,1,0,0,,1
113
+ Manze fuck.,0,1,0,0,0
114
+ Jamaa anapenda nginyo sana,1,1,0,1,0
115
+ Jamaa anastream gwati sana,1,0,1,1,1
116
+ Njoti yake ni kama maziwa mala,1,1,0,1,1
117
+
tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d77d70fdcb351caea5ee6d9dfbd607f61ac419b4a04ec521d84605bbc9f41165
3
+ size 7740158
toxic.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b4cc477172e73a10c43cc26bc3c628a71b2a2a6c145b5edf6a8ce42d4905e8
3
+ size 1816640
train.ipynb ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Add Additional Datasets to The Model."
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import pandas as pd\n",
17
+ "import numpy as np\n",
18
+ "from sklearn.model_selection import train_test_split\n",
19
+ "# from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
20
+ "import tensorflow as tf\n",
21
+ "import pickle"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 13,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>comment</th>\n",
51
+ " <th>toxic</th>\n",
52
+ " <th>severe_toxic</th>\n",
53
+ " <th>obscene threat</th>\n",
54
+ " <th>insult</th>\n",
55
+ " <th>identity_hate</th>\n",
56
+ " </tr>\n",
57
+ " </thead>\n",
58
+ " <tbody>\n",
59
+ " <tr>\n",
60
+ " <th>0</th>\n",
61
+ " <td>Huyu msee ni malaya</td>\n",
62
+ " <td>1</td>\n",
63
+ " <td>0</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>1.0</td>\n",
66
+ " <td>0</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>1</th>\n",
70
+ " <td>Zakayo ni mbwa</td>\n",
71
+ " <td>0</td>\n",
72
+ " <td>0</td>\n",
73
+ " <td>0</td>\n",
74
+ " <td>1.0</td>\n",
75
+ " <td>0</td>\n",
76
+ " </tr>\n",
77
+ " <tr>\n",
78
+ " <th>2</th>\n",
79
+ " <td>Chira ni shoga</td>\n",
80
+ " <td>1</td>\n",
81
+ " <td>1</td>\n",
82
+ " <td>0</td>\n",
83
+ " <td>1.0</td>\n",
84
+ " <td>0</td>\n",
85
+ " </tr>\n",
86
+ " <tr>\n",
87
+ " <th>3</th>\n",
88
+ " <td>Akinirushia mamdomo mi ntamuua</td>\n",
89
+ " <td>1</td>\n",
90
+ " <td>1</td>\n",
91
+ " <td>1</td>\n",
92
+ " <td>0.0</td>\n",
93
+ " <td>0</td>\n",
94
+ " </tr>\n",
95
+ " <tr>\n",
96
+ " <th>4</th>\n",
97
+ " <td>Haga sana</td>\n",
98
+ " <td>0</td>\n",
99
+ " <td>0</td>\n",
100
+ " <td>0</td>\n",
101
+ " <td>1.0</td>\n",
102
+ " <td>0</td>\n",
103
+ " </tr>\n",
104
+ " <tr>\n",
105
+ " <th>5</th>\n",
106
+ " <td>Kuma ya mamako</td>\n",
107
+ " <td>1</td>\n",
108
+ " <td>1</td>\n",
109
+ " <td>0</td>\n",
110
+ " <td>1.0</td>\n",
111
+ " <td>0</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>6</th>\n",
115
+ " <td>Wewe ni mjinga sana!</td>\n",
116
+ " <td>1</td>\n",
117
+ " <td>0</td>\n",
118
+ " <td>0</td>\n",
119
+ " <td>1.0</td>\n",
120
+ " <td>0</td>\n",
121
+ " </tr>\n",
122
+ " <tr>\n",
123
+ " <th>7</th>\n",
124
+ " <td>Nataka nikuchape vibaya!</td>\n",
125
+ " <td>0</td>\n",
126
+ " <td>1</td>\n",
127
+ " <td>0</td>\n",
128
+ " <td>0.0</td>\n",
129
+ " <td>0</td>\n",
130
+ " </tr>\n",
131
+ " <tr>\n",
132
+ " <th>8</th>\n",
133
+ " <td>Huo ni upuuzi mtupu!</td>\n",
134
+ " <td>1</td>\n",
135
+ " <td>0</td>\n",
136
+ " <td>0</td>\n",
137
+ " <td>0.0</td>\n",
138
+ " <td>0</td>\n",
139
+ " </tr>\n",
140
+ " <tr>\n",
141
+ " <th>9</th>\n",
142
+ " <td>Wewe ni takataka!</td>\n",
143
+ " <td>0</td>\n",
144
+ " <td>0</td>\n",
145
+ " <td>0</td>\n",
146
+ " <td>0.0</td>\n",
147
+ " <td>1</td>\n",
148
+ " </tr>\n",
149
+ " </tbody>\n",
150
+ "</table>\n",
151
+ "</div>"
152
+ ],
153
+ "text/plain": [
154
+ " comment toxic severe_toxic obscene threat \\\n",
155
+ "0 Huyu msee ni malaya 1 0 0 \n",
156
+ "1 Zakayo ni mbwa 0 0 0 \n",
157
+ "2 Chira ni shoga 1 1 0 \n",
158
+ "3 Akinirushia mamdomo mi ntamuua 1 1 1 \n",
159
+ "4 Haga sana 0 0 0 \n",
160
+ "5 Kuma ya mamako 1 1 0 \n",
161
+ "6 Wewe ni mjinga sana! 1 0 0 \n",
162
+ "7 Nataka nikuchape vibaya! 0 1 0 \n",
163
+ "8 Huo ni upuuzi mtupu! 1 0 0 \n",
164
+ "9 Wewe ni takataka! 0 0 0 \n",
165
+ "\n",
166
+ " insult identity_hate \n",
167
+ "0 1.0 0 \n",
168
+ "1 1.0 0 \n",
169
+ "2 1.0 0 \n",
170
+ "3 0.0 0 \n",
171
+ "4 1.0 0 \n",
172
+ "5 1.0 0 \n",
173
+ "6 1.0 0 \n",
174
+ "7 0.0 0 \n",
175
+ "8 0.0 0 \n",
176
+ "9 0.0 1 "
177
+ ]
178
+ },
179
+ "execution_count": 13,
180
+ "metadata": {},
181
+ "output_type": "execute_result"
182
+ }
183
+ ],
184
+ "source": [
185
+ "dataframe = pd.read_csv('swahili.csv')\n",
186
+ "\n",
187
+ "texts = dataframe['comment'].values\n",
188
+ "labels = dataframe[['toxic', 'severe_toxic', 'obscene threat', 'insult', 'identity_hate']].values\n",
189
+ "\n",
190
+ "dataframe.head(10)"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# Preprocess and prepare the data for training\n",
200
+ "max_len = 200\n",
201
+ "\n",
202
+ "# Load the tokenizer\n",
203
+ "with open('tokenizer.pickle', 'rb') as handle:\n",
204
+ " tokenizer = pickle.load(handle)\n",
205
+ "\n",
206
+ "# Tokenize the texts, then pad each sequence to max_len\n",
207
+ "sequences = tokenizer.texts_to_sequences(texts)\n",
208
+ "padded_sequences = pad_sequences(sequences, maxlen=max_len)\n",
209
+ "\n",
210
+ "# Split the data into training and validation sets\n",
211
+ "X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)\n",
212
+ "\n",
213
+ "# Create TensorFlow datasets\n",
214
+ "train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)\n",
215
+ "val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(32)\n"
216
+ ]
217
+ }
218
+ ],
219
+ "metadata": {
220
+ "kernelspec": {
221
+ "display_name": "base",
222
+ "language": "python",
223
+ "name": "python3"
224
+ },
225
+ "language_info": {
226
+ "codemirror_mode": {
227
+ "name": "ipython",
228
+ "version": 3
229
+ },
230
+ "file_extension": ".py",
231
+ "mimetype": "text/x-python",
232
+ "name": "python",
233
+ "nbconvert_exporter": "python",
234
+ "pygments_lexer": "ipython3",
235
+ "version": "3.11.7"
236
+ }
237
+ },
238
+ "nbformat": 4,
239
+ "nbformat_minor": 2
240
+ }