Spaces: Sleeping
Commit message: return label encoder
Browse files — src/utils.py (+3, −2)
src/utils.py CHANGED
@@ -26,7 +26,7 @@ def get_datasetdict_object(df_train, df_val, df_test):
|
|
26 |
|
27 |
|
28 |
def tokenize(batch, tokenizer):
|
29 |
-
return tokenizer(batch["tweet"], padding='max_length', max_length=
|
30 |
|
31 |
|
32 |
def get_dataset(train_path:str, test_path:str, tokenizer):
|
@@ -46,7 +46,8 @@ def get_dataset(train_path:str, test_path:str, tokenizer):
|
|
46 |
dataset = dataset.map(lambda x: tokenize(x, tokenizer), batched=True)
|
47 |
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
|
48 |
|
49 |
-
return dataset
|
|
|
50 |
|
51 |
def serialize_data(data, output_path:str):
|
52 |
with open(output_path, "wb") as f:
|
|
|
26 |
|
27 |
|
28 |
def tokenize(batch, tokenizer):
    """Tokenize the "tweet" column of a dataset batch.

    Reconstructed from the "+" side of the diff: pads every example to the
    fixed length and truncates anything longer, so all outputs share one shape.

    Args:
        batch: Mapping with a "tweet" key holding a list of raw text strings
            (the shape `datasets.Dataset.map(..., batched=True)` passes in).
        tokenizer: A Hugging Face-style tokenizer callable accepting
            `padding`, `max_length`, and `truncation` keyword arguments.

    Returns:
        Whatever the tokenizer returns for the batch (typically a mapping
        with "input_ids" and "attention_mask"), padded/truncated to 768
        tokens.  NOTE(review): 768 is unusually long for tweets and equals a
        typical hidden size — confirm it was not confused with model dim.
    """
    return tokenizer(batch["tweet"], padding='max_length', max_length=768, truncation=True)
|
30 |
|
31 |
|
32 |
def get_dataset(train_path:str, test_path:str, tokenizer):
|
|
|
46 |
dataset = dataset.map(lambda x: tokenize(x, tokenizer), batched=True)
|
47 |
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
|
48 |
|
49 |
+
return dataset, encoder
|
50 |
+
|
51 |
|
52 |
def serialize_data(data, output_path:str):
|
53 |
with open(output_path, "wb") as f:
|