# Fine-tune FinBERT (ProsusAI/finbert) for financial sentiment classification.
import os

# Set before transformers/tokenizers are imported so they take effect:
# silence the tokenizers fork-parallelism warning and disable W&B logging.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['WANDB_DISABLED'] = "true"

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification
)
from datasets import Dataset
#######################################
########## FinBERT training ###########
#######################################
class args:
    """Run configuration, accessed as plain class attributes (e.g. ``args.model``)."""
    # Hugging Face Hub id of the pretrained checkpoint to fine-tune.
    model = "ProsusAI/finbert"
# ---------------------------------------------------------------------------
# Data loading and train/validation/test splitting.
# ---------------------------------------------------------------------------
# CSV layout: first column = sentiment label, second column = sentence.
# ISO-8859-1 handles the non-UTF-8 characters present in this dataset.
df = pd.read_csv('all-data.csv',
                 names=['labels', 'messages'],
                 encoding='ISO-8859-1')
df = df[['messages', 'labels']]

# Map string sentiment labels to integer class ids the model expects.
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])

X, y = df['messages'].values, df['labels'].values

# A fixed random_state makes the splits reproducible across runs, and
# stratifying preserves the class distribution in every split — the
# original calls had neither, so each run trained/evaluated on a
# different, possibly class-skewed partition.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain, test_size=0.2, random_state=42, stratify=ytrain)

train_dataset_raw = Dataset.from_dict({'text': xtrain, 'labels': ytrain})
valid_dataset_raw = Dataset.from_dict({'text': xvalid, 'labels': yvalid})
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(args.model)

def tokenize_fn(examples):
    """Tokenize a batch of raw texts; padding is deferred to the collator."""
    return tokenizer(examples['text'], truncation=True)

# Tokenize both splits up front; padding happens per batch at collate time.
train_dataset, valid_dataset = (
    ds.map(tokenize_fn, batched=True)
    for ds in (train_dataset_raw, valid_dataset_raw)
)
data_collator = DataCollatorWithPadding(tokenizer)
# Pretrained FinBERT with its sequence-classification head.
# NOTE(review): assumes the checkpoint's label count matches the number of
# classes the LabelEncoder produced — confirm (FinBERT ships with 3 labels).
model = AutoModelForSequenceClassification.from_pretrained(args.model)
train_args = TrainingArguments(
    './Finbert Trained/',             # output_dir for checkpoints and logs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2*16,  # eval stores no gradients, so 2x batch fits
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,                 # linear LR warmup over the first 10% of steps
    do_eval=True,
    do_train=True,
    do_predict=True,
    evaluation_strategy='epoch',      # validate at the end of every epoch
    save_strategy="no",               # no checkpointing; weights are saved explicitly later
)
# Wire everything into the Trainer; keyword arguments make the wiring explicit.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# Persist the fine-tuned weights and the tokenizer for later inference.
model.save_pretrained('fine_tuned_FinBERT')
tokenizer.save_pretrained("fine_tuned_FinBERT/tokenizer/")