from datasets import load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Step 1: Load a dataset with English-Hindi sentence pairs.
# Note: opus_books may not ship an "en-hi" configuration; replace this with
# a corpus that does (e.g. an IITB English-Hindi corpus) or with your own data.
dataset = load_dataset("opus_books", "en-hi")

# Datasets like opus_books provide only a "train" split, so carve out a test set.
dataset = dataset["train"].train_test_split(test_size=0.1)

# Step 2: Load the pre-trained MarianMT model for English-Hindi translation.
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Step 3: Tokenization function. Translation datasets store each pair in a
# "translation" dict; tokenize the English source and the Hindi target, and
# attach the target token ids as labels so the trainer can compute the loss.
def tokenize_function(examples):
    sources = [pair["en"] for pair in examples["translation"]]
    targets = [pair["hi"] for pair in examples["translation"]]
    model_inputs = tokenizer(sources, truncation=True, max_length=128)
    labels = tokenizer(text_target=targets, truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset, dropping the raw text columns.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Pad inputs dynamically per batch and pad labels with -100 so padding
# positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 4: Prepare training arguments.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Step 5: Define a trainer for sequence-to-sequence fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Step 6: Start training.
trainer.train()

# Step 7: Save the fine-tuned model (and tokenizer, since it was passed to the trainer).
trainer.save_model("fine_tuned_en_to_hi_model")
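Once training finishes, the saved directory can be reloaded for inference. A minimal sketch, assuming the save path "fine_tuned_en_to_hi_model" from the script above (the example sentence is illustrative):

from transformers import MarianMTModel, MarianTokenizer

# Reload the fine-tuned weights and tokenizer from the saved directory.
model = MarianMTModel.from_pretrained("fine_tuned_en_to_hi_model")
tokenizer = MarianTokenizer.from_pretrained("fine_tuned_en_to_hi_model")

# Tokenize an English sentence and generate its Hindi translation.
inputs = tokenizer(["How are you?"], return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))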
README.md CHANGED
@@ -1,3 +1,14 @@
----
-license: mit
----
+---
+license: mit
+datasets:
+- open-thoughts/OpenThoughts-114k
+metrics:
+- code_eval
+base_model:
+- deepseek-ai/DeepSeek-R1
+new_version: deepseek-ai/DeepSeek-R1
+pipeline_tag: translation
+library_name: flair
+tags:
+- code
+---