Text Classification · Transformers · Safetensors · English · bert · fill-mask

Tags: BERT, bert-mini, transformer, pre-training, nlp, tiny-bert, edge-ai, low-resource, micro-nlp, quantized, general-purpose, offline-assistant, intent-detection, real-time, embedded-systems, command-classification, voice-ai, eco-ai, english, lightweight, mobile-nlp, ner, semantic-search, contextual-ai, smart-devices, wearable-ai, privacy-first
Update README.md
README.md CHANGED
@@ -132,8 +132,8 @@ from transformers import pipeline
 mlm_pipeline = pipeline("fill-mask", model="boltuix/bert-mini")
 
 # Test example
-result = mlm_pipeline("
-print(result[0]["sequence"]) # Example output: "
+result = mlm_pipeline("The train arrived at the [MASK] on time.")
+print(result[0]["sequence"])  # Example output: "The train arrived at the station on time."
 ```
 
 ## Quickstart: Text Classification
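For context on the updated example: the `fill-mask` pipeline returns a list of candidate dicts with keys `score`, `token`, `token_str`, and `sequence`, so the new `result[0]["sequence"]` line picks the top completion. The sketch below shows how the remaining candidates could be inspected; the token strings and scores in the comment are illustrative, not measured output.

```python
# Print the top-3 fill-mask candidates with their confidence scores.
# Illustrative output: "station: 0.41", "platform: 0.12", "stop: 0.07"
for candidate in result[:3]:
    print(f'{candidate["token_str"]}: {candidate["score"]:.3f}')
```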
@@ -319,73 +319,76 @@ To adapt `bert-mini` for custom tasks (e.g., specific IoT commands):
 1. **Prepare Dataset**: Collect labeled data (e.g., commands with intents or masked sentences).
 2. **Fine-Tune with Hugging Face**:
 ```python
-[previous fine-tuning example: 67 lines removed; their content is not shown in this view]
+# Install the datasets library
+!pip install datasets
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
+
+# Prepare sample dataset
+data = {
+    "text": [
+        "Turn on the fan",
+        "Switch off the light",
+        "Invalid command",
+        "Activate the air conditioner",
+        "Turn off the heater",
+        "Gibberish input"
+    ],
+    "label": [1, 1, 0, 1, 1, 0]  # 1 for valid IoT commands, 0 for invalid
+}
+df = pd.DataFrame(data)
+dataset = Dataset.from_pandas(df)
+
+# Load tokenizer and model
+model_name = "boltuix/bert-mini"
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+# Tokenize dataset
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
+
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir="./bert_mini_results",
+    num_train_epochs=5,
+    per_device_train_batch_size=2,
+    logging_dir="./bert_mini_logs",
+    logging_steps=10,
+    save_steps=100,
+    # Changed evaluation_strategy to eval_strategy
+    eval_strategy="no",  # Use 'no', 'steps', or 'epoch'
+    learning_rate=3e-5,
+)
+
+# Initialize Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,
+)
+
+# Fine-tune
+trainer.train()
+
+# Save model
+model.save_pretrained("./fine_tuned_bert_mini")
+tokenizer.save_pretrained("./fine_tuned_bert_mini")
+
+# Example inference
+text = "Turn on the light"
+inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
+model.eval()
+with torch.no_grad():
+    outputs = model(**inputs)
+logits = outputs.logits
+predicted_class = torch.argmax(logits, dim=1).item()
+print(f"Predicted class for '{text}': {'Valid IoT Command' if predicted_class == 1 else 'Invalid Command'}")
 ```
 3. **Deploy**: Export to ONNX or TensorFlow Lite for edge devices.
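The new step 3 mentions ONNX or TensorFlow Lite export, but the diff does not show the export itself. Below is a minimal ONNX sketch using `torch.onnx.export`, assuming the `./fine_tuned_bert_mini` directory produced by step 2; the output file name, opset version, and dynamic-axis names are illustrative choices, not part of the README.

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned classifier saved in step 2
model_dir = "./fine_tuned_bert_mini"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()

# A dummy input fixes the traced signature (same max_length as training)
dummy = tokenizer(
    "Turn on the light",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=64,
)

# Export to ONNX; batch size is left dynamic for edge runtimes
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"]),
    "bert_mini_intent.onnx",  # illustrative file name
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch"},
        "attention_mask": {0: "batch"},
        "logits": {0: "batch"},
    },
    opset_version=14,
)
```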