boltuix committed
Commit 888ffec · verified · 1 Parent(s): 60deff5

Update README.md

Files changed (1):
  1. README.md +72 -69
README.md CHANGED
@@ -132,8 +132,8 @@ from transformers import pipeline
 mlm_pipeline = pipeline("fill-mask", model="boltuix/bert-mini")
 
 # Test example
-result = mlm_pipeline("She wore a beautiful [MASK] to the party.")
-print(result[0]["sequence"])  # Example output: "She wore a beautiful dress to the party."
+result = mlm_pipeline("The train arrived at the [MASK] on time.")
+print(result[0]["sequence"])  # Example output: "The train arrived at the station on time."
 ```
 
 ## Quickstart: Text Classification
@@ -319,73 +319,76 @@ To adapt `bert-mini` for custom tasks (e.g., specific IoT commands):
 1. **Prepare Dataset**: Collect labeled data (e.g., commands with intents or masked sentences).
 2. **Fine-Tune with Hugging Face**:
 ```python
-import torch
-from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
-from datasets import Dataset
-import pandas as pd
-
-# Prepare sample dataset
-data = {
-    "text": [
-        "Turn on the fan",
-        "Switch off the light",
-        "Invalid command",
-        "Activate the air conditioner",
-        "Turn off the heater",
-        "Gibberish input"
-    ],
-    "label": [1, 1, 0, 1, 1, 0]  # 1 for valid IoT commands, 0 for invalid
-}
-df = pd.DataFrame(data)
-dataset = Dataset.from_pandas(df)
-
-# Load tokenizer and model
-model_name = "boltuix/bert-mini"
-tokenizer = BertTokenizer.from_pretrained(model_name)
-model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
-
-# Tokenize dataset
-def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
-
-tokenized_dataset = dataset.map(tokenize_function, batched=True)
-tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
-
-# Define training arguments
-training_args = TrainingArguments(
-    output_dir="./bert_mini_results",
-    num_train_epochs=5,
-    per_device_train_batch_size=2,
-    logging_dir="./bert_mini_logs",
-    logging_steps=10,
-    save_steps=100,
-    evaluation_strategy="no",
-    learning_rate=3e-5,
-)
-
-# Initialize Trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_dataset,
-)
-
-# Fine-tune
-trainer.train()
-
-# Save model
-model.save_pretrained("./fine_tuned_bert_mini")
-tokenizer.save_pretrained("./fine_tuned_bert_mini")
-
-# Example inference
-text = "Turn on the light"
-inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
-model.eval()
-with torch.no_grad():
-    outputs = model(**inputs)
-    logits = outputs.logits
-    predicted_class = torch.argmax(logits, dim=1).item()
-print(f"Predicted class for '{text}': {'Valid IoT Command' if predicted_class == 1 else 'Invalid Command'}")
+# Install the datasets library
+!pip install datasets
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
+
+# Prepare sample dataset
+data = {
+    "text": [
+        "Turn on the fan",
+        "Switch off the light",
+        "Invalid command",
+        "Activate the air conditioner",
+        "Turn off the heater",
+        "Gibberish input"
+    ],
+    "label": [1, 1, 0, 1, 1, 0]  # 1 for valid IoT commands, 0 for invalid
+}
+df = pd.DataFrame(data)
+dataset = Dataset.from_pandas(df)
+
+# Load tokenizer and model
+model_name = "boltuix/bert-mini"
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+# Tokenize dataset
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
+
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir="./bert_mini_results",
+    num_train_epochs=5,
+    per_device_train_batch_size=2,
+    logging_dir="./bert_mini_logs",
+    logging_steps=10,
+    save_steps=100,
+    # Changed evaluation_strategy to eval_strategy
+    eval_strategy="no",  # Use 'no', 'steps', or 'epoch'
+    learning_rate=3e-5,
+)
+
+# Initialize Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,
+)
+
+# Fine-tune
+trainer.train()
+
+# Save model
+model.save_pretrained("./fine_tuned_bert_mini")
+tokenizer.save_pretrained("./fine_tuned_bert_mini")
+
+# Example inference
+text = "Turn on the light"
+inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
+model.eval()
+with torch.no_grad():
+    outputs = model(**inputs)
+    logits = outputs.logits
+    predicted_class = torch.argmax(logits, dim=1).item()
+print(f"Predicted class for '{text}': {'Valid IoT Command' if predicted_class == 1 else 'Invalid Command'}")
 ```
 3. **Deploy**: Export to ONNX or TensorFlow Lite for edge devices.
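
Step 3 of the fine-tuning guide names ONNX export but includes no code for it. Below is a minimal sketch of that step, assuming Hugging Face Optimum (`pip install optimum[onnxruntime]`) as the export tool — the README does not specify one — and reusing the `./fine_tuned_bert_mini` directory saved in step 2; the `./bert_mini_onnx` output path is illustrative.

```python
# Sketch: export the fine-tuned classifier to ONNX via Hugging Face Optimum.
# Assumes: pip install optimum[onnxruntime], and "./fine_tuned_bert_mini"
# was produced by the fine-tuning script above. Output path is illustrative.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import BertTokenizer

model_dir = "./fine_tuned_bert_mini"

# export=True converts the saved PyTorch weights to ONNX at load time
ort_model = ORTModelForSequenceClassification.from_pretrained(model_dir, export=True)
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Save model.onnx plus tokenizer files for edge deployment
ort_model.save_pretrained("./bert_mini_onnx")
tokenizer.save_pretrained("./bert_mini_onnx")

# Sanity check: the ORT model exposes the same call interface as the original
inputs = tokenizer("Turn on the light", return_tensors="pt")
logits = ort_model(**inputs).logits
print(logits.argmax(dim=-1).item())  # 1 = valid IoT command, 0 = invalid
```

The TensorFlow Lite route mentioned in step 3 is a separate path: it requires converting the checkpoint to TensorFlow first and then running it through TensorFlow's own converter, which Optimum's ONNX exporter does not cover.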