waiv
/

FinVoc2Vec

@@ -53,11 +53,52 @@ We introduce FinVoc2Vec, a vocal tone classifier designed for real-world corpora
 In the first stage, we apply a self-supervised pre-training procedure that allows the base model to adapt to the acoustic characteristics of disclosure environments using a sample of 500,000 unlabeled sentences of conference call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. We construct a speech corpus containing
 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with perceived vocal tone — positive, negative, or neutral.
-## Example Usage
 ```python
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
 from datasets import load_dataset
 from dataclasses import dataclass
@@ -65,6 +106,8 @@ from typing import Dict, List, Optional, Union
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
 import torchaudio
 @dataclass
 class DataCollatorWithPadding:
@@ -120,7 +163,7 @@ def preprocess_audio(batch: Dict,
     return result
 # load model
-model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
 # load feature extractor
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
@@ -151,10 +194,14 @@ with torch.no_grad():
     for batch in data_loader:
         attention_mask, inputs = batch['attention_mask'], batch['input_values']
         model_output = model(inputs, attention_mask=attention_mask)
-        logits = model_output['logits'].to(torch.float32)
-        probs = F.softmax(logits, dim=1).numpy()
         label_to_id = model.config.label2id
         dict_probs = {f'prob_negative': probs[:, label_to_id['negative']],

 In the first stage, we apply a self-supervised pre-training procedure that allows the base model to adapt to the acoustic characteristics of disclosure environments using a sample of 500,000 unlabeled sentences of conference call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. We construct a speech corpus containing
 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with perceived vocal tone — positive, negative, or neutral.
+## Example using a demo dataset
+```python
+import torch
+from datasets import load_dataset
+from transformers import Wav2Vec2FeatureExtractor, AutoModel
+import numpy as np
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# load model and feature extractor
+model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True).to(device)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
+# load dataset
+demo_dataset = load_dataset("waiv/FinVoc2Vec_demo")
+arrays = [demo['audio']['array'] for demo in demo_dataset['test']]
+# extract features
+features = feature_extractor(
+    arrays,
+    sampling_rate=feature_extractor.sampling_rate,
+    padding=True,
+    truncation=False)
+# convert to tensor
+inputs = torch.tensor(np.array(features['input_values']), dtype=torch.float32).to(device)
+attention_mask = torch.tensor(np.array(features['attention_mask']), dtype=torch.long).to(device)
+# apply model
+prob_dict = {}
+with torch.no_grad():
+    model_output = model(inputs, attention_mask=attention_mask)
+    logits = model_output['logits'].to(torch.float32).to('cpu')
+    probs = torch.nn.functional.softmax(logits, dim=1).numpy()
+    label_to_id = model.config.label2id
+    for i, id in enumerate(demo_dataset['test']['id']):
+        prob_dict[id] = {'prob_negative': probs[i, label_to_id['negative']],
+                         'prob_neutral': probs[i, label_to_id['neutral']],
+                         'prob_positive': probs[i, label_to_id['positive']]}
+```
+## Example using audio files
 ```python
 import torch
 from torch.utils.data import DataLoader
 from datasets import load_dataset
 from dataclasses import dataclass
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
 import torchaudio
+device = "cuda" if torch.cuda.is_available() else "cpu"
 @dataclass
 class DataCollatorWithPadding:
     return result
 # load model
+model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True).to(device)
 # load feature extractor
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
     for batch in data_loader:
         attention_mask, inputs = batch['attention_mask'], batch['input_values']
+        inputs.to(device)
+        attention_mask.to(device)
         model_output = model(inputs, attention_mask=attention_mask)
+        logits = model_output['logits'].to(torch.float32).to('cpu')
+        probs = torch.nn.functional.softmax(logits, dim=1).numpy()
         label_to_id = model.config.label2id
         dict_probs = {f'prob_negative': probs[:, label_to_id['negative']],