waiv committed
Commit 460e2b9 · verified · 1 Parent(s): d30fe13

Update README, add ds code

Files changed (1): README.md (+52 -5)
README.md CHANGED
@@ -53,11 +53,52 @@ We introduce FinVoc2Vec, a vocal tone classifier designed for real-world corpora
 In the first stage, we apply a self-supervised pre-training procedure that allows the base model to adapt to the acoustic characteristics of disclosure environments using a sample of 500,000 unlabeled sentences of conference call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. We construct a speech corpus containing
 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with perceived vocal tone — positive, negative, or neutral.
 
-## Example Usage
+## Example using a demo dataset
+```python
+import torch
+from datasets import load_dataset
+from transformers import Wav2Vec2FeatureExtractor, AutoModel
+import numpy as np
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load model and feature extractor
+model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True).to(device)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
+
+# load dataset
+demo_dataset = load_dataset("waiv/FinVoc2Vec_demo")
+
+arrays = [demo['audio']['array'] for demo in demo_dataset['test']]
+
+# extract features
+features = feature_extractor(
+    arrays,
+    sampling_rate=feature_extractor.sampling_rate,
+    padding=True,
+    truncation=False)
+
+# convert to tensor
+inputs = torch.tensor(np.array(features['input_values']), dtype=torch.float32).to(device)
+attention_mask = torch.tensor(np.array(features['attention_mask']), dtype=torch.long).to(device)
+
+# apply model
+prob_dict = {}
+with torch.no_grad():
+    model_output = model(inputs, attention_mask=attention_mask)
+    logits = model_output['logits'].to(torch.float32).to('cpu')
+    probs = torch.nn.functional.softmax(logits, dim=1).numpy()
+
+label_to_id = model.config.label2id
+for i, id in enumerate(demo_dataset['test']['id']):
+    prob_dict[id] = {'prob_negative': probs[i, label_to_id['negative']],
+                     'prob_neutral': probs[i, label_to_id['neutral']],
+                     'prob_positive': probs[i, label_to_id['positive']]}
+```
+
+## Example using audio files
 ```python
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
 from datasets import load_dataset
 from dataclasses import dataclass
@@ -65,6 +106,8 @@ from typing import Dict, List, Optional, Union
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
 import torchaudio
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 @dataclass
 class DataCollatorWithPadding:
 
@@ -120,7 +163,7 @@ def preprocess_audio(batch: Dict,
     return result
 
 # load model
-model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
+model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True).to(device)
 
 # load feature extractor
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")
@@ -151,10 +194,14 @@ with torch.no_grad():
     for batch in data_loader:
 
         attention_mask, inputs = batch['attention_mask'], batch['input_values']
+
+        inputs = inputs.to(device)
+        attention_mask = attention_mask.to(device)
+
         model_output = model(inputs, attention_mask=attention_mask)
 
-        logits = model_output['logits'].to(torch.float32)
-        probs = F.softmax(logits, dim=1).numpy()
+        logits = model_output['logits'].to(torch.float32).to('cpu')
+        probs = torch.nn.functional.softmax(logits, dim=1).numpy()
 
         label_to_id = model.config.label2id
         dict_probs = {'prob_negative': probs[:, label_to_id['negative']],
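The hunks above elide the body of `DataCollatorWithPadding`. For readers following along, a minimal collator in the spirit of the Hugging Face wav2vec2 examples could look like the sketch below; the field names and the use of `Wav2Vec2Processor.pad` are assumptions, not the committed implementation.

```python
import torch
from dataclasses import dataclass
from typing import Dict, List, Union

from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorWithPadding:
    """Hypothetical sketch: pads a batch of raw waveforms to equal length."""

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, List[float]]]) -> Dict[str, torch.Tensor]:
        # wrap each raw waveform so the processor can pad the whole batch
        input_features = [{"input_values": f["input_values"]} for f in features]
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_attention_mask=True,  # the inference loop reads batch['attention_mask']
            return_tensors="pt",
        )
        return batch
```

A `DataLoader` would then consume it via `collate_fn=DataCollatorWithPadding(processor)`, matching the `data_loader` iterated over in the inference loop.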
 
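The diff likewise elides the body of `preprocess_audio`. As a rough, hypothetical sketch of the loading step it presumably performs: wav2vec2-style feature extractors expect mono 16 kHz input, so a file-based pipeline typically loads each clip with `torchaudio` and resamples it to `feature_extractor.sampling_rate` before feature extraction. The helper name and return layout below are assumptions.

```python
import torchaudio


def load_and_resample(path: str, target_sr: int = 16_000):
    """Hypothetical helper: load an audio file and resample it to the model's rate."""
    waveform, sample_rate = torchaudio.load(path)  # (channels, time)
    waveform = waveform.mean(dim=0)                # down-mix to mono
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
    return waveform.numpy()
```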
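Both snippets stop at class probabilities. If a single predicted label per clip is wanted, the usual `transformers` config mapping can be applied in reverse; `id2label` is the standard counterpart of the `label2id` mapping used above, assuming the model config populates it.

```python
# continuing from either example: probs has shape (batch_size, num_labels)
pred_ids = probs.argmax(axis=1)
pred_labels = [model.config.id2label[int(i)] for i in pred_ids]
print(pred_labels)  # e.g. ['neutral', 'negative', ...]
```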