Ansh9728
/

distilhubert-finetuned-gtzan

+---
+license: apache-2.0
+datasets:
+- marsyas/gtzan
+language:
+- en
+metrics:
+- accuracy
+base_model:
+- ntu-spml/distilhubert
+pipeline_tag: audio-classification
+---
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+DistilHuBERT by NTU Speech Processing & Machine Learning Lab
+The base model was pretrained on speech audio sampled at 16 kHz. When using the model, make sure that your audio input is also sampled at 16 kHz.
+Note: The base model does not have a tokenizer, as it was pretrained on audio alone. In order to use it for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out this blog for a more detailed explanation of how to fine-tune the model.
+### Model Architecture and Objective
+```
+HubertForSequenceClassification(
+  (hubert): HubertModel(
+    (feature_extractor): HubertFeatureEncoder(
+      (conv_layers): ModuleList(
+        (0): HubertGroupNormConvLayer(
+          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
+          (activation): GELUActivation()
+          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
+        )
+        (1-4): 4 x HubertNoLayerNormConvLayer(
+          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
+          (activation): GELUActivation()
+        )
+        (5-6): 2 x HubertNoLayerNormConvLayer(
+          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
+          (activation): GELUActivation()
+        )
+      )
+    )
+    (feature_projection): HubertFeatureProjection(
+      (projection): Linear(in_features=512, out_features=768, bias=True)
+      (dropout): Dropout(p=0.0, inplace=False)
+    )
+    (encoder): HubertEncoder(
+      (pos_conv_embed): HubertPositionalConvEmbedding(
+        (conv): ParametrizedConv1d(
+          768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (padding): HubertSamePadLayer()
+        (activation): GELUActivation()
+      )
+      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (dropout): Dropout(p=0.1, inplace=False)
+      (layers): ModuleList(
+        (0-1): 2 x HubertEncoderLayer(
+          (attention): HubertSdpaAttention(
+            (k_proj): Linear(in_features=768, out_features=768, bias=True)
+            (v_proj): Linear(in_features=768, out_features=768, bias=True)
+            (q_proj): Linear(in_features=768, out_features=768, bias=True)
+            (out_proj): Linear(in_features=768, out_features=768, bias=True)
+          )
+          (dropout): Dropout(p=0.1, inplace=False)
+          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (feed_forward): HubertFeedForward(
+            (intermediate_dropout): Dropout(p=0.1, inplace=False)
+            (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
+            (intermediate_act_fn): GELUActivation()
+            (output_dense): Linear(in_features=3072, out_features=768, bias=True)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+          )
+          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+        )
+      )
+    )
+  )
+  (projector): Linear(in_features=768, out_features=256, bias=True)
+  (classifier): Linear(in_features=256, out_features=10, bias=True)
+)
+```