Update whisper_jax/layers.py
whisper_jax/layers.py  CHANGED  (+1, -66)
@@ -56,16 +56,6 @@ NdInitializer = Callable[[PRNGKey, Shape, DType, InitializerAxis, InitializerAxi
 default_embed_init = nn.initializers.variance_scaling(1.0, "fan_in", "normal", out_axis=0)
 
 
-def nd_dense_init(scale, mode, distribution):
-    """Initializer with in_axis, out_axis set at call time."""
-
-    def init_fn(key, shape, dtype, in_axis, out_axis):
-        fn = variance_scaling(scale, mode, distribution, in_axis, out_axis)
-        return fn(key, shape, dtype)
-
-    return init_fn
-
-
 def dot_product_attention(
     query: Array,
     key: Array,
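Note: the deleted `nd_dense_init` helper only curried a variance-scaling initializer so that `in_axis`/`out_axis` could be supplied at call time. A minimal sketch of the equivalent direct use of `jax.nn.initializers.variance_scaling`; the scale, mode, distribution, and shape below are illustrative, not taken from this repository:

    import jax
    import jax.numpy as jnp
    from jax.nn import initializers

    # Build the initializer once the axes are known, instead of currying them.
    def init_kernel(key, shape, dtype=jnp.float32, in_axis=0, out_axis=-1):
        fn = initializers.variance_scaling(
            1.0, "fan_in", "truncated_normal", in_axis=in_axis, out_axis=out_axis
        )
        return fn(key, shape, dtype)

    kernel = init_kernel(jax.random.PRNGKey(0), (512, 8, 64))
    print(kernel.shape)  # (512, 8, 64)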
@@ -78,11 +68,9 @@ def dot_product_attention(
     float32_logits: bool = False,
 ):
     """Computes dot-product attention given query, key, and value.
-
     This is the core function for applying attention based on
     https://arxiv.org/abs/1706.03762. It calculates the attention weights given
     query and key and combines the values using the attention weights.
-
     Args:
       query: queries for calculating attention with shape of `[batch, q_length,
         num_heads, qk_depth_per_head]`.
@@ -99,7 +87,6 @@ def dot_product_attention(
       dtype: the dtype of the computation (default: float32)
       float32_logits: bool, if True then compute logits in float32 to avoid
         numerical issues with bfloat16.
-
     Returns:
       Output of shape `[batch, length, num_heads, v_depth_per_head]`.
     """
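Note: a quick shape check of the documented contract, assuming the first three positional arguments are `query`, `key`, `value` and that dropout stays disabled by default; the sizes are made up for illustration:

    import jax
    import jax.numpy as jnp

    batch, q_len, kv_len, heads, depth = 2, 5, 7, 4, 16
    rng = jax.random.PRNGKey(0)
    query = jax.random.normal(rng, (batch, q_len, heads, depth))
    key = jax.random.normal(rng, (batch, kv_len, heads, depth))
    value = jax.random.normal(rng, (batch, kv_len, heads, depth))

    # Expected output shape: [batch, q_length, num_heads, v_depth_per_head].
    out = dot_product_attention(query, key, value)
    assert out.shape == (batch, q_len, heads, depth)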
@@ -145,7 +132,6 @@ dynamic_vector_slice_in_dim = jax.vmap(lax.dynamic_slice_in_dim, in_axes=(None,
 
 class MultiHeadDotProductAttention(nn.Module):
     """Multi-head dot-product attention.
-
     Attributes:
       num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
         should be divisible by the number of heads.
@@ -176,22 +162,18 @@ class MultiHeadDotProductAttention(nn.Module):
         deterministic: bool = False,
     ) -> Array:
         """Applies multi-head dot product attention on the input data.
-
         Projects the inputs into multi-headed query, key, and value vectors,
         applies dot-product attention and project the results to an output vector.
-
         There are two modes: decoding and non-decoding (e.g., training). The mode is
         determined by `decode` argument. For decoding, this method is called twice,
         first to initialize the cache and then for an actual decoding process. The
         two calls are differentiated by the presence of 'cached_key' in the variable
         dict. In the cache initialization stage, the cache variables are initialized
         as zeros and will be filled in the subsequent decoding process.
-
         In the cache initialization call, `inputs_q` has a shape [batch, length,
         q_features] and `inputs_kv`: [batch, length, kv_features]. During the
         incremental decoding stage, query, key and value all have the shape [batch,
         1, qkv_features] corresponding to a single step.
-
         Args:
           inputs_q: input queries of shape `[batch, q_length, q_features]`.
           inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
@@ -199,7 +181,6 @@ class MultiHeadDotProductAttention(nn.Module):
           bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
           decode: Whether to prepare and use an autoregressive cache.
           deterministic: Disables dropout if set to True.
-
         Returns:
           output of shape `[batch, length, q_features]`.
         """
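Note: the two-call decode protocol described above follows the usual Flax caching pattern: one call to create the zero-filled cache variables, then per-token calls with the cache collection marked mutable. A hedged sketch; the constructor arguments and exact cache variable names are assumptions, not taken from this diff:

    import jax
    import jax.numpy as jnp

    # Hypothetical attribute values; the module may expose more options.
    attn = MultiHeadDotProductAttention(num_heads=4, head_dim=16)

    x = jnp.ones((1, 10, 64))  # [batch, length, q_features]

    # Call 1: initialization. decode=True creates 'cached_key' (and related
    # cache variables) as zeros in the 'cache' collection.
    variables = attn.init(jax.random.PRNGKey(0), x, x, decode=True)

    # Call 2: a single decoding step of shape [batch, 1, qkv_features]; the
    # cache must be mutable so this position's key/value can be written back.
    step = jnp.ones((1, 1, 64))
    out, updated = attn.apply(variables, step, step, decode=True, mutable=["cache"])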
@@ -360,7 +341,6 @@ def _canonicalize_tuple(x):
 # ------------------------------------------------------------------------------
 class DenseGeneral(nn.Module):
     """A linear transformation (without bias) with flexible axes.
-
     Attributes:
       features: tuple with numbers of output features.
       axis: tuple with axes to apply the transformation on.
@@ -380,10 +360,8 @@ class DenseGeneral(nn.Module):
     @nn.compact
     def __call__(self, inputs: Array) -> Array:
         """Applies a linear transformation to the inputs along multiple dimensions.
-
         Args:
           inputs: The nd-array to be transformed.
-
         Returns:
           The transformed input.
         """
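Note: the transformation described here amounts to an einsum that contracts the selected input axes against a kernel and emits the requested feature axes. A self-contained sketch of that contraction with plain arrays (not this module's code; the sizes are illustrative):

    import jax
    import jax.numpy as jnp

    # Contract the trailing (heads, head_dim) axes into a single feature axis,
    # the typical "output projection" shape used after attention.
    batch, length, heads, head_dim, features = 2, 10, 4, 16, 64
    x = jnp.ones((batch, length, heads, head_dim))
    kernel = jax.random.normal(jax.random.PRNGKey(0), (heads, head_dim, features))

    y = jnp.einsum("blhd,hdf->blf", x, kernel)
    assert y.shape == (batch, length, features)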
@@ -432,7 +410,6 @@ def _convert_to_activation_function(fn_or_string: Union[str, Callable]) -> Calla
 
 class MlpBlock(nn.Module):
     """Transformer MLP / feed-forward block.
-
     Attributes:
       intermediate_dim: Shared dimension of hidden layers.
       activations: Type of activations for each layer. Each element is either
@@ -482,7 +459,6 @@ class MlpBlock(nn.Module):
 
 class Embed(nn.Module):
     """A parameterized function from integers [0, n) to d-dimensional vectors.
-
     Attributes:
       num_embeddings: number of embeddings.
       features: number of feature dimensions for each embedding.
@@ -513,10 +489,8 @@ class Embed(nn.Module):
 
     def __call__(self, inputs: Array) -> Array:
         """Embeds the inputs along the last dimension.
-
         Args:
           inputs: input data, all dimensions are considered batch dimensions.
-
         Returns:
           Output which is embedded input data. The output shape follows the input,
           with an additional `features` dimension appended.
@@ -536,11 +510,9 @@ class Embed(nn.Module):
 
     def attend(self, query: Array) -> Array:
         """Attend over the embedding using a query array.
-
         Args:
           query: array with last dimension equal the feature depth `features` of the
             embedding.
-
         Returns:
           An array with final dim `num_embeddings` corresponding to the batched
           inner-product of the array of query vectors against each embedding.
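Note: `attend` is typically used to tie the output projection to the input embedding: decoder hidden states are scored against every embedding row by inner product. A sketch of the equivalent computation with plain arrays (not the module itself):

    import jax
    import jax.numpy as jnp

    vocab, features, batch, length = 100, 32, 2, 5
    embedding = jax.random.normal(jax.random.PRNGKey(0), (vocab, features))

    # __call__: integer ids -> vectors; output gains a trailing `features` axis.
    ids = jnp.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
    embedded = embedding[ids]                  # [batch, length, features]

    # attend: query vectors -> scores; final dim becomes `num_embeddings`.
    hidden = jax.random.normal(jax.random.PRNGKey(1), (batch, length, features))
    logits = jnp.dot(hidden, embedding.T)      # [batch, length, vocab]
    assert logits.shape == (batch, length, vocab)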
@@ -553,7 +525,6 @@ class Embed(nn.Module):
 
 class RelativePositionBiases(nn.Module):
     """Adds T5-style relative positional embeddings to the attention logits.
-
     Attributes:
       num_buckets: Number of buckets to bucket distances between key and query
         positions into.
@@ -574,7 +545,6 @@ class RelativePositionBiases(nn.Module):
     @staticmethod
     def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
         """Translate relative position to a bucket number for relative attention.
-
         The relative position is defined as memory_position - query_position, i.e.
         the distance in tokens from the attending position to the attended-to
         position. If bidirectional=False, then positive relative positions are
@@ -585,13 +555,11 @@ class RelativePositionBiases(nn.Module):
         positions <=-max_distance map to the same bucket. This should allow for
         more graceful generalization to longer sequences than the model has been
         trained on.
-
         Args:
           relative_position: an int32 array
           bidirectional: a boolean - whether the attention is bidirectional
           num_buckets: an integer
           max_distance: an integer
-
         Returns:
           a Tensor with the same shape as relative_position, containing int32
             values in the range [0, num_buckets)
@@ -619,13 +587,11 @@ class RelativePositionBiases(nn.Module):
     @nn.compact
     def __call__(self, qlen, klen, bidirectional=True):
         """Produce relative position embedding attention biases.
-
         Args:
           qlen: attention query length.
           klen: attention key length.
           bidirectional: whether to allow positive memory-query relative position
             embeddings.
-
         Returns:
           output: `(1, len, q_len, k_len)` attention bias
         """
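Note: a worked sketch of the T5-style bucketing the docstring describes: small offsets keep their own bucket, larger offsets share logarithmically spaced buckets up to `max_distance`. This is a standalone NumPy illustration of the scheme, not this module's code:

    import numpy as np

    def t5_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        ret = 0
        n = -relative_position
        if bidirectional:
            num_buckets //= 2
            ret += (n < 0).astype(np.int32) * num_buckets
            n = np.abs(n)
        else:
            n = np.maximum(n, 0)
        max_exact = num_buckets // 2
        is_small = n < max_exact
        # Larger distances fall into logarithmically sized buckets, capped at
        # num_buckets - 1 so positions beyond max_distance share one bucket.
        val_if_large = max_exact + (
            np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps)
            / np.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).astype(np.int32)
        val_if_large = np.minimum(val_if_large, num_buckets - 1)
        return ret + np.where(is_small, n, val_if_large)

    rel = np.arange(8)[None, :] - np.arange(8)[:, None]  # memory_pos - query_pos
    buckets = t5_bucket(rel)  # same shape, int32 values in [0, num_buckets)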
@@ -749,11 +715,9 @@ def make_attention_mask(
     dtype: DType = jnp.float32,
 ) -> Array:
     """Mask-making helper for attention weights.
-
     In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`, the
     attention weights will be `[batch, heads, len_q, len_kv]` and this
     function will produce `[batch, 1, len_q, len_kv]`.
-
     Args:
       query_input: a batched, flat input of query_length size
       key_input: a batched, flat input of key_length size
@@ -761,7 +725,6 @@ def make_attention_mask(
      extra_batch_dims: number of extra batch dims to add singleton axes for, none
        by default
      dtype: mask return dtype
-
     Returns:
       A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
     """
@@ -781,21 +744,17 @@ def make_attention_mask(
 
 def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.float32) -> Array:
     """Make a causal mask for self-attention.
-
     In case of 1d inputs (i.e., `[batch, len]`, the self-attention weights
     will be `[batch, heads, len, len]` and this function will produce a
     causal mask of shape `[batch, 1, len, len]`.
-
     Note that a causal mask does not depend on the values of x; it only depends on
     the shape. If x has padding elements, they will not be treated in a special
     manner.
-
     Args:
       x: input array of shape `[batch, len]`
       extra_batch_dims: number of batch dims to add singleton axes for, none by
        default
       dtype: mask return dtype
-
     Returns:
       A `[batch, 1, len, len]` shaped causal mask for 1d attention.
     """
@@ -805,11 +764,9 @@ def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.flo
 
 def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
     """Combine attention masks.
-
     Args:
       *masks: set of attention mask arguments to combine, some can be None.
       dtype: final mask dtype
-
     Returns:
       Combined mask, reduced by logical and, returns None if no masks given.
     """
@@ -827,10 +784,8 @@ def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
 
 def combine_biases(*masks: Optional[Array]):
     """Combine attention biases.
-
     Args:
       *masks: set of attention bias arguments to combine, some can be None.
-
     Returns:
       Combined mask, reduced by summation, returns None if no masks given.
     """
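Note: together these helpers build the standard masks: a causal mask from the token array's shape, a padding mask from the nonzero positions, combined by logical AND. A usage sketch assuming the signatures shown above (and assuming `make_attention_mask` defaults to an elementwise-multiply pairwise function, as in the Flax original):

    import jax.numpy as jnp

    tokens = jnp.array([[6, 7, 8, 3, 4, 5, 0]])              # 0 marks padding

    causal = make_causal_mask(tokens)                         # [batch, 1, len, len]
    padding = make_attention_mask(tokens > 0, tokens > 0)     # [batch, 1, len, len]

    mask = combine_masks(causal, padding)                     # logical AND of both
    assert mask.shape == (1, 1, 7, 7)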
@@ -853,40 +808,30 @@ def make_decoder_mask(
     decoder_segment_ids: Optional[Array] = None,
 ) -> Array:
     """Compute the self-attention mask for a decoder.
-
     Decoder mask is formed by combining a causal mask, a padding mask and an
     optional packing mask. If decoder_causal_attention is passed, it makes the
     masking non-causal for positions that have value of 1.
-
     A prefix LM is applied to a dataset which has a notion of "inputs" and
     "targets", e.g., a machine translation task. The inputs and targets are
     concatenated to form a new target. `decoder_target_tokens` is the concatenated
     decoder output tokens.
-
     The "inputs" portion of the concatenated sequence can attend to other "inputs"
     tokens even for those at a later time steps. In order to control this
     behavior, `decoder_causal_attention` is necessary. This is a binary mask with
     a value of 1 indicating that the position belonged to "inputs" portion of the
     original dataset.
-
     Example:
-
       Suppose we have a dataset with two examples.
-
       ds = [{"inputs": [6, 7], "targets": [8]},
             {"inputs": [3, 4], "targets": [5]}]
-
       After the data preprocessing with packing, the two examples are packed into
       one example with the following three fields (some fields are skipped for
       simplicity).
-
         decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
         decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
         decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
-
       where each array has [batch, length] shape with batch size being 1. Then,
       this function computes the following mask.
-
       mask = [[[[1, 1, 0, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0, 0],
@@ -894,14 +839,11 @@ def make_decoder_mask(
                 [0, 0, 0, 1, 1, 0, 0],
                 [0, 0, 0, 1, 1, 1, 0],
                 [0, 0, 0, 0, 0, 0, 0]]]]
-
       mask[b, 1, :, :] represents the mask for the example `b` in the batch.
       Because mask is for a self-attention layer, the mask's shape is a square of
       shape [query length, key length].
-
       mask[b, 1, i, j] = 1 means that the query token at position i can attend to
       the key token at position j.
-
     Args:
       decoder_target_tokens: decoder output tokens. [batch, length]
       dtype: dtype of the output mask.
@@ -910,7 +852,6 @@ def make_decoder_mask(
         bidirectionally. [batch, length]
       decoder_segment_ids: decoder segmentation info for packed examples. [batch,
         length]
-
     Returns:
       the combined decoder mask.
     """
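Note: reproducing the docstring's packed prefix-LM example as a call, assuming the keyword names shown in the argument list above:

    import jax.numpy as jnp

    decoder_target_tokens = jnp.array([[6, 7, 8, 3, 4, 5, 0]])
    decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 0]])
    decoder_causal_attention = jnp.array([[1, 1, 0, 1, 1, 0, 0]])

    mask = make_decoder_mask(
        decoder_target_tokens=decoder_target_tokens,
        dtype=jnp.float32,
        decoder_causal_attention=decoder_causal_attention,
        decoder_segment_ids=decoder_segment_ids,
    )
    # mask[0, 0] should reproduce the 7x7 matrix shown in the docstring example.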
@@ -976,7 +917,6 @@ def _conv_dimension_numbers(input_shape):
 
 class _Conv(nn.Module):
     """Convolution Module wrapping `lax.conv_general_dilated[_local]`.
-
     Attributes:
       features: number of convolution filters.
       kernel_size: shape of the convolutional kernel. For 1D convolution,
@@ -1032,19 +972,16 @@ class _Conv(nn.Module):
     @property
     def shared_weights(self) -> bool:  # type: ignore
         """Defines whether weights are shared or not between different pixels.
-
         Returns:
           `True` to use shared weights in convolution (regular convolution).
          `False` to use different weights at different pixels, a.k.a.
          "locally connected layer", "unshared convolution", or "local convolution".
-
         """
        ...
 
     @nn.compact
     def __call__(self, inputs: Array) -> Array:
         """Applies a (potentially unshared) convolution to the inputs.
-
         Args:
           inputs: input data with dimensions (*batch_dims, spatial_dims...,
             features). This is the channels-last convention, i.e. NHWC for a 2d
@@ -1057,7 +994,6 @@ class _Conv(nn.Module):
            better performance than this default flattening approach. If the input
            lacks a batch dimension it will be added for the convolution and removed
            on return, an allowance made to enable writing single-example code.
-
         Returns:
           The convolved data.
         """
@@ -1214,7 +1150,6 @@ class _Conv(nn.Module):
 
 class Conv(_Conv):
     """Convolution Module wrapping `lax.conv_general_dilated`.
-
     Attributes:
       features: number of convolution filters.
       kernel_size: shape of the convolutional kernel. For 1D convolution,
@@ -1252,4 +1187,4 @@ class Conv(_Conv):
 
     @property
     def shared_weights(self) -> bool:
-        return True
+        return True
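Note: for the shared-weights case that `Conv` selects (`shared_weights` returning True), the behaviour is that of a regular channels-last convolution. A sketch using the upstream `flax.linen.Conv`, which wraps the same `lax.conv_general_dilated` primitive; the sizes are illustrative and this is not this module's code:

    import jax
    import jax.numpy as jnp
    from flax import linen as nn

    # 1D convolution, channels-last (NWC) input, SAME padding, stride 2.
    conv = nn.Conv(features=8, kernel_size=(3,), strides=(2,), padding="SAME")

    x = jnp.ones((1, 16, 4))                      # [batch, width, in_features]
    params = conv.init(jax.random.PRNGKey(0), x)
    y = conv.apply(params, x)
    assert y.shape == (1, 8, 8)                   # width 16 -> 8 with stride 2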