Hugo Flores Garcia committed · Commit 9496f0e
1 Parent(s): 308d855

sampling tricks!

Files changed:
- app.py (+49 -11)
- vampnet/modules/transformer.py (+109 -37)
app.py CHANGED

@@ -97,28 +97,35 @@ def _vamp(data, return_mask=False):
     mask = pmask.codebook_unmask(mask, ncc)
 
 
-    print(
+    print(data)
+    _top_p = data[top_p] if data[top_p] > 0 else None
     # save the mask as a txt file
     np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
 
+    _seed = data[seed] if data[seed] > 0 else None
     zv, mask_z = interface.coarse_vamp(
         z,
         mask=mask,
         sampling_steps=data[num_steps],
-
+        mask_temperature=data[masktemp]*10,
+        sampling_temperature=data[sampletemp],
         return_mask=True,
         typical_filtering=data[typical_filtering],
         typical_mass=data[typical_mass],
         typical_min_tokens=data[typical_min_tokens],
+        top_p=_top_p,
         gen_fn=interface.coarse.generate,
+        seed=_seed,
     )
 
     if use_coarse2fine:
         zv = interface.coarse_to_fine(
             zv,
-
+            mask_temperature=data[masktemp]*10,
+            sampling_temperature=data[sampletemp],
             mask=mask,
-            sampling_steps=data[num_steps]
+            sampling_steps=data[num_steps],
+            seed=_seed,
         )
 
     sig = interface.to_signal(zv).cpu()
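The handler above treats 0 as "off": `top_p` and `seed` widget values are coerced to `None` before they reach the sampler, and the `masktemp` slider is rescaled by 10, so the 0-10 slider maps onto a `mask_temperature` of 0-100 (UI default 1.5 -> 15, versus the function's own default of 20.5 seen below). A minimal sketch of the sentinel idiom, with a hypothetical `coerce_optional` helper that is not part of the commit:

    def coerce_optional(value, sentinel=0):
        # Widgets always return a number; treat the sentinel as "disabled".
        return value if value > sentinel else None

    # e.g. coerce_optional(0.9) -> 0.9, coerce_optional(0.0) -> None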
@@ -152,7 +159,9 @@ def save_vamp(data):
     sig_out.write(out_dir / "output.wav")
 
     _data = {
-        "
+        "masktemp": data[masktemp],
+        "sampletemp": data[sampletemp],
+        "top_p": data[top_p],
         "prefix_s": data[prefix_s],
         "suffix_s": data[suffix_s],
         "rand_mask_intensity": data[rand_mask_intensity],
@@ -163,6 +172,7 @@ def save_vamp(data):
         "n_conditioning_codebooks": data[n_conditioning_codebooks],
         "use_coarse2fine": data[use_coarse2fine],
         "stretch_factor": data[stretch_factor],
+        "seed": data[seed],
     }
 
     # save with yaml
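The `# save with yaml` comment marks where `_data` is presumably serialized next; a hedged sketch of that step (the filename is an assumption, not taken from this diff):

    import yaml

    with open(out_dir / "settings.yaml", "w") as f:
        yaml.dump(_data, f)  # every value in _data above is a plain scalar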
@@ -385,16 +395,28 @@ with gr.Blocks() as demo:
                 value=0.0
             )
 
-
-                label="temperature",
+            masktemp = gr.Slider(
+                label="mask temperature",
                 minimum=0.0,
                 maximum=10.0,
-                value=
+                value=1.5
             )
-
+            sampletemp = gr.Slider(
+                label="sample temperature",
+                minimum=0.1,
+                maximum=2.0,
+                value=1.0
+            )
+
 
 
         with gr.Accordion("sampling settings", open=False):
+            top_p = gr.Slider(
+                label="top p (0.0 = off)",
+                minimum=0.0,
+                maximum=1.0,
+                value=0.0
+            )
             typical_filtering = gr.Checkbox(
                 label="typical filtering ",
                 value=False
@@ -435,6 +457,18 @@ with gr.Blocks() as demo:
                 value=0.0
             )
 
+            use_new_trick = gr.Checkbox(
+                label="new trick",
+                value=False
+            )
+
+            seed = gr.Number(
+                label="seed (0 for random)",
+                value=0,
+                precision=0,
+            )
+
+
 
         # mask settings
         with gr.Column():
@@ -463,7 +497,9 @@ with gr.Blocks() as demo:
     _inputs = {
         input_audio,
         num_steps,
-
+        masktemp,
+        sampletemp,
+        top_p,
         prefix_s, suffix_s,
         rand_mask_intensity,
         periodic_p, periodic_w,
@@ -476,7 +512,9 @@ with gr.Blocks() as demo:
         typical_mass,
         typical_min_tokens,
         beat_mask_width,
-        beat_mask_downbeats
+        beat_mask_downbeats,
+        seed,
+        seed
     }
 
     # connect widgets
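Two notes on `_inputs`: it is a Python set, so the `seed` element added twice in the last hunk is deduplicated and harmless; and passing a set of components to a Gradio event is what makes the callbacks above receive a single dict indexed by component (hence `data[num_steps]`, `data[seed]`, and so on). A self-contained sketch of that pattern, with illustrative widget names:

    import gradio as gr

    with gr.Blocks() as demo:
        steps = gr.Slider(label="steps", minimum=1, maximum=64, value=24)
        seed = gr.Number(label="seed (0 for random)", value=0, precision=0)
        go = gr.Button("go")
        out = gr.Textbox()

        def handler(data):
            # `data` is keyed by the component objects themselves
            return f"steps={data[steps]}, seed={int(data[seed])}"

        go.click(handler, inputs={steps, seed}, outputs=out)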
vampnet/modules/transformer.py CHANGED

@@ -367,6 +367,15 @@ class TransformerLayer(nn.Module):
 
         return x, position_bias, encoder_decoder_position_bias
 
+def t_schedule(n_steps, max_temp=1.0, min_temp=0.0, k=1.0):
+    x = np.linspace(0, 1, n_steps)
+    a = (0.5 - min_temp) / (max_temp - min_temp)
+
+    x = (x * 12) - 6
+    x0 = np.log((1 / a - 1) + 1e-5) / k
+    y = (1 / (1 + np.exp(- k *(x-x0))))[::-1]
+
+    return y
 
 class TransformerStack(nn.Module):
     def __init__(
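The new `t_schedule` is a reversed sigmoid over the sampling steps: temperature starts near its ceiling and decays toward zero, so early steps sample freely and late steps become nearly greedy. Note that `max_temp` and `min_temp` enter only through the sigmoid's midpoint `x0`, not as a rescaling, so the returned values always lie strictly between 0 and 1. A quick check with the defaults:

    import numpy as np

    def t_schedule(n_steps, max_temp=1.0, min_temp=0.0, k=1.0):
        x = np.linspace(0, 1, n_steps)
        a = (0.5 - min_temp) / (max_temp - min_temp)
        x = (x * 12) - 6
        x0 = np.log((1 / a - 1) + 1e-5) / k
        return (1 / (1 + np.exp(-k * (x - x0))))[::-1]

    print(np.round(t_schedule(6), 3))
    # [0.998 0.973 0.769 0.231 0.027 0.002]  -- high to low across steps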
@@ -580,20 +589,20 @@ class VampNet(at.ml.BaseModel):
         time_steps: int = 300,
         sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
+        sampling_temperature: float = 1.0,
         mask: Optional[torch.Tensor] = None,
-
+        mask_temperature: float = 20.5,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
+        top_p=None,
         return_signal=True,
+        seed: int = None
     ):
+        if seed is not None:
+            at.util.seed(seed)
         logging.debug(f"beginning generation with {sampling_steps} steps")
 
-        #####################
-        # resolve temperature #
-        #####################
-
-        logging.debug(f"temperature: {temperature}")
 
 
         #####################
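`generate` is now seedable: a non-None `seed` is routed to `at.util.seed` before any sampling happens, making runs reproducible. As a hedged sketch (audiotools' helper seeds the standard RNGs; treat the exact behavior as an assumption, not the library's source):

    import random
    import numpy as np
    import torch

    def seed_everything(seed: int) -> None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)  # also seeds the CUDA RNGs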
@@ -641,13 +650,11 @@ class VampNet(at.ml.BaseModel):
         #################
         # begin sampling #
         #################
+        t_sched = t_schedule(sampling_steps, max_temp=sampling_temperature)
 
         for i in range(sampling_steps):
             logging.debug(f"step {i} of {sampling_steps}")
 
-            # our current temperature
-            logging.debug(f"temperature: {temperature}")
-
             # our current schedule step
             r = scalar_to_batch_tensor(
                 (i + 1) / sampling_steps,
@@ -664,39 +671,19 @@ class VampNet(at.ml.BaseModel):
             # NOTE: this collapses the codebook dimension into the sequence dimension
             logits = self.forward(latents, r)  # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
-
-            typical_filter(logits,
-                typical_mass=typical_mass,
-                typical_min_tokens=typical_min_tokens
-            )
-
+            b = logits.shape[0]
 
             logging.debug(f"permuted logits with shape: {logits.shape}")
 
+            sampled_z, selected_probs = sample_from_logits(
+                logits, sample=True, temperature=t_sched[i],
+                typical_filtering=typical_filtering, typical_mass=typical_mass,
+                typical_min_tokens=typical_min_tokens,
+                top_k=None, top_p=top_p, return_probs=True
+            )
 
-            # logits2probs
-            probs = torch.softmax(logits, dim=-1)
-            logging.debug(f"computed probs with shape: {probs.shape}")
-
-
-            # sample from logits with multinomial sampling
-            b = probs.shape[0]
-            probs = rearrange(probs, "b seq prob -> (b seq) prob")
-
-            sampled_z = torch.multinomial(probs, 1).squeeze(-1)
-
-            sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
-            probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
             logging.debug(f"sampled z with shape: {sampled_z.shape}")
 
-            # get the confidences: which tokens did we sample?
-            selected_probs = (
-                torch.take_along_dim(
-                    probs, sampled_z.long().unsqueeze(-1),
-                    dim=-1
-                ).squeeze(-1)
-            )
-
             # flatten z_masked and mask, so we can deal with the sampling logic
             # we'll unflatten them at the end of the loop for the next forward pass
             # remove conditioning codebooks, we'll add them back at the end
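The inline typical-filter/softmax/multinomial block is gone: one call to the new `sample_from_logits` (defined at the bottom of this file) now performs filtering, temperature scaling, sampling, and confidence extraction in one place. A hedged usage sketch on dummy logits, with the shapes the loop above uses (batch, sequence, vocabulary):

    import torch

    logits = torch.randn(2, 16, 1024)  # b, seq, prob
    tokens, confidences = sample_from_logits(
        logits, sample=True, temperature=0.8,
        top_k=None, top_p=0.9, return_probs=True,
    )
    assert tokens.shape == (2, 16)
    assert confidences.shape == (2, 16)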
@@ -733,7 +720,7 @@ class VampNet(at.ml.BaseModel):
 
             # get our new mask
             mask = mask_by_random_topk(
-                num_to_mask, selected_probs,
+                num_to_mask, selected_probs, mask_temperature * (1-r)
             )
 
             # update the mask
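`mask_temperature * (1-r)` anneals the re-masking: `r` climbs from 1/steps to 1 over the loop, so the multiplier decays linearly from nearly `mask_temperature` down to 0, and the choice of which tokens to re-mask shifts from noisy to purely confidence-ranked. Since `mask_by_random_topk`'s body is outside this diff, here is a hedged MaskGIT-style sketch of the role that temperature typically plays (the actual implementation may differ in details):

    import torch

    def mask_by_random_topk_sketch(num_to_mask: int, probs: torch.Tensor,
                                   temperature: float = 1.0) -> torch.Tensor:
        # Gumbel noise scaled by `temperature` perturbs the confidence ranking;
        # at temperature 0 exactly the num_to_mask least-confident positions
        # are re-masked.
        gumbel = -torch.log(-torch.log(torch.rand_like(probs)))
        confidence = torch.log(probs) + temperature * gumbel
        idx = confidence.argsort(dim=-1)  # ascending: least confident first
        mask = torch.zeros_like(probs, dtype=torch.bool)
        mask.scatter_(-1, idx[..., :num_to_mask], True)
        return mask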
@@ -766,6 +753,91 @@ class VampNet(at.ml.BaseModel):
         else:
             return sampled_z
 
+def sample_from_logits(
+        logits,
+        sample: bool = True,
+        temperature: float = 1.0,
+        top_k: int = None,
+        top_p: float = None,
+        typical_filtering: bool = False,
+        typical_mass: float = 0.2,
+        typical_min_tokens: int = 1,
+        return_probs: bool = False
+    ):
+    """Convenience function to sample from a categorical distribution with input as
+    unnormalized logits.
+
+    Parameters
+    ----------
+    logits : Tensor[..., vocab_size]
+    config: SamplingConfig
+        The set of hyperparameters to be used for sampling
+    sample : bool, optional
+        Whether to perform multinomial sampling, by default True
+    temperature : float, optional
+        Scaling parameter when multinomial sampling, by default 1.0
+    top_k : int, optional
+        Restricts sampling to only `top_k` values acc. to probability,
+        by default None
+    top_p : float, optional
+        Restricts sampling to only those values with cumulative
+        probability <= `top_p`, by default None
+
+    Returns
+    -------
+    Tensor[...]
+        Sampled tokens
+    """
+    shp = logits.shape[:-1]
+
+    if typical_filtering:
+        typical_filter(logits,
+                       typical_mass=typical_mass,
+                       typical_min_tokens=typical_min_tokens
+        )
+
+    # Apply top_k sampling
+    if top_k is not None:
+        v, _ = logits.topk(top_k)
+        logits[logits < v[..., [-1]]] = -float("inf")
+
+    # Apply top_p (nucleus) sampling
+    if top_p is not None and top_p < 1.0:
+        v, sorted_indices = logits.sort(descending=True)
+        cumulative_probs = v.softmax(dim=-1).cumsum(dim=-1)
+
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Right shift indices_to_remove to keep 1st token over threshold
+        sorted_indices_to_remove = F.pad(sorted_indices_to_remove, (1, 0), value=False)[
+            ..., :-1
+        ]
+
+        # Compute indices_to_remove in unsorted array
+        indices_to_remove = sorted_indices_to_remove.scatter(
+            -1, sorted_indices, sorted_indices_to_remove
+        )
+
+        logits[indices_to_remove] = -float("inf")
+
+    # Perform multinomial sampling after normalizing logits
+    probs = (
+        F.softmax(logits / temperature, dim=-1)
+        if temperature > 0
+        else logits.softmax(dim=-1)
+    )
+    token = (
+        probs.view(-1, probs.size(-1)).multinomial(1).squeeze(1).view(*shp)
+        if sample
+        else logits.argmax(-1)
+    )
+
+    if return_probs:
+        token_probs = probs.take_along_dim(token.unsqueeze(-1), dim=-1).squeeze(-1)
+        return token, token_probs
+    else:
+        return token
+
+
 
 def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):
     """