dkounadis
/

artificial-styletts2

@@ -91,7 +91,6 @@ def overlay(x, soundscape=None):
             target_rate=24000)[0, :-250]  # last samples have splash sounds DISCARD 25000 last samples
-        k = background.shape[0]
@@ -100,21 +99,11 @@ def overlay(x, soundscape=None):
-        hop = int(.99 * k)  # only overlap 10%
-        n_repeat = len(x) // hop
-        total = np.zeros( hop * (n_repeat + 2))  # add some extra pad space for last frame to fit
-        m = np.ones(k)
-        overlap = k - hop
-        m[hop:] = np.linspace(1, 0, overlap)  # tril mask for avg sound in the interpolated hop
-        for j in range(n_repeat):
-            total[j*hop:j*hop + k] += m * background  # the total is already smoothly fading due to the previous mask. Only new addition of signal needs to rise smoothly
-        print((total < -1).sum(), (total > 1).sum(), 'OUTOF BOUNDS\n\n\n\n')
-        # total = total.clip(-1, 1)  # if too many signals were added on top of each other
-        # print(total[40000:70000].tolist())
-        print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
         # less periodic
@@ -124,7 +113,7 @@ def overlay(x, soundscape=None):
         # amplify sounds full [-1,1]
         total /= np.abs(total).max() + 1e-7
-        x = .4 * x + .6 * total[:len(x)]
     else:

             target_rate=24000)[0, :-250]  # last samples have splash sounds DISCARD 25000 last samples
+        n_repeat = len(x) // background.shape[0] + 1
+        total = np.tile(background, n_repeat)
         # less periodic
         # amplify sounds full [-1,1]
         total /= np.abs(total).max() + 1e-7
+        x = .5 * x + .5 * total[:len(x)]
     else:

audiocraft/builders.py CHANGED Viewed

@@ -79,8 +79,8 @@ class AudioGen(nn.Module):
                 conditions=attributes,
                 max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
-            # print('______________\nGENTOk 5', gen_tokens)
-            print('GENAUD 5', x.sum(), x.shape)
         return x / x.abs().max(2, keepdims=True)[0] + 1e-7

                 conditions=attributes,
                 max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
+            # print('______________\nAudioGen Tokens', gen_tokens)
         return x / x.abs().max(2, keepdims=True)[0] + 1e-7

live_demo.py CHANGED Viewed

@@ -29,10 +29,11 @@ args.speed = 1.14
 os.system('cls' if os.name == 'nt' else 'clear')
 while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
-    args.soundscape = _str
     _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)
     with open(args.text, 'w') as f:

 os.system('cls' if os.name == 'nt' else 'clear')
 while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
     _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
+    args.soundscape = _str
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)
     with open(args.text, 'w') as f: