Update model/utils.py

model/utils.py  (+30 −7)
@@ -76,21 +76,44 @@ def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:
 
     return num / den.clamp(min=1.0)
 
-
-# simple utf-8 tokenizer, since paper went character based
 def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
-    list_tensors = [torch.tensor([*bytes(t, "UTF-8")]) for t in text]  # ByT5 style
+    # Split each string into words
+    list_words = [t.split() for t in text]
+
+    # Convert words to tensors (assuming words are already in byte format)
+    list_tensors = [torch.tensor([*bytes(" ".join(words), "UTF-8")]) for words in list_words]  # ByT5 style
     text = pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
     return text
 
-
-# char tokenizer, based on custom dataset's extracted .txt file
 def list_str_to_idx(
     text: list[str] | list[list[str]],
-    vocab_char_map: dict[str, int],  # {char: idx}
+    vocab_map: dict[str, int],  # {word: idx}
     padding_value=-1,
 ) -> int["b nt"]:  # noqa: F722
-    list_idx_tensors = [torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text]  # pinyin or char style
+    # Split each string into words if not already split
+    if isinstance(text[0], str):
+        list_words = []
+        for t in text:
+            # Split the text by triple spaces
+            parts = t.split("   ")
+            words = []
+            for i, part in enumerate(parts):
+                # Split each part into words (by single spaces)
+                words.extend(part.split())
+                # Add a space token if there are more parts (i.e., triple spaces were present)
+                if i < len(parts) - 1:
+                    words.append(" ")  # Add a space token
+            list_words.append(words)
+    else:
+        list_words = text
+
+    # Convert words to their corresponding indices using vocab_map
+    list_idx_tensors = [
+        torch.tensor([vocab_map.get(word, 0) for word in words])  # Use 0 for unknown words
+        for words in list_words
+    ]
+
+    # Pad the sequences
     text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
     return text
 
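One behavioral note for review: in the new list_str_to_tensor, t.split() followed by " ".join(...) collapses any run of whitespace down to a single space before byte-encoding, so the triple-space markers that list_str_to_idx turns into explicit " " tokens are normalized away here. A minimal sketch to check the byte-level output, assuming the updated module is importable as model.utils:

    from model.utils import list_str_to_tensor  # assumes repo root is on the path

    out = list_str_to_tensor(["hello world", "hi"])
    print(out.shape)  # torch.Size([2, 11]): "hello world" is 11 UTF-8 bytes
    print(out[1])     # tensor([104, 105, -1, -1, ...]): "hi" right-padded with -1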
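Similarly, a quick check of the triple-space handling in list_str_to_idx. The toy vocab_map below is a hypothetical example for illustration only (the "<unk>" entry and word indices are assumptions, not part of the diff); the fallback to index 0 for unknown words is the behavior coded above:

    from model.utils import list_str_to_idx  # assumes repo root is on the path

    vocab_map = {"<unk>": 0, " ": 1, "hello": 2, "world": 3}  # toy vocabulary (assumption)

    # Triple space between "world" and "hello" should become an explicit " " token.
    print(list_str_to_idx(["hello world   hello"], vocab_map))
    # tensor([[2, 3, 1, 2]])

    # Unknown words fall back to index 0.
    print(list_str_to_idx(["hello foo"], vocab_map))
    # tensor([[2, 0]])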