Upload formats.py with huggingface_hub
Browse files- formats.py +44 -2
formats.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from typing import (
|
| 2 |
Any,
|
| 3 |
Dict,
|
|
@@ -14,9 +15,51 @@ class Format(StreamInstanceOperator):
|
|
| 14 |
pass
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
class SystemFormat(Format):
|
| 18 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
| 19 |
|
|
|
|
|
|
|
| 20 |
SystemFormat expects the input instance to contain:
|
| 21 |
1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
|
| 22 |
2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
|
|
@@ -107,7 +150,6 @@ class SystemFormat(Format):
|
|
| 107 |
instance=instance, field_name="system_prompt"
|
| 108 |
)
|
| 109 |
|
| 110 |
-
# pop "system_prompt", "instruction", and "target_prefix" from instance
|
| 111 |
if "target_prefix" in instance:
|
| 112 |
instance.pop("target_prefix")
|
| 113 |
if "instruction" in instance:
|
|
@@ -122,7 +164,6 @@ class SystemFormat(Format):
|
|
| 122 |
demos is not None and isoftype(demos, List[Dict[str, Any]])
|
| 123 |
), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
|
| 124 |
demo_instances = demos
|
| 125 |
-
# pop demos from instance
|
| 126 |
instance.pop(self.demos_field)
|
| 127 |
|
| 128 |
demos_string = ""
|
|
@@ -143,5 +184,6 @@ class SystemFormat(Format):
|
|
| 143 |
target_prefix=target_prefix,
|
| 144 |
**self.format_args,
|
| 145 |
)
|
|
|
|
| 146 |
instance["source"] = output
|
| 147 |
return instance
|
|
|
|
| 1 |
+
import re
|
| 2 |
from typing import (
|
| 3 |
Any,
|
| 4 |
Dict,
|
|
|
|
| 15 |
pass
|
| 16 |
|
| 17 |
|
| 18 |
+
def apply_capital_new_line_notation(text: str) -> str:
    r"""Transform a string by applying the Capital New Line Notation.

    The Capital New Line Notation is the two-character marker ``\N`` (a backslash
    followed by a capital N). It means "make sure there is exactly one newline here,
    but only if some text precedes it". Two rules are applied, in this order:

    1. If the string starts with a run of newline characters and/or ``\N`` markers
       that ends with ``\N`` (including a lone ``\N`` at the very beginning), the
       whole run is removed: with no preceding text, no newline is introduced.
    2. Anywhere else, a run of newline characters and/or ``\N`` markers that ends
       with ``\N`` is replaced by a single newline character.

    Newline characters that are not followed by a ``\N`` marker are left untouched,
    and so are all other characters (in particular ``(``, ``)``, ``N`` and stray
    backslashes adjacent to a marker are never consumed).

    Args:
        text: The input string, potentially containing ``\N`` markers mixed with
            actual newline characters.

    Returns:
        The string after applying the two rules above.

    Examples:
        >>> apply_capital_new_line_notation("Hello World\n\n\\N")
        'Hello World\n'

        >>> apply_capital_new_line_notation("\n\n\\NGoodbye World")
        'Goodbye World'

        >>> apply_capital_new_line_notation("\\N")
        ''

        >>> apply_capital_new_line_notation("(answer)\\Nnext")
        '(answer)\nnext'
    """
    # Rule 1: a leading run of newlines / \N markers ending with \N has no text
    # before it, so it is dropped entirely.
    text = re.sub(r"^(?:\n|\\N)*\\N", "", text)
    # Rule 2: every remaining run ending with \N collapses to one newline.
    # A non-capturing group is used on purpose: a character class such as
    # [\n(\\N)] would match the single characters '(', ')', '\' and 'N' and
    # swallow ordinary text that happens to precede the marker.
    return re.sub(r"(?:\n|\\N)*\\N", "\n", text)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
class SystemFormat(Format):
|
| 59 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
| 60 |
|
| 61 |
+
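Important: formats can use the '\N' notation, which stands for a single new-line that is added only when some text precedes it: new-lines directly before a '\N' are merged into that one new-line, and a '\N' with nothing but new-lines (or nothing at all) before it is removed.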
|
| 62 |
+
|
| 63 |
SystemFormat expects the input instance to contain:
|
| 64 |
1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
|
| 65 |
2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
|
|
|
|
| 150 |
instance=instance, field_name="system_prompt"
|
| 151 |
)
|
| 152 |
|
|
|
|
| 153 |
if "target_prefix" in instance:
|
| 154 |
instance.pop("target_prefix")
|
| 155 |
if "instruction" in instance:
|
|
|
|
| 164 |
demos is not None and isoftype(demos, List[Dict[str, Any]])
|
| 165 |
), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
|
| 166 |
demo_instances = demos
|
|
|
|
| 167 |
instance.pop(self.demos_field)
|
| 168 |
|
| 169 |
demos_string = ""
|
|
|
|
| 184 |
target_prefix=target_prefix,
|
| 185 |
**self.format_args,
|
| 186 |
)
|
| 187 |
+
output = apply_capital_new_line_notation(output)
|
| 188 |
instance["source"] = output
|
| 189 |
return instance
|