Fix: input_ids sequence length problem
__pycache__/helper_functions.cpython-310.pyc  CHANGED
Binary files a/__pycache__/helper_functions.cpython-310.pyc and b/__pycache__/helper_functions.cpython-310.pyc differ

helper_functions.py  CHANGED
@@ -112,7 +112,7 @@ def transform_single_text(
     tokens = tokenize_whole_text(text, tokenizer)
     input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
     add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
-
+    add_padding_tokens(input_id_chunks, mask_chunks, chunk_size)
     input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
     return input_ids, attention_mask

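Without the padding call, chunks produced from the tail of a text can be shorter than `chunk_size + 2`, which breaks stacking them into a single `input_ids` tensor. The sketch below is illustrative only: the tokenizer checkpoint and the argument list of `transform_single_text` are assumptions inferred from the helpers it calls, not something this diff shows in full.

```python
# Illustrative usage sketch, not part of the commit. Assumes transform_single_text
# takes (text, tokenizer, chunk_size, stride, minimal_chunk_length) and that a
# BERT-style tokenizer is used.
from transformers import AutoTokenizer
from helper_functions import transform_single_text

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
long_text = "example sentence about chunked inference " * 300

input_ids, attention_mask = transform_single_text(
    long_text, tokenizer, chunk_size=510, stride=510, minimal_chunk_length=1
)
# After the fix every chunk is padded to chunk_size + 2 = 512 positions,
# so the stacked tensors share one sequence length.
print(input_ids.shape, attention_mask.shape)  # e.g. torch.Size([N, 512]) for both
```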
@@ -158,19 +158,16 @@ def add_special_tokens_at_beginning_and_end(input_id_chunks: list[Tensor], mask_
         mask_chunks[i] = torch.cat([Tensor([1]), mask_chunks[i], Tensor([1])])


-
-
-    """Adds padding tokens at the end to make sure that all chunks have exactly chunk_size tokens."""
-    pad_token_id = 0  # Assuming this is defined somewhere in your code
+def add_padding_tokens(input_id_chunks: list[Tensor], mask_chunks: list[Tensor], chunk_size: int) -> None:
+    """Adds padding tokens (token id = 0) at the end so that every chunk has exactly chunk_size + 2 tokens."""
     for i in range(len(input_id_chunks)):
         # get required padding length
-        pad_len = chunk_size +2 - input_id_chunks[i].shape[0]
+        pad_len = chunk_size + 2 - input_id_chunks[i].shape[0]
         # check if tensor length satisfies required chunk size
         if pad_len > 0:
             # if padding length is more than 0, we must add padding
-            input_id_chunks[i] = torch.cat([input_id_chunks[i],
-            mask_chunks[i] = torch.cat([mask_chunks[i],
-
+            input_id_chunks[i] = torch.cat([input_id_chunks[i], Tensor([0] * pad_len)])
+            mask_chunks[i] = torch.cat([mask_chunks[i], Tensor([0] * pad_len)])


 def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
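The restored `add_padding_tokens` appends token id 0 and mask value 0 until each chunk reaches `chunk_size + 2` (the extra two positions cover the special tokens added by `add_special_tokens_at_beginning_and_end`). A minimal stand-alone sketch of the same padding step, using `torch.nn.functional.pad` instead of `torch.cat` (not the project's code, just an equivalent for illustration):

```python
# Stand-alone sketch of the padding step: pad on the right with zeros
# until the chunk reaches chunk_size + 2 positions.
import torch
import torch.nn.functional as F

def pad_chunk(chunk: torch.Tensor, chunk_size: int) -> torch.Tensor:
    pad_len = chunk_size + 2 - chunk.shape[0]
    if pad_len > 0:
        # (0, pad_len) = no padding on the left, pad_len zeros on the right
        chunk = F.pad(chunk, (0, pad_len), value=0)
    return chunk

short_chunk = torch.tensor([101.0, 7592.0, 2088.0, 102.0])  # made-up token ids
print(pad_chunk(short_chunk, chunk_size=510).shape)  # torch.Size([512])
```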
@@ -191,6 +188,13 @@ def split_overlapping(tensor: Tensor, chunk_size: int, stride: int, minimal_chun

 ## Voice part

+def stack_tokens_from_all_chunks_for_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
+    """Reshapes data to a form compatible with BERT model input."""
+    input_ids = torch.stack(input_id_chunks)
+    attention_mask = torch.stack(mask_chunks)
+
+    return input_ids.long(), attention_mask.int()
+
 def transform_for_inference_text(text: str,
                                  tokenizer: PreTrainedTokenizerBase,
                                  chunk_size: int,
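The new inference-side helper stacks the per-chunk tensors into a 2-D batch and casts `input_ids` to `long` and the mask to `int`, the integer dtypes a BERT-style model expects. A quick shape/dtype check (the token values below are invented for illustration; in real use the chunks come from `split_tokens_into_smaller_chunks`):

```python
# Quick shape/dtype check for the new helper. torch.stack requires equal-length
# chunks, which is exactly what the padding step above guarantees.
import torch
from torch import Tensor
from helper_functions import stack_tokens_from_all_chunks_for_inference

chunks = [Tensor([101, 5, 6, 102, 0]), Tensor([101, 7, 8, 102, 0])]
masks = [Tensor([1, 1, 1, 1, 0]), Tensor([1, 1, 1, 1, 0])]

input_ids, attention_mask = stack_tokens_from_all_chunks_for_inference(chunks, masks)
print(input_ids.shape, input_ids.dtype)            # torch.Size([2, 5]) torch.int64
print(attention_mask.shape, attention_mask.dtype)  # torch.Size([2, 5]) torch.int32
```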
@@ -204,7 +208,7 @@ def transform_for_inference_text(text: str,
     input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
     add_special_tokens_at_beginning_and_end_inference(input_id_chunks, mask_chunks)
     add_padding_tokens_inference(input_id_chunks, mask_chunks, chunk_size)
-    input_ids, attention_mask =
+    input_ids, attention_mask = stack_tokens_from_all_chunks_for_inference(input_id_chunks, mask_chunks)
     return {"input_ids": input_ids, "attention_mask": attention_mask}

 def add_special_tokens_at_beginning_and_end_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
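With the previously dangling assignment completed, `transform_for_inference_text` returns a dict whose keys match the keyword arguments of Hugging Face BERT-style models, so it can be unpacked straight into a forward pass. A hedged end-to-end sketch follows; the model checkpoint, the trailing parameters of `transform_for_inference_text`, and the mean-pooling of per-chunk outputs are all assumptions, not something this commit specifies.

```python
# End-to-end sketch under the assumptions stated above.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from helper_functions import transform_for_inference_text

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

text = "a document long enough to be split into several chunks " * 200
batch = transform_for_inference_text(
    text, tokenizer, chunk_size=510, stride=510, minimal_chunk_length=1
)

with torch.no_grad():
    logits = model(**batch).logits                  # one row of logits per chunk
probs = torch.softmax(logits, dim=-1).mean(dim=0)   # average the chunk predictions
```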