YsnHdn committed
Commit c6eb236 · 1 Parent(s): d479d0f

Fix: input_ids sequence length problem

__pycache__/helper_functions.cpython-310.pyc CHANGED
Binary files a/__pycache__/helper_functions.cpython-310.pyc and b/__pycache__/helper_functions.cpython-310.pyc differ
 
helper_functions.py CHANGED
@@ -112,7 +112,7 @@ def transform_single_text(
     tokens = tokenize_whole_text(text, tokenizer)
     input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
     add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
-    input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
+    add_padding_tokens(input_id_chunks, mask_chunks, chunk_size)
     input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
     return input_ids, attention_mask
 
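Why this hunk fixes the "sequence length problem": before the commit, transform_single_text stacked the chunks without ever padding them, so chunks of unequal length reached torch.stack, which requires equally sized tensors. A minimal repro sketch of the failure and the fix (the chunk lengths here are hypothetical, not from this repo):

import torch

# Two chunks of unequal length, as produced before padding was applied.
chunks = [torch.ones(512), torch.ones(132)]

# torch.stack(chunks)  # would raise RuntimeError: stack expects each tensor to be equal size

# Padding every chunk to a common length makes stacking succeed.
padded = [torch.cat([c, torch.zeros(512 - c.shape[0])]) for c in chunks]
batch = torch.stack(padded)  # shape: (2, 512)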
 
@@ -158,19 +158,16 @@ def add_special_tokens_at_beginning_and_end(input_id_chunks: list[Tensor], mask_
         mask_chunks[i] = torch.cat([Tensor([1]), mask_chunks[i], Tensor([1])])
 
 
-
-def add_padding_tokens(input_id_chunks: list[Tensor], mask_chunks: list[Tensor], chunk_size: int) -> None:
-    """Adds padding tokens at the end to make sure that all chunks have exactly chunk_size tokens."""
-    pad_token_id = 0  # Assuming this is defined somewhere in your code
+def add_padding_tokens(input_id_chunks: list[Tensor], mask_chunks: list[Tensor], chunk_size) -> None:
+    """Adds padding tokens (token id = 0) at the end to make sure that all chunks have exactly 512 tokens."""
     for i in range(len(input_id_chunks)):
         # get required padding length
-        pad_len = chunk_size +2 - input_id_chunks[i].shape[0]
+        pad_len = chunk_size + 2 - input_id_chunks[i].shape[0]
         # check if tensor length satisfies required chunk size
         if pad_len > 0:
             # if padding length is more than 0, we must add padding
-            input_id_chunks[i] = torch.cat([input_id_chunks[i], torch.tensor([pad_token_id] * pad_len)])
-            mask_chunks[i] = torch.cat([mask_chunks[i], torch.tensor([0] * pad_len)])
-
+            input_id_chunks[i] = torch.cat([input_id_chunks[i], Tensor([0] * pad_len)])
+            mask_chunks[i] = torch.cat([mask_chunks[i], Tensor([0] * pad_len)])
 
 
 def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
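The `chunk_size + 2` in pad_len accounts for the [CLS] and [SEP] tokens that add_special_tokens_at_beginning_and_end attaches to every chunk, so each padded chunk ends up chunk_size + 2 tokens long (512 when chunk_size is 510). A worked example with hypothetical sizes:

chunk_size = 510        # raw tokens per chunk; 510 + 2 specials = 512
raw_last_chunk = 130    # the final chunk is usually shorter than chunk_size
after_specials = raw_last_chunk + 2          # [CLS] ... [SEP] added around the chunk
pad_len = chunk_size + 2 - after_specials    # 512 - 132 = 380 padding tokens
assert pad_len == 380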
@@ -191,6 +188,13 @@ def split_overlapping(tensor: Tensor, chunk_size: int, stride: int, minimal_chun
 
 ## Voice part
 
+def stack_tokens_from_all_chunks_for_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
+    """Reshapes data to a form compatible with BERT model input."""
+    input_ids = torch.stack(input_id_chunks)
+    attention_mask = torch.stack(mask_chunks)
+
+    return input_ids.long(), attention_mask.int()
+
 def transform_for_inference_text(text: str,
                                  tokenizer: PreTrainedTokenizerBase,
                                  chunk_size: int,
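Because the chunks are built with float Tensor(...) concatenations, the .long()/.int() casts in the new stack_tokens_from_all_chunks_for_inference are what give the model integer inputs. A quick dtype/shape sketch (the token ids are made up):

import torch
from torch import Tensor

input_id_chunks = [Tensor([101, 7592, 102, 0]), Tensor([101, 2088, 102, 0])]  # float32
mask_chunks = [Tensor([1, 1, 1, 0]), Tensor([1, 1, 1, 0])]

input_ids = torch.stack(input_id_chunks).long()  # torch.int64, shape (2, 4)
attention_mask = torch.stack(mask_chunks).int()  # torch.int32, shape (2, 4)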
@@ -204,7 +208,7 @@ def transform_for_inference_text(text: str,
     input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens, chunk_size, stride, minimal_chunk_length)
     add_special_tokens_at_beginning_and_end_inference(input_id_chunks, mask_chunks)
     add_padding_tokens_inference(input_id_chunks, mask_chunks, chunk_size)
-    input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
+    input_ids, attention_mask = stack_tokens_from_all_chunks_for_inference(input_id_chunks, mask_chunks)
     return {"input_ids": input_ids, "attention_mask": attention_mask}
 
 def add_special_tokens_at_beginning_and_end_inference(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
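End to end, the fixed inference path can be exercised like this (a sketch: the tokenizer choice and the chunk_size/stride/minimal_chunk_length values are assumptions, not part of this commit):

from transformers import AutoTokenizer
from helper_functions import transform_for_inference_text

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model
batch = transform_for_inference_text(
    "a long document ... " * 500,
    tokenizer,
    chunk_size=510,          # 510 raw tokens + [CLS]/[SEP] = 512 per chunk
    stride=510,              # assumed: non-overlapping chunks
    minimal_chunk_length=1,  # assumed minimum
)
# All chunks now share one length, so the batch stacks cleanly.
print(batch["input_ids"].shape)       # (n_chunks, 512)
print(batch["attention_mask"].shape)  # (n_chunks, 512)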
 