padding_length = self._max_seq_length - valid_length
# use padding tokens for the rest
input_ids.extend([self._tokenizer.vocab["[PAD]"]] * padding_length)
segment_ids.extend([self._tokenizer.vocab["[PAD]"]] * padding_length)
return np.array(input_ids, dtype="int32"), np.array(valid_length, dtype="int32"),\
    np.array(segment_ids, dtype="int32")
After Change
# use padding tokens for the input_ids and 0 for segment_ids
input_ids.extend(
    [self._tokenizer.vocab[self._tokenizer.vocab.padding_token]] * padding_length)
segment_ids.extend([0] * padding_length)
return np.array(input_ids, dtype="int32"), np.array(valid_length, dtype="int32"),\
    np.array(segment_ids, dtype="int32")
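The change fixes two things: the pad token is now looked up via the vocab's padding_token attribute instead of hardcoding "[PAD]", and segment_ids are padded with 0, since segment ids are 0/1 sentence markers rather than vocabulary indices. For context, here is a minimal self-contained sketch of the resulting padding behavior; the pad_to_max helper and the toy ids are hypothetical, not part of the source:

import numpy as np

def pad_to_max(input_ids, segment_ids, max_seq_length, pad_id):
    # pad_id is the vocabulary index of the tokenizer's padding token
    valid_length = len(input_ids)
    padding_length = max_seq_length - valid_length
    # pad input_ids with the padding id, segment_ids with 0
    input_ids = input_ids + [pad_id] * padding_length
    segment_ids = segment_ids + [0] * padding_length
    return (np.array(input_ids, dtype="int32"),
            np.array(valid_length, dtype="int32"),
            np.array(segment_ids, dtype="int32"))

# Example: a 5-token sequence padded to length 8 with pad id 1
ids, length, segments = pad_to_max([2, 7, 9, 4, 3], [0, 0, 0, 1, 1], 8, pad_id=1)
print(ids)       # [2 7 9 4 3 1 1 1]
print(length)    # 5
print(segments)  # [0 0 0 1 1 0 0 0]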