# user did not specify a meaningful sequence length,
# override the sequence length by the maximum seq length of the current batch
if max_seq_length is None:
    max_seq_length = min(max(len(ta) + len(tb) for ta, tb in all_tokens) + 3,
                         max_position_embeddings)  # "+ 3" accounts for a maximum of 3 special symbols
    logger.warning('"max_seq_length" is undefined, '
                   'and bert config json defines "max_position_embeddings"=%d. '
                   'hence set "max_seq_length"=%d according to the current batch.' % (
                       max_position_embeddings, max_seq_length))
for (tokens_a, tokens_b) in all_tokens:
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
input_type_ids = [0] * len(tokens)
input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]
if tokens_b:
tokens += tokens_b + ["[SEP]"]
input_type_ids += [1] * (len(tokens_b) + 1)
input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
// Zero-pad up to the sequence length. more pythonic
pad_len = max_seq_length - len(input_ids)
input_ids += [0] * pad_len
input_mask += [0] * pad_len
input_type_ids += [0] * pad_len
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(input_type_ids) == max_seq_length
logger.debug("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
logger.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.debug("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
After Change
# user did not specify a meaningful sequence length,
# override the sequence length by the maximum seq length of the current batch
if max_seq_length is None:
    max_seq_length = max(len(ta) + len(tb) for ta, tb in all_tokens)
    # take the special tokens into account:
    # case 1: [CLS] tokens_a [SEP] tokens_b [SEP] -> 3 additional tokens
    # case 2: [CLS] tokens_a [SEP]                -> 2 additional tokens
    max_seq_length += 3 if any(len(tb) for _, tb in all_tokens) else 2
    max_seq_length = min(max_seq_length, max_position_embeddings)
    logger.warning('"max_seq_length" is undefined, '
                   'and bert config json defines "max_position_embeddings"=%d. '
                   'hence set "max_seq_length"=%d according to the current batch.' % (
                       max_position_embeddings, max_seq_length))
for (tokens_a, tokens_b) in all_tokens:
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
input_type_ids = [0] * len(tokens)
input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]
if tokens_b:
tokens += tokens_b + ["[SEP]"]
input_type_ids += [1] * (len(tokens_b) + 1)
input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
// Zero-pad up to the sequence length. more pythonic
pad_len = max_seq_length - len(input_ids)
input_ids += [0] * pad_len
input_mask += [0] * pad_len
input_type_ids += [0] * pad_len
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(input_type_ids) == max_seq_length
logger.debug("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
logger.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.debug("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))