# user did not specify a meaningful sequence length,
# override the sequence length by the maximum seq length of the current batch
if max_seq_length is None:
    max_seq_length = min(max(len(ta) + len(tb) for ta, tb in all_tokens) + 3,
                         max_position_embeddings)  # "+ 3" accounts for a maximum of 3 special symbols
    logger.warning('"max_seq_length" is undefined, '
                   'and bert config json defines "max_position_embeddings"=%d. '
                   'hence set "max_seq_length"=%d according to the current batch.' % (
                       max_position_embeddings, max_seq_length))
for (tokens_a, tokens_b) in all_tokens:
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
input_type_ids = [0] * len(tokens)
input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]
if tokens_b:
tokens += tokens_b + ["[SEP]"]
input_type_ids += [1] * (len(tokens_b) + 1)
input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
// Zero-pad up to the sequence length. more pythonic
pad_len = max_seq_length - len(input_ids)
input_ids += [0] * pad_len
input_mask += [0] * pad_len
input_type_ids += [0] * pad_len
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(input_type_ids) == max_seq_length
logger.debug("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
logger.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.debug("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
After Change
# user did not specify a meaningful sequence length,
# override the sequence length by the maximum seq length of the current batch
if max_seq_length is None:
    max_seq_length = max(len(ta) + len(tb) for ta, tb in all_tokens)
    # take the special tokens into account:
    # case 1: [CLS] tokens_a [SEP] tokens_b [SEP] -> 3 additional tokens
    # case 2: [CLS] tokens_a [SEP]                -> 2 additional tokens
    max_seq_length += 3 if any(len(tb) for _, tb in all_tokens) else 2
    max_seq_length = min(max_seq_length, max_position_embeddings)
    logger.warning('"max_seq_length" is undefined, '
                   'and bert config json defines "max_position_embeddings"=%d. '
                   'hence set "max_seq_length"=%d according to the current batch.' % (
                       max_position_embeddings, max_seq_length))
for (tokens_a, tokens_b) in all_tokens:
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
input_type_ids = [0] * len(tokens)
input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]
if tokens_b:
tokens += tokens_b + ["[SEP]"]
input_type_ids += [1] * (len(tokens_b) + 1)
input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
// Zero-pad up to the sequence length. more pythonic
pad_len = max_seq_length - len(input_ids)
input_ids += [0] * pad_len
input_mask += [0] * pad_len
input_type_ids += [0] * pad_len
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(input_type_ids) == max_seq_length
logger.debug("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
logger.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.debug("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))