if hasattr(self._tokenizer, "vocab"):
vocab_field_name = "vocab"
elif hasattr(self._tokenizer, "encoder"):
vocab_field_name = "encoder"
else:
logger.warning(
Wasn"t able to fetch vocabulary from pretrained transformers lib.
Neither <vocab> nor <encoder> are the valid fields for vocab.
Your tokens will still be correctly indexed, but vocabulary file will not be saved.
)
if vocab_field_name is not None:
pretrained_vocab = getattr(self._tokenizer, vocab_field_name)
for word, idx in pretrained_vocab.items():
vocab._token_to_index[self._namespace][word] = idx
vocab._index_to_token[self._namespace][idx] = word
self._added_to_vocabulary = True
@overrides
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):