0bcc8da0344cddc9dfff82a788df519c19489500,nltk/tokenize/treebank.py,TreebankWordTokenizer,span_tokenize,#TreebankWordTokenizer#Any#,147
Before Change
if "" " in text:
// Find double quotes and converted quotes
matched = [ m.group() for m in re.finditer(r" [(``)(\"\")(" )]+", text)]
// Replace converted quotes back to double quotes
tokens = [matched.pop(0) if tok in [" "" , "``" , "" "" ] else tok for tok in raw_tokens]
else :
tokens = raw_tokens
return align_tokens (tokens, text)
After Change
ix = 0
spans = []
for word_token in self.tokenize(text):
if word_token in ("``" , "" "" ) :
orig_idx = text.find(word_token, ix)
quote_idx = text.find("" ", ix)
if orig_idx < 0:
real_token = " ""
elif quote_idx < 0 :
real_token = word_token
elif orig_idx < quote_idx:
real_token = word_token
else :
real_token = "" "
else:
real_token = word_token
ix = text.find(real_token, ix)
end = ix + len(real_token)
spans.append((ix, end))
ix = end
return spans
class TreebankWordDetokenizer(TokenizerI):
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 7
Instances Project Name: nltk/nltk
Commit Name: 0bcc8da0344cddc9dfff82a788df519c19489500
Time: 2017-10-17
Author: lyyb46@gmail.com
File Name: nltk/tokenize/treebank.py
Class Name: TreebankWordTokenizer
Method Name: span_tokenize
Project Name: nipunsadvilkar/pySBD
Commit Name: 0e364f4b0f70679ca984c0ba5629c569135804a4
Time: 2020-07-26
Author: nipunsadvilkar@gmail.com
File Name: pysbd/segmenter.py
Class Name: Segmenter
Method Name: sentences_with_char_spans
Project Name: estnltk/estnltk
Commit Name: 65c6e38fa1d0f0d49a3b0dea22047b01bb0d5b13
Time: 2015-07-10
Author: amatsin@gmail.com
File Name: estnltk/wiki/images.py
Class Name:
Method Name: imageParser