4e144c9f842d7415d8be5bdbb5912d88ae32cced,pycorrector/seq2seq/corpus_reader.py,CGEDReader,read_samples_by_string,#CGEDReader#,80

Before Change


        with open(path, "r", encoding="utf-8") as f:
            dom_tree = minidom.parse(f)
            docs = dom_tree.documentElement.getElementsByTagName("DOC")
            for doc in docs:
                source_text = doc.getElementsByTagName("TEXT")[0]. \
                    childNodes[0].data.strip()
                target_text = doc.getElementsByTagName("CORRECTION")[0]. \
                    childNodes[0].data.strip()
                source = segment(source_text, cut_type="char")
                target = segment(target_text, cut_type="char")
                yield source, target

    def unknown_token(self):
        """Return the class-level ``UNKNOWN_TOKEN`` value of ``CGEDReader``."""
        token = CGEDReader.UNKNOWN_TOKEN
        return token

    def read_tokens(self, path, is_infer=False):

After Change


    def read_samples_by_string(self, path):
        """Yield ``(source, target)`` token lists from a two-lines-per-sample file.

        The file at ``path`` is read as UTF-8, two lines at a time: the first
        line of each pair is the source, the second the target. Each line is
        lower-cased, its first five characters are dropped, and the remainder
        is stripped and split on whitespace.

        NOTE(review): the five dropped characters are presumably a fixed-width
        label at the start of each line -- confirm against the corpus format.

        Reading stops at end of file or at the first source line shorter than
        five characters (a blank line, for example). If the file ends after a
        source line, that sample is still yielded with an empty target list.

        :param path: path of the text file to read.
        :return: generator of ``(source_tokens, target_tokens)`` tuples.
        """
        prefix_width = 5
        with open(path, "r", encoding="utf-8") as f:
            # iter(callable, sentinel) stops as soon as readline() returns ""
            # (end of file), which covers the "no more source lines" case.
            for src_line in iter(f.readline, ""):
                dst_line = f.readline()
                if len(src_line) < prefix_width:
                    # A too-short source line ends the stream; it is not skipped.
                    break
                src_tokens = src_line.lower()[prefix_width:].strip().split()
                dst_tokens = dst_line.lower()[prefix_width:].strip().split()
                yield src_tokens, dst_tokens


    def unknown_token(self):
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 10

Instances


Project Name: shibing624/pycorrector
Commit Name: 4e144c9f842d7415d8be5bdbb5912d88ae32cced
Time: 2018-04-16
Author: 507153809@qq.com
File Name: pycorrector/seq2seq/corpus_reader.py
Class Name: CGEDReader
Method Name: read_samples_by_string


Project Name: shibing624/pycorrector
Commit Name: 52dd8f17b382dea2ddaf3b4054d7845c8c3b4f72
Time: 2018-03-29
Author: 507153809@qq.com
File Name: pycorrector/seq2seq/fce_reader.py
Class Name: FCEReader
Method Name: read_samples_by_string


Project Name: shibing624/pycorrector
Commit Name: 4e144c9f842d7415d8be5bdbb5912d88ae32cced
Time: 2018-04-16
Author: 507153809@qq.com
File Name: pycorrector/seq2seq/corpus_reader.py
Class Name: CGEDReader
Method Name: read_tokens