43a50ab6a3f717c738c8abba16779be59878bf4b,src/fonduer/parser/preprocessors/hocr_doc_preprocessor.py,HOCRDocPreprocessor,_parse_file,#HOCRDocPreprocessor#,45

Before Change


                    word.unwrap()
            for parent in root.find_all(attrs={"fonduer": "1"}):
                if self.space:
                    parent.string.replace_with(" ".join(parent.stripped_strings))
                else:
                    parent.string.replace_with("".join(parent.stripped_strings))
                // Remove the mark

After Change


                    word.unwrap()

            // Clean-up
            for i, parent in enumerate(root.find_all(attrs={"fonduer": "1"})):
                // Concat consecutive NavigableString
                parent.smooth()  // beautifulsoup4 >= 4.8.0

                // Remove linebreaks and excess spaces
                // Iterate in reverse order because elements are removed from the list inside the loop
                for child in reversed(parent.contents):
                    if isinstance(child, NavigableString):
                        if child.strip() == "":  // remove if space or linebreak
                            child.extract()
                        else:
                            tmp = re.sub(r"[\n\s]+", " " if self.space else "", child)
                            n = NavigableString(tmp.strip())
                            child.replace_with(n)
                del parent["fonduer"]

        name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
        stable_id = self._get_stable_id(name)
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 5

Instances


Project Name: HazyResearch/fonduer
Commit Name: 43a50ab6a3f717c738c8abba16779be59878bf4b
Time: 2020-10-06
Author: hiromu.hota@hal.hitachi.com
File Name: src/fonduer/parser/preprocessors/hocr_doc_preprocessor.py
Class Name: HOCRDocPreprocessor
Method Name: _parse_file


Project Name: daniel-kukiela/nmt-chatbot
Commit Name: 4aafb1f432f7622bec2cb35b9d6a6165babd44d9
Time: 2017-11-23
Author: daniel@kukiela.pl
File Name: inference.py
Class Name:
Method Name:


Project Name: danforthcenter/plantcv
Commit Name: ee7dc09812df5dded34adcde4267d9c754e3b655
Time: 2019-09-11
Author: haleyschuhl@gmail.com
File Name: plantcv/plantcv/hyperspectral/read_data.py
Class Name:
Method Name: read_data