43a50ab6a3f717c738c8abba16779be59878bf4b,src/fonduer/parser/preprocessors/hocr_doc_preprocessor.py,HOCRDocPreprocessor,_parse_file,#HOCRDocPreprocessor#,45
Before Change
word.unwrap()
for parent in root.find_all(attrs={"fonduer": "1"}):
if self.space:
parent.string.replace_with(" ".join(parent.stripped_strings) )
else:
parent.string.replace_with("".join(parent.stripped_strings))
// Remove the mark
After Change
word.unwrap()
// Clean-up
for i, parent in enumerate(root.find_all(attrs={"fonduer": "1"})) :
// Concat consecutive NavigableString
parent.smooth() // beautifulsoup4 >= 4.8.0
// Remove linebreaks and excess spaces
// Iterate in reverse order because children are removed from the list inside the loop
for child in reversed(parent.contents):
if isinstance(child, NavigableString):
if child.strip() == "": // remove if space or linebreak
child.extract()
else:
tmp = re.sub(r"[\n\s]+", " " if self.space else "", child)
n = NavigableString(tmp.strip())
child.replace_with(n)
del parent["fonduer"]
name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")]
stable_id = self._get_stable_id(name)
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 5
Instances Project Name: HazyResearch/fonduer
Commit Name: 43a50ab6a3f717c738c8abba16779be59878bf4b
Time: 2020-10-06
Author: hiromu.hota@hal.hitachi.com
File Name: src/fonduer/parser/preprocessors/hocr_doc_preprocessor.py
Class Name: HOCRDocPreprocessor
Method Name: _parse_file
Project Name: daniel-kukiela/nmt-chatbot
Commit Name: 4aafb1f432f7622bec2cb35b9d6a6165babd44d9
Time: 2017-11-23
Author: daniel@kukiela.pl
File Name: inference.py
Class Name:
Method Name:
Project Name: danforthcenter/plantcv
Commit Name: ee7dc09812df5dded34adcde4267d9c754e3b655
Time: 2019-09-11
Author: haleyschuhl@gmail.com
File Name: plantcv/plantcv/hyperspectral/read_data.py
Class Name:
Method Name: read_data