# Capture INFO-level log records and open a database session for this test.
caplog.set_level(logging.INFO)
logger = logging.getLogger(__name__)
session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()

# SpaCy on mac has issues with parallel parsing, so use a single process there.
# NOTE(review): os.name is "posix" on Linux as well as macOS, so this branch
# also selects a single process on Linux -- confirm whether only macOS
# (sys.platform == "darwin") was intended.
if os.name == "posix":
    PARALLEL = 1
else:
    PARALLEL = 2  # Travis only gives 2 cores

max_docs = 2
docs_path = "tests/data/html_simple/"
pdf_path = "tests/data/pdf_simple/"

# Preprocessor for the docs (limited to max_docs documents)
preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

# Create a Parser and run it over the preprocessed documents
parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
parser.apply(preprocessor, parallelism=PARALLEL)

# Grab the diseases document: it must be the first one when ordered by name
doc = session.query(Document).order_by(Document.name).all()[0]
assert doc.name == "diseases"
logger.info("Doc: {}".format(doc))
for sentence in doc.sentences:
After Change
# Preprocessor for the docs; take the first (doc, text) pair that parse_file
# yields for the "diseases" file.
preprocessor = HTMLDocPreprocessor(docs_path)
doc, text = next(preprocessor.parse_file(docs_path, "diseases"))

# Check that doc has a name
assert doc.name == "diseases"

# Create a parser UDF and parse the diseases document
parser_udf = get_parser_udf(
    structural=True, lingual=True, visual=True, pdf_path=pdf_path
)
# Iterate the result of apply() to completion; the yielded items themselves
# are not used here.
for _ in parser_udf.apply((doc, text)):
    pass

# Log the document and the text of each of its sentences
logger.info("Doc: {}".format(doc))
for sentence in doc.sentences:
    logger.info(" Sentence: {}".format(sentence.text))