d7bcbade517750cc4fdb7642415e4cf05ae584e8,tutorial/utils.py,,collect_pubtator_annotations,#,8

Before Change


    Given a list of ddlite Documents with PubTator/CDR annotations,
    extract a dictionary of annotations by type.
    
    annotations = defaultdict(list)
    for a in doc.attribs["root"].xpath(".//annotation"):

        # Relation annotations
        if len(a.xpath("./infon[@key='relation']")) > 0:
            type = a.xpath("./infon[@key='relation']/text()")[0]
            types = a.xpath("./infon[@key != 'relation']/@key")
            mesh_ids = a.xpath("./infon[@key != 'relation']/text()")
            annotations[type].append(PubtatorRelation(types=types, mesh_ids=mesh_ids))

        # Mention annotations
        else:
            txt = a.xpath("./text/text()")[0]
            offset = int(a.xpath("./location/@offset")[0])
            length = int(a.xpath("./location/@length")[0])
            type = a.xpath("./infon[@key='type']/text()")[0]
            mesh = a.xpath("./infon[@key='MESH']/text()")[0]
            annotations[type].append(PubtatorMention(mesh_id=mesh, text=txt,
                                                     char_offset=offset, char_length=length))
    return annotations

After Change


    extract a set of Ngram objects indexed according to **Sentence character indexing**
    NOTE: Assume the sentences are provided in correct order & have standard separator.
    
    sent_offsets = [ensure_dict(s)[CHAR_OFFSETS][0] for s in sents]

    # Get Ngrams
    ngrams = []
    for a in doc.attribs["root"].xpath(".//annotation"):

        # Relation annotations
        if len(a.xpath("./infon[@key='relation']")) > 0:

            # TODO: Pull these out!
            type = a.xpath("./infon[@key='relation']/text()")[0]
            types = a.xpath("./infon[@key != 'relation']/@key")
            mesh_ids = a.xpath("./infon[@key != 'relation']/text()")
            pass

        # Mention annotations
        else:
            txt = a.xpath("./text/text()")[0]
            offset = int(a.xpath("./location/@offset")[0])
            length = int(a.xpath("./location/@length")[0])
            type = a.xpath("./infon[@key='type']/text()")[0]
            mesh = a.xpath("./infon[@key='MESH']/text()")[0]
            
            # Get sentence id and relative character offset
            si = len(sent_offsets) - 1
            for i,so in enumerate(sent_offsets):
                if offset == so:
                    si = i
                    break
                elif offset < so:
                    si = i - 1
                    break
            #offset -= sent_offsets[si]
            ngrams.append(Ngram(offset, offset + length - 1, sents[si], metadata={
                "mesh_id" : mesh, "type" : type}))
    return ngrams
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 9

Instances


Project Name: snorkel-team/snorkel
Commit Name: d7bcbade517750cc4fdb7642415e4cf05ae584e8
Time: 2016-07-03
Author: ajratner@gmail.com
File Name: tutorial/utils.py
Class Name:
Method Name: collect_pubtator_annotations


Project Name: HazyResearch/fonduer
Commit Name: 1d6771befb95f4ae94f308899633294a003dcfd6
Time: 2020-07-24
Author: hiromu.hota@hal.hitachi.com
File Name: src/fonduer/utils/data_model_utils/structural.py
Class Name:
Method Name: lowest_common_ancestor_depth


Project Name: snorkel-team/snorkel
Commit Name: d7bcbade517750cc4fdb7642415e4cf05ae584e8
Time: 2016-07-03
Author: ajratner@gmail.com
File Name: tutorial/utils.py
Class Name:
Method Name: collect_pubtator_annotations


Project Name: HazyResearch/fonduer
Commit Name: 1d6771befb95f4ae94f308899633294a003dcfd6
Time: 2020-07-24
Author: hiromu.hota@hal.hitachi.com
File Name: src/fonduer/utils/data_model_utils/structural.py
Class Name:
Method Name: common_ancestor