Given a ddlite Document with PubTator/CDR annotations,
extract a dictionary mapping each annotation type to the list of its annotations.
annotations = defaultdict(list)
for a in doc.attribs["root"].xpath(".//annotation"):
# Relation annotations (those carrying an infon whose key is "relation")
if len(a.xpath("./infon[@key="relation"]")) > 0:
type = a.xpath("./infon[@key="relation"]/text()")[0]
types = a.xpath("./infon[@key != "relation"]/@key")
mesh_ids = a.xpath("./infon[@key != "relation"]/text()")
annotations[type].append(PubtatorRelation(types=types, mesh_ids=mesh_ids))
# Mention annotations (every annotation without a "relation" infon)
else:
txt = a.xpath("./text/text()")[0]
offset = int(a.xpath("./location/@offset")[0])
length = int(a.xpath("./location/@length")[0])
type = a.xpath("./infon[@key="type"]/text()")[0]
mesh = a.xpath("./infon[@key="MESH"]/text()")[0]
annotations[type].append(PubtatorMention(mesh_id=mesh, text=txt,
char_offset=offset, char_length=length))
return annotations
After Change
sent_offsets = [ensure_dict(s)[CHAR_OFFSETS][0] for s in sents]
# Get Ngrams
ngrams = []
for a in doc.attribs["root"].xpath(".//annotation"):
# Relation annotations (those carrying an infon whose key is "relation")
if len(a.xpath("./infon[@key="relation"]")) > 0: