Skip to content

Commit 2bc7cc0

Browse files
committed
read.Conll2012 coreference format reader
1 parent 290edbc commit 2bc7cc0

File tree

1 file changed

+110
-0
lines changed

1 file changed

+110
-0
lines changed

udapi/block/read/conll2012.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Conll2012 is a reader block for the coreference in CoNLL-2012 format.
2+
3+
This implementation was tested on the LitBank files only, so far.
4+
LitBank does not use most of the columns, so the implementation
5+
should be improved to handle other types of CoNLL-2012 files.
6+
"""
7+
import json
8+
import logging
9+
import re
10+
11+
import udapi.block.read.conllu
12+
from udapi.core.root import Root
13+
from udapi.core.node import Node
14+
15+
#RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\); part (\d+)')
16+
# Matches a "#begin document (NAME)" header at the start of a line; group 1 is NAME.
# Anything after the closing parenthesis (e.g. a "; part NNN" suffix) is not captured.
RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\)')
17+
18+
class Conll2012(udapi.block.read.conllu.Conllu):
    """A reader of the Conll2012 files.

    The reader subclasses the CoNLL-U reader and provides its own parsing of
    comment lines, token lines and whole sentences.  Coreference brackets from
    the ``coref`` column are rewritten into the ``Entity`` attribute stored in
    ``node.misc``.
    """

    def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs):
        """Create the Conll2012 reader object.

        Args:
        attributes: comma-separated list of column names in the input files
            (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref')
            For ignoring a column, use "_" as its name.
            Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based)
            word-order number/index (usually called ID).
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        # Fallback document name, used until a "#begin document" line is seen.
        self._docname = 'd'

    def parse_comment_line(self, line, root):
        """Process one line starting with "#".

        "#end document" lines are ignored.  A "#begin document (NAME)" line
        stores NAME as ``root.newdoc`` and as the current document name, and
        marks the tree comment with the ``$GLOBAL.ENTITY`` placeholder.
        Any other comment line only triggers a warning.
        """
        if line.startswith("#end document"):
            return
        match = RE_BEGIN.match(line)
        if not match:
            logging.warning("Unexpected comment line: %s", line)
            return
        docname = match.group(1)
        root.newdoc = docname
        # NOTE(review): presumably consumed by the base reader when it fills
        # in the document-level global.Entity declaration -- confirm.
        self._global_entity = 'eid-etype-head-other'
        root.comment += '$GLOBAL.ENTITY\n'
        self._docname = docname

    def _coref_to_entity(self, value):
        """Convert a CoNLL-2012 coreference cell into an ``Entity`` string.

        ``value`` is a "|"-separated list of items such as "(12", "12)" or
        "(12)".  Entity ids are prefixed with the document name (hyphens
        removed) so that they differ across documents.  Opening brackets get
        the "--1" suffix (empty etype, head=1).  Items without any bracket or
        without an entity number are reported and skipped.

        Returns the concatenated brackets (possibly an empty string).
        """
        prefix = self._docname.replace("-", "")
        closing = []
        opening = []
        for mention in value.split("|"):
            opens = mention.startswith("(")
            closes = mention.endswith(")")
            entity_num = mention.replace("(", "").replace(")", "")
            if not entity_num or not (opens or closes):
                logging.warning("Ignoring malformed coreference item %r in %r", mention, value)
                continue
            eid = f"{prefix}_e{entity_num}"
            if opens:
                opening.append(f"({eid}--1" + (")" if closes else ""))
            else:
                closing.append(f"{eid})")
        # To avoid parentheses clashes, the closing brackets go first.
        # They are emitted in reverse order of appearance, the opening
        # (and single-word) brackets in their original order.
        closing.reverse()
        return ''.join(closing + opening)

    def parse_node_line(self, line, root, nodes):
        """Create a node from one tab-separated token line and append it to ``nodes``.

        Raises:
            RuntimeError: if ``self.strict`` is true and the number of columns
                differs from the number of configured attributes.
        """
        fields = line.split('\t')
        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad missing trailing columns; surplus columns are ignored below.
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for attribute_name, value in zip(self.node_attributes, fields):
            if attribute_name == 'docname':
                if value != self._docname:
                    logging.warning("Document name mismatch %s != %s", value, self._docname)

            # convert the zero-based index to one-based
            elif attribute_name == 'ord':
                node.ord = int(value) + 1

            elif attribute_name == 'coref':
                # "_" (LitBank) and "-" (standard CoNLL-2012) both mean "no annotation".
                if value and value not in ('_', '-'):
                    entity = self._coref_to_entity(value)
                    if entity:
                        node.misc['Entity'] = entity

            elif attribute_name == 'form' or (attribute_name != '_' and value != '_'):
                setattr(node, attribute_name, value)
        nodes.append(node)

    def read_tree_from_lines(self, lines):
        """Build one tree from the given lines.

        Empty lines are skipped, lines starting with "#" are handled by
        `parse_comment_line` and all other lines by `parse_node_line`.

        Returns the new root, or None if no token line was found.
        """
        root = Root()
        nodes = [root]
        for line in lines:
            if not line:
                continue
            if line.startswith('#'):
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        return root

0 commit comments

Comments
 (0)