|
"""Conll2012 is a reader block for coreference annotation in the CoNLL-2012 format.

So far, this implementation has been tested on the LitBank files only.
LitBank does not use most of the columns, so the implementation
should be improved to handle other types of CoNLL-2012 files.
"""
| 7 | +import json |
| 8 | +import logging |
| 9 | +import re |
| 10 | + |
| 11 | +import udapi.block.read.conllu |
| 12 | +from udapi.core.root import Root |
| 13 | +from udapi.core.node import Node |
| 14 | + |
# Matches a "#begin document (NAME)" header line and captures NAME in group 1.
# The stricter variant below also required (and captured) a trailing
# "; part N" suffix; it is kept only for reference.
#RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\); part (\d+)')
RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\)')
| 17 | + |
class Conll2012(udapi.block.read.conllu.Conllu):
    """A reader of the Conll2012 files.

    Each input line is split on tabs and mapped onto node attributes according
    to the ``attributes`` column list. The coreference column is converted into
    the ``Entity`` attribute stored in ``node.misc``.
    """

    def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs):
        """Create the Conll2012 reader object.

        Args:
        attributes: comma-separated list of column names in the input files
            (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref')
            For ignoring a column, use "_" as its name.
            Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based)
            word-order number/index (usually called ID).
        **kwargs: passed unchanged to the parent reader's constructor.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        # Fallback document name used until a "#begin document (...)" line is seen.
        self._docname = 'd'

    def parse_comment_line(self, line, root):
        """Process a line starting with "#".

        "#end document" lines are ignored. A "#begin document (NAME)" line sets
        ``root.newdoc`` to NAME, remembers NAME for later docname checks and
        entity-id prefixes, and appends the ``$GLOBAL.ENTITY`` marker to
        ``root.comment``. Any other comment line is reported with a warning.
        """
        if line.startswith("#end document"):
            return
        match = RE_BEGIN.match(line)
        if not match:
            logging.warning(f"Unexpected comment line: {line}")
            return
        docname = match.group(1)
        root.newdoc = docname
        self._global_entity = 'eid-etype-head-other'
        root.comment += '$GLOBAL.ENTITY\n'
        self._docname = docname

    def _coref_to_entity(self, value):
        """Convert one CoNLL-2012 coref cell (e.g. "(3|5)") into an Entity string."""
        # Hyphens are stripped from the document name before it is used as
        # the prefix of every entity id.
        prefix = self._docname.replace("-", "") + "_e"
        converted = []
        for entity in value.split("|"):
            entity_num = entity.replace("(", "").replace(")", "")
            opens = entity.startswith("(")
            closes = entity.endswith(")")
            if opens and closes:
                mention = f"({prefix}{entity_num}--1)"
            elif opens:
                mention = f"({prefix}{entity_num}--1"
            elif closes:
                mention = f"{prefix}{entity_num})"
            else:
                mention = f"{prefix}{entity_num}--1"

            # to avoid parentheses clashes, put the entities with ")" first
            if opens:
                converted.append(mention)
            else:
                converted.insert(0, mention)
        return ''.join(converted)

    def parse_node_line(self, line, root, nodes):
        """Create a new child of ``root`` from one tab-separated token line.

        The new node is appended to ``nodes``.

        Raises:
            RuntimeError: if ``self.strict`` is true and the number of columns
                differs from the number of configured attributes.
            ValueError: if the "ord" column is not an integer (except for the
                "_" placeholder in non-strict mode, which is skipped).
        """
        fields = line.split('\t')
        expected = len(self.node_attributes)
        if len(fields) != expected:
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad short lines with the empty-value placeholder;
            # surplus columns are simply not read below.
            fields.extend(['_'] * (expected - len(fields)))

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for attribute_name, value in zip(self.node_attributes, fields):
            if attribute_name == 'docname':
                if value != self._docname:
                    logging.warning(f"Document name mismatch {value} != {self._docname}")

            # convert the zero-based index to one-based
            elif attribute_name == 'ord':
                if value == '_' and not self.strict:
                    # A missing (padded) ord must not crash non-strict reading;
                    # the node keeps whatever ord create_child() gave it.
                    logging.warning(f"Missing ord in {line!r}")
                else:
                    node.ord = int(value) + 1

            elif attribute_name == 'coref':
                # "_" (LitBank) and "-" (standard CoNLL-2012) both mean
                # that no mention starts or ends at this token.
                if value and value not in ('_', '-'):
                    node.misc['Entity'] = self._coref_to_entity(value)

            elif attribute_name == 'form' or (attribute_name != '_' and value != '_'):
                setattr(node, attribute_name, value)
        nodes.append(node)

    def read_tree_from_lines(self, lines):
        """Build one tree from ``lines`` and return its root.

        Empty lines are skipped, lines starting with "#" are handled by
        ``parse_comment_line`` and all other lines by ``parse_node_line``.

        Returns:
            The new root, or None if no node line was present.
        """
        root = Root()
        nodes = [root]
        for line in lines:
            if line == '':
                continue
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        return root
0 commit comments