Projectivize and well-formed filters translated to pure Python (translated from SyntaxNet code: document_filters.cc)
‘sentence’ object should be similar to that in SyntaxNet, with HEAD and other properties coming from the CoNLL file. See: CoNLL File Parsing Module for definition of the ‘sentence’ structure.
Projectivize filter
Check whether the given sentence is projective or not.
Return value is whether the sentence was originally projective or not.
– If it wasn’t, return False
– If it was, return True
projectivize parameter: whether or not to fix sentence to be projective
(does not affect return value)
""" Projectivize Filter (document_filters.cc) Check whether the given sentence is projective or not. Return value is whether the sentence was originally projective or not. - If it wasn't, return False - If it was, return True projectivize parameter: whether or not to fix sentence to be projective (does not affect return value) """ import copy import logging logger = logging.getLogger('ProjectivizeFilter') def checkProjectivity(sentence, projectivize=False): if projectivize: oldsentence = copy.deepcopy(sentence) wasProjective = True num_tokens = len(sentence.tokens) # Left and right boundaries for arcs. The left and right ends of an arc are # bounded by the arcs that pass over it. If an arc exceeds these bounds it # will cross an arc passing over it, making it a non-projective arc. left = [None for i in range(num_tokens)] right = [None for i in range(num_tokens)] # Lift the shortest non-projective arc until the document is projective. while True: # Initialize boundaries to the whole document for all arcs. for i in range(num_tokens): left[i] = -1 right[i] = num_tokens - 1 # Find left and right bounds for each token. for i in range(num_tokens): head_index = sentence.tokens[i].HEAD # Find left and right end of arc l = min(i, head_index) r = max(i, head_index) # Bound all tokens under the arc. for j in range(l+1, r): if left[j] < l: left[j] = l if right[j] > r: right[j] = r # Find deepest non-projective arc. deepest_arc = -1 max_depth = -1 # The non-projective arcs are those that exceed their bounds. for i in range(num_tokens): head_index = sentence.tokens[i].HEAD if head_index == -1: # any crossing arc must be deeper continue l = min(i, head_index) r = max(i, head_index) left_bound = max(left[l], left[r]) right_bound = min(right[l], right[r]) if (l < left_bound) or (r > right_bound): # Found non-projective arc. logger.debug('Found non-projective arc') wasProjective = False if not projectivize: return wasProjective # Pick the deepest as the best candidate for lifting. depth = 0 j = i while j != -1: depth += 1 j = sentence.tokens[j].HEAD if depth > max_depth: deepest_arc = i max_depth = depth # If there are no more non-projective arcs we are done. if deepest_arc == -1: if not wasProjective: logger.debug('Projectivized non-projective arc') logger.debug('Before\n' + oldsentence.toFileOutput()) logger.debug('After\n' + sentence.toFileOutput()) return wasProjective # Lift non-projective arc. lifted_head = sentence.tokens[sentence.tokens[deepest_arc].HEAD].HEAD sentence.tokens[deepest_arc].HEAD = lifted_head assert None
Well-formed filter
Check that parse tree input is single-root, connected, acyclic, and projective.
''' Well-Formed Filter (document_filters.cc) Check that input is single-root, connected, acyclic, and projective. ''' import logging logger = logging.getLogger('WellFormedFilter') from projectivize_filter import checkProjectivity ''' Determine whether all HEADs are within the bounds of the sentence ''' def allHeadsExist(sentence): minIndex = -1 # root token maxIndex = len(sentence.tokens)-1 for t in sentence.tokens: if t.HEAD < minIndex or t.HEAD > maxIndex: return False return True ''' Determine whether the sentence is single rooted ''' def isSingleRooted(sentence): allHeads = [] for t in sentence.tokens: if t.HEAD == -1: allHeads.append(t) return len(allHeads) == 1 ''' Determine whether or not the sentence has a cycle (in HEADs) ''' def hasCycle(sentence): visited = [-1 for t in sentence.tokens] for i in range(len(sentence.tokens)): # Already visited node if visited[i] != -1: continue t = i while t != -1: if visited[t] == -1: # If it is not visited yet, mark it. visited[t] = i elif visited[t] < i: # If the index number is smaller than index and not -1, the # token has already been visited. break else: # Loop detected return True t = sentence.tokens[t].HEAD return False class WellFormedFilter(object): def __init__(self): self.nonProjectiveCount = 0 self.projectivizedCount = 0 self.nonWellFormedCount = 0 ''' Determine whether the sentence can be parsed by arc-standard and arc-eager or not projectivize: whether to make non-projective sentences projective ''' def isWellFormed(self, sentence, projectivize=False): if len(sentence.tokens) == 0: logger.debug('Not well-formed: token length is zero') logger.debug('"'+sentence.toSimpleRepresentation()+'"') self.nonWellFormedCount += 1 return False if not allHeadsExist(sentence): logger.debug('Not well-formed: not all HEADs exist as tokens') logger.debug('"'+sentence.toSimpleRepresentation()+'"') self.nonWellFormedCount += 1 return False if not isSingleRooted(sentence): logger.debug('Not well-formed: tree doesn\'t have single ROOT') logger.debug('"'+sentence.toSimpleRepresentation()+'"') self.nonWellFormedCount += 1 return False if hasCycle(sentence): logger.debug('Not well-formed: tree has a cycle') logger.debug('"'+sentence.toSimpleRepresentation()+'"') self.nonWellFormedCount += 1 return False if not checkProjectivity(sentence, projectivize=projectivize): self.nonProjectiveCount += 1 # if it wasn't projective if not projectivize: # ... and we didn't projectivize it... then it's invalid logger.debug('Not well-formed: non-projective and' \ ' projectivize disabled') logger.debug('"'+sentence.toSimpleRepresentation()+'"') # only count them as non-well-formed when projectivize is off self.nonWellFormedCount += 1 return False else: # we succesfully projectivized a non-projective sentence # consider well-formed self.projectivizedCount += 1 # if we did projectivize it, then we can keep going return True