Projectivize and well-formed filters translated to pure Python (translated from SyntaxNet code: document_filters.cc)
‘sentence’ object should be similar to that in SyntaxNet, with HEAD and other properties coming from the CoNLL file. See: CoNLL File Parsing Module for definition of the ‘sentence’ structure.
Projectivize filter
Check whether the given sentence is projective or not.
Return value is whether the sentence was originally projective or not.
– If it wasn’t, return False
– If it was, return True
projectivize parameter: whether or not to fix sentence to be projective
(does not affect return value)
"""
Projectivize Filter
(document_filters.cc)
Check whether the given sentence is projective or not.
Return value is whether the sentence was originally projective or not.
- If it wasn't, return False
- If it was, return True
projectivize parameter: whether or not to fix sentence to be projective
(does not affect return value)
"""
import copy
import logging
logger = logging.getLogger('ProjectivizeFilter')
def checkProjectivity(sentence, projectivize=False):
if projectivize:
oldsentence = copy.deepcopy(sentence)
wasProjective = True
num_tokens = len(sentence.tokens)
# Left and right boundaries for arcs. The left and right ends of an arc are
# bounded by the arcs that pass over it. If an arc exceeds these bounds it
# will cross an arc passing over it, making it a non-projective arc.
left = [None for i in range(num_tokens)]
right = [None for i in range(num_tokens)]
# Lift the shortest non-projective arc until the document is projective.
while True:
# Initialize boundaries to the whole document for all arcs.
for i in range(num_tokens):
left[i] = -1
right[i] = num_tokens - 1
# Find left and right bounds for each token.
for i in range(num_tokens):
head_index = sentence.tokens[i].HEAD
# Find left and right end of arc
l = min(i, head_index)
r = max(i, head_index)
# Bound all tokens under the arc.
for j in range(l+1, r):
if left[j] < l:
left[j] = l
if right[j] > r:
right[j] = r
# Find deepest non-projective arc.
deepest_arc = -1
max_depth = -1
# The non-projective arcs are those that exceed their bounds.
for i in range(num_tokens):
head_index = sentence.tokens[i].HEAD
if head_index == -1:
# any crossing arc must be deeper
continue
l = min(i, head_index)
r = max(i, head_index)
left_bound = max(left[l], left[r])
right_bound = min(right[l], right[r])
if (l < left_bound) or (r > right_bound):
# Found non-projective arc.
logger.debug('Found non-projective arc')
wasProjective = False
if not projectivize:
return wasProjective
# Pick the deepest as the best candidate for lifting.
depth = 0
j = i
while j != -1:
depth += 1
j = sentence.tokens[j].HEAD
if depth > max_depth:
deepest_arc = i
max_depth = depth
# If there are no more non-projective arcs we are done.
if deepest_arc == -1:
if not wasProjective:
logger.debug('Projectivized non-projective arc')
logger.debug('Before\n' + oldsentence.toFileOutput())
logger.debug('After\n' + sentence.toFileOutput())
return wasProjective
# Lift non-projective arc.
lifted_head = sentence.tokens[sentence.tokens[deepest_arc].HEAD].HEAD
sentence.tokens[deepest_arc].HEAD = lifted_head
assert None
Well-formed filter
Check that parse tree input is single-root, connected, acyclic, and projective.
'''
Well-Formed Filter
(document_filters.cc)
Check that input is single-root, connected, acyclic, and projective.
'''
import logging
logger = logging.getLogger('WellFormedFilter')
from projectivize_filter import checkProjectivity
'''
Determine whether all HEADs are within the bounds of the sentence
'''
def allHeadsExist(sentence):
minIndex = -1 # root token
maxIndex = len(sentence.tokens)-1
for t in sentence.tokens:
if t.HEAD < minIndex or t.HEAD > maxIndex:
return False
return True
'''
Determine whether the sentence is single rooted
'''
def isSingleRooted(sentence):
allHeads = []
for t in sentence.tokens:
if t.HEAD == -1:
allHeads.append(t)
return len(allHeads) == 1
'''
Determine whether or not the sentence has a cycle (in HEADs)
'''
def hasCycle(sentence):
visited = [-1 for t in sentence.tokens]
for i in range(len(sentence.tokens)):
# Already visited node
if visited[i] != -1:
continue
t = i
while t != -1:
if visited[t] == -1:
# If it is not visited yet, mark it.
visited[t] = i
elif visited[t] < i:
# If the index number is smaller than index and not -1, the
# token has already been visited.
break
else:
# Loop detected
return True
t = sentence.tokens[t].HEAD
return False
class WellFormedFilter(object):
def __init__(self):
self.nonProjectiveCount = 0
self.projectivizedCount = 0
self.nonWellFormedCount = 0
'''
Determine whether the sentence can be parsed by arc-standard and arc-eager
or not
projectivize: whether to make non-projective sentences projective
'''
def isWellFormed(self, sentence, projectivize=False):
if len(sentence.tokens) == 0:
logger.debug('Not well-formed: token length is zero')
logger.debug('"'+sentence.toSimpleRepresentation()+'"')
self.nonWellFormedCount += 1
return False
if not allHeadsExist(sentence):
logger.debug('Not well-formed: not all HEADs exist as tokens')
logger.debug('"'+sentence.toSimpleRepresentation()+'"')
self.nonWellFormedCount += 1
return False
if not isSingleRooted(sentence):
logger.debug('Not well-formed: tree doesn\'t have single ROOT')
logger.debug('"'+sentence.toSimpleRepresentation()+'"')
self.nonWellFormedCount += 1
return False
if hasCycle(sentence):
logger.debug('Not well-formed: tree has a cycle')
logger.debug('"'+sentence.toSimpleRepresentation()+'"')
self.nonWellFormedCount += 1
return False
if not checkProjectivity(sentence, projectivize=projectivize):
self.nonProjectiveCount += 1
# if it wasn't projective
if not projectivize:
# ... and we didn't projectivize it... then it's invalid
logger.debug('Not well-formed: non-projective and' \
' projectivize disabled')
logger.debug('"'+sentence.toSimpleRepresentation()+'"')
# only count them as non-well-formed when projectivize is off
self.nonWellFormedCount += 1
return False
else:
# we succesfully projectivized a non-projective sentence
# consider well-formed
self.projectivizedCount += 1
# if we did projectivize it, then we can keep going
return True