Arc projectivize and well-formed filters for dependency parsing in pure Python (translated from SyntaxNet code)

Projectivize and well-formed filters, translated to pure Python from the SyntaxNet code (document_filters.cc).

‘sentence’ object should be similar to that in SyntaxNet, with HEAD and other properties coming from the CoNLL file. See: CoNLL File Parsing Module for definition of the ‘sentence’ structure.

Projectivize filter

Check whether the given sentence is projective or not.

Return value is whether the sentence was originally projective or not.
– If it wasn’t, return False
– If it was, return True

projectivize parameter: whether or not to fix sentence to be projective
(does not affect return value)

"""
Projectivize Filter
(document_filters.cc)

Check whether the given sentence is projective or not.

Return value is whether the sentence was originally projective or not.
  - If it wasn't, return False
  - If it was, return True

projectivize parameter: whether or not to fix sentence to be projective
(does not affect return value)
"""

import copy
import logging

logger = logging.getLogger('ProjectivizeFilter')


def checkProjectivity(sentence, projectivize=False):
    """Check whether ``sentence`` has a projective dependency tree.

    A dependency arc is non-projective when it crosses another arc that
    passes over it.  Translated from SyntaxNet's document_filters.cc.

    Args:
        sentence: sentence-like object with a ``tokens`` list whose items
            carry a ``HEAD`` index; -1 marks attachment to the root.
            (Assumed to match the CoNLL file parsing module's structure —
            confirm against the caller.)
        projectivize: when True, repair the sentence in place by repeatedly
            lifting the deepest non-projective arc until the tree is
            projective.  Does not affect the return value.

    Returns:
        bool: True if the sentence was originally projective, else False.
    """
    if projectivize:
        # Pristine copy kept only so the repair can be shown in debug logs.
        oldsentence = copy.deepcopy(sentence)

    wasProjective = True
    num_tokens = len(sentence.tokens)

    # Left and right boundaries for arcs.  The left and right ends of an arc
    # are bounded by the arcs that pass over it.  If an arc exceeds these
    # bounds it crosses an arc passing over it, making it non-projective.
    left = [-1] * num_tokens
    right = [num_tokens - 1] * num_tokens

    # Lift the deepest non-projective arc until the document is projective.
    while True:
        # Reset boundaries to the whole document for all tokens.
        for i in range(num_tokens):
            left[i] = -1
            right[i] = num_tokens - 1

        # Narrow the bounds of every token lying strictly under an arc.
        for i in range(num_tokens):
            head_index = sentence.tokens[i].HEAD
            l = min(i, head_index)
            r = max(i, head_index)
            for j in range(l + 1, r):
                if left[j] < l:
                    left[j] = l
                if right[j] > r:
                    right[j] = r

        # Search for the deepest non-projective arc.
        deepest_arc = -1
        max_depth = -1

        # The non-projective arcs are those that exceed their bounds.
        for i in range(num_tokens):
            head_index = sentence.tokens[i].HEAD

            if head_index == -1:
                # Root attachment spans the sentence; any crossing arc must
                # be deeper and will be found instead.
                continue

            l = min(i, head_index)
            r = max(i, head_index)

            left_bound = max(left[l], left[r])
            right_bound = min(right[l], right[r])

            if (l < left_bound) or (r > right_bound):
                # Found a non-projective arc: it escapes the bounds imposed
                # by the arcs passing over it.
                logger.debug('Found non-projective arc')
                wasProjective = False
                if not projectivize:
                    return wasProjective

                # Depth = number of HEAD links from token i up to the root;
                # the deepest arc is the best candidate for lifting.
                depth = 0
                j = i
                while j != -1:
                    depth += 1
                    j = sentence.tokens[j].HEAD

                if depth > max_depth:
                    deepest_arc = i
                    max_depth = depth

        # If there are no more non-projective arcs we are done.
        if deepest_arc == -1:
            # Guard so the potentially expensive toFileOutput() calls only
            # run when debug logging is enabled (the original built these
            # strings eagerly for every repaired sentence).
            if not wasProjective and logger.isEnabledFor(logging.DEBUG):
                logger.debug('Projectivized non-projective arc')
                logger.debug('Before\n%s', oldsentence.toFileOutput())
                logger.debug('After\n%s', sentence.toFileOutput())
            return wasProjective

        # Lift the non-projective arc: re-attach the dependent to its
        # grandparent.  (The loop above always returns eventually, so no
        # unreachable trailing assert is needed.)
        lifted_head = sentence.tokens[sentence.tokens[deepest_arc].HEAD].HEAD
        sentence.tokens[deepest_arc].HEAD = lifted_head

 

Well-formed filter

Check that parse tree input is single-root, connected, acyclic, and projective.

'''
Well-Formed Filter
(document_filters.cc)

Check that input is single-root, connected, acyclic, and projective.
'''
import logging
logger = logging.getLogger('WellFormedFilter')
from projectivize_filter import checkProjectivity

'''
Determine whether all HEADs are within the bounds of the sentence
'''
def allHeadsExist(sentence):
    """Return True when every token's HEAD is a valid index."""
    # Valid HEAD values range from -1 (the artificial root token) through
    # the index of the final token.
    last = len(sentence.tokens) - 1
    return all(-1 <= tok.HEAD <= last for tok in sentence.tokens)

'''
Determine whether the sentence is single rooted
'''
def isSingleRooted(sentence):
    """Return True when exactly one token attaches to the root (-1)."""
    root_count = sum(1 for tok in sentence.tokens if tok.HEAD == -1)
    return root_count == 1

'''
Determine whether or not the sentence has a cycle (in HEADs)
'''
def hasCycle(sentence):
    """Return True when following HEAD links ever forms a loop."""
    # visited[t] records which starting token's walk first reached t, so
    # each HEAD chain is traversed at most once overall.
    total = len(sentence.tokens)
    visited = [-1] * total

    for start in range(total):
        if visited[start] != -1:
            # Already examined by an earlier walk.
            continue

        node = start
        while node != -1:
            if visited[node] == -1:
                # First visit during this walk: stamp it with the walk id.
                visited[node] = start
            elif visited[node] < start:
                # Reached territory finished by a previous walk, which was
                # cycle-free along this chain.
                break
            else:
                # Revisited a node stamped during the current walk: loop.
                return True
            node = sentence.tokens[node].HEAD

    return False

class WellFormedFilter(object):
    """Checks that parse trees are single-root, connected, acyclic, and
    projective, while counting rejected and repaired sentences."""

    def __init__(self):
        self.nonProjectiveCount = 0   # sentences seen that were non-projective
        self.projectivizedCount = 0   # non-projective sentences repaired
        self.nonWellFormedCount = 0   # sentences rejected as not well-formed

    def isWellFormed(self, sentence, projectivize=False):
        """Determine whether the sentence can be parsed by arc-standard
        and arc-eager transition systems.

        projectivize: whether to make non-projective sentences projective.

        Returns True when the sentence is (or was made) well-formed.
        """
        def _reject(reason):
            # Shared rejection path.  Guard the debug output so the
            # potentially expensive toSimpleRepresentation() call only runs
            # when debug logging is actually enabled (the original built
            # these strings eagerly on every rejection).
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug('Not well-formed: %s', reason)
                logger.debug('"%s"', sentence.toSimpleRepresentation())
            self.nonWellFormedCount += 1
            return False

        if len(sentence.tokens) == 0:
            return _reject('token length is zero')

        if not allHeadsExist(sentence):
            return _reject('not all HEADs exist as tokens')

        if not isSingleRooted(sentence):
            return _reject('tree doesn\'t have single ROOT')

        if hasCycle(sentence):
            return _reject('tree has a cycle')

        if not checkProjectivity(sentence, projectivize=projectivize):
            self.nonProjectiveCount += 1

            if not projectivize:
                # Non-projective and we didn't repair it: invalid.  Only
                # counted as non-well-formed when projectivize is off.
                return _reject('non-projective and projectivize disabled')

            # We successfully projectivized a non-projective sentence;
            # consider it well-formed and keep going.
            self.projectivizedCount += 1

        return True

 

Leave a Reply

Your email address will not be published. Required fields are marked *