'''
conll_utils: a set of classes to handle input and output of CoNLL-U files
(http://universaldependencies.org/docs/format.html), with optional special
handling of Korean jamo characters. For use with POS tagging and
dependency parsing.

Depends on the well-formed/projectivize filters in well_formed_filter.

The Parsed* classes are useful to store extra properties needed during
the parsing process that are external to the CoNLL instances themselves.
'''
import logging
import well_formed_filter
def isNonCompatibilityJamo(c):
    assert len(c) == 1
    # HANGUL JAMO block: U+1100-U+11FF
    return 0x1100 <= ord(c) <= 0x11FF
def normalizeToCompatJamo(s):
    # imported lazily so the jamo dependency is only needed when
    # Korean normalization is actually requested
    from jamo import j2hcj
    out = ''
    for c in s:
        if isNonCompatibilityJamo(c):
            out += j2hcj(c)
        else:
            out += c
    # the conversion is one-to-one, so the length must be preserved
    assert len(s) == len(out)
    return out
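# Example (assuming the jamo package's j2hcj mapping): the decomposed
# string '\u1112\u1161\u11ab' ("han" as HANGUL JAMO) normalizes to the
# HANGUL COMPATIBILITY JAMO sequence 'ㅎㅏㄴ'; precomposed syllables and
# non-Korean characters pass through unchanged.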
def encodeNoneAsUnderscore(s):
    if s is None:
        return '_'
    else:
        return s
def encodeNoneAsUnderscore_Int(i):
    if i is None:
        return '_'
    else:
        return str(i)
'''
Represents a CoNLL token and all its properties (except index)
'''
class ConllToken(object):
def __init__(self):
self.FORM = None
self.LEMMA = None
self.UPOSTAG = None
self.XPOSTAG = None
self.FEATS = []
        '''
        HEAD is stored with one subtracted from the value in the file:
        root (HEAD=0 in the file) becomes -1, and any other HEAD n+1
        becomes n, the 0-based index of the head token in the parent
        ConllSentence. Our parser also requires root to be -1.
        '''
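        # e.g. file HEAD=3 -> stored HEAD=2 (tokens[2] is the head);
        # file HEAD=0 (root) -> stored HEAD=-1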
self.HEAD = None
self.DEPREL = None
self.DEPS = None
self.MISC = None
# morpheme extraction extension
self.morphemes = []
def __str__(self):
return self.toFileOutput('_')
def __repr__(self):
return self.__str__()
    def parseMorphemesFromLemma(self):
        self.morphemes = []
        if self.LEMMA:
            # LEMMA format: '웅가로/NNP + 가/JKS'; rsplit protects against
            # '/' appearing inside the surface form itself
            for elem in self.LEMMA.split(' + '):
                word, pos = elem.rsplit('/', 1)
                self.morphemes.append((word, pos))
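    # Example: LEMMA = '웅가로/NNP + 가/JKS' yields
    # self.morphemes = [('웅가로', 'NNP'), ('가', 'JKS')]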
def toFileOutput(self, ID):
def checkTab(s):
assert '\t' not in s, 'field must not contain a tab: ' + s
return s
def checkPipe(s):
assert '|' not in s, 'field must not contain a pipe: ' + s
return s
        assert self.FORM is not None
        assert type(self.FEATS) is list
        cols = [str(ID),
                checkTab(self.FORM),
                checkTab(encodeNoneAsUnderscore(self.LEMMA)),
                checkTab(encodeNoneAsUnderscore(self.UPOSTAG)),
                checkTab(encodeNoneAsUnderscore(self.XPOSTAG)),
                # CoNLL-U writes an empty FEATS list as '_'
                '|'.join(checkPipe(checkTab(f)) for f in self.FEATS)
                    if self.FEATS else '_',
                # +1 when writing as file; keep '_' if HEAD was never set
                encodeNoneAsUnderscore_Int(
                    self.HEAD + 1 if self.HEAD is not None else None),
                checkTab(encodeNoneAsUnderscore(self.DEPREL)),
                checkTab(encodeNoneAsUnderscore(self.DEPS)), # TODO
                checkTab(encodeNoneAsUnderscore(self.MISC))]
return '\t'.join(cols)
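    # Example: a token with FORM='Hello', UPOSTAG='INTJ', HEAD=-1 (root)
    # and DEPREL='root' serializes with ID=1 as:
    #   '1\tHello\t_\tINTJ\t_\t_\t0\troot\t_\t_'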
'''
Represents a ConllToken plus the head and label predicted by the parser
'''
class ParsedConllToken(ConllToken):
def __init__(self):
super().__init__()
self.parsedLabel = None
self.parsedHead = None
self.HEAD = -1 # match default value in sentence.proto
def setParsedLabel(self, label):
self.parsedLabel = label
def setParsedHead(self, head):
self.parsedHead = head
def clearParsedHead(self):
self.parsedHead = -1 # match ParserState: always use -1 as <ROOT>
'''
Stores an ordered list of CoNLL tokens
'''
class ConllSentence(object):
def __init__(self):
self.tokens = []
'''
Convert to file output representation
'''
def toFileOutput(self):
return '\n'.join(self.tokens[ID-1].toFileOutput(ID) \
for ID in range(1, len(self.tokens)+1))
    # note: despite the name, this emits SyntaxNet's protobuf text
    # format (a token {...} block), not strict JSON
    def genSyntaxNetJson(self, token, break_level=None, start_index=0):
break_contents = ''
if break_level:
break_contents = \
'''
break_level : %s''' % break_level
return \
'''token: {
word : "%s"
start : %d
end : %d
head : %d
tag : "%s"
category: "%s"
label : "%s"%s
}''' % (token.FORM, start_index, start_index+len(token.FORM)-1, token.HEAD,
token.XPOSTAG, token.UPOSTAG, token.DEPREL, break_contents)
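    # Example (FORM='Hello', XPOSTAG='UH', UPOSTAG='INTJ', DEPREL='root',
    # HEAD=-1, start_index=0) yields a block like:
    #   token: {
    #   word : "Hello"
    #   start : 0
    #   end : 4
    #   head : -1
    #   tag : "UH"
    #   category: "INTJ"
    #   label : "root"
    #   }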
def genSyntaxNetTextHeader(self):
return 'text : "%s"' % (' '.join(t.FORM for t in self.tokens))
'''
Convert to SyntaxNet JSON format
'''
def toSyntaxNetJson(self):
out = []
start_index = 0
out.append(self.genSyntaxNetTextHeader())
for i in range(len(self.tokens)):
if i == 0:
out.append(self.genSyntaxNetJson(self.tokens[i],
break_level='SENTENCE_BREAK',
start_index=start_index))
else:
out.append(self.genSyntaxNetJson(self.tokens[i],
start_index=start_index))
start_index += len(self.tokens[i].FORM) + 1 # assume space
return '\n'.join(out)
    '''
    Output the tokens' forms separated by spaces
    '''
def toSimpleRepresentation(self):
return ' '.join(t.FORM for t in self.tokens)
class ParsedConllSentence(ConllSentence):
def __init__(self, docid):
super().__init__()
self.docid_ = docid
def docid(self):
return self.docid_
## checked accessor
def mutableToken(self, i):
assert i >= 0
assert i < len(self.tokens)
return self.tokens[i]
def tokenSize(self):
return len(self.tokens)
'''
Stores an ordered list of sentences within a CoNLL file
checkParserConformity:
    Whether to check each sentence for well-formedness and projectivity
keepMalformed:
    Whether to retain non-projective and invalid examples
projectivize:
    Whether to retain non-projective examples by projectivizing them
logStats:
    Log statistics about the corpus
enableLemmaMorphemes:
    Enable parsing of morphemes in the LEMMA column, format: 웅가로/NNP + 가/JKS
    6 웅가로가 웅가로/NNP + 가/JKS _ _ _ 11 NP_SBJ _ _
compatibleJamo:
    Normalize HANGUL JAMO in FORM and LEMMA to HANGUL COMPATIBILITY JAMO
'''
class ConllFile(object):
def __init__(self, parsed=False,
checkParserConformity=False,
keepMalformed=False, projectivize=False,
logStats=False, enableLemmaMorphemes=False,
compatibleJamo=False):
self.sentences = []
# use parsed variant of structures
self.parsed = parsed
self.logger = logging.getLogger('ConllUtils')
self.keepMalformed = keepMalformed
self.projectivize = projectivize
self.logStats = logStats
self.enableLemmaMorphemes = enableLemmaMorphemes
self.compatibleJamo = compatibleJamo
self.checkParserConformity = checkParserConformity
'''
Read CoNLL-U from the given string
excludeCols: CoNLL column indices to exclude from reading
sometimes we just want to get rid of certain
attributes of a token
1-based index
'''
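    # e.g. read(data, excludeCols=[7, 8]) loads every sentence but drops
    # the gold HEAD and DEPREL columns from each token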
    def read(self, s, excludeCols=()):
assert 1 not in excludeCols, 'cannot exclude reading of ID'
assert 2 not in excludeCols, 'cannot exclude reading of FORM'
if self.checkParserConformity:
well_formed_inst = well_formed_filter.WellFormedFilter()
else:
assert self.keepMalformed, 'in order to discard malformed ' \
'sentences, you must enable parser ' \
'conformity checking'
# arbitrary ID that can be used with parser
if self.parsed:
docid = 0
ln_num = 0
current_sentence = None
# if we encounter an error during processing a sentence
invalid_sentence = False
        def commit(sent):
            # if we're even getting rid of malformed sentences in the first
            # place...
            if self.checkParserConformity and not self.keepMalformed:
                if not well_formed_inst.isWellFormed(sent,
                        projectivize=self.projectivize):
                    # if the sentence is non-projective and projectivize
                    # is enabled, the sentence will be fixed and not discarded
                    self.logger.debug('line %d: discarding malformed or non' \
                                      '-projective sentence: "%s"' % \
                                      (ln_num, sent.toSimpleRepresentation()))
                    # as long as we discard the sentence here,
                    # discarded sentences' words, tags, and labels
                    # won't be added to the lexicon, which is exactly the
                    # behavior we want.
                    return
            self.sentences.append(sent)
def processUnderscore(s):
if s == '_':
return None
else:
return s
# token index (to check that it's in order)
current_ID = 0
lines = s.split('\n')
for ln in lines:
ln_num += 1
ln = ln.strip()
if ln.startswith(';'):
# ignore comments
continue
if not ln:
# a completely blank line indicates we need to commit the
# current sentence
                if current_sentence is not None:
if not invalid_sentence:
commit(current_sentence)
current_sentence = None
current_ID = 0
invalid_sentence = False
continue
if ln[0] == '#': # ignore comments completely
continue
if invalid_sentence: # don't process invalid sentences
continue
cols = [x.strip() for x in ln.split('\t')]
assert len(cols) >= 2, \
'line %d: must have at least ID and FORM: ' % ln_num + str(cols)
            if '-' in cols[0] or '.' in cols[0]:
                # multiword token ranges (e.g. 1-2) and empty nodes
                # (e.g. 1.1) are not implemented
                self.logger.warning('line %d: not implemented: ID=%s, ' \
                                    'invalidating sentence' % (ln_num, cols[0]))
invalid_sentence = True
continue
else:
ID = int(cols[0])
assert ID==current_ID+1, 'line %d: token IDs must be in order' \
' and increment by one' % ln_num
current_ID = ID
if current_ID == 1:
if self.parsed:
current_sentence = ParsedConllSentence(docid)
docid += 1
else:
current_sentence = ConllSentence()
if self.parsed:
current_token = ParsedConllToken()
else:
current_token = ConllToken()
            # for SyntaxNet, digit normalization ONLY happens in the
            # lexicon builder, yet numbers end up as <UNKNOWN> during
            # training. interesting...
# let this be underscore if needed (don't call processUnderscore())
if self.compatibleJamo:
cols[1] = normalizeToCompatJamo(cols[1])
current_token.FORM = cols[1]
if len(cols) > 2 and (3 not in excludeCols):
# let this be underscore if needed
# (don't call processUnderscore())
if self.compatibleJamo:
cols[2] = normalizeToCompatJamo(cols[2])
current_token.LEMMA = cols[2]
if self.enableLemmaMorphemes:
                    try:
                        current_token.parseMorphemesFromLemma()
                    except ValueError:
                        # rsplit fails to unpack when a morpheme
                        # lacks its '/POS' suffix
                        self.logger.warning('line %d: invalid morpheme '
                                            'sequence: %s, invalidating '
                                            'sentence' % (ln_num,
                                            current_token.LEMMA))
                        invalid_sentence = True
                        continue
if len(cols) > 3 and (4 not in excludeCols):
current_token.UPOSTAG = processUnderscore(cols[3])
if len(cols) > 4 and (5 not in excludeCols):
current_token.XPOSTAG = processUnderscore(cols[4])
if len(cols) > 5 and (6 not in excludeCols):
if processUnderscore(cols[5]):
current_token.FEATS = \
[x.strip() for x in cols[5].split('|')]
else:
current_token.FEATS = []
if len(cols) > 6 and (7 not in excludeCols):
current_token.HEAD = processUnderscore(cols[6])
                if current_token.HEAD is not None:
if '-' in current_token.HEAD or '.' in current_token.HEAD:
self.logger.warning('line %d: not implemented: HEAD=%s,'
' invalidating sentence' % (ln_num, \
current_token.HEAD))
invalid_sentence = True
continue
else:
# it's important for parsing that HEAD start at -1
current_token.HEAD = int(current_token.HEAD)-1
if len(cols) > 7 and (8 not in excludeCols):
current_token.DEPREL = processUnderscore(cols[7])
if len(cols) > 8 and (9 not in excludeCols):
# TODO
current_token.DEPS = processUnderscore(cols[8])
if len(cols) > 9 and (10 not in excludeCols):
current_token.MISC = processUnderscore(cols[9])
current_sentence.tokens.append(current_token)
# an EOF indicates we need to commit the current sentence
        if current_sentence is not None:
if not invalid_sentence:
commit(current_sentence)
current_sentence = None
current_ID = 0
invalid_sentence = False
        if self.logStats:
            # guard against division by zero on an empty corpus
            if self.checkParserConformity and self.sentences:
                self.logger.info('Projectivized %d/%d non-projective sentences' \
                                 ' (%.2f%% of set)' % \
                                 (well_formed_inst.projectivizedCount, \
                                  well_formed_inst.nonProjectiveCount,
                                  100.0 * float(well_formed_inst.projectivizedCount) \
                                      / float(len(self.sentences))
                                 ))
# if we're even getting rid of malformed sentences in the first
# place...
if self.checkParserConformity:
if not self.keepMalformed:
if self.projectivize:
# the definition of this variable changes when projectivize
# is on
self.logger.info('Discarded %d non-well-formed sentences'
% (well_formed_inst.nonWellFormedCount))
                    else:
                        self.logger.info('Discarded %d non-well-formed and ' \
                                         'non-projective sentences' % \
                                         (well_formed_inst.nonWellFormedCount))
self.logger.info('%d valid sentences processed in total' % \
len(self.sentences))
'''
Write the current CoNLL-U data to the specified file descriptor
'''
def write(self, fd):
data = [s.toFileOutput() for s in self.sentences]
fd.write('\n\n'.join(data))
fd.flush()
    def __iter__(self):
        return iter(self.sentences)
class ParsedConllFile(ConllFile):
def __init__(self, checkParserConformity=False, keepMalformed=False,
projectivize=False, logStats=False,
enableLemmaMorphemes=False, compatibleJamo=False):
super().__init__(parsed=True,
checkParserConformity=checkParserConformity,
keepMalformed=keepMalformed,
projectivize=projectivize, logStats=logStats,
enableLemmaMorphemes=enableLemmaMorphemes,
compatibleJamo=compatibleJamo)
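
# Minimal usage sketch (illustrative only): keepMalformed=True is
# required when checkParserConformity is off, because read() asserts
# that combination.
if __name__ == '__main__':
    import sys
    sample = ('1\tHello\t_\tINTJ\t_\t_\t0\troot\t_\t_\n'
              '2\tworld\t_\tNOUN\t_\t_\t1\tdep\t_\t_\n')
    conll = ConllFile(keepMalformed=True)
    conll.read(sample)
    for sentence in conll:
        print(sentence.toSimpleRepresentation())  # -> Hello world
    conll.write(sys.stdout)  # round-trips back to CoNLL-U format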