Module for parsing and writing CoNLL-U files, with optional special handling of Korean jamo characters. For use with POS tagging and dependency parsing.
Depends on: Well-formed/projectivize filters
conll_utils
'''
A set of classes to handle input and output of CoNLL-U files
(http://universaldependencies.org/docs/format.html).

The Parsed* classes are useful to store extra properties needed during
the parsing process that are external to the CoNLL instances themselves.
'''

import logging

import well_formed_filter


def isNonCompatibilityJamo(c):
    assert len(c) == 1
    # HANGUL JAMO block: U+1100-U+11FF
    return 0x1100 <= ord(c) <= 0x11FF


def normalizeToCompatJamo(s):
    # deferred import: the jamo package is only required when
    # compatibleJamo is enabled
    from jamo import j2hcj

    out = ''
    for c in s:
        if isNonCompatibilityJamo(c):
            out += j2hcj(c)
        else:
            out += c
    assert len(s) == len(out)
    return out


def encodeNoneAsUnderscore(s):
    if s is None:
        return '_'
    return s


def encodeNoneAsUnderscore_Int(i):
    if i is None:
        return '_'
    return str(i)


class ConllToken(object):
    '''
    Represents a CoNLL token and all its properties (except index)
    '''

    def __init__(self):
        self.FORM = None
        self.LEMMA = None
        self.UPOSTAG = None
        self.XPOSTAG = None
        self.FEATS = []

        # Make sure to subtract one from the HEAD value in the file.
        # Root becomes -1; HEAD then becomes n, which refers to the n'th
        # 0-based index entry in the parent ConllSentence.
        # Our parser also requires this to start at -1.
        self.HEAD = None
        self.DEPREL = None
        self.DEPS = None
        self.MISC = None

        # morpheme extraction extension
        self.morphemes = []

    def __str__(self):
        return self.toFileOutput('_')

    def __repr__(self):
        return self.__str__()

    def parseMorphemesFromLemma(self):
        '''
        Split a LEMMA like '웅가로/NNP + 가/JKS' into
        [('웅가로', 'NNP'), ('가', 'JKS')]
        '''
        self.morphemes = []
        if self.LEMMA:
            for elem in self.LEMMA.split(' + '):
                word, pos = elem.rsplit('/', 1)
                self.morphemes.append((word, pos))

    def toFileOutput(self, ID):
        def checkTab(s):
            assert '\t' not in s, 'field must not contain a tab: ' + s
            return s

        def checkPipe(s):
            assert '|' not in s, 'field must not contain a pipe: ' + s
            return s

        assert self.FORM is not None
        assert type(self.FEATS) is list

        cols = [str(ID),
                checkTab(self.FORM),
                checkTab(encodeNoneAsUnderscore(self.LEMMA)),
                checkTab(encodeNoneAsUnderscore(self.UPOSTAG)),
                checkTab(encodeNoneAsUnderscore(self.XPOSTAG)),
                # an empty FEATS list must serialize as '_',
                # not as an empty column
                '|'.join(checkPipe(checkTab(f))
                         for f in self.FEATS) if self.FEATS else '_',
                # +1 when writing as file (HEAD is stored 0-based in
                # memory; an unset HEAD stays '_')
                encodeNoneAsUnderscore_Int(
                    self.HEAD + 1 if self.HEAD is not None else None),
                checkTab(encodeNoneAsUnderscore(self.DEPREL)),
                checkTab(encodeNoneAsUnderscore(self.DEPS)),  # TODO
                checkTab(encodeNoneAsUnderscore(self.MISC))]

        return '\t'.join(cols)


class ParsedConllToken(ConllToken):
    '''
    Represents a ConllToken, as parsed
    '''

    def __init__(self):
        super().__init__()
        self.parsedLabel = None
        self.parsedHead = None
        self.HEAD = -1  # match default value in sentence.proto

    def setParsedLabel(self, label):
        self.parsedLabel = label

    def setParsedHead(self, head):
        self.parsedHead = head

    def clearParsedHead(self):
        # match ParserState: always use -1 as <ROOT>
        self.parsedHead = -1

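# Illustrative sketch (not part of the original module): with morpheme
# parsing enabled, parseMorphemesFromLemma() splits the LEMMA column on
# ' + ' and then on the last '/' of each element:
#
#   tok = ConllToken()
#   tok.FORM = '웅가로가'
#   tok.LEMMA = '웅가로/NNP + 가/JKS'
#   tok.parseMorphemesFromLemma()
#   tok.morphemes  # -> [('웅가로', 'NNP'), ('가', 'JKS')]
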
class ConllSentence(object):
    '''
    Stores an ordered list of CoNLL tokens
    '''

    def __init__(self):
        self.tokens = []

    def toFileOutput(self):
        '''
        Convert to file output representation
        '''
        return '\n'.join(self.tokens[ID - 1].toFileOutput(ID)
                         for ID in range(1, len(self.tokens) + 1))

    def genSyntaxNetJson(self, token, break_level=None, start_index=0):
        break_contents = ''
        if break_level:
            break_contents = '''
  break_level : %s''' % break_level

        return '''token: {
  word : "%s"
  start : %d
  end : %d
  head : %d
  tag : "%s"
  category: "%s"
  label : "%s"%s
}''' % (token.FORM, start_index, start_index + len(token.FORM) - 1,
        token.HEAD, token.XPOSTAG, token.UPOSTAG, token.DEPREL,
        break_contents)

    def genSyntaxNetTextHeader(self):
        return 'text : "%s"' % ' '.join(t.FORM for t in self.tokens)

    def toSyntaxNetJson(self):
        '''
        Convert to SyntaxNet JSON format
        '''
        out = []
        start_index = 0
        out.append(self.genSyntaxNetTextHeader())
        for i in range(len(self.tokens)):
            if i == 0:
                out.append(self.genSyntaxNetJson(
                    self.tokens[i],
                    break_level='SENTENCE_BREAK',
                    start_index=start_index))
            else:
                out.append(self.genSyntaxNetJson(
                    self.tokens[i],
                    start_index=start_index))
            # assume tokens are separated by a single space
            start_index += len(self.tokens[i].FORM) + 1
        return '\n'.join(out)

    def toSimpleRepresentation(self):
        '''
        Output the tokens separated by spaces
        '''
        return ' '.join(t.FORM for t in self.tokens)


class ParsedConllSentence(ConllSentence):
    def __init__(self, docid):
        super().__init__()
        self.docid_ = docid

    def docid(self):
        return self.docid_

    ## checked accessor
    def mutableToken(self, i):
        assert i >= 0
        assert i < len(self.tokens)
        return self.tokens[i]

    def tokenSize(self):
        return len(self.tokens)

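# Illustrative sketch (not part of the original module): toFileOutput()
# re-encodes the in-memory 0-based HEAD back to the 1-based file
# convention, so the token from the sketch above with HEAD = 10
# serializes with head column 11:
#
#   sent = ConllSentence()
#   sent.tokens.append(tok)   # tok as built in the sketch above
#   tok.HEAD = 10             # 0-based in memory; -1 would be root
#   tok.DEPREL = 'NP_SBJ'
#   sent.toFileOutput()
#   # -> '1\t웅가로가\t웅가로/NNP + 가/JKS\t_\t_\t_\t11\tNP_SBJ\t_\t_'
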
class ConllFile(object):
    '''
    Stores an ordered list of sentences within a CoNLL file

    keepMalformed:        whether to retain non-projective and invalid
                          examples
    projectivize:         whether to retain non-projective examples by
                          projectivizing them
    logStats:             log statistics about the corpus
    enableLemmaMorphemes: enable parsing of morphemes in the LEMMA
                          column, format: 웅가로/NNP + 가/JKS
                          e.g.:
                          6  웅가로가  웅가로/NNP + 가/JKS  _  _  _  11  NP_SBJ  _  _
    '''

    def __init__(self, parsed=False, checkParserConformity=False,
                 keepMalformed=False, projectivize=False, logStats=False,
                 enableLemmaMorphemes=False, compatibleJamo=False):
        self.sentences = []
        # use parsed variant of structures
        self.parsed = parsed
        self.logger = logging.getLogger('ConllUtils')
        self.keepMalformed = keepMalformed
        self.projectivize = projectivize
        self.logStats = logStats
        self.enableLemmaMorphemes = enableLemmaMorphemes
        self.compatibleJamo = compatibleJamo
        self.checkParserConformity = checkParserConformity

    def read(self, s, excludeCols=[]):
        '''
        Read CoNLL-U from the given string

        excludeCols: CoNLL column indices (1-based) to exclude from
                     reading; sometimes we just want to get rid of
                     certain attributes of a token
        '''
        assert 1 not in excludeCols, 'cannot exclude reading of ID'
        assert 2 not in excludeCols, 'cannot exclude reading of FORM'

        if self.checkParserConformity:
            well_formed_inst = well_formed_filter.WellFormedFilter()
        else:
            assert self.keepMalformed, \
                'in order to discard malformed sentences, you must ' \
                'enable parser conformity checking'

        # arbitrary ID that can be used with parser
        if self.parsed:
            docid = 0

        ln_num = 0
        current_sentence = None
        # set when we encounter an error while processing a sentence
        invalid_sentence = False

        def commit(s):
            # if we're even getting rid of malformed sentences in the
            # first place...
            if self.checkParserConformity and not self.keepMalformed:
                if not well_formed_inst.isWellFormed(
                        s, projectivize=self.projectivize):
                    # if the sentence is non-projective and projectivize
                    # is enabled, the sentence will be fixed and not
                    # discarded
                    self.logger.debug(
                        'line %d: discarding malformed or '
                        'non-projective sentence: "%s"' %
                        (ln_num, s.toSimpleRepresentation()))
                    # as long as we discard the sentence here, discarded
                    # sentences' words, tags, and labels won't be added
                    # to the lexicon, which is exactly the behavior we
                    # want.
                    return
            self.sentences.append(s)

        def processUnderscore(s):
            if s == '_':
                return None
            return s

        # token index (to check that it's in order)
        current_ID = 0

        for ln in s.split('\n'):
            ln_num += 1
            ln = ln.strip()

            if ln.startswith(';'):
                # ignore comments
                continue

            if not ln:
                # a completely blank line indicates we need to commit
                # the current sentence
                if current_sentence is not None:
                    if not invalid_sentence:
                        commit(current_sentence)
                    current_sentence = None
                    current_ID = 0
                    invalid_sentence = False
                continue

            if ln[0] == '#':
                # ignore comments completely
                continue

            if invalid_sentence:
                # don't process any more tokens of an invalid sentence
                continue

            cols = [x.strip() for x in ln.split('\t')]
            assert len(cols) >= 2, \
                'line %d: must have at least ID and FORM: %s' % \
                (ln_num, str(cols))

            if '-' in cols[0] or '.' in cols[0]:
                # multi-word tokens and empty nodes are not implemented
                self.logger.warning('line %d: not implemented: ID=%s, '
                                    'invalidating sentence' %
                                    (ln_num, cols[0]))
                invalid_sentence = True
                continue

            ID = int(cols[0])
            assert ID == current_ID + 1, \
                'line %d: token IDs must be in order and increment ' \
                'by one' % ln_num
            current_ID = ID

            if current_ID == 1:
                if self.parsed:
                    current_sentence = ParsedConllSentence(docid)
                    docid += 1
                else:
                    current_sentence = ConllSentence()

            if self.parsed:
                current_token = ParsedConllToken()
            else:
                current_token = ConllToken()

            # NOTE: for SyntaxNet, digit normalization happens ONLY in
            # the lexicon builder, yet numbers end up as <UNKNOWN>
            # during training. Interesting...

            # let FORM stay an underscore if needed
            # (don't call processUnderscore())
            if self.compatibleJamo:
                cols[1] = normalizeToCompatJamo(cols[1])
            current_token.FORM = cols[1]

            if len(cols) > 2 and (3 not in excludeCols):
                # let LEMMA stay an underscore if needed
                # (don't call processUnderscore())
                if self.compatibleJamo:
                    cols[2] = normalizeToCompatJamo(cols[2])
                current_token.LEMMA = cols[2]

                if self.enableLemmaMorphemes:
                    try:
                        current_token.parseMorphemesFromLemma()
                    except Exception:
                        self.logger.warning(
                            'line %d: invalid morpheme sequence: %s, '
                            'invalidating sentence' %
                            (ln_num, current_token.LEMMA))
                        invalid_sentence = True
                        continue
            if len(cols) > 3 and (4 not in excludeCols):
                current_token.UPOSTAG = processUnderscore(cols[3])
            if len(cols) > 4 and (5 not in excludeCols):
                current_token.XPOSTAG = processUnderscore(cols[4])
            if len(cols) > 5 and (6 not in excludeCols):
                if processUnderscore(cols[5]):
                    current_token.FEATS = \
                        [x.strip() for x in cols[5].split('|')]
                else:
                    current_token.FEATS = []
            if len(cols) > 6 and (7 not in excludeCols):
                current_token.HEAD = processUnderscore(cols[6])
                if current_token.HEAD is not None:
                    if '-' in current_token.HEAD or \
                            '.' in current_token.HEAD:
                        self.logger.warning(
                            'line %d: not implemented: HEAD=%s, '
                            'invalidating sentence' %
                            (ln_num, current_token.HEAD))
                        invalid_sentence = True
                        continue
                    else:
                        # it's important for parsing that HEAD start
                        # at -1
                        current_token.HEAD = int(current_token.HEAD) - 1
            if len(cols) > 7 and (8 not in excludeCols):
                current_token.DEPREL = processUnderscore(cols[7])
            if len(cols) > 8 and (9 not in excludeCols):
                current_token.DEPS = processUnderscore(cols[8])  # TODO
            if len(cols) > 9 and (10 not in excludeCols):
                current_token.MISC = processUnderscore(cols[9])

            current_sentence.tokens.append(current_token)

        # an EOF indicates we need to commit the current sentence
        if current_sentence is not None:
            if not invalid_sentence:
                commit(current_sentence)
            current_sentence = None
            current_ID = 0
            invalid_sentence = False

        if self.logStats:
            if self.checkParserConformity:
                self.logger.info(
                    'Projectivized %d/%d non-projective sentences '
                    '(%.2f%% of set)' %
                    (well_formed_inst.projectivizedCount,
                     well_formed_inst.nonProjectiveCount,
                     100.0 * float(well_formed_inst.projectivizedCount) /
                     float(len(self.sentences))))

                # if we're even getting rid of malformed sentences in
                # the first place...
                if not self.keepMalformed:
                    if self.projectivize:
                        # the meaning of nonWellFormedCount changes when
                        # projectivize is on
                        self.logger.info(
                            'Discarded %d non-well-formed sentences' %
                            well_formed_inst.nonWellFormedCount)
                    else:
                        self.logger.info(
                            'Discarded %d non-well-formed and '
                            'non-projective sentences' %
                            well_formed_inst.nonWellFormedCount)

            self.logger.info('%d valid sentences processed in total' %
                             len(self.sentences))

    def write(self, fd):
        '''
        Write the current CoNLL-U data to the specified file descriptor
        '''
        data = [s.toFileOutput() for s in self.sentences]
        fd.write('\n\n'.join(data))
        fd.flush()

    def __iter__(self):
        index = 0
        while index < len(self.sentences):
            yield self.sentences[index]
            index += 1

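# Illustrative sketch (not part of the original module): excludeCols
# uses 1-based CoNLL-U column indices, so [7, 8] skips HEAD and DEPREL.
# keepMalformed=True is required whenever conformity checking is left
# off (read() asserts otherwise):
#
#   cf = ConllFile(keepMalformed=True)
#   cf.read(conllu_text, excludeCols=[7, 8])
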
class ParsedConllFile(ConllFile):
    def __init__(self, checkParserConformity=False, keepMalformed=False,
                 projectivize=False, logStats=False,
                 enableLemmaMorphemes=False, compatibleJamo=False):
        super().__init__(parsed=True,
                         checkParserConformity=checkParserConformity,
                         keepMalformed=keepMalformed,
                         projectivize=projectivize,
                         logStats=logStats,
                         enableLemmaMorphemes=enableLemmaMorphemes,
                         compatibleJamo=compatibleJamo)
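
Example usage, as a minimal sketch: the CoNLL-U data and the tag names below are illustrative, `well_formed_filter` must be importable, and the `jamo` package is only needed when compatibleJamo is enabled. Reading requires keepMalformed=True unless checkParserConformity is also on.

import sys

from conll_utils import ConllFile

conllu_text = (
    '1\t웅가로가\t웅가로/NNP + 가/JKS\t_\t_\t_\t2\tNP_SBJ\t_\t_\n'
    '2\t입었다\t입/VV + 었/EP + 다/EF\t_\t_\t_\t0\tVP\t_\t_\n'
)

cf = ConllFile(keepMalformed=True, enableLemmaMorphemes=True)
cf.read(conllu_text)

for sentence in cf:
    # '웅가로가 입었다'
    print(sentence.toSimpleRepresentation())
    for token in sentence.tokens:
        # HEAD is 0-based in memory (-1 is root);
        # morphemes were parsed from the LEMMA column
        print(token.FORM, token.HEAD, token.morphemes)

# round-trip back to CoNLL-U (HEAD is re-encoded as 1-based)
cf.write(sys.stdout)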