CoNLL File Parsing Module

A module for reading and writing CoNLL-U files, with optional special handling of Korean text (Hangul jamo normalization and morpheme extraction from the LEMMA column). Intended for use with POS tagging and dependency parsing.

Depends on: the well-formed/projectivize filters (the well_formed_filter module)
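
A minimal usage sketch (the corpus file name is hypothetical, and well_formed_filter must be importable alongside this module):

import logging
import conll_utils

logging.basicConfig(level=logging.INFO)

# Read a CoNLL-U corpus, discarding malformed sentences and projectivizing
# non-projective trees (both require conformity checking), and normalize
# any conjoining jamo in FORM/LEMMA to compatibility jamo.
corpus = conll_utils.ParsedConllFile(checkParserConformity=True,
                                     keepMalformed=False,
                                     projectivize=True,
                                     logStats=True,
                                     compatibleJamo=True)

with open('ko-ud-train.conllu', 'r', encoding='utf-8') as f:
    corpus.read(f.read())

for sentence in corpus:
    print(sentence.toSimpleRepresentation())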

conll_utils

'''
A set of classes to handle input and output of CoNLL-U files

http://universaldependencies.org/docs/format.html

The Parsed* classes are useful to store extra properties needed during
the parsing process that are external to the Conll instances themselves
'''

import logging
import well_formed_filter

def isNonCompatibilityJamo(c):
    assert len(c) == 1
    # HANGUL JAMO: (U+1100-U+11FF)
    return 0x1100 <= ord(c) <= 0x11FF

def normalizeToCompatJamo(s):
    from jamo import j2hcj
    out = ''
    for c in s:
        if isNonCompatibilityJamo(c):
            out += j2hcj(c)
        else:
            out += c
    assert len(s) == len(out)
    return out
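
# Hypothetical example (assumes the jamo package is installed):
# normalizeToCompatJamo('\u1112\u1161\u11AB') yields 'ㅎㅏㄴ'; each conjoining
# jamo in U+1100-U+11FF maps to exactly one compatibility jamo, so the length
# check in the assert above holds.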

def encodeNoneAsUnderscore(s):
    if s is None:
        return '_'
    else:
        return s

def encodeNoneAsUnderscore_Int(i):
    if i is None:
        return '_'
    else:
        return str(i)

'''
Represents a CoNLL token and all its properties (except index)
'''
class ConllToken(object):
    def __init__(self):
        self.FORM = None
        self.LEMMA = None
        self.UPOSTAG = None
        self.XPOSTAG = None
        self.FEATS = []

        '''
        Make sure to subtract one from the HEAD value in the file
        Root becomes -1

        HEAD then becomes n, which refers to the n'th 0-based index entry
        in the parent ConllSentence

        Our parser also requires this to start at -1
        '''
        self.HEAD = None

        self.DEPREL = None
        self.DEPS = None
        self.MISC = None

        # morpheme extraction extension
        self.morphemes = []

    def __str__(self):
        return self.toFileOutput('_')

    def __repr__(self):
        return self.__str__()

    def parseMorphemesFromLemma(self):
        self.morphemes = []
        if self.LEMMA:
            for elem in self.LEMMA.split(' + '):
                word, pos = elem.rsplit('/', 1)
                #pos = 'SEG' # HACK TEST
                self.morphemes.append((word, pos))
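
    # Illustrative example (format documented in the ConllFile docstring
    # below): with LEMMA = '웅가로/NNP + 가/JKS', parseMorphemesFromLemma()
    # sets self.morphemes to [('웅가로', 'NNP'), ('가', 'JKS')]; rsplit('/', 1)
    # preserves any '/' inside the surface form itself.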

    def toFileOutput(self, ID):
        def checkTab(s):
            assert '\t' not in s, 'field must not contain a tab: ' + s
            return s

        def checkPipe(s):
            assert '|' not in s, 'field must not contain a pipe: ' + s
            return s

        assert self.FORM is not None
        assert type(self.FEATS) is list

        cols = [str(ID),
            checkTab(self.FORM),
            checkTab(encodeNoneAsUnderscore(self.LEMMA)),
            checkTab(encodeNoneAsUnderscore(self.UPOSTAG)),
            checkTab(encodeNoneAsUnderscore(self.XPOSTAG)),
            '|'.join(checkPipe(checkTab(f)) for f in self.FEATS) if self.FEATS else '_', # '_' for empty FEATS, per CoNLL-U
            encodeNoneAsUnderscore_Int(self.HEAD + 1 if self.HEAD is not None else None), # +1 when writing as file
            checkTab(encodeNoneAsUnderscore(self.DEPREL)),
            checkTab(encodeNoneAsUnderscore(self.DEPS)),   # TODO
            checkTab(encodeNoneAsUnderscore(self.MISC))]

        return '\t'.join(cols)
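
    # Illustrative example (hypothetical values): a token with FORM='나는',
    # LEMMA='나/NP + 는/JX', UPOSTAG='PRON', HEAD=1 (0-based) and
    # DEPREL='nsubj' serializes via toFileOutput(1) to the tab-separated line:
    # 1	나는	나/NP + 는/JX	PRON	_	_	2	nsubj	_	_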

'''
Represents a ConllToken, as parsed
'''
class ParsedConllToken(ConllToken):
    def __init__(self):
        super().__init__()
        self.parsedLabel = None
        self.parsedHead = None
        self.HEAD = -1 # match default value in sentence.proto

    def setParsedLabel(self, label):
        self.parsedLabel = label

    def setParsedHead(self, head):
        self.parsedHead = head

    def clearParsedHead(self):
        self.parsedHead = -1 # match ParserState: always use -1 as <ROOT>

'''
Stores an ordered list of CoNLL tokens
'''
class ConllSentence(object):
    def __init__(self):
        self.tokens = []

    '''
    Convert to file output representation
    '''
    def toFileOutput(self):
        return '\n'.join(self.tokens[ID-1].toFileOutput(ID) \
            for ID in range(1, len(self.tokens)+1))

    def genSyntaxNetJson(self, token, break_level=None, start_index=0):
        break_contents = ''
        if break_level:
            break_contents = \
'''
  break_level       : %s''' % break_level

        return \
'''token: {
  word    : "%s"
  start   : %d
  end     : %d
  head    : %d
  tag     : "%s"
  category: "%s"
  label   : "%s"%s
}''' % (token.FORM, start_index, start_index+len(token.FORM)-1, token.HEAD,
        token.XPOSTAG, token.UPOSTAG, token.DEPREL, break_contents)

    def genSyntaxNetTextHeader(self):
        return 'text       : "%s"' % (' '.join(t.FORM for t in self.tokens))

    '''
    Convert to SyntaxNet JSON format
    '''
    def toSyntaxNetJson(self):
        out = []
        start_index = 0
        out.append(self.genSyntaxNetTextHeader())
        for i in range(len(self.tokens)):
            if i == 0:
                out.append(self.genSyntaxNetJson(self.tokens[i],
                                                 break_level='SENTENCE_BREAK',
                                                 start_index=start_index))
            else:
                out.append(self.genSyntaxNetJson(self.tokens[i],
                                                 start_index=start_index))
            start_index += len(self.tokens[i].FORM) + 1 # assume space
        return '\n'.join(out)

    '''
    Output the tokens of the sentence separated by spaces
    '''
    def toSimpleRepresentation(self):
        return ' '.join(t.FORM for t in self.tokens)
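
    # Illustrative sketch: for a two-token sentence with FORMs '나는' and
    # '간다', toSimpleRepresentation() returns '나는 간다', while
    # toSyntaxNetJson() emits the text header plus one token record per token;
    # only the first token carries break_level, and start/end are 0-based
    # character offsets computed under the same single-space assumption.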

class ParsedConllSentence(ConllSentence):
    def __init__(self, docid):
        super().__init__()
        self.docid_ = docid

    def docid(self):
        return self.docid_

    ## checked accessor
    def mutableToken(self, i):
        assert i >= 0
        assert i < len(self.tokens)
        return self.tokens[i]

    def tokenSize(self):
        return len(self.tokens)
    
'''
Stores an ordered list of sentences within a CoNLL file

keepMalformed:
Whether to retain non-projective and invalid examples

projectivize:
Whether to retain non-projective examples by projectivizing them

logStats:
Log statistics about the corpus

enableLemmaMorphemes:
Enable parsing of morphemes in the LEMMA column, format: 웅가로/NNP + 가/JKS

6	웅가로가	웅가로/NNP + 가/JKS	_	_	_	11	NP_SBJ	_	_
'''
class ConllFile(object):
    def __init__(self, parsed=False,
                 checkParserConformity=False,
                 keepMalformed=False, projectivize=False,
                 logStats=False, enableLemmaMorphemes=False,
                 compatibleJamo=False):
        #self.sentenceIndex = None
        self.sentences = []
        # use parsed variant of structures
        self.parsed = parsed
        self.logger = logging.getLogger('ConllUtils')
        self.keepMalformed = keepMalformed
        self.projectivize = projectivize
        self.logStats = logStats
        self.enableLemmaMorphemes = enableLemmaMorphemes
        self.compatibleJamo = compatibleJamo
        self.checkParserConformity = checkParserConformity


    '''
    Read CoNLL-U from the given string

    excludeCols: CoNLL column indices to exclude from reading
                 sometimes we just want to get rid of certain
                 attributes of a token
                 1-based index
    '''
    def read(self, s, excludeCols=()):
        assert 1 not in excludeCols, 'cannot exclude reading of ID'
        assert 2 not in excludeCols, 'cannot exclude reading of FORM'

        if self.checkParserConformity:
            well_formed_inst = well_formed_filter.WellFormedFilter()
        else:
            assert self.keepMalformed, 'in order to discard malformed ' \
                                       'sentences, you must enable parser ' \
                                       'conformity checking'

        # arbitrary ID that can be used with parser
        if self.parsed:
            docid = 0

        ln_num = 0

        current_sentence = None
        
        # if we encounter an error during processing a sentence
        invalid_sentence = False

        # set up iterator
        # if there is no iterator, set one up
        # if there was an iterator, leave it at its current position
        #if self.sentenceIndex == None:
        #    self.sentenceIndex = len(self.sentences)

        def commit(s):
            # if we're even getting rid of malformed sentences in the first
            # place...
            if self.checkParserConformity:
                if not self.keepMalformed:
                    if not well_formed_inst.isWellFormed(s,
                            projectivize=self.projectivize):
                        # if the sentence is non-projective and projectivize
                        # is enabled, the sentence will be fixed and not discarded
                        self.logger.debug('line %d: discarding malformed or non' \
                            '-projective sentence: "%s"' % \
                            (ln_num, s.toSimpleRepresentation()))
                        # as long as we discard the sentence here,
                        # discarded sentences' words, tags, and labels
                        # won't be added to the lexicon, which is exactly the
                        # behavior we want.
                        return

            self.sentences.append(s)

        def processUnderscore(s):
            if s == '_':
                return None
            else:
                return s

        # token index (to check that it's in order)
        current_ID = 0

        lines = s.split('\n')
        for ln in lines:
            ln_num += 1
            ln = ln.strip()
            if ln.startswith(';'):
                # ignore comments
                continue
            if not ln:
                # a completely blank line indicates we need to commit the
                # current sentence
                if current_sentence is not None:
                    if not invalid_sentence:
                        commit(current_sentence)

                    current_sentence = None
                    current_ID = 0
                    invalid_sentence = False
                continue
            if ln[0] == '#': # ignore comments completely
                continue
            if invalid_sentence: # don't process invalid sentences
                continue
            cols = [x.strip() for x in ln.split('\t')]
            assert len(cols) >= 2, \
                'line %d: must have at least ID and FORM: ' % ln_num + str(cols)

            if '-' in cols[0] or '.' in cols[0]:
                self.logger.warning('line %d: not implemented: ID=%s, ' \
                                    'invalidating sentence' % (ln_num, cols[0]))
                invalid_sentence = True
                continue
            else:
                ID = int(cols[0])
                assert ID==current_ID+1, 'line %d: token IDs must be in order' \
                   ' and increment by one' % ln_num

            current_ID = ID

            if current_ID == 1:
                if self.parsed:
                    current_sentence = ParsedConllSentence(docid)
                    docid += 1
                else:
                    current_sentence = ConllSentence()

            if self.parsed:
                current_token = ParsedConllToken()
            else:
                current_token = ConllToken()

            #if self.parsed:
            #    current_token.FORM = normalizeDigits(cols[1])
            #else:
            #    current_token.FORM = cols[1]

            # for SyntaxNet,
            # normalization ONLY happens in lexicon builder
            # yet numbers end up as <UNKNOWN> during training
            # interesting...

            # let this be underscore if needed (don't call processUnderscore())
            if self.compatibleJamo:
                cols[1] = normalizeToCompatJamo(cols[1])
            current_token.FORM = cols[1]

            if len(cols) > 2 and (3 not in excludeCols):
                # let this be underscore if needed
                # (don't call processUnderscore())
                if self.compatibleJamo:
                    cols[2] = normalizeToCompatJamo(cols[2])
                current_token.LEMMA = cols[2]

                if self.enableLemmaMorphemes:
                    try:
                        current_token.parseMorphemesFromLemma()
                    except ValueError: # raised when rsplit('/', 1) cannot unpack
                        self.logger.warning('line %d: invalid morpheme '
                                            'sequence: %s,'
                            ' invalidating sentence' % (ln_num, \
                            current_token.LEMMA))

                        invalid_sentence = True
                        continue
            if len(cols) > 3 and (4 not in excludeCols):
                current_token.UPOSTAG = processUnderscore(cols[3])
            if len(cols) > 4 and (5 not in excludeCols):
                current_token.XPOSTAG = processUnderscore(cols[4])
            if len(cols) > 5 and (6 not in excludeCols):
                if processUnderscore(cols[5]):
                    current_token.FEATS = \
                        [x.strip() for x in cols[5].split('|')]
                else:
                    current_token.FEATS = []
            if len(cols) > 6 and (7 not in excludeCols):
                current_token.HEAD = processUnderscore(cols[6])
                if current_token.HEAD is not None:
                    if '-' in current_token.HEAD or '.' in current_token.HEAD:
                        self.logger.warning('line %d: not implemented: HEAD=%s,'
                            ' invalidating sentence' % (ln_num, \
                            current_token.HEAD))

                        invalid_sentence = True
                        continue
                    else:
                        # it's important for parsing that HEAD start at -1
                        current_token.HEAD = int(current_token.HEAD)-1
            if len(cols) > 7 and (8 not in excludeCols):
                current_token.DEPREL = processUnderscore(cols[7])
            if len(cols) > 8 and (9 not in excludeCols):
                # TODO
                current_token.DEPS = processUnderscore(cols[8])
            if len(cols) > 9 and (10 not in excludeCols):
                current_token.MISC = processUnderscore(cols[9])

            current_sentence.tokens.append(current_token)

        # an EOF indicates we need to commit the current sentence
        if current_sentence is not None:
            if not invalid_sentence:
                commit(current_sentence)

            current_sentence = None
            current_ID = 0
            invalid_sentence = False

        if self.logStats:
            if self.checkParserConformity:
                self.logger.info('Projectivized %d/%d non-projective sentences' \
                    ' (%.2f%% of set)' % \
                    (well_formed_inst.projectivizedCount, \
                    well_formed_inst.nonProjectiveCount,
                    100.0 * float(well_formed_inst.projectivizedCount) \
                        / float(len(self.sentences))
                    ))

            # if we're even getting rid of malformed sentences in the first
            # place...
            if self.checkParserConformity:
                if not self.keepMalformed:
                    if self.projectivize:
                        # the definition of this variable changes when projectivize
                        # is on
                        self.logger.info('Discarded %d non-well-formed sentences'
                                         % (well_formed_inst.nonWellFormedCount))
                    else:
                        self.logger.info('Discarded %d non-well-formed and ' \
                            'non-projective sentences' % \
                            (well_formed_inst.nonWellFormedCount))

            self.logger.info('%d valid sentences processed in total' % \
                len(self.sentences))

    '''
    Write the current CoNLL-U data to the specified file descriptor
    '''
    def write(self, fd):
        data = [s.toFileOutput() for s in self.sentences]
        fd.write('\n\n'.join(data))
        fd.flush()

    def __iter__(self):
        index = 0
        while index < len(self.sentences):
            yield self.sentences[index]
            index += 1

class ParsedConllFile(ConllFile):
    def __init__(self, checkParserConformity=False, keepMalformed=False,
                 projectivize=False, logStats=False,
                 enableLemmaMorphemes=False, compatibleJamo=False):
        super().__init__(parsed=True,
                         checkParserConformity=checkParserConformity,
                         keepMalformed=keepMalformed,
                         projectivize=projectivize, logStats=logStats,
                         enableLemmaMorphemes=enableLemmaMorphemes,
                         compatibleJamo=compatibleJamo)
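
Continuing the sketch from the top of the post, writing the retained sentences back out is a single call (the output file name is hypothetical):

# Sentences are separated by blank lines; HEAD values are re-encoded as
# 1-based, and empty fields are written as '_' per CoNLL-U.
with open('ko-ud-train.clean.conllu', 'w', encoding='utf-8') as f:
    corpus.write(f)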

 
