Convert Sejong POS-tagged corpus format to CoNLL-U format

Convert Sejong POS-tagged corpus format to CoNLL-U format (useful for training Google SyntaxNet)
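To give a sense of the mapping: a tagged line from the corpus (this one appears as a comment in the script below) pairs each word with its morpheme analysis, and the script emits one CoNLL-U token row per morpheme. Roughly, and assuming the word opens a paragraph so the numbering starts at 1, the Sejong tag fills both the UPOSTAG and XPOSTAG columns and the remaining columns are '_':

BTAA0001-00000013	세계적인	세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM

1	세계	세계	NNG	NNG	_	_	_	_	_
2	적	적	XSN	XSN	_	_	_	_	_
3	이	이	VCP	VCP	_	_	_	_	_
4	ᆫ	ᆫ	ETM	ETM	_	_	_	_	_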

#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Convert Sejong POS-tagged corpus format to CoNLL-U format for use with Google
SyntaxNet

http://universaldependencies.org/docs/format.html

Outputs training, tuning, and testing sets (60-20-20 split, chosen
deterministically by sentence index)

Arguments: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>

sejong_corpus_dir should contain UTF-16 text files
(BTAA0001.txt, etc...)

output_dir will contain:
- tagged-training-corpus.conllu
- tagged-tuning-corpus.conllu
- tagged-dev-corpus.conllu (evaluation set)
'''

import os, sys

if len(sys.argv) < 3:
    print('Arguments: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>')
    sys.exit(1)

SEJONG_PATH = sys.argv[1]
output_dir = sys.argv[2]

from os import listdir
from os.path import isfile, join
SEJONG_FILES = [join(SEJONG_PATH, f) for f in listdir(SEJONG_PATH) if isfile(join(SEJONG_PATH, f))]
# Process in sensible order
SEJONG_FILES.sort()

if output_dir.endswith('/'):
    output_dir = output_dir[:-1]

try:
    os.mkdir(output_dir)
except FileExistsError:
    pass

print('Processing corpus and outputting to %s...\n' % output_dir)

class CoNLLSentence(object):
    def __init__(self):
        # 장식품/NNG + 으로/JKB + …/SE
        # [장식품, 으로, …]
        self.FORM = []
        # ['NNG', 'JKB', 'SE']
        self.XPOSTAG = []

    def toString(self):
        assert(len(self.FORM) == len(self.XPOSTAG))

        self.ID = [(i+1) for i in range(0, len(self.FORM))]
        self.LEMMA = self.FORM
        # No mapping to Universal POS tags is attempted; the Sejong tag is
        # used for both UPOSTAG and XPOSTAG
        self.UPOSTAG = self.XPOSTAG
        self.FEATS = ['_' for i in range(0, len(self.FORM))]
        self.HEAD = ['_' for i in range(0, len(self.FORM))]
        self.DEPREL = ['_' for i in range(0, len(self.FORM))]
        # CoNLL-U rows have 10 tab-separated columns; DEPS and MISC carry
        # no information here, so fill them with '_' as well
        self.DEPS = ['_' for i in range(0, len(self.FORM))]
        self.MISC = ['_' for i in range(0, len(self.FORM))]

        s = []

        for i in range(0, len(self.FORM)):
            s.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.ID[i],
                self.FORM[i],
                self.LEMMA[i],
                self.UPOSTAG[i],
                self.XPOSTAG[i],
                self.FEATS[i],
                self.HEAD[i],
                self.DEPREL[i],
                self.DEPS[i],
                self.MISC[i]))

        return '\n'.join(s)

'''
Process a file and return a list of sentences
'''
def processFile(path):
    # BTAE0201.txt has an encoding error, so be lenient
    f = open(path, 'r', encoding='utf16', errors='ignore')
    contents = f.read()
    f.close()
    
    # Some of the files don't have <group>!
    # strings -f -el *.txt |grep "<group>"
    # For example, BTAZ0223.txt
    
    contents = contents.replace('<head>', '<p>')
    contents = contents.replace('</head>', '</p>')
    
    # Detected Sejong tagset
    tagset = set()

    # CoNLL sentences
    sentences = []
    
    # There's even <p> before </teiHeader>
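    # i.e. keep only the body text after </teiHeader>, then take whatever
    # follows each <p> (trimmed back to </p> in the loop below)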
    paras = contents.split('</teiHeader>')[1].split('<p>')[1:]
    for p in paras:
        p = p.split('</p>')[0].strip()
        
        sentence = CoNLLSentence()

        try:
            lines = p.split('\n')
            #skipPara = False
            for ln in lines:
                if not ln:
                    continue
                
                #BTAA0001-00000013	세계적인	세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM
                cols = ln.split('\t')
                
                if ('<' in cols[0]):
                    # Do we want to process dates? Not sure yet...
                    print('Ignoring tag: ' + cols[0])
                    #skipPara = True
                    break
                
                if len(cols) != 3:
                    #print('Parsing error in %s! Column may contain tab or unexpected tag' % (path))
                    raise ValueError('Parsing error in %s! Column may contain tab or unexpected tag' % (path))
                
                #print(cols)
                lineid = cols[0].strip()
                # Don't strip the other columns: spaces are sometimes tagged
                # as morphemes, and they can appear anywhere, e.g.:
                # BTHO0432-00032162	주장자( 杖子)를	주장자/NNG + (/SS +  /SW + 杖子/SH + )/SS + 를/JKO
                word = cols[1]
                # BTBA0233-00016517	2(남북한)+4(미	2/SN + (/SS + 남/NNP + 북한/NNP + )/SS + +/SW + 4/SN + (/SS + 미/NNP
                # The text itself can contain '+', so split on ' + '
                # (with surrounding spaces), not on '+'
                tags = cols[2].split(' + ')
                
                for tagelem in tags:
                    # A component can itself contain '/' (e.g. '////////SP'),
                    # so only the text after the last '/' is the tag
                    wordcomponent, _, tag = tagelem.rpartition('/')
                    
                    if (not wordcomponent) or (not tag):
                        #print('Parsing error in %s! Word or tag empty: %s' % (path, str(tags)))
                        raise ValueError('Parsing error in %s! Word or tag empty: %s' % (path, str(tags)))
                    
                    #print(wordcomponent, tag)
                    assert(not('\t' in wordcomponent))
                    assert(not('\n' in wordcomponent))
                    assert(not('\t' in tag))
                    assert(not('\n' in tag))
                    sentence.FORM.append(wordcomponent)
                    sentence.XPOSTAG.append(tag)
                    tagset.add(tag)

            #print(sentence.toString() + '\n')
            # Skip paragraphs that yielded no tokens (e.g. ones broken out
            # of above on an unexpected tag), so no empty records are written
            if sentence.FORM:
                sentences.append(sentence)
        except Exception as e:
            print(e)
            print('Processing paragraph failed.')

    return (tagset, sentences)

tagset = set()
sentences = list()

print('Found %d files to process.' % len(SEJONG_FILES))

m = 0
#SEJONG_FILES = ['../data/sejong_entire/BTHO0136.txt']
for f in SEJONG_FILES:
    m += 1
    print('Processing %s (%0.2f%%)' % (f, 100.0 * float(m) / float(len(SEJONG_FILES))))
    thisFileTagset, thisFileSentences = processFile(f)
    tagset = tagset.union(thisFileTagset)
    sentences += thisFileSentences

# index of sentences within sentences list for each set
training_set_indices = []
tuning_set_indices = []
testing_set_indices = []

# split to 60-20-20 (training, tuning, testing sets)
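# (i % 10 of 0 or 1 -> tuning, 2 or 3 -> testing, anything else -> training;
# deterministic by sentence index, so the split is reproducible across runs)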
for i in range(len(sentences)):
    if i % 10 == 0 or i % 10 == 1:     # ~20%
        tuning_set_indices.append(i)
    elif i % 10 == 2 or i % 10 == 3:   # ~20%
        testing_set_indices.append(i)
    else:                               # ~60%
        training_set_indices.append(i)

print()
print('Found %d sentences (%d-%d-%d).' % (len(sentences),
    len(training_set_indices), len(tuning_set_indices), len(testing_set_indices)))
print('Complete tagset found: ' + str(tagset))
print()

'''
Write the sentences at the given indices to a .conllu file, separated by
blank lines (with no extra blank line after the last sentence)
'''
def writeSet(filename, indices):
    path = '%s/%s' % (output_dir, filename)
    print('Writing %s (%d sentences)...' % (path, len(indices)))
    fd = open(path, 'w', encoding='utf-8')
    for i in range(len(indices)):
        fd.write(sentences[indices[i]].toString() + '\n')
        if i < len(indices) - 1:
            fd.write('\n')
    fd.close()

writeSet('tagged-training-corpus.conllu', training_set_indices)
writeSet('tagged-tuning-corpus.conllu', tuning_set_indices)
writeSet('tagged-dev-corpus.conllu', testing_set_indices)
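
A typical invocation looks like this (the corpus directory name is illustrative):

./corpus_to_conll.py ./sejong_entire ./conll_output

And a minimal sanity check of the result might look like the following sketch (separate from the converter; the hard-coded path is just an example). It verifies that sentence blocks are separated by blank lines and that every token row has exactly the 10 tab-separated columns CoNLL-U calls for:

# check_conllu.py: sanity-check a file produced by corpus_to_conll.py
with open('conll_output/tagged-training-corpus.conllu', encoding='utf-8') as fd:
    # sentences are written with one blank line between them
    blocks = [b for b in fd.read().split('\n\n') if b.strip()]
for b in blocks:
    for row in b.strip('\n').split('\n'):
        assert len(row.split('\t')) == 10, 'bad row: %r' % row
print('%d sentences, all rows well-formed' % len(blocks))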
