Convert Sejong POS-tagged corpus format to CoNLL-U format (useful for training Google SyntaxNet)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Convert Sejong POS-tagged corpus format to CoNLL-U format for use with Google
SyntaxNet
http://universaldependencies.org/docs/format.html
Outputs training, tuning, and testing sets (60-20-20 split, chosen deterministically by sentence index)
Arguments: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>
sejong_corpus_dir should contain UTF-16 text files
(BTAA0001.txt, etc...)
output_dir will contain:
- tagged-training-corpus.conllu
- tagged-tuning-corpus.conllu
- tagged-dev-corpus.conllu (evaluation set)
'''
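
# Illustrative example (the input line below is a real corpus line, quoted again
# in processFile(); the output reflects this script's column choices, with
# HEAD/DEPREL/DEPS/MISC left as '_'):
#
#   BTAA0001-00000013	세계적인	세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM
#
# becomes one 10-column CoNLL-U line per morpheme:
#
#   1	세계	세계	NNG	NNG	_	_	_	_	_
#   2	적	적	XSN	XSN	_	_	_	_	_
#   3	이	이	VCP	VCP	_	_	_	_	_
#   4	ᆫ	ᆫ	ETM	ETM	_	_	_	_	_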
import os
import sys
from os import listdir
from os.path import isfile, join

if len(sys.argv) < 3:
    print('Arguments: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>')
    sys.exit(1)
SEJONG_PATH = sys.argv[1]
output_dir = sys.argv[2]
SEJONG_FILES = [join(SEJONG_PATH, f) for f in listdir(SEJONG_PATH) if isfile(join(SEJONG_PATH, f))]
# Sort so the file order (and hence the data split below) is deterministic
SEJONG_FILES.sort()
if output_dir.endswith('/'):
output_dir = output_dir[:-1]
try:
os.mkdir(output_dir)
except FileExistsError:
pass
print('Processing corpus and outputting to %s...\n' % output_dir)
class CoNLLSentence(object):
def __init__(self):
# 장식품/NNG + 으로/JKB + …/SE
# [장식품, 으로, …]
self.FORM = []
# ['NNG', 'JKB', 'SE']
self.XPOSTAG = []
    def toString(self):
        assert(len(self.FORM) == len(self.XPOSTAG))
        self.ID = [(i + 1) for i in range(len(self.FORM))]
        self.LEMMA = self.FORM
        # No Sejong-to-Universal tag mapping yet, so reuse the Sejong tag
        self.UPOSTAG = self.XPOSTAG
        self.FEATS = ['_'] * len(self.FORM)
        self.HEAD = ['_'] * len(self.FORM)
        self.DEPREL = ['_'] * len(self.FORM)
        # CoNLL-U requires 10 columns; leave DEPS and MISC unset ('_')
        self.DEPS = ['_'] * len(self.FORM)
        self.MISC = ['_'] * len(self.FORM)
        s = []
        for i in range(len(self.FORM)):
            s.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.ID[i],
                                                                 self.FORM[i],
                                                                 self.LEMMA[i],
                                                                 self.UPOSTAG[i],
                                                                 self.XPOSTAG[i],
                                                                 self.FEATS[i],
                                                                 self.HEAD[i],
                                                                 self.DEPREL[i],
                                                                 self.DEPS[i],
                                                                 self.MISC[i]))
        return '\n'.join(s)
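
# Minimal usage sketch for CoNLLSentence (illustrative values, taken from the
# tagging example in the class comments above):
#   s = CoNLLSentence()
#   s.FORM = ['장식품', '으로']
#   s.XPOSTAG = ['NNG', 'JKB']
#   print(s.toString())  # two tab-separated CoNLL-U token lines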
def processFile(path):
    '''Process one Sejong corpus file and return (tagset, sentences).'''
    # BTAE0201.txt has an encoding error, so be lenient
    f = open(path, 'r', encoding='utf16', errors='ignore')
    contents = f.read()
    f.close()
# Some of the files don't have <group>!
# strings -f -el *.txt |grep "<group>"
# For example, BTAZ0223.txt
contents = contents.replace('<head>', '<p>')
contents = contents.replace('</head>', '</p>')
# Detected Sejong tagset
tagset = set()
# CoNLL sentences
sentences = []
# There's even <p> before </teiHeader>
paras = contents.split('</teiHeader>')[1].split('<p>')[1:]
for p in paras:
p = p.split('</p>')[0].strip()
sentence = CoNLLSentence()
try:
lines = p.split('\n')
#skipPara = False
for ln in lines:
if not ln:
continue
#BTAA0001-00000013 세계적인 세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM
cols = ln.split('\t')
if ('<' in cols[0]):
# Do we want to process dates? Not sure yet...
print('Ignoring tag: ' + cols[0])
#skipPara = True
break
if len(cols) != 3:
#print('Parsing error in %s! Column may contain tab or unexpected tag' % (path))
raise ValueError('Parsing error in %s! Column may contain tab or unexpected tag' % (path))
#print(cols)
lineid = cols[0].strip()
                # Deliberately don't strip the other columns:
                # sometimes spaces are tagged, and they can appear anywhere
# BTHO0432-00032162 주장자( 杖子)를 주장자/NNG + (/SS + /SW + 杖子/SH + )/SS + 를/JKO
word = cols[1]
# BTBA0233-00016517 2(남북한)+4(미 2/SN + (/SS + 남/NNP + 북한/NNP + )/SS + +/SW + 4/SN + (/SS + 미/NNP
# Some data even contains +. What we really want is to split by ' + '
tags = cols[2].split(' + ')
for tagelem in tags:
slashes = tagelem.split('/')
# sometimes component contains slash
# only consider tag to be after last /
# >>> slashes = '////////SP'.split('/')
                    wordcomponent = '/'.join(slashes[:-1])
                    tag = slashes[-1]
if (not wordcomponent) or (not tag):
#print('Parsing error in %s! Word or tag empty: %s' % (path, str(tags)))
raise ValueError('Parsing error in %s! Word or tag empty: %s' % (path, str(tags)))
#print(wordcomponent, tag)
assert(not('\t' in wordcomponent))
assert(not('\n' in wordcomponent))
assert(not('\t' in tag))
assert(not('\n' in tag))
sentence.FORM.append(wordcomponent)
sentence.XPOSTAG.append(tag)
tagset.add(tag)
#print(sentence.toString() + '\n')
            # don't emit empty sentences (e.g. paragraphs that began with a tag)
            if sentence.FORM:
                sentences.append(sentence)
except Exception as e:
print(e)
print('Processing paragraph failed.')
return (tagset, sentences)
tagset = set()
sentences = list()
print('Found %d files to process.' % len(SEJONG_FILES))
#SEJONG_FILES = ['../data/sejong_entire/BTHO0136.txt']
for m, f in enumerate(SEJONG_FILES, start=1):
    print('Processing %s (%0.2f%%)' % (f, 100.0 * m / len(SEJONG_FILES)))
    thisFileTagset, thisFileSentences = processFile(f)
    tagset = tagset.union(thisFileTagset)
    sentences += thisFileSentences
# indices into the sentences list for each output set
training_set_indices = []
tuning_set_indices = []
testing_set_indices = []
# split 60-20-20 (training, tuning, testing sets) deterministically by index
for i in range(len(sentences)):
if i % 10 == 0 or i % 10 == 1: # ~20%
tuning_set_indices.append(i)
elif i % 10 == 2 or i % 10 == 3: # ~20%
testing_set_indices.append(i)
else: # ~60%
training_set_indices.append(i)
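# Illustrative: within each run of ten consecutive sentences, indices 0-1 go to
# tuning, 2-3 to testing, and 4-9 to training, yielding the 20-20-60 split.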
print()
print('Found %d sentences (%d-%d-%d).' % (len(sentences),
len(training_set_indices), len(tuning_set_indices), len(testing_set_indices)))
print('Complete tagset found: ' + str(tagset))
print()
def writeCorpus(filename, indices):
    '''Write the sentences at the given indices as blank-line-separated
    CoNLL-U sentence blocks.'''
    path = '%s/%s' % (output_dir, filename)
    print('Writing %s (%d sentences)...' % (path, len(indices)))
    fd = open(path, 'w', encoding='utf-8')
    for i in range(len(indices)):
        fd.write(sentences[indices[i]].toString() + '\n')
        # no extra newline after last element
        if i < len(indices) - 1:
            fd.write('\n')
    fd.close()

writeCorpus('tagged-training-corpus.conllu', training_set_indices)
writeCorpus('tagged-tuning-corpus.conllu', tuning_set_indices)
writeCorpus('tagged-dev-corpus.conllu', testing_set_indices)