Convert Sejong POS-tagged corpus format to CoNLL-U format (useful for training Google SyntaxNet)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Convert Sejong POS-tagged corpus format to CoNLL-U format for use with Google
SyntaxNet
http://universaldependencies.org/docs/format.html
Outputs training, tuning, and testing sets (a 60-20-20 split, chosen
deterministically by sentence index rather than randomly)
Usage: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>
sejong_corpus_dir should contain the UTF-16 corpus text files
(BTAA0001.txt, etc.)
output_dir will contain:
- tagged-training-corpus.conllu
- tagged-tuning-corpus.conllu
- tagged-dev-corpus.conllu (evaluation set)
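
Example invocation (paths are illustrative):
    ./corpus_to_conll.py ./sejong_corpus ./conll_output

Illustrative mapping (taken from the comments below; columns are
tab-separated in the real output): a Sejong line such as
    BTAA0001-00000013  세계적인  세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM
contributes one CoNLL-U row per morpheme, e.g. at the start of a sentence:
    1  세계  세계  NNG  NNG  _  _  _
    2  적    적    XSN  XSN  _  _  _
    3  이    이    VCP  VCP  _  _  _
    4  ᆫ    ᆫ    ETM  ETM  _  _  _
LEMMA mirrors FORM and UPOSTAG mirrors the Sejong tag; only the first eight
CoNLL-U columns are written (DEPS and MISC are omitted).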
'''
import os, sys
if len(sys.argv) < 3:
    print('Usage: ./corpus_to_conll.py <sejong_corpus_dir> <output_dir>')
    sys.exit(1)
SEJONG_PATH = sys.argv[1]
output_dir = sys.argv[2]
from os import listdir
from os.path import isfile, join
SEJONG_FILES = [join(SEJONG_PATH, f) for f in listdir(SEJONG_PATH) if isfile(join(SEJONG_PATH, f))]
# Process in sensible order
SEJONG_FILES.sort()
if output_dir.endswith('/'):
    output_dir = output_dir[:-1]
try:
    os.mkdir(output_dir)
except FileExistsError:
    pass
print('Processing corpus and outputting to %s...\n' % output_dir)
class CoNLLSentence(object):
    def __init__(self):
        # FORM holds the surface morphemes, e.g. 장식품/NNG + 으로/JKB + …/SE
        # gives [장식품, 으로, …]
        self.FORM = []
        # XPOSTAG holds the matching Sejong tags, e.g. ['NNG', 'JKB', 'SE']
        self.XPOSTAG = []
    def toString(self):
        assert len(self.FORM) == len(self.XPOSTAG)
        self.ID = [i + 1 for i in range(len(self.FORM))]
        # No lemmatization or universal-tag mapping is attempted:
        # LEMMA mirrors FORM and UPOSTAG mirrors the Sejong XPOSTAG.
        self.LEMMA = self.FORM
        self.UPOSTAG = self.XPOSTAG
        self.FEATS = ['_' for i in range(len(self.FORM))]
        self.HEAD = ['_' for i in range(len(self.FORM))]
        self.DEPREL = ['_' for i in range(len(self.FORM))]
        s = []
        for i in range(len(self.FORM)):
            s.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.ID[i],
                                                         self.FORM[i],
                                                         self.LEMMA[i],
                                                         self.UPOSTAG[i],
                                                         self.XPOSTAG[i],
                                                         self.FEATS[i],
                                                         self.HEAD[i],
                                                         self.DEPREL[i]))
        return '\n'.join(s)
'''
Process one corpus file and return a (tagset, sentences) tuple.
'''
def processFile(path):
    # BTAE0201.txt has an encoding error, so be lenient.
    f = open(path, 'r', encoding='utf16', errors='ignore')
    contents = f.read()
    f.close()

    # Some of the files don't have <group>!
    # strings -f -el *.txt | grep "<group>"
    # For example, BTAZ0223.txt
    contents = contents.replace('<head>', '<p>')
    contents = contents.replace('</head>', '</p>')

    # Sejong tags seen in this file
    tagset = set()
    # CoNLL sentences
    sentences = []

    # There's even <p> before </teiHeader>
    paras = contents.split('</teiHeader>')[1].split('<p>')[1:]
    for p in paras:
        p = p.split('</p>')[0].strip()
        sentence = CoNLLSentence()
        try:
            lines = p.split('\n')
            for ln in lines:
                if not ln:
                    continue
                # BTAA0001-00000013	세계적인	세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM
                cols = ln.split('\t')
                if '<' in cols[0]:
                    # Do we want to process dates? Not sure yet...
                    print('Ignoring tag: ' + cols[0])
                    break
                if len(cols) != 3:
                    raise ValueError('Parsing error in %s! Column may contain tab or unexpected tag' % (path))
                lineid = cols[0].strip()
                # Don't strip the other columns: sometimes spaces are tagged,
                # and apparently they can appear anywhere.
                # BTHO0432-00032162	주장자( 杖子)를	주장자/NNG + (/SS + /SW + 杖子/SH + )/SS + 를/JKO
                word = cols[1]
                # Some data even contains a bare '+', so split on ' + ' (with spaces).
                # BTBA0233-00016517	2(남북한)+4(미	2/SN + (/SS + 남/NNP + 북한/NNP + )/SS + +/SW + 4/SN + (/SS + 미/NNP
                tags = cols[2].split(' + ')
                for tagelem in tags:
                    slashes = tagelem.split('/')
                    # The word component itself may contain slashes, so the tag is
                    # whatever follows the last '/':
                    # >>> '////////SP'.split('/') -> ['', '', '', '', '', '', '', '', 'SP']
                    wordcomponent = '/'.join(slashes[:-1])
                    tag = slashes[-1]
                    if (not wordcomponent) or (not tag):
                        raise ValueError('Parsing error in %s! Word or tag empty: %s' % (path, str(tags)))
                    assert '\t' not in wordcomponent
                    assert '\n' not in wordcomponent
                    assert '\t' not in tag
                    assert '\n' not in tag
                    sentence.FORM.append(wordcomponent)
                    sentence.XPOSTAG.append(tag)
                    tagset.add(tag)
            # Skip sentences that produced no tokens (e.g. a paragraph that only
            # contained markup); they would otherwise become empty CoNLL-U blocks.
            if sentence.FORM:
                sentences.append(sentence)
        except Exception as e:
            print(e)
            print('Processing paragraph failed.')
    return (tagset, sentences)
tagset = set()
sentences = list()
print('Found %d files to process.' % len(SEJONG_FILES))
m = 0
#SEJONG_FILES = ['../data/sejong_entire/BTHO0136.txt']
for f in SEJONG_FILES:
    m += 1
    print('Processing %s (%0.2f%%)' % (f, 100.0 * float(m) / float(len(SEJONG_FILES))))
    thisFileTagset, thisFileSentences = processFile(f)
    tagset = tagset.union(thisFileTagset)
    sentences += thisFileSentences
# index of sentences within sentences list for each set
training_set_indices = []
tuning_set_indices = []
testing_set_indices = []
# Deterministic 60-20-20 split (training, tuning, testing) by sentence index
for i in range(len(sentences)):
    if i % 10 == 0 or i % 10 == 1:    # ~20% -> tuning
        tuning_set_indices.append(i)
    elif i % 10 == 2 or i % 10 == 3:  # ~20% -> testing
        testing_set_indices.append(i)
    else:                             # ~60% -> training
        training_set_indices.append(i)
print()
print('Found %d sentences (%d-%d-%d).' % (len(sentences),
len(training_set_indices), len(tuning_set_indices), len(testing_set_indices)))
print('Complete tagset found: ' + str(tagset))
print()
print('Writing %s/tagged-training-corpus.conllu (%d sentences)...' % (output_dir, len(training_set_indices)))
fd = open('%s/tagged-training-corpus.conllu' % output_dir, 'w', encoding='utf-8')
for i in range(len(training_set_indices)):
    idx = training_set_indices[i]
    fd.write(sentences[idx].toString() + '\n')
    # no extra newline after the last sentence
    if i < len(training_set_indices) - 1:
        fd.write('\n')
fd.close()
print('Writing %s/tagged-tuning-corpus.conllu (%d sentences)...' % (output_dir, len(tuning_set_indices)))
fd = open('%s/tagged-tuning-corpus.conllu' % output_dir, 'w', encoding='utf-8')
for i in range(len(tuning_set_indices)):
    idx = tuning_set_indices[i]
    fd.write(sentences[idx].toString() + '\n')
    if i < len(tuning_set_indices) - 1:
        fd.write('\n')
fd.close()
print('Writing %s/tagged-dev-corpus.conllu (%d sentences)...' % (output_dir, len(testing_set_indices)))
fd = open('%s/tagged-dev-corpus.conllu' % output_dir, 'w', encoding='utf-8')
for i in range(len(testing_set_indices)):
    idx = testing_set_indices[i]
    fd.write(sentences[idx].toString() + '\n')
    if i < len(testing_set_indices) - 1:
        fd.write('\n')
fd.close()
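
# --- Optional sanity check (illustrative addition, not part of the original ---
# --- conversion logic) ---
# CoNLL-U separates sentences with blank lines, so counting blank-line-delimited
# blocks in each output file should reproduce the sentence counts printed above.
for name in ('tagged-training-corpus.conllu',
             'tagged-tuning-corpus.conllu',
             'tagged-dev-corpus.conllu'):
    outpath = '%s/%s' % (output_dir, name)
    with open(outpath, 'r', encoding='utf-8') as check:
        blocks = [b for b in check.read().split('\n\n') if b.strip()]
    print('%s: %d sentences' % (outpath, len(blocks)))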