Align TED English and Korean XML corpora (requires preprocessing/normalizing/tokenizing code)

This is a script for aligning the English and Korean TED XML corpora available at the link below. It also writes the vocabulary files needed by the TensorFlow NMT training example.

https://wit3.fbk.eu/mono.php?release=XML_releases&tinfo=cleanedhtml_ted
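
The script imports Normalizer and Tokenizer from local normalizer/tokenizer modules that are not included here. A minimal stand-in, just enough to make the script run, might look like the sketch below; this is an assumption, since the real modules presumably perform proper language-specific normalization and tokenization rather than plain whitespace handling.

# normalizer.py -- hypothetical stand-in for the module the script imports
class Normalizer(object):
    @staticmethod
    def normalize(text):
        # collapse whitespace runs; the real module likely does far more
        return ' '.join(text.split())

# tokenizer.py -- hypothetical stand-in; flags are accepted but ignored here
class Tokenizer(object):
    @staticmethod
    def tokenize(text, keep_punctuation=True, keep_symbols=True):
        # naive whitespace tokenization
        return text.split()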

 

#!/usr/bin/python3
'''
Align TED English and Korean XML corpora:
https://wit3.fbk.eu/mono.php?release=XML_releases&tinfo=cleanedhtml_ted
'''
import random
from normalizer import Normalizer
from tokenizer import Tokenizer

# character thresholds: sentence pairs where either side has fewer
# characters than these are dropped
ko_len_threshold = 10
en_len_threshold = 20

class TEDVideo(object):
    def __init__(self):
        self.url = None
        self.transcriptions = []

def parse_ted_file(fn):
    '''
    Parse an array of TEDVideo objects from an XML file.
    '''
    all_videos = []
    with open(fn, 'r', encoding='utf-8') as fd:
        contents = fd.read()
    for tmp in contents.split('<file')[1:]:
        ted_video = TEDVideo()
        ted_video.url = tmp.split('<url>')[1].split('</url>')[0].lower().strip()
        transcription_part = tmp.split('<transcription>')[1].split('</transcription')[0]
        invalid_file = False
        for ln in transcription_part.split('\n'):
            ln = ln.strip()
            if not ln:
                continue
            #print(ln)
            if '" />' in ln:
                # malformed self-closing line, e.g. <seekvideo id="551600" />;
                # mark the whole file invalid so it is skipped below
                invalid_file = True
                continue
            seekvideo_time = int(ln.split(' id="')[1].split('"')[0])
            seekvideo_text = ln.split('">')[1].split('</seekvideo>')[0].strip()

            seekvideo_text_normalized = Normalizer.normalize(seekvideo_text)
            tk = Tokenizer.tokenize(seekvideo_text_normalized,
                                    keep_punctuation=True,
                                    keep_symbols=True)

            #print((seekvideo_time, seekvideo_text))
            ted_video.transcriptions.append((seekvideo_time, ' '.join(tk)))
        if not invalid_file:
            all_videos.append(ted_video)
    return all_videos
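
# For reference, each <file> chunk sliced apart above is assumed to look
# roughly like this (element names taken from the split patterns; the exact
# attributes and layout of the real WIT3 dumps may differ):
#
#   <file id="1">
#     <url>http://www.ted.com/talks/some_talk</url>
#     ...
#     <transcription>
#       <seekvideo id="800">First caption line</seekvideo>
#       <seekvideo id="4200">Second caption line</seekvideo>
#     </transcription>
#   </file>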

parsed_en_videos = parse_ted_file('ted_en-20160408.xml')
parsed_ko_videos = parse_ted_file('ted_ko-20160408.xml')

url_set_en = {v.url for v in parsed_en_videos}
url_set_ko = {v.url for v in parsed_ko_videos}
monoling_set = url_set_en | url_set_ko
biling_set = url_set_en & url_set_ko

url_to_tedvideo_en = {v.url: v for v in parsed_en_videos}
url_to_tedvideo_ko = {v.url: v for v in parsed_ko_videos}

print('Detected %d/%d bilingual videos total' % (len(biling_set), len(monoling_set)))

sent_ko = []
sent_en = []

vocab_ko = set()
vocab_en = set()

# shuffle the videos into a random order on each run
biling_set = list(biling_set)
random.shuffle(biling_set)
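# Note: the shuffle is unseeded, so the train/test/dev split differs on
# every run; call random.seed() with a constant beforehand if reproducible
# splits are needed.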

for url in biling_set:
    en_video = url_to_tedvideo_en[url]
    ko_video = url_to_tedvideo_ko[url]
    transcript_en = en_video.transcriptions
    transcript_ko = ko_video.transcriptions

    # intersect the timestamps that appear in both transcripts; the set
    # comprehensions also avoid an IndexError when a transcript is empty
    biling_timestamps = ({ts for ts, _ in transcript_en} &
                         {ts for ts, _ in transcript_ko})
    transcript_en_dict = dict(transcript_en)
    transcript_ko_dict = dict(transcript_ko)

    # Timestamps don't always line up exactly between the two languages;
    # rather than skipping videos with inconsistent timestamps, keep only
    # the timestamps present in both transcripts (the intersection above).

    for ts in biling_timestamps:
        if len(transcript_ko_dict[ts]) >= ko_len_threshold and len(transcript_en_dict[ts]) >= en_len_threshold:
            sent_ko.append(transcript_ko_dict[ts])
            sent_en.append(transcript_en_dict[ts])

            vocab_ko.update(transcript_ko_dict[ts].split())
            vocab_en.update(transcript_en_dict[ts].split())

assert len(sent_ko) == len(sent_en)

# train/test/dev sets
# 80/10/10

num_train = int(len(sent_ko) * 0.80)
num_test = int(len(sent_ko) * 0.10)
num_dev = len(sent_ko) - num_train - num_test
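# e.g. 100,000 sentence pairs -> 80,000 train, 10,000 test, 10,000 dev
# (num_dev absorbs any remainder left by the int() truncations)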

train_ko = sent_ko[:num_train]
test_ko = sent_ko[num_train:num_train+num_test]
dev_ko = sent_ko[num_train+num_test:]

train_en = sent_en[:num_train]
test_en = sent_en[num_train:num_train+num_test]
dev_en = sent_en[num_train+num_test:]

assert len(train_ko) == num_train
assert len(test_ko) == num_test
assert len(dev_ko) == num_dev

assert len(train_en) == num_train
assert len(test_en) == num_test
assert len(dev_en) == num_dev

for name, data in [('train', train_ko), ('test', test_ko), ('dev', dev_ko)]:
    with open('ted_biling_%s.ko' % name, 'w', encoding='utf-8') as fd:
        for d in data:
            fd.write(d + '\n')

for name, data in [('train', train_en), ('test', test_en), ('dev', dev_en)]:
    with open('ted_biling_%s.en' % name, 'w', encoding='utf-8') as fd:
        for d in data:
            fd.write(d + '\n')

# write vocab files for NMT
std_vocab = ['<unk>', '<s>', '</s>']

# sort the vocab so the output files are deterministic across runs
with open('ted_biling_vocab.ko', 'w', encoding='utf-8') as fd:
    for v in std_vocab + sorted(vocab_ko):
        fd.write('%s\n' % v)

with open('ted_biling_vocab.en', 'w', encoding='utf-8') as fd:
    for v in std_vocab + sorted(vocab_en):
        fd.write('%s\n' % v)
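
The vocab files start with <unk>, <s> and </s> because the TensorFlow NMT example expects those special tokens at the top of each vocabulary. Since the NMT example reads the train/test/dev pairs line by line, the two sides of each split must stay line-aligned; a quick sanity check over the files written above (a sketch, using the file names from the script):

for split in ('train', 'test', 'dev'):
    with open('ted_biling_%s.ko' % split, encoding='utf-8') as f_ko, \
         open('ted_biling_%s.en' % split, encoding='utf-8') as f_en:
        n_ko = sum(1 for _ in f_ko)
        n_en = sum(1 for _ in f_en)
        assert n_ko == n_en, '%s split misaligned: %d ko vs %d en' % (split, n_ko, n_en)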

 
