Train word embeddings using a CoNLL-U corpus as input.
Depends on: CoNLL Utils
train_word_embeddings
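Example invocation, assuming the script is saved as train_word_embeddings.py (file names are illustrative):

    python train_word_embeddings.py corpus.conllu model.w2v --use-skipgram --epochs 10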
# Take a CoNLL-U corpus and train word embeddings (word2vec)
import argparse
import logging
from random import shuffle  # used by the optional per-epoch re-shuffling (commented out below)

from conll_utils import ConllFile
from gensim.models.word2vec import Word2Vec

# enable gensim's progress logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(description='Process a CoNLL-U corpus and generate a word2vec model.')
# Required positional arguments
parser.add_argument('input_file', type=str,
                    help='Input CoNLL-U corpus (UTF-8)')
parser.add_argument('output_file', type=str,
                    help='Base output filename of the word2vec model (gensim)')
parser.add_argument('--epochs', type=int, default=10,
                    help='Number of training epochs (default 10)')
parser.add_argument('--min-sentence-length', type=int, default=5,
                    help='Skip sentences shorter than this many Eojeols (full words); they are excluded from the word2vec model (default 5)')
parser.add_argument('--dimension', type=int, default=100,
                    help='word2vec: dimensionality of the feature vectors (default 100). doc2vec mode may demand a higher value.')
parser.add_argument('--window', type=int, default=5,
                    help='word2vec: maximum distance between the current and predicted word (default 5). doc2vec mode may demand a higher value.')
parser.add_argument('--workers', type=int, default=4,
                    help='word2vec: use this many worker threads to train the model (default 4)')
parser.add_argument('--min-word-occurrence', type=int, default=5,
                    help='word2vec: ignore all words with total frequency lower than this (default 5)')
parser.add_argument('--use-skipgram', action='store_true', default=False,
                    help='Use skip-gram instead of the default CBOW.')
parser.add_argument('--min-word-length', type=int, default=0,
                    help='word2vec: ignore all words shorter than this (default 0, i.e. no minimum)')
# parser.add_argument('--char2vec', action='store_true', default=False,
#                     help='Create char2vec model (make all words their own chars).')
args = parser.parse_args()

totalWordCount = 0
trainLabeledSentences = []  # plain token lists; the name appears to be a holdover from a doc2vec variant
trainingCorpus = ConllFile(keepMalformed=True,
                           checkParserConformity=False,
                           projectivize=False,
                           enableLemmaMorphemes=True,
                           compatibleJamo=True)
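# The flags above come from the CoNLL Utils module; their presumed effects are:
# keep malformed sentences, skip parser-conformity checks, do not projectivize
# trees, expose per-token morpheme lists (token.morphemes, used below), and
# normalize Korean jamo for compatibility.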
with open(args.input_file, 'r', encoding='utf-8') as fd:
    trainingCorpus.read(fd.read())
for sent in trainingCorpus.sentences:
    my_sent = []
    for token in sent.tokens:
        if args.min_word_length <= 0 or len(token.FORM) >= args.min_word_length:
            # emit each morpheme as a 'form-tag' string (fields taken from conll_utils' morpheme tuples)
            my_sent += [m[0] + '-' + m[1] for m in token.morphemes]
            # my_sent.append(token.FORM)  # alternative: train on whole Eojeol forms instead
            totalWordCount += 1
    if args.min_sentence_length <= 0 or len(my_sent) >= args.min_sentence_length:
        trainLabeledSentences.append(my_sent)
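# At this point each entry of trainLabeledSentences is a list of strings such as
# ['사람-NNG', '이-JKS', ...] (illustrative; the exact morpheme/tag inventory
# depends on conll_utils and the corpus annotation).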
print('Read %d words; kept %d sentences for training.' % (totalWordCount, len(trainLabeledSentences)))
print('Beginning to build model...')
try:
    sgFlag = 1 if args.use_skipgram else 0
    # NOTE: gensim >= 4.0 renamed 'size' to 'vector_size'; adjust if needed
    model = Word2Vec(size=args.dimension, min_count=args.min_word_occurrence,
                     window=args.window, workers=args.workers, sg=sgFlag)
    print('Building vocabulary...')
    model.build_vocab(trainLabeledSentences)
    # Alternative: shuffle sentences manually between epochs instead of relying
    # on gensim's internal epoch handling:
    # for epoch in range(args.epochs):
    #     print('Training epoch %d/%d...' % (epoch + 1, args.epochs))
    #     shuffle(trainLabeledSentences)  # in-place shuffle
    #     model.train(trainLabeledSentences, total_examples=model.corpus_count, epochs=1)
    model.train(trainLabeledSentences, total_examples=model.corpus_count, epochs=args.epochs)
    model.save(args.output_file)
    print('Model saved to %s' % args.output_file)
except Exception as inst:
    print('Unexpected error:', inst)
    print('Hint: the corpus may be too small to satisfy --min-word-occurrence.')
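Once training completes, the saved model can be loaded back with gensim's standard API. A minimal sketch (the file name and query token are illustrative; the token must follow the morpheme-tag format produced above and be present in the vocabulary):

from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('model.w2v')
# five nearest neighbours of a morpheme-tag token
print(model.wv.most_similar('사람-NNG', topn=5))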