Train word embeddings using a CoNLL-U corpus as input.
Depends on: CoNLL Utils
train_word_embeddings
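Example invocation, assuming the script is saved as train_word_embeddings.py (file names are illustrative):

    python train_word_embeddings.py corpus.conllu model.w2v --use-skipgram --epochs 10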
# Take a CoNLL-U corpus and train word embeddings (word2vec)
import argparse
import logging
from random import shuffle  # used by the optional per-epoch re-shuffling (commented out below)

from conll_utils import ConllFile
from gensim.models.word2vec import Word2Vec

# enable gensim's progress logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(description='Process a CoNLL-U corpus and generate a word2vec model.')
# Required positional arguments
parser.add_argument('input_file', type=str,
                    help='Input CoNLL-U corpus (UTF-8)')
parser.add_argument('output_file', type=str,
                    help='Base output filename of the word2vec model (gensim)')
parser.add_argument('--epochs', type=int, default=10,
                    help='Number of training epochs (default 10)')
parser.add_argument('--min-sentence-length', type=int, default=5,
                    help='Skip sentences shorter than this many Eojeols (full words); they are excluded from the word2vec model (default 5)')
parser.add_argument('--dimension', type=int, default=100,
                    help='word2vec: dimensionality of the feature vectors (default 100). doc2vec mode may demand a higher value.')
parser.add_argument('--window', type=int, default=5,
                    help='word2vec: maximum distance between the current and predicted word (default 5). doc2vec mode may demand a higher value.')
parser.add_argument('--workers', type=int, default=4,
                    help='word2vec: use this many worker threads to train the model (default 4)')
parser.add_argument('--min-word-occurrence', type=int, default=5,
                    help='word2vec: ignore all words with total frequency lower than this (default 5)')
parser.add_argument('--use-skipgram', action='store_true', default=False,
                    help='Use skip-gram instead of the default CBOW.')
parser.add_argument('--min-word-length', type=int, default=0,
                    help='word2vec: ignore all words shorter than this (default 0, i.e. no minimum)')
# parser.add_argument('--char2vec', action='store_true', default=False,
#                     help='Create char2vec model (make all words their own chars).')
args = parser.parse_args()

totalWordCount = 0
trainLabeledSentences = []  # plain token lists; the name appears to be a holdover from a doc2vec variant
trainingCorpus = ConllFile(keepMalformed=True,
                           checkParserConformity=False,
                           projectivize=False,
                           enableLemmaMorphemes=True,
                           compatibleJamo=True)
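# The flags above come from the CoNLL Utils module; their presumed effects are:
# keep malformed sentences, skip parser-conformity checks, do not projectivize
# trees, expose per-token morpheme lists (token.morphemes, used below), and
# normalize Korean jamo for compatibility.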
with open(args.input_file, 'r', encoding='utf-8') as fd:
    trainingCorpus.read(fd.read())
for sent in trainingCorpus.sentences:
    my_sent = []
    for token in sent.tokens:
        if args.min_word_length <= 0 or len(token.FORM) >= args.min_word_length:
            # emit each morpheme as a 'form-tag' string (fields taken from conll_utils' morpheme tuples)
            my_sent += [m[0] + '-' + m[1] for m in token.morphemes]
            # my_sent.append(token.FORM)  # alternative: train on whole Eojeol forms instead
            totalWordCount += 1
    if args.min_sentence_length <= 0 or len(my_sent) >= args.min_sentence_length:
        trainLabeledSentences.append(my_sent)
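# At this point each entry of trainLabeledSentences is a list of strings such as
# ['사람-NNG', '이-JKS', ...] (illustrative; the exact morpheme/tag inventory
# depends on conll_utils and the corpus annotation).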
print('Read %d words; kept %d sentences for training.' % (totalWordCount, len(trainLabeledSentences)))
print('Beginning to build model...')
try:
    sgFlag = 1 if args.use_skipgram else 0
    # NOTE: gensim >= 4.0 renamed 'size' to 'vector_size'; adjust if needed
    model = Word2Vec(size=args.dimension, min_count=args.min_word_occurrence,
                     window=args.window, workers=args.workers, sg=sgFlag)
    print('Building vocabulary...')
    model.build_vocab(trainLabeledSentences)
    # Alternative: shuffle sentences manually between epochs instead of relying
    # on gensim's internal epoch handling:
    # for epoch in range(args.epochs):
    #     print('Training epoch %d/%d...' % (epoch + 1, args.epochs))
    #     shuffle(trainLabeledSentences)  # in-place shuffle
    #     model.train(trainLabeledSentences, total_examples=model.corpus_count, epochs=1)
    model.train(trainLabeledSentences, total_examples=model.corpus_count, epochs=args.epochs)
    model.save(args.output_file)
    print('Model saved to %s' % args.output_file)
except Exception as inst:
    print('Unexpected error:', inst)
    print('Hint: the corpus may be too small to satisfy --min-word-occurrence.')
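Once training completes, the saved model can be loaded back with gensim's standard API. A minimal sketch (the file name and query token are illustrative; the token must follow the morpheme-tag format produced above and be present in the vocabulary):

from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('model.w2v')
# five nearest neighbours of a morpheme-tag token
print(model.wv.most_similar('사람-NNG', topn=5))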