Visualize Gensim word2vec model in TensorBoard

Visualize Gensim word2vec model in TensorBoard.

Output files can be uploaded to http://projector.tensorflow.org, or served by hosting the TensorBoard projector code on your own web server together with the projector JSON config printed by the script.

'''
Output a little-endian float32 bytes file, a TSV tensor file and a labels
file from a gensim model.
Also prints the necessary portion of the projector JSON config file.
For use with TensorBoard.
'''

import argparse
import json
import logging
import os
import struct
import sys

from gensim.models import Word2Vec, Doc2Vec
# Needed so that INFO-level log messages are shown, each prefixed with a
# timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Command-line interface: one required positional argument.
# The description is built from adjacent string literals; a backslash
# continuation *inside* a literal would embed the next line's indentation
# in the text.
parser = argparse.ArgumentParser(
    description='Convert gensim Word2Vec model to '
                'TensorBoard visualization format')

# Required positional argument
parser.add_argument('input_file', type=str,
                    help='Input word2vec model file')
# Disabled: output file names are derived from the input file name instead.
#parser.add_argument('output_file', type=str,
#                    help='Base output filename of TensorBoard data files')

# Parsed at module level; the rest of the script reads ``args.input_file``.
args = parser.parse_args()

# Load the trained model from the path given on the command line.
w2vmodel = Word2Vec.load(args.input_file)

# The vocabulary mapping is taken from the model itself when it has a
# ``vocab`` attribute and from ``w2vmodel.wv`` otherwise. Only a missing
# attribute triggers the fallback; any other error (or Ctrl-C) propagates
# instead of being silently swallowed.
try:
    vocab = w2vmodel.vocab
except AttributeError:
    vocab = w2vmodel.wv.vocab

# One output row per vocabulary entry.
num_rows = len(vocab)

# Number of components in each vector.
dim = w2vmodel.vector_size

# Use only the file-name part of the input path as the stem of the output
# file names. os.path.basename applies the running platform's separator
# rules; the earlier backslash fallback lived in an ``except`` branch that
# str.split() could never trigger.
base_fn = os.path.basename(args.input_file)

# Explicit check rather than ``assert`` so it still runs under ``python -O``.
if not base_fn:
    raise ValueError(
        'Cannot derive an output file name from input path %r'
        % args.input_file)

# All output files share the stem "<model file>_<rows>_<dim>d"; only the
# suffix distinguishes the binary tensors, the TSV tensors and the labels.
_out_stem = '%s_%d_%dd' % (base_fn, num_rows, dim)

tensor_bytes_out_fn = _out_stem + '_tensors.bytes'
tensor_tsv_out_fn = _out_stem + '_tensors.tsv'
labels_out_fn = _out_stem + '_labels.tsv'

def _open_text_out(path):
    """Open *path* for text writing, as UTF-8 when ``open`` accepts it.

    Falls back to a plain ``open(path, 'w')`` when the ``encoding`` keyword
    is rejected with ``TypeError`` (presumably Python 2's built-in open).
    Any other failure, such as a permission error, propagates.
    """
    try:
        return open(path, 'w', encoding='utf-8')
    except TypeError:
        return open(path, 'w')


# Read vectors through ``.wv`` when the model has it, falling back to
# indexing the model itself. NOTE(review): gensim deprecated model-level
# ``model[word]`` in favour of ``model.wv[word]`` -- confirm against the
# gensim version in use.
_vectors = getattr(w2vmodel, 'wv', w2vmodel)

# One little-endian 4-byte float per vector component, packed a row at a time.
_row_fmt = '<%df' % dim

# A single ``with`` closes all three files even if the dump fails part-way.
with open(tensor_bytes_out_fn, 'wb') as tensor_bytes_out, \
        _open_text_out(labels_out_fn) as labels_out, \
        _open_text_out(tensor_tsv_out_fn) as tensor_tsv_out:

    # Header row of the labels (metadata) file.
    labels_out.write('word\tcount\n')

    for i, wd in enumerate(vocab):
        # Refresh the progress line on stderr every 100 words.
        if i % 100 == 0:
            sys.stderr.write('\rDump vocabulary: %d/%d (%.2f%%)' % \
                (i+1, num_rows, 100.0 * (i+1) / num_rows))

        floatvals = _vectors[wd].tolist()

        # Explicit checks rather than ``assert`` so they survive ``python -O``.
        if len(floatvals) != dim:
            raise ValueError(
                'Vector for %r has %d components, expected %d'
                % (wd, len(floatvals), dim))
        if '\t' in wd:
            raise ValueError(
                'Word %r contains a tab and would corrupt the TSV output'
                % (wd,))

        # Rows of the tensor TSV are newline-separated, with no trailing
        # newline after the last row.
        if i > 0:
            tensor_tsv_out.write('\n')

        tensor_bytes_out.write(struct.pack(_row_fmt, *floatvals))
        tensor_tsv_out.write('\t'.join('%.8f' % f for f in floatvals))

        label_line = '%s\t%s\n' % (wd, vocab[wd].count)
        try:
            labels_out.write(label_line)
        except UnicodeEncodeError:
            # The file object could not encode the text itself (presumably
            # the non-UTF-8 fallback handle): write UTF-8 bytes instead.
            labels_out.write(label_line.encode('utf-8'))

# Terminate the progress line.
sys.stderr.write('\n')
sys.stderr.flush()

# projector file
#
# Print the projector config entry describing the files written above.
# String values are encoded with json.dumps (which also supplies the
# surrounding double quotes), so a quote, backslash or non-ASCII character
# in a file name cannot produce invalid JSON. For plain ASCII names the
# output is unchanged.
print('''{
  "embeddings": [
    {
      "tensorName": %s,
      "tensorShape": [%d, %d],
      "tensorPath": %s,
      "metadataPath": %s
    }
  ],
  "modelCheckpointPath": "Demo datasets"
}''' % (
    json.dumps(base_fn),
    num_rows,
    dim,
    json.dumps(tensor_bytes_out_fn),
    json.dumps(labels_out_fn),
))

 

Leave a Reply

Your email address will not be published. Required fields are marked *