Visualize a gensim word2vec model in TensorBoard.
The TSV outputs (tensors and labels) can be uploaded directly to http://projector.tensorflow.org; alternatively, host the TensorBoard projector on your own web server and point it at the .bytes file via the projector JSON config printed by the script. A quick read-back check of the binary output is sketched after the script.
'''
Output a little-endian float32 bytes file and a labels file from a gensim model.
Also outputs the JSON config portion needed by the TensorBoard projector.
'''
import os
import sys
import struct
import logging
import argparse

from gensim.models import Word2Vec

# necessary for seeing gensim's loading logs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser(
    description='Convert gensim Word2Vec model to TensorBoard visualization format')
# Required positional argument
parser.add_argument('input_file', type=str,
                    help='Input word2vec model file')
#parser.add_argument('output_file', type=str,
#                    help='Base output filename of TensorBoard data files')
args = parser.parse_args()

w2vmodel = Word2Vec.load(args.input_file)

# Newer gensim keeps the vocabulary (and the vectors) on model.wv;
# older versions expose them on the model itself.
try:
    vocab = w2vmodel.wv.vocab
    vectors = w2vmodel.wv
except AttributeError:
    vocab = w2vmodel.vocab
    vectors = w2vmodel
num_rows = len(vocab)
dim = w2vmodel.vector_size

# Name the output files after the model file; os.path.basename handles both
# '/' and '\\' path separators.
base_fn = os.path.basename(args.input_file)
assert len(base_fn) > 0

tensor_bytes_out_fn = '%s_%d_%dd_tensors.bytes' % (base_fn, num_rows, dim)
tensor_tsv_out_fn = '%s_%d_%dd_tensors.tsv' % (base_fn, num_rows, dim)
labels_out_fn = '%s_%d_%dd_labels.tsv' % (base_fn, num_rows, dim)

tensor_bytes_out = open(tensor_bytes_out_fn, 'wb')
# Open the text outputs as UTF-8; fall back for Python 2, whose open() has no
# 'encoding' argument.
try:
    labels_out = open(labels_out_fn, 'w', encoding='utf-8')
    tensor_tsv_out = open(tensor_tsv_out_fn, 'w', encoding='utf-8')
except TypeError:
    labels_out = open(labels_out_fn, 'w')
    tensor_tsv_out = open(tensor_tsv_out_fn, 'w')

labels_out.write('word\tcount\n')
for i, wd in enumerate(vocab):
    if i % 100 == 0:
        sys.stderr.write('\rDump vocabulary: %d/%d (%.2f%%)' %
                         (i + 1, num_rows, 100.0 * (i + 1) / num_rows))
    floatvals = vectors[wd].tolist()
    assert dim == len(floatvals)
    assert '\t' not in wd
    if i > 0:
        tensor_tsv_out.write('\n')
    for f_i, f in enumerate(floatvals):
        # .bytes output: one little-endian float32 per dimension, row-major.
        tensor_bytes_out.write(struct.pack('<f', f))
        # .tsv output: tab-separated values, one word per line.
        if f_i > 0:
            tensor_tsv_out.write('\t%.8f' % f)
        else:
            tensor_tsv_out.write('%.8f' % f)
    try:
        labels_out.write('%s\t%s\n' % (wd, vocab[wd].count))
    except UnicodeEncodeError:
        # Python 2 fallback: encode non-ASCII words explicitly.
        labels_out.write(('%s\t%s\n' % (wd, vocab[wd].count)).encode('utf-8'))
sys.stderr.write('\n')
sys.stderr.flush()

tensor_bytes_out.close()
tensor_tsv_out.close()
labels_out.close()

# Projector config snippet: paste this into the projector_config.json served
# alongside the .bytes and labels files.
print('''{
  "embeddings": [
    {
      "tensorName": "%s",
      "tensorShape": [%d, %d],
      "tensorPath": "%s",
      "metadataPath": "%s"
    }
  ],
  "modelCheckpointPath": "Demo datasets"
}''' % (base_fn, num_rows, dim, tensor_bytes_out_fn, labels_out_fn))
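
As a quick sanity check (a minimal sketch, not part of the script above; the filenames are hypothetical placeholders), the .bytes file can be read back with numpy, since it is just num_rows * dim little-endian float32 values written in the same row order as the labels file, and one row can be compared against the model:

import numpy as np
from gensim.models import Word2Vec

# Hypothetical paths: substitute your model file and the filenames the script wrote.
model = Word2Vec.load('mymodel.w2v')
vecs = np.fromfile('mymodel.w2v_10000_100d_tensors.bytes', dtype='<f4')
vecs = vecs.reshape(-1, model.vector_size)

with open('mymodel.w2v_10000_100d_labels.tsv', encoding='utf-8') as f:
    f.readline()                                        # skip the 'word\tcount' header
    first_word = f.readline().rstrip('\n').split('\t')[0]

# Rows in the .bytes file follow the label rows, so row 0 should match the
# first labeled word's vector (model.wv assumes gensim >= 1.0).
print(vecs.shape)
print(np.allclose(vecs[0], model.wv[first_word]))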