Visualize Gensim word2vec model in TensorBoard

Visualize Gensim word2vec model in TensorBoard.

The output files can be uploaded to http://projector.tensorflow.org, or you can host the TensorBoard projector code on your own web server and point it at the output files using the projector JSON config that the script prints.

  1. '''
  2. Output Little-Endian bytes and labels file from gensim model
  3. Also outputs necessary json config file portion
  4. For use with TensorBoard
  5. '''
  6. import os
  7. import sys
  8. import struct
  9. import logging
  10. import argparse
  11. from gensim.models import Word2Vec, Doc2Vec
  12. # necessary for seeing logs
  13. logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', \
  14. level=logging.INFO)
  15. parser = argparse.ArgumentParser(description='Convert gensim Word2Vec model to \
  16. TensorBoard visualization format')
  17. # Required positional argument
  18. parser.add_argument('input_file', type=str,
  19. help='Input word2vec model file')
  20. #parser.add_argument('output_file', type=str,
  21. # help='Base output filename of TensorBoard data files')
  22. args = parser.parse_args()
  23. w2vmodel = Word2Vec.load(args.input_file)
  24. try:
  25. vocab = w2vmodel.vocab
  26. except:
  27. vocab = w2vmodel.wv.vocab
  28. num_rows = len(vocab)
  29. dim = w2vmodel.vector_size
  30. try:
  31. base_fn = args.input_file.split('/')[-1]
  32. except:
  33. base_fn = args.input_file.split('\\')[-1]
  34. assert len(base_fn) > 0
  35. tensor_bytes_out_fn = '%s_%d_%dd_tensors.bytes' % (base_fn, num_rows, dim)
  36. tensor_tsv_out_fn = '%s_%d_%dd_tensors.tsv' % (base_fn, num_rows, dim)
  37. labels_out_fn = '%s_%d_%dd_labels.tsv' % (base_fn, num_rows, dim)
  38. tensor_bytes_out = open(tensor_bytes_out_fn, 'wb')
  39. try:
  40. labels_out = open(labels_out_fn, 'w', encoding='utf-8')
  41. except:
  42. labels_out = open(labels_out_fn, 'w')
  43. try:
  44. tensor_tsv_out = open(tensor_tsv_out_fn, 'w', encoding='utf-8')
  45. except:
  46. tensor_tsv_out = open(tensor_tsv_out_fn, 'w')
  47. labels_out.write('word\tcount\n')
  48. for i, wd in enumerate(vocab):
  49. if i % 100 == 0:
  50. sys.stderr.write('\rDump vocabulary: %d/%d (%.2f%%)' % \
  51. (i+1, num_rows, 100.0 * (i+1) / num_rows))
  52. floatvals = w2vmodel[wd].tolist()
  53. assert dim == len(floatvals)
  54. assert '\t' not in wd
  55. if i > 0:
  56. tensor_tsv_out.write('\n')
  57. for f_i, f in enumerate(floatvals):
  58. tensor_bytes_out.write(struct.pack('<f', f))
  59. if f_i > 0:
  60. tensor_tsv_out.write('\t%.8f' % f)
  61. else:
  62. tensor_tsv_out.write('%.8f' % f)
  63. try:
  64. labels_out.write('%s\t%s\n' % (wd, vocab[wd].count))
  65. except:
  66. labels_out.write(('%s\t%s\n' % (wd, vocab[wd].count)).encode('utf-8'))
  67. sys.stderr.write('\n')
  68. sys.stderr.flush()
  69. tensor_bytes_out.close()
  70. tensor_tsv_out.close()
  71. labels_out.close()
  72. # projector file
  73. print('''{
  74. "embeddings": [
  75. {
  76. "tensorName": "%s",
  77. "tensorShape": [%d, %d],
  78. "tensorPath": "%s",
  79. "metadataPath": "%s"
  80. }
  81. ],
  82. "modelCheckpointPath": "Demo datasets"
  83. }''' % (base_fn, num_rows, dim, tensor_bytes_out_fn, labels_out_fn))
'''
Output Little-Endian bytes and labels file from gensim model
Also outputs necessary json config file portion
For use with TensorBoard
'''

import os
import sys
import struct
import logging
import argparse
from gensim.models import Word2Vec, Doc2Vec
# necessary for seeing logs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser(description='Convert gensim Word2Vec model to \
    TensorBoard visualization format')

# Required positional argument
parser.add_argument('input_file', type=str,
                    help='Input word2vec model file')
#parser.add_argument('output_file', type=str,
#                    help='Base output filename of TensorBoard data files')

args = parser.parse_args()

w2vmodel = Word2Vec.load(args.input_file)

# gensim >= 1.0 keeps the vectors on `model.wv`; earlier releases index the
# model object directly.  Going through `wv` when it exists avoids the
# deprecated (and, in gensim 4, removed) `model[word]` lookup.
keyed_vectors = getattr(w2vmodel, 'wv', w2vmodel)

# Locate the vocabulary mapping across gensim generations.  Only
# AttributeError is caught so unrelated failures are not masked.
vocab_has_counts = True
try:
    vocab = w2vmodel.vocab                  # gensim < 1.0
except AttributeError:
    try:
        vocab = w2vmodel.wv.vocab           # gensim 1.x - 3.x
    except AttributeError:
        # gensim >= 4.0: `vocab` was replaced by `key_to_index` (word -> row
        # index); per-word counts are read with `get_vecattr` instead.
        vocab = w2vmodel.wv.key_to_index
        vocab_has_counts = False


def _word_count(word):
    """Return the frequency count stored in the model for `word`."""
    if vocab_has_counts:
        return vocab[word].count
    return keyed_vectors.get_vecattr(word, 'count')


def _open_text(path):
    """Open `path` for text writing, as UTF-8 where open() supports it."""
    try:
        return open(path, 'w', encoding='utf-8')
    except TypeError:
        # Python 2's built-in open() has no `encoding` keyword.
        return open(path, 'w')


num_rows = len(vocab)

dim = w2vmodel.vector_size

# os.path.basename honours the platform's separators.  The previous
# try/except around str.split('/') could never raise, so its backslash
# fallback was dead code and Windows paths ended up in the output names.
base_fn = os.path.basename(args.input_file)

assert len(base_fn) > 0, 'input_file must name a file, not a directory'

tensor_bytes_out_fn = '%s_%d_%dd_tensors.bytes' % (base_fn, num_rows, dim)
tensor_tsv_out_fn = '%s_%d_%dd_tensors.tsv' % (base_fn, num_rows, dim)
labels_out_fn = '%s_%d_%dd_labels.tsv' % (base_fn, num_rows, dim)

# One little-endian float32 per dimension; packing a whole row at once emits
# the same bytes as packing each value with '<f'.
row_packer = struct.Struct('<%df' % dim)

# The context managers close all three files even if the dump fails midway.
with open(tensor_bytes_out_fn, 'wb') as tensor_bytes_out, \
        _open_text(labels_out_fn) as labels_out, \
        _open_text(tensor_tsv_out_fn) as tensor_tsv_out:

    labels_out.write('word\tcount\n')

    for i, wd in enumerate(vocab):
        if i % 100 == 0:
            sys.stderr.write('\rDump vocabulary: %d/%d (%.2f%%)' %
                             (i+1, num_rows, 100.0 * (i+1) / num_rows))

        floatvals = keyed_vectors[wd].tolist()
        assert dim == len(floatvals), 'vector length differs from vector_size'
        # A tab inside a word would corrupt the tab-separated labels file.
        assert '\t' not in wd, 'vocabulary word contains a tab character'

        tensor_bytes_out.write(row_packer.pack(*floatvals))

        # Rows are newline-separated, with no trailing newline after the last.
        if i > 0:
            tensor_tsv_out.write('\n')
        tensor_tsv_out.write('\t'.join('%.8f' % f for f in floatvals))

        label_line = '%s\t%s\n' % (wd, _word_count(wd))
        try:
            labels_out.write(label_line)
        except UnicodeEncodeError:
            # Python 2 byte-mode file: encode non-ASCII words explicitly.
            labels_out.write(label_line.encode('utf-8'))

    sys.stderr.write('\n')
    sys.stderr.flush()

# projector file

print('''{
  "embeddings": [
    {
      "tensorName": "%s",
      "tensorShape": [%d, %d],
      "tensorPath": "%s",
      "metadataPath": "%s"
    }
  ],
  "modelCheckpointPath": "Demo datasets"
}''' % (base_fn, num_rows, dim, tensor_bytes_out_fn, labels_out_fn))

 

Leave a Reply

Your email address will not be published. Required fields are marked *