# Generate Synthetic Data for LDA

This post has a simple Python script to generate synthetic data for LDA. The purpose of this is to test some inference algorithms for LDA, e.g., Gibbs Sampling.

The script requires NumPy. (SciPy is not actually imported by the code below.)

## This program generates synthetic data for LDA (Latent Dirichlet Allocation).
## For each topic it draws a word distribution phi_k ~ Dirichlet(beta); for each
## document it draws theta_d ~ Dirichlet(alpha), then samples TERM_PER_DOC words
## via z ~ Multinomial(theta_d), w ~ Multinomial(phi_z).
import math
import random
import numpy
import numpy.random
import sys

## define some constants (corpus / model sizes)
TOPIC_N = 2
VOCABULARY_SIZE = 1000
DOC_NUM = 100
TERM_PER_DOC = 100

# symmetric Dirichlet priors: beta over words, alpha over topics
beta = [0.01 for i in range(VOCABULARY_SIZE)]
alpha = [0.9 for i in range(TOPIC_N)]


def generate(file_name):
    """Sample one synthetic LDA corpus and write it to disk.

    Output files (all prefixed by *file_name*):
      .doc   -- per-document sparse word counts: "doc_id\\tlength\\tword:count ..."
      .z     -- true per-document topic-assignment counts (same format)
      .theta -- true per-document topic proportions
      .phi   -- true per-topic word distributions
      .hyper -- the hyper-parameters used
    """
    ## generate a multinomial distribution over words for each topic
    # (public API; the old numpy.random.mtrand path is a private module)
    phi = [numpy.random.dirichlet(beta, size=1) for _ in range(TOPIC_N)]

    ## generate words for each document
    with open(file_name + '.doc', 'w') as output_f, \
         open(file_name + '.z', 'w') as z_f, \
         open(file_name + '.theta', 'w') as theta_f:
        for i in range(DOC_NUM):
            word_counts = {}   # word_id -> count for this document
            z_counts = {}      # keep track of the true z (topic -> count)
            ## first sample theta for this document
            theta = numpy.random.dirichlet(alpha, size=1)
            for _ in range(TERM_PER_DOC):
                ## sample z, a one-hot vector; argmax recovers the topic index
                z = numpy.random.multinomial(1, theta[0], size=1)
                z_assignment = int(numpy.argmax(z[0]))
                z_counts[z_assignment] = z_counts.get(z_assignment, 0) + 1
                ## sample a word from topic z (one-hot -> word index)
                w = numpy.random.multinomial(1, phi[z_assignment][0], size=1)
                w_assignment = int(numpy.argmax(w[0]))
                word_counts[w_assignment] = word_counts.get(w_assignment, 0) + 1
            ## output this document's words, true z counts, and true theta
            output_f.write(str(i)+'\t'+str(TERM_PER_DOC)+'\t')
            for word_id, word_count in word_counts.items():
                output_f.write(str(word_id)+':'+str(word_count)+' ')
            output_f.write('\n')
            z_f.write(str(i)+'\t'+str(TERM_PER_DOC)+'\t')
            for z_id, z_count in z_counts.items():
                z_f.write(str(z_id)+':'+str(z_count)+' ')
            z_f.write('\n')
            theta_f.write(str(i)+'\t')
            for k in range(TOPIC_N):
                theta_f.write(str(k)+':'+str(theta[0][k])+' ')
            theta_f.write('\n')

    ## output phi (the true topic-word distributions)
    with open(file_name + '.phi', 'w') as output_f:
        for i in range(TOPIC_N):
            output_f.write(str(i)+'\t')
            for j in range(VOCABULARY_SIZE):
                output_f.write(str(j)+':'+str(phi[i][0][j])+' ')
            output_f.write('\n')

    ## output hyper-parameters
    with open(file_name + '.hyper', 'w') as output_f:
        output_f.write('TOPIC_N:'+str(TOPIC_N)+'\n')
        output_f.write('VOCABULARY_SIZE:'+str(VOCABULARY_SIZE)+'\n')
        output_f.write('DOC_NUM:'+str(DOC_NUM)+'\n')
        output_f.write('TERM_PER_DOC:'+str(TERM_PER_DOC)+'\n')
        output_f.write('alpha:'+str(alpha[0])+'\n')
        output_f.write('beta:'+str(beta[0])+'\n')


if __name__ == '__main__':
    # usage: python generate_data.py <output_prefix>
    generate(sys.argv[1])

You can run the script by using the following command:

python generate_data.py test_data

where “test_data” is the output prefix for generated files.