Generate Synthetic Data for LDA


This post provides a simple Python script that generates synthetic data for LDA (latent Dirichlet allocation). The generated data is useful for testing LDA inference algorithms, e.g., Gibbs sampling.

The script requires NumPy.
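For reference, the script follows the standard LDA generative process. With K topics, D documents, and symmetric hyperparameters beta and alpha (here K = 2, D = 100, beta = 0.01, alpha = 0.9):

\phi_k \sim \mathrm{Dirichlet}(\beta), \qquad k = 1, \dots, K

\theta_d \sim \mathrm{Dirichlet}(\alpha), \qquad d = 1, \dots, D

z_{d,n} \sim \mathrm{Multinomial}(\theta_d), \qquad w_{d,n} \sim \mathrm{Multinomial}(\phi_{z_{d,n}})

Each document then consists of its N = 100 sampled tokens w_{d,1}, ..., w_{d,N}.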

## This program generates synthetic data for LDA
import numpy
import numpy.random
import sys
 
## define some constants
TOPIC_N = 2
VOCABULARY_SIZE = 1000
DOC_NUM = 100
TERM_PER_DOC = 100
 
## symmetric Dirichlet hyperparameters
beta = [0.01 for i in range(VOCABULARY_SIZE)]
alpha = [0.9 for i in range(TOPIC_N)]
FILE_NAME = sys.argv[1] ## output filename prefix
 
phi = []
## generate a multinomial distribution over words for each topic
for i in range(TOPIC_N):
	topic = numpy.random.dirichlet(beta, size=1)
	phi.append(topic)
## generate words for each document
output_f = open(FILE_NAME+'.doc','w') ## documents as word_id:count pairs
z_f = open(FILE_NAME+'.z','w') ## true topic assignment counts per document
theta_f = open(FILE_NAME+'.theta','w') ## true document-topic distributions
for i in range(DOC_NUM):
	buffer = {} ## word counts for this document
	z_buffer = {} ## keep track of the true z counts
	## first sample theta
	theta = numpy.random.dirichlet(alpha, size=1)
	for j in range(TERM_PER_DOC):
		## sample z as a one-hot draw, then recover the topic index
		z = numpy.random.multinomial(1, theta[0], size=1)
		z_assignment = int(numpy.argmax(z[0]))
		if z_assignment not in z_buffer:
			z_buffer[z_assignment] = 0
		z_buffer[z_assignment] += 1
		## sample a word from topic z
		w = numpy.random.multinomial(1, phi[z_assignment][0], size=1)
		w_assignment = int(numpy.argmax(w[0]))
		if w_assignment not in buffer:
			buffer[w_assignment] = 0
		buffer[w_assignment] += 1
	## output
	output_f.write(str(i)+'\t'+str(TERM_PER_DOC)+'\t')
	for word_id, word_count in buffer.items():
		output_f.write(str(word_id)+':'+str(word_count)+' ')
	output_f.write('\n')
	z_f.write(str(i)+'\t'+str(TERM_PER_DOC)+'\t')
	for z_id, z_count in z_buffer.items():
		z_f.write(str(z_id)+':'+str(z_count)+' ')
	z_f.write('\n')
	theta_f.write(str(i)+'\t')
	for k in range(TOPIC_N):
		theta_f.write(str(k)+':'+str(theta[0][k])+' ')
	theta_f.write('\n')
z_f.close()
theta_f.close()
output_f.close()
 
## output phi
output_f = open(FILE_NAME+'.phi','w')
for i in range(TOPIC_N):
	output_f.write(str(i)+'\t')
	for j in range(VOCABULARY_SIZE):
		output_f.write(str(j)+':'+str(phi[i][0][j])+' ')
	output_f.write('\n')
output_f.close()
 
## output hyper-parameters
output_f = open(FILE_NAME+'.hyper','w')
output_f.write('TOPIC_N:'+str(TOPIC_N)+'\n')
output_f.write('VOCABULARY_SIZE:'+str(VOCABULARY_SIZE)+'\n')
output_f.write('DOC_NUM:'+str(DOC_NUM)+'\n')
output_f.write('TERM_PER_DOC:'+str(TERM_PER_DOC)+'\n')
output_f.write('alpha:'+str(alpha[0])+'\n')
output_f.write('beta:'+str(beta[0])+'\n')
output_f.close()

You can run the script with the following command:

python generate_data.py test_data

where “test_data” is the output prefix for generated files.
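The script writes five files: test_data.doc (documents as word_id:count pairs), test_data.z (true per-document topic assignment counts), test_data.theta (true document-topic distributions), test_data.phi (true topic-word distributions), and test_data.hyper (the settings used). Each line of the .doc file has the form doc_id<TAB>length<TAB>word_id:count word_id:count .... As a quick sanity check when testing an inference algorithm against this data, here is a minimal sketch of a loader for that format; the helper name load_docs is my own and not part of the script:

## minimal sketch: parse a generated .doc file back into
## {doc_id: {word_id: count}} dictionaries
def load_docs(path):
	docs = {}
	with open(path) as f:
		for line in f:
			fields = line.split('\t')
			doc_id = int(fields[0])
			counts = {}
			for pair in fields[2].split():
				word_id, count = pair.split(':')
				counts[int(word_id)] = int(count)
			docs[doc_id] = counts
	return docs

docs = load_docs('test_data.doc')
## every document should contain exactly TERM_PER_DOC = 100 words
assert all(sum(c.values()) == 100 for c in docs.values())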
