Completely rewrote the table sampling code...

...to work with multiple tables and cleaned it up significantly in the
process.
This commit is contained in:
Gregory Shikhman 2009-09-01 05:30:56 +00:00
parent e32d3dd0d0
commit b55784c099

View file

@@ -23,49 +23,33 @@ def sample(size):
conn = MySQLdb.connect(configuration.DB_HOSTNAME,configuration.DB_USERNAME,configuration.DB_PASSWORD,configuration.DB_NAME)
curs = conn.cursor()
#find the maximum game_id in the DB
curs.execute("SELECT MAX(game_id) FROM GAMES")
curs.execute("SELECT DISTINCT `table` FROM _wsviews")
results = curs.fetchall()
max_id = results[0][0]
#look for an existing table with this sample size and drop it if it exists, then create a new one
tblname = configuration.DB_TABLE_PREFIX+TBLSTRING+str(size)
curs.execute("SELECT count(*) FROM information_schema.tables WHERE table_schema = 'corn' AND table_name = '"+tblname+"'")
results = curs.fetchall()
exists = results[0][0] == 1
if exists:
curs.execute("DROP TABLE IF EXISTS "+tblname)
curs.execute("""
CREATE TABLE `"""+tblname+"""` (
`game_id` int(11) NOT NULL auto_increment,
`timestamp` datetime NOT NULL,
`user_id` char(32) NOT NULL,
`serial` char(18) NOT NULL,
`platform` char(8) default NULL,
`version` char(14) default NULL,
`campaign` char(40) default NULL,
`difficulty` char(20) default NULL,
`gold` int(11) default NULL,
`turns` int(11) default NULL,
`scenario` char(40) default NULL,
`start_turn` int(11) default NULL,
`time` int(11) default NULL,
`result` enum('victory','defeat','quit') default NULL,
`end_time` int(11) default NULL,
`end_gold` int(11) default NULL,
`end_turn` int(11) default NULL,
PRIMARY KEY (`game_id`)
) ENGINE=MyISAM AUTO_INCREMENT=2450740 DEFAULT CHARSET=utf8 """)
#randomly pick size number of entries from the main DB and put them into this sample
choices = random.sample(range(1,max_id),size)
for c in choices:
curs.execute("SELECT * FROM GAMES WHERE `game_id`=%s",c)
for tbl in results:
tbl = tbl[0]
sample_tbl = "%s%s%s%d" % (configuration.DB_TABLE_PREFIX,TBLSTRING,tbl,size)
curs.execute("DROP TABLE IF EXISTS %s" % (sample_tbl))
curs.execute("CREATE TABLE %s LIKE %s" % (sample_tbl,tbl))
curs.execute("SELECT MAX(game_id) FROM %s" % (tbl,)) #we assume game_id is always the primary key
results = curs.fetchall()
if len(results) != 0:
#print results[0]
curs.execute("""INSERT INTO %s (game_id,timestamp,user_id,serial,platform,version,
campaign,difficulty,gold,turns,scenario,start_turn,time,result,end_time,end_gold,end_turn) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",results[0])
max_id = int(results[0][0])
if max_id < size:
size = max_id
choices = random.sample(range(0,max_id),size)
inserts = 0
for c in choices:
curs.execute("SELECT * FROM %s WHERE game_id=%d" % (tbl,c))
results = curs.fetchall()
if len(results) != 0:
query = "INSERT INTO %s SELECT * FROM %s WHERE game_id=%d" % (sample_tbl,tbl,c)
curs.execute(query)
conn.commit()
conn.close()
sample(10000)
sample(100000)