#!/usr/bin/python
""" << Code Source Search Engine
	This script generates/fills the database used by the source-code search engine.
	The files are stored in a directory structure derived from their ID:

	Example:
		ID = 1245
		directory = ./000/001/245/*.*

	Version 0.1:  2006, October 05 by [r@rgaucher.inf]
"""
import sys,os,re
import MySQLdb

# the database is a simple dictionary mapping each keyword (lower-cased token)
# to the list of IDs in which that keyword was found
database = {}
# regular expressions used to: split a line into words, extract the ID from a
# path of the form ddd/ddd/ddd, and recognise hexadecimal literals
chars = re.compile(r'\W+')
pattid= re.compile(r'(\d{3})/(\d{3})/(\d{3})')
# a hexadecimal literal may contain the digits a-f (in either case) after the
# prefix, not only decimal digits; a token starting with such a prefix can
# never be an identifier, so matching only the start of the token is enough
hexa  = re.compile(r'0[xX][0-9a-fA-F]+')
# the highest ID seen so far (raised by transform)
maxID = -1

def splitchars(line):
	"""Split *line* on every run of non-word characters.

	The module-level ``chars`` pattern does the splitting.  The pieces are
	returned as a list, which may contain empty strings when the line starts
	or ends with a separator.
	"""
	pieces = chars.split(line)
	return pieces

def parsetoken(line,ID):
	"""Record the words of one line of source code under the identifier *ID*.

	The line is split with splitchars().  Empty pieces, tokens recognised by
	the ``hexa`` pattern and tokens that parse as integers are ignored; every
	other token is lower-cased and *ID* is added (once) to its entry in the
	module-level ``database`` dictionary.

	Returns the complete list of pieces produced by splitchars(), including
	the ones that were ignored.
	"""
	line = line.replace('\t',' ').strip()
	tokens = splitchars(line)
	for elmt in tokens:
		elmt = elmt.replace('\n','')
		# nothing to index for empty pieces and hexadecimal literals
		if elmt == '' or hexa.match(elmt):
			continue
		# plain integers are not indexed either
		try:
			int(elmt)
		except ValueError:
			pass
		else:
			continue
		word = elmt.lower().strip()
		# direct dictionary access instead of scanning database.keys(), which
		# is a full list of keys (linear search) on Python 2
		ids = database.setdefault(word, [])
		if ID not in ids:
			ids.append(ID)
	return tokens

def process(filename, ID):
	"""Index every line of the file *filename* under the identifier *ID*.

	Each line is handed to parsetoken().

	Returns False (after printing a message) when the file cannot be opened,
	otherwise the total number of pieces returned by parsetoken() over all
	the lines of the file.
	"""
	try:
		source = open(filename, 'r')
	except IOError:
		# single-argument form: prints the same text on Python 2 and 3
		print("Error in file %s" % filename)
		return False
	total = 0
	try:
		# only the count is needed, so add the lengths up instead of
		# concatenating every token list into one big list
		for line in source:
			total += len(parsetoken(line, ID))
	finally:
		# release the handle even when parsing raises
		source.close()
	return total

def transform(dirname):
	"""Convert a directory path into the numeric ID it encodes.

	The path is searched for three 3-digit components separated by '/'
	(the ``pattid`` pattern, e.g. ``000/001/245`` gives 1245).  A path that
	does not contain such a sequence yields 0.

	The module-level ``maxID`` is raised when the resulting ID is greater
	than the current value.

	Returns the ID as an integer.
	"""
	global maxID
	found = pattid.search(dirname)
	if found is None:
		# search() returns None when nothing matches; calling .groups() on it
		# would raise AttributeError, so fall back to 0 explicitly
		tID = 0
	else:
		tID = int("".join(found.groups()))
	if tID > maxID:
		maxID = tID
	return tID

def walkdir(dirname,ID):
	"""Recursively index the files found below *dirname*.

	Sub-directories are visited recursively with the same *ID*.  Files whose
	lower-cased name contains ".xml", ".sql" or ".dat" are not indexed.
	Every other file is passed to process(): with *ID* itself when *ID* is
	greater than -1, otherwise with the ID computed by transform() from
	*dirname*.

	Always returns True.
	"""
	# keep only the entries that are directories or regular files
	entries = []
	for name in os.listdir(dirname):
		candidate = os.path.join(dirname, name)
		if os.path.isdir(candidate) or os.path.isfile(candidate):
			entries.append(name)

	for name in entries:
		path = dirname + '/' + name
		if os.path.isdir(path):
			walkdir(path, ID)
			continue
		# don't index XML, SQL or DATA files
		lowered = name.lower()
		if ".xml" in lowered or ".sql" in lowered or ".dat" in lowered:
			continue
		if ID > -1:
			process(path, ID)
		else:
			process(path, transform(dirname))
	return True

def solvedir(ID):
	"""Walk the directory that corresponds to *ID*.

	When *ID* is greater than -1 it is written as nine zero-padded digits and
	cut into three groups of three, e.g. 1245 gives "./000/001/245/".
	Otherwise the walk starts from "./".  The resulting folder and *ID* are
	handed to walkdir().
	"""
	folder = "./"
	if ID > -1:
		digits = "%0.9i" % int(ID)
		# three groups of three digits, each followed by a '/'
		folder += digits[0:3] + '/' + digits[3:6] + '/' + digits[6:9] + '/'
	walkdir(folder, ID)

"""
	>>> main
	    connect, fill the 'database' dictionary, then fill the MySQL database
"""
if __name__ == '__main__':
	# Usage: pyIndex.py <ID|build|rebuild>
	#   ID       index the single directory that corresponds to this ID
	#   build    index the whole tree below "./"
	#   rebuild  same as build, after emptying the `words` and `words2id` tables
	#
	# The print(...) and "except ... as e" forms used here are valid on both
	# Python 2.6+ and Python 3.
	build   = False
	rebuild = False
	# connect to the MySQL server
	try:
		conn = MySQLdb.connect (host = "localhost",user = "USERNAME",passwd = "PASSWORD",db = "DATABASE")
	except MySQLdb.Error as e:
		print("Error %d: %s" % (e.args[0], e.args[1]))
		sys.exit (1)

	try:
		# read the program argument; for a numeric ID the 'database'
		# dictionary is filled right away
		try:
			if len(sys.argv) == 2:
				if sys.argv[1] == "build" or sys.argv[1] == "rebuild":
					build = True
					if sys.argv[1] == "rebuild":
						rebuild = True
				else:
					ID = int(sys.argv[1])
					solvedir(ID)
			else:
				print("pyIndex.py <ID|build|rebuild>")
		except Exception as e:
			# same output as the former: print "Error: ",e
			print("Error:  %s" % (e,))

		# the tables already exist, so only the rows have to be written
		try:
			cursor = conn.cursor ()
		except MySQLdb.Error as e:
			print("Error %d: %s" % (e.args[0], e.args[1]))
			sys.exit(1)

		try:
			if rebuild:
				# flush the tables
				cursor.execute("TRUNCATE TABLE `words2id` ")
				cursor.execute("TRUNCATE TABLE `words` ")
				# then reset the auto_increment value
				cursor.execute("ALTER TABLE `words2id` AUTO_INCREMENT =1")
				cursor.execute("ALTER TABLE `words` AUTO_INCREMENT =1")

			if build:
				# scan all...
				solvedir(-1)

			# add to the `words` table every word that is not in it yet;
			# the word is passed as a query parameter so that the driver
			# takes care of the quoting
			for k in database:
				cursor.execute("SELECT WordID FROM words WHERE Word = %s", (k,))
				if cursor.fetchone() is None:
					cursor.execute("INSERT INTO words VALUES (NULL, %s)", (k,))

			# populate the relation between word and ID
			for k in database:
				cursor.execute("SELECT WordID FROM words WHERE Word = %s LIMIT 1", (k,))
				wordid = cursor.fetchone()[0]
				if build:
					# every ID collected for this word, in ascending order;
					# iterating the word's own list avoids testing each
					# value of range(maxID+1) against it
					ids = sorted(database[k])
				else:
					# 'database' is only non-empty here when ID was parsed
					ids = [ID]
				for fileid in ids:
					# insert the pair (wordid, ID)
					cursor.execute("REPLACE INTO words2id VALUES (NULL,%s,%s)", (wordid, fileid))
		finally:
			cursor.close()
		# reached only when everything above succeeded
		conn.commit()
	finally:
		# release the connection on failure as well
		conn.close()