#!/usr/bin/python
"""
Code Source Search Engine -- index builder.

This script generates/fills the database used by the source-code search
engine.  Indexed files are laid out on disk by numeric ID, three digits per
directory level:

    ID        = 1245
    directory = ./000/001/245/*.*

Command-line argument (exactly one is expected):
    <ID>      index only the directory that belongs to that ID
    build     index every directory below the current one
    rebuild   empty the tables first, then do a full build

The code is written in the syntax subset shared by Python 2.6+ and Python 3.

Version 0.1: 2006, October 05 by [r@rgaucher.inf]
"""

import os
import re
import sys

# In-memory index: lower-cased keyword -> list of the IDs it was found in.
database = {}

# Token separator: any run of non-word characters.
chars = re.compile(r'\W+')
# Three 3-digit path components, e.g. "000/001/245" -> ID 1245.
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')
# Start of a hexadecimal literal.  The digits after "0x" may be a-f as well
# as 0-9; a pattern limited to \d lets literals such as 0xFF into the index.
hexa = re.compile(r'0x[0-9a-f]+', re.IGNORECASE)

# Highest ID seen so far by transform().
maxID = -1


def splitchars(line):
    """Split *line* on every run of non-word characters."""
    return chars.split(line)


def _is_number(token):
    """Return True when *token* can be parsed as a decimal integer."""
    try:
        int(token)
    except ValueError:
        return False
    return True


def parsetoken(line, ID):
    """Record every keyword of *line* in ``database`` under *ID*.

    Empty tokens, decimal numbers and hexadecimal literals are not indexed.
    Keywords are stored lower-cased, and an ID is listed at most once per
    keyword.

    Returns the raw token list (including the tokens that were skipped), so
    callers can count how many tokens the line contained.
    """
    tokens = splitchars(line.strip())
    for token in tokens:
        if token == '' or _is_number(token) or hexa.match(token):
            continue
        ids = database.setdefault(token.lower(), [])
        if ID not in ids:
            ids.append(ID)
    return tokens


def process(filename, ID):
    """Index the content of *filename* under *ID*.

    Returns the number of tokens read, or False (after printing a message)
    when the file cannot be opened.
    """
    try:
        handle = open(filename, 'r')
    except IOError:
        print("Error in file %s" % filename)
        return False
    # "with" guarantees the handle is closed even if parsing raises
    with handle:
        return sum(len(parsetoken(line, ID)) for line in handle)


def transform(dirname):
    """Derive the numeric ID encoded in *dirname*.

    "./000/001/245" gives 1245.  A path that does not contain three
    consecutive 3-digit components gives 0.  ``maxID`` is raised whenever a
    larger ID is met.
    """
    global maxID
    found = pattid.search(dirname)
    # search() returns None when the path is not an ID directory
    tID = int("".join(found.groups())) if found else 0
    if tID > maxID:
        maxID = tID
    return tID


def walkdir(dirname, ID):
    """Recursively index every regular file below *dirname*.

    When *ID* is greater than -1 every file is indexed under that ID;
    otherwise the ID is derived from the directory that holds the file.
    Files whose name contains ".xml", ".sql" or ".dat" are skipped.

    Always returns True.
    """
    for name in os.listdir(dirname):
        # '/' is used on purpose: pattid expects '/'-separated components
        path = dirname + '/' + name
        if os.path.isdir(path):
            walkdir(path, ID)
        elif os.path.isfile(path):
            lowered = name.lower()
            # don't index XML, SQL or DATA files (substring match on the name)
            if any(ext in lowered for ext in ('.xml', '.sql', '.dat')):
                continue
            process(path, ID if ID > -1 else transform(dirname))
    return True


def solvedir(ID):
    """Index the directory that belongs to *ID*.

    The ID is zero-padded to nine digits and cut into three directory
    levels (1245 -> "./000/001/245/").  A negative ID walks the whole tree
    starting at "./".
    """
    folder = "./"
    if ID > -1:
        digits = "%09d" % int(ID)
        folder += "".join(digits[i:i + 3] + '/' for i in (0, 3, 6))
    walkdir(folder, ID)


def _store_index(cursor, build, ID):
    """Write ``database`` to the `words` and `words2id` tables via *cursor*."""
    # populate the `words` table with the keywords it does not hold yet;
    # values are passed as parameters so that the driver does the quoting
    for word in database:
        cursor.execute("SELECT WordID FROM words WHERE Word = %s", (word,))
        if cursor.fetchone() is None:
            cursor.execute("INSERT INTO words VALUES (NULL, %s)", (word,))

    # populate the relation between word and ID
    for word, ids in database.items():
        cursor.execute("SELECT WordID FROM words WHERE Word = %s LIMIT 1",
                       (word,))
        wordid = cursor.fetchone()[0]
        # a full build links each word to every ID it was found in, in
        # ascending order; a single-ID run links it to that ID only
        for doc_id in (sorted(ids) if build else [ID]):
            cursor.execute("REPLACE INTO words2id VALUES (NULL,%s,%s)",
                           (wordid, doc_id))


def main(argv=None):
    """Connect, fill the ``database`` dictionary, then fill the MySQL tables.

    *argv* defaults to ``sys.argv``.  Exits with status 1 when the MySQL
    connection or cursor cannot be obtained.
    """
    # Imported here so that the indexing helpers above stay importable on a
    # machine without the MySQL driver.
    import MySQLdb

    if argv is None:
        argv = sys.argv
    build = False
    rebuild = False
    ID = None

    # connect to the MySQL server
    try:
        conn = MySQLdb.connect(host="localhost", user="USERNAME",
                               passwd="PASSWORD", db="DATABASE")
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)

    # read the program argument: an ID, "build" or "rebuild"
    try:
        if len(argv) == 2:
            if argv[1] in ("build", "rebuild"):
                build = True
                rebuild = argv[1] == "rebuild"
            else:
                ID = int(argv[1])
                solvedir(ID)
        else:
            print("pyIndex.py ")
    except Exception as e:
        # best effort: report the problem and store whatever was indexed
        # (the two spaces reproduce the output of: print "Error: ", e)
        print("Error:  %s" % (e,))

    # the tables already exist, so we only need to populate them
    try:
        cursor = conn.cursor()
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)

    if rebuild:
        # flush the tables
        cursor.execute("TRUNCATE TABLE `words2id` ")
        cursor.execute("TRUNCATE TABLE `words` ")
        # then reset the auto_increment value
        cursor.execute("ALTER TABLE `words2id` AUTO_INCREMENT =1")
        cursor.execute("ALTER TABLE `words` AUTO_INCREMENT =1")

    if build:
        # scan all...
        solvedir(-1)

    _store_index(cursor, build, ID)

    cursor.close()
    conn.commit()
    conn.close()


if __name__ == '__main__':
    main()