#!/usr/bin/env python """ Spider Module for Grabber v0.1 Copyright (C) 2006 - Romain Gaucher - http://rgaucher.info """ import urllib import time import re,sys,os,string from BeautifulSoup import BeautifulSoup,SoupStrainer from urllib2 import URLError, HTTPError COOKIEFILE = 'cookies.lwp' # the path and filename that you want to use to save your cookies in import os.path cj = None ClientCookie = None cookielib = None import cookielib import urllib2 urlopen = urllib2.urlopen cj = cookielib.LWPCookieJar() # This is a subclass of FileCookieJar that has useful load and save methods Request = urllib2.Request txdata = None refererUrl = "http://google.com/?q=you!" txheaders = {'User-agent' : 'Grabber/0.1 (X11; U; Linux i686; en-US; rv:1.7)', 'Referer' : refererUrl} allowed=['php','html','htm','xml','xhtml','xht','xhtm', 'asp','aspx','msp','mspx','php3','php4','php5','txt','shtm', 'shtml','phtm','phtml','jhtml','pl','jsp','cfm','cfml','do','py', 'js', 'css'] database = {} database_url = [] database_css = [] database_js = [] database_ext = [] # database of unsecure external links local_url = [] dumb_params = [] # if there is no parameters associated with a given URL, associate this list of "whatever looks like" root = "http://localhost" outSpiderFile = None """ database = { u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }}, u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }}, u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }} } """ _urlEncode = {} for i in range(256): _urlEncode[chr(i)] = '%%%02x' % i for c in string.letters + string.digits + '_,.-/': _urlEncode[c] = c _urlEncode[' '] = '+' def urlEncode(s): """ Returns the encoded version of the given string, safe for using as a URL. """ return string.join(map(lambda c: _urlEncode[c], list(s)), '') def urlDecode(s): """ Returns the decoded version of the given string. Note that invalid URLs will throw exceptons. For example, a URL whose % coding is incorrect. """ mychr = chr atoi = string.atoi parts = string.split(string.replace(s, '+', ' '), '%') for i in range(1, len(parts)): part = parts[i] parts[i] = mychr(atoi(part[:2], 16)) + part[2:] return string.join(parts, '') def htmlencode(s): """ Escaping the HTML special characters """ s = s.replace("&", "&") s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace("\"",""") s = s.replace("'", "'") return s def htmldecode(s): """ Unescaping the HTML special characters """ s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace(""", "\"") s = s.replace("'","'") s = s.replace("&", "&") return s def getContentDirectURL_GET(url, string): """ Get the content of the url by GET method """ ret = "" try: if len(string) > 0: url = url + "?" + (string) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) req = Request(url, None, txheaders) # create a request object ret = urlopen(req) # and open it to return a handle on the url except HTTPError, e: return except URLError, e: return except IOError: return return ret def scan(currentURL): """ The Scanner is the first part of Grabber. It retrieves every information of the HTML page TODO: Reading in every href='' element for CSS and src='' for JavaScript / Image """ try: archives_hDl = getContentDirectURL_GET(currentURL,'') except IOError: log <= ("IOError @ %s" % currentURL) try: htmlContent= archives_hDl.read() except IOError, e: print "Cannot open the file,",(e.strerror) return except AttributeError: print ("Grabber cannot retrieve the given url: %s" % currentURL) return parseHtmlLinks (currentURL,htmlContent) parseHtmlParams(currentURL,htmlContent) def allowedExtensions(plop): for e in allowed: if '.'+e in plop: return True return False def makeRoot(urlLocal): if allowedExtensions(urlLocal): return urlLocal[0:urlLocal.rfind('/')+1] return urlLocal def giveGoodURL(href, urlLocal): """ It should return a good url... href = argument retrieven from the href... """ if 'javascript' in href: return htmldecode(urlLocal) if 'http://' in href or 'https://' in href: if urlLocal in href: return htmldecode(href) else: return urlLocal if len(href) < 1: return htmldecode(urlLocal) if href[0] == '?' and '?' not in urlLocal and not allowedExtensions(urlLocal): for e in allowed: if '.'+e in urlLocal: return htmldecode(urlLocal + href) return htmldecode(urlLocal + '/' + href) else: # simple name if allowedExtensions(urlLocal) or '?' in urlLocal: return htmldecode(urlLocal[0:urlLocal.rfind('/')+1] + href) else: return htmldecode(urlLocal + '/' + href) return htmldecode(href) def dl(fileAdress, destFile): """ Download the file """ try: f = urllib.urlopen(fileAdress) g = f.read() file = open(os.path.join('./', destFile), "wb") except IOError: return False file.write(g) file.close() return True def removeSESSID(urlssid): """ Remove the phpsessid information... don't care about it now """ k = urlssid.find('PHPSESSID') if k > 0: return urlssid[0:k-1] k = urlssid.find('sid') if k > 0: return urlssid[0:k-1] return urlssid def parseHtmlLinks(currentURL,htmlContent): global database_url,database_js,database_css """ Parse the HTML/XHTML code to get JS, CSS, links etc. """ links = SoupStrainer('a') # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)] listAnchors = [] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links): try: string = str(tag).lower() if string.count("href") > 0: listAnchors.append(tag['href']) except TypeError: continue except KeyError: continue for a in listAnchors: goodA = giveGoodURL(a,currentURL) goodA = removeSESSID(goodA) if (root in goodA) and (goodA not in database_url): database_url.append(goodA) # parse the CSS and the JavaScript script = SoupStrainer('script') #listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)] listScripts = [] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script): try: string = str(tag).lower() if string.count("src") > 0 and string.count(".src") < 1: listScripts.append(tag['src']) except TypeError: continue except KeyError: continue for a in listScripts: sc = giveGoodURL(a,currentURL) if sc not in database_js: database_js.append(sc) if sc == currentURL: # remote script database_ext.append(sc) parseJavaScriptCalls() link = SoupStrainer('link') # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)] listLinks = [] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link): try: string = str(tag).lower() if string.count("href") > 0: listLinks.append(tag['href']) except TypeError: continue except KeyError: continue for a in listLinks: sc = giveGoodURL(a,currentURL) if sc not in database_css: database_css.append(sc) return True jsChars = ["'",'"'] def rfindFirstJSChars(string): b = [string.rfind(k) for k in jsChars] return max(b) regDumbParam = re.compile(r'(\w+)') regDumbParamNumber = re.compile(r'(\d+)') jsParams = ["'",'"','=','+','%','\\',')','(','^','*','-'] def cleanListDumbParams(listDumb): newDumbList = [] for w in listDumb: w = w.replace(' ','') w = w.replace('\n','') #l = [c for c in jsParams if c in w] # no jsParams if len(w) > 0 and regDumbParam.match(w) and not regDumbParamNumber.match(w): newDumbList.append(w) return newDumbList def unique(L): noDupli=[] [noDupli.append(i) for i in L if not noDupli.count(i)] return noDupli def flatten(L): if type(L) != type([]): return [L] if L == []: return L return reduce(lambda L1,L2:L1+L2,map(flatten,L)) def parseJavaScriptContent(jsContent): global database_url, database_ext, dumb_params """ Parse the content of a JavaScript file """ for l in jsContent.readlines(): for e in allowed: if l.count('.'+e) > 0: # we found an external a call if l.count('http://') > 0 and l.count(root) < 1: # External link et= '.'+e b1 = l.find('http://') b2 = l.find(et) + len(et) database_ext.append(l[b1:b2]) else: # Internal link et= '.'+e b2 = l.find(et) + len(et) b1 = rfindFirstJSChars(l[:b2])+1 database_url.append(giveGoodURL(l[b1:b2],root)) # try to get a parameter k = l.find('?') if k > 0: results = l[k:].split('?') plop = [] for a in results: plop.append(cleanListDumbParams(regDumbParam.split(a))) dumb_params.append(flatten(plop)) k = l.find('&') if k > 0: results = l[k:].split('&') plop = [] for a in results: plop.append(cleanListDumbParams(regDumbParam.split(a))) plop = flatten(plop) dumb_params.append(flatten(plop)) dumb_params = unique(flatten(dumb_params)) def parseJavaScriptCalls(): global database_js """ Parse the JavaScript and download the files """ for j in database_js: jsName = j[j.rfind('/')+1:] if not os.path.exists('local/js/' + jsName): # first download the file dl(j,'local/js/' + jsName) try: jsContent = open('local/js/' + jsName, 'r') except IOError: continue parseJavaScriptContent(jsContent) jsContent.close() def splitQuery(query_string): """ Split the num=plop&truc=kikoo&o=42 into a dictionary """ try: d = dict([x.split('=') for x in query_string.split('&') ]) except ValueError: d = {} return d def dict_add(d1,d2): """ Flatten 2 dictionaries """ d={} if len(d1): for s in d1.keys(): d[s] = d1[s] if len(d2): for s in d2.keys(): d[s] = d2[s] return d def dict_add_list(d1,l1): d={} if len(d1): for s in d1.keys(): d[s] = d1[s] if len(l1): for s in l1: d[s] = 'bar' return d def parseHtmlParams(currentURL, htmlContent): global database, database_css, database_js """ Parse html to get args """ for url in database_url: k = url.find('?') if k > 0: keyUrl = url[0:k-1] query = url[k+1:] if not keyUrl in database: database[keyUrl] = {} database[keyUrl]['GET'] = {} database[keyUrl]['POST'] = {} lG = database[keyUrl]['GET'] lG = dict_add(lG,splitQuery(query)) database[keyUrl]['GET'] = lG elif len(dumb_params) > 0: keyUrl = url # no params in the URL... let's assign the dumb_params if not keyUrl in database: database[keyUrl] = {} database[keyUrl]['GET'] = {} database[keyUrl]['POST'] = {} lG = database[keyUrl]['GET'] lP = database[keyUrl]['POST'] lG = dict_add_list(lG,dumb_params) lP = dict_add_list(lP,dumb_params) database[keyUrl]['GET'] = lG database[keyUrl]['POST'] = lP # then, parse the forms forms = SoupStrainer('form') input = SoupStrainer('input') listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)] for f in listForm: method = 'GET' if 'method' in f or 'METHOD' in f: method = f['method'].upper() action = currentURL if 'action' in f or 'ACTION' in f: action = f['action'] keyUrl = giveGoodURL(action,currentURL) listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=input)] for i in listInput: if not keyUrl in database: database[keyUrl] = {} database[keyUrl]['GET'] = {} database[keyUrl]['POST'] = {} try: value = i['value'] except KeyError: value = '42' try: name = i['name'] except KeyError: name = 'foo' value= 'bar' continue lGP = database[keyUrl][method] lGP = dict_add(lGP,{name : value}) database[keyUrl][method] = lGP return True def runSpiderScan(entryUrl, depth = 0): global outSpiderFile print "runSpiderScan @ ", entryUrl, " | #",depth if outSpiderFile: outSpiderFile.write("\t\t%s\n" % entryUrl) scan(entryUrl) if depth > 0 and len(database_url) > 0: for a in database_url: runSpiderScan(a, depth-1) return False return True def spider(entryUrl, depth = 0): global root,outSpiderFile """ Retrieve every links """ if depth > 0: root = makeRoot(entryUrl) else: root = entryUrl # test if the spider has already be done on this website try: f = open("local/spiderSite.xml", 'r') firstLine = f.readline() f.close() if firstLine.count(root) > 0: alreadyScanned = True else: alreadyScanned = False except IOError: alreadyScanned = False print "Start scanning...", root if depth == 0: scan(root) else: if not alreadyScanned: outSpiderFile = open("local/spiderSite.xml","w") outSpiderFile.write("\n" % (root,depth) ) runSpiderScan(root, depth) if len(dumb_params) > 0: outSpiderFile.write("\n") for d in dumb_params: outSpiderFile.write("\t%s\n" % (d)) outSpiderFile.write("\n") outSpiderFile.write("\n") outSpiderFile.close() else: print "Loading the previous spider results from 'local/spiderSite.xml'" # load the XML file regUrl = re.compile(r'(.*)(.*)(.*)',re.I) regDmb = re.compile(r'(.*)(.*)(.*)',re.I) f = open("local/spiderSite.xml", 'r') for l in f.readlines(): if regUrl.match(l): out = regUrl.search(l) url = out.group(2) database_url.append(url) if regDmb.match(l): out = regDmb.search(l) param = out.group(2) dumb_params.append(param) f.close() # scan every url for currentURL in database_url: try: archives_hDl = getContentDirectURL_GET(currentURL,'') except IOError: log <= ("IOError @ %s" % currentURL) continue try: htmlContent= archives_hDl.read() except IOError, e: continue except AttributeError, e: continue parseHtmlParams(currentURL,htmlContent) outSpiderFile = open("results/touchFiles.xml","w") outSpiderFile.write("\n" % root) for i in database_url: outSpiderFile.write("\t%s\n" % i) for i in database_js: outSpiderFile.write("\t%s\n" % i) for i in database_css: outSpiderFile.write("\t%s\n" % i) outSpiderFile.write("") outSpiderFile.close() if len(database_ext) > 0: # alert of External calls outSpiderFile = open("results/externalCalls.xml","w") outSpiderFile.write("\n") for i in database_ext: outSpiderFile.write("\t%s\n" % i) outSpiderFile.write("") outSpiderFile.close()