Python URL Term Scraper (Kind of Like a Very Dumb/Simple Watson)

I wrote this a while back to scrape a given URL for all the links it contains and then search those links for term relations. The Python scraper first finds all links on the given page, then searches each linked page for a list of search terms you provide, and finally returns stats on the number of times each term shows up.
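For example, assuming the script below is saved as scraper.py and terms.txt holds one search term per line (both file names are just placeholders), a run looks like this:

    ./scraper.py -u "https://en.wikipedia.org/wiki/Machine_learning" -f terms.txt

The -u and -f flags map to the --url and --file options defined in main().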

The script may come in handy when you're trying to dig up more information on a given topic. I’ve used it on Google searches (be careful: you can only scrape Google once every 8 or so seconds before you are locked out) and Wikipedia pages to gather correlation statistics between topics.
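If you do point it at Google, the simplest way to stay under that limit is to sleep between requests. Here is a minimal sketch of the idea; politeFetch is a hypothetical helper, not part of the script below, and the 8-second default is just what worked for me:

    import time, urllib2

    def politeFetch(url, delay=8):
        # Wait before each request so rapid-fire fetches don't trigger a lockout.
        time.sleep(delay)
        return urllib2.urlopen(url).read()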

It is old code… so there might be some errors. Keep me posted!

#!/usr/bin/env python
# Ensure permissions are 755 so the script can be run as an executable

import os, sys, re, datetime
import urlparse
from optparse import OptionParser
import logging, urllib2

def parsePage(link, terms):
    # Count occurrences of each search term on the page (case-insensitive).
    searchList = {"count": 0}
    try:
        f = urllib2.urlopen(link)
        data = f.read().lower()
        for item in terms:
            hits = data.count(item.lower())
            if hits:
                searchList[item] = searchList.get(item, 0) + hits
                searchList["count"] += hits
        return searchList
    except Exception, e:
        print "An error has occurred while parsing page " + str(link) + "."
        log.error(str(datetime.datetime.now()) + " " + str(e))
        return {}

def searchUrl(search):
    try:
        f = urllib2.urlopen(search)
        data = f.read()
        # Regular expression to pull wiki-style relative links out of the page.
        pattern = r'href="(/wiki/[^"#:]+)"'
        links = re.findall(pattern, data)
        # Rebuild absolute URLs from the relative /wiki/... paths.
        scheme, host = urlparse.urlparse(search)[:2]
        return [scheme + "://" + host + link for link in set(links)]
    except Exception, e:
        print "An error has occurred while trying to reach the site."
        log.error(str(datetime.datetime.now()) + " " + str(e))
        return []

def main():
    try:
        parser = OptionParser() # Help menu options
        parser.add_option("-u", "--url", dest="search", help="String containing URL to search.")
        parser.add_option("-f", "--file", dest="file", help="File containing search terms.")
        (options, args) = parser.parse_args()
        if not options.search or not options.file:
            parser.error('Term file or URL to scrape not given')
        else:
            urls = searchUrl(options.search)
            f = open(options.file, 'r')
            terms = [line.strip() for line in f if line.strip()]
            f.close()
            # Aggregate per-page counts into one overall result dictionary.
            totals = {"count": 0}
            for url in urls:
                for term, hits in parsePage(url, terms).items():
                    totals[term] = totals.get(term, 0) + hits
            print "Results:"
            print totals
    except Exception, e:
        log.error(str(datetime.datetime.now()) + " " + str(e))

if __name__ == "__main__":
    log = logging.getLogger("error") #create error log
    log.setLevel(logging.ERROR)
    formatter = logging.Formatter('[%(levelname)s] %(message)s')
    handler = logging.FileHandler('error.log')
    handler.setFormatter(formatter)
    log.addHandler(handler)
    try:
        main()
    except Exception, e:
        print "An error has occurred, please review the error.log for more details."
        log.error(str(datetime.datetime.now())+" "+str(e))
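For reference, the terms file is plain text with one term per line, for example:

    python
    statistics
    machine learning

The printed result is a single dictionary: one entry per term that was found, plus an overall "count" entry totaling all hits across every page scraped.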