Python URL Term Scraper (Kind of Like a Very Dumb/Simple Watson)
I wrote this a while back to scrape a given URL for all the links it contains and then search those links for term relations. The Python scraper first finds all the links on a given URL's page. It then searches each of those linked pages for a list of search terms you provide. It returns stats on the number of times the provided terms show up.
This may come in handy while trying to find more information on a given topic. I’ve used it on Google searches (be careful, you can only scrape Google once every 8 or so seconds before you are locked out) and Wikipedia pages to gather correlation statistics between topics.
It is old code… so there might be some errors. Keep me posted!
#!/usr/bin/env python3
# ENSURE permissions are 755 in order to have the script run as executable.
"""Scrape a URL for the wiki links it contains, then fetch each linked page
and count how often each of a list of search terms appears across them.

Usage: scraper.py -u <url-to-scrape> -f <file-of-search-terms>

Errors are printed briefly and appended with timestamps to ``error.log``.
"""
import datetime
import logging
import re
from optparse import OptionParser
from urllib.parse import urljoin
from urllib.request import urlopen

# Module-level logger so the helper functions can log even when imported.
# (The original bound ``log`` only inside the __main__ guard, so the
# functions' log calls raised NameError on import.)
log = logging.getLogger("error")


def count_terms(data, terms):
    """Count which *terms* occur in the page text *data*.

    A term matches if its Title, UPPER, or lower form appears in *data*.
    Returns a dict mapping each matched term to its hit count, plus a
    "count" key holding the total number of matches.  Counts start at
    zero — the original indexed missing keys and raised KeyError on the
    very first hit.
    """
    counts = {"count": 0}
    for term in terms:
        if term.title() in data or term.upper() in data or term.lower() in data:
            counts[term] = counts.get(term, 0) + 1
            counts["count"] += 1
    return counts


def parsePage(link, terms):
    """Fetch *link* and return the term-hit dict from count_terms().

    Returns None when the page cannot be fetched; the error is printed
    and logged with a timestamp.
    """
    try:
        data = urlopen(link).read().decode("utf-8", "replace")
        return count_terms(data, terms)
    except Exception as e:
        print("An error has occurred while parsing page " + str(link) + ".")
        log.error(str(datetime.datetime.now()) + " " + str(e))
        return None


def searchUrl(search):
    """Return the wiki-style links (href="/wiki/...") found on *search*.

    The returned links are site-relative paths; callers should resolve
    them against *search* with urljoin().  Returns an empty list on error
    so callers can iterate the result unconditionally — the original
    returned None, which made main() crash on ``for url in urls``.
    """
    try:
        data = urlopen(search).read().decode("utf-8", "replace")
        # href-anchored regular expression to find wiki links.  The
        # original r"/wiki(/\S*)?$" was anchored to end-of-string (no
        # MULTILINE) and findall() returned the optional group rather
        # than the link path, so it effectively never found anything.
        pattern = r'href="(/wiki/[^"#:]*)"'
        return re.findall(pattern, data)
    except Exception as e:
        print("An error has occurred while trying to reach the site.")
        log.error(str(datetime.datetime.now()) + " " + str(e))
        return []


def main():
    """Parse options, scrape the URL for links, and print aggregate stats."""
    try:
        parser = OptionParser()  # help menu options
        parser.add_option("-u", "--url", dest="search",
                          help="String containing URL to search.")
        parser.add_option("-f", "--file", dest="file",
                          help="File containing search terms.")
        (options, args) = parser.parse_args()
        if not options.search or not options.file:
            parser.error('Term file or URL to scrape not given')
        urls = searchUrl(options.search)
        # Strip newlines and skip blank lines; readlines() kept trailing
        # "\n" on every term, which broke the substring matching.
        with open(options.file, 'r') as f:
            terms = [line.strip() for line in f if line.strip()]
        # Aggregate per-page results.  The original discarded parsePage's
        # return value and then printed ``searchList``, an undefined name
        # in this scope (NameError).
        totals = {"count": 0}
        for url in urls:
            # Links from searchUrl are site-relative; resolve them
            # against the page they were scraped from.
            page = parsePage(urljoin(options.search, url), terms)
            if page:
                for key, value in page.items():
                    totals[key] = totals.get(key, 0) + value
        print("Results:")
        print(totals)
    except Exception as e:
        log.error(str(datetime.datetime.now()) + " " + str(e))


if __name__ == "__main__":
    # Create the error log: ERROR-level messages go to error.log.
    log.setLevel(logging.ERROR)
    formatter = logging.Formatter('[%(levelname)s] %(message)s')
    handler = logging.FileHandler('error.log')
    handler.setFormatter(formatter)
    log.addHandler(handler)
    try:
        main()
    except Exception as e:
        print("An error has occurred, please review the error.log for more details.")
        log.error(str(datetime.datetime.now()) + " " + str(e))