Python URL Term Scraper (Kind of Like a Very Dumb/Simple Watson)
I wrote this a while back to scrape a given URL for all the links it contains and then search those linked pages for related terms. The Python scraper first finds all the links on the given page. It then searches every linked page for a list of search terms you provide, and returns statistics on how many times each of those terms shows up.
This may come in handy when trying to find more information on a given topic. I’ve used it on Google searches (be careful: you can only scrape Google once every 8 or so seconds before you are locked out) and on Wikipedia pages to gather correlation statistics between topics.
It is old code… so there might be some errors. Keep me posted!
#!/usr/bin/env python
#ENSURE permissions are 755 in order to have script run as executable
import os, sys, re, datetime
from optparse import OptionParser
import logging, urllib2
def parsePage(link, list):
    """Fetch ``link`` and tally which search terms appear in the page body.

    Each term is stripped of surrounding whitespace first (terms read with
    ``readlines()`` carry a trailing newline) and blank terms are skipped.
    A term counts as present when its title-case, upper-case or lower-case
    form occurs anywhere in the downloaded data.

    Args:
        link: URL of the page to download.
        list: Iterable of search-term strings. The name shadows the builtin
            but is kept so existing callers are unaffected.

    Returns:
        A dict mapping each stripped term found on the page to its hit count
        (one per time the term is listed in ``list``), plus a ``"count"``
        entry holding the total number of hits. The dict is empty when no
        term matched. Returns ``None`` if the page could not be fetched or
        scanned; in that case a message is printed and the exception is
        written to the "error" logger.
    """
    searchList = {}
    try:
        f = urllib2.urlopen(link)
        try:
            data = f.read()
        finally:
            f.close()  # release the connection even if the read fails
        if not isinstance(data, str):
            # On Python 3 read() yields bytes; decode so the substring tests
            # below compare text with text. On Python 2 this is a no-op.
            data = data.decode("utf-8", "replace")
        for item in list:
            term = item.strip()
            if not term:
                continue
            if (term.title() in data) or (term.upper() in data) or (term.lower() in data):
                # .get() supplies the starting 0; indexing the empty dict
                # directly raised KeyError on the very first hit.
                searchList[term] = searchList.get(term, 0) + 1
                searchList["count"] = searchList.get("count", 0) + 1
        return searchList
    except Exception as e:
        print("An error has occurred while parsing page " + str(link) + ".")
        # Look the logger up by name so the function also works when this
        # file is imported and the script-level logger setup has not run.
        logging.getLogger("error").error(str(datetime.datetime.now()) + " " + str(e))
        return None
def searchUrl(search):
    """Download ``search`` and return the wiki links found in its HTML.

    The page is scanned for ``href`` attributes whose value ends in a
    ``/wiki`` or ``/wiki/<something>`` path. Each hit is resolved against
    ``search`` so relative links become absolute URLs that can be opened
    directly, and duplicates are dropped while keeping first-seen order.

    Args:
        search: URL of the page whose links should be collected.

    Returns:
        A list of absolute URL strings (possibly empty), or ``None`` if the
        page could not be fetched or scanned; in that case a message is
        printed and the exception is written to the "error" logger.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3

    try:
        f = urllib2.urlopen(search)
        try:
            data = f.read()
        finally:
            f.close()  # release the connection even if the read fails
        if not isinstance(data, str):
            # On Python 3 read() yields bytes; decode so the text pattern
            # below can be applied. On Python 2 this is a no-op.
            data = data.decode("utf-8", "replace")
        # Capture the whole href value. The previous pattern was anchored
        # with "$" (end of document only) and captured just the suffix after
        # "/wiki", so it never produced a usable link.
        pattern = r"""href\s*=\s*["']([^"'\s>]*/wiki(?:/[^"'\s>]*)?)["']"""
        links = []
        seen = set()
        for href in re.findall(pattern, data):
            url = urljoin(search, href)
            if url not in seen:
                seen.add(url)
                links.append(url)
        return links
    except Exception as e:
        print("An error has occurred while trying to reach the site.")
        # Look the logger up by name so the function also works when this
        # file is imported and the script-level logger setup has not run.
        logging.getLogger("error").error(str(datetime.datetime.now()) + " " + str(e))
        return None
def main():
    """Parse the command line, scrape the start URL and print term statistics.

    Requires both ``-u/--url`` (the page whose links are followed) and
    ``-f/--file`` (a text file with one search term per line). Every link
    returned by ``searchUrl`` is passed to ``parsePage``, and the per-page
    dicts it returns are summed key by key into one result dict, which is
    printed after a "Results:" header.

    Any exception raised along the way is written to the "error" logger
    rather than propagated. Missing options are reported through
    ``parser.error``.
    """
    try:
        parser = OptionParser()  # Help menu options
        parser.add_option("-u", "--url", dest="search", help="String containing URL to search.")
        parser.add_option("-f", "--file", dest="file", help="File containing search terms.")
        (options, _) = parser.parse_args()
        if not options.search or not options.file:
            # parser.error() prints the usage text and exits the program.
            parser.error('Term file or URL to scrape not given')

        # searchUrl signals failure with None; treat that as "no links".
        urls = searchUrl(options.search) or []

        with open(options.file, 'r') as f:
            # Drop trailing newlines and blank lines so terms can match.
            terms = [line.strip() for line in f if line.strip()]

        # Accumulate the per-page tallies; previously the return value of
        # parsePage was discarded and an undefined name was printed.
        searchList = {}
        for url in urls:
            found = parsePage(url, terms)
            if not found:
                continue  # page failed to load or contained no terms
            for key, hits in found.items():
                searchList[key] = searchList.get(key, 0) + hits

        print("Results:")
        print(searchList)
    except Exception as e:
        # Look the logger up by name so main() also works when this file is
        # imported and the script-level logger setup has not run.
        logging.getLogger("error").error(str(datetime.datetime.now()) + " " + str(e))
if __name__ == "__main__":
    # Script entry point: send ERROR-level records from the "error" logger
    # to the file error.log, then run main() and report anything that
    # escapes it.
    handler = logging.FileHandler('error.log')
    handler.setFormatter(logging.Formatter('[%(levelname)s] %(message)s'))

    log = logging.getLogger("error")  # create error log
    log.setLevel(logging.ERROR)
    log.addHandler(handler)

    try:
        main()
    except Exception as e:
        # Tell the user where to look, then record a timestamped message.
        print("An error has occurred, please review the error.log for more details.")
        log.error(str(datetime.datetime.now()) + " " + str(e))