Source code for pytomo.lib_links_extractor

#!/usr/bin/env python
''' Module to retrieve all links from a web page.
    Usage:
        import pytomo.lib_cache_url as lib_cache_url
        import pytomo.start_pytomo as start_pytomo
        log_file = 'test_cache_url'
        start_pytomo.configure_log_file(log_file)

        url = 'http://www.youtube.com/charts/videos_views'
        all_links = lib_cache_url.get_all_links(url)
'''
from __future__ import with_statement, absolute_import

import sys
import htmllib
import formatter
import urllib2
import logging
import socket
from optparse import OptionParser
from httplib import BadStatusLine
# global config
try:
    from . import config_pytomo
except ValueError:
    import config_pytomo
try:
    from .lib_general_download import STD_HEADERS
except ValueError:
    from lib_general_download import STD_HEADERS

# Name of the HTTP header that retrieve_content_type_header() looks up
CONTENT_TYPE_HEADER = 'Content-type'
# MIME type string for HTML documents (not referenced by the code visible
# in this part of the module)
TEXT_HTML_TYPE = 'text/html'

class LinksExtractor(htmllib.HTMLParser):
    ''' HTML parser that records the target of every anchor it handles.

    Once a page has been fed to the parser, the HREF values of the <A>
    tags are available in the `links` list, in the order the tags were
    handled.
    '''

    def __init__(self, format_page):
        ''' Initialise the base parser and start with an empty link list.

        format_page is passed unchanged to htmllib.HTMLParser.__init__.
        '''
        htmllib.HTMLParser.__init__(self, format_page)
        # HREF values collected so far
        self.links = []

    def start_a(self, attrs):
        ''' Handler for an opening <A ...> tag: keep its HREF value(s).

        Each entry of attrs is read as a (name, value) pair; entries whose
        name is not "href" are ignored.
        '''
        hrefs = [pair[1] for pair in attrs if pair[0] == "href"]
        self.links.extend(hrefs)
class HeadRequest(urllib2.Request):
    ''' urllib2 request that always uses the HEAD method, so that only the
    header of the resource is asked for.
    '''

    def get_method(self):
        ''' Return the HTTP method of this request, which is always "HEAD".'''
        return "HEAD"
def configure_proxy():
    ''' Install an urllib2 opener that goes through the configured proxies.

    Nothing is done when config_pytomo.PROXIES is empty or unset.
    '''
    proxies = config_pytomo.PROXIES
    if not proxies:
        return
    proxy_handler = urllib2.ProxyHandler(proxies)
    urllib2.install_opener(urllib2.build_opener(proxy_handler))
def retrieve_header(url):
    ''' Send a HEAD request for url and return the response object.

    The request carries the STD_HEADERS headers.  Every failure is logged
    through config_pytomo.LOG and reported to the caller as None:
    URLError and BadStatusLine as warnings, anything else with its
    traceback.
    '''
    head_request = HeadRequest(url, None, STD_HEADERS)
    # The exception value is read from sys.exc_info() so that the handlers
    # do not rely on any particular exception-binding syntax.
    try:
        return urllib2.urlopen(head_request)
    except urllib2.URLError:
        mes = sys.exc_info()[1]
        config_pytomo.LOG.warn(
            'URLError in getting HEAD of this url: %s'
            '\nError message: %s' % (url, mes))
    except BadStatusLine:
        mes = sys.exc_info()[1]
        config_pytomo.LOG.warn(
            'BadStatusLine in getting HEAD of this url: %s'
            '\nError message: %s' % (url, mes))
    except Exception:
        mes = sys.exc_info()[1]
        config_pytomo.LOG.exception('Unexpected exception: %s' % mes)
    # Only reached when one of the handlers above ran
    return None
def retrieve_content_type_header(response):
    ''' Return the "Content-type" header value of an HTTP response.

    Per the original author's note, when the header is repeated it is the
    LAST occurrence that is returned.

    None is returned when response is false (e.g. None), or when the
    header lookup raises AttributeError; the latter case is logged as a
    warning through config_pytomo.LOG.
    '''
    if not response:
        return None
    try:
        content_type = response.headers.getheader(CONTENT_TYPE_HEADER)
    except AttributeError:
        # Value read from sys.exc_info() so the handler does not rely on
        # any particular exception-binding syntax.
        mes = sys.exc_info()[1]
        config_pytomo.LOG.warn('Problem in getting "Content-type" header.'
                               '\nError message: %s' % mes)
        return None
    return content_type
def main(argv=None):
    ''' Program wrapper: parse the command line, set up logging and print
    the links extracted from the given url.

    argv is the list of command-line arguments without the program name;
    sys.argv[1:] is used when it is None.  Exactly one positional argument
    (the url) is required, otherwise the option parser reports an error
    and exits.

    The logger is stored in config_pytomo.LOG and writes either to stdout
    (out_file_name is "-", the default) or to the file given with -w.
    '''
    if argv is None:
        argv = sys.argv[1:]

    parser = OptionParser(usage="%prog [-w out_file] [-v] url")
    parser.add_option(
        "-w", dest="out_file_name", default="-",
        help="output file or stdout if FILE is - (default case)")
    parser.add_option(
        "-v", "--verbose", dest="verbose", action="store_true",
        default=False, help="run as verbose mode")
    options, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.error("Incorrect number of arguments, provide an url")
    url = args[0]

    logger = logging.getLogger('lib_cache_url')
    config_pytomo.LOG = logger
    # to not have console output
    logger.propagate = False
    logger.setLevel(logging.DEBUG if options.verbose else logging.INFO)

    target = options.out_file_name
    if target == '-':
        handler = logging.StreamHandler(sys.stdout)
    else:
        # Opening for writing checks that the file is usable (and empties
        # it) before the logging handler is attached to it.
        try:
            with open(target, 'w'):
                pass
        except IOError:
            parser.error("Problem opening file: %s" % target)
        handler = logging.FileHandler(filename=target)
    handler.setFormatter(logging.Formatter("%(asctime)s - %(filename)s - "
                                           "%(levelname)s - %(message)s"))
    logger.addHandler(handler)

    # NOTE(review): get_all_links is not defined in the part of the module
    # visible here -- confirm it is provided elsewhere.
    links = get_all_links(url)
    print('From "%s" the extracted links are:\n %s' % (url, links))
if __name__ == '__main__':
    # When run as a script: execute the doctests found in this module, then
    # run the command-line wrapper and use its return value as exit status.
    import doctest
    doctest.testmod()
    exit_status = main()
    sys.exit(exit_status)