# moby/docs/docvalidate.py

#!/usr/bin/env python

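"""Check the locally served docs site for links to missing anchors.

Starting at ``root``, every same-site link is crawled; each ``#fragment``
a link points at is collected together with every ``id``/``name``
attribute found, and fragments without a matching id/name are printed.
"""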
__author__ = "Scott Stamp <scott@hypermine.com>"

from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re
import requests

# The crawl is recursive: Parser.handle_starttag feeds a fresh Parser for
# each newly discovered page from inside the parser that is still running,
# so the interpreter's recursion limit is raised for deeply linked sites.
setrecursionlimit(10000)
# Base URL of the site to validate; links are matched and resolved against it.
root = 'http://localhost:8000'


class DataHolder:
    """Box one value under a configurable attribute name.

    Calling the instance stores the argument and returns it unchanged, so
    a value can be saved and truth-tested in a single expression and then
    read back afterwards through the attribute named by ``attr_name``
    (``value`` unless told otherwise) or through ``get()``.
    """

    def __init__(self, value=None, attr_name='value'):
        # The attribute name has to be known before set() can store anything.
        self._attr_name = attr_name
        self.set(value)

    def get(self):
        """Return the value most recently stored."""
        current = getattr(self, self._attr_name)
        return current

    def set(self, value):
        """Store ``value`` under the configured attribute and return it."""
        setattr(self, self._attr_name, value)
        return value

    def __call__(self, value):
        """Shorthand for ``set(value)``."""
        return self.set(value)


class Parser(HTMLParser):
    """Recursively crawl same-site links, collecting fragments and ids.

    All bookkeeping lives in class-level containers, so every Parser
    created during a crawl shares the same state:

    ids        -- every ``id`` / ``name`` attribute value seen on any page
    crawled    -- URLs (fragment removed) that have already been fetched
    anchors    -- maps the URL of a page to the set of ``#fragment`` names
                  that links on that page point at
    pages      -- not referenced by this class
    save_match -- scratch holder that lets a regex match be stored and
                  tested in one expression
    """

    ids = set()
    crawled = set()
    anchors = {}
    pages = set()
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        """Create a parser for the page located at ``origin``."""
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record fragments/ids of a start tag and crawl its link target.

        A link is followed when its ``href`` starts with ``root`` or with
        ``/``; a bare ``#fragment`` href is recorded but never fetched.
        Fetching happens immediately and recursively, through a new
        Parser whose origin is the target URL.
        """
        attrs = dict(attrs)
        # A valueless attribute (``<a href>``) is reported as None; treat
        # it like a missing href instead of passing None to the regex.
        href = attrs.get('href')

        # ``root`` is compared literally, not spliced into a pattern.
        if href and (href.startswith((root, '/'))
                     or re.match(r'#\S', href)):
            # Everything after the last '#' is the fragment being linked.
            if self.save_match(re.search(r'.*#(.*?)$', href)):
                self.anchors.setdefault(self.origin, set()).add(
                    self.save_match.match.group(1))

            if not href.startswith('#'):
                # Drop the fragment before de-duplicating: '/page#a' and
                # '/page#b' are the same document and must be fetched once.
                url = urljoin(root, href).split('#', 1)[0]
                if url not in self.crawled:
                    self.crawled.add(url)
                    Parser(url).feed(requests.get(url).content)

        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # explicit <a name=""></a> references
        if 'name' in attrs:
            self.ids.add(attrs['name'])


# Crawl the site from its root page, then list every fragment that some
# link points at but that no crawled page defines as an id/name.
r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)
for anchor in sorted(parser.anchors):
    # Keys are page URLs; skip the ones that contain "/#", i.e. pages that
    # were recorded under a URL carrying a fragment of its own.
    if re.match('.*/\#.*', anchor):
        continue
    page_path = anchor.replace(root, '')
    for anchor_name in parser.anchors[anchor]:
        if anchor_name in parser.ids:
            continue
        print('Missing - ({0}): #{1}'.format(page_path, anchor_name))