#!/usr/bin/env python3
"""Crawl a site from `root` and report fragment links (#anchors) that do not
resolve to any id on the crawled pages."""
__author__ = "Scott Stamp <scott@hypermine.com>"

from html.parser import HTMLParser
from sys import setrecursionlimit
from urllib.parse import urljoin
import re
import requests

# The crawl recurses (Parser.feed -> Parser.feed), so deep sites need more
# headroom than the default recursion limit.
setrecursionlimit(10000)
root = 'http://localhost:8000'
class DataHolder:
    """Capture a value inside a condition so it can be reused afterwards."""

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        return self.set(value)

    def set(self, value):
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        return getattr(self, self._attr_name)
class Parser(HTMLParser):
    # Class-level state is shared by every Parser instance, so the recursive
    # crawl accumulates one site-wide view of ids, visited URLs and anchors.
    ids = set()
    crawled = set()
    anchors = {}
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        self.origin = origin
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            href = attrs['href']
            # Follow only same-site links: absolute URLs under `root`,
            # root-relative paths, or bare fragments.
            if re.match(r'^{0}|/|#\S+'.format(re.escape(root)), href):
                if self.save_match(re.search(r'.*#(.*?)$', href)):
                    # Record which fragment this page references.
                    self.anchors.setdefault(self.origin, set()).add(
                        self.save_match.match.group(1))
                url = urljoin(root, href)
                if url not in self.crawled and not href.startswith('#'):
                    self.crawled.add(url)
                    Parser(url).feed(requests.get(url).text)
        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # Explicit <a name=""></a> anchors are valid link targets too.
        if 'name' in attrs:
            self.ids.add(attrs['name'])
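# Example (hypothetical href): feeding '<a href="/docs#setup">' calls
# handle_starttag('a', [('href', '/docs#setup')]); the 'setup' fragment is
# recorded against the current page, and /docs is crawled recursively.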
r = requests.get(root)
parser = Parser(root)
parser.feed(r.text)

# Report every referenced fragment that never appeared as an id on any page.
for anchor in sorted(parser.anchors):
    if not re.match(r'.*/#.*', anchor):
        for anchor_name in parser.anchors[anchor]:
            if anchor_name not in parser.ids:
                print('Missing - ({0}): #{1}'.format(
                    anchor.replace(root, ''), anchor_name))
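# Usage sketch (assumptions: the pages to check live in ./site and this file
# is saved as check_anchors.py; `--directory` needs Python 3.7+):
#
#     python -m http.server 8000 --directory site   # serve the docs at `root`
#     python check_anchors.py                       # run the checker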