docvalidate.py

#!/usr/bin/env python
"""Crawl a locally served documentation site starting at `root` and report
fragment links (#anchors) that never resolve to an id/name on the site."""
__author__ = "Scott Stamp <scott@hypermine.com>"

# Python 2 script: it relies on the HTMLParser/urlparse stdlib modules and
# print statements.
from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit

import re
import requests

# Pages are crawled recursively from handle_starttag(), so deep sites need a
# higher recursion limit than the default.
setrecursionlimit(10000)

root = 'http://localhost:8000'
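
# Typical workflow (an assumption, not spelled out in the gist): serve the
# built docs so they appear at `root`, then run this script; the build path
# below is only an example.
#
#   cd docs/_build/html && python -m SimpleHTTPServer 8000
#   python docvalidate.py
#
# Each "Missing - (<page>): #<anchor>" line printed at the end is a fragment
# link whose target id/name was never seen anywhere on the crawled site.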


class DataHolder:
    """Store a value assigned inside an expression, so a regex match can be
    captured directly in an `if` test (Python 2 has no walrus operator)."""

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        return self.set(value)

    def set(self, value):
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        return getattr(self, self._attr_name)
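
# A minimal sketch of the DataHolder idiom as it is used below (this example
# is mine, not part of the original gist):
#
#   m = DataHolder(attr_name='match')
#   if m(re.search(r'#(\w+)', '/install/#setup')):
#       print m.match.group(1)   # -> setup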


class Parser(HTMLParser):
    # Class-level attributes are shared by every Parser instance, so the
    # parsers created recursively below all accumulate into the same sets.
    ids = set()
    crawled = set()
    anchors = {}
    pages = set()
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            href = attrs['href']
            # Only follow site-local links: URLs under `root`, root-relative
            # paths, or bare #fragments.
            if re.match(r'^{0}|/|#\S+'.format(root), href):
                # Remember which #fragment this page links to.
                if self.save_match(re.search(r'.*#(.*?)$', href)):
                    if self.origin not in self.anchors:
                        self.anchors[self.origin] = set()
                    self.anchors[self.origin].add(
                        self.save_match.match.group(1))
                url = urljoin(root, href)
                if url not in self.crawled and not re.match(r'^#', href):
                    self.crawled.add(url)
                    Parser(url).feed(requests.get(url).content)
        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # explicit <a name=""></a> references
        if 'name' in attrs:
            self.ids.add(attrs['name'])


r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)

# Report every linked fragment that never showed up as an id/name.
for anchor in sorted(parser.anchors):
    if not re.match(r'.*/#.*', anchor):
        for anchor_name in parser.anchors[anchor]:
            if anchor_name not in parser.ids:
                print 'Missing - ({0}): #{1}'.format(
                    anchor.replace(root, ''), anchor_name)