"""Check links between the Markdown files of a documentation tree.

Every ``.md`` file found under the docs directory is rendered to HTML.  The
``class``/``name``/``id`` attribute values of each page are collected as the
set of valid anchors for that page, and every ``href`` is checked against
them.  Broken references are printed to stdout.
"""
import io
import os

try:  # Python 3
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser


def subfiles(path, ext):
    """Return the paths of the regular files directly in *path* with *ext*.

    *ext* may be given with or without its leading dot (``'md'`` or
    ``'.md'``).  The dot is always required in the file name, so ``'md'``
    does not match ``notes.cmd``.
    """
    suffix = ext if ext.startswith('.') else '.' + ext
    return [os.path.join(path, name)
            for name in os.listdir(path)
            if name.endswith(suffix)
            and os.path.isfile(os.path.join(path, name))]


def subdirs(path):
    """Return the names (not paths) of the directories directly in *path*."""
    return [name for name in os.listdir(path)
            if os.path.isdir(os.path.join(path, name))]


def walk(path, ext):
    """Recursively collect the files under *path* that have extension *ext*.

    Files of nested directories are listed before the files of *path* itself.
    """
    files = []
    for d in subdirs(path):
        files.extend(walk(os.path.join(path, d), ext))
    files.extend(subfiles(path, ext))
    return files


def basename_no_ext(name):
    """Return the final path component of *name* up to its first dot.

    Everything after the first dot is dropped, so ``'a/b/page.en.md'``
    yields ``'page'``.
    """
    basename = os.path.basename(name)
    return basename.split('.')[0]


class Link(object):
    """A parsed ``href`` value.

    Attributes:
        http: the raw link when it is an ``http://``/``https://`` URL,
            otherwise ``None``.
        file: extension-less base name of the target file, or ``None`` when
            the link is external or points into the current page.
        anchor: the fragment after ``#``, or ``None`` when there is none.
    """

    def __init__(self, raw_link):
        self.http = None
        self.file = None
        self.anchor = None

        # Only a real scheme prefix marks a link as external; a local target
        # such as 'guide.md#http-headers' must still be checked.
        if raw_link.lower().startswith(('http://', 'https://')):
            self.http = raw_link
        elif raw_link.startswith('#'):
            # startswith() is also safe for an empty href.
            self.anchor = raw_link[1:]
        elif '#' in raw_link:
            target, _, fragment = raw_link.partition('#')
            self.file = basename_no_ext(target)
            self.anchor = fragment
        else:
            self.file = basename_no_ext(raw_link)

    def __repr__(self):
        return 'Link(http=%r, file=%r, anchor=%r)' % (
            self.http, self.file, self.anchor)


def _to_ascii(value):
    """Drop the non-ASCII characters of *value* and return text, not bytes."""
    return value.encode('ascii', 'ignore').decode('ascii')


class LinkParser(HTMLParser):
    """HTML parser that records link targets and candidate anchor names.

    After feeding, ``links`` holds one :class:`Link` per ``href`` attribute
    and ``headers`` holds every ``class``, ``name`` and ``id`` value seen.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.headers = []
        self.links = []

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if value is None:
                # Attribute without a value (e.g. <a name>): nothing to record.
                continue
            if name == 'href':
                self.links.append(Link(_to_ascii(value)))
            elif name in ('class', 'name', 'id'):
                self.headers.append(_to_ascii(value))


def parse(html):
    """Parse *html* and return ``(headers, links)`` as collected by LinkParser."""
    parser = LinkParser()
    parser.feed(html)
    return parser.headers, parser.links


def check(headers, links):
    """Print every link that targets an unknown file or an unknown anchor.

    Args:
        headers: mapping of file base name to the anchor names in that file.
        links: mapping of file base name to the :class:`Link` objects found
            in that file.
    """
    for f, file_links in links.items():
        for link in file_links:
            if link.http:
                # External URLs are not verified.
                continue

            # A link without a file part points into the page containing it.
            link_file = link.file if link.file else f

            if link_file not in headers:
                print('BAD FILE IN ' + f + '.md: ' + link_file)
                continue

            if link.anchor and link.anchor not in headers[link_file]:
                print('BAD LINK IN ' + f + '.md: ' + link_file + ', ' + link.anchor)


def main():
    """Render every Markdown file under ``../docs/`` and report broken links."""
    # Imported here because only main() needs the third-party package; the
    # parsing helpers above stay importable without it.
    import markdown

    docs_dir = '../docs/'
    ext = 'md'

    md_files = walk(docs_dir, ext)
    # Extensions must be passed by keyword in current Python-Markdown.
    md = markdown.Markdown(
        extensions=['meta', 'toc', 'tables', 'fenced_code']
    )

    headers = {}
    links = {}

    for path in md_files:
        with io.open(path, encoding='utf-8') as handle:
            text = handle.read()
        # Clear per-document extension state before reusing the instance.
        md.reset()
        key = basename_no_ext(path)
        headers[key], links[key] = parse(md.convert(text))

    check(headers, links)


if __name__ == '__main__':
    main()