check_links.py
3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import markdown
from HTMLParser import HTMLParser
def subfiles(path, ext):
return [os.path.join(path, name) for name in os.listdir(path) if os.path.isfile(os.path.join(path, name)) and name[-len(ext):] == ext]
def subdirs(path):
return [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
def walk(path, ext):
files = []
for d in subdirs(path):
files.extend(walk(os.path.join(path, d), ext))
files.extend(subfiles(path, ext))
return files
class Link():
def __init__(self, path, raw_link):
if 'http' in raw_link:
self.http = raw_link
self.file = None
self.anchor = None
elif raw_link[0] == '#':
self.http = None
self.file = None
self.anchor = raw_link[1:]
elif '#' in raw_link:
self.http = None
split_link = raw_link.split('#')
self.file = os.path.normpath(os.path.join(path, split_link[0]))
self.anchor = split_link[1]
else:
self.http = None
self.file = os.path.normpath(os.path.join(path, raw_link))
self.anchor = None
class LinkParser(HTMLParser):
def __init__(self, path):
HTMLParser.__init__(self)
self.path = path
self.headers = []
self.links = []
def handle_starttag(self, tag, attrs):
for attr in attrs:
if u'href' == attr[0]:
self.links.append(Link(self.path, attr[1].encode('ascii', 'ignore')))
elif u'class' == attr[0]:
self.headers.append(attr[1].encode('ascii', 'ignore'))
elif u'name' == attr[0]:
self.headers.append(attr[1].encode('ascii', 'ignore'))
elif u'id' == attr[0]:
self.headers.append(attr[1].encode('ascii', 'ignore'))
def parse(path, html):
parser = LinkParser(path)
parser.feed(html)
return parser.headers, parser.links
def check(headers, links):
for f, file_links in links.items():
for link in file_links:
if link.http:
continue
link_file = f
if link.file:
link_file = link.file
if link_file not in headers:
print 'BAD FILE IN ' + f + ':', link_file
print
continue
if link.anchor and link.anchor not in headers[link_file]:
print 'BAD ANCHOR IN ' + f + ':', link_file + '#' + link.anchor
print
continue
def main():
docs_dir = '../docs/'
ext = 'md'
links = open('../docs/links.md', 'r').read()
md_files = walk(docs_dir, ext)
md = markdown.Markdown( ['meta', 'toc', 'tables', 'fenced_code', 'attr_list'] )
html_files = [md.convert(open(f, 'r').read() + "\n\n" + links) for f in md_files]
headers = {}
links = {}
for i in range(len(md_files)):
local_headers, local_links = parse(os.path.dirname(md_files[i]), html_files[i])
headers[md_files[i]] = local_headers
links[md_files[i]] = local_links
check(headers, links)
main()