# check_links.py
#
# Scan the Markdown files under ../docs/ and report links whose target file
# or anchor cannot be found.
import os
import markdown
from io import open
from HTMLParser import HTMLParser
def subfiles(path, ext):
    """Return the paths of the regular files directly inside ``path`` whose names end with ``ext``.

    Args:
        path: Directory to list (not searched recursively).
        ext: Suffix the file name must end with, e.g. ``'md'``. The match is
            a plain suffix test, so no dot is implied. An empty suffix
            matches every file.

    Returns:
        A list of ``os.path.join(path, name)`` strings, in ``os.listdir`` order.
    """
    matches = []
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        # str.endswith also behaves for ext == '' (a slice with -0 would not).
        if name.endswith(ext) and os.path.isfile(full_path):
            matches.append(full_path)
    return matches
def subdirs(path):
    """Return the names (not full paths) of the directories directly inside ``path``."""
    directory_names = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            directory_names.append(entry)
    return directory_names
def walk(path, ext):
    """Recursively collect the files under ``path`` whose names end with ``ext``.

    Files found in subdirectories are listed before the files that live
    directly in ``path``.

    Returns:
        A list of file paths, each prefixed with ``path``.
    """
    nested = [
        found
        for child in subdirs(path)
        for found in walk(os.path.join(path, child), ext)
    ]
    return nested + subfiles(path, ext)
class Link(object):
    """A single link target, classified as external, file and/or anchor.

    Attributes set by the constructor:
        http: The raw link when it contains ``'http'`` or ``'www'``, else None.
        file: Normalised path of the linked file, resolved against ``path``,
            or None when the link has no file part (same-document link).
        anchor: The text after the first ``'#'``, or None when the link has
            no ``'#'``.
    """

    def __init__(self, path, raw_link):
        """Classify ``raw_link`` found in a document located in directory ``path``.

        Args:
            path: Directory of the document containing the link. Relative
                file targets are resolved against it.
            raw_link: The link text as written in the ``href`` attribute.
                May be empty, in which case all three attributes are None.
        """
        self.http = None
        self.file = None
        self.anchor = None

        # Substring heuristic: anything mentioning http/www is treated as an
        # external address and kept verbatim.
        if 'http' in raw_link or 'www' in raw_link:
            self.http = raw_link
            return

        # partition() never raises on an empty string and treats everything
        # after the first '#' as the anchor, whether or not a file precedes it.
        target, hash_mark, fragment = raw_link.partition('#')
        if target:
            self.file = os.path.normpath(os.path.join(path, target))
        if hash_mark:
            self.anchor = fragment
class LinkParser(HTMLParser):
    """HTML parser that records link targets and candidate anchor names.

    After feeding HTML:
        links: ``Link`` objects, one per ``href`` attribute encountered.
        headers: Values of every ``class``, ``name`` and ``id`` attribute
            encountered, in document order.
    """

    def __init__(self, path):
        """Create a parser for a document located in directory ``path``."""
        HTMLParser.__init__(self)
        self.path = path
        self.headers = []
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Collect ``href`` targets and ``class``/``name``/``id`` values from a start tag."""
        for name, value in attrs:
            # Attributes written without a value (e.g. <a name>) arrive as
            # None; there is nothing to record for them.
            if value is None:
                continue
            # Non-ASCII characters are dropped from links and anchors alike.
            text = value.encode('ascii', 'ignore')
            if name == u'href':
                self.links.append(Link(self.path, text))
            elif name in (u'class', u'name', u'id'):
                self.headers.append(text)
def parse(path, html):
    """Run a ``LinkParser`` over ``html`` and return what it collected.

    Args:
        path: Directory of the document the HTML was generated from.
        html: The HTML text to scan.

    Returns:
        A ``(headers, links)`` tuple taken from the parser.
    """
    link_parser = LinkParser(path)
    link_parser.feed(html)
    found_headers = link_parser.headers
    found_links = link_parser.links
    return found_headers, found_links
def check(headers, links):
    """Print a report for every link whose target file or anchor is missing.

    Args:
        headers: Mapping of document path -> collection of anchor names
            available in that document.
        links: Mapping of document path -> iterable of link objects exposing
            ``http``, ``file`` and ``anchor`` attributes.

    Each problem is printed as one message line followed by a blank line.
    Nothing is returned.
    """
    for source_file, file_links in links.items():
        for link in file_links:
            if link.http:  # Can't check links to other websites
                continue

            # A link without a file part points into the document containing it.
            link_file = link.file if link.file else source_file

            if link_file.endswith('.pdf'):
                # PDFs have no entry in ``headers``; only their existence on
                # disk is verified, and anchors into them are not checked.
                if not os.path.exists(link_file):
                    print('BAD PDF: ' + link_file + ' DOES NOT EXIST')
                    print('')
                continue

            if link_file not in headers:
                print('BAD FILE IN ' + source_file + ': ' + link_file)
                print('')
                continue

            # The anchor "fnref:1" is deliberately exempt from the lookup.
            if (link.anchor and link.anchor != "fnref:1"
                    and link.anchor not in headers[link_file]):
                print('BAD ANCHOR IN ' + source_file + ': ' + link_file + '#' + link.anchor)
                print('')
def main():
    """Convert every Markdown file under ``../docs/`` to HTML and check its links.

    The docs directory is resolved relative to the current working directory.
    Problems are reported on standard output by ``check``.
    """
    docs_dir = '../docs/'
    ext = 'md'
    # Extensions are passed by keyword; newer Python-Markdown releases no
    # longer accept them positionally.
    md = markdown.Markdown(
        extensions=['meta', 'toc', 'tables', 'fenced_code', 'attr_list', 'footnotes']
    )

    headers = {}
    links = {}
    for md_file in walk(docs_dir, ext):
        # ``open`` accepts ``encoding`` here; close the handle promptly.
        with open(md_file, 'r', encoding='utf-8') as source:
            text = source.read()

        # Clear per-document extension state (e.g. collected footnotes) so
        # one file's output cannot leak into the next.
        md.reset()
        html = md.convert(text)

        # Link targets are normalised with os.path.normpath, so the lookup
        # keys must be normalised the same way to compare equal.
        key = os.path.normpath(md_file)
        headers[key], links[key] = parse(os.path.dirname(key), html)

    check(headers, links)
# Run the link check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()