This repository has been archived by the owner on Mar 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
150 lines (112 loc) · 4.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import argparse
from html.parser import HTMLParser
import requests
class Tag(object):
    """A start or end tag together with the position where it was seen.

    Attributes:
        tag: Tag name, e.g. ``'div'``.
        attrs: Sequence of ``(name, value)`` pairs. ``value`` is ``None``
            for an attribute written without a value (e.g. ``disabled``).
        pos: ``(line, offset)`` pair; in this file it comes from
            ``HTMLParser.getpos()``.
        is_end: ``True`` for an end tag (``</x>``), ``False`` for a start tag.
    """

    def __init__(self, tag, attrs, pos, is_end=False):
        self.tag = tag
        self.attrs = attrs
        self.pos = pos
        self.is_end = is_end

    def __str__(self):
        """Return the tag in ``<name attrs> (line:offset)`` form."""
        rendered = []
        for pair in self.attrs:
            name, value = pair[0], pair[1]
            if value is None:
                # Valueless attribute: show the bare name rather than the
                # misleading name="None".
                rendered.append('{k} '.format(k=name))
            else:
                rendered.append('{k}="{v}" '.format(k=name, v=value))

        return '<{e}{t} {attrs}> ({ln}:{of})'.format(
            e='/' if self.is_end else '',
            t=self.tag,
            attrs=''.join(rendered),
            ln=self.pos[0],
            of=self.pos[1])
class TagMismatch(object):
    """Pairs an open start tag with an end tag that does not match it.

    Attributes:
        start: The start tag that was open when ``end`` was encountered.
        end: The end tag whose name differs from ``start``.
    """

    def __init__(self, start_tag, end_tag):
        self.start, self.end = start_tag, end_tag

    def __str__(self):
        """Return a one-line, human-readable description of the mismatch."""
        return "{st} can't be closed by {et}".format(st=self.start, et=self.end)
class HTMLMismatchParser(HTMLParser):
    """HTML parser that reports mismatched tags and misplaced <head> content.

    Every non-void start tag is pushed onto ``tags``; every non-void end tag
    pops the most recent entry and compares names. On a mismatch the user is
    asked interactively how to recover. Problems are printed as they are
    found and collected in ``errors``, which holds both message strings and
    ``TagMismatch`` objects.
    """

    # Class-level defaults only; __init__ gives each instance its own lists
    # so state is never shared between parsers.
    tags = []
    errors = []
    in_head = False

    # https://www.w3.org/TR/html5/syntax.html#void-elements
    void_elements = [
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
        'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]

    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/head
    head_elements = [
        'title', 'base', 'link', 'style', 'meta', 'script', 'noscript'
    ]

    def __init__(self):
        super(HTMLMismatchParser, self).__init__()
        self.tags = []
        self.errors = []
        self.in_head = False

    def handle_starttag(self, start, attrs):
        """Track a start tag and flag it if it is not allowed inside <head>.

        Args:
            start: Tag name.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        is_void = start in self.void_elements
        misplaced = self.in_head and start not in self.head_elements
        if is_void and not misplaced:
            # Void elements never get an end tag, so there is nothing to
            # track, and this one is not in a forbidden place either.
            return

        tag = Tag(start, attrs, self.getpos())
        if not is_void:
            self.tags.append(tag)

        if misplaced:
            print("\nERROR")
            msg = "{t} is not valid in {ht}".format(t=tag, ht='<head>')
            print(msg)
            self.errors.append(msg)

        if start == 'head':
            self.in_head = True

    def handle_endtag(self, end):
        """Match an end tag against the most recently opened start tag.

        An end tag seen while nothing is open is recorded as an error. A name
        mismatch is recorded as a ``TagMismatch`` and the user chooses
        whether the end tag is rogue (1) or the start tag is unclosed (2).

        Args:
            end: Tag name of the end tag.
        """
        if end in self.void_elements:
            return

        if end == 'head':
            self.in_head = False

        if not self.tags:
            # Nothing is open, so this end tag cannot close anything;
            # popping here would raise IndexError.
            orphan = Tag(end, [], self.getpos(), is_end=True)
            msg = "{t} has no matching start tag".format(t=orphan)
            print("\nERROR")
            print(msg)
            self.errors.append(msg)
            return

        start = self.tags.pop()
        if start.tag == end:
            return

        end_tag = Tag(end, [], self.getpos(), is_end=True)
        tm = TagMismatch(start, end_tag)
        self.errors.append(tm)

        print("\nERROR")
        print(tm)
        print("CASE 1)")
        print(tm.end)
        print("... is a rogue end tag - i.e. it was never opened")
        print("Continue as if we never saw it")
        print("CASE 2)")
        print(tm.start)
        print("... is missing an end tag")
        print("Continue as if we found the close tag")

        if self._ask_mismatch_case() == '1':
            # Rogue end tag
            # Start tag will be closed later, so add it back to the list
            self.tags.append(start)
        else:
            # Missing end tag
            # Compare the current end tag with the next start tag
            self.handle_endtag(end)

    def _ask_mismatch_case(self):
        """Prompt until the user answers ``1`` or ``2``; return the answer."""
        while True:
            case = input("\nEnter 1 or 2: ").strip()
            if case in ('1', '2'):
                return case
def _collect_urls(args):
    """Return the URLs given via --url and --infile, in that order."""
    urls = []
    if args.url:
        urls.append(args.url)
    if args.infile:
        with open(args.infile) as f:
            # One URL per line; drop surrounding whitespace and blank lines
            # so they are not handed to requests as (invalid) URLs.
            stripped = (line.strip() for line in f.read().splitlines())
            urls.extend(line for line in stripped if line)
    return urls


def main(argv=None):
    """Fetch each URL, check its HTML for tag mismatches and print a report.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: If the arguments are invalid or yield no URL to check.
        requests.RequestException: If a page cannot be fetched.
    """
    parser = argparse.ArgumentParser(
        description="Show tag mismatches on given URL")
    parser.add_argument('--url', nargs='?', default=None)
    parser.add_argument('--infile', nargs='?', default=None)
    parser.add_argument('--timeout', type=float, default=30.0,
                        help="seconds to wait for each HTTP request")
    args = parser.parse_args(argv)

    urls = _collect_urls(args)
    if not urls:
        # Reporting SUCCESS without having checked anything is misleading.
        parser.error("no URLs to check; pass --url and/or --infile")

    errors = {}
    for url in urls:
        # Without a timeout a stalled server would block the run forever.
        r = requests.get(url, timeout=args.timeout)
        r.raise_for_status()

        print("\nChecking {u}:\n".format(u=url))

        mismatch_parser = HTMLMismatchParser()
        mismatch_parser.feed(r.text)
        # Force processing of any input the parser is still buffering.
        mismatch_parser.close()

        if mismatch_parser.errors:
            errors[url] = mismatch_parser.errors

    if errors:
        print("ERRORS:\n")
        for url, error_list in errors.items():
            print(url)
            for error in error_list:
                print("\t{e}".format(e=error))
    else:
        print("SUCCESS\n")


if __name__ == '__main__':
    main()