-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTokenizer.py
98 lines (84 loc) · 2.7 KB
/
Tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
class JackTokenizer:
def __init__(self, input_file):
with open(input_file, "r") as f:
self.lines = f.readlines()
self.tokens = []
self.current_token = None
self._tokenize()
def _clean_line(self, line):
line = re.sub(r"\/\/.*", "", line)
line = re.sub(r"\/\*.*?\*\/", "", line)
return line.strip()
def _tokenize_line(self, line):
line = self._clean_line(line)
while line:
if line[0] == '"':
end_index = line.find('"', 1)
token = line[: end_index + 1]
line = line[end_index + 1 :]
else:
match = re.match(r"^\s*(\w+|\d+|.)", line)
token = match.group(1)
line = line[match.end() :]
if token:
self.tokens.append(token)
def _tokenize(self):
for line in self.lines:
self._tokenize_line(line)
def has_more_tokens(self):
return bool(self.tokens)
def advance(self):
if self.has_more_tokens():
self.current_token = self.tokens.pop(0)
else:
self.current_token = None
def token_type(self):
keywords = [
"class",
"constructor",
"function",
"method",
"field",
"static",
"var",
"int",
"char",
"boolean",
"void",
"true",
"false",
"null",
"this",
"let",
"do",
"if",
"else",
"while",
"return",
]
symbols = "{}()[].,;+-*/&|<>=~"
if self.current_token in keywords:
return "keyword"
elif self.current_token in symbols:
return "symbol"
elif re.match(r"^\d+$", self.current_token):
return "integerConstant"
elif re.match(r'^".*"$', self.current_token):
return "stringConstant"
elif re.match(r"^[a-zA-Z_]\w*$", self.current_token):
return "identifier"
def to_xml(self):
xml_output = "<tokens>\n"
while self.has_more_tokens():
self.advance()
token_type = self.token_type()
token_element = "<{0}> {1} </{0}>\n".format(token_type, self.current_token)
xml_output += token_element
xml_output += "</tokens>"
return xml_output
if __name__ == "__main__":
    # Allow the input file to be named on the command line; fall back to
    # the original hard-coded "Main.jack" for backward compatibility.
    import sys

    input_file = sys.argv[1] if len(sys.argv) > 1 else "Main.jack"
    tokenizer = JackTokenizer(input_file)
    xml_output = tokenizer.to_xml()
    print(xml_output)