diff --git a/README.md b/README.md index 1af3c5b..6a80c49 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ # simple_interpreter (A python interpreter implemented in python.) -一个用 python 实现的简单python解释器 - -分版本逐步实现完整功能,适合初学者学习解释器的工作原理 +一个用 python 实现的简单python解释器,分版本(与分支对应)逐步实现一个简单的python解释器功能,适合初学者了解解释器的工作原理 ## 版本说明 +为了方便渐进式学习进度,每一个版本都创建了一个独立的分支,比如 v1.0版本对应的分支名为 v1.0, 该分支只实现了 v1.0 的功能,以此类推,逐步进行功能迭代。 ### v1.0 only support single-digit integers + @@ -45,3 +44,18 @@ support unary operators (+, -) python genastdot.py "5---2" > ast.dot && dot -Tpng -o ast_v8.png ast.dot ``` ![ast_v8.png](ast_v8.png) + + +### v9.0 +support to handle python assignment statements. + +```shell +python interpreter.py assignments.txt +``` + +```shell +python genastdot.py assignments.txt > ast.dot && dot -Tpng -o ast_v9.png ast.dot +``` + +### v10.0 +增加符号表记录变量的定义,以处理使用未定义变量的情况 diff --git a/ast.py b/abs_syntax_tree.py similarity index 66% rename from ast.py rename to abs_syntax_tree.py index 9450aa3..73d7a25 100644 --- a/ast.py +++ b/abs_syntax_tree.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -@file: ast.py +@file: abs_syntax_tree.py @author: amazing coder @date: 2024/8/28 @desc: abstract-syntax tree (AST) 抽象语法树节点定义 @@ -14,7 +14,7 @@ """ -class AST(object): +class AST(object): """ ASTs represent the operator-operand model. 
每一个 AST 节点都代表一个运算符和一个操作数 @@ -51,3 +51,28 @@ class UnaryOp(AST): def __init__(self, op, expr): self.token = self.op = op self.expr = expr + +class Assign(AST): + """ + Assignment node (non-leaf): represents one assignment operator. + E.g. in a = 2, the variable a and the value 2 are leaf nodes and = is the assignment node. + """ + def __init__(self, left, op, right): + self.left = left + self.token = self.op = op + self.right = right + +class Var(AST): + """ + Variable node (leaf): represents a single variable. + """ + def __init__(self, token): + self.token = token + self.value = token.value + +class NoOp(AST): + pass + +class Compound(AST): + def __init__(self): + self.children = [] \ No newline at end of file diff --git a/assignments.txt b/assignments.txt new file mode 100644 index 0000000..f5cba79 --- /dev/null +++ b/assignments.txt @@ -0,0 +1,5 @@ +a=1.34 +b=2 +c=a+b +d=a+b-c +e=45+f \ No newline at end of file diff --git a/ast_v9.png b/ast_v9.png new file mode 100644 index 0000000..7089939 Binary files /dev/null and b/ast_v9.png differ diff --git a/func_test.py b/func_test.py new file mode 100644 index 0000000..6d0a32a --- /dev/null +++ b/func_test.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +@file: func_test.py +@author: amazing coder +@date: 2024/8/29 +@desc: +""" +from spi_token import Token +from abs_syntax_tree import Num, BinOp, UnaryOp +from interpreter import Analyzer, Parser, Interpreter, INTEGER, MINUS + +def test_unary_op(): + """ + Test unary operators, text=5---2 + """ + text = '5---2' + six_tok = Num(Token(INTEGER, 5)) + one_tok = Num(Token(INTEGER, 2)) + minus_tok = Token(MINUS, '-') + exp_node = BinOp(six_tok, minus_tok, UnaryOp(minus_tok, UnaryOp(minus_tok, one_tok))) + interpreter = Interpreter(None) + print(interpreter.visit(exp_node)) + +def test_analyzer(): + """ + Test the Analyzer by printing every token up to EOF + """ + text = "a=45" + print(text) + analyzer = Analyzer(text) + token = analyzer.get_next_token() + while token.type != 'EOF': + print(token) + token = analyzer.get_next_token() + +def test_interpret_py_statements(): + """ + Test the interpreter on several assignment statements + """ + text = """a=1 + b=2 
c=a+b + d=a+b+c + e=45 + """ + print(text) + print(repr(text)) + analyzer = Analyzer(text) + parser = Parser(analyzer) + interpreter = Interpreter(parser) + interpreter.interpret() + print(interpreter.GLOBAL_SCOPE) + + + +if __name__ == '__main__': + test_interpret_py_statements() \ No newline at end of file diff --git a/genastdot.py b/genastdot.py index 20be6c3..9fd8895 100644 --- a/genastdot.py +++ b/genastdot.py @@ -1,13 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -@file: genastdot.py -@author: amazing coder -@date: 2024/8/28 -@desc: -""" - ############################################################################### # AST visualizer - generates a DOT file for Graphviz. # # # @@ -63,6 +53,43 @@ def visit_UnaryOp(self, node): s = ' node{} -> node{}\n'.format(node._num, node.expr._num) self.dot_body.append(s) + def visit_Compound(self, node): + s = ' node{} [label="Compound"]\n'.format(self.ncount) + self.dot_body.append(s) + node._num = self.ncount + self.ncount += 1 + + for child in node.children: + self.visit(child) + s = ' node{} -> node{}\n'.format(node._num, child._num) + self.dot_body.append(s) + + def visit_Assign(self, node): + s = ' node{} [label="{}"]\n'.format(self.ncount, node.op.value) + self.dot_body.append(s) + node._num = self.ncount + self.ncount += 1 + + self.visit(node.left) + self.visit(node.right) + + for child_node in (node.left, node.right): + s = ' node{} -> node{}\n'.format(node._num, child_node._num) + self.dot_body.append(s) + + def visit_Var(self, node): + s = ' node{} [label="{}"]\n'.format(self.ncount, node.value) + self.dot_body.append(s) + node._num = self.ncount + self.ncount += 1 + + def visit_NoOp(self, node): + s = ' node{} [label="NoOp"]\n'.format(self.ncount) + self.dot_body.append(s) + node._num = self.ncount + self.ncount += 1 + + def gendot(self): tree = self.parser.parse() self.visit(tree) @@ -74,11 +101,12 @@ def main(): description='Generate an AST DOT file.' 
) argparser.add_argument( - 'text', - help='Arithmetic expression (in quotes): "1 + 2 * 3"' + 'fname', + help='Pascal source file' ) args = argparser.parse_args() - text = args.text + fname = args.fname + text = open(fname, 'r').read() lexer = Analyzer(text) parser = Parser(lexer) diff --git a/genptdot.py b/genptdot.py index 2198cd1..f4792c8 100644 --- a/genptdot.py +++ b/genptdot.py @@ -59,7 +59,7 @@ def eat(self, token_type): # type and if they match then "eat" the current token # and assign the next token to the self.current_token, # otherwise raise an exception. - if self.current_token.type == token_type: + if self.current_token.symbol_type == token_type: self.current_node.add(TokenNode(self.current_token.value)) self.current_token = self.lexer.get_next_token() else: @@ -73,9 +73,9 @@ def factor(self): self.current_node = node token = self.current_token - if token.type == INTEGER: + if token.symbol_type == INTEGER: self.eat(INTEGER) - elif token.type == LPAREN: + elif token.symbol_type == LPAREN: self.eat(LPAREN) self.expr() self.eat(RPAREN) @@ -91,11 +91,11 @@ def term(self): self.factor() - while self.current_token.type in (MUL, DIV): + while self.current_token.symbol_type in (MUL, DIV): token = self.current_token - if token.type == MUL: + if token.symbol_type == MUL: self.eat(MUL) - elif token.type == DIV: + elif token.symbol_type == DIV: self.eat(DIV) self.factor() @@ -119,11 +119,11 @@ def expr(self): self.term() - while self.current_token.type in (PLUS, MINUS): + while self.current_token.symbol_type in (PLUS, MINUS): token = self.current_token - if token.type == PLUS: + if token.symbol_type == PLUS: self.eat(PLUS) - elif token.type == MINUS: + elif token.symbol_type == MINUS: self.eat(MINUS) self.term() diff --git a/interpreter.py b/interpreter.py index 636e5fd..c120f49 100644 --- a/interpreter.py +++ b/interpreter.py @@ -14,25 +14,18 @@ v6.0 : support to evaluates arithmetic expressions that have different operators and parentheses. 
v7.0 : using ASTs represent the operator-operand model of arithmetic expressions. v8.0 : support unary operators (+, -) +v9.0 : support to handle python assignment statements. +v10.0 : handle variable not defined error """ -from ast import BinOp, Num, UnaryOp +import keyword +from abs_syntax_tree import BinOp, Num, UnaryOp, Var, NoOp, Compound, Assign +from spi_token import Token +from spi_symbol import VarSymbol, SymbolTable -INTEGER, PLUS, EOF, MINUS, MUL, DIV, LPAREN, RPAREN = 'INTEGER', 'PLUS', 'EOF', 'MINUS', 'MUL', 'DIV', 'LPAREN', 'RPAREN' -class Token(object): - def __init__(self, type, value): - self.type = type - self.value = value - - def __str__(self): - return 'Token({type}, {value})'.format( - type=self.type, - value=repr(self.value) - ) - - def __repr__(self): - return self.__str__() +INTEGER, FLOAT, PLUS, EOF, MINUS, MUL, DIV, LPAREN, RPAREN, ID, ASSIGN, REPL = 'INTEGER', 'FLOAT', 'PLUS', 'EOF', 'MINUS', 'MUL', 'DIV', 'LPAREN', 'RPAREN', 'ID', 'ASSIGN', 'REPL' +PYTHON_RESERVED_KEYWORDS = {key: Token(key, key) for key in keyword.kwlist} class Analyzer(object): """Lexical analyzer 表达式的语法解析器,用于将表达式解析成token流""" @@ -42,7 +35,7 @@ def __init__(self, text): self.current_char = self.text[self.pos] def error(self): - return Exception("Invalid input") + raise SyntaxError("invalid syntax") def advance(self): """Advance the 'pos' pointer and set the 'current_char' variable.""" @@ -54,16 +47,42 @@ def advance(self): def skip_whitespace(self): """Skip whitespace, tab, newline.""" - while self.current_char is not None and self.current_char == ' ': + while self.current_char is not None and self.current_char.isspace(): self.advance() - def integer(self): + def peek(self): + peek_pos = self.pos + 1 + if peek_pos > len(self.text) - 1: + return None + else: + return self.text[peek_pos] + + def number(self): """return a multi-digit integer""" result = '' while self.current_char is not None and self.current_char.isdigit(): result += self.current_char self.advance() - 
return int(result) + if self.current_char == '.': + result += self.current_char + self.advance() + while self.current_char is not None and self.current_char.isdigit(): + result += self.current_char + self.advance() + return float(result) + else: + return int(result) + + def identifier(self): + """return a multi-digit identifier""" + result = '' + while self.current_char is not None and self.current_char.isalnum(): + result += self.current_char + self.advance() + + if result in PYTHON_RESERVED_KEYWORDS: + return self.error() + return Token(ID, result) def get_next_token(self): """this function breaking a sentence apart into tokens.""" @@ -72,7 +91,8 @@ def get_next_token(self): self.skip_whitespace() continue if self.current_char.isdigit(): - return Token(INTEGER, self.integer()) + number = self.number() + return Token(INTEGER, number) if isinstance(number, int) else Token(FLOAT, number) if self.current_char == '+': self.advance() return Token(PLUS, '+') @@ -91,6 +111,15 @@ def get_next_token(self): if self.current_char == ')': self.advance() return Token(RPAREN, ')') + if self.current_char.isalpha(): + return self.identifier() + if self.current_char == '=': + self.advance() + return Token(ASSIGN, '=') + if self.current_char == '\\' and self.peek() == 'n': + self.advance() + self.advance() + return Token(REPL, '\\n') self.error() return Token(EOF, None) @@ -125,25 +154,62 @@ def term(self): node = BinOp(left=node, op=token, right=self.factor()) return node - def factor(self): - """返回参与运算的数,支持整型或者带括号的表达式 INTEGER | LPAREN expr RPAREN | (PLUS|MINUS) factor""" + def variable(self): + node = Var(self.current_token) + self.eat(ID) + return node + + def empty(self): + return NoOp() + + def assignment_statement(self): + """ + assignment_statement : variable ASSIGN expr + """ + left = self.variable() token = self.current_token - if self.current_token.type == PLUS: - self.eat(PLUS) - return UnaryOp(op=token, expr=self.factor()) - elif self.current_token.type == MINUS: - 
self.eat(MINUS) - return UnaryOp(op=token, expr=self.factor()) - elif self.current_token.type == INTEGER: - self.eat(INTEGER) - return Num(token) - elif self.current_token.type == LPAREN: - self.eat(LPAREN) - node = self.expr() - self.eat(RPAREN) - return node + self.eat(ASSIGN) + right = self.expr() + node = Assign(left=left, op=token, right=right) + return node + + def statement(self): + """statement : assignment_statement | empty""" + if self.current_token.type == ID: + node = self.assignment_statement() else: - self.error() + node = self.empty() + return node + + def statements(self): + """ + statements : statement + | statement REPL statement_list + """ + node = self.statement() + results = [node] + while self.current_token.type == ID: + results.append(self.statement()) + + return results + + def compound_statement(self): + """ + compound_statement : statement_list + """ + # self.eat(REPL) + nodes = self.statements() + # self.eat(REPL) + + root = Compound() + for node in nodes: + root.children.append(node) + return root + + def program(self): + """program : compound_statement """ + node = self.compound_statement() + return node def expr(self): """表达式解析:term((PLUS|MINUS) term)* . 
@@ -161,8 +227,37 @@ def expr(self): node = BinOp(left=node, op=token, right=self.term()) return node + def factor(self): + """返回参与运算的数,支持整型或者带括号的表达式 INTEGER | LPAREN expr RPAREN | (PLUS|MINUS) factor | variable""" + token = self.current_token + if self.current_token.type == PLUS: + self.eat(PLUS) + return UnaryOp(op=token, expr=self.factor()) + elif self.current_token.type == MINUS: + self.eat(MINUS) + return UnaryOp(op=token, expr=self.factor()) + elif self.current_token.type == INTEGER: + self.eat(INTEGER) + return Num(token) + elif self.current_token.type == FLOAT: + self.eat(FLOAT) + return Num(token) + elif self.current_token.type == LPAREN: + self.eat(LPAREN) + node = self.expr() + self.eat(RPAREN) + return node + elif self.current_token.type == ID: + node = self.variable() + return node + else: + self.error() + def parse(self): - return self.expr() + node = self.program() + if self.current_token.type != EOF: + self.error() + return node class NodeVisitor(object): @@ -178,6 +273,7 @@ def generic_visit(self, node): class Interpreter(NodeVisitor): def __init__(self, parser): self.parser = parser + self.GLOBAL_SCOPE = {} def visit_BinOp(self, node): if node.op.type == PLUS: @@ -198,6 +294,25 @@ def visit_UnaryOp(self, node): elif node.op.type == MINUS: return -self.visit(node.expr) + def visit_Assign(self, node): + var_name = node.left.value + self.GLOBAL_SCOPE[var_name] = self.visit(node.right) + + def visit_NoOp(self, node): + pass + + def visit_Compound(self, node): + for child in node.children: + self.visit(child) + + def visit_Var(self, node): + var_name = node.value + val = self.GLOBAL_SCOPE.get(var_name) + if val is None: + raise NameError(repr(var_name)) + else: + return val + def visit(self, node): if isinstance(node, BinOp): return self.visit_BinOp(node) @@ -205,41 +320,93 @@ def visit(self, node): return self.visit_Num(node) elif isinstance(node, UnaryOp): return self.visit_UnaryOp(node) + elif isinstance(node, Var): + return self.visit_Var(node) + 
elif isinstance(node, Assign): + return self.visit_Assign(node) + elif isinstance(node, Compound): + return self.visit_Compound(node) + elif isinstance(node, NoOp): + return self.visit_NoOp(node) def interpret(self): tree = self.parser.parse() + symbol_builder = SymbolTableBuilder() + symbol_builder.visit(tree) return self.visit(tree) -def test_unary_op(): - """ - 测试一元运算符, text=6---1 - """ - text = '5---2' - six_tok = Num(Token(INTEGER, 5)) - one_tok = Num(Token(INTEGER, 2)) - minus_tok = Token(MINUS, '-') - exp_node = BinOp(six_tok, minus_tok, UnaryOp(minus_tok, UnaryOp(minus_tok, one_tok))) - interpreter = Interpreter(None) - print(interpreter.visit(exp_node)) +class SymbolTableBuilder(NodeVisitor): + def __init__(self): + self.symtab = SymbolTable() + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) -def main(): - test_unary_op() - while True: - try: - text = input('input a express like "10+2*3+16/(4+4)-(3-2)*2"(Only single digit integers are allowed in the input)> ') - except EOFError: - break + def visit_Num(self, node): + pass - if not text: - continue - analyzer = Analyzer(text) - parser = Parser(analyzer) - interpreter = Interpreter(parser) - print(interpreter.interpret()) + def visit_UnaryOp(self, node): + self.visit(node.expr) + + def visit_Compound(self, node): + for child in node.children: + self.visit(child) + + def visit_NoOp(self, node): + pass + + def visit_VarDecl(self, node): + type_name = node.type_node.value + type_symbol = self.symtab.lookup(type_name) + var_name = node.var_node.value + var_symbol = VarSymbol(var_name, type_symbol) + self.symtab.define(var_symbol) + + def visit_Assign(self, node): + # python代码中赋值就是定义,没有声明 + var_name = node.left.value + var_symbol = VarSymbol(var_name, None) + self.symtab.define(var_symbol) + self.visit(node.right) + + def visit_Var(self, node): + var_name = node.value + var_symbol = self.symtab.lookup(var_name) + if var_symbol is None: + raise NameError(repr(var_name)) + + def 
visit(self, node): + if isinstance(node, BinOp): + return self.visit_BinOp(node) + elif isinstance(node, Num): + return self.visit_Num(node) + elif isinstance(node, UnaryOp): + return self.visit_UnaryOp(node) + elif isinstance(node, Var): + return self.visit_Var(node) + elif isinstance(node, Assign): + return self.visit_Assign(node) + elif isinstance(node, Compound): + return self.visit_Compound(node) + elif isinstance(node, NoOp): + return self.visit_NoOp(node) + + + +def main(): + import sys + py_file = sys.argv[1] + # py_file = 'assignments.txt' + text = open(py_file, 'r').read() + print(f"begin parse input: {text}") + lexer = Analyzer(text) + parser = Parser(lexer) + interpreter = Interpreter(parser) + result = interpreter.interpret() + print(interpreter.GLOBAL_SCOPE) if __name__ == '__main__': main() - diff --git a/spi_symbol.py b/spi_symbol.py new file mode 100644 index 0000000..791cd08 --- /dev/null +++ b/spi_symbol.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +@file: spi_symbol.py +@author: amazing coder +@date: 2024/8/31 +@desc: add generic symbol classes +""" + + +class Symbol(object): + def __init__(self, name, type=None): + self.name = name + self.symbol_type = type + + +class BuiltinTypeSymbol(Symbol): + def __init__(self, name): + super().__init__(name) + + def __str__(self): + return self.name + + __repr__ = __str__ + + +class VarSymbol(Symbol): + def __init__(self, name, type=None): + # a Python definition does not need to specify a type + super().__init__(name, type) + + def __str__(self): + return f'VarSymbol:name={self.name}: type={self.symbol_type}' + + __repr__ = __str__ + + +class SymbolTable(object): + def __init__(self): + self._symbols = {} + + def __str__(self): + return 'Symbols: {symbols}'.format(symbols=[value for value in self._symbols.values()]) + + __repr__ = __str__ + + def define(self, symbol): + print('Define: %s' % symbol) + self._symbols[symbol.name] = symbol + return symbol + + def lookup(self, name): + print('Lookup: %s' % name) + symbol = 
self._symbols.get(name) + return symbol + + +def test_class(): + int_type = BuiltinTypeSymbol('INTEGER') + float_type = BuiltinTypeSymbol('FLOAT') + var_x = VarSymbol('x', int_type) + var_y = VarSymbol('y', float_type) + print(var_x) + print(var_y) + + +if __name__ == '__main__': + test_class() diff --git a/spi_token.py b/spi_token.py new file mode 100644 index 0000000..87bada4 --- /dev/null +++ b/spi_token.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +@file: spi_token.py +@author: amazing coder +@date: 2024/8/29 +@desc: +""" + + +class Token(object): + def __init__(self, type, value): + self.type = type + self.value = value + + def __str__(self): + return 'Token({type}, {value})'.format( + type=self.type, + value=repr(self.value) + ) + + def __repr__(self): + return self.__str__()