编译器可以粗略地分为几个简单的部分:词法分析器、语法分析器、代码生成器。其中语义分析器、代码优化器这些不影响本文示例功能。另外,本文的编译器生成的是字节码,因此还需要一个虚拟机来执行。
Lexer 词法分析器
Lexer
的功能是将代码串不同的符号进行标记分组,例如(3 + 5) * 2
可以拆分为不同的token
,分别是(
、3
、+
、5
、)
、*
、2
,包含五个分类,有左括号、右括号、加号、乘号以及整数,其中的空白符号需要忽略。
class Lexer
{
private $input;
private $pos = 0;
private $currentChar;
public function __construct($input)
{
$this->input = $input;
$this->currentChar = $input[0] ?? null;
}
private function advance()
{
$this->pos++;
$this->currentChar = $this->input[$this->pos] ?? null;
}
private function skipWhitespace()
{
while ($this->currentChar !== null && ctype_space($this->currentChar)) {
$this->advance();
}
}
private function integer()
{
$result = '';
while ($this->currentChar !== null && ctype_digit($this->currentChar)) {
$result .= $this->currentChar;
$this->advance();
}
return (int)$result;
}
private function identifier()
{
$result = '';
while ($this->currentChar !== null && (ctype_alnum($this->currentChar) || $this->currentChar === '_')) {
$result .= $this->currentChar;
$this->advance();
}
return $result;
}
public function getNextToken()
{
while ($this->currentChar !== null) {
// 忽略空白符
if (ctype_space($this->currentChar)) {
$this->skipWhitespace();
continue;
}
// 当前字符为整型时,扫描后续部分的整型字符
if (ctype_digit($this->currentChar)) {
return ['type' => 'INTEGER', 'value' => $this->integer()];
}
// 当前字符为字母时,扫描后续部分的字母或数字字符
if (ctype_alpha($this->currentChar)) {
$id = $this->identifier();
// 如果扫描到的 token 为 print,则将其标记为关键词 print
if (strtoupper($id) === 'PRINT') {
return ['type' => 'PRINT', 'value' => $id];
}
return ['type' => 'ID', 'value' => $id];
}
// 普通的符号
switch ($this->currentChar) {
case '=':
$this->advance();
return ['type' => 'ASSIGN', 'value' => '='];
case '+':
$this->advance();
return ['type' => 'PLUS', 'value' => '+'];
case '-':
$this->advance();
return ['type' => 'MINUS', 'value' => '-'];
case '*':
$this->advance();
return ['type' => 'MUL', 'value' => '*'];
case '/':
$this->advance();
return ['type' => 'DIV', 'value' => '/'];
case ';':
$this->advance();
return ['type' => 'SEMI', 'value' => ';'];
case '(':
$this->advance();
return ['type' => 'LPAREN', 'value' => '('];
case ')':
$this->advance();
return ['type' => 'RPAREN', 'value' => ')'];
default:
throw new Exception("Invalid character: " . $this->currentChar);
}
}
return ['type' => 'EOF', 'value' => null];
}
}
Parser 语法分析器
Parser
遍历token
,生成AST
。
class Parser
{
private $lexer;
private $currentToken;
public function __construct($lexer)
{
$this->lexer = $lexer;
$this->currentToken = $lexer->getNextToken();
}
// 用于消费一个指定类型的 token
private function eat($tokenType)
{
if ($this->currentToken['type'] === $tokenType) {
$this->currentToken = $this->lexer->getNextToken();
} else {
throw new Exception("Syntax error: expected $tokenType");
}
}
// 第一优先级的 token,分别是整型、标识符(这里是变量)、左括号(左括号会递归解释括号中的表达式)
private function factor()
{
$token = $this->currentToken;
if ($token['type'] === 'INTEGER') {
$this->eat('INTEGER');
return ['type' => 'NumberLiteral', 'value' => $token['value']];
}
if ($token['type'] === 'ID') {
$this->eat('ID');
return ['type' => 'Variable', 'name' => $token['value']];
}
if ($token['type'] === 'LPAREN') {
$this->eat('LPAREN');
$node = $this->expr();
$this->eat('RPAREN');
return $node;
}
throw new Exception("Unexpected factor: " . $token['type']);
}
// 第二优先级。乘除法,解释为一个二元表达式
private function term()
{
$node = $this->factor();
while (in_array($this->currentToken['type'], ['MUL', 'DIV'])) {
$token = $this->currentToken;
$this->eat($token['type']);
$node = [
'type' => 'BinaryOp',
'left' => $node,
'op' => $token['value'],
'right' => $this->factor()
];
}
return $node;
}
// 第三优先级。加减法,解释为一个二元表达式
private function expr()
{
$node = $this->term();
while (in_array($this->currentToken['type'], ['PLUS', 'MINUS'])) {
$token = $this->currentToken;
$this->eat($token['type']);
$node = [
'type' => 'BinaryOp',
'left' => $node,
'op' => $token['value'],
'right' => $this->term()
];
}
return $node;
}
// 目前只有两种语句啦:print 语句;赋值表达式
private function statement()
{
if ($this->currentToken['type'] === 'PRINT') {
$this->eat('PRINT');
$expr = $this->expr();
$this->eat('SEMI');
return ['type' => 'PrintStatement', 'expr' => $expr];
}
if ($this->currentToken['type'] === 'ID') {
$varName = $this->currentToken['value'];
$this->eat('ID');
$this->eat('ASSIGN');
$expr = $this->expr();
$this->eat('SEMI');
return ['type' => 'Assignment', 'name' => $varName, 'expr' => $expr];
}
throw new Exception("Invalid statement");
}
public function parse()
{
$statements = [];
while ($this->currentToken['type'] !== 'EOF') {
$statements[] = $this->statement();
}
return ['type' => 'Program', 'body' => $statements];
}
}
Code Generator 代码生成器
递归遍历AST
,生成字节码。
class CodeGenerator
{
private $ast;
private $instructions = [];
private $varIndex = 0;
private $varMap = [];
public function __construct($ast)
{
$this->ast = $ast;
}
private function generateExpr($node)
{
switch ($node['type']) {
case 'NumberLiteral':
$this->instructions[] = ['op' => 'PUSH', 'value' => $node['value']];
break;
case 'Variable':
if (!isset($this->varMap[$node['name']])) {
$this->varMap[$node['name']] = $this->varIndex++;
}
$this->instructions[] = ['op' => 'LOAD', 'address' => $this->varMap[$node['name']]];
break;
case 'BinaryOp':
$this->generateExpr($node['left']);
$this->generateExpr($node['right']);
switch ($node['op']) {
case '+':
$this->instructions[] = ['op' => 'ADD'];
break;
case '-':
$this->instructions[] = ['op' => 'SUB'];
break;
case '*':
$this->instructions[] = ['op' => 'MUL'];
break;
case '/':
$this->instructions[] = ['op' => 'DIV'];
break;
}
break;
}
}
public function generate()
{
foreach ($this->ast['body'] as $stmt) {
switch ($stmt['type']) {
case 'PrintStatement':
$this->generateExpr($stmt['expr']);
$this->instructions[] = ['op' => 'PRINT'];
break;
case 'Assignment':
$this->generateExpr($stmt['expr']);
if (!isset($this->varMap[$stmt['name']])) {
$this->varMap[$stmt['name']] = $this->varIndex++;
}
$this->instructions[] = ['op' => 'STORE', 'address' => $this->varMap[$stmt['name']]];
break;
}
}
return $this->instructions;
}
}
VM 虚拟机
VM
执行Code Generator
一步生成的字节码。
class VM
{
private $code;
private $stack = [];
private $memory = [];
private $pc = 0;
public function __construct($code)
{
$this->code = $code;
}
public function run()
{
while ($this->pc < count($this->code)) {
$instr = $this->code[$this->pc++];
switch ($instr['op']) {
case 'PUSH':
$this->stack[] = $instr['value'];
break;
case 'LOAD':
$this->stack[] = $this->memory[$instr['address']] ?? 0;
break;
case 'STORE':
$value = array_pop($this->stack);
$this->memory[$instr['address']] = $value;
break;
case 'ADD':
$b = array_pop($this->stack);
$a = array_pop($this->stack);
$this->stack[] = $a + $b;
break;
case 'SUB':
$b = array_pop($this->stack);
$a = array_pop($this->stack);
$this->stack[] = $a - $b;
break;
case 'MUL':
$b = array_pop($this->stack);
$a = array_pop($this->stack);
$this->stack[] = $a * $b;
break;
case 'DIV':
$b = array_pop($this->stack);
$a = array_pop($this->stack);
$this->stack[] = $a / $b;
break;
case 'PRINT':
echo array_pop($this->stack) . PHP_EOL;
break;
}
}
}
}
测试代码
$input = <<<CODE
print (3 + 5) * 2;
a = 10 - 2;
print a * 3;
b = a * 5;
print b;
CODE;
try {
// 词法分析
$lexer = new Lexer($input);
// 语法分析生成AST
$parser = new Parser($lexer);
$ast = $parser->parse();
// 生成中间代码
$generator = new CodeGenerator($ast);
$code = $generator->generate();
// 虚拟机执行
$vm = new VM($code);
$vm->run();
} catch (Exception $e) {
echo "Error: " . $e->getMessage();
}