From 42020fb6c9c0aae18bec1241ad31c3ff854c3f03 Mon Sep 17 00:00:00 2001 From: quou Date: Sat, 27 May 2023 10:27:16 +1000 Subject: Add expression compiler. --- compiler.c | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 compiler.c diff --git a/compiler.c b/compiler.c new file mode 100644 index 0000000..c1086d4 --- /dev/null +++ b/compiler.c @@ -0,0 +1,344 @@ +/* Simple expression to x86_64 compiler. + * + * I made it to test out an idea that I had for reaching + * the design goals of the C compiler that I am building + * which are: very low memory usage, no reliance on + * complex memory allocators, no reliance on the C + * standard library and no operating system-specific + * requirements. + * + * The first expression parser and generator I made for + * my compiler was based on an Abstract Syntax Tree (AST), + * where it would parse the source code into a tree + * structure and then recursively descent the tree and + * generate code for each branch. This works fine, but + * trees are often quite wasteful on memory (even in + * my case where the tree is fully static) and in the + * cast of ASTs they waste stack during recursion + * because there ends up being a lot of "empty" nodes + * that only check a single parameter before descending. + * + * What follows is an implementation of "interpreter-based + * compilation" (someone has probably already done this + * and called it something else, but I haven't seen + * it). The idea is instead of building a tree, the + * compiler builds a buffer of bytecode. That bytecode + * is then run inside a stack-based virtual machine in + * a similar way to how the Lua interpreter works. The + * difference here is that instead of producing a result, + * the VM outputs assembly code. The advantages of it + * is that the compiler can be made very lightweight and + * fast because a lot of assumptions can be made about + * the state of the VM stack at any given time which + * makes operator precedence simpler. + * + * This implementation supports operator-precedence + * and two operators (+ and *) and 64 bit signed + * integers. It outputs x86_64 assembly in AT&T + * syntax. It's very primitive and does almost zero + * error checking; the goal was to verify that the + * concept works. + */ + +#include +#include + +#define max_ops 32 +#define max_stack 32 + +static const char* regs[] = { + "rax", + "r9" +}; + +typedef enum { + op_pushc, + op_add, + op_mul, + op_ret +} Op; + +typedef enum { + tok_number, + tok_plus, + tok_star, + tok_error, + tok_end +} Tok_Type; + +typedef struct { + Tok_Type type; + int len; + const char* start; +} Tok; + +typedef enum { + stack_type_constexpr, + stack_type_expr +} Stack_Type; + +typedef struct { + Stack_Type type; + unsigned char reg; + unsigned char idx; +} Stack_Value; + +typedef struct { + const char* src; + const char* cur; + const char* start; + Tok tok; + Stack_Value* stack_top; + int op_count, num_count, r; + unsigned char ops[max_ops]; + int numbers[(unsigned char)-1]; + Stack_Value stack[max_stack]; +} Compiler; + +int get_prec(Tok_Type type) { + switch (type) { + case tok_number: + return 0; + case tok_plus: + return 1; + case tok_star: + return 2; + default: + return -1; + } +} + +int is_digit(char c) { + return c >= '0' && c <= '9'; +} + +char peek(Compiler* c) { + return *c->cur; +} + +char advance(Compiler* c) { + c->cur++; + return c->cur[-1]; +} + +void skip_whitespace(Compiler* c) { + while (1) { + switch (peek(c)) { + case ' ': + case '\t': + case '\n': + advance(c); + break; + default: + return; + } + } +} + +Tok new_tok(Compiler* c, Tok_Type type) { + Tok t; + t.type = type; + t.start = c->start - 1; + t.len = (c->cur - c->start) + 1; + return t; +} + +Tok new_err_tok(Compiler* c, const char* msg) { + Tok t; + t.type = tok_error; + t.start = msg; + t.len = -1; + return t; +} + +Tok next_tok(Compiler* c) { + char ch; + skip_whitespace(c); + ch = advance(c); + c->start = c->cur; + if (is_digit(ch)) { + while (is_digit(peek(c))) { + advance(c); + } + return new_tok(c, tok_number); + } + switch (ch) { + case '+': + return new_tok(c, tok_plus); + case '*': + return new_tok(c, tok_star); + case 0: + return new_tok(c, tok_end); + default: + return new_err_tok(c, "Unexpected character."); + } +} + +void emit_byte(Compiler* c, unsigned char b) { + c->ops[c->op_count++] = b; +} + +void emit_op(Compiler* c, Op op) { + emit_byte(c, (unsigned char)op); +} + +unsigned char emit_number(Compiler* c, int n) { + c->numbers[c->num_count] = n; + return (unsigned char)c->num_count++; +} + +void push(Compiler* c, Stack_Value v) { + *c->stack_top = v; + c->stack_top++; +} + +Stack_Value pop(Compiler* c) { + c->stack_top--; + return *c->stack_top; +} + +void gen_val(Compiler* c, Stack_Value v, unsigned char r) { + if (v.type == stack_type_constexpr) { + printf( + "mov $%d, %%%s\n", + c->numbers[v.idx], + regs[r] + ); + } else if (v.reg != r) { + /*printf( + "mov %%rax, %%%s\n", + regs[r] + );*/ + } +} + +void generate(Compiler* c) { + Op op; + int i; + Stack_Value v, a, b; + i = 0; + printf(".globl eval\n"); + printf(".text\n"); + printf("eval:\n"); + while (1) { + op = c->ops[i]; + switch (op) { + case op_pushc: + v.type = stack_type_constexpr; + v.reg = c->ops[i + 1]; + v.idx = c->ops[i + 2]; + push(c, v); + i += 2; + break; + case op_add: + b = pop(c); + a = pop(c); + gen_val(c, a, 1); + gen_val(c, b, 0); + printf( + "add %%%s, %%%s\n", + regs[1], + regs[0] + ); + v.type = stack_type_expr; + v.reg = 0; + v.idx = -1; + push(c, v); + break; + case op_mul: + b = pop(c); + a = pop(c); + gen_val(c, a, 0); + gen_val(c, b, 1); + printf( + "imul %%%s, %%%s\n", + regs[1], + regs[0] + ); + v.type = stack_type_expr; + v.reg = 0; + v.idx = -1; + push(c, v); + break; + break; + case op_ret: + printf("ret\n"); + default: return; + } + i++; + } +} + +const Tok* cadvance(Compiler* c) { + c->tok = next_tok(c); + return &c->tok; +} + +void cnumber(Compiler* c) { + int n; + n = (int)strtol(c->tok.start, 0, 10); + emit_op(c, op_pushc); + emit_op(c, c->r); + emit_op(c, emit_number(c, n)); + c->r = !c->r; +} + +void bin_op(Compiler* c, Tok_Type t) { + int p, np; + const Tok* rhs, * nxt; + Tok_Type nt; + p = get_prec(t); + rhs = cadvance(c); + cnumber(c); /* rhs */ + nxt = cadvance(c); + nt = nxt->type; + np = get_prec(nt); + if (nt == tok_star || nt == tok_plus) { + if (p <= np) { + bin_op(c, nt); + emit_op(c, t); + } else { + emit_op(c, t); + bin_op(c, nt); + } + } else { + emit_op(c, t); + } +} + +void compile(const char* src) { + const Tok* t; + Compiler c; + c.src = src; + c.cur = src; + c.op_count = 0; + c.num_count = 0; + c.r = 0; + c.stack_top = c.stack; + do { + t = cadvance(&c); + if (t->type == tok_error) { + fprintf(stderr, "Error: %s\n", t->start); + return; + } + switch (t->type) { + case tok_number: + cnumber(&c); + break; + case tok_plus: + case tok_star: + bin_op(&c, t->type); + break; + default: break; + } + } while (t->type != tok_end); + + emit_op(&c, op_ret); + generate(&c); +} + +int main() { + const char* expr = "6 + 2 + 5 * 10"; + compile(expr); + return 0; +} -- cgit v1.2.3-54-g00ecf