aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler.c344
1 files changed, 344 insertions, 0 deletions
diff --git a/compiler.c b/compiler.c
new file mode 100644
index 0000000..c1086d4
--- /dev/null
+++ b/compiler.c
@@ -0,0 +1,344 @@
+/* Simple expression to x86_64 compiler.
+ *
+ * I made it to test out an idea that I had for reaching
+ * the design goals of the C compiler that I am building
+ * which are: very low memory usage, no reliance on
+ * complex memory allocators, no reliance on the C
+ * standard library and no operating system-specific
+ * requirements.
+ *
+ * The first expression parser and generator I made for
+ * my compiler was based on an Abstract Syntax Tree (AST),
+ * where it would parse the source code into a tree
+ * structure and then recursively descent the tree and
+ * generate code for each branch. This works fine, but
+ * trees are often quite wasteful on memory (even in
+ * my case where the tree is fully static) and in the
+ * cast of ASTs they waste stack during recursion
+ * because there ends up being a lot of "empty" nodes
+ * that only check a single parameter before descending.
+ *
+ * What follows is an implementation of "interpreter-based
+ * compilation" (someone has probably already done this
+ * and called it something else, but I haven't seen
+ * it). The idea is instead of building a tree, the
+ * compiler builds a buffer of bytecode. That bytecode
+ * is then run inside a stack-based virtual machine in
+ * a similar way to how the Lua interpreter works. The
+ * difference here is that instead of producing a result,
+ * the VM outputs assembly code. The advantages of it
+ * is that the compiler can be made very lightweight and
+ * fast because a lot of assumptions can be made about
+ * the state of the VM stack at any given time which
+ * makes operator precedence simpler.
+ *
+ * This implementation supports operator-precedence
+ * and two operators (+ and *) and 64 bit signed
+ * integers. It outputs x86_64 assembly in AT&T
+ * syntax. It's very primitive and does almost zero
+ * error checking; the goal was to verify that the
+ * concept works.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define max_ops 32
+#define max_stack 32
+
+static const char* regs[] = {
+ "rax",
+ "r9"
+};
+
+typedef enum {
+ op_pushc,
+ op_add,
+ op_mul,
+ op_ret
+} Op;
+
+typedef enum {
+ tok_number,
+ tok_plus,
+ tok_star,
+ tok_error,
+ tok_end
+} Tok_Type;
+
+typedef struct {
+ Tok_Type type;
+ int len;
+ const char* start;
+} Tok;
+
+typedef enum {
+ stack_type_constexpr,
+ stack_type_expr
+} Stack_Type;
+
+typedef struct {
+ Stack_Type type;
+ unsigned char reg;
+ unsigned char idx;
+} Stack_Value;
+
+typedef struct {
+ const char* src;
+ const char* cur;
+ const char* start;
+ Tok tok;
+ Stack_Value* stack_top;
+ int op_count, num_count, r;
+ unsigned char ops[max_ops];
+ int numbers[(unsigned char)-1];
+ Stack_Value stack[max_stack];
+} Compiler;
+
+int get_prec(Tok_Type type) {
+ switch (type) {
+ case tok_number:
+ return 0;
+ case tok_plus:
+ return 1;
+ case tok_star:
+ return 2;
+ default:
+ return -1;
+ }
+}
+
+int is_digit(char c) {
+ return c >= '0' && c <= '9';
+}
+
+char peek(Compiler* c) {
+ return *c->cur;
+}
+
+char advance(Compiler* c) {
+ c->cur++;
+ return c->cur[-1];
+}
+
+void skip_whitespace(Compiler* c) {
+ while (1) {
+ switch (peek(c)) {
+ case ' ':
+ case '\t':
+ case '\n':
+ advance(c);
+ break;
+ default:
+ return;
+ }
+ }
+}
+
+Tok new_tok(Compiler* c, Tok_Type type) {
+ Tok t;
+ t.type = type;
+ t.start = c->start - 1;
+ t.len = (c->cur - c->start) + 1;
+ return t;
+}
+
+Tok new_err_tok(Compiler* c, const char* msg) {
+ Tok t;
+ t.type = tok_error;
+ t.start = msg;
+ t.len = -1;
+ return t;
+}
+
+Tok next_tok(Compiler* c) {
+ char ch;
+ skip_whitespace(c);
+ ch = advance(c);
+ c->start = c->cur;
+ if (is_digit(ch)) {
+ while (is_digit(peek(c))) {
+ advance(c);
+ }
+ return new_tok(c, tok_number);
+ }
+ switch (ch) {
+ case '+':
+ return new_tok(c, tok_plus);
+ case '*':
+ return new_tok(c, tok_star);
+ case 0:
+ return new_tok(c, tok_end);
+ default:
+ return new_err_tok(c, "Unexpected character.");
+ }
+}
+
+void emit_byte(Compiler* c, unsigned char b) {
+ c->ops[c->op_count++] = b;
+}
+
+void emit_op(Compiler* c, Op op) {
+ emit_byte(c, (unsigned char)op);
+}
+
+unsigned char emit_number(Compiler* c, int n) {
+ c->numbers[c->num_count] = n;
+ return (unsigned char)c->num_count++;
+}
+
+void push(Compiler* c, Stack_Value v) {
+ *c->stack_top = v;
+ c->stack_top++;
+}
+
+Stack_Value pop(Compiler* c) {
+ c->stack_top--;
+ return *c->stack_top;
+}
+
+void gen_val(Compiler* c, Stack_Value v, unsigned char r) {
+ if (v.type == stack_type_constexpr) {
+ printf(
+ "mov $%d, %%%s\n",
+ c->numbers[v.idx],
+ regs[r]
+ );
+ } else if (v.reg != r) {
+ /*printf(
+ "mov %%rax, %%%s\n",
+ regs[r]
+ );*/
+ }
+}
+
+void generate(Compiler* c) {
+ Op op;
+ int i;
+ Stack_Value v, a, b;
+ i = 0;
+ printf(".globl eval\n");
+ printf(".text\n");
+ printf("eval:\n");
+ while (1) {
+ op = c->ops[i];
+ switch (op) {
+ case op_pushc:
+ v.type = stack_type_constexpr;
+ v.reg = c->ops[i + 1];
+ v.idx = c->ops[i + 2];
+ push(c, v);
+ i += 2;
+ break;
+ case op_add:
+ b = pop(c);
+ a = pop(c);
+ gen_val(c, a, 1);
+ gen_val(c, b, 0);
+ printf(
+ "add %%%s, %%%s\n",
+ regs[1],
+ regs[0]
+ );
+ v.type = stack_type_expr;
+ v.reg = 0;
+ v.idx = -1;
+ push(c, v);
+ break;
+ case op_mul:
+ b = pop(c);
+ a = pop(c);
+ gen_val(c, a, 0);
+ gen_val(c, b, 1);
+ printf(
+ "imul %%%s, %%%s\n",
+ regs[1],
+ regs[0]
+ );
+ v.type = stack_type_expr;
+ v.reg = 0;
+ v.idx = -1;
+ push(c, v);
+ break;
+ break;
+ case op_ret:
+ printf("ret\n");
+ default: return;
+ }
+ i++;
+ }
+}
+
+const Tok* cadvance(Compiler* c) {
+ c->tok = next_tok(c);
+ return &c->tok;
+}
+
+void cnumber(Compiler* c) {
+ int n;
+ n = (int)strtol(c->tok.start, 0, 10);
+ emit_op(c, op_pushc);
+ emit_op(c, c->r);
+ emit_op(c, emit_number(c, n));
+ c->r = !c->r;
+}
+
+void bin_op(Compiler* c, Tok_Type t) {
+ int p, np;
+ const Tok* rhs, * nxt;
+ Tok_Type nt;
+ p = get_prec(t);
+ rhs = cadvance(c);
+ cnumber(c); /* rhs */
+ nxt = cadvance(c);
+ nt = nxt->type;
+ np = get_prec(nt);
+ if (nt == tok_star || nt == tok_plus) {
+ if (p <= np) {
+ bin_op(c, nt);
+ emit_op(c, t);
+ } else {
+ emit_op(c, t);
+ bin_op(c, nt);
+ }
+ } else {
+ emit_op(c, t);
+ }
+}
+
+void compile(const char* src) {
+ const Tok* t;
+ Compiler c;
+ c.src = src;
+ c.cur = src;
+ c.op_count = 0;
+ c.num_count = 0;
+ c.r = 0;
+ c.stack_top = c.stack;
+ do {
+ t = cadvance(&c);
+ if (t->type == tok_error) {
+ fprintf(stderr, "Error: %s\n", t->start);
+ return;
+ }
+ switch (t->type) {
+ case tok_number:
+ cnumber(&c);
+ break;
+ case tok_plus:
+ case tok_star:
+ bin_op(&c, t->type);
+ break;
+ default: break;
+ }
+ } while (t->type != tok_end);
+
+ emit_op(&c, op_ret);
+ generate(&c);
+}
+
+int main() {
+ const char* expr = "6 + 2 + 5 * 10";
+ compile(expr);
+ return 0;
+}