aboutsummaryrefslogtreecommitdiff
path: root/compiler.c
blob: e0259920f1854359f17b67964b830b6263916f56 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/* Simple expression to x86_64 compiler.
 *
 * I made it to test out an idea that I had for reaching
 * the design goals of the C compiler that I am building
 * which are: very low memory usage, no reliance on
 * complex memory allocators, no reliance on the C
 * standard library and no operating system-specific
 * requirements.
 *
 * The first expression parser and generator I made for
 * my compiler was based on an Abstract Syntax Tree (AST),
 * where it would parse the source code into a tree
 * structure and then recursively descent the tree and
 * generate code for each branch. This works fine, but
 * trees are often quite wasteful on memory (even in
 * my case where the tree is fully static) and in the
 * cast of ASTs they waste stack during recursion
 * because there ends up being a lot of "empty" nodes
 * that only check a single parameter before descending.
 *
 * What follows is an implementation of "interpreter-based
 * compilation" (someone has probably already done this
 * and called it something else, but I haven't seen
 * it). The idea is instead of building a tree, the
 * compiler builds a buffer of bytecode. That bytecode
 * is then run inside a stack-based virtual machine in
 * a similar way to how the Lua interpreter works. The
 * difference here is that instead of producing a result,
 * the VM outputs assembly code. The advantages of it
 * is that the compiler can be made very lightweight and
 * fast because a lot of assumptions can be made about
 * the state of the VM stack at any given time which
 * makes operator precedence simpler.
 *
 * This implementation supports operator-precedence
 * and two operators (+ and *) and 64 bit signed
 * integers. It outputs x86_64 assembly in AT&T
 * syntax. It's very primitive and does almost zero
 * error checking; the goal was to verify that the
 * concept works.
 */

#include <stdio.h>
#include <stdlib.h>

#define max_ops 32
#define max_stack 32

static const char* regs[] = {
	"rax",
	"r9"
};

typedef enum {
	op_pushc,
	op_add,
	op_mul,
	op_ret
} Op;

typedef enum {
	tok_number,
	tok_plus,
	tok_star,
	tok_error,
	tok_end
} Tok_Type;

typedef struct {
	Tok_Type type;
	int len;
	const char* start;
} Tok;

typedef enum {
	stack_type_constexpr,
	stack_type_expr
} Stack_Type;

typedef struct {
	Stack_Type type;
	unsigned char reg;
	unsigned char idx;
} Stack_Value;

typedef struct {
	const char* src;
	const char* cur;
	const char* start;
	Tok tok;
	Stack_Value* stack_top;
	int op_count, num_count, r;
	unsigned char ops[max_ops];
	int numbers[(unsigned char)-1];
	Stack_Value stack[max_stack];
} Compiler;

int get_prec(Tok_Type type) {
	switch (type) {
		case tok_number:
			return 0;
		case tok_plus:
			return 1;
		case tok_star:
			return 2;
		default:
			return -1;
	}
}

int is_digit(char c) {
	return c >= '0' && c <= '9';
}

char peek(Compiler* c) {
	return *c->cur;
}

char advance(Compiler* c) {
	c->cur++;
	return c->cur[-1];
}

void skip_whitespace(Compiler* c) {
	while (1) {
		switch (peek(c)) {
			case ' ':
			case '\t':
			case '\n':
				advance(c);
				break;
			default:
				return;
		}
	}
}

Tok new_tok(Compiler* c, Tok_Type type) {
	Tok t;
	t.type = type;
	t.start = c->start - 1;
	t.len = (c->cur - c->start) + 1;
	return t;
}

Tok new_err_tok(Compiler* c, const char* msg) {
	Tok t;
	t.type = tok_error;
	t.start = msg;
	t.len = -1;
	return t;
}

Tok next_tok(Compiler* c) {
	char ch;
	skip_whitespace(c);
	ch = advance(c);
	c->start = c->cur;
	if (is_digit(ch)) {
		while (is_digit(peek(c))) {
			advance(c);
		}
		return new_tok(c, tok_number);
	}
	switch (ch) {
		case '+':
			return new_tok(c, tok_plus);
		case '*':
			return new_tok(c, tok_star);
		case 0:
			return new_tok(c, tok_end);
		default:
			return new_err_tok(c, "Unexpected character.");
	}
}

void emit_byte(Compiler* c, unsigned char b) {
	c->ops[c->op_count++] = b;
}

void emit_op(Compiler* c, Op op) {
	emit_byte(c, (unsigned char)op);
}

unsigned char emit_number(Compiler* c, int n) {
	c->numbers[c->num_count] = n;
	return (unsigned char)c->num_count++;
}

void push(Compiler* c, Stack_Value v) {
	*c->stack_top = v;
	c->stack_top++;
}

Stack_Value pop(Compiler* c) {
	c->stack_top--;
	return *c->stack_top;
}

void gen_val(Compiler* c, Stack_Value v, unsigned char r) {
	if (v.type == stack_type_constexpr) {
		printf(
			"mov $%d, %%%s\n",
			c->numbers[v.idx],
			regs[r]
		);
	}
	/* The result of the last sub-expression is already
	 * in the rax register, no need to do anything. */
}

void generate(Compiler* c) {
	Op op;
	int i;
	Stack_Value v, a, b;
	i = 0;
	printf(".globl eval\n");
	printf(".text\n");
	printf("eval:\n");
	while (1) {
		op = c->ops[i];
		switch (op) {
			case op_pushc:
				v.type = stack_type_constexpr;
				v.reg = c->ops[i + 1];
				v.idx = c->ops[i + 2];
				push(c, v);
				i += 2;
				break;
			case op_add:
				b = pop(c);
				a = pop(c);
				gen_val(c, a, 1);
				gen_val(c, b, 0);
				printf(
					"add %%%s, %%%s\n",
					regs[1],
					regs[0]
				);
				v.type = stack_type_expr;
				v.reg = 0;
				v.idx = -1;
				push(c, v);
				break;
			case op_mul:
				b = pop(c);
				a = pop(c);
				gen_val(c, a, 1);
				gen_val(c, b, 0);
				printf(
					"imul %%%s, %%%s\n",
					regs[1],
					regs[0]
				);
				v.type = stack_type_expr;
				v.reg = 0;
				v.idx = -1;
				push(c, v);
				break;
				break;
			case op_ret:
				printf("ret\n");
			default: return;
		}
		i++;
	}
}

const Tok* cadvance(Compiler* c) {
	c->tok = next_tok(c);
	return &c->tok;
}

void cnumber(Compiler* c) {
	int n;
	n = (int)strtol(c->tok.start, 0, 10);
	emit_op(c, op_pushc);
	emit_op(c, c->r);
	emit_op(c, emit_number(c, n));
	c->r = !c->r;
}

void bin_op(Compiler* c, Tok_Type t) {
	int p, np;
	const Tok* rhs, * nxt;
	Tok_Type nt;
	p = get_prec(t);
	rhs = cadvance(c);
	cnumber(c); /* rhs */
	nxt = cadvance(c);
	nt = nxt->type;
	np = get_prec(nt);
	if (nt == tok_star || nt == tok_plus) {
		if (p <= np) {
			bin_op(c, nt);
			emit_op(c, t);
		} else {
			emit_op(c, t);
			bin_op(c, nt);
		}
	} else {
		emit_op(c, t);
	}
}

void compile(const char* src) {
	const Tok* t;
	Compiler c;
	c.src = src;
	c.cur = src;
	c.op_count = 0;
	c.num_count = 0;
	c.r = 0;
	c.stack_top = c.stack;
	do {
		t = cadvance(&c);
		if (t->type == tok_error) {
			fprintf(stderr, "Error: %s\n", t->start);
			return;
		}
		switch (t->type) {
			case tok_number:
				cnumber(&c);
				break;
			case tok_plus:
			case tok_star:
				bin_op(&c, t->type);
				break;
			default: break;
		}
	} while (t->type != tok_end);

	emit_op(&c, op_ret);
	generate(&c);
}

int main() {
	const char* expr = "6 + 2 + 5 * 10";
	compile(expr);
	return 0;
}