First working assembler! - nrj-oisc - NOR and Reference Jump OISC platform

commit d712730bb499a777f41635b72e82953264c37cd7
parent f560cc5cbdcc96a162049e546be89d90cec5264d
Author: Luxferre <lux@ferre>
Date:   Fri,  2 Sep 2022 19:57:59 +0300

First working assembler!

Diffstat:
A example.nrjasm  | 40 ++++++++++++++++++++++++++++++++++++++++
M nrj.c  | 4 ++--
A nrjasm.py  | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A stdlib.nrjasm  | 21 +++++++++++++++++++++

4 files changed, 281 insertions(+), 2 deletions(-)
diff --git a/example.nrjasm b/example.nrjasm
@@ -0,0 +1,40 @@
+; a proposed assembly syntax for NRJ machines (example for NRJ16)
+; semicolons are comments
+; preprocessor instructions start with dot (.)
+; every non-preprocessor instruction creates an entry in the lookup table
+; all addressing is in words
+; we usually start at the word 3 (don't pre-fill the I/O buffers)
+
+.bit 16 ; word/address size: NRJ16 is the default setting
+.org 3 ; .org defines the start of further code/data (in words, hex)
+
+; include the standard library
+
+.inc stdlib.nrjasm
+
+.var x 12EF ; .var defines a label for a particular memory location
+.var y 12F0 ; define another variable at 0x12F0
+.set @x 33EE ; .set sets a memory location to a particular hex constant at the build time, @ dereferences a label into the address
+.set @y 'M ; ' dereferences a character into a whole word with its ASCII code
+
+; we CANNOT use dereferencing operators with .var, only with .set or directly
+
+; there can also be an .inc instruction to include a snippet from another file in the same directory
+
+; now, main elementary macros:
+; NXT - address of the next instruction position in the lookup table
+; HLT - the last address position in the lookup table (0xFFFF for NRJ16), set by .bit
+; FREE - address of the next available (at build time) memory cell, can only be used in .var
+
+; for the lookup table and CUR/NXT macros to work correctly, the code must start at an address divisible by 3
+; note that FREE doesn't intelligently detect the available cells, it only takes the next one after the maximum address used
+; so in our case, the first FREE instance will be substituted with 12F1, the next with 12F2 and so on
+
+
+; now, let's output a character by transferring the y value to the output cell 1
+; in an endless loop
+
+.lbl myloop
+MOV 1 @y @myloop ; output the character and jump to the beginning
+; we don't have to explicitly zero out the cell 1 as it is done by the I/O logic
+
diff --git a/nrj.c b/nrj.c
@@ -41,7 +41,7 @@ void nrj_run(NRJWORD *mem, NRJWORD pc) { /* Main NRJ engine - just 12 lines of C
     mem[mem[pc]] = (~(mem[mem[pc]] | mem[mem[pc+1]])) & MAXADDR; /* then perform the NOR operation */
     pc = mem[mem[pc+2]]; /* then perform the reference jump operation */
     if(mem[1]) { /* then handle output if word 1 is set */
-      nrj_out(&mem[2], &mem[mem[1]]); /* output the value from the location specified in word 1 */
+      nrj_out(&mem[2], &mem[1]); /* output the value from the word 1 */
       mem[1] = (NRJWORD) 0; /* clear word 1 */
     }
   }
@@ -58,7 +58,7 @@ int main(int argc, char* argv[]) { /* emulator entry point: nrj program.bin */
       fseek(prog, 0, SEEK_END);
       int flen = ftell(prog);
       fseek(prog, 0, SEEK_SET);
-      fread(mem, sizeof(NRJWORD), (flen/sizeof(NRJWORD)) & MAXADDR, prog);
+      fread(mem, sizeof(NRJWORD), flen/sizeof(NRJWORD), prog);
       fclose(prog);
       tcgetattr(0, &tty_opts_backup);
       atexit(&restore_term);
diff --git a/nrjasm.py b/nrjasm.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+# The reference assembler for NRJ OISC (tested for NRJ16)
+# By Luxferre, 2022, public domain
+
+import sys
+import array
+from os.path import realpath
+
+# some constants to redefine more easily in case of major breaking changes
+
+NRJDEF_BITS = 16 # default word/addr size if the .bit directive is omitted
+
+# service characters
+NRJCHAR_COMMENT = ';' # all comments are after ;
+NRJCHAR_VARDEREF = '@' # variable dereferencing is with @
+NRJCHAR_CHARDEREF = "'" # character dereferencing is with '
+NRJCHAR_SUBST = '%' # var-in-macro substitution is with %
+
+# macro variables
+NRJVAR1 = NRJCHAR_SUBST + 'A' # %A
+NRJVAR2 = NRJCHAR_SUBST + 'B' # %B
+NRJVAR3 = NRJCHAR_SUBST + 'C' # %C
+
+# and preprocessor directives
+NRJDIR_INC = '.inc'
+NRJDIR_BITS = '.bit'
+NRJDIR_ORG = '.org'
+NRJDIR_DEF = '.def'
+NRJDIR_END = '.end'
+NRJDIR_VAR = '.var'
+NRJDIR_SET = '.set'
+NRJDIR_FREE = 'FREE'
+NRJDIR_NXT = 'NXT'
+NRJDIR_HLT = 'HLT'
+
+included_files = [] # stash to check the already included files
+
+def readsrc(fname): # read source file contents, stripping comments, empty lines and trailing/leading whitespace
+    f = open(fname, 'r')
+    rawlines = f.readlines()
+    f.close()
+    lines = []
+    global included_files
+    included_files.append(realpath(fname))
+    for line in rawlines:
+        line = line.split(NRJCHAR_COMMENT)[0].strip()
+        if len(line) > 0:
+            tokens = line.split() # split on any whitespace, which is what we need
+            if tokens[0] == NRJDIR_INC: # process include directive immediately
+                incfname = realpath(' '.join(tokens[1:])) # because the name may include spaces
+                if incfname not in included_files: # cyclic inclusion protection
+                    incfile = readsrc(incfname) # call itself recursively, trying to include a file
+                    included_files.append(incfname) # update the list of included files
+                    lines.extend(incfile) # update the source with the included contents in place
+                else:
+                    print('Attempt to include an already included file %s, ignoring!' % incfname)
+            else: # otherwise just append the tokenized source line
+                lines.append(tokens)
+    return lines
+
+def start_assembly(srcfname, dstfname): # main assembly method
+    wordsize = NRJDEF_BITS # define machine word/address size
+
+    # we're starting with tokenized Stage 1 source: all includes processed, comments and whitespace stripped
+    stage1src = readsrc(srcfname)
+
+    # Stage 2: scan the source for the first word size set directive
+    for line in stage1src:
+        if line[0] == NRJDIR_BITS:
+            wordsize = int(line[1])
+            break
+    print('Building for NRJ%u' % wordsize)
+
+    # Stage 3: expand all macros
+    stage3src = []
+    macrobuffers = {}
+    macrostart = False
+    macroname = None
+    for line in stage1src:
+        if macrostart: # we already are buffering a macro
+            if line[0] == NRJDIR_END: # macro ended and saved in the buffers
+                macrostart = False
+                macroname = None
+            else: # continue buffering
+                macrobuffers[macroname].append(line)
+        else: # usual code
+            if line[0] == NRJDIR_DEF: # starting a macro
+                macroname = line[1]
+                macrobuffers[macroname] = [] # prepare the place to buffer the macro into
+                macrostart = True
+            elif line[0] != NRJDIR_BITS: # ignoring word size directive as we already processed it
+                if line[0] in macrobuffers: # detected an already compiled macro, substituting the code and parameters
+                    p1 = NRJDIR_HLT # placeholders for missing parameters
+                    p2 = NRJDIR_HLT
+                    p3 = NRJDIR_NXT # assume we're referring to the next address in p3
+                    if len(line) > 1: # fill the first parameter if present
+                        p1 = line[1]
+                    if len(line) > 2: # fill the second parameter if present
+                        p2 = line[2]
+                    if len(line) > 3: # fill the third parameter if present
+                        p3 = line[3]
+                    for macroline in macrobuffers[line[0]]: # now, perform the macrosubstitution with parameter replacement
+                        stage3src.append(' '.join(macroline).replace(NRJVAR1, p1).replace(NRJVAR2, p2).replace(NRJVAR3, p3).split())
+                else: # append a normal line
+                    stage3src.append(line)
+
+    # Stage 4: now, process .var directive, FREE directive, @ and ' dereferencing operators
+    vartable = {} # don't store numeric locations here yet, only string representations (hex or FREE)
+    stage4src = []
+    for line in stage3src:
+        if line[0] == NRJDIR_VAR: # .var directive: no @ or ' operators allowed here
+            if line[2] == NRJDIR_FREE:
+                vartable[line[1]] = 0
+            else:
+                vartable[line[1]] = int(line[2], 16)
+    # now, fill in the FREE bits
+    maxvar = 0
+    for vname in vartable:
+        if vartable[vname] > maxvar:
+            maxvar = vartable[vname]
+    for vname in vartable:
+        if vartable[vname] == 0:
+            maxvar += 1
+            vartable[vname] = maxvar
+    for line in stage3src:
+        if line[0] != NRJDIR_VAR: # finally, perform variable substitution
+            # but first, attempt to perform character substitution
+            for i, el in enumerate(line):
+                if el.startswith(NRJCHAR_CHARDEREF):
+                    line[i] = hex(ord(el[1]))[2:].upper()
+            sline = ' '.join(line)
+            for vname in vartable:
+                sline = sline.replace(NRJCHAR_VARDEREF+vname, hex(vartable[vname])[2:].upper())
+            stage4src.append(sline.split())
+
+    # now, our Stage 4 code is fully flat and we can start allocating memory for it
+    # directives left to process at this point: .org, .set, NXT, HLT
+    # (we cannot process .set before because it can also take value of NXT or HLT)
+
+    memsize = 1 << wordsize
+    haltaddr = memsize - 1 # halting address to be filled in the lookup table
+    print('Allocating %u %u-bit words of memory...' % (memsize, wordsize))
+    memmod = 'H'
+    if wordsize >= 32:
+        memmod = 'L'
+    elif wordsize >= 64:
+        memmod = 'Q'
+    elif wordsize <= 8:
+        memmod = 'B'
+    targetmem = array.array(memmod, [0]*memsize)
+
+    # here is the trickiest part of the whole assembly process - building a lookup table
+    # as NRJ can't directly jump to the next instruction by itself, we need to tell it to
+    # the NXT macro will be replaced with a cell in the lookup table that points to the next instruction
+    # and the lookup table will also take some memory in the machine
+
+    ltoffset = memsize >> 1 # in the worst case scenario, the code will take half of all memory and lookup table will take the other half
+    targetmem[ltoffset] = haltaddr # the first lookup table entry is always the halting address
+    codepos = 0
+    ltpos = 1 
+    # let's iterate over the code 
+    # pass 1
+    for line in stage4src:
+        if line[0] == NRJDIR_ORG: # handle .org
+            codepos = int(line[1], 16)
+        elif line[0] == NRJDIR_SET: # handle .set
+            addr = int(line[1], 16)
+            val = line[2]
+            if val == 'HLT':
+                targetmem[addr] = ltoffset
+            elif val != 'NXT':
+                targetmem[addr] = int(val, 16)
+        else: # 3-value vector where HLT or NXT can be encountered
+            # save current instruction in the lookup table
+            targetmem[ltoffset + ltpos] = codepos
+            ltpos += 1
+            for v in line:
+                if v == 'HLT':
+                    targetmem[codepos] = ltoffset
+                elif v == 'NXT':
+                    targetmem[codepos] = ltoffset + ltpos
+                else:
+                    targetmem[codepos] = int(v, 16)
+                codepos += 1
+    # pass 2 - fill in NXT
+    codepos = 0
+    ltpos = 1
+    for line in stage4src:
+        if line[0] == NRJDIR_ORG: # handle .org
+            codepos = int(line[1], 16)
+        elif line[0] == NRJDIR_SET: # handle .set
+            addr = int(line[1], 16)
+            val = line[2]
+            if val == 'NXT':
+                val = targetmem[ltoffset + ltpos]
+                targetmem[addr] = val
+        else: # 3-value vector where HLT or NXT can be encountered
+            ltpos += 1
+            for v in line:
+                codepos += 1
+
+    # now, we have assembled our target memory snapshot, let's write the output file
+
+    outf = open(dstfname, "wb")
+    targetmem.tofile(outf)
+    outf.close()
+    print('Assembled %s' % dstfname)
+
+
+if __name__ == '__main__': # nrjasm entry point
+    version = '0.0.1'
+    print('nrjasm v%s by Luxferre, 2022' % version)
+    if len(sys.argv) > 2:
+        print('Assembling %s into %s...' % (sys.argv[1], sys.argv[2]))
+        start_assembly(sys.argv[1], sys.argv[2])
+    else:
+        print('Usage: nrjasm.py [source] [binary]')
+
diff --git a/stdlib.nrjasm b/stdlib.nrjasm
@@ -0,0 +1,21 @@
+; nrjasm standard library starts here
+
+; custom macros always take 3 values, usually cell addresses (referred to as %A, %B and %C) and defined between .def and .end (no nesting allowed)
+; custom macros are always expanded before the elementary macros
+; if %C is not passed, it is replaced with NXT
+; if %B and/or %A is not passed, it is replaced with HLT
+
+.def .lbl ; define labels
+  .var %A FREE ; allocate a variable with the name in %A, then set it to the next instruction address:
+  .set @%A NXT ; %A is directly substituted as text, so we can use it after the dereferencing operator
+.end
+
+; define a reusable buffer variable for our following macros
+.var setbuf FREE ; we don't care which address it will actually be
+
+.def MOV ; transfer one cell to another, usage: MOV dst src
+  @setbuf HLT NXT ; first, zero out the setbuf variable by performing NOR with 0xFFFF
+  %A HLT NXT ; then, zero out the destination cell by performing NOR with 0xFFFF
+  @setbuf %B NXT ; then, set setbuf variable to the inverted source value
+  %A @setbuf %C ; finally, set destination cell to the inverted setbuf value ( = source value)
+.end

	nrj-oisc NOR and Reference Jump OISC platform
	git clone git://git.luxferre.top/nrj-oisc.git
	Log \| Files \| Refs \| README

A	example.nrjasm	\|	40	++++++++++++++++++++++++++++++++++++++++
M	nrj.c	\|	4	++--
A	nrjasm.py	\|	218	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	stdlib.nrjasm	\|	21	+++++++++++++++++++++