awk-gold-collection

My best software created for POSIX AWK
git clone git://git.luxferre.top/awk-gold-collection.git
Log | Files | Refs | Submodules | README

tgl.awk (7889B)


      1 # The Great Library of useful AWK functions
      2 # Fully POSIX-compatible but sometimes depends on other POSIX commands
      3 # Use with your programs like this:
      4 # LANG=C awk -f tgl.awk -f your_prog.awk [args]
      5 #
      6 # Current functionality:
      7 # * single character input: setterm, getchar
      8 # * ASCII and UTF-8 codepoint conversion: ord, wctomb, mbtowc
      9 # * loading binary files as decimal integers into arrays: loadbin
     10 # * saving binary files from arrays with decimal integers: savebin
     11 # * tangent and cotangent functions: tan, cotan
     12 # * signum, floor and ceiling functions: sign, floor, ceil
     13 # * test for native bitwise operation support: bw_native_support
     14 # * reimplementation of most bitwise operations (unsigned 32-bit):
     15 # - NOT: bw_compl
     16 # - AND: bw_and
     17 # - OR: bw_or
     18 # - XOR: bw_xor
     19 # - NAND: bw_nand
     20 # - NOR: bw_nor
     21 # - >>: bw_rshift
     22 # - <<: bw_lshift
     23 #
     24 # Created by Luxferre in 2023, released into public domain
     25 
     # Switch the terminal input mode through stty, or put it back.
     # usage: setterm(mode) => integer
     #   mode 1 - single-character input, echo left on        (-icanon)
     #   mode 2 - single-character input, echo off            (-icanon -echo)
     #   mode 3 - like mode 2 but reads return immediately    (time 0 min 0)
     #   any other value (e.g. 0) - re-apply the saved original settings
     # The original settings are captured with "stty -g" on the first call
     # and kept in the global TGL_TERMMODE.
     # If a plain "stty" call fails (as it does when stdin is not a terminal,
     # e.g. in a pipe) nothing is changed and 0 is returned; otherwise the
     # exit status of the final stty invocation is returned.
     function setterm(mode, cmd) {
       # a non-zero status from a bare stty means there is no tty to configure
       if(system("stty >/dev/null 2>&1") != 0) return 0
       if(!TGL_TERMMODE) { # first call: remember how the terminal was set up
         cmd = "stty -g"
         cmd | getline TGL_TERMMODE
         close(cmd)
       }
       cmd = TGL_TERMMODE # default action: restore the saved settings
       if(mode == 3) cmd = "-icanon time 0 min 0 -echo"
       else if(mode == 2) cmd = "-icanon -echo"
       else if(mode == 1) cmd = "-icanon"
       cmd = "stty " cmd ">/dev/null 2>&1"
       return system(cmd) # hand back stty's own exit status
     }
     47 
     # Read a single byte through od and return its value.
     # usage: getchar() => integer
     # The od command line is stored once in the global TGL_GCH_CMD and
     # reused on later calls; the command is closed after every read so that
     # each call starts a fresh od process.
     # Because od prints the byte as a decimal number, a null byte can be
     # read as well. When od produces no output the result is 0.
     # Combine with setterm to pick the input mode.
     # Setting LANG=C is recommended; for GAWK it is required.
     function getchar(c) {
       if(!TGL_GCH_CMD) # not initialised yet
         TGL_GCH_CMD = "od -tu1 -w1 -N1 -An -v"
       TGL_GCH_CMD | getline c
       close(TGL_GCH_CMD)
       c = int(c) # od pads the number with blanks; an unset c becomes 0
       return c
     }
     60 
     # Return the byte value of a one-character string.
     # usage: ord(c) => integer
     # A lookup table (global TGL_ORD, character => code for codes 0..255)
     # is filled on the first call. Strings that are not in the table
     # yield 0.
     # Setting LANG=C is recommended; for GAWK it is required.
     function ord(c, b) {
       if(!TGL_ORD["#"]) { # "#" maps to 35, so a false value means "no table yet"
         b = 0
         while(b < 256) {
           TGL_ORD[sprintf("%c", b)] = b
           b++
         }
       }
       return int(TGL_ORD[c])
     }
     69 
     # Encode one Unicode codepoint as a UTF-8 byte sequence held in a string.
     # usage: wctomb(code) => string
     # code is truncated to an integer first. Codepoints outside the range
     # 0 .. 1114111 (U+0000 .. U+10FFFF) produce an empty string.
     # Every byte of a multibyte sequence has its high bit set, so the result
     # contains no null byte for any codepoint above 0.
     # No further validation is done: in particular the surrogate range
     # (55296 .. 57343) is encoded like any other 3-byte codepoint.
     # Setting LANG=C is recommended; for GAWK it is required.
     function wctomb(code, s) {
       code = int(code)
       if(code < 0 || code > 1114111) s = "" # outside U+0000..U+10FFFF
       else if(code < 128) s = sprintf("%c", code) # single byte
       else if(code < 2048) # 2-byte sequence: 110xxxxx 10xxxxxx
         s = sprintf("%c%c", \
           192 + (int(code/64) % 32), \
           128 + (code % 64))
       else if(code < 65536) # 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
         s = sprintf("%c%c%c", \
           224 + (int(code/4096) % 16), \
           128 + (int(code/64) % 64), \
           128 + (code % 64))
       else # 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         s = sprintf("%c%c%c%c", \
           240 + (int(code/262144) % 8), \
           128 + (int(code/4096) % 64), \
           128 + (int(code/64) % 64), \
           128 + (code % 64))
       return s
     }
     97 
     # Decode the first UTF-8 sequence of a byte string into its codepoint.
     # usage: mbtowc(s) => integer
     # The first byte decides how many trailer bytes (10xxxxxx) are expected:
     #   0..127    - the byte itself is the result
     #   192..223  - 1 trailer
     #   224..239  - 2 trailers
     #   240..255  - 3 trailers
     # A trailer byte in the first position, or an empty string, gives 0.
     # Decoding stops at the first byte that is not a trailer, at the end of
     # the string, or once the expected number of trailers has been read;
     # the value accumulated from the bytes accepted so far is returned.
     # The accumulator is shifted only when a trailer is actually accepted,
     # so stopping early never scales the result.
     # Overlong forms and surrogate codepoints are not rejected.
     # Relies on ord(); setting LANG=C is recommended, for GAWK it is required.
     function mbtowc(s, len, code, b, pos, need) {
       len = length(s)
       if(len < 1) return 0
       b = ord(substr(s, 1, 1))
       if(b < 128) return b # plain single byte
       if(b < 192) return 0 # a trailer byte cannot start a sequence
       if(b < 224) {code = b % 32; need = 1} # header of a 2-byte sequence
       else if(b < 240) {code = b % 16; need = 2} # header of a 3-byte sequence
       else {code = b % 8; need = 3} # header of a 4-byte sequence
       for(pos = 2; pos <= len && need > 0; pos++) {
         b = ord(substr(s, pos, 1))
         if(b < 128 || b >= 192) break # not a trailer: stop here
         code = code * 64 + b % 64 # shift 6 bits left, append the payload
         need--
       }
       return code
     }
    124 
    # Load a binary file into an AWK array (0-indexed) by running od.
    # usage: loadbin(fname, arr, len, wordsize) => integer
    #   fname    - file to read; it is single-quoted for the shell, so names
    #              containing spaces, quotes or $ are passed to od verbatim
    #   arr      - array that receives one decimal integer per word
    #   len      - optional number of bytes to read (0 or unset: everything)
    #   wordsize - optional word size in bytes handed to od -tu (default 1)
    # Returns the number of elements stored in arr.
    # Each od output line is read into a local variable, so the caller's
    # current record ($0, NF and the fields) is left untouched.
    # Multibyte words are decoded by od itself; the original notes call them
    # little-endian - NOTE(review): od uses the host byte order, confirm on
    # big-endian machines.
    function loadbin(fname, arr, len, wordsize, cmd, i, line) {
      wordsize = int(wordsize)
      if(wordsize < 1) wordsize = 1
      len = int(len)
      # close the quote, emit an escaped quote, reopen: ' becomes '\''
      gsub(/'/, "'\\''", fname)
      cmd = "od -tu" wordsize " -An -w" wordsize
      if(len > 0) cmd = cmd " -N" len
      cmd = cmd " -v '" fname "'"
      # every line should be a single decimal integer (with some whitespace)
      i = 0
      while((cmd | getline line) > 0) # read the next line from the stream
        if(line ~ /[^ \t]/) arr[i++] = int(line) # skip blank lines
      close(cmd) # close the od process
      return i
    }
    145 
    # Save an AWK array (0-indexed) into a binary file.
    # usage: savebin(fname, arr, len, wordsize) => integer
    #   fname    - output file, truncated before writing
    #   arr      - array of non-negative integers, indices 0 .. len-1
    #   len      - number of elements to write
    #   wordsize - optional bytes per element (default 1); multibyte words
    #              are written least significant byte first (little-endian)
    # Returns the number of elements written.
    # Every element is converted to a number before it reaches %c, so a
    # string-typed element such as "72" is written as byte 72 and not as its
    # first character; all word sizes go through the same byte loop.
    # Setting LANG=C is recommended; for GAWK it is required.
    function savebin(fname, arr, len, wordsize, i, j, v) {
      wordsize = int(wordsize)
      if(wordsize < 1) wordsize = 1
      printf("") > fname # truncate the file and open the stream
      for(i=0;i<len;i++) {
        v = int(arr[i]) # force a numeric value
        for(j=0;j<wordsize;j++) {
          printf("%c", v % 256) >> fname # lowest remaining byte first
          v = int(v / 256)
        }
      }
      close(fname) # close the output file
      return i
    }
    165 
    166 # the missing tangent/cotangent functions
    167 
    # tangent of x (radians), computed as sine over cosine
    function tan(x) {
      return sin(x) / cos(x)
    }
    # cotangent of x (radians), computed as cosine over sine
    function cotan(x) {
      return cos(x) / sin(x)
    }
    170 
    171 # the missing sign/floor/ceil functions
    172 
    # signum: -1 for values below zero, otherwise the truth value of x
    # (0 for zero, 1 for anything else)
    function sign(x) {
      if(x < 0) return -1
      return x ? 1 : 0
    }
    # largest integer not greater than x
    # int() truncates toward zero, so non-integral negatives need one more
    # step down; integral values are returned unchanged
    function floor(x, whole) {
      whole = int(x)
      if(whole == x) return x
      if(x >= 0) return whole
      return whole - 1
    }
    # smallest integer not less than x
    # int() truncates toward zero, so non-integral positives need one more
    # step up; integral values are returned unchanged
    function ceil(x, whole) {
      whole = int(x)
      if(whole == x) return x
      if(x >= 0) return whole + 1
      return whole
    }
    184 
    185 # Bitwise operations section
    186 
    # Report whether the AWK engine ships the non-POSIX bitwise functions
    # (and, or, xor, compl, lshift, rshift) natively.
    # usage: bw_native_support() => 1 if available, 0 otherwise
    # The probe is the expression "compl (1)": where compl is not a built-in
    # function it is just an unset variable concatenated with 1, which gives
    # "1" and compares equal to 1; a real complement of 1 does not.
    function bw_native_support() {
      if(compl (1) != 1) return 1
      return 0
    }
    192 
    193 # now, the implementation of the operations themselves
    194 # note that all complements are 32-bit and all operands must be non-negative
    195 
    # 32-bit bitwise NOT: the integer part of a subtracted from 2^32 - 1
    function bw_compl(a) {
      a = int(a)
      return 4294967295 - a
    }
    # a << b: double a once per shift position
    # (the result is not truncated to 32 bits)
    function bw_lshift(a, b) {
      for(;b>0;b--) a *= 2
      return int(a)
    }
    # a >> b: halve a once per shift position, dropping the lowest bit each time
    function bw_rshift(a, b) {
      for(;b>0;b--) a = int(a/2)
      return a
    }
    # bitwise AND of two non-negative integers
    # walks both operands from the lowest bit upwards; a result bit is set
    # only where both operands have an odd remainder
    function bw_and(a, b, bit, acc) {
      acc = 0
      for(bit = 1; a > 0 || b > 0; bit *= 2) {
        if((a % 2) == 1 && (b % 2) == 1) acc += bit
        a = int(a / 2)
        b = int(b / 2)
      }
      return int(acc)
    }
    # bitwise OR of two non-negative integers
    # walks both operands from the lowest bit upwards; a result bit is set
    # where at least one operand has an odd remainder
    function bw_or(a, b, bit, acc) {
      acc = 0
      for(bit = 1; a > 0 || b > 0; bit *= 2) {
        if((a % 2) == 1 || (b % 2) == 1) acc += bit
        a = int(a / 2)
        b = int(b / 2)
      }
      return int(acc)
    }
    # bitwise XOR of two non-negative integers
    # walks both operands from the lowest bit upwards; a result bit is set
    # where the two remainders differ
    function bw_xor(a, b, bit, acc) {
      acc = 0
      for(bit = 1; a > 0 || b > 0; bit *= 2) {
        if((a % 2) != (b % 2)) acc += bit
        a = int(a / 2)
        b = int(b / 2)
      }
      return int(acc)
    }
    # bitwise NAND: the 32-bit complement of bw_and(a, b)
    function bw_nand(a, b) {
      a = bw_and(a, b)
      return bw_compl(a)
    }
    # bitwise NOR: the 32-bit complement of bw_or(a, b)
    function bw_nor(a, b) {
      a = bw_or(a, b)
      return bw_compl(a)
    }
    231