'------------------------------------------------------------------
'
' This is an implementation of a self-designed compression algorithm.
'
' The code tries to be as simple and small as possible.
'
' (c) Peter van Eerten 2018, MIT license
'
'------------------------------------------------------------------
'
' The idea is that the compression will use bit patterns based on occurrences of data:
'
' xx11 -> 4 bytes
' xxx10 -> 8 bytes
' xxxx01 -> 16 bytes
' xxxxxxxx00 -> remaining 228 bytes
'
' The compressed data is identified by the 2 bits on the right. Based on the pattern, the
' decompression will determine the number of bits which describe the byte. So, if a bit
' pattern is '11', then the next 2 bits describe an index to the actual byte (index being
' 00, 01, 10, 11). If a bit pattern is '10' then there are 3 bits describing the byte, and
' with bit pattern '01' there are 4 bits describing the byte. This way, it is possible to
' store 28 bytes in a compressed manner.
'
' The remaining 228 bytes will be stored as-they-are plus 2 identifying bits '00' for each
' byte. This indeed enlarges the result somewhat, though effectively, the total compression
' ratio can still be pretty good depending on the type of data. If the data contains a lot
' of similar bytes, like BMP or ASCII files, the algorithm can reach a ratio up to 40%.
'
' Of course, compressed files need a conversion table to restore the actual bytes. The
' conversion table is put into the compressed file by the compression routine. This also
' means that a resulting file will always be larger than 28 bytes. So compressing very small
' files of 10 bytes will actually result in a larger file (negative compression).
'
'----------------------------------------------------------------------

' Optimize for speed
PRAGMA OPTIONS -O3

' The archive name is the first argument following the program name
file$ = TOKEN$(ARGUMENT$, 2)

' Show the usage hint and stop with a failure status unless a name
' ending in .baz (in any letter case) was supplied
IF AMOUNT(ARGUMENT$) < 2 OR LCASE$(RIGHT$(file$, 4)) <> ".baz" THEN
	PRINT "Usage: ", ME$, " <file.baz>"
	END 1
ENDIF

' Using array references is faster than PEEK
DECLARE data TYPE uint8_t*

' Stop with a clear message and a failure status when the archive is missing
IF NOT(FILEEXISTS(file$)) THEN
	PRINT "File not found: ", file$
	END 1
ENDIF

' The size is needed before the buffer is inspected
length = FILELEN(file$)

' A valid archive holds at least the 3 byte header followed by the 28 byte
' conversion table; anything shorter would be read beyond the loaded data
IF length < 31 THEN
	PRINT "This is not a BaCon Zipped file! Exiting..."
	END 1
ENDIF

' Load the data
data = BLOAD(file$)

' Verify file header
IF PEEK(data) <> ASC("B") OR PEEK(data+1) <> ASC("A") OR PEEK(data+2) <> ASC("Z") THEN
	PRINT "This is not a BaCon Zipped file! Exiting..."
	' Signal the failure to the caller, like the usage errors do
	END 1
ENDIF

' The parser always fetches data[idx], data[idx+1] and data[idx+2], so near
' the end of the file it reads up to 2 bytes past the last one. Enlarge the
' buffer by SIZEOF(int) bytes to keep those reads inside the allocation.
RESIZE data TO length+SIZEOF(int)

' Lookup table mapping a code index to the byte it stands for.
' Only the first 28 entries are filled; the rest of the 256 stay 0.
DECLARE new[256] = { 0 }

' The table starts right behind the 3 header bytes BAZ
idx = 3

' Copy the 28 table bytes, advancing the read position as we go,
' so idx ends up pointing at the first byte of the compressed data
x = 0
WHILE x < 28
	new[x] = PEEK(data+idx)
	INCR idx
	INCR x
WEND

' Output buffer. The shortest code takes 4 bits, so one input byte
' produces at most 2 output bytes.
result = MEMORY(length*2)

' Bit offset of the next code inside data[idx]
bits = 0

' Window holding the bits currently being decoded
DECLARE buf TYPE uint32_t

' Decode one byte per iteration until the input is used up.
' NOTE(review): pos is not assigned before this loop in the visible code;
' it is relied upon to start at 0.
WHILE idx < length

	' Combine 3 bytes, lowest byte first, and drop the bits already consumed
	buf = ((data[idx] | (data[idx+1]<<8) | (data[idx+2]<<16)) >> bits)

	' The lowest 2 bits identify the code type, the bits above hold the payload
	tag = (buf & 3)
	payload = (buf >> 2)

	IF tag = 3 THEN
		' '11': 2 bit index into table entries 0..3
		value = new[(payload & 3)]
		used = 4
	ELIF tag = 2 THEN
		' '10': 3 bit index into table entries 4..11
		value = new[(payload & 7)+4]
		used = 5
	ELIF tag = 1 THEN
		' '01': 4 bit index into table entries 12..27
		value = new[(payload & 15)+12]
		used = 6
	ELSE
		' '00': the byte itself follows as 8 literal bits
		value = (payload & 255)
		used = 10
	ENDIF

	POKE result+pos, value
	INCR bits, used

	' Move on to the next input byte for every 8 bits consumed
	WHILE bits >= 8
		INCR idx
		DECR bits, 8
	WEND

	INCR pos
WEND

' Name of the output file: BASENAME$ of the archive with .original appended
target$ = BASENAME$(file$, 1) & ".original"

' Write the decoded bytes, leaving out the last decoded one.
' NOTE(review): presumably the final symbol stems from padding or a
' terminator written by the compressor - confirm against the compressor.
BSAVE result TO target$ SIZE pos-1

' Release both buffers
FREE data
FREE result

PRINT "Done in ", TIMER, " msecs."