'
' This is an implementation of the Lempel Ziv Data Compression (LZ77).
'
' http://en.wikipedia.org/wiki/LZ77_and_LZ78
'
' This type of encoding works good with reucrring patterns, for example with TXT files.
' It does not work good with files containing random binary data.
'
' (c) PvE - Oct 2012. Needs BaCon build 28 or higher.
'--------------------------------------------------------------------------------------------

' Get the separate arguments
OPTION COLLAPSE TRUE
SPLIT ARGUMENT$ BY " " TO arg$ SIZE amount

' Show usage
IF amount < 2 THEN
    PRINT "Usage: ", arg$[0], " <file>"
    END
ENDIF

' Check if the file exists
IF NOT(FILEEXISTS(arg$[1])) THEN
    PRINT "File ", arg$[1], " not found!"
    END 1
END IF

' Get filesize
FileSize = FILELEN(arg$[1])

' Reserve some memory to start with
FileData = MEMORY(FileSize)

' Get the file into memory
OPEN arg$[1] FOR READING AS myfile
    GETBYTE FileData FROM myfile SIZE FileSize
CLOSE FILE myfile

' Compensation for file length.
offset = 0
IF ODD(FileSize) THEN offset = 1

' Count the 2-byte sequences in the file. Each sequence is an index to an array.
DECLARE seqs[65536]
FOR x = 0 TO FileSize-1-offset STEP 2
    INCR seqs[((int)PEEK(FileData+x)<<8)+(int)PEEK(FileData+x+1)]
NEXT

' Create the index table with 255 indexes. Most occuring sequences first.
DECLARE idx_table[255]

FOR y = 0 TO 254
    highest = 0
    highest_x = 0
    FOR x = 0 TO 65535
        IF seqs[x] > highest AND seqs[x] > 0 THEN
            highest = seqs[x]
            highest_x = x
        END IF
    NEXT
    idx_table[y] = highest_x
    seqs[highest_x] = 0
NEXT

' Create another table which looks up position in the index table based on sequence.
DECLARE idx_idx[256*256]

FOR x = 0 TO 65535
    idx_idx[x] = -1
NEXT
FOR x = 0 TO 254
    idx_idx[idx_table[x]] = x
NEXT

' Reserve memory for compressed file
FileDest = MEMORY(FileSize*2+254*2)

' Store index table
FOR x = 0 TO 254
    POKE FileDest+x*2, idx_table[x]>>8
    POKE FileDest+x*2+1, idx_table[x]&255
NEXT

'----------------------------------------------------------------------------

SUB Store_Raw_Data

    ' The 255 value indicates the start of a series of unknown sequences.
    POKE FileDest + pos, 255

    ' How many bytes of unknown sequences to expect.
    POKE FileDest + pos + 1, amount
    INCR pos, 2

    ' Copy the unknown sequences to the target
    FOR y = raw TO x
        POKE FileDest + pos, PEEK(FileData+y)
        INCR pos
    NEXT

    ' Reset
    amount = 0
    raw = -1

END SUB

'----------------------------------------------------------------------------

' Initialize
pos = 254*2+2
amount = 0
raw = -1

' Now loop through memory
FOR x = 0 TO FileSize-1-offset STEP 2

    current = idx_idx[(int)PEEK(FileData+x)*256+(int)PEEK(FileData+x+1)]

    ' Sequence appears to exist.
    IF current >= 0 THEN
        ' First write unknown sequences if there are any.
        IF raw > -1 THEN Store_Raw_Data()

        ' Now replace known sequence for offset in the indextable.
        POKE FileDest + pos, current
        INCR pos

    ' Sequence does not exist.
    ELSE

        ' Are we counting unknown sequences already?
        IF raw = -1 THEN raw = x
        INCR amount, 2

        ' If amount reaches 254
        IF amount = 254 THEN Store_Raw_Data()

    END IF
NEXT

' See if we still need to write RAW sequences
IF offset THEN INCR amount

IF raw > -1 THEN
    DECR x
    Store_Raw_Data()
ELIF offset THEN
    raw = x
    Store_Raw_Data()
ENDIF

' Show some statistics
PRINT "Compression ratio: ", 100-pos*100/FileSize, "%. ";

' Determine new name
newname$ = CONCAT$(arg$[1], ".lz77")

' Save compressed data from memory to file
OPEN newname$ FOR WRITING AS myfile
    PUTBYTE FileDest TO myfile SIZE pos
CLOSE FILE myfile

PRINT "File '", arg$[1], "' compressed with LZ77 and saved as '", newname$, "'."

' Free claimed memory
FREE FileData
FREE FileDest

END