Initial
This commit is contained in:
commit
c361dd43da
|
|
@ -0,0 +1,16 @@
|
||||||
|
# lz77
|
||||||
|
|
||||||
|
lz77 compressor/decompressor using a 32KB sliding window which includes an allocation free window comparison helper and a prefix-indexing finder that prunes out-of-window positions
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
- dune build
|
||||||
|
- dune exec bin/main.exe -- compress <in> <out>
|
||||||
|
- dune exec bin/main.exe -- decompress <in> <out>
|
||||||
|
|
||||||
|
### Format
|
||||||
|
|
||||||
|
- Literal: flag bit 1 + 8-bit byte
|
||||||
|
- Match: flag bit 0 + 15-bit distance + 8-bit length
|
||||||
|
|
||||||
|
See `lib/` for module implementations :)
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
(executable
|
||||||
|
(name main)
|
||||||
|
(public_name lz77-cli)
|
||||||
|
(package lz77)
|
||||||
|
(libraries lz77)
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
let read_file path =
|
||||||
|
let ic = open_in_bin path in
|
||||||
|
let len = in_channel_length ic in
|
||||||
|
let s = really_input_string ic len in
|
||||||
|
close_in ic; Bytes.of_string s
|
||||||
|
|
||||||
|
let write_file path b =
|
||||||
|
let oc = open_out_bin path in
|
||||||
|
output_string oc (Bytes.to_string b);
|
||||||
|
close_out oc
|
||||||
|
|
||||||
|
let () =
|
||||||
|
match Array.to_list Sys.argv with
|
||||||
|
| _ :: mode :: in_path :: out_path :: [] ->
|
||||||
|
(try
|
||||||
|
let input = read_file in_path in
|
||||||
|
if mode = "compress" then begin
|
||||||
|
let out = Encoder.compress input in
|
||||||
|
write_file out_path out;
|
||||||
|
Printf.printf "Original: %d bytes\n" (Bytes.length input);
|
||||||
|
Printf.printf "Compressed: %d bytes\n" (Bytes.length out);
|
||||||
|
let ratio = 100.0 *. (float_of_int (Bytes.length out)) /. (float_of_int (Bytes.length input)) in
|
||||||
|
Printf.printf "Compression ratio: %.2f%%\n" ratio
|
||||||
|
end else if mode = "decompress" then begin
|
||||||
|
let out = Decoder.decompress input in
|
||||||
|
write_file out_path out;
|
||||||
|
Printf.printf "Decompressed to %d bytes\n" (Bytes.length out)
|
||||||
|
end else begin
|
||||||
|
prerr_endline "Invalid mode: use 'compress' or 'decompress'";
|
||||||
|
exit 2
|
||||||
|
end
|
||||||
|
with
|
||||||
|
| Sys_error e -> prerr_endline ("I/O error: " ^ e); exit 1
|
||||||
|
| Failure e -> prerr_endline ("Error: " ^ e); exit 1
|
||||||
|
| Invalid_argument e -> prerr_endline ("Invalid input: " ^ e); exit 1
|
||||||
|
)
|
||||||
|
| _ -> prerr_endline "Usage: lz77-cli (compress|decompress) <in> <out>"; exit 2
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
(lang dune 3.8)
|
||||||
|
(name lz77_ocaml)
|
||||||
|
(license MIT)
|
||||||
|
(version 0.1)
|
||||||
|
|
||||||
|
; Minimal dune-project for an OCaml library + binary
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
let decompress input =
|
||||||
|
let reader = Reader.of_bytes input in
|
||||||
|
let out = Buffer.create 4096 in
|
||||||
|
let window = Window.create () in
|
||||||
|
let rec loop () =
|
||||||
|
if Reader.available_bits reader <= 0 then () else
|
||||||
|
let flag = Reader.read_bits reader 1 in
|
||||||
|
if flag = 1 then (
|
||||||
|
if Reader.available_bits reader < 8 then invalid_arg "decompress: truncated literal";
|
||||||
|
let byte = Reader.read_bits reader 8 in
|
||||||
|
Buffer.add_char out (Char.chr byte);
|
||||||
|
Window.add_byte window byte;
|
||||||
|
loop ()
|
||||||
|
) else (
|
||||||
|
if Reader.available_bits reader < 23 then invalid_arg "decompress: truncated match";
|
||||||
|
let distance = Reader.read_bits reader 15 in
|
||||||
|
let length = Reader.read_bits reader 8 in
|
||||||
|
if distance <= 0 || distance > Window.window_size then invalid_arg "decompress: invalid distance";
|
||||||
|
for _ = 1 to length do
|
||||||
|
let b = Window.get_byte_at_distance window distance in
|
||||||
|
Buffer.add_char out (Char.chr b);
|
||||||
|
Window.add_byte window b
|
||||||
|
done;
|
||||||
|
loop ()
|
||||||
|
)
|
||||||
|
in
|
||||||
|
loop ();
|
||||||
|
Bytes.of_string (Buffer.contents out)
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
val decompress : bytes -> bytes
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
(library
|
||||||
|
(name lz77)
|
||||||
|
(public_name lz77)
|
||||||
|
(wrapped false)
|
||||||
|
(modules window finder writer reader encoder decoder)
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
let min_match_len = 3
|
||||||
|
let max_match_len = 255
|
||||||
|
let lookahead_size = 256
|
||||||
|
let window_size = 32768
|
||||||
|
|
||||||
|
let compress input =
|
||||||
|
let len = Bytes.length input in
|
||||||
|
let window = Window.create () in
|
||||||
|
let finder = Finder.create () in
|
||||||
|
let writer = Writer.create () in
|
||||||
|
let rec loop pos =
|
||||||
|
if pos >= len then () else
|
||||||
|
let la_len = min (len - pos) lookahead_size in
|
||||||
|
if la_len < min_match_len then (
|
||||||
|
for i = pos to len - 1 do
|
||||||
|
let byte = Char.code (Bytes.get input i) in
|
||||||
|
Writer.write_literal writer byte;
|
||||||
|
Window.add_byte window byte;
|
||||||
|
Finder.add_bytes finder window input i 1
|
||||||
|
done
|
||||||
|
) else (
|
||||||
|
match Finder.find_longest finder window input pos la_len with
|
||||||
|
| Some (distance, match_len) ->
|
||||||
|
let match_len = min match_len max_match_len in
|
||||||
|
Writer.write_match writer distance match_len;
|
||||||
|
Window.add_bytes window input pos match_len;
|
||||||
|
Finder.add_bytes finder window input pos match_len;
|
||||||
|
loop (pos + match_len)
|
||||||
|
| None ->
|
||||||
|
let byte = Char.code (Bytes.get input pos) in
|
||||||
|
Writer.write_literal writer byte;
|
||||||
|
Window.add_byte window byte;
|
||||||
|
Finder.add_bytes finder window input pos 1;
|
||||||
|
loop (pos + 1)
|
||||||
|
)
|
||||||
|
in
|
||||||
|
loop 0;
|
||||||
|
Writer.to_bytes writer
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
val min_match_len : int
|
||||||
|
val max_match_len : int
|
||||||
|
val lookahead_size : int
|
||||||
|
val window_size : int
|
||||||
|
|
||||||
|
val compress : bytes -> bytes
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
open Stdlib
|
||||||
|
|
||||||
|
module H = Hashtbl
|
||||||
|
|
||||||
|
let min_match_len = 3
|
||||||
|
let max_match_len = 255
|
||||||
|
let max_candidates = 128
|
||||||
|
|
||||||
|
type t = { table : (int, int list) H.t; mutable last_indexed_pos : int }
|
||||||
|
|
||||||
|
let create () = { table = H.create 131071; last_indexed_pos = 0 }
|
||||||
|
|
||||||
|
let hash3 b0 b1 b2 = (b0 lsl 16) lor (b1 lsl 8) lor b2
|
||||||
|
|
||||||
|
let add_start_pos t key pos =
|
||||||
|
match H.find_opt t.table key with
|
||||||
|
| None -> H.add t.table key [pos]
|
||||||
|
| Some lst -> H.replace t.table key (pos :: lst)
|
||||||
|
|
||||||
|
let add_bytes t window _buf _ofs _len =
|
||||||
|
let total = Int64.to_int (Window.total_processed window) in
|
||||||
|
let start = max 1 (t.last_indexed_pos + 1) in
|
||||||
|
let last = total - 2 in
|
||||||
|
for p = start to last do
|
||||||
|
let d = total - p + 1 in
|
||||||
|
let b0 = Window.get_byte_at_distance window d in
|
||||||
|
let b1 = Window.get_byte_at_distance window (d - 1) in
|
||||||
|
let b2 = Window.get_byte_at_distance window (d - 2) in
|
||||||
|
let key = hash3 b0 b1 b2 in
|
||||||
|
add_start_pos t key p
|
||||||
|
done;
|
||||||
|
if last >= t.last_indexed_pos then t.last_indexed_pos <- max t.last_indexed_pos (total - 1)
|
||||||
|
|
||||||
|
let take_n n lst =
|
||||||
|
let rec aux i acc = function
|
||||||
|
| [] | _ when i <= 0 -> List.rev acc
|
||||||
|
| x::xs -> aux (i - 1) (x :: acc) xs
|
||||||
|
in aux n [] lst
|
||||||
|
|
||||||
|
let prune_positions current_total = function
|
||||||
|
| [] -> []
|
||||||
|
| lst ->
|
||||||
|
let lower = current_total - Window.window_size + 1 in
|
||||||
|
List.filter (fun pos -> pos >= lower) lst
|
||||||
|
|
||||||
|
let find_longest t window lookahead ofs la_len =
|
||||||
|
if la_len - ofs < min_match_len then None else
|
||||||
|
let b0 = Char.code (Bytes.get lookahead ofs) in
|
||||||
|
let b1 = Char.code (Bytes.get lookahead (ofs + 1)) in
|
||||||
|
let b2 = Char.code (Bytes.get lookahead (ofs + 2)) in
|
||||||
|
let key = hash3 b0 b1 b2 in
|
||||||
|
match H.find_opt t.table key with
|
||||||
|
| None -> None
|
||||||
|
| Some positions_raw ->
|
||||||
|
let current_total = Int64.to_int (Window.total_processed window) in
|
||||||
|
let positions = prune_positions current_total positions_raw in
|
||||||
|
let candidates = take_n max_candidates positions in
|
||||||
|
let best = ref None in
|
||||||
|
let window_sz = Window.window_size in
|
||||||
|
List.iter (fun pos ->
|
||||||
|
let distance = current_total - pos + 1 in
|
||||||
|
if distance > 0 && distance <= window_sz then begin
|
||||||
|
let max_len = min max_match_len (la_len - ofs) in
|
||||||
|
let available = Window.current_size window - distance + 1 in
|
||||||
|
if available >= min_match_len then
|
||||||
|
let final_max = min max_len available in
|
||||||
|
let matched = Window.match_length_at_distance window distance lookahead ofs final_max in
|
||||||
|
if matched >= min_match_len then
|
||||||
|
match !best with
|
||||||
|
| None -> best := Some (distance, matched)
|
||||||
|
| Some (d,l) -> if matched > l || (matched = l && distance < d) then best := Some (distance, matched)
|
||||||
|
end
|
||||||
|
) candidates;
|
||||||
|
best
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
type t
|
||||||
|
|
||||||
|
val create : unit -> t
|
||||||
|
|
||||||
|
val add_bytes : t -> Window.t -> bytes -> int -> int -> unit
|
||||||
|
|
||||||
|
val find_longest : t -> Window.t -> bytes -> int -> int -> (int * int) option
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
type t = { data : bytes; len : int; mutable byte_idx : int; mutable bit_idx : int }
|
||||||
|
|
||||||
|
let of_bytes b = { data = b; len = Bytes.length b; byte_idx = 0; bit_idx = 0 }
|
||||||
|
|
||||||
|
let byte_pos t = t.byte_idx
|
||||||
|
let bit_pos t = t.bit_idx
|
||||||
|
|
||||||
|
let has_more t = t.byte_idx < t.len || (t.byte_idx = t.len && t.bit_idx <> 0)
|
||||||
|
|
||||||
|
let available_bits t = (t.len - t.byte_idx) * 8 - t.bit_idx
|
||||||
|
|
||||||
|
let read_bits t n =
|
||||||
|
if n <= 0 then 0 else begin
|
||||||
|
if n > 31 then invalid_arg "read_bits: n too large";
|
||||||
|
if available_bits t < n then invalid_arg "read_bits: not enough bits";
|
||||||
|
let acc = ref 0 in
|
||||||
|
for _ = 1 to n do
|
||||||
|
if t.byte_idx >= t.len then invalid_arg "read_bits: out of data";
|
||||||
|
let byte = Char.code (Bytes.get t.data t.byte_idx) in
|
||||||
|
let bit_pos = 7 - t.bit_idx in
|
||||||
|
let bit = (byte lsr bit_pos) land 1 in
|
||||||
|
acc := (!acc lsl 1) lor bit;
|
||||||
|
if t.bit_idx = 7 then (t.byte_idx <- t.byte_idx + 1; t.bit_idx <- 0) else t.bit_idx <- t.bit_idx + 1
|
||||||
|
done;
|
||||||
|
!acc
|
||||||
|
end
|
||||||
|
|
||||||
|
let peek_bits t n =
|
||||||
|
let saved_b = t.byte_idx in
|
||||||
|
let saved_bit = t.bit_idx in
|
||||||
|
let v = read_bits t n in
|
||||||
|
t.byte_idx <- saved_b; t.bit_idx <- saved_bit; v
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
type t
|
||||||
|
|
||||||
|
val of_bytes : bytes -> t
|
||||||
|
|
||||||
|
val read_bits : t -> int -> int
|
||||||
|
|
||||||
|
val peek_bits : t -> int -> int
|
||||||
|
|
||||||
|
val has_more : t -> bool
|
||||||
|
|
||||||
|
val available_bits : t -> int
|
||||||
|
|
||||||
|
val byte_pos : t -> int
|
||||||
|
val bit_pos : t -> int
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
let window_size = 32768
|
||||||
|
|
||||||
|
type t = { data : bytes; mutable write_pos : int; mutable size : int; mutable total : int64 }
|
||||||
|
|
||||||
|
let create () = { data = Bytes.make window_size '\000'; write_pos = 0; size = 0; total = 0L }
|
||||||
|
|
||||||
|
let current_size t = t.size
|
||||||
|
let total_processed t = t.total
|
||||||
|
|
||||||
|
let add_byte t b =
|
||||||
|
if b < 0 || b > 255 then invalid_arg "add_byte: byte out of range";
|
||||||
|
Bytes.set t.data t.write_pos (Char.chr b);
|
||||||
|
t.write_pos <- (t.write_pos + 1) mod window_size;
|
||||||
|
t.total <- Int64.succ t.total;
|
||||||
|
if t.size < window_size then t.size <- t.size + 1
|
||||||
|
|
||||||
|
let add_bytes t buf ofs len =
|
||||||
|
if ofs < 0 || ofs + len > Bytes.length buf then invalid_arg "add_bytes: out of bounds";
|
||||||
|
for i = 0 to len - 1 do
|
||||||
|
Bytes.set t.data t.write_pos (Bytes.get buf (ofs + i));
|
||||||
|
t.write_pos <- (t.write_pos + 1) mod window_size;
|
||||||
|
t.total <- Int64.succ t.total;
|
||||||
|
if t.size < window_size then t.size <- t.size + 1
|
||||||
|
done
|
||||||
|
|
||||||
|
let get_byte_at_distance t distance =
|
||||||
|
if distance <= 0 || distance > t.size || distance > window_size then
|
||||||
|
invalid_arg "get_byte_at_distance: invalid distance";
|
||||||
|
let pos = (t.write_pos - distance) mod window_size in
|
||||||
|
let pos = if pos < 0 then pos + window_size else pos in
|
||||||
|
Char.code (Bytes.get t.data pos)
|
||||||
|
|
||||||
|
let extract_slice t start len =
|
||||||
|
if start <= 0 || start > t.size then invalid_arg "extract_slice: invalid start";
|
||||||
|
if len < 0 then invalid_arg "extract_slice: negative len";
|
||||||
|
let out = Bytes.make len '\000' in
|
||||||
|
for i = 0 to len - 1 do
|
||||||
|
let dist = start + i in
|
||||||
|
if dist > t.size then invalid_arg "extract_slice: out of range";
|
||||||
|
let pos = (t.write_pos - dist) mod window_size in
|
||||||
|
let pos = if pos < 0 then pos + window_size else pos in
|
||||||
|
Bytes.set out i (Bytes.get t.data pos)
|
||||||
|
done;
|
||||||
|
out
|
||||||
|
|
||||||
|
let match_length_at_distance t distance lookahead ofs maxlen =
|
||||||
|
if distance <= 0 || distance > t.size then invalid_arg "match_length_at_distance: invalid distance";
|
||||||
|
let remaining = Bytes.length lookahead - ofs in
|
||||||
|
if remaining <= 0 || maxlen <= 0 then 0 else
|
||||||
|
let available_in_window = t.size - distance + 1 in
|
||||||
|
let max_cmp = min maxlen (min remaining available_in_window) in
|
||||||
|
let start_pos = (t.write_pos - distance) mod window_size in
|
||||||
|
let start_pos = if start_pos < 0 then start_pos + window_size else start_pos in
|
||||||
|
let rec cmp i =
|
||||||
|
if i >= max_cmp then i else
|
||||||
|
let pos = (start_pos + i) mod window_size in
|
||||||
|
if Bytes.get t.data pos = Bytes.get lookahead (ofs + i) then cmp (i + 1) else i
|
||||||
|
in cmp 0
|
||||||
|
|
@ -0,0 +1,19 @@
|
||||||
|
val window_size : int
|
||||||
|
|
||||||
|
type t
|
||||||
|
|
||||||
|
val create : unit -> t
|
||||||
|
|
||||||
|
val add_byte : t -> int -> unit
|
||||||
|
|
||||||
|
val add_bytes : t -> bytes -> int -> int -> unit
|
||||||
|
|
||||||
|
val get_byte_at_distance : t -> int -> int
|
||||||
|
|
||||||
|
val current_size : t -> int
|
||||||
|
|
||||||
|
val total_processed : t -> int64
|
||||||
|
|
||||||
|
val extract_slice : t -> int -> int -> bytes
|
||||||
|
|
||||||
|
val match_length_at_distance : t -> int -> bytes -> int -> int -> int
|
||||||
|
|
@ -0,0 +1,36 @@
|
||||||
|
type t = { buf : Buffer.t; mutable cur : int; mutable filled : int }
|
||||||
|
|
||||||
|
let create () = { buf = Buffer.create 1024; cur = 0; filled = 0 }
|
||||||
|
|
||||||
|
let push_byte t = Buffer.add_char t.buf (Char.chr t.cur); t.cur <- 0; t.filled <- 0
|
||||||
|
|
||||||
|
let write_bit t bit =
|
||||||
|
let bit = if bit <> 0 then 1 else 0 in
|
||||||
|
let pos = 7 - t.filled in
|
||||||
|
t.cur <- t.cur lor (bit lsl pos);
|
||||||
|
t.filled <- t.filled + 1;
|
||||||
|
if t.filled = 8 then push_byte t
|
||||||
|
|
||||||
|
let write_bits t value nbits =
|
||||||
|
if nbits <= 0 then invalid_arg "write_bits: nbits must be > 0";
|
||||||
|
if nbits > 31 then invalid_arg "write_bits: nbits too large";
|
||||||
|
for i = nbits - 1 downto 0 do
|
||||||
|
let bit = (value lsr i) land 1 in
|
||||||
|
write_bit t bit
|
||||||
|
done
|
||||||
|
|
||||||
|
let write_literal t byte =
|
||||||
|
if byte < 0 || byte > 255 then invalid_arg "write_literal: byte out of range";
|
||||||
|
write_bit t 1;
|
||||||
|
write_bits t byte 8
|
||||||
|
|
||||||
|
let write_match t distance length =
|
||||||
|
if distance <= 0 || distance >= (1 lsl 15) then invalid_arg "write_match: invalid distance";
|
||||||
|
if length <= 0 || length > 255 then invalid_arg "write_match: invalid length";
|
||||||
|
write_bit t 0;
|
||||||
|
write_bits t distance 15;
|
||||||
|
write_bits t length 8
|
||||||
|
|
||||||
|
let flush t = if t.filled > 0 then push_byte t
|
||||||
|
|
||||||
|
let to_bytes t = flush t; Buffer.contents t.buf |> Bytes.of_string
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
type t
|
||||||
|
|
||||||
|
val create : unit -> t
|
||||||
|
|
||||||
|
val write_bit : t -> int -> unit
|
||||||
|
val write_bits : t -> int -> int -> unit
|
||||||
|
|
||||||
|
val flush : t -> unit
|
||||||
|
val to_bytes : t -> bytes
|
||||||
|
|
||||||
|
val write_literal : t -> int -> unit
|
||||||
|
val write_match : t -> int -> int -> unit
|
||||||
Loading…
Reference in New Issue