This commit is contained in:
kim 2025-12-21 16:38:25 +00:00 committed by GitHub
commit c361dd43da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 396 additions and 0 deletions

16
README.md Normal file
View File

@ -0,0 +1,16 @@
# lz77
lz77 compressor/decompressor using a 32KB sliding window which includes an allocation free window comparison helper and a prefix-indexing finder that prunes out-of-window positions
### Build
- dune build
- dune exec bin/main.exe -- compress <in> <out>
- dune exec bin/main.exe -- decompress <in> <out>
### Format
- Literal: flag bit 1 + 8-bit byte
- Match: flag bit 0 + 15-bit distance + 8-bit length
See `lib/` for module implementations :)

6
bin/dune Normal file
View File

@ -0,0 +1,6 @@
(executable
(name main)
(public_name lz77-cli)
(package lz77)
(libraries lz77)
)

37
bin/main.ml Normal file
View File

@ -0,0 +1,37 @@
let read_file path =
let ic = open_in_bin path in
let len = in_channel_length ic in
let s = really_input_string ic len in
close_in ic; Bytes.of_string s
let write_file path b =
let oc = open_out_bin path in
output_string oc (Bytes.to_string b);
close_out oc
let () =
match Array.to_list Sys.argv with
| _ :: mode :: in_path :: out_path :: [] ->
(try
let input = read_file in_path in
if mode = "compress" then begin
let out = Encoder.compress input in
write_file out_path out;
Printf.printf "Original: %d bytes\n" (Bytes.length input);
Printf.printf "Compressed: %d bytes\n" (Bytes.length out);
let ratio = 100.0 *. (float_of_int (Bytes.length out)) /. (float_of_int (Bytes.length input)) in
Printf.printf "Compression ratio: %.2f%%\n" ratio
end else if mode = "decompress" then begin
let out = Decoder.decompress input in
write_file out_path out;
Printf.printf "Decompressed to %d bytes\n" (Bytes.length out)
end else begin
prerr_endline "Invalid mode: use 'compress' or 'decompress'";
exit 2
end
with
| Sys_error e -> prerr_endline ("I/O error: " ^ e); exit 1
| Failure e -> prerr_endline ("Error: " ^ e); exit 1
| Invalid_argument e -> prerr_endline ("Invalid input: " ^ e); exit 1
)
| _ -> prerr_endline "Usage: lz77-cli (compress|decompress) <in> <out>"; exit 2

6
dune-project Normal file
View File

@ -0,0 +1,6 @@
(lang dune 3.8)
(name lz77_ocaml)
(license MIT)
(version 0.1)
; Minimal dune-project for an OCaml library + binary

28
lib/decoder.ml Normal file
View File

@ -0,0 +1,28 @@
let decompress input =
let reader = Reader.of_bytes input in
let out = Buffer.create 4096 in
let window = Window.create () in
let rec loop () =
if Reader.available_bits reader <= 0 then () else
let flag = Reader.read_bits reader 1 in
if flag = 1 then (
if Reader.available_bits reader < 8 then invalid_arg "decompress: truncated literal";
let byte = Reader.read_bits reader 8 in
Buffer.add_char out (Char.chr byte);
Window.add_byte window byte;
loop ()
) else (
if Reader.available_bits reader < 23 then invalid_arg "decompress: truncated match";
let distance = Reader.read_bits reader 15 in
let length = Reader.read_bits reader 8 in
if distance <= 0 || distance > Window.window_size then invalid_arg "decompress: invalid distance";
for _ = 1 to length do
let b = Window.get_byte_at_distance window distance in
Buffer.add_char out (Char.chr b);
Window.add_byte window b
done;
loop ()
)
in
loop ();
Bytes.of_string (Buffer.contents out)

1
lib/decoder.mli Normal file
View File

@ -0,0 +1 @@
val decompress : bytes -> bytes

6
lib/dune Normal file
View File

@ -0,0 +1,6 @@
(library
(name lz77)
(public_name lz77)
(wrapped false)
(modules window finder writer reader encoder decoder)
)

38
lib/encoder.ml Normal file
View File

@ -0,0 +1,38 @@
let min_match_len = 3
let max_match_len = 255
let lookahead_size = 256
let window_size = 32768
let compress input =
let len = Bytes.length input in
let window = Window.create () in
let finder = Finder.create () in
let writer = Writer.create () in
let rec loop pos =
if pos >= len then () else
let la_len = min (len - pos) lookahead_size in
if la_len < min_match_len then (
for i = pos to len - 1 do
let byte = Char.code (Bytes.get input i) in
Writer.write_literal writer byte;
Window.add_byte window byte;
Finder.add_bytes finder window input i 1
done
) else (
match Finder.find_longest finder window input pos la_len with
| Some (distance, match_len) ->
let match_len = min match_len max_match_len in
Writer.write_match writer distance match_len;
Window.add_bytes window input pos match_len;
Finder.add_bytes finder window input pos match_len;
loop (pos + match_len)
| None ->
let byte = Char.code (Bytes.get input pos) in
Writer.write_literal writer byte;
Window.add_byte window byte;
Finder.add_bytes finder window input pos 1;
loop (pos + 1)
)
in
loop 0;
Writer.to_bytes writer

6
lib/encoder.mli Normal file
View File

@ -0,0 +1,6 @@
val min_match_len : int
val max_match_len : int
val lookahead_size : int
val window_size : int
val compress : bytes -> bytes

74
lib/finder.ml Normal file
View File

@ -0,0 +1,74 @@
open Stdlib
module H = Hashtbl
let min_match_len = 3
let max_match_len = 255
let max_candidates = 128
type t = { table : (int, int list) H.t; mutable last_indexed_pos : int }
let create () = { table = H.create 131071; last_indexed_pos = 0 }
let hash3 b0 b1 b2 = (b0 lsl 16) lor (b1 lsl 8) lor b2
let add_start_pos t key pos =
match H.find_opt t.table key with
| None -> H.add t.table key [pos]
| Some lst -> H.replace t.table key (pos :: lst)
let add_bytes t window _buf _ofs _len =
let total = Int64.to_int (Window.total_processed window) in
let start = max 1 (t.last_indexed_pos + 1) in
let last = total - 2 in
for p = start to last do
let d = total - p + 1 in
let b0 = Window.get_byte_at_distance window d in
let b1 = Window.get_byte_at_distance window (d - 1) in
let b2 = Window.get_byte_at_distance window (d - 2) in
let key = hash3 b0 b1 b2 in
add_start_pos t key p
done;
if last >= t.last_indexed_pos then t.last_indexed_pos <- max t.last_indexed_pos (total - 1)
let take_n n lst =
let rec aux i acc = function
| [] | _ when i <= 0 -> List.rev acc
| x::xs -> aux (i - 1) (x :: acc) xs
in aux n [] lst
let prune_positions current_total = function
| [] -> []
| lst ->
let lower = current_total - Window.window_size + 1 in
List.filter (fun pos -> pos >= lower) lst
let find_longest t window lookahead ofs la_len =
if la_len - ofs < min_match_len then None else
let b0 = Char.code (Bytes.get lookahead ofs) in
let b1 = Char.code (Bytes.get lookahead (ofs + 1)) in
let b2 = Char.code (Bytes.get lookahead (ofs + 2)) in
let key = hash3 b0 b1 b2 in
match H.find_opt t.table key with
| None -> None
| Some positions_raw ->
let current_total = Int64.to_int (Window.total_processed window) in
let positions = prune_positions current_total positions_raw in
let candidates = take_n max_candidates positions in
let best = ref None in
let window_sz = Window.window_size in
List.iter (fun pos ->
let distance = current_total - pos + 1 in
if distance > 0 && distance <= window_sz then begin
let max_len = min max_match_len (la_len - ofs) in
let available = Window.current_size window - distance + 1 in
if available >= min_match_len then
let final_max = min max_len available in
let matched = Window.match_length_at_distance window distance lookahead ofs final_max in
if matched >= min_match_len then
match !best with
| None -> best := Some (distance, matched)
| Some (d,l) -> if matched > l || (matched = l && distance < d) then best := Some (distance, matched)
end
) candidates;
best

7
lib/finder.mli Normal file
View File

@ -0,0 +1,7 @@
type t
val create : unit -> t
val add_bytes : t -> Window.t -> bytes -> int -> int -> unit
val find_longest : t -> Window.t -> bytes -> int -> int -> (int * int) option

32
lib/reader.ml Normal file
View File

@ -0,0 +1,32 @@
type t = { data : bytes; len : int; mutable byte_idx : int; mutable bit_idx : int }
let of_bytes b = { data = b; len = Bytes.length b; byte_idx = 0; bit_idx = 0 }
let byte_pos t = t.byte_idx
let bit_pos t = t.bit_idx
let has_more t = t.byte_idx < t.len || (t.byte_idx = t.len && t.bit_idx <> 0)
let available_bits t = (t.len - t.byte_idx) * 8 - t.bit_idx
let read_bits t n =
if n <= 0 then 0 else begin
if n > 31 then invalid_arg "read_bits: n too large";
if available_bits t < n then invalid_arg "read_bits: not enough bits";
let acc = ref 0 in
for _ = 1 to n do
if t.byte_idx >= t.len then invalid_arg "read_bits: out of data";
let byte = Char.code (Bytes.get t.data t.byte_idx) in
let bit_pos = 7 - t.bit_idx in
let bit = (byte lsr bit_pos) land 1 in
acc := (!acc lsl 1) lor bit;
if t.bit_idx = 7 then (t.byte_idx <- t.byte_idx + 1; t.bit_idx <- 0) else t.bit_idx <- t.bit_idx + 1
done;
!acc
end
let peek_bits t n =
let saved_b = t.byte_idx in
let saved_bit = t.bit_idx in
let v = read_bits t n in
t.byte_idx <- saved_b; t.bit_idx <- saved_bit; v

14
lib/reader.mli Normal file
View File

@ -0,0 +1,14 @@
type t
val of_bytes : bytes -> t
val read_bits : t -> int -> int
val peek_bits : t -> int -> int
val has_more : t -> bool
val available_bits : t -> int
val byte_pos : t -> int
val bit_pos : t -> int

58
lib/window.ml Normal file
View File

@ -0,0 +1,58 @@
let window_size = 32768
type t = { data : bytes; mutable write_pos : int; mutable size : int; mutable total : int64 }
let create () = { data = Bytes.make window_size '\000'; write_pos = 0; size = 0; total = 0L }
let current_size t = t.size
let total_processed t = t.total
let add_byte t b =
if b < 0 || b > 255 then invalid_arg "add_byte: byte out of range";
Bytes.set t.data t.write_pos (Char.chr b);
t.write_pos <- (t.write_pos + 1) mod window_size;
t.total <- Int64.succ t.total;
if t.size < window_size then t.size <- t.size + 1
let add_bytes t buf ofs len =
if ofs < 0 || ofs + len > Bytes.length buf then invalid_arg "add_bytes: out of bounds";
for i = 0 to len - 1 do
Bytes.set t.data t.write_pos (Bytes.get buf (ofs + i));
t.write_pos <- (t.write_pos + 1) mod window_size;
t.total <- Int64.succ t.total;
if t.size < window_size then t.size <- t.size + 1
done
let get_byte_at_distance t distance =
if distance <= 0 || distance > t.size || distance > window_size then
invalid_arg "get_byte_at_distance: invalid distance";
let pos = (t.write_pos - distance) mod window_size in
let pos = if pos < 0 then pos + window_size else pos in
Char.code (Bytes.get t.data pos)
let extract_slice t start len =
if start <= 0 || start > t.size then invalid_arg "extract_slice: invalid start";
if len < 0 then invalid_arg "extract_slice: negative len";
let out = Bytes.make len '\000' in
for i = 0 to len - 1 do
let dist = start + i in
if dist > t.size then invalid_arg "extract_slice: out of range";
let pos = (t.write_pos - dist) mod window_size in
let pos = if pos < 0 then pos + window_size else pos in
Bytes.set out i (Bytes.get t.data pos)
done;
out
let match_length_at_distance t distance lookahead ofs maxlen =
if distance <= 0 || distance > t.size then invalid_arg "match_length_at_distance: invalid distance";
let remaining = Bytes.length lookahead - ofs in
if remaining <= 0 || maxlen <= 0 then 0 else
let available_in_window = t.size - distance + 1 in
let max_cmp = min maxlen (min remaining available_in_window) in
let start_pos = (t.write_pos - distance) mod window_size in
let start_pos = if start_pos < 0 then start_pos + window_size else start_pos in
let rec cmp i =
if i >= max_cmp then i else
let pos = (start_pos + i) mod window_size in
if Bytes.get t.data pos = Bytes.get lookahead (ofs + i) then cmp (i + 1) else i
in cmp 0

19
lib/window.mli Normal file
View File

@ -0,0 +1,19 @@
val window_size : int
type t
val create : unit -> t
val add_byte : t -> int -> unit
val add_bytes : t -> bytes -> int -> int -> unit
val get_byte_at_distance : t -> int -> int
val current_size : t -> int
val total_processed : t -> int64
val extract_slice : t -> int -> int -> bytes
val match_length_at_distance : t -> int -> bytes -> int -> int -> int

36
lib/writer.ml Normal file
View File

@ -0,0 +1,36 @@
type t = { buf : Buffer.t; mutable cur : int; mutable filled : int }
let create () = { buf = Buffer.create 1024; cur = 0; filled = 0 }
let push_byte t = Buffer.add_char t.buf (Char.chr t.cur); t.cur <- 0; t.filled <- 0
let write_bit t bit =
let bit = if bit <> 0 then 1 else 0 in
let pos = 7 - t.filled in
t.cur <- t.cur lor (bit lsl pos);
t.filled <- t.filled + 1;
if t.filled = 8 then push_byte t
let write_bits t value nbits =
if nbits <= 0 then invalid_arg "write_bits: nbits must be > 0";
if nbits > 31 then invalid_arg "write_bits: nbits too large";
for i = nbits - 1 downto 0 do
let bit = (value lsr i) land 1 in
write_bit t bit
done
let write_literal t byte =
if byte < 0 || byte > 255 then invalid_arg "write_literal: byte out of range";
write_bit t 1;
write_bits t byte 8
let write_match t distance length =
if distance <= 0 || distance >= (1 lsl 15) then invalid_arg "write_match: invalid distance";
if length <= 0 || length > 255 then invalid_arg "write_match: invalid length";
write_bit t 0;
write_bits t distance 15;
write_bits t length 8
let flush t = if t.filled > 0 then push_byte t
let to_bytes t = flush t; Buffer.contents t.buf |> Bytes.of_string

12
lib/writer.mli Normal file
View File

@ -0,0 +1,12 @@
type t
val create : unit -> t
val write_bit : t -> int -> unit
val write_bits : t -> int -> int -> unit
val flush : t -> unit
val to_bytes : t -> bytes
val write_literal : t -> int -> unit
val write_match : t -> int -> int -> unit