From c361dd43da33e2245b19d773c0112ac4f4bc53aa Mon Sep 17 00:00:00 2001 From: kim Date: Sun, 21 Dec 2025 16:38:25 +0000 Subject: [PATCH] Initial --- README.md | 16 +++++++++++ bin/dune | 6 ++++ bin/main.ml | 37 +++++++++++++++++++++++++ dune-project | 6 ++++ lib/decoder.ml | 28 +++++++++++++++++++ lib/decoder.mli | 1 + lib/dune | 6 ++++ lib/encoder.ml | 38 +++++++++++++++++++++++++ lib/encoder.mli | 6 ++++ lib/finder.ml | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/finder.mli | 7 +++++ lib/reader.ml | 32 +++++++++++++++++++++ lib/reader.mli | 14 ++++++++++ lib/window.ml | 58 ++++++++++++++++++++++++++++++++++++++ lib/window.mli | 19 +++++++++++++ lib/writer.ml | 36 ++++++++++++++++++++++++ lib/writer.mli | 12 ++++++++ 17 files changed, 396 insertions(+) create mode 100644 README.md create mode 100644 bin/dune create mode 100644 bin/main.ml create mode 100644 dune-project create mode 100644 lib/decoder.ml create mode 100644 lib/decoder.mli create mode 100644 lib/dune create mode 100644 lib/encoder.ml create mode 100644 lib/encoder.mli create mode 100644 lib/finder.ml create mode 100644 lib/finder.mli create mode 100644 lib/reader.ml create mode 100644 lib/reader.mli create mode 100644 lib/window.ml create mode 100644 lib/window.mli create mode 100644 lib/writer.ml create mode 100644 lib/writer.mli diff --git a/README.md b/README.md new file mode 100644 index 0000000..9770953 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +# lz77 + +lz77 compressor/decompressor using a 32KB sliding window which includes an allocation free window comparison helper and a prefix-indexing finder that prunes out-of-window positions + +### Build + +- dune build +- dune exec bin/main.exe -- compress +- dune exec bin/main.exe -- decompress + +### Format + +- Literal: flag bit 1 + 8-bit byte +- Match: flag bit 0 + 15-bit distance + 8-bit length + +See `lib/` for module implementations :) diff --git a/bin/dune b/bin/dune new file mode 100644 index 0000000..be81c6b --- /dev/null +++ b/bin/dune @@ -0,0 +1,6 @@ +(executable + (name main) + (public_name lz77-cli) + (package lz77) + (libraries lz77) +) diff --git a/bin/main.ml b/bin/main.ml new file mode 100644 index 0000000..f086c34 --- /dev/null +++ b/bin/main.ml @@ -0,0 +1,37 @@ +let read_file path = + let ic = open_in_bin path in + let len = in_channel_length ic in + let s = really_input_string ic len in + close_in ic; Bytes.of_string s + +let write_file path b = + let oc = open_out_bin path in + output_string oc (Bytes.to_string b); + close_out oc + +let () = + match Array.to_list Sys.argv with + | _ :: mode :: in_path :: out_path :: [] -> + (try + let input = read_file in_path in + if mode = "compress" then begin + let out = Encoder.compress input in + write_file out_path out; + Printf.printf "Original: %d bytes\n" (Bytes.length input); + Printf.printf "Compressed: %d bytes\n" (Bytes.length out); + let ratio = 100.0 *. (float_of_int (Bytes.length out)) /. (float_of_int (Bytes.length input)) in + Printf.printf "Compression ratio: %.2f%%\n" ratio + end else if mode = "decompress" then begin + let out = Decoder.decompress input in + write_file out_path out; + Printf.printf "Decompressed to %d bytes\n" (Bytes.length out) + end else begin + prerr_endline "Invalid mode: use 'compress' or 'decompress'"; + exit 2 + end + with + | Sys_error e -> prerr_endline ("I/O error: " ^ e); exit 1 + | Failure e -> prerr_endline ("Error: " ^ e); exit 1 + | Invalid_argument e -> prerr_endline ("Invalid input: " ^ e); exit 1 + ) + | _ -> prerr_endline "Usage: lz77-cli (compress|decompress) "; exit 2 diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..00c6138 --- /dev/null +++ b/dune-project @@ -0,0 +1,6 @@ +(lang dune 3.8) +(name lz77_ocaml) +(license MIT) +(version 0.1) + +; Minimal dune-project for an OCaml library + binary diff --git a/lib/decoder.ml b/lib/decoder.ml new file mode 100644 index 0000000..94f395a --- /dev/null +++ b/lib/decoder.ml @@ -0,0 +1,28 @@ +let decompress input = + let reader = Reader.of_bytes input in + let out = Buffer.create 4096 in + let window = Window.create () in + let rec loop () = + if Reader.available_bits reader <= 0 then () else + let flag = Reader.read_bits reader 1 in + if flag = 1 then ( + if Reader.available_bits reader < 8 then invalid_arg "decompress: truncated literal"; + let byte = Reader.read_bits reader 8 in + Buffer.add_char out (Char.chr byte); + Window.add_byte window byte; + loop () + ) else ( + if Reader.available_bits reader < 23 then invalid_arg "decompress: truncated match"; + let distance = Reader.read_bits reader 15 in + let length = Reader.read_bits reader 8 in + if distance <= 0 || distance > Window.window_size then invalid_arg "decompress: invalid distance"; + for _ = 1 to length do + let b = Window.get_byte_at_distance window distance in + Buffer.add_char out (Char.chr b); + Window.add_byte window b + done; + loop () + ) + in + loop (); + Bytes.of_string (Buffer.contents out) diff --git a/lib/decoder.mli b/lib/decoder.mli new file mode 100644 index 0000000..867ec10 --- /dev/null +++ b/lib/decoder.mli @@ -0,0 +1 @@ +val decompress : bytes -> bytes diff --git a/lib/dune b/lib/dune new file mode 100644 index 0000000..0f49789 --- /dev/null +++ b/lib/dune @@ -0,0 +1,6 @@ +(library + (name lz77) + (public_name lz77) + (wrapped false) + (modules window finder writer reader encoder decoder) +) diff --git a/lib/encoder.ml b/lib/encoder.ml new file mode 100644 index 0000000..bc47847 --- /dev/null +++ b/lib/encoder.ml @@ -0,0 +1,38 @@ +let min_match_len = 3 +let max_match_len = 255 +let lookahead_size = 256 +let window_size = 32768 + +let compress input = + let len = Bytes.length input in + let window = Window.create () in + let finder = Finder.create () in + let writer = Writer.create () in + let rec loop pos = + if pos >= len then () else + let la_len = min (len - pos) lookahead_size in + if la_len < min_match_len then ( + for i = pos to len - 1 do + let byte = Char.code (Bytes.get input i) in + Writer.write_literal writer byte; + Window.add_byte window byte; + Finder.add_bytes finder window input i 1 + done + ) else ( + match Finder.find_longest finder window input pos la_len with + | Some (distance, match_len) -> + let match_len = min match_len max_match_len in + Writer.write_match writer distance match_len; + Window.add_bytes window input pos match_len; + Finder.add_bytes finder window input pos match_len; + loop (pos + match_len) + | None -> + let byte = Char.code (Bytes.get input pos) in + Writer.write_literal writer byte; + Window.add_byte window byte; + Finder.add_bytes finder window input pos 1; + loop (pos + 1) + ) + in + loop 0; + Writer.to_bytes writer diff --git a/lib/encoder.mli b/lib/encoder.mli new file mode 100644 index 0000000..1b748f4 --- /dev/null +++ b/lib/encoder.mli @@ -0,0 +1,6 @@ +val min_match_len : int +val max_match_len : int +val lookahead_size : int +val window_size : int + +val compress : bytes -> bytes diff --git a/lib/finder.ml b/lib/finder.ml new file mode 100644 index 0000000..2073f79 --- /dev/null +++ b/lib/finder.ml @@ -0,0 +1,74 @@ +open Stdlib + +module H = Hashtbl + +let min_match_len = 3 +let max_match_len = 255 +let max_candidates = 128 + +type t = { table : (int, int list) H.t; mutable last_indexed_pos : int } + +let create () = { table = H.create 131071; last_indexed_pos = 0 } + +let hash3 b0 b1 b2 = (b0 lsl 16) lor (b1 lsl 8) lor b2 + +let add_start_pos t key pos = + match H.find_opt t.table key with + | None -> H.add t.table key [pos] + | Some lst -> H.replace t.table key (pos :: lst) + +let add_bytes t window _buf _ofs _len = + let total = Int64.to_int (Window.total_processed window) in + let start = max 1 (t.last_indexed_pos + 1) in + let last = total - 2 in + for p = start to last do + let d = total - p + 1 in + let b0 = Window.get_byte_at_distance window d in + let b1 = Window.get_byte_at_distance window (d - 1) in + let b2 = Window.get_byte_at_distance window (d - 2) in + let key = hash3 b0 b1 b2 in + add_start_pos t key p + done; + if last >= t.last_indexed_pos then t.last_indexed_pos <- max t.last_indexed_pos (total - 1) + +let take_n n lst = + let rec aux i acc = function + | [] | _ when i <= 0 -> List.rev acc + | x::xs -> aux (i - 1) (x :: acc) xs + in aux n [] lst + +let prune_positions current_total = function + | [] -> [] + | lst -> + let lower = current_total - Window.window_size + 1 in + List.filter (fun pos -> pos >= lower) lst + +let find_longest t window lookahead ofs la_len = + if la_len - ofs < min_match_len then None else + let b0 = Char.code (Bytes.get lookahead ofs) in + let b1 = Char.code (Bytes.get lookahead (ofs + 1)) in + let b2 = Char.code (Bytes.get lookahead (ofs + 2)) in + let key = hash3 b0 b1 b2 in + match H.find_opt t.table key with + | None -> None + | Some positions_raw -> + let current_total = Int64.to_int (Window.total_processed window) in + let positions = prune_positions current_total positions_raw in + let candidates = take_n max_candidates positions in + let best = ref None in + let window_sz = Window.window_size in + List.iter (fun pos -> + let distance = current_total - pos + 1 in + if distance > 0 && distance <= window_sz then begin + let max_len = min max_match_len (la_len - ofs) in + let available = Window.current_size window - distance + 1 in + if available >= min_match_len then + let final_max = min max_len available in + let matched = Window.match_length_at_distance window distance lookahead ofs final_max in + if matched >= min_match_len then + match !best with + | None -> best := Some (distance, matched) + | Some (d,l) -> if matched > l || (matched = l && distance < d) then best := Some (distance, matched) + end + ) candidates; + best diff --git a/lib/finder.mli b/lib/finder.mli new file mode 100644 index 0000000..b804489 --- /dev/null +++ b/lib/finder.mli @@ -0,0 +1,7 @@ +type t + +val create : unit -> t + +val add_bytes : t -> Window.t -> bytes -> int -> int -> unit + +val find_longest : t -> Window.t -> bytes -> int -> int -> (int * int) option diff --git a/lib/reader.ml b/lib/reader.ml new file mode 100644 index 0000000..62c6420 --- /dev/null +++ b/lib/reader.ml @@ -0,0 +1,32 @@ +type t = { data : bytes; len : int; mutable byte_idx : int; mutable bit_idx : int } + +let of_bytes b = { data = b; len = Bytes.length b; byte_idx = 0; bit_idx = 0 } + +let byte_pos t = t.byte_idx +let bit_pos t = t.bit_idx + +let has_more t = t.byte_idx < t.len || (t.byte_idx = t.len && t.bit_idx <> 0) + +let available_bits t = (t.len - t.byte_idx) * 8 - t.bit_idx + +let read_bits t n = + if n <= 0 then 0 else begin + if n > 31 then invalid_arg "read_bits: n too large"; + if available_bits t < n then invalid_arg "read_bits: not enough bits"; + let acc = ref 0 in + for _ = 1 to n do + if t.byte_idx >= t.len then invalid_arg "read_bits: out of data"; + let byte = Char.code (Bytes.get t.data t.byte_idx) in + let bit_pos = 7 - t.bit_idx in + let bit = (byte lsr bit_pos) land 1 in + acc := (!acc lsl 1) lor bit; + if t.bit_idx = 7 then (t.byte_idx <- t.byte_idx + 1; t.bit_idx <- 0) else t.bit_idx <- t.bit_idx + 1 + done; + !acc + end + +let peek_bits t n = + let saved_b = t.byte_idx in + let saved_bit = t.bit_idx in + let v = read_bits t n in + t.byte_idx <- saved_b; t.bit_idx <- saved_bit; v diff --git a/lib/reader.mli b/lib/reader.mli new file mode 100644 index 0000000..2cd41dd --- /dev/null +++ b/lib/reader.mli @@ -0,0 +1,14 @@ +type t + +val of_bytes : bytes -> t + +val read_bits : t -> int -> int + +val peek_bits : t -> int -> int + +val has_more : t -> bool + +val available_bits : t -> int + +val byte_pos : t -> int +val bit_pos : t -> int diff --git a/lib/window.ml b/lib/window.ml new file mode 100644 index 0000000..92d120f --- /dev/null +++ b/lib/window.ml @@ -0,0 +1,58 @@ +let window_size = 32768 + +type t = { data : bytes; mutable write_pos : int; mutable size : int; mutable total : int64 } + +let create () = { data = Bytes.make window_size '\000'; write_pos = 0; size = 0; total = 0L } + +let current_size t = t.size +let total_processed t = t.total + +let add_byte t b = + if b < 0 || b > 255 then invalid_arg "add_byte: byte out of range"; + Bytes.set t.data t.write_pos (Char.chr b); + t.write_pos <- (t.write_pos + 1) mod window_size; + t.total <- Int64.succ t.total; + if t.size < window_size then t.size <- t.size + 1 + +let add_bytes t buf ofs len = + if ofs < 0 || ofs + len > Bytes.length buf then invalid_arg "add_bytes: out of bounds"; + for i = 0 to len - 1 do + Bytes.set t.data t.write_pos (Bytes.get buf (ofs + i)); + t.write_pos <- (t.write_pos + 1) mod window_size; + t.total <- Int64.succ t.total; + if t.size < window_size then t.size <- t.size + 1 + done + +let get_byte_at_distance t distance = + if distance <= 0 || distance > t.size || distance > window_size then + invalid_arg "get_byte_at_distance: invalid distance"; + let pos = (t.write_pos - distance) mod window_size in + let pos = if pos < 0 then pos + window_size else pos in + Char.code (Bytes.get t.data pos) + +let extract_slice t start len = + if start <= 0 || start > t.size then invalid_arg "extract_slice: invalid start"; + if len < 0 then invalid_arg "extract_slice: negative len"; + let out = Bytes.make len '\000' in + for i = 0 to len - 1 do + let dist = start + i in + if dist > t.size then invalid_arg "extract_slice: out of range"; + let pos = (t.write_pos - dist) mod window_size in + let pos = if pos < 0 then pos + window_size else pos in + Bytes.set out i (Bytes.get t.data pos) + done; + out + +let match_length_at_distance t distance lookahead ofs maxlen = + if distance <= 0 || distance > t.size then invalid_arg "match_length_at_distance: invalid distance"; + let remaining = Bytes.length lookahead - ofs in + if remaining <= 0 || maxlen <= 0 then 0 else + let available_in_window = t.size - distance + 1 in + let max_cmp = min maxlen (min remaining available_in_window) in + let start_pos = (t.write_pos - distance) mod window_size in + let start_pos = if start_pos < 0 then start_pos + window_size else start_pos in + let rec cmp i = + if i >= max_cmp then i else + let pos = (start_pos + i) mod window_size in + if Bytes.get t.data pos = Bytes.get lookahead (ofs + i) then cmp (i + 1) else i + in cmp 0 diff --git a/lib/window.mli b/lib/window.mli new file mode 100644 index 0000000..282df67 --- /dev/null +++ b/lib/window.mli @@ -0,0 +1,19 @@ +val window_size : int + +type t + +val create : unit -> t + +val add_byte : t -> int -> unit + +val add_bytes : t -> bytes -> int -> int -> unit + +val get_byte_at_distance : t -> int -> int + +val current_size : t -> int + +val total_processed : t -> int64 + +val extract_slice : t -> int -> int -> bytes + +val match_length_at_distance : t -> int -> bytes -> int -> int -> int diff --git a/lib/writer.ml b/lib/writer.ml new file mode 100644 index 0000000..fc96627 --- /dev/null +++ b/lib/writer.ml @@ -0,0 +1,36 @@ +type t = { buf : Buffer.t; mutable cur : int; mutable filled : int } + +let create () = { buf = Buffer.create 1024; cur = 0; filled = 0 } + +let push_byte t = Buffer.add_char t.buf (Char.chr t.cur); t.cur <- 0; t.filled <- 0 + +let write_bit t bit = + let bit = if bit <> 0 then 1 else 0 in + let pos = 7 - t.filled in + t.cur <- t.cur lor (bit lsl pos); + t.filled <- t.filled + 1; + if t.filled = 8 then push_byte t + +let write_bits t value nbits = + if nbits <= 0 then invalid_arg "write_bits: nbits must be > 0"; + if nbits > 31 then invalid_arg "write_bits: nbits too large"; + for i = nbits - 1 downto 0 do + let bit = (value lsr i) land 1 in + write_bit t bit + done + +let write_literal t byte = + if byte < 0 || byte > 255 then invalid_arg "write_literal: byte out of range"; + write_bit t 1; + write_bits t byte 8 + +let write_match t distance length = + if distance <= 0 || distance >= (1 lsl 15) then invalid_arg "write_match: invalid distance"; + if length <= 0 || length > 255 then invalid_arg "write_match: invalid length"; + write_bit t 0; + write_bits t distance 15; + write_bits t length 8 + +let flush t = if t.filled > 0 then push_byte t + +let to_bytes t = flush t; Buffer.contents t.buf |> Bytes.of_string diff --git a/lib/writer.mli b/lib/writer.mli new file mode 100644 index 0000000..c328e35 --- /dev/null +++ b/lib/writer.mli @@ -0,0 +1,12 @@ +type t + +val create : unit -> t + +val write_bit : t -> int -> unit +val write_bits : t -> int -> int -> unit + +val flush : t -> unit +val to_bytes : t -> bytes + +val write_literal : t -> int -> unit +val write_match : t -> int -> int -> unit