From aa8f2b759ae0ea4b1637ba345f2274b604b0bc1d Mon Sep 17 00:00:00 2001
From: tibvdm
Date: Thu, 16 May 2024 15:03:50 +0200
Subject: [PATCH] bitarray compression

---
Reviewed revision: BitArray::set now clears every stale bit of an element that
straddles two words, and fill_buffer reports I/O errors instead of panicking.
The blob ids on the `index` lines were not recomputed for this revision.

 Cargo.lock             |   4 ++
 Cargo.toml             |   2 +-
 bitarray/Cargo.toml    |   8 +++
 bitarray/src/binary.rs | 146 ++++++++++++++++++++++++++++++++++++++++
 bitarray/src/lib.rs    | 147 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 306 insertions(+), 1 deletion(-)
 create mode 100644 bitarray/Cargo.toml
 create mode 100644 bitarray/src/binary.rs
 create mode 100644 bitarray/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 27c3eed..56db76a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -240,6 +240,10 @@ dependencies = [
  "which",
 ]
 
+[[package]]
+name = "bitarray"
+version = "0.1.0"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
diff --git a/Cargo.toml b/Cargo.toml
index 728b06e..617ad70 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [workspace]
 resolver = "2"
 
-members = [
+members = [ "bitarray",
     "fa-compression",
     "libsais64-rs",
     "sa-builder",
diff --git a/bitarray/Cargo.toml b/bitarray/Cargo.toml
new file mode 100644
index 0000000..8176d57
--- /dev/null
+++ b/bitarray/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "bitarray"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs
new file mode 100644
index 0000000..546b65c
--- /dev/null
+++ b/bitarray/src/binary.rs
@@ -0,0 +1,146 @@
+//! Binary (de)serialization of [`BitArray`] as a stream of little-endian `u64` words.
+
+use std::io::{BufRead, ErrorKind, Read, Result, Write};
+
+use crate::BitArray;
+
+/// Reading and writing a value in a raw binary format.
+pub trait Binary {
+    /// Writes the binary representation of `self` to `writer`.
+    ///
+    /// # Errors
+    ///
+    /// Returns any I/O error reported by `writer`.
+    fn write_binary<W: Write>(&self, writer: W) -> Result<()>;
+
+    /// Replaces the contents of `self` with the data read from `reader`.
+    ///
+    /// # Errors
+    ///
+    /// Returns any I/O error reported by `reader`.
+    fn read_binary<R: BufRead>(&mut self, reader: R) -> Result<()>;
+}
+
+impl<const B: usize> Binary for BitArray<B> {
+    /// Writes every backing `u64` word in little-endian byte order.
+    fn write_binary<W: Write>(&self, mut writer: W) -> Result<()> {
+        for value in self.data.iter() {
+            writer.write_all(&value.to_le_bytes())?;
+        }
+
+        Ok(())
+    }
+
+    /// Clears the backing words and refills them from `reader`, eight
+    /// little-endian bytes per word. Trailing bytes that do not form a
+    /// complete word are ignored, and `len` is left untouched.
+    fn read_binary<R: BufRead>(&mut self, mut reader: R) -> Result<()> {
+        self.data.clear();
+
+        // The buffer size is a multiple of 8, so a word never straddles two refills
+        let mut buffer = vec![0u8; 8 * 1024];
+
+        loop {
+            let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?;
+            for buffer_slice in buffer[..bytes_read].chunks_exact(8) {
+                self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap()));
+            }
+
+            if finished {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Reads from `input` until `buffer` is full or the input is exhausted.
+///
+/// Returns `(finished, bytes_read)`: `finished` is `true` when the end of the
+/// input was reached before the buffer was full, and `bytes_read` is the
+/// number of bytes stored at the start of `buffer`.
+///
+/// # Errors
+///
+/// Returns the first I/O error other than [`ErrorKind::Interrupted`], which
+/// is retried.
+fn fill_buffer<T: Read>(input: &mut T, buffer: &mut [u8]) -> Result<(bool, usize)> {
+    let mut filled = 0;
+
+    while filled < buffer.len() {
+        match input.read(&mut buffer[filled..]) {
+            // Nothing was read although there is room left: end of the input
+            Ok(0) => return Ok((true, filled)),
+
+            // We've read {bytes_read} bytes
+            Ok(bytes_read) => filled += bytes_read,
+
+            // An interrupted read is transient, so simply try again
+            Err(err) if err.kind() == ErrorKind::Interrupted => continue,
+
+            Err(err) => return Err(err),
+        }
+    }
+
+    Ok((false, filled))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_fill_buffer() {
+        let input_str = "a".repeat(8_000);
+        let mut input = input_str.as_bytes();
+
+        let mut buffer = vec![0u8; 800];
+
+        loop {
+            let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer).unwrap();
+
+            if finished {
+                assert!(bytes_read < 800);
+                break;
+            } else {
+                assert_eq!(bytes_read, 800);
+            }
+        }
+    }
+
+    #[test]
+    fn test_write_binary() {
+        let mut bitarray = BitArray::<40>::with_capacity(4);
+        bitarray.set(0, 0x1234567890);
+        bitarray.set(1, 0xabcdef0123);
+        bitarray.set(2, 0x4567890abc);
+        bitarray.set(3, 0xdef0123456);
+
+        let mut buffer = Vec::new();
+        bitarray.write_binary(&mut buffer).unwrap();
+
+        assert_eq!(buffer, vec![
+            0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
+            0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
+            0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
+        ]);
+    }
+
+    #[test]
+    fn test_read_binary() {
+        let buffer = vec![
+            0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
+            0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
+            0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
+        ];
+
+        let mut bitarray = BitArray::<40>::with_capacity(4);
+        bitarray.read_binary(&buffer[..]).unwrap();
+
+        assert_eq!(bitarray.get(0), 0x1234567890);
+        assert_eq!(bitarray.get(1), 0xabcdef0123);
+        assert_eq!(bitarray.get(2), 0x4567890abc);
+        assert_eq!(bitarray.get(3), 0xdef0123456);
+    }
+}
diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs
new file mode 100644
index 0000000..d8a1e9b
--- /dev/null
+++ b/bitarray/src/lib.rs
@@ -0,0 +1,147 @@
+//! A fixed-width bit-packed array: every element occupies exactly `B` bits.
+
+pub mod binary;
+
+/// An array of `B`-bit unsigned values packed back to back into `u64` words.
+///
+/// Values are stored most-significant bit first: element 0 occupies the `B`
+/// highest bits of `data[0]`, and an element may straddle two adjacent words.
+pub struct BitArray<const B: usize> {
+    /// The backing words.
+    pub data: Vec<u64>,
+    /// A mask with the `B` lowest bits set.
+    pub mask: u64,
+    /// The number of `B`-bit elements.
+    pub len: usize,
+}
+
+impl<const B: usize> BitArray<B> {
+    /// Creates a zero-initialised array with room for `capacity` elements.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            data: vec![0; capacity * B / 64 + 1],
+            // `1 << 64` overflows, so the full-width mask is special-cased
+            mask: if B >= 64 { u64::MAX } else { (1 << B) - 1 },
+            len: capacity,
+        }
+    }
+
+    /// Returns the element stored at `index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the element lies outside the backing words.
+    pub fn get(&self, index: usize) -> u64 {
+        let start_block = index * B / 64;
+        let start_block_offset = index * B % 64;
+
+        // The element fits entirely inside a single word
+        if start_block_offset + B <= 64 {
+            return self.data[start_block] >> (64 - start_block_offset - B) & self.mask;
+        }
+
+        let end_block = (index + 1) * B / 64;
+        let end_block_offset = (index + 1) * B % 64;
+
+        // High part from the first word, low part from the top of the second
+        let a = self.data[start_block] << end_block_offset;
+        let b = self.data[end_block] >> (64 - end_block_offset);
+
+        (a | b) & self.mask
+    }
+
+    /// Stores the `B` lowest bits of `value` at `index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the element lies outside the backing words.
+    pub fn set(&mut self, index: usize, value: u64) {
+        // Drop excess bits so they cannot spill into neighbouring elements
+        let value = value & self.mask;
+
+        let start_block = index * B / 64;
+        let start_block_offset = index * B % 64;
+
+        // The element fits entirely inside a single word
+        if start_block_offset + B <= 64 {
+            self.data[start_block] &= !(self.mask << (64 - start_block_offset - B));
+            self.data[start_block] |= value << (64 - start_block_offset - B);
+            return;
+        }
+
+        let end_block = (index + 1) * B / 64;
+        let end_block_offset = (index + 1) * B % 64;
+
+        // The first word holds the `B - end_block_offset` highest bits of the
+        // value in its lowest bits, so all of those must be cleared
+        self.data[start_block] &= !(self.mask >> end_block_offset);
+        self.data[start_block] |= value >> end_block_offset;
+
+        // The second word holds the remaining bits in its highest bits
+        self.data[end_block] &= !(self.mask << (64 - end_block_offset));
+        self.data[end_block] |= value << (64 - end_block_offset);
+    }
+
+    /// Returns the number of elements.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns `true` if the array holds no elements.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bitarray_with_capacity() {
+        let bitarray = BitArray::<40>::with_capacity(4);
+        assert_eq!(bitarray.data, vec![ 0, 0, 0 ]);
+        assert_eq!(bitarray.mask, 0xff_ffff_ffff);
+        assert_eq!(bitarray.len, 4);
+    }
+
+    #[test]
+    fn test_bitarray_get() {
+        let mut bitarray = BitArray::<40>::with_capacity(4);
+        bitarray.data = vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4 ];
+
+        assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010);
+        assert_eq!(bitarray.get(1), 0b1100001001010010011000010100110111001001);
+        assert_eq!(bitarray.get(2), 0b1111001101001101101101101011101001010001);
+        assert_eq!(bitarray.get(3), 0b0000100010010001010001001110101110011100);
+    }
+
+    #[test]
+    fn test_bitarray_set() {
+        let mut bitarray = BitArray::<40>::with_capacity(4);
+        bitarray.data = vec![ 0, 0, 0 ];
+
+        bitarray.set(0, 0b0001110011111010110001000111111100110010);
+        bitarray.set(1, 0b1100001001010010011000010100110111001001);
+        bitarray.set(2, 0b1111001101001101101101101011101001010001);
+        bitarray.set(3, 0b0000100010010001010001001110101110011100);
+
+        assert_eq!(bitarray.data, vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000 ]);
+    }
+
+    #[test]
+    fn test_bitarray_set_overwrite() {
+        let mut bitarray = BitArray::<40>::with_capacity(4);
+
+        bitarray.set(1, 0xff_ffff_ffff);
+        bitarray.set(1, 0);
+
+        assert_eq!(bitarray.data, vec![ 0, 0, 0 ]);
+    }
+
+    #[test]
+    fn test_bitarray_len() {
+        let bitarray = BitArray::<40>::with_capacity(4);
+        assert_eq!(bitarray.len(), 4);
+    }
+}
\ No newline at end of file