Skip to content

Commit

Permalink
bitarray compression
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed May 16, 2024
1 parent 87aa164 commit aa8f2b7
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[workspace]
resolver = "2"

members = [
members = [ "bitarray",
"fa-compression",
"libsais64-rs",
"sa-builder",
Expand Down
8 changes: 8 additions & 0 deletions bitarray/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[package]
name = "bitarray"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
127 changes: 127 additions & 0 deletions bitarray/src/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
use std::io::{BufRead, Read, Result, Write};

use crate::BitArray;

/// Binary (de)serialization of a value to and from a byte stream.
pub trait Binary {
/// Serializes `self` into `writer`.
///
/// # Errors
///
/// Returns an I/O error if writing to `writer` fails.
fn write_binary<W: Write>(&self, writer: W) -> Result<()>;
/// Replaces the contents of `self` with the data read from `reader`.
///
/// # Errors
///
/// Returns an I/O error if the data cannot be read.
fn read_binary<R: BufRead>(&mut self, reader: R) -> Result<()>;
}

impl<const B: usize> Binary for BitArray<B> {
    /// Writes every 64-bit block of the array to `writer`, in order, as 8 little-endian bytes.
    ///
    /// Only the raw blocks are written; the element count (`len`) is not part of the output.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by `writer`.
    fn write_binary<W: Write>(&self, mut writer: W) -> Result<()> {
        for block in &self.data {
            writer.write_all(&block.to_le_bytes())?;
        }

        Ok(())
    }

    /// Replaces the blocks of the array with the little-endian `u64` values read from `reader`.
    ///
    /// The input is consumed until end of file. `mask` and `len` are left untouched, so the
    /// caller is responsible for constructing the array with the matching element count.
    ///
    /// # Errors
    ///
    /// Returns an [`UnexpectedEof`](std::io::ErrorKind::UnexpectedEof) error when the input
    /// length is not a multiple of 8 bytes, instead of silently dropping the trailing bytes.
    fn read_binary<R: BufRead>(&mut self, mut reader: R) -> Result<()> {
        self.data.clear();

        // The buffer size is a multiple of 8, so a block can only be cut short at end of input.
        let mut buffer = vec![0; 8 * 1024];

        loop {
            let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer);

            let blocks = buffer[..bytes_read].chunks_exact(8);
            if !blocks.remainder().is_empty() {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "bit array input length is not a multiple of 8 bytes",
                ));
            }

            self.data.extend(blocks.map(|bytes| {
                u64::from_le_bytes(bytes.try_into().expect("chunks_exact(8) yields 8-byte slices"))
            }));

            if finished {
                break;
            }
        }

        Ok(())
    }
}

/// Reads from `input` until `buffer` is completely filled or the input is exhausted.
///
/// Returns `(finished, bytes_read)`:
/// * `finished` is `true` when the input reported end of file before the buffer was full,
///   and `false` when the buffer was filled completely (more data may still follow).
/// * `bytes_read` is the number of bytes stored at the start of `buffer`.
///
/// Reads that fail with [`std::io::ErrorKind::Interrupted`] are retried, as the `Read`
/// contract describes that error as non-fatal.
///
/// # Panics
///
/// Panics if `input` reports any other error.
fn fill_buffer<T: Read>(input: &mut T, buffer: &mut Vec<u8>) -> (bool, usize) {
    // Number of bytes at the front of `buffer` that already hold data.
    let mut filled = 0;

    while filled < buffer.len() {
        match input.read(&mut buffer[filled..]) {
            // Nothing was read although there was still room: end of input.
            Ok(0) => return (true, filled),

            // We've read {bytes_read} bytes; continue behind them.
            Ok(bytes_read) => filled += bytes_read,

            // A signal interrupted the read before any data was transferred: try again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => {}

            Err(err) => {
                panic!("Error while reading input: {}", err);
            }
        }
    }

    // The buffer is full; the caller has to call again to find out whether more data follows.
    (false, filled)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Reading 8000 bytes through an 800-byte buffer yields only full buffers
    /// until the final, short read that signals the end of the input.
    #[test]
    fn test_fill_buffer() {
        let text = "a".repeat(8_000);
        let mut source = text.as_bytes();

        let mut chunk = vec![0; 800];

        loop {
            let (done, count) = fill_buffer(&mut source, &mut chunk);

            if !done {
                assert_eq!(count, 800);
                continue;
            }

            assert!(count < 800);
            break;
        }
    }

    /// Four 40-bit values are packed into three blocks and written little-endian.
    #[test]
    fn test_write_binary() {
        let mut bitarray = BitArray::<40>::with_capacity(4);
        let values = [0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456];
        for (index, value) in values.into_iter().enumerate() {
            bitarray.set(index, value);
        }

        let mut serialized = Vec::new();
        bitarray.write_binary(&mut serialized).unwrap();

        let expected = vec![
            0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
            0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
            0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0,
        ];
        assert_eq!(serialized, expected);
    }

    /// The byte layout produced by `write_binary` is decoded back into the same values.
    #[test]
    fn test_read_binary() {
        let serialized = vec![
            0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
            0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
            0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0,
        ];

        let mut bitarray = BitArray::<40>::with_capacity(4);
        bitarray.read_binary(&serialized[..]).unwrap();

        let expected = [0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456];
        for (index, value) in expected.into_iter().enumerate() {
            assert_eq!(bitarray.get(index), value);
        }
    }
}
101 changes: 101 additions & 0 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
pub mod binary;

/// A fixed-length array of `B`-bit unsigned integers packed back to back into 64-bit blocks.
///
/// Elements are laid out most-significant-bit first: element `i` occupies the bits
/// `i * B .. (i + 1) * B`, counted from the top bit of `data[0]`, and may therefore straddle
/// two neighbouring blocks. `B` is expected to be in the range `1..=64`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BitArray<const B: usize> {
    /// Backing storage holding the packed elements.
    pub data: Vec<u64>,
    /// Value with the lowest `B` bits set; selects a single element.
    pub mask: u64,
    /// Number of elements the array was created for.
    pub len: usize,
}

impl<const B: usize> BitArray<B> {
    /// Creates a zeroed array with room for `capacity` elements of `B` bits each.
    ///
    /// `capacity * B / 64 + 1` blocks are allocated.
    pub fn with_capacity(capacity: usize) -> Self {
        // `1 << 64` overflows, so a full-width element needs the all-ones mask spelled out.
        let mask = if B < 64 { (1u64 << B) - 1 } else { u64::MAX };

        Self {
            data: vec![0; capacity * B / 64 + 1],
            mask,
            len: capacity,
        }
    }

    /// Returns the element stored at `index`.
    ///
    /// # Panics
    ///
    /// Panics if the blocks holding the element lie outside the backing storage.
    pub fn get(&self, index: usize) -> u64 {
        let start_block = index * B / 64;
        let start_block_offset = index * B % 64;

        // The element lies entirely inside one block.
        if start_block_offset + B <= 64 {
            let shift = 64 - start_block_offset - B;
            return (self.data[start_block] >> shift) & self.mask;
        }

        // The element straddles two blocks: its high bits are the low bits of the first
        // block, its low `end_block_offset` bits are the top bits of the second block.
        let end_block = (index + 1) * B / 64;
        let end_block_offset = (index + 1) * B % 64;

        let high = self.data[start_block] << end_block_offset;
        let low = self.data[end_block] >> (64 - end_block_offset);

        (high | low) & self.mask
    }

    /// Stores the lowest `B` bits of `value` at `index`, replacing the previous element.
    ///
    /// Bits of `value` above the element width are ignored so that they cannot leak into
    /// neighbouring elements.
    ///
    /// # Panics
    ///
    /// Panics if the blocks holding the element lie outside the backing storage.
    pub fn set(&mut self, index: usize, value: u64) {
        let value = value & self.mask;

        let start_block = index * B / 64;
        let start_block_offset = index * B % 64;

        // The element lies entirely inside one block.
        if start_block_offset + B <= 64 {
            let shift = 64 - start_block_offset - B;
            self.data[start_block] &= !(self.mask << shift);
            self.data[start_block] |= value << shift;
            return;
        }

        let end_block = (index + 1) * B / 64;
        let end_block_offset = (index + 1) * B % 64;

        // The first block holds the high `B - end_block_offset` bits of the element, so both
        // the cleared region and the written bits are shifted by `end_block_offset`.
        self.data[start_block] &= !(self.mask >> end_block_offset);
        self.data[start_block] |= value >> end_block_offset;

        // The second block holds the remaining low `end_block_offset` bits at its top.
        self.data[end_block] &= !(self.mask << (64 - end_block_offset));
        self.data[end_block] |= value << (64 - end_block_offset);
    }

    /// Returns the number of elements in the array.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` if the array holds no elements.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh array is zeroed, carries a 40-bit mask and remembers its element count.
    #[test]
    fn test_bitarray_with_capacity() {
        let fresh = BitArray::<40>::with_capacity(4);

        assert_eq!(fresh.len, 4);
        assert_eq!(fresh.mask, 0xff_ffff_ffff);
        assert_eq!(fresh.data, vec![ 0, 0, 0 ]);
    }

    /// Elements are extracted correctly, both inside one block and across two blocks.
    #[test]
    fn test_bitarray_get() {
        let mut packed = BitArray::<40>::with_capacity(4);
        packed.data = vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4 ];

        let expected = [
            0b0001110011111010110001000111111100110010,
            0b1100001001010010011000010100110111001001,
            0b1111001101001101101101101011101001010001,
            0b0000100010010001010001001110101110011100,
        ];

        for (index, value) in expected.into_iter().enumerate() {
            assert_eq!(packed.get(index), value);
        }
    }

    /// Storing four elements into a zeroed array produces the expected packed blocks.
    #[test]
    fn test_bitarray_set() {
        let mut packed = BitArray::<40>::with_capacity(4);
        packed.data = vec![ 0, 0, 0 ];

        let values = [
            0b0001110011111010110001000111111100110010,
            0b1100001001010010011000010100110111001001,
            0b1111001101001101101101101011101001010001,
            0b0000100010010001010001001110101110011100,
        ];

        for (index, value) in values.into_iter().enumerate() {
            packed.set(index, value);
        }

        let expected_blocks = vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000 ];
        assert_eq!(packed.data, expected_blocks);
    }

    /// `len()` reports the element count the array was created with.
    #[test]
    fn test_bitarray_len() {
        let four = BitArray::<40>::with_capacity(4);
        assert_eq!(four.len(), 4);
    }
}

0 comments on commit aa8f2b7

Please sign in to comment.