Skip to content

Commit

Permalink
Redesign incremental indexing
Browse files (browse the repository at this point in the history)
  • Loading branch information
boxbeam committed May 7, 2024
1 parent 8503ebc commit f1083d6
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 207 deletions.
2 changes: 1 addition & 1 deletion crates/tabby-common/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub struct CodeSearchSchema {
pub schema: Schema,
pub field_git_url: Field,
pub field_filepath: Field,
/// Indexed field uniquely identifying a file in a repository, format is `git_url:filepath`
/// Indexed field uniquely identifying a file in a repository; its value is a stringified `SourceFileKey`
pub field_file_id: Field,
pub field_language: Field,
pub field_body: Field,
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use serde_jsonlines::JsonLinesReader;

#[derive(Serialize, Deserialize, Clone)]
pub struct SourceFile {
pub git_url: String,
pub basedir: String,
pub filepath: String,
pub language: String,
Expand Down
126 changes: 63 additions & 63 deletions crates/tabby-scheduler/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::{
fs::read_to_string,
path::{Path, PathBuf},
process::Command,
str::FromStr,
};

use anyhow::{bail, Context, Result};
Expand All @@ -13,54 +14,34 @@ use tracing::{info, warn};
use crate::code::CodeIntelligence;

const SOURCE_FILE_BUCKET_KEY: &str = "source_files";
const LAST_INDEX_COMMIT_BUCKET: &str = "last_index_commit";

fn cmd_stdout(path: &Path, cmd: &str, args: &[&str]) -> Result<String> {
Ok(String::from_utf8(
Command::new(cmd)
.current_dir(path)
.args(args)
.output()?
.stdout,
)?
.trim()
.to_string())
const INDEX_BUCKET_KEY: &str = "indexed_files";

fn cmd_stdout(cmd: &str, args: &[&str]) -> Result<String> {
Ok(
String::from_utf8(Command::new(cmd).args(args).output()?.stdout)?
.trim()
.to_string(),

Check warning on line 23 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L19-L23

Added lines #L19 - L23 were not covered by tests
)
}

Check warning on line 25 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L25

Added line #L25 was not covered by tests

fn get_git_hash(path: &Path) -> Result<String> {
Ok(cmd_stdout(
path,
"git",
&["hash-object", &path.display().to_string()],
)?)

Check warning on line 31 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L28-L31

Added lines #L28 - L31 were not covered by tests
}

pub fn get_current_commit_hash(path: &Path) -> Result<String> {
cmd_stdout(path, "git", &["rev-parse", "HEAD"])
}

pub fn get_changed_files(path: &Path, since_commit: &str) -> Result<Vec<String>> {
Ok(cmd_stdout(
path,
"git",
&["diff", "--no-renames", "--name-only", since_commit],
)?
.lines()
.map(|line| line.to_owned())
.collect())
}

#[derive(Deserialize, Serialize)]
struct SourceFileKey {
#[derive(Deserialize, Serialize, Debug)]

Check warning on line 34 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L34

Added line #L34 was not covered by tests
pub(crate) struct SourceFileKey {
path: PathBuf,
language: String,
git_hash: String,
}

impl TryFrom<&str> for SourceFileKey {
type Error = serde_json::Error;
impl FromStr for SourceFileKey {
type Err = serde_json::Error;

fn try_from(s: &str) -> Result<Self, Self::Error> {
fn from_str(s: &str) -> Result<Self, Self::Err> {

Check warning on line 44 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L44

Added line #L44 was not covered by tests
serde_json::from_str(s)
}
}
Expand Down Expand Up @@ -105,38 +86,56 @@ impl CacheStore {
}
}

pub fn get_last_index_commit(&self, repository: &RepositoryConfig) -> Option<String> {
fn index_bucket(&self) -> Bucket<String, String> {
self.store
.bucket(Some(LAST_INDEX_COMMIT_BUCKET))
.expect("Failed to access meta bucket")
.get(&repository.canonical_git_url())
.expect("Failed to read last index commit")
.bucket(Some(INDEX_BUCKET_KEY))
.expect("Failed to access indexed files bucket")
}

Check warning on line 93 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L89-L93

Added lines #L89 - L93 were not covered by tests

pub fn set_last_index_commit(&self, repository: &RepositoryConfig, commit: Option<String>) {
let bucket = self
.store
.bucket(Some(LAST_INDEX_COMMIT_BUCKET))
.expect("Failed to access meta bucket");
if let Some(commit) = commit {
bucket
.set(&repository.canonical_git_url(), &commit)
.expect("Failed to write last index commit");
} else {
bucket
.remove(&repository.git_url)
.expect("Failed to remove last index commit");
}
pub fn is_indexed(&self, key: &SourceFileKey) -> bool {
self.index_bucket()
.contains(&key.to_string())
.expect("Failed to read index bucket")
}

Check warning on line 99 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L95-L99

Added lines #L95 - L99 were not covered by tests

pub fn list_indexed_repositories(&self) -> Vec<RepositoryConfig> {
self.store
.bucket::<String, String>(Some(LAST_INDEX_COMMIT_BUCKET))
.expect("Failed to read meta bucket")
pub fn set_indexed(&self, key: &SourceFileKey) {
self.index_bucket()
.set(&key.to_string(), &String::new())
.expect("Failed to write to index bucket");
}

Check warning on line 105 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L101-L105

Added lines #L101 - L105 were not covered by tests

pub fn cleanup_old_indexed_files(&self, key_remover: impl Fn(&String)) {
info!("Cleaning up indexed file cache");
let bucket = self.index_bucket();
let mut batch = Batch::new();

let mut num_keep = 0;
let mut num_removed = 0;

bucket
.iter()
.map(|item| item.unwrap().key().unwrap())
.map(|git_url| RepositoryConfig::new(git_url))
.collect()
.filter_map(|item| {
let item = item.expect("Failed to read item");
let item_key: String = item.key().expect("Failed to get key");
if is_item_key_matched(&item_key) {
num_keep += 1;
None

Check warning on line 122 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L107-L122

Added lines #L107 - L122 were not covered by tests
} else {
num_removed += 1;
Some(item_key)

Check warning on line 125 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L124-L125

Added lines #L124 - L125 were not covered by tests
}
})
.inspect(key_remover)
.for_each(|key| {
batch
.remove(&key)
.expect("Failed to remove indexed source file")
});

info!("Finished cleaning up indexed files: {num_keep} items kept, {num_removed} items removed");
bucket
.batch(batch)
.expect("Failed to execute batched delete");
}

Check warning on line 139 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L127-L139

Added lines #L127 - L139 were not covered by tests

pub fn get_source_file(
Expand Down Expand Up @@ -167,8 +166,8 @@ impl CacheStore {
}
}

pub fn garbage_collection(&self) {
info!("Running garbage collection");
pub fn cleanup_old_source_files(&self) {
info!("Cleaning up synced file cache");

Check warning on line 170 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L169-L170

Added lines #L169 - L170 were not covered by tests
let bucket: Bucket<String, Json<SourceFile>> = self
.store
.bucket(Some(SOURCE_FILE_BUCKET_KEY))
Expand All @@ -188,7 +187,7 @@ impl CacheStore {
None
} else {
num_removed += 1;
Some(item.key().expect("Failed to get key"))
Some(item_key)

Check warning on line 190 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L190

Added line #L190 was not covered by tests
}
})
.for_each(|key| batch.remove(&key).expect("Failed to remove key"));
Expand All @@ -202,7 +201,7 @@ impl CacheStore {
}

fn is_item_key_matched(item_key: &str) -> bool {
let Ok(key) = SourceFileKey::try_from(item_key) else {
let Ok(key) = item_key.parse::<SourceFileKey>() else {

Check warning on line 204 in crates/tabby-scheduler/src/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/cache.rs#L204

Added line #L204 was not covered by tests
return false;
};

Expand Down Expand Up @@ -244,6 +243,7 @@ fn create_source_file(
}
};
let source_file = SourceFile {
git_url: config.canonical_git_url(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
max_line_length: metrics::max_line_length(&contents),
Expand Down
Loading

0 comments on commit f1083d6

Please sign in to comment.