From 8503ebcde4b2ee3b9fb40019b85118d251ca90e2 Mon Sep 17 00:00:00 2001 From: boxbeam Date: Mon, 6 May 2024 17:40:32 -0400 Subject: [PATCH] Apply suggestions --- crates/tabby-common/src/index.rs | 1 + crates/tabby-common/src/lib.rs | 4 --- crates/tabby-scheduler/src/index.rs | 48 +++++++++++++++++++---------- crates/tabby-scheduler/src/lib.rs | 8 ++--- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/crates/tabby-common/src/index.rs b/crates/tabby-common/src/index.rs index 89ac3e488a31..9e44e2211cb5 100644 --- a/crates/tabby-common/src/index.rs +++ b/crates/tabby-common/src/index.rs @@ -19,6 +19,7 @@ pub struct CodeSearchSchema { pub schema: Schema, pub field_git_url: Field, pub field_filepath: Field, + /// Indexed field uniquely identifying a file in a repository, format is `git_url:filepath` pub field_file_id: Field, pub field_language: Field, pub field_body: Field, diff --git a/crates/tabby-common/src/lib.rs b/crates/tabby-common/src/lib.rs index 513965d4a0c3..747126fd4df7 100644 --- a/crates/tabby-common/src/lib.rs +++ b/crates/tabby-common/src/lib.rs @@ -33,10 +33,6 @@ pub struct SourceFile { } impl SourceFile { - pub fn create_file_id(git_url: &str, filepath: &str) -> String { - format!("{}:{}", git_url, filepath) - } - pub fn files_jsonl() -> PathBuf { dataset_dir().join("files.jsonl") } diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index e9d4750e4e37..22ea53668c1e 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -21,9 +21,8 @@ use crate::{ static MAX_LINE_LENGTH_THRESHOLD: usize = 300; static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; -pub fn index_repositories(config: &[RepositoryConfig]) { +pub fn index_repositories(cache: &mut CacheStore, config: &[RepositoryConfig]) { let code = CodeSearchSchema::new(); - let mut cache = CacheStore::new(tabby_common::path::cache_dir()); let index = open_or_create_index(&code, &path::index_dir()); register_tokenizers(&index); @@ -41,14 +40,15 @@ pub fn index_repositories(config: &[RepositoryConfig]) { for repository in config { let Some(commit) = cache.get_last_index_commit(repository) else { - index_repository_from_scratch(repository, &writer, &code, &intelligence, &mut cache); + index_repository_from_scratch(repository, &writer, &code, &intelligence, cache); continue; }; let dir = repository.dir(); let changed_files = get_changed_files(&dir, &commit).expect("Failed read file diff"); for file in changed_files { let path = dir.join(&file); - delete_indexed_source_file(&writer, &code, &repository.git_url, &file); + let file_id = create_file_id(&repository.git_url, &file); + delete_document(&writer, &code, file_id.clone()); if !path.exists() { continue; } @@ -58,7 +58,14 @@ pub fn index_repositories(config: &[RepositoryConfig]) { if !is_valid_file(&source_file) { continue; } - add_indexed_source_file(&writer, repository, &source_file, &code, &intelligence); + add_document( + &writer, + repository, + &source_file, + file_id, + &code, + &intelligence, + ); } cache.set_last_index_commit( repository, @@ -69,7 +76,7 @@ pub fn index_repositories(config: &[RepositoryConfig]) { for indexed_repository in cache.list_indexed_repositories() { if !indexed_repository.dir().exists() { cache.set_last_index_commit(&indexed_repository, None); - delete_all_indexed_files(&writer, &code, &indexed_repository.canonical_git_url()); + delete_all_documents(&writer, &code, &indexed_repository.canonical_git_url()); } } @@ -107,7 +114,15 @@ fn index_repository_from_scratch( if !is_valid_file(&source_file) { continue; } - add_indexed_source_file(writer, repository, &source_file, code, intelligence); + let file_id = create_file_id(&repository.git_url, &source_file.filepath); + add_document( + writer, + repository, + &source_file, + file_id, + code, + intelligence, + ); pb.as_mut().map(|pb| { pb.update(source_file.read_file_size()) .expect("Failed to update progress bar") @@ -132,26 +147,21 @@ fn is_valid_file(source_file: &SourceFile) -> bool { && source_file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD } -pub fn delete_indexed_source_file( - writer: &IndexWriter, - code: &CodeSearchSchema, - git_url: &str, - filepath: &str, -) { - let file_id = SourceFile::create_file_id(git_url, filepath); +pub fn delete_document(writer: &IndexWriter, code: &CodeSearchSchema, file_id: String) { let term = Term::from_field_text(code.field_file_id.clone(), &file_id); writer.delete_term(term); } -pub fn delete_all_indexed_files(writer: &IndexWriter, code: &CodeSearchSchema, git_url: &str) { +pub fn delete_all_documents(writer: &IndexWriter, code: &CodeSearchSchema, git_url: &str) { let term = Term::from_field_text(code.field_git_url, git_url); writer.delete_term(term); } -pub fn add_indexed_source_file( +pub fn add_document( writer: &IndexWriter, repository: &RepositoryConfig, file: &SourceFile, + file_id: String, code: &CodeSearchSchema, intelligence: &CodeIntelligence, ) -> usize { @@ -167,7 +177,7 @@ pub fn add_indexed_source_file( .add_document(doc!( code.field_git_url => repository.canonical_git_url(), code.field_filepath => file.filepath.clone(), - code.field_file_id => SourceFile::create_file_id(&repository.git_url, &file.filepath), + code.field_file_id => file_id.clone(), code.field_language => file.language.clone(), code.field_body => body, )) @@ -198,3 +208,7 @@ fn open_or_create_index_impl(code: &CodeSearchSchema, path: &Path) -> tantivy::R let directory = MmapDirectory::open(path).expect("Failed to open index directory"); Index::open_or_create(directory, code.schema.clone()) } + +pub fn create_file_id(git_url: &str, filepath: &str) -> String { + format!("{}:{}", git_url, filepath) +} diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index 5210b8474390..b0cedd696865 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -22,7 +22,7 @@ pub async fn scheduler(now: bool, access: T) { .await .expect("Must be able to retrieve repositories for sync"); job_sync(&mut cache, &repositories); - job_index(&repositories); + job_index(&mut cache, &repositories); cache.garbage_collection(); } else { @@ -52,7 +52,7 @@ pub async fn scheduler(now: bool, access: T) { .expect("Must be able to retrieve repositories for sync"); job_sync(&mut cache, &repositories); - job_index(&repositories); + job_index(&mut cache, &repositories); cache.garbage_collection(); }) }) @@ -69,9 +69,9 @@ pub async fn scheduler(now: bool, access: T) { } } -fn job_index(repositories: &[RepositoryConfig]) { +fn job_index(cache: &mut CacheStore, repositories: &[RepositoryConfig]) { println!("Indexing repositories..."); - index::index_repositories(repositories); + index::index_repositories(cache, repositories); } fn job_sync(cache: &mut CacheStore, repositories: &[RepositoryConfig]) {