Showing 2 changed files with 38 additions and 79 deletions.

Changed file 1: Makefile

```
@@ -64,4 +64,8 @@ build-service:
changelog:
	@printf "Generate changelog...\n"
	@python build/build_changelog.py --repo ../biominer-components --output-file ./studio/public/README/changelog.md --repo-name 'BioMedGPS UI'
	@python build/build_changelog.py --repo . --output-file ./studio/public/README/changelog.md --repo-name BioMedGPS

deploy: build-studio
	@docker run --rm -it -v "$(CURDIR)":/home/rust/src messense/rust-musl-cross:x86_64-musl cargo build --release
	@rsync -avP target/x86_64-unknown-linux-musl/release/biomedgps target/x86_64-unknown-linux-musl/release/biomedgps-cli [email protected]:/data/biomedgps/bin
```
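
For reference, both targets are meant to be run through make from the repository root. A minimal sketch, assuming docker is available locally and the deployment host from the rsync line is reachable over ssh:

```bash
# Regenerate the changelog pages bundled with the studio frontend.
make changelog

# Runs the build-studio prerequisite, cross-compiles the musl binaries inside the
# messense/rust-musl-cross container, then rsyncs them to the deployment host.
make deploy
```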
Changed file 2: Rust CLI entrypoint (defines the biomedgps-cli subcommands)

```
@@ -17,7 +17,8 @@ use std::path::PathBuf;
use std::sync::Arc;
use structopt::StructOpt;

/// A cli for biomedgps service.
/// NOTE: The first time you run the application, you need to run the commands in this order: initdb -> importdb (entity + entity_metadata + relation + relation_metadata etc.) -> importkge (embeddings) -> cachetable (compound-disease-symptom, knowledge-score). At the current stage, we don't have a mechanism to check the format of entity ids and relation_types or to keep the data consistent, such as whether all entities in the relation table exist in the entity table. We do provide a script for this purpose; you can follow this link to check data consistency: https://github.com/open-prophetdb/biomedgps-data/blob/main/graph_data/scripts/correct_graph_data.py
///
#[derive(StructOpt, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name = "A cli for biomedgps service.", author="Jingcheng Yang <[email protected]>;")]
struct Opt {
```
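
Read together with the subcommand definitions below, the documented order translates into roughly the following session. This is a hedged sketch: the binary name comes from the Makefile's deploy target, the long flag names follow the option attributes shown in later hunks, all paths and URLs are placeholders, and the importkge options are not part of this diff.

```bash
# Placeholder connection settings; the subcommands fall back to these
# environment variables when the corresponding flags are omitted.
export DATABASE_URL=postgres://postgres:postgres@localhost:5432/biomedgps
export NEO4J_URL=neo4j://neo4j:password@localhost:7687

biomedgps-cli initdb                                              # create/upgrade the postgres schema
biomedgps-cli importdb --filepath ./graph_data/entity.tsv --table entity
biomedgps-cli importdb --filepath ./graph_data/relation.tsv --table relation
biomedgps-cli importkge --help                                    # embeddings; its flags are not shown in this diff
biomedgps-cli cachetable --table compound-disease-symptom
biomedgps-cli cachetable --table knowledge-score
```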
```
@@ -34,19 +35,17 @@ struct Opt {
enum SubCommands {
    #[structopt(name = "initdb")]
    InitDB(InitDbArguments),
    #[structopt(name = "cachetable")]
    CacheTable(CacheTableArguments),
    #[structopt(name = "importdb")]
    ImportDB(ImportDBArguments),
    #[structopt(name = "importgraph")]
    ImportGraph(ImportGraphArguments),
    #[structopt(name = "importkge")]
    ImportKGE(ImportKGEArguments),
    #[structopt(name = "cachetable")]
    CacheTable(CacheTableArguments),
    #[structopt(name = "cleandb")]
    CleanDB(CleanDBArguments),
}

/// Init database.
/// Initialize the database; this applies only to the postgres database. Usually we don't need to initialize the graph database (such as neo4j): we can simply clean it with the cleandb command before importing the data. You might need to run the initdb command when you want to upgrade the database schema or the first time you run the application.
#[derive(StructOpt, PartialEq, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name="BioMedGPS - initdb", author="Jingcheng Yang <[email protected]>")]
pub struct InitDbArguments {
```
```
@@ -55,7 +54,7 @@ pub struct InitDbArguments {
    database_url: Option<String>,
}

/// Clean database
/// Clean the database. If you want to clean any table in the database, you can use this command.
#[derive(StructOpt, PartialEq, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name="BioMedGPS - cleandb", author="Jingcheng Yang <[email protected]>")]
pub struct CleanDBArguments {
```
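
The cleandb struct continues in the next hunk with a `table` field. A hypothetical invocation is sketched below; the `--table` spelling is assumed from the naming pattern used by the other subcommands in this file, so confirm it with `biomedgps-cli cleandb --help`.

```bash
# Hypothetical example: the --table flag name is assumed, and the table name is a placeholder.
biomedgps-cli cleandb --database-url "$DATABASE_URL" --table knowledge_curation
```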
```
@@ -68,14 +67,18 @@ pub struct CleanDBArguments {
    table: String,
}

/// Import data files into database.
/// Import data files into the database, such as entity, relation, entity_metadata, relation_metadata, knowledge_curation, subgraph, entity2d etc. When you import the entity data, we also sync it to the graph database. The relation data, however, is synced to the graph database in the cachetable command, because we need to compute the score for the relation data first. The entity_metadata and relation_metadata tables are generated automatically by the importdb command, so you don't need to prepare those files; but you must run the importdb command manually to upgrade the entity_metadata and relation_metadata tables after the entity and relation tables are upgraded, or the first time you run the application. In most cases you don't need to import knowledge_curation and subgraph data; we might import them at the migration stage. The entity_2d table stores the 2D embedding data, which you need to prepare manually; if you have multiple models, you might need to choose one model to compute the 2D embedding data. The 2D embedding data is used to visualize the entity data in 2D space.
#[derive(StructOpt, PartialEq, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name="BioMedGPS - importdb", author="Jingcheng Yang <[email protected]>")]
pub struct ImportDBArguments {
    /// [Required] Database url, such as postgres://postgres:postgres@localhost:5432/rnmpdb; if not set, the value of the environment variable DATABASE_URL is used.
    #[structopt(name = "database_url", short = "d", long = "database-url")]
    database_url: Option<String>,

    /// [Optional] Database url, such as neo4j://<username>:<password>@localhost:7687; if not set, the value of the environment variable NEO4J_URL is used. If you don't want to sync the data to the graph database, you can skip this option.
    #[structopt(name = "neo4j_url", short = "n", long = "neo4j-url")]
    neo4j_url: Option<String>,

    /// [Required] The file path of the data file to import. It may be a file or a directory. If you have multiple files to import, you can pass a directory path to the --filepath option and we will import all files in the directory. In that case you need to disable the --drop option, otherwise only the last file will be imported successfully.
    ///
    /// In the case of entity, the file should be a csv/tsv file which contains the id, name, label etc. More details about the format can be found at github.com/yjcyxky/biomedgps-data.
```
```
@@ -109,7 +112,7 @@ pub struct ImportDBArguments {
    table: String,

    /// [Optional] Drop the table before importing data. If you have multiple files to import, don't use this option; if you do, only the last file will be imported successfully.
    #[structopt(name = "drop", short = "D", long = "drop")]
    #[structopt(name = "drop", long = "drop")]
    drop: bool,

    /// [Optional] Don't check other related tables in the database, such as knowledge_curation, which might be related to entity.
```
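
To make the directory-import caveat concrete, here is a hedged sketch; the paths are placeholders and the long flag names follow the option descriptions in these hunks.

```bash
# Import every entity file in a directory: pass the directory to --filepath
# and leave --drop off, otherwise only the last file would survive the import.
biomedgps-cli importdb \
  --database-url "$DATABASE_URL" \
  --neo4j-url "$NEO4J_URL" \
  --filepath ./graph_data/entities/ \
  --table entity
```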
```
@@ -133,14 +136,14 @@ pub struct CacheTableArguments {
    #[structopt(name = "database_url", short = "d", long = "database-url")]
    database_url: Option<String>,

    /// [Optional] Database host, such as postgres-ml:5432. Only needed when you run your application in a docker container and the database is in another container.
    #[structopt(name = "db_host", short = "D", long = "db-host")]
    db_host: Option<String>,

    /// [Optional] Database url, such as neo4j://<username>:<password>@localhost:7687; if not set, the value of the environment variable NEO4J_URL is used.
    #[structopt(name = "neo4j_url", short = "n", long = "neo4j-url")]
    neo4j_url: Option<String>,

    /// [Optional] Database host, such as postgres-ml:5432. Only needed when you run your application in a docker container and the database is in another container.
    #[structopt(name = "db_host", short = "D", long = "db-host")]
    db_host: Option<String>,

    /// [Required] The table name to initialize. Supports compound-disease-symptom, knowledge-score etc.
    #[structopt(name = "table", short = "t", long = "table")]
    table: String,
```
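
Once the entity, relation, and embedding data are in place, the cache tables named above can be built. A sketch using the flags defined in this hunk; the URLs are placeholders, and --db-host is only needed when the service runs in docker and the database sits in another container.

```bash
biomedgps-cli cachetable --database-url "$DATABASE_URL" --neo4j-url "$NEO4J_URL" --table compound-disease-symptom
biomedgps-cli cachetable --database-url "$DATABASE_URL" --neo4j-url "$NEO4J_URL" --table knowledge-score
```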
```
@@ -168,45 +171,7 @@ pub struct CacheTableArguments {
    batch_size: usize,
}

/// Import data files into a graph database.
#[derive(StructOpt, PartialEq, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name="BioMedGPS - importgraph", author="Jingcheng Yang <[email protected]>")]
pub struct ImportGraphArguments {
    /// [Required] Database url, such as neo4j://<username>:<password>@localhost:7687; if not set, the value of the environment variable NEO4J_URL is used.
    #[structopt(name = "neo4j_url", short = "n", long = "neo4j-url")]
    neo4j_url: Option<String>,

    /// [Required] The file path of the data file to import. It may be a file or a directory.
    #[structopt(name = "filepath", short = "f", long = "filepath")]
    filepath: Option<String>,

    /// [Required] The file type of the data file to import. It may be entity, relation, entity_attribute.
    #[structopt(name = "filetype", short = "t", long = "filetype")]
    filetype: Option<String>,

    /// [Optional] Batch size for importing data. Default is 1000.
    #[structopt(name = "batch_size", short = "b", long = "batch-size")]
    batch_size: Option<usize>,

    /// [Optional] Don't check other related tables in the database, such as knowledge_curation, which might be related to entity.
    #[structopt(name = "skip_check", short = "s", long = "skip-check")]
    skip_check: bool,

    /// [Optional] Check if the data exists in the database before importing data.
    #[structopt(name = "check_exist", short = "c", long = "check-exist")]
    check_exist: bool,

    /// [Optional] Show the first 3 errors when importing data.
    #[structopt(name = "show_all_errors", short = "e", long = "show-all-errors")]
    show_all_errors: bool,

    /// [Optional] Which dataset is the data from? We assume that you have split the data into different datasets; if not, you can treat all data as one dataset, e.g. biomedgps. This feature is used to distinguish different dataset combinations matched with your model.
    #[structopt(name = "dataset", short = "d", long = "dataset")]
    dataset: Option<String>,
}

/// Import embedding files into a database
/// The embedding files are generated by KGE models.
/// Import embedding files into a database. The embedding files are generated by KGE models. If you have multiple models for different cases or datasets, you need to import them all, with different parameters such as table_name, model_name, model_type, dataset, description etc. More details about these parameters can be found in their descriptions.
#[derive(StructOpt, PartialEq, Debug)]
#[structopt(setting=structopt::clap::AppSettings::ColoredHelp, name="BioMedGPS - importkge", author="Jingcheng Yang <[email protected]>")]
pub struct ImportKGEArguments {
```
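
The importkge options themselves are not part of this diff; the description only names the parameters (table_name, model_name, model_type, dataset, description). The safest starting point is the generated help text, and any concrete invocation should be checked against that output rather than guessed from the parameter names.

```bash
# Only the subcommand name is confirmed by this diff; list its real options before use.
biomedgps-cli importkge --help
```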
```
@@ -527,51 +492,39 @@ async fn main() {
            arguments.skip_check,
            arguments.show_all_errors,
        )
        .await
    }
    SubCommands::ImportGraph(arguments) => {
        .await;

        // Sync the entity data to the graph database. The relation data will be synced to the graph database in the cachetable command, because we need to compute the score for the relation data.
        let neo4j_url = if arguments.neo4j_url.is_none() {
            match std::env::var("NEO4J_URL") {
                Ok(v) => v,
                Err(_) => {
                    error!("{}", "NEO4J_URL is not set.");
                    std::process::exit(1);
                    error!(
                        "{}",
                        "NEO4J_URL is not set, so skip to sync the data to the graph database."
                    );
                    return ();
                }
            }
        } else {
            arguments.neo4j_url.unwrap()
        };

        let graph = connect_graph_db(&neo4j_url).await;

        let filetype = if arguments.filetype.is_none() {
            error!("Please specify the file type.");
            std::process::exit(1);
        } else {
            arguments.filetype.unwrap()
        };

        let batch_size = if arguments.batch_size.is_none() {
            1000
        } else {
            arguments.batch_size.unwrap()
        };

        if filetype == "entity" || filetype == "relation" || filetype == "entity_attribute" {
        if arguments.table == "entity" {
            let graph = connect_graph_db(&neo4j_url).await;
            let batch_size = 10000;
            import_graph_data(
                &graph,
                &arguments.filepath,
                &filetype,
                &arguments.table,
                arguments.skip_check,
                arguments.check_exist,
                true,
                arguments.show_all_errors,
                batch_size,
                &arguments.dataset,
            )
            .await
        }
        .await;

        if filetype == "entity_index" {
            build_index(
                &graph,
                &arguments.filepath,
```
```
@@ -580,6 +533,8 @@ async fn main() {
            )
            .await
        }

        info!("We have synced the entity data to the graph database, but the relation data will be synced to the graph database in the cachetable command, because we need to compute the score for the relation data. Before you run the cachetable command, you need to ensure that the entity, relation, and the embedding data has been imported into the database.");
    }
    SubCommands::ImportKGE(arguments) => {
        let database_url = if arguments.database_url.is_none() {
```
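
One practical consequence of the new Err branch above: when NEO4J_URL is unset and --neo4j-url is omitted, importdb now logs a message and skips the graph-database sync instead of exiting, so a postgres-only import still completes. A hedged sketch with placeholder paths:

```bash
# Relational import only; the entity sync to neo4j is skipped with a logged message.
unset NEO4J_URL
biomedgps-cli importdb --database-url "$DATABASE_URL" --filepath ./graph_data/entity.tsv --table entity
```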