Activate feather and update all catalogues

metagenopolis · Sep 23, 2024 · a4d7cca
1 parent d385cf6
commit a4d7cca
Show file tree

Hide file tree

Showing 17 changed files with 143 additions and 351 deletions.
diff --git a/meteor/counter.py b/meteor/counter.py
@@ -370,12 +370,7 @@ def save_cram_strain(
             / ref_json["annotation"]["msp"]["filename"]
         )
         msp_content = (
-            pd.read_csv(
-                msp_file,
-                sep="\t",
-                # names=["msp_name", "gene_id", "gene_name", "gene_category"],
-                header=0,
-            )
+            self.load_data(msp_file)
             .query("gene_category == 'core'")
             .groupby("msp_name", as_index=False)
             .head(self.core_size)

diff --git a/meteor/data/category_pcm.feather b/meteor/data/category_pcm.feather
diff --git a/meteor/data/category_pcm.tsv b/meteor/data/category_pcm.tsv
diff --git a/meteor/data/modules_definition.feather b/meteor/data/modules_definition.feather
diff --git a/meteor/data/modules_definition.tsv b/meteor/data/modules_definition.tsv
diff --git a/meteor/data/zenodo.json b/meteor/data/zenodo.json
@@ -1,122 +1,122 @@
 {
     "cat_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12820763/files/fc_1_3_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13771508/files/fc_1_3_gut.tar.xz",
             "filename": "fc_1_3_gut.tar.xz",
-            "md5": "6d058b2f68586ada5e690e2db9ddfa0d"
+            "md5": "b8e9d8e60981d575e38624eed8031642"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12820763/files/fc_1_3_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13771508/files/fc_1_3_gut_taxo.tar.xz",
             "filename": "fc_1_3_gut_taxo.tar.xz",
-            "md5": "01fc5dc52a681e5f8997108b625ccb6b"
+            "md5": "16f99e4f0726170f38e4815ab3fffc4f"
         }
     },
     "chicken_caecal": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12820776/files/gg_13_6_caecal.tar.xz",
+            "catalogue": "https://zenodo.org/records/13771776/files/gg_13_6_caecal.tar.xz",
             "filename": "gg_13_6_caecal.tar.xz",
-            "md5": "b68a9bc0dbecf15b5280c6da3b7b7155"
+            "md5": "2c386b11731d58eaa0e7c90f7d14b0fe"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12820776/files/gg_13_6_caecal_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13771776/files/gg_13_6_caecal_taxo.tar.xz",
             "filename": "gg_13_6_caecal_taxo.tar.xz",
-            "md5": "c35972a8fd685dc25e267e4cc93e9c89"
+            "md5": "0ac3510484385b85ede5f53e91598492"
         }
     },
     "dog_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12820719/files/clf_1_0_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13768924/files/clf_1_0_gut.tar.xz",
             "filename": "clf_1_0_gut.tar.xz",
-            "md5": "e8aa19bacc48641dd54dce10a7b2eea2"
+            "md5": "081d52ed8aad93638fa50ceb4e65cb39"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12820719/files/clf_1_0_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13768924/files/clf_1_0_gut_taxo.tar.xz",
             "filename": "clf_1_0_gut_taxo.tar.xz",
-            "md5": "a60257ed7e64c577c69dc71a913b040a"
+            "md5": "7cbdfc9aa7af0d8a27f5635871ff0f3e"
         }
     },
     "human_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12820832/files/hs_10_4_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13772391/files/hs_10_4_gut.tar.xz",
             "filename": "hs_10_4_gut.tar.xz",
-            "md5": "4e87ef559e9aa66434aea4722b335f7f"
+            "md5": "3fab475bff77e313d8c0d4f4d96ccd2e"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12820832/files/hs_10_4_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13772391/files/hs_10_4_gut_taxo.tar.xz",
             "filename": "hs_10_4_gut_taxo.tar.xz",
-            "md5": "0235037ca9e949fe748f45fec75ca63c"
+            "md5": "781d47c2ab2d8c9766468229a54bcb3a"
         }
     },
     "human_oral": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12821296/files/hs_8_4_oral.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786233/files/hs_8_4_oral.tar.xz",
             "filename": "hs_8_4_oral.tar.xz",
-            "md5": "648d43d0646a1f2218f260e6684f929a"
+            "md5": "9ed8d61a344945f93c81a0198983e01f"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12821296/files/hs_8_4_oral_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786233/files/hs_8_4_oral_taxo.tar.xz",
             "filename": "hs_8_4_oral_taxo.tar.xz",
-            "md5": "91ef007adb3697bdfd04add80173440c"
+            "md5": "c35c6193bd687626a540f8de2f1413bb"
         }
     },
     "human_skin": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12820845/files/hs_2_9_skin.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786083/files/hs_2_9_skin.tar.xz",
             "filename": "hs_2_9_skin.tar.xz",
-            "md5": "de432ae53110316a4060bd98fa7fb131"
+            "md5": "475b77e2c285c78b737f59c13d04839a"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12820845/files/hs_2_9_skin_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786083/files/hs_2_9_skin_taxo.tar.xz",
             "filename": "hs_2_9_skin_taxo.tar.xz",
-            "md5": "0bd86c10f31f4bcc9b661147ffb1bca0"
+            "md5": "31ba766f2c3ec092d9fb5b7fab4759d3"
         }
     },
     "mouse_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12821471/files/mm_5_0_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786249/files/mm_5_0_gut.tar.xz",
             "filename": "mm_5_0_gut.tar.xz",
-            "md5": "33ac32b377ab93117ece81cafd272020"
+            "md5": "1bc0d511caaa9f6b64d63a640597141e"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12821471/files/mm_5_0_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786249/files/mm_5_0_gut_taxo.tar.xz",
             "filename": "mm_5_0_gut_taxo.tar.xz",
-            "md5": "5f15d7f763bfdebf6f89ccdb9d326f71"
+            "md5": "6565223d8b6efbb0cc7334fafe81991e"
         }
     },
     "rabbit_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/12821585/files/oc_5_7_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786259/files/oc_5_7_gut.tar.xz",
             "filename": "oc_5_7_gut.tar.xz",
-            "md5": "3733f7096eb69b00c4a89721609e9511"
+            "md5": "7ba41d08c3defb1b86dc30b76cee9953"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/12821585/files/oc_5_7_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786259/files/oc_5_7_gut_taxo.tar.xz",
             "filename": "oc_5_7_gut_taxo.tar.xz",
-            "md5": "f1eb46abf8246d2c1f8bab69ca72fe3e"
+            "md5": "023093b44673b2b6421486d14f11a093"
         }
     },
     "rat_gut": {
         "file_info": {
-            "catalogue": "https://zenodo.org/records/13119584/files/rn_5_9_gut.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786316/files/rn_5_9_gut.tar.xz",
             "filename": "rn_5_9_gut.tar.xz",
-            "md5": "839e791839da771b0e698a63bbc45d39"
+            "md5": "debbf9677aea2ce668331c333c9efb80"
         },
         "taxonomy_info": {
-            "catalogue": "https://zenodo.org/records/13119584/files/rn_5_9_gut_taxo.tar.xz",
+            "catalogue": "https://zenodo.org/records/13786316/files/rn_5_9_gut_taxo.tar.xz",
             "filename": "rn_5_9_gut_taxo.tar.xz",
-            "md5": "3f4c7ee4a96d5ec8e65227eddad9b8c7"
+            "md5": "a9aa773f44464cce74b71433d3a1c47a"
         }
     },
     "pig_gut": {
         "file_info": {
-            "complete": "https://zenodo.org/records/13119585/files/ssc_9_3_gut.tar.xz",
+            "complete": "https://zenodo.org/records/13786359/files/ssc_9_3_gut.tar.xz",
             "filename": "ssc_9_3_gut.tar.xz",
-            "md5": "1451197acf6257949751061d96fa1856"
+            "md5": "9140d8c2517b3d12b08ffb0bde1319f0"
         },
         "taxonomy_info": {
-            "taxo": "https://zenodo.org/records/13119585/files/ssc_9_3_gut_taxo.tar.xz",
+            "taxo": "https://zenodo.org/records/13786359/files/ssc_9_3_gut_taxo.tar.xz",
             "filename": "ssc_9_3_gut_taxo.tar.xz",
-            "md5": "0059e8ee77083725ab48afdbdaa98fac"
+            "md5": "370884b1730c28d73efa0b886c259ee9"
         }
     },
     "test": {

diff --git a/meteor/merging.py b/meteor/merging.py
@@ -361,13 +361,10 @@ def execute(self) -> None:
                 # Filter the DataFrame to keep only columns with a non-zero sum
                 filtered_df = filtered_df.loc[:, (filtered_df.sum(axis=0) != 0)]
             if my_pattern == "msp":
-                annotation = pd.read_csv(
+                annotation = self.load_data(
                     self.meteor.ref_dir
                     / ref_json["reference_file"]["database_dir"]
-                    / ref_json["annotation"]["taxonomy"]["filename"],
-                    sep="\t",
-                    header=0,
-                    usecols=list(self.ranks.keys()),
+                    / ref_json["annotation"]["taxonomy"]["filename"]
                 )
 
                 annotation = annotation[

diff --git a/meteor/parser.py b/meteor/parser.py
@@ -32,7 +32,8 @@ def load_modules(self, module_file: Path) -> dict[str, str]:
         four columns: id (e.g., MF0001), type (e.g., GMM), name (e.g., butyrate production),
         definition (e.g., K01+K02))
         """
-        module = pd.read_table(module_file, names=["id", "type", "name", "definition"])
+        # module = pd.read_table(module_file, names=["id", "type", "name", "definition"])
+        module = self.load_data(module_file)
         # Remove trailing whitespaces if any
         module["definition"] = module["definition"].str.rstrip()
         module_dict = dict(zip(module.id, module.definition))

diff --git a/meteor/profiler.py b/meteor/profiler.py
@@ -122,17 +122,17 @@ def __post_init__(self):
             )
 
         # Check the input count table
-        self.check_file(
-            self.input_count_table,
-            {
-                "gene_id",
-                "value",
-                "gene_length",
-            },
-        )
+        # self.check_file(
+        #     self.input_count_table,
+        #     {
+        #         "gene_id",
+        #         "value",
+        #         "gene_length",
+        #     },
+        # )
 
         # Load the count table
-        self.gene_count = pd.read_table(self.input_count_table)
+        self.gene_count = self.load_data(self.input_count_table)
         self.gene_count["value"] = self.gene_count["value"].astype(
             pd.SparseDtype("float", fill_value=0.0)
         )
@@ -147,14 +147,14 @@ def __post_init__(self):
             / self.ref_config["annotation"]["msp"]["filename"]
         )
         assert self.msp_filename.is_file()
-        self.check_file(
-            self.msp_filename,
-            {
-                "msp_name",
-                "gene_id",
-                "gene_category",
-            },
-        )
+        # self.check_file(
+        #     self.msp_filename,
+        #     {
+        #         "msp_name",
+        #         "gene_id",
+        #         "gene_category",
+        #     },
+        # )
 
         # Get functional db filenames
         if self.database_type == "complete":
@@ -169,7 +169,7 @@ def __post_init__(self):
 
             # Initialize the module definition file
             self.module_path = (
-                importlib.resources.files("meteor") / "data/modules_definition.tsv"
+                importlib.resources.files("meteor") / "data/modules_definition.feather"
             )
             assert self.module_path.is_file()
 
@@ -321,7 +321,7 @@ def get_msp_core(self, msp_def_filename: Path, core_size: int) -> dict:
         :param core_size: maximum number of core genes to consider.
         """
         # Load msp file
-        msp_df = pd.read_table(msp_def_filename)
+        msp_df = self.load_data(msp_def_filename)
         # Restrict to core
         msp_df_selection = msp_df.loc[msp_df["gene_category"] == "core"]
         # Return the df as a dict of set
@@ -338,7 +338,7 @@ def compute_msp_stats(self, msp_def_filename: Path) -> float:
         :param msp_def_filename: A path object pointing to an MSP definition file.
         """
         # Load msp file
-        msp_df = pd.read_table(msp_def_filename)
+        msp_df = self.load_data(msp_def_filename)
         # Get the ensemble of genes used in MSP
         all_msp_genes = msp_df["gene_id"].unique()
         # Get the percentage of reads that map on an MSP
@@ -357,7 +357,7 @@ def compute_ko_abundance(self, annot_file: Path) -> None:
         :param annot_file: a path object pointing to the annotation gene_name -> enzyme file.
         """
         # Load annotation file
-        annot_df = pd.read_table(annot_file)
+        annot_df = self.load_data(annot_file)
         # Merge count table and gene annotation
         merged_df = pd.merge(
             annot_df,
@@ -378,11 +378,11 @@ def compute_ko_abundance_by_msp(
         :param msp_def_filename: A path object pointing to an MSP definition file.
         """
         # Load annotation file
-        annot_df = pd.read_table(annot_file)
+        annot_df = self.load_data(annot_file)
         # Get KO list
         all_ko = annot_df["annotation"].unique()
         # Load MSP file
-        msp_df = pd.read_table(msp_def_filename)
+        msp_df = self.load_data(msp_def_filename)
         # Merge both data frames
         msp_df_annotated = pd.merge(msp_df, annot_df)
         # Restrict to detected genes
@@ -422,10 +422,10 @@ def compute_ko_stats(
         :param msp_def_filename: A path object pointing to an MSP definition file.
         """
         # Load annotation file
-        annot_df = pd.read_table(annot_file)
+        annot_df = self.load_data(annot_file)
         if by_msp:
             # Load MSP file
-            msp_df = pd.read_table(msp_def_filename)
+            msp_df = self.load_data(msp_def_filename)
             # Merge both data frames
             annot_df = pd.merge(msp_df, annot_df)
         # Get the genes in MSP AND annotated
@@ -449,7 +449,7 @@ def merge_catalogue_info(
         :param annot_file: path to the gene functional annotation file
         """
         # Load files
-        msp_df = pd.read_table(msp_file)
+        msp_df = self.load_data(msp_file)
         # Restrict df to detected genes
         detected_genes = self.gene_count.loc[self.gene_count["value"] > 0, "gene_id"]
         msp_df = msp_df.loc[msp_df["gene_id"].isin(detected_genes)]
@@ -462,7 +462,7 @@ def merge_catalogue_info(
         # Merge each provided db
         annot_df = pd.concat(
             [
-                pd.read_table(db)[["gene_id", "annotation"]]
+                self.load_data(db)[["gene_id", "annotation"]]
                 for db in annot_file.values()
             ],
             ignore_index=True,