Merge pull request #105 from grand-mother/dev_database
Dev database
lwpiotr authored Oct 14, 2024
2 parents 96155d2 + d32acf5 commit 3b01985
Showing 14 changed files with 603 additions and 147 deletions.
4 changes: 3 additions & 1 deletion granddb/config.ini.example
@@ -21,8 +21,9 @@ localdir = ["/home/fleg/incoming/"]
 ; If credentials are required to access the repository, they should be given in the [credentials] section using the same name
 ; repository CCIN2P3 is already defined in the database (so it's not necessary to define it here), but credentials for it have
 ; to be supplied in the [credentials] section below
+; THESE DEFINITIONS OVERRIDE THE ONES FROM THE DATABASE
 [repositories]
-CCIN2P3 = ["ssh","cca.in2p3.fr",22,["/sps/grand/data/nancay/GRANDfiles"]]
+CC = ["ssh","cca.in2p3.fr",22,["/sps/grand/data/nancay/GRANDfiles"]]
 
 
 ; Credentials for repositories given as :
@@ -35,6 +36,7 @@ CCIN2P3 = ["ssh","cca.in2p3.fr",22,["/sps/grand/data/nancay/GRANDfiles"]]
 ; To run an ssh-agent just do : `eval $(ssh-agent)` and `ssh-add .ssh/id_rsa`
 [credentials]
 CCIN2P3 = ["john",""]
+CC = ["jim",""]
 SSHTUNNEL = ["joe",""]
 
 ; database to use (only one database can be defined)
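
Illustration (not part of the commit): repository entries are Python-style list literals of the form [protocol, server, port, [paths]], and credentials are [user, password]. A minimal standalone sketch of reading the new CC entries (granddb's own parsing may differ):

import ast
import configparser

config = configparser.ConfigParser()
config.read("config.ini")  # a copy of config.ini.example with real values

# Each value is a Python-style list literal
protocol, server, port, paths = ast.literal_eval(config["repositories"]["CC"])
user, password = ast.literal_eval(config["credentials"]["CC"])
print(protocol, server, port, paths, user)  # ssh cca.in2p3.fr 22 ['/sps/grand/data/nancay/GRANDfiles'] jim
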
299 changes: 258 additions & 41 deletions granddb/granddatalib.py

Large diffs are not rendered by default.

81 changes: 56 additions & 25 deletions granddb/granddblib.py
@@ -22,22 +22,33 @@
 
 
 def casttodb(value):
+    #print(f'{type(value)} - {value}')
     if isinstance(value, numpy.uint32):
-        value = int(value)
-    if isinstance(value, numpy.float32):
-        value = float(value)
-    if isinstance(value, numpy.ndarray):
+        val = int(value)
+    elif isinstance(value, numpy.float32):
+        val = float(value)
+    elif isinstance(value, numpy.ndarray):
         if value.size == 0:
-            value = None
+            val = None
         elif value.size == 1:
-            value = value.item()
+            val = value.item()
         else:
-            value = value.tolist()
-    if isinstance(value, grand.dataio.root_trees.StdVectorList):
-        value = [i for i in value]
-    if isinstance(value, str):
-        value = value.strip().strip('\t').strip('\n')
-    return value
+            val = value.tolist()
+    elif isinstance(value, grand.dataio.root_trees.StdVectorList):
+        val = []
+        # postgres cannot store arrays of arrays... so we split (not sure if really correct)!
+        for i in value:
+            if isinstance(i, numpy.ndarray) or isinstance(i, grand.dataio.root_trees.StdVectorList):
+                val.append(casttodb(i))
+            else:
+                val.append(i)
+
+        #value = [i for i in value]
+    elif isinstance(value, str):
+        val = value.strip().strip('\t').strip('\n')
+    else:
+        val = value
+    return val
 
 
 ## @brief Class to handle the Grand database.
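
Illustration (not part of the commit): the rewritten casttodb() is now a single if/elif chain with an explicit fallback; typical conversions, assuming `from granddb.granddblib import casttodb`:

import numpy

print(casttodb(numpy.uint32(7)))                # -> 7 (plain int)
print(casttodb(numpy.float32(1.5)))             # -> 1.5 (plain float)
print(casttodb(numpy.array([])))                # -> None (empty array)
print(casttodb(numpy.array([42])))              # -> 42 (single element unwrapped)
print(casttodb(numpy.array([[1, 2], [3, 4]])))  # -> [[1, 2], [3, 4]] (tolist)
print(casttodb("  gp13\n"))                     # -> 'gp13' (whitespace stripped)
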
@@ -94,7 +105,8 @@ def __init__(self, host, port, dbname, user, passwd, sshserv="", sshport=22, cre
         Base = automap_base()
 
         Base.prepare(engine, reflect=True)
-        self.sqlalchemysession = Session(engine)
+        self.sqlalchemysession = Session(engine, autoflush=False)
+        #self.sqlalchemysession.no_autoflush = True
         inspection = inspect(engine)
         for table in inspection.get_table_names():
             # for table in engine.table_names(): #this is obsolete
@@ -306,21 +318,37 @@ def register_repository(self, name, protocol, port, server, path, description=""
     # Returns the id_file for the file and a boolean True if the file was not previously in the DB (i.e. it's a new file)
     # and False if the file was already registered. This is useful to know whether the metadata of the file needs to be read
     # or not
-    def register_filename(self, filename, newfilename, id_repository, provider):
+    def register_filename(self, filename, newfilename, dataset, id_repository, provider, targetfile=None):
         import os
         register_file = False
         isnewfile = False
         idfile = None
+        id_dataset = None
+        if targetfile is None:
+            targetfile = newfilename
+        if dataset is not None:
+            id_dataset = self.get_or_create_key('dataset', 'dataset_name', os.path.basename(dataset))
+            filt = {}
+            filt['id_dataset'] = str(casttodb(id_dataset))
+            filt['id_repository'] = str(casttodb(id_repository))
+            ret = self.sqlalchemysession.query(getattr(self._tables['dataset_location'], 'id_dataset')).filter_by(
+                **filt).all()
+            if len(ret) == 0:
+                container = self.tables()['dataset_location'](id_dataset=id_dataset, id_repository=id_repository, path=dataset,
+                                                              description="")
+                self.sqlalchemysession.add(container)
+                self.sqlalchemysession.flush()
 
-        ## Check if file not already registered IN THIS REPO : IF YES, ABORT, IF NO REGISTER
-        #First see if file is registered elsewhere
         file_exist = self.sqlalchemysession.query(self.tables()['file']).filter_by(
-            filename=os.path.basename(newfilename)).first()
+            filename=os.path.basename(targetfile), id_dataset=id_dataset).first()
         if file_exist is not None:
+            # file_exist_here = self.sqlalchemysession.query(self.tables()['file_location']).filter_by(
+            #     id_repository=id_repository).first()
+            # File exists somewhere... see if in the repository we want
             file_exist_here = self.sqlalchemysession.query(self.tables()['file_location']).filter_by(
-                id_repository=id_repository).first()
+                id_repository=id_repository, id_file=file_exist.id_file).first()
             if file_exist_here is None:
-                # file exists in different repo. We only need to register it in the current repo
+                # file exists but in a different repo. We only need to register it in the current repo
                 register_file = True
                 idfile = file_exist.id_file
             else:
@@ -332,11 +360,11 @@ def register_filename(self, filename, newfilename, id_repository, provider):
         if register_file:
             id_provider = self.get_or_create_key('provider', 'provider', provider)
             if isnewfile:
-                # rfile = ROOT.TFile(str(filename))
                 rfile = rdb.RootFile(str(filename))
-                rfile.dataset_name()
+                #rfile.dataset_name()
                 # rfile.file().GetSize()
-                container = self.tables()['file'](filename=os.path.basename(newfilename),
+                container = self.tables()['file'](id_dataset=id_dataset,
+                                                  filename=os.path.basename(targetfile),
                                                   description='autodesc',
                                                   original_name=os.path.basename(filename),
                                                   id_provider=id_provider,
@@ -346,9 +374,10 @@ def register_filename(self, filename, newfilename, id_repository, provider):
             self.sqlalchemysession.flush()
             idfile = container.id_file
             # container = self.tables()['file_location'](id_file=idfile, id_repository=id_repository, path=os.path.dirname(newfilename))
-            container = self.tables()['file_location'](id_file=idfile, id_repository=id_repository, path=newfilename,
+            container = self.tables()['file_location'](id_file=idfile, id_repository=id_repository, path=targetfile,
                                                        description="")
             self.sqlalchemysession.add(container)
+            logger.debug(f"File name {filename} registered")
             # self.sqlalchemysession.flush()
 
         return idfile, isnewfile
@@ -485,12 +514,14 @@ def register_filecontent(self, file, idfile):
         # print('Execution time:', elapsed_time, 'seconds')
         logger.debug(f"execution time {elapsed_time} seconds")
 
-    def register_file(self, orgfilename, newfilename, id_repository, provider):
-        idfile, read_file = self.register_filename(orgfilename, newfilename, id_repository, provider)
+    def register_file(self, orgfilename, newfilename, dataset, id_repository, provider, targetdir=None):
+        idfile, read_file = self.register_filename(orgfilename, newfilename, dataset, id_repository, provider, targetdir)
         if read_file:
             # We read the localfile and not the remote one
             self.register_filecontent(orgfilename, idfile)
             # self.register_filecontent(newfilename,idfile)
+        else:
+            logger.info(f"file {orgfilename} already registered.")
         self.sqlalchemysession.commit()
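
Illustration (not part of the commit): the new dataset-aware call chain, with hypothetical paths and ids; `db` stands for an already-connected granddblib database handler:

db.register_file(
    orgfilename="/home/fleg/incoming/gp13_20241014_0001.root",              # local file whose metadata is read
    newfilename="/sps/grand/data/gp13/GRANDfiles/gp13_20241014_0001.root",  # registered location
    dataset="/sps/grand/data/gp13/GRANDfiles",  # basename "GRANDfiles" becomes the dataset_name key
    id_repository=1,                            # hypothetical repository id
    provider="jim",
)
# register_filename() records the dataset_location and file rows (flushing, not
# committing); register_file() commits once at the end.
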


35 changes: 20 additions & 15 deletions granddb/rootdblib.py
@@ -1,6 +1,7 @@
 import ROOT
 import grand.dataio.root_trees as groot
 import grand.manage_log as mlg
+import os
 logger = mlg.get_logger_for_script(__name__)
 #mlg.create_output_for_logger("debug", log_stdout=True)
 
@@ -136,21 +137,21 @@ class RootFile:
         'hadronic_model': 'id_hadronic_model',
         'low_energy_model': 'id_low_energy_model',
         'cpu_time': 'cpu_time',
-        # 'long_pd_depth': 'long_pd_depth',
-        # 'long_pd_eminus': 'long_pd_eminus',
-        # 'long_pd_eplus': 'long_pd_eplus',
-        # 'long_pd_muminus': 'long_pd_muminus',
-        # 'long_pd_muplus': 'long_pd_muplus',
-        # 'long_pd_gamma': 'long_pd_gamma',
-        # 'long_pd_hadron': 'long_pd_hadron',
-        # 'long_gamma_elow': 'long_gamma_elow',
-        # 'long_e_elow': 'long_e_elow',
-        # 'long_e_edep': 'long_e_edep',
-        # 'long_mu_edep': 'long_mu_edep',
-        # 'long_mu_elow': 'long_mu_elow',
-        # 'long_hadron_edep': 'long_hadron_edep',
-        # 'long_hadron_elow': 'long_hadron_elow',
-        # 'long_neutrino': 'long_neutrino',
+        #'long_pd_depth': 'long_pd_depth',
+        #'long_pd_eminus': 'long_pd_eminus',
+        #'long_pd_eplus': 'long_pd_eplus',
+        #'long_pd_muminus': 'long_pd_muminus',
+        #'long_pd_muplus': 'long_pd_muplus',
+        #'long_pd_gamma': 'long_pd_gamma',
+        #'long_pd_hadron': 'long_pd_hadron',
+        #'long_gamma_elow': 'long_gamma_elow',
+        #'long_e_elow': 'long_e_elow',
+        #'long_e_edep': 'long_e_edep',
+        #'long_mu_edep': 'long_mu_edep',
+        #'long_mu_elow': 'long_mu_elow',
+        #'long_hadron_edep': 'long_hadron_edep',
+        #'long_hadron_elow': 'long_hadron_elow',
+        #'long_neutrino': 'long_neutrino',
         'event_weight': 'event_weight'
     }
     tadcToDB = {
@@ -307,9 +308,11 @@ class RootFile:
     #TreeList is a dict with name of the trees as key and the class corresponding to its type as value
     TreeList = {}
     file = None
+    filename = None
 
     ## We retrieve the list of TTrees in the file and store them as the corresponding class from root_trees.py in the dict TreeList
     def __init__(self, f_name):
+        self.filename = f_name
         self.TreeList.clear()
         self.file = ROOT.TFile(f_name)
         for key in self.file.GetListOfKeys():
@@ -341,6 +344,8 @@ def copy_content_to(self, file):
     # [extra]-> given by user (metadata ?)
     # serial -> automatically incremented in case of new version (how to do that ?)
     def dataset_name(self):
+        name = os.path.basename(os.path.dirname(self.filename))
+        return name
         treename = 'trun'
         name = "noname"
         for run in self.TreeList[treename].get_list_of_runs():
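
Illustration (not part of the commit): dataset_name() now short-circuits to the parent directory name of the file, so the tree-based naming logic below the early return is left unreachable:

import os

# With the early return above, the dataset name is just the parent directory:
filename = "/sps/grand/data/gp13/GRANDfiles/gp13_20241014_0001.root"  # hypothetical path
print(os.path.basename(os.path.dirname(filename)))  # -> "GRANDfiles"
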
4 changes: 4 additions & 0 deletions scripts/archiving/config.properties.gaa
@@ -0,0 +1,4 @@
aipTempDirectory=/sps/grand/prod_grand/tests/archivage/archs/gaa/
configMetadataDescriptiveDC=dc_gaa.xml
configDocumentation=GRAND_DMP_2024.pdf
representationID_1=representation1
4 changes: 4 additions & 0 deletions scripts/archiving/config.properties.gp13
@@ -0,0 +1,4 @@
aipTempDirectory=/sps/grand/prod_grand/tests/archivage/archs/gp13/
configMetadataDescriptiveDC=dc_gp13.xml
configDocumentation=GRAND_DMP_2024.pdf
representationID_1=representation1
76 changes: 76 additions & 0 deletions scripts/archiving/create_archive.bash
@@ -0,0 +1,76 @@
#!/bin/bash
datadir="/sps/grand/data"
archive_root_name="doi+10.25520+in2p3.archive.grand"
irods_path='/grand/home/trirods/data/archives/'

usage="$(basename "$0") [-d DATE] [-s SITE]
Archive some Grand raw files into irods :
  -s site (gaa, gp13)
  -d YYYY-MM to be archived
"

while getopts ":d:s:" option; do
  case $option in
    d)
      if [[ ${OPTARG} =~ ^([0-9]{4})-([0][1-9]|[1][0-2]|[1-9])$ ]]; then
        date=$(date --date="${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-01" "+%Y_%m")
        dir=$(date --date="${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-01" "+%Y/%m")
      else
        echo "Date ${OPTARG} should be in format YYYY-MM"
        exit 1
      fi
      ;;
    s)
      if [[ ${OPTARG} =~ gp13|gaa ]]; then
        site=${OPTARG}
      else
        echo "Site should be gp13 or gaa"
        exit 1
      fi
      ;;
    :)
      printf "option -${OPTARG} needs an argument\n"
      exit 1;;
    ?) # Invalid option
      printf "Error: Invalid option -${OPTARG}\n"
      exit 1;;
  esac
done

if [ ! "$date" ] || [ ! "$site" ]; then
  echo "arguments -d and -s must be provided"
  echo "$usage" >&2; exit 1
fi

outfile="${archive_root_name}.${site}.${date}"
logfile=archs/${site}/${outfile}--$(date "+%Y_%m_%d_%H%M%S").log

find $datadir/$site/raw/$dir/ -name "*.bin" > list_files_${site}
echo "List of files to archive :" >> ${logfile}
cat list_files_${site} >> ${logfile}

java -jar createAIP.jar --configfile=config.properties.${site} --listobjects=list_files_${site} -i ${outfile}

echo "Archive ready to tar" >> ${logfile}

tar -cvf archs/${site}/${outfile}.tar archs/${site}/${outfile}

echo "Archive tarred" >> ${logfile}

echo "Push archs/${site}/${outfile}.tar to irods" >> ${logfile}
# Put file into irods
sfile=archs/${site}/${outfile}.tar
ipath="${irods_path}${site}/raw"
ifile="${ipath}/${outfile}.tar"
echo "imkdir -p $ipath" >> ${logfile}
imkdir -p $ipath >> ${logfile} 2>&1
echo "iput -f $sfile $ifile" >> ${logfile}
#iput -f $sfile $ifile >> ${logfile} 2>&1
#iput_status=$?
#if [ "$iput_status" -ne 0 ]; then
#  notify=1
#fi

rm -rf archs/${site}/${outfile}
rm $sfile
echo "Month archived." >> ${logfile}
32 changes: 32 additions & 0 deletions scripts/archiving/dc_gaa.xml
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.4/metadata.xsd">
<identifier>
<identifier identifierType="DOI">doi+10.25520+in2p3.archive+grand+gaa</identifier>
</identifier>
<titles>
<title xml:lang="EN">Grand Raw Files from GAA</title>
</titles>
<creators>
<creator>
<creatorName nameType="Organizational">Grand Observatory</creatorName>
<nameIdentifier>Grand Observatory</nameIdentifier>
</creator>
</creators>
<publisher>Grand Observatory</publisher>
<publicationYear>2024</publicationYear>
<resourceType resourceTypeGeneral="Dataset">Grand raw files</resourceType>
<descriptions>
<description xml:lang="EN">Grand Raw Data from GAA Observatory</description>
</descriptions>
<subjects>
<subject subjectScheme="wikidata">radio astronomy cosmics rays neutrinos</subject>
</subjects>
<dates>
<date dateType="Copyrighted">2024</date>
</dates>
<geoLocations>
<geoLocation>
<geoLocationPlace>Argentina</geoLocationPlace>
</geoLocation>
</geoLocations>
</resource>
32 changes: 32 additions & 0 deletions scripts/archiving/dc_gp13.xml
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.4/metadata.xsd">
<identifier>
<identifier identifierType="DOI">doi+10.25520+in2p3.archive+grand+gp13</identifier>
</identifier>
<titles>
<title xml:lang="EN">Grand Raw Files from GP13</title>
</titles>
<creators>
<creator>
<creatorName nameType="Organizational">Grand Observatory</creatorName>
<nameIdentifier>Grand Observatory</nameIdentifier>
</creator>
</creators>
<publisher>Grand Observatory</publisher>
<publicationYear>2024</publicationYear>
<resourceType resourceTypeGeneral="Dataset">Grand raw files</resourceType>
<descriptions>
<description xml:lang="EN">Grand Raw Data from GP13 Observatory</description>
</descriptions>
<subjects>
<subject subjectScheme="wikidata">radio astronomy cosmics rays neutrinos</subject>
</subjects>
<dates>
<date dateType="Copyrighted">2024</date>
</dates>
<geoLocations>
<geoLocation>
<geoLocationPlace>China</geoLocationPlace>
</geoLocation>
</geoLocations>
</resource>