Merge pull request #289 from george0st/change

TS204
george0st · Apr 19, 2024 · 291f0ec · 291f0ec
2 parents b20bd5a + c58360d
commit 291f0ec
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ The quality gate covers these test scenarios (✅ done, ✔ in-progress, ❌ pla
    - ✅ TS201: Create feature set(s)
    - ✅ TS202: Create feature set(s) & Ingest from DataFrame source (one step)
    - ✅ TS203: Create feature set(s) & Ingest from CSV source (one step) 
-   - ✔  TS204: Create feature set(s) & Ingest from Parquet source (one step)
+   - ✅ TS204: Create feature set(s) & Ingest from Parquet source (one step)
    - ❌ TS205: Create feature set(s) & Ingest from SQL source (one step)
    - ❌ TS206: Create feature set(s) & Ingest from Kafka source (one step)
    - ❌ TS207: Create feature set(s) & Ingest from HTTP source (one step)

diff --git a/docs/applied-limits.md b/docs/applied-limits.md
@@ -19,6 +19,12 @@ NOTE: Solution, it is necessity to use WSL2 under OS Windows
    - in case of e.g. multiple on-line targets, it is not possible to choose 
    the relevant target for a FeatureVector  
 
+## RedisNoSqlTarget
+
+1. Issue with support for the **date** type
+   - see [combination of RedisNoSqlTarget and ParquetSource](https://github.com/mlrun/mlrun/issues/5447)
+
+
 ## SQLTarget
 
 NOTE: It is in preview version, very limited with focus on MySQL only, 

diff --git a/qgate-sln-mlrun-private.env b/qgate-sln-mlrun-private.env
@@ -46,11 +46,11 @@ QGATE_DATASET = 01-size-100
 #QGATE_FILTER_PROJECTS = agate-redis-parquet
 #QGATE_FILTER_PROJECTS = agate-parquet
 #QGATE_FILTER_PROJECTS = agate-redis-csv,agate-redis-parquet,agate-mysql-csv,agate-mysql-parquet,agate-kafka
-QGATE_FILTER_PROJECTS = agate-mysql-csv
+QGATE_FILTER_PROJECTS = agate-redis-csv
 
 # List of test scenarios for testing e.g. TS201, etc. (it is important to keep TS dependencies).
 # Default is empty list (all test scenarios will be tested)
-QGATE_FILTER_SCENARIOS = TS101, TS102, TS203
+QGATE_FILTER_SCENARIOS = TS101, TS102, TS204
 
 # Path to the output directory (as off-line storage, valid for target 'parquet' and 'csv')
 # sample value e.g. ./output

diff --git a/qgate_sln_mlrun/qualityreport.py b/qgate_sln_mlrun/qualityreport.py
@@ -3,7 +3,7 @@
 import os
 from qgate_sln_mlrun.setup import Setup, ProjectDelete
 from qgate_sln_mlrun.output import Output
-from qgate_sln_mlrun.ts import ts101, ts102, ts201, ts202, ts203, ts301, ts302, ts303, ts401, ts501, ts502, ts701, ts801
+from qgate_sln_mlrun.ts import ts101, ts102, ts201, ts202, ts203, ts204, ts301, ts302, ts303, ts401, ts501, ts502, ts701, ts801
 from qgate_sln_mlrun.ts import tsbase
 import logging
 import importlib.resources
@@ -15,7 +15,7 @@ class QualityReport:
     """
 
     TEST_SCENARIOS = [ts101.TS101,
-                      ts201.TS201, ts202.TS202, ts203.TS203,
+                      ts201.TS201, ts202.TS202, ts203.TS203, ts204.TS204,
                       ts301.TS301, ts302.TS302, ts303.TS303,
                       ts401.TS401,
                       ts501.TS501, ts502.TS502]
@@ -31,7 +31,7 @@ class QualityReport:
     TARGET_NOT_VALID_TEST = {"kafka": ["TS501", "TS502"]}
 
     # Test vs Only On/Off-line
-    TEST_BOTH = ["TS101","TS102","TS201","TS301", "TS302", "TS303", "TS401"]
+    TEST_BOTH = ["TS101","TS102","TS201", "TS202", "TS203", "TS204", "TS301", "TS302", "TS303", "TS401"]
     TEST_ONLY_OFFLINE = ["TS501","TS701","TS801"]
     TEST_ONLY_ONLINE = ["TS502"]
 

diff --git a/qgate_sln_mlrun/ts/ts204.py b/qgate_sln_mlrun/ts/ts204.py
@@ -0,0 +1,72 @@
+"""
+  TS204: Create feature set(s) & Ingest from Parquet source (one step)
+"""
+from qgate_sln_mlrun.ts.tsbase import TSBase
+import mlrun
+import mlrun.feature_store as fstore
+from mlrun.data_types.data_types import ValueType
+from mlrun.datastore.sources import ParquetSource
+from qgate_sln_mlrun.ts import ts201
+import os
+import json
+import glob
+
+
+class TS204(TSBase):
+
+    def __init__(self, solution):
+        super().__init__(solution, self.__class__.__name__)
+
+    @property
+    def desc(self) -> str:
+        return "Create feature set(s) & Ingest from Parquet source (one step)"
+
+    @property
+    def long_desc(self):
+        return ("Create feature set(s) & Ingest from Parquet source (one step)")
+
+    def exec(self, project_name):
+        """ Get or create featuresets"""
+
+        for featureset_name in self.get_featuresets(self.project_specs.get(project_name)):
+            # build the glob pattern matching the feature set definition file(s)
+            source_file = os.path.join(os.getcwd(),
+                                       self.setup.model_definition,
+                                       "01-model",
+                                       "02-feature-set",
+                                       f"*-{featureset_name}.json")
+
+            for file in glob.glob(source_file):
+                # iterate across all matching feature set definitions
+                with open(file, "r") as json_file:
+                    self._create_featureset_ingest(f'{project_name}/{featureset_name}', project_name, json_file)
+
+    @TSBase.handler_testcase
+    def _create_featureset_ingest(self, testcase_name, project_name, json_file):
+        json_content = json.load(json_file)
+        name, desc, lbls, kind = TSBase.get_json_header(json_content)
+
+        if kind == "feature-set":
+
+            # create feature set based on the logic in TS201
+            ts=ts201.TS201(self._solution)
+            featureset=ts.create_featureset_content(project_name, f"{self.name}-{name}", desc, json_content['spec'])
+
+            # TODO: get the relevant data file
+            source_file = os.path.join(os.getcwd(),
+                                       self.setup.model_definition,
+                                       "02-data",
+                                       self.setup.dataset_name,
+                                       f"*-{name}.parquet")
+            for file in glob.glob(source_file):
+
+                fstore.ingest(featureset,
+                              ParquetSource(name="tst", path=file),
+                              # overwrite=False,
+                              return_df=False,
+                              # infer_options=mlrun.data_types.data_types.InferOptions.Null)
+                              infer_options=mlrun.data_types.data_types.InferOptions.default())
+                # TODO: switch to InferOptions.Null with Python 3.10, or focus on WSL
+                # NOTE: InferOptions.default() changes the data types
+                # NOTE: InferOptions.Null generates an error with datetime in Python 3.9
+