From 7c1d497557c1a669f2a7f002ba85529410b13307 Mon Sep 17 00:00:00 2001 From: HuangWei Date: Tue, 19 Sep 2023 10:09:53 +0000 Subject: [PATCH 1/2] fix: add deploy bias on demo and docs --- demo/byzer-taxi/openmldb_byzer_taxi.bznb | 2 +- .../demo/src/main/java/com/openmldb/demo/App.java | 2 +- demo/jd-recommendation/sql_scripts/deploy.sql | 2 +- demo/predict-taxi-trip-duration/README.md | 4 ++-- demo/predict-taxi-trip-duration/script/taxi.sql | 2 +- .../train_and_serve.ipynb | 2 +- .../train_and_serve.py | 3 ++- docs/en/use_case/lightgbm_demo.md | 2 +- .../integration/deploy_integration/OpenMLDB_Byzer_taxi.md | 2 +- docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md | 6 ++++++ docs/zh/quickstart/sdk/java_sdk.md | 2 +- docs/zh/use_case/JD_recommendation.md | 2 +- docs/zh/use_case/taxi_tour_duration_prediction.md | 2 +- 13 files changed, 20 insertions(+), 13 deletions(-) diff --git a/demo/byzer-taxi/openmldb_byzer_taxi.bznb b/demo/byzer-taxi/openmldb_byzer_taxi.bznb index dc1c925cb0f..b4835f7cc85 100644 --- a/demo/byzer-taxi/openmldb_byzer_taxi.bznb +++ b/demo/byzer-taxi/openmldb_byzer_taxi.bznb @@ -64,7 +64,7 @@ "job_id" : null }, { "id" : "240", - "content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT 
ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";", + "content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";", "job_id" : null }, { "id" : "241", diff --git a/demo/java_quickstart/demo/src/main/java/com/openmldb/demo/App.java b/demo/java_quickstart/demo/src/main/java/com/openmldb/demo/App.java index 2923832d3b8..cbe363f4359 100644 --- a/demo/java_quickstart/demo/src/main/java/com/openmldb/demo/App.java +++ b/demo/java_quickstart/demo/src/main/java/com/openmldb/demo/App.java @@ -146,7 +146,7 @@ private void createDeployment() { "(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table, table, table); // 上线一个Deployment - String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql); + String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql); // set return null rs, don't check the 
returned value, it's false state.execute(deploySql); } catch (Exception e) { diff --git a/demo/jd-recommendation/sql_scripts/deploy.sql b/demo/jd-recommendation/sql_scripts/deploy.sql index 7cb2121e869..e37408b6396 100644 --- a/demo/jd-recommendation/sql_scripts/deploy.sql +++ b/demo/jd-recommendation/sql_scripts/deploy.sql @@ -1,6 +1,6 @@ USE JD_db; SET @@execute_mode='online'; -DEPLOY demo select * from +DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') select * from ( select `reqId` as reqId_1, diff --git a/demo/predict-taxi-trip-duration/README.md b/demo/predict-taxi-trip-duration/README.md index 3a2bc6b9dee..c066a83e93b 100644 --- a/demo/predict-taxi-trip-duration/README.md +++ b/demo/predict-taxi-trip-duration/README.md @@ -85,7 +85,7 @@ python3 train.py /tmp/feature_data /tmp/model.txt # The below commands are executed in the CLI > USE demo_db; > SET @@execute_mode='online'; -> DEPLOY demo SELECT trip_duration, passenger_count, +> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, @@ -193,7 +193,7 @@ python3 train.py /tmp/feature.csv /tmp/model.txt ```sql # The below commands are executed in the CLI > USE demo_db; -> DEPLOY demo SELECT trip_duration, passenger_count, +> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, diff --git a/demo/predict-taxi-trip-duration/script/taxi.sql b/demo/predict-taxi-trip-duration/script/taxi.sql index bbdd219b2e5..8ade33df870 100644 --- a/demo/predict-taxi-trip-duration/script/taxi.sql +++ b/demo/predict-taxi-trip-duration/script/taxi.sql @@ -22,7 +22,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN OPTIONS(mode='overwrite'); SET 
@@execute_mode='online'; -DEPLOY demo SELECT trip_duration, passenger_count, +DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, diff --git a/demo/talkingdata-adtracking-fraud-detection/train_and_serve.ipynb b/demo/talkingdata-adtracking-fraud-detection/train_and_serve.ipynb index 6a7c71ff412..b3b01306588 100644 --- a/demo/talkingdata-adtracking-fraud-detection/train_and_serve.ipynb +++ b/demo/talkingdata-adtracking-fraud-detection/train_and_serve.ipynb @@ -187,7 +187,7 @@ "outputs": [], "source": [ "deploy_name='d1'\n", - "%sql DEPLOY $deploy_name $sql_part;" + "%sql DEPLOY $deploy_name OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") $sql_part;" ] }, { diff --git a/demo/talkingdata-adtracking-fraud-detection/train_and_serve.py b/demo/talkingdata-adtracking-fraud-detection/train_and_serve.py index 9cdd93d2074..a592edfdb0e 100644 --- a/demo/talkingdata-adtracking-fraud-detection/train_and_serve.py +++ b/demo/talkingdata-adtracking-fraud-detection/train_and_serve.py @@ -166,7 +166,8 @@ def nothrow_execute(sql): connection.execute("SET @@execute_mode='online';") connection.execute(f'USE {DB_NAME}') nothrow_execute(f'DROP DEPLOYMENT {DEPLOY_NAME}') -deploy_sql = f"""DEPLOY {DEPLOY_NAME} {sql_part}""" +# to avoid data expired by abs ttl, set inf +deploy_sql = f"""DEPLOY {DEPLOY_NAME} OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") {sql_part}""" print(deploy_sql) connection.execute(deploy_sql) print('Import data to online') diff --git a/docs/en/use_case/lightgbm_demo.md b/docs/en/use_case/lightgbm_demo.md index 28132af3829..85cdf47b6c5 100644 --- a/docs/en/use_case/lightgbm_demo.md +++ b/docs/en/use_case/lightgbm_demo.md @@ -152,7 +152,7 @@ Assuming that the model produced by the features designed in Section 2.3 in the ```sql > USE demo_db; > SET @@execute_mode='online'; -> DEPLOY demo SELECT 
trip_duration, passenger_count, +> DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, diff --git a/docs/zh/integration/deploy_integration/OpenMLDB_Byzer_taxi.md b/docs/zh/integration/deploy_integration/OpenMLDB_Byzer_taxi.md index 1d43938f04e..3c6894e59e9 100644 --- a/docs/zh/integration/deploy_integration/OpenMLDB_Byzer_taxi.md +++ b/docs/zh/integration/deploy_integration/OpenMLDB_Byzer_taxi.md @@ -232,7 +232,7 @@ and `sql-0`=''' SET @@execute_mode='online'; ''' and `sql-1`=''' -DEPLOY d1 SELECT trip_duration, passenger_count, +DEPLOY d1 OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, diff --git a/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md b/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md index 4f94e228357..8183c61bb72 100644 --- a/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md +++ b/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md @@ -185,6 +185,12 @@ deploy demo options(SYNC="false") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as 而时间偏移的单位是`min`,我们会在内部将其转换为`min`,并且取上界。比如,新索引ttl是abs 2min,加上偏移20s,结果是`2min + ub(20s) = 3min`,然后和旧索引1min取上界,最终索引ttl是`max(1min, 3min) = 3min`。 +**Example** +```sql +DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as w1_col4_sum FROM t1 LAST JOIN t2 ORDER BY t2.col3 ON t1.col2 = t2.col2 + WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); +``` + ## 相关SQL [USE DATABASE](../ddl/USE_DATABASE_STATEMENT.md) diff --git a/docs/zh/quickstart/sdk/java_sdk.md b/docs/zh/quickstart/sdk/java_sdk.md index 17e5019a5b3..1eb8776f6c1 100644 --- 
a/docs/zh/quickstart/sdk/java_sdk.md +++ b/docs/zh/quickstart/sdk/java_sdk.md @@ -401,7 +401,7 @@ try { "(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table, table, table); // 上线一个Deployment - String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql); + String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql); // set return null rs, don't check the returned value, it's false state.execute(deploySql); } catch (Exception e) { diff --git a/docs/zh/use_case/JD_recommendation.md b/docs/zh/use_case/JD_recommendation.md index 143666d58ec..0a5b5de69cb 100644 --- a/docs/zh/use_case/JD_recommendation.md +++ b/docs/zh/use_case/JD_recommendation.md @@ -393,7 +393,7 @@ bash train_deepfm.sh $demodir/feature_preprocess/out ```sql -- OpenMLDB CLI USE JD_db; - DEPLOY demo ; + DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') ; ``` 也可以在 Docker 容器内直接运行部署脚本: diff --git a/docs/zh/use_case/taxi_tour_duration_prediction.md b/docs/zh/use_case/taxi_tour_duration_prediction.md index 21d2c50fee6..1432f6133aa 100644 --- a/docs/zh/use_case/taxi_tour_duration_prediction.md +++ b/docs/zh/use_case/taxi_tour_duration_prediction.md @@ -151,7 +151,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN --OpenMLDB CLI USE demo_db; SET @@execute_mode='online'; - DEPLOY demo SELECT trip_duration, passenger_count, + DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count, sum(pickup_latitude) OVER w AS vendor_sum_pl, max(pickup_latitude) OVER w AS vendor_max_pl, min(pickup_latitude) OVER w AS vendor_min_pl, From fc6716d4d3212c4272ede826e9d7c10a86722d29 Mon Sep 17 00:00:00 2001 From: HuangWei Date: Mon, 9 Oct 2023 03:30:53 +0000 Subject: [PATCH 2/2] fix --- docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md | 4 ++-- docs/zh/use_case/taxi_tour_duration_prediction.md | 4 ++++ 2 files changed, 6 insertions(+), 
2 deletions(-) diff --git a/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md b/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md index 8183c61bb72..41b3e1141e8 100644 --- a/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md +++ b/docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md @@ -175,9 +175,9 @@ deploy demo options(SYNC="false") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); ``` -#### 设置偏移 +#### 设置偏移BIAS -如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。 +如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移BIAS,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。 时间偏移,单位可以是`s`、`m`、`h`、`d`,也可以是整数,单位为`ms`,也可以是`inf`,表示永不淘汰;如果是行数偏移,可以是整数,单位是`row`,也可以是`inf`,表示永不淘汰。两种偏移中,0均表示不偏移。 diff --git a/docs/zh/use_case/taxi_tour_duration_prediction.md b/docs/zh/use_case/taxi_tour_duration_prediction.md index 1432f6133aa..42764f5c00a 100644 --- a/docs/zh/use_case/taxi_tour_duration_prediction.md +++ b/docs/zh/use_case/taxi_tour_duration_prediction.md @@ -167,6 +167,10 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW); ``` +```{note} +此处DEPLOY包含BIAS OPTIONS,是因为导入在线存储的数据文件不会更新,对于当前时间来讲,可能会超过DEPLOY后的表索引的时间TTL,导致表淘汰掉这些数据。时间淘汰,只看每个索引的ts列和ttl,只要数据中该列的值<(当前时间-abs_ttl),在该索引上就会被淘汰,与其他因素无关,各个索引也互相不影响。如果你的数据不是实时产生的新timestamp,也需要考虑带上BIAS OPTIONS。 +``` + ### 步骤 7:导入在线数据 首先,请切换到**在线**执行模式。接着在在线模式下,导入样例数据 `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` 作为在线数据,用于在线特征计算。