Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add deploy bias on demo and docs #3520

Merged
merged 2 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion demo/byzer-taxi/openmldb_byzer_taxi.bznb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"job_id" : null
}, {
"id" : "240",
"content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";",
"content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";",
"job_id" : null
}, {
"id" : "241",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ private void createDeployment() {
"(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table,
table, table);
// 上线一个Deployment
String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql);
String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql);
// set return null rs, don't check the returned value, it's false
state.execute(deploySql);
} catch (Exception e) {
Expand Down
2 changes: 1 addition & 1 deletion demo/jd-recommendation/sql_scripts/deploy.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
USE JD_db;
SET @@execute_mode='online';
DEPLOY demo select * from
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') select * from
(
select
`reqId` as reqId_1,
Expand Down
4 changes: 2 additions & 2 deletions demo/predict-taxi-trip-duration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ python3 train.py /tmp/feature_data /tmp/model.txt
# The below commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='online';
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down Expand Up @@ -193,7 +193,7 @@ python3 train.py /tmp/feature.csv /tmp/model.txt
```sql
# The below commands are executed in the CLI
> USE demo_db;
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
2 changes: 1 addition & 1 deletion demo/predict-taxi-trip-duration/script/taxi.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
OPTIONS(mode='overwrite');

SET @@execute_mode='online';
DEPLOY demo SELECT trip_duration, passenger_count,
DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
"outputs": [],
"source": [
"deploy_name='d1'\n",
"%sql DEPLOY $deploy_name $sql_part;"
"%sql DEPLOY $deploy_name OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") $sql_part;"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ def nothrow_execute(sql):
connection.execute("SET @@execute_mode='online';")
connection.execute(f'USE {DB_NAME}')
nothrow_execute(f'DROP DEPLOYMENT {DEPLOY_NAME}')
deploy_sql = f"""DEPLOY {DEPLOY_NAME} {sql_part}"""
# to avoid data expired by abs ttl, set inf
deploy_sql = f"""DEPLOY {DEPLOY_NAME} OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") {sql_part}"""
print(deploy_sql)
connection.execute(deploy_sql)
print('Import data to online')
Expand Down
2 changes: 1 addition & 1 deletion docs/en/use_case/lightgbm_demo.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ Assuming that the model produced by the features designed in Section 2.3 in the
```sql
> USE demo_db;
> SET @@execute_mode='online';
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ and `sql-0`='''
SET @@execute_mode='online';
'''
and `sql-1`='''
DEPLOY d1 SELECT trip_duration, passenger_count,
DEPLOY d1 OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
10 changes: 8 additions & 2 deletions docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,16 +175,22 @@ deploy demo options(SYNC="false") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as
WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
```

#### 设置偏移
#### 设置偏移BIAS

如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。
如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移BIAS,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。

时间偏移,单位可以是`s``m``h``d`,也可以是整数,单位为`ms`,也可以是`inf`,表示永不淘汰;如果是行数偏移,可以是整数,单位是`row`,也可以是`inf`,表示永不淘汰。两种偏移中,0均表示不偏移。

注意,我们只将偏移加在deploy的解析索引中,也就是新索引,它们并不是最终索引。最终索引的计算方式是,如果是创建索引,最终索引是`解析索引 + 偏移`;如果是更新索引,最终索引是`merge(旧索引, 新索引 + 偏移)`

而时间偏移的单位是`min`,我们会在内部将其转换为`min`,并且取上界。比如,新索引ttl是abs 2min,加上偏移20s,结果是`2min + ub(20s) = 3min`,然后和旧索引1min取上界,最终索引ttl是`max(1min, 3min) = 3min`

**Example**
vagetablechicken marked this conversation as resolved.
Show resolved Hide resolved
```sql
DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as w1_col4_sum FROM t1 LAST JOIN t2 ORDER BY t2.col3 ON t1.col2 = t2.col2
WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
```

## 相关SQL

[USE DATABASE](../ddl/USE_DATABASE_STATEMENT.md)
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/quickstart/sdk/java_sdk.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ try {
"(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table,
table, table);
// 上线一个Deployment
String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql);
String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql);
// set return null rs, don't check the returned value, it's false
state.execute(deploySql);
} catch (Exception e) {
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/use_case/JD_recommendation.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ bash train_deepfm.sh $demodir/feature_preprocess/out
```sql
-- OpenMLDB CLI
USE JD_db;
DEPLOY demo <SQL>;
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') <SQL>;
```

也可以在 Docker 容器内直接运行部署脚本:
Expand Down
6 changes: 5 additions & 1 deletion docs/zh/use_case/taxi_tour_duration_prediction.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
--OpenMLDB CLI
USE demo_db;
SET @@execute_mode='online';
DEPLOY demo SELECT trip_duration, passenger_count,
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
vagetablechicken marked this conversation as resolved.
Show resolved Hide resolved
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand All @@ -167,6 +167,10 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);
```

```{note}
此处DEPLOY包含BIAS OPTIONS,是因为导入在线存储的数据文件不会更新,对于当前时间来讲,可能会超过DEPLOY后的表索引的时间TTL,导致表淘汰掉这些数据。时间淘汰,只看每个索引的ts列和ttl,只要数据中该列的值<(当前时间-abs_ttl),在该索引上就会被淘汰,与其他因素无关,各个索引也互相不影响。如果你的数据不是实时产生的新timestamp,也需要考虑带上BIAS OPTIONS。
```

### 步骤 7:导入在线数据

首先,请切换到**在线**执行模式。接着在在线模式下,导入样例数据 `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` 作为在线数据,用于在线特征计算。
Expand Down
Loading