Skip to content

Commit

Permalink
fix: add deploy bias on demo and docs (#3520)
Browse files Browse the repository at this point in the history
  • Loading branch information
vagetablechicken authored Oct 11, 2023
1 parent 1386632 commit 3437741
Show file tree
Hide file tree
Showing 13 changed files with 26 additions and 15 deletions.
2 changes: 1 addition & 1 deletion demo/byzer-taxi/openmldb_byzer_taxi.bznb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"job_id" : null
}, {
"id" : "240",
"content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";",
"content" : "run command as FeatureStoreExt.`` where\r\nzkAddress=\"127.0.0.1:2181\"\r\nand zkPath=\"/openmldb\"\r\nand `sql-0`='''\r\nSET @@execute_mode='online';\r\n'''\r\nand `sql-1`='''\r\nDEPLOY d1 OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") SELECT trip_duration, passenger_count,\r\nsum(pickup_latitude) OVER w AS vendor_sum_pl,\r\nmax(pickup_latitude) OVER w AS vendor_max_pl,\r\nmin(pickup_latitude) OVER w AS vendor_min_pl,\r\navg(pickup_latitude) OVER w AS vendor_avg_pl,\r\nsum(pickup_latitude) OVER w2 AS pc_sum_pl,\r\nmax(pickup_latitude) OVER w2 AS pc_max_pl,\r\nmin(pickup_latitude) OVER w2 AS pc_min_pl,\r\navg(pickup_latitude) OVER w2 AS pc_avg_pl ,\r\ncount(vendor_id) OVER w2 AS pc_cnt,\r\ncount(vendor_id) OVER w AS vendor_cnt\r\nFROM t1 \r\nWINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),\r\nw2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);\r\n'''\r\nand db=\"db1\"\r\nand action=\"ddl\";",
"job_id" : null
}, {
"id" : "241",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ private void createDeployment() {
"(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table,
table, table);
// 上线一个Deployment
String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql);
String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql);
// set return null rs, don't check the returned value, it's false
state.execute(deploySql);
} catch (Exception e) {
Expand Down
2 changes: 1 addition & 1 deletion demo/jd-recommendation/sql_scripts/deploy.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
USE JD_db;
SET @@execute_mode='online';
DEPLOY demo select * from
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') select * from
(
select
`reqId` as reqId_1,
Expand Down
4 changes: 2 additions & 2 deletions demo/predict-taxi-trip-duration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ python3 train.py /tmp/feature_data /tmp/model.txt
# The below commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='online';
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down Expand Up @@ -193,7 +193,7 @@ python3 train.py /tmp/feature.csv /tmp/model.txt
```sql
# The below commands are executed in the CLI
> USE demo_db;
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
2 changes: 1 addition & 1 deletion demo/predict-taxi-trip-duration/script/taxi.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
OPTIONS(mode='overwrite');

SET @@execute_mode='online';
DEPLOY demo SELECT trip_duration, passenger_count,
DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
"outputs": [],
"source": [
"deploy_name='d1'\n",
"%sql DEPLOY $deploy_name $sql_part;"
"%sql DEPLOY $deploy_name OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") $sql_part;"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ def nothrow_execute(sql):
connection.execute("SET @@execute_mode='online';")
connection.execute(f'USE {DB_NAME}')
nothrow_execute(f'DROP DEPLOYMENT {DEPLOY_NAME}')
deploy_sql = f"""DEPLOY {DEPLOY_NAME} {sql_part}"""
# Set both biases to inf so that the data is not expired by the index's absolute TTL.
deploy_sql = f"""DEPLOY {DEPLOY_NAME} OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") {sql_part}"""
print(deploy_sql)
connection.execute(deploy_sql)
print('Import data to online')
Expand Down
2 changes: 1 addition & 1 deletion docs/en/use_case/lightgbm_demo.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ Assuming that the model produced by the features designed in Section 2.3 in the
```sql
> USE demo_db;
> SET @@execute_mode='online';
> DEPLOY demo SELECT trip_duration, passenger_count,
> DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ and `sql-0`='''
SET @@execute_mode='online';
'''
and `sql-1`='''
DEPLOY d1 SELECT trip_duration, passenger_count,
DEPLOY d1 OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand Down
10 changes: 8 additions & 2 deletions docs/zh/openmldb_sql/deployment_manage/DEPLOY_STATEMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,16 +175,22 @@ deploy demo options(SYNC="false") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as
WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
```

#### 设置偏移
#### 设置偏移BIAS

如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。
如果你并不希望数据根据deploy的索引淘汰,或者希望晚一点淘汰,可以在deploy时设置偏移BIAS,常用于数据时间戳并不实时的情况、测试等情况。如果deploy后的索引ttl为abs 3h,但是数据的时间戳是3h前的(以系统时间为基准),那么这条数据就会被淘汰,无法参与计算。设置一定时间或永久的偏移,则可以让数据更久的停留在在线表中。

时间偏移,单位可以是`s`、`m`、`h`、`d`,也可以是整数,单位为`ms`,也可以是`inf`,表示永不淘汰;如果是行数偏移,可以是整数,单位是`row`,也可以是`inf`,表示永不淘汰。两种偏移中,0均表示不偏移。

注意,我们只将偏移加在deploy的解析索引中,也就是新索引,它们并不是最终索引。最终索引的计算方式是,如果是创建索引,最终索引是`解析索引 + 偏移`;如果是更新索引,最终索引是`merge(旧索引, 新索引 + 偏移)`。

由于索引的时间ttl以`min`为单位,我们会在内部将时间偏移转换为`min`,并且取上界。比如,新索引ttl是abs 2min,加上偏移20s,结果是`2min + ub(20s) = 3min`,然后和旧索引1min取上界,最终索引ttl是`max(1min, 3min) = 3min`。

**Example**
```sql
DEPLOY demo OPTIONS(RANGE_BIAS="inf", ROWS_BIAS="inf") SELECT t1.col1, t2.col2, sum(col4) OVER w1 as w1_col4_sum FROM t1 LAST JOIN t2 ORDER BY t2.col3 ON t1.col2 = t2.col2
WINDOW w1 AS (PARTITION BY t1.col2 ORDER BY t1.col3 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
```

## 相关SQL

[USE DATABASE](../ddl/USE_DATABASE_STATEMENT.md)
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/quickstart/sdk/java_sdk.md
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ try {
"(PARTITION BY %s.c1 ORDER BY %s.c7 ROWS_RANGE BETWEEN 2d PRECEDING AND CURRENT ROW);", table,
table, table);
// 上线一个Deployment
String deploySql = String.format("DEPLOY %s %s", deploymentName, selectSql);
String deploySql = String.format("DEPLOY %s OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') %s", deploymentName, selectSql);
// set return null rs, don't check the returned value, it's false
state.execute(deploySql);
} catch (Exception e) {
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/use_case/JD_recommendation.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ bash train_deepfm.sh $demodir/feature_preprocess/out
```sql
-- OpenMLDB CLI
USE JD_db;
DEPLOY demo <SQL>;
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') <SQL>;
```
也可以在 Docker 容器内直接运行部署脚本:
Expand Down
6 changes: 5 additions & 1 deletion docs/zh/use_case/taxi_tour_duration_prediction.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
--OpenMLDB CLI
USE demo_db;
SET @@execute_mode='online';
DEPLOY demo SELECT trip_duration, passenger_count,
DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
Expand All @@ -167,6 +167,10 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);
```

```{note}
此处DEPLOY包含BIAS OPTIONS,是因为导入在线存储的数据文件不会更新,对于当前时间来讲,可能会超过DEPLOY后的表索引的时间TTL,导致表淘汰掉这些数据。时间淘汰,只看每个索引的ts列和ttl,只要数据中该列的值<(当前时间-abs_ttl),在该索引上就会被淘汰,与其他因素无关,各个索引也互相不影响。如果你的数据不是实时产生的新timestamp,也需要考虑带上BIAS OPTIONS。
```

### 步骤 7:导入在线数据

首先,请切换到**在线**执行模式。接着在在线模式下,导入样例数据 `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` 作为在线数据,用于在线特征计算。
Expand Down

0 comments on commit 3437741

Please sign in to comment.