From f9d3133c4fd39e8a14329bbb98bb726283bccffd Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 10 Jan 2025 04:45:57 +0700 Subject: [PATCH] Improve perfomance of `reverse` function (#14025) * Improve perfomance of 'reverse' function Signed-off-by: Tai Le Manh * Apply sugestion change * Fix typo --------- Signed-off-by: Tai Le Manh --- datafusion/functions/Cargo.toml | 5 ++ datafusion/functions/benches/reverse.rs | 90 +++++++++++++++++++++ datafusion/functions/src/unicode/reverse.rs | 25 +++--- 3 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 datafusion/functions/benches/reverse.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index fd986c4be41c..c8025fb2d895 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -204,6 +204,11 @@ harness = false name = "strpos" required-features = ["unicode_expressions"] +[[bench]] +harness = false +name = "reverse" +required-features = ["unicode_expressions"] + [[bench]] harness = false name = "trunc" diff --git a/datafusion/functions/benches/reverse.rs b/datafusion/functions/benches/reverse.rs new file mode 100644 index 000000000000..c7c1ef8a8220 --- /dev/null +++ b/datafusion/functions/benches/reverse.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::OffsetSizeTrait; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::unicode; +use std::sync::Arc; + +fn create_args( + size: usize, + str_len: usize, + force_view_types: bool, +) -> Vec { + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + + vec![ColumnarValue::Array(string_array)] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ColumnarValue::Array(string_array)] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let reverse = unicode::reverse(); + for size in [1024, 4096] { + let str_len = 8; + + let args = create_args::(size, str_len, true); + c.bench_function( + format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + + let str_len = 32; + + let args = create_args::(size, str_len, true); + c.bench_function( + format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + + let args = create_args::(size, str_len, false); + c.bench_function( + format!("reverse_string [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 5ad347ed96c0..f07deda70e52 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -20,8 +20,7 @@ use std::sync::Arc; use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, - OffsetSizeTrait, + Array, ArrayRef, AsArray, GenericStringBuilder, OffsetSizeTrait, StringArrayType, }; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; @@ -105,8 +104,7 @@ impl ScalarUDFImpl for ReverseFunc { } } -/// Reverses the order of the characters in the string. -/// reverse('abcde') = 'edcba' +/// Reverses the order of the characters in the string `reverse('abcde') = 'edcba'`. /// The implementation uses UTF-8 code points as characters pub fn reverse(args: &[ArrayRef]) -> Result { if args[0].data_type() == &Utf8View { @@ -116,14 +114,23 @@ pub fn reverse(args: &[ArrayRef]) -> Result { } } -fn reverse_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( +fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( string_array: V, ) -> Result { - let result = ArrayIter::new(string_array) - .map(|string| string.map(|string: &str| string.chars().rev().collect::())) - .collect::>(); + let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), 1024); + + let mut reversed = String::new(); + for string in string_array.iter() { + if let Some(s) = string { + reversed.extend(s.chars().rev()); + builder.append_value(&reversed); + reversed.clear(); + } else { + builder.append_null(); + } + } - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } #[cfg(test)]