From f9f37a787e2b80cfef345f7ce6541d6720086950 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 17 Dec 2024 16:23:55 -0300 Subject: [PATCH 1/6] Make String and Stringliteral splitlines return List[StringSlice] Signed-off-by: martinvuyk --- stdlib/src/builtin/string_literal.mojo | 10 ++-- stdlib/src/collections/string.mojo | 8 ++-- stdlib/src/utils/string_slice.mojo | 65 ++++++++++++++++++++++---- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 581e21de5e..d0314a3554 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -743,10 +743,10 @@ struct StringLiteral( """ return str(self).split(sep, maxsplit) - fn splitlines(self, keepends: Bool = False) -> List[String]: - """Split the string literal at line boundaries. This corresponds to Python's - [universal newlines:]( - https://docs.python.org/3/library/stdtypes.html#str.splitlines) + fn splitlines(self, keepends: Bool = False) -> List[StaticString]: + """Split the string literal at line boundaries. This corresponds to + Python's [universal newlines:]( + https://docs.python.org/3/library/stdtypes.html#str.splitlines) `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. Args: @@ -755,7 +755,7 @@ struct StringLiteral( Returns: A List of Strings containing the input split by line boundaries. 
""" - return _to_string_list(self.as_string_slice().splitlines(keepends)) + return self.as_string_slice().splitlines(keepends) fn count(self, substr: String) -> Int: """Return the number of non-overlapping occurrences of substring diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index 06cd2afeb5..557c41d8a6 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -1882,10 +1882,12 @@ struct String( return output - fn splitlines(self, keepends: Bool = False) -> List[String]: + fn splitlines( + ref self, keepends: Bool = False + ) -> List[StringSlice[__origin_of(self)]]: """Split the string at line boundaries. This corresponds to Python's [universal newlines:]( - https://docs.python.org/3/library/stdtypes.html#str.splitlines) + https://docs.python.org/3/library/stdtypes.html#str.splitlines) `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. Args: @@ -1894,7 +1896,7 @@ struct String( Returns: A List of Strings containing the input split by line boundaries. """ - return _to_string_list(self.as_string_slice().splitlines(keepends)) + return self.as_string_slice().splitlines(keepends) fn replace(self, old: String, new: String) -> String: """Return a copy of the string with all occurrences of substring `old` diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index c465811f1b..287fe0857a 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -1047,24 +1047,21 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( offset += b_len return length != 0 - fn splitlines[ - O: ImmutableOrigin, // - ](self: StringSlice[O], keepends: Bool = False) -> List[StringSlice[O]]: + fn splitlines( + self: StringSlice, keepends: Bool = False + ) -> List[StringSlice[__type_of(self).origin]]: """Split the string at line boundaries. 
This corresponds to Python's [universal newlines:]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. - Parameters: - O: The immutable origin. - Args: keepends: If True, line breaks are kept in the resulting strings. Returns: A List of Strings containing the input split by line boundaries. """ - + alias O = __type_of(self).origin # highly performance sensitive code, benchmark before touching alias `\r` = UInt8(ord("\r")) alias `\n` = UInt8(ord("\n")) @@ -1133,7 +1130,7 @@ fn _to_string_list[ @always_inline -fn _to_string_list[ +fn to_string_list[ O: ImmutableOrigin, // ](items: List[StringSlice[O]]) -> List[String]: """Create a list of Strings **copying** the existing data. @@ -1158,7 +1155,32 @@ fn _to_string_list[ @always_inline -fn _to_string_list[ +fn to_string_list[ + O: MutableOrigin, // +](items: List[StringSlice[O]]) -> List[String]: + """Create a list of Strings **copying** the existing data. + + Parameters: + O: The origin of the data. + + Args: + items: The List of string slices. + + Returns: + The list of created strings. + """ + + fn unsafe_ptr_fn(v: StringSlice[O]) -> UnsafePointer[Byte]: + return v.unsafe_ptr() + + fn len_fn(v: StringSlice[O]) -> Int: + return v.byte_length() + + return _to_string_list[items.T, len_fn, unsafe_ptr_fn](items) + + +@always_inline +fn to_string_list[ O: ImmutableOrigin, // ](items: List[Span[Byte, O]]) -> List[String]: """Create a list of Strings **copying** the existing data. @@ -1182,6 +1204,31 @@ fn _to_string_list[ return _to_string_list[items.T, len_fn, unsafe_ptr_fn](items) +@always_inline +fn to_string_list[ + O: MutableOrigin, // +](items: List[Span[Byte, O]]) -> List[String]: + """Create a list of Strings **copying** the existing data. + + Parameters: + O: The origin of the data. + + Args: + items: The List of Bytes. + + Returns: + The list of created strings. 
+ """ + + fn unsafe_ptr_fn(v: Span[Byte, O]) -> UnsafePointer[Byte]: + return v.unsafe_ptr() + + fn len_fn(v: Span[Byte, O]) -> Int: + return len(v) + + return _to_string_list[items.T, len_fn, unsafe_ptr_fn](items) + + @always_inline fn _is_newline_char[ include_r_n: Bool = False From 8cae7a9543c9c6a7789ece2ceda7a933ebfcd2d7 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 17 Dec 2024 16:35:26 -0300 Subject: [PATCH 2/6] bring over the tests from string slice Signed-off-by: martinvuyk --- stdlib/test/collections/test_string.mojo | 65 +++++++++++++++--------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/stdlib/test/collections/test_string.mojo b/stdlib/test/collections/test_string.mojo index 4d9151b279..f65b7dde9d 100644 --- a/stdlib/test/collections/test_string.mojo +++ b/stdlib/test/collections/test_string.mojo @@ -849,58 +849,75 @@ def test_split(): def test_splitlines(): - alias L = List[String] + alias S = StringSlice[StaticConstantOrigin] + alias L = List[StringSlice[StaticConstantOrigin]] + + # FIXME: remove once StringSlice conforms to TestableCollectionElement + fn _assert_equal[ + O1: ImmutableOrigin, O2: ImmutableOrigin + ](l1: List[StringSlice[O1]], l2: List[StringSlice[O2]]) raises: + assert_equal(len(l1), len(l2)) + for i in range(len(l1)): + assert_equal(str(l1[i]), str(l2[i])) + + # FIXME: remove once StringSlice conforms to TestableCollectionElement + fn _assert_equal[ + O1: ImmutableOrigin + ](l1: List[StringSlice[O1]], l2: List[String]) raises: + assert_equal(len(l1), len(l2)) + for i in range(len(l1)): + assert_equal(str(l1[i]), l2[i]) + # Test with no line breaks - assert_equal(String("hello world").splitlines(), L("hello world")) + _assert_equal(S("hello world").splitlines(), L("hello world")) # Test with line breaks - assert_equal(String("hello\nworld").splitlines(), L("hello", "world")) - assert_equal(String("hello\rworld").splitlines(), L("hello", "world")) - assert_equal(String("hello\r\nworld").splitlines(), L("hello", 
"world")) + _assert_equal(S("hello\nworld").splitlines(), L("hello", "world")) + _assert_equal(S("hello\rworld").splitlines(), L("hello", "world")) + _assert_equal(S("hello\r\nworld").splitlines(), L("hello", "world")) # Test with multiple different line breaks - s1 = String("hello\nworld\r\nmojo\rlanguage\r\n") + s1 = S("hello\nworld\r\nmojo\rlanguage\r\n") hello_mojo = L("hello", "world", "mojo", "language") - assert_equal(s1.splitlines(), hello_mojo) - assert_equal( + _assert_equal(s1.splitlines(), hello_mojo) + _assert_equal( s1.splitlines(keepends=True), L("hello\n", "world\r\n", "mojo\r", "language\r\n"), ) # Test with an empty string - assert_equal(String("").splitlines(), L()) + _assert_equal(S("").splitlines(), L()) # test \v \f \x1c \x1d - s2 = String("hello\vworld\fmojo\x1clanguage\x1d") - assert_equal(s2.splitlines(), hello_mojo) - assert_equal( + s2 = S("hello\vworld\fmojo\x1clanguage\x1d") + _assert_equal(s2.splitlines(), hello_mojo) + _assert_equal( s2.splitlines(keepends=True), L("hello\v", "world\f", "mojo\x1c", "language\x1d"), ) # test \x1c \x1d \x1e - s3 = String("hello\x1cworld\x1dmojo\x1elanguage\x1e") - assert_equal(s3.splitlines(), hello_mojo) - assert_equal( + s3 = S("hello\x1cworld\x1dmojo\x1elanguage\x1e") + _assert_equal(s3.splitlines(), hello_mojo) + _assert_equal( s3.splitlines(keepends=True), L("hello\x1c", "world\x1d", "mojo\x1e", "language\x1e"), ) # test \x85 \u2028 \u2029 - var next_line = List[UInt8](0xC2, 0x85, 0) + var next_line = String(List[UInt8](0xC2, 0x85, 0)) """TODO: \\x85""" - var unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8, 0) + var unicode_line_sep = String(List[UInt8](0xE2, 0x80, 0xA8, 0)) """TODO: \\u2028""" - var unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9, 0) + var unicode_paragraph_sep = String(List[UInt8](0xE2, 0x80, 0xA9, 0)) """TODO: \\u2029""" for i in List(next_line, unicode_line_sep, unicode_paragraph_sep): - u = String(i[]) + u = i[] item = String("").join("hello", u, "world", u, "mojo", u, 
"language", u) - assert_equal(item.splitlines(), hello_mojo) - assert_equal( - item.splitlines(keepends=True), - L("hello" + u, "world" + u, "mojo" + u, "language" + u), - ) + s = StringSlice(item) + _assert_equal(s.splitlines(), hello_mojo) + items = List("hello" + u, "world" + u, "mojo" + u, "language" + u) + _assert_equal(s.splitlines(keepends=True), items) def test_isupper(): From 81ba2c1b4f6888e94b2ab62cd8ab31e7172a1a96 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 17 Dec 2024 16:41:38 -0300 Subject: [PATCH 3/6] bring over the tests from string slice Signed-off-by: martinvuyk --- stdlib/test/builtin/test_string_literal.mojo | 64 +++++++++++++++----- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/stdlib/test/builtin/test_string_literal.mojo b/stdlib/test/builtin/test_string_literal.mojo index 28ca91a1af..6575a9a699 100644 --- a/stdlib/test/builtin/test_string_literal.mojo +++ b/stdlib/test/builtin/test_string_literal.mojo @@ -430,42 +430,76 @@ def test_split(): def test_splitlines(): - alias L = List[String] + alias S = StringSlice[StaticConstantOrigin] + alias L = List[StringSlice[StaticConstantOrigin]] + + # FIXME: remove once StringSlice conforms to TestableCollectionElement + fn _assert_equal[ + O1: ImmutableOrigin, O2: ImmutableOrigin + ](l1: List[StringSlice[O1]], l2: List[StringSlice[O2]]) raises: + assert_equal(len(l1), len(l2)) + for i in range(len(l1)): + assert_equal(str(l1[i]), str(l2[i])) + + # FIXME: remove once StringSlice conforms to TestableCollectionElement + fn _assert_equal[ + O1: ImmutableOrigin + ](l1: List[StringSlice[O1]], l2: List[String]) raises: + assert_equal(len(l1), len(l2)) + for i in range(len(l1)): + assert_equal(str(l1[i]), l2[i]) + # Test with no line breaks - assert_equal("hello world".splitlines(), L("hello world")) + _assert_equal(S("hello world").splitlines(), L("hello world")) # Test with line breaks - assert_equal("hello\nworld".splitlines(), L("hello", "world")) - 
assert_equal("hello\rworld".splitlines(), L("hello", "world")) - assert_equal("hello\r\nworld".splitlines(), L("hello", "world")) + _assert_equal(S("hello\nworld").splitlines(), L("hello", "world")) + _assert_equal(S("hello\rworld").splitlines(), L("hello", "world")) + _assert_equal(S("hello\r\nworld").splitlines(), L("hello", "world")) # Test with multiple different line breaks - s1 = "hello\nworld\r\nmojo\rlanguage\r\n" + s1 = S("hello\nworld\r\nmojo\rlanguage\r\n") hello_mojo = L("hello", "world", "mojo", "language") - assert_equal(s1.splitlines(), hello_mojo) - assert_equal( + _assert_equal(s1.splitlines(), hello_mojo) + _assert_equal( s1.splitlines(keepends=True), L("hello\n", "world\r\n", "mojo\r", "language\r\n"), ) # Test with an empty string - assert_equal("".splitlines(), L()) + _assert_equal(S("").splitlines(), L()) # test \v \f \x1c \x1d - s2 = "hello\vworld\fmojo\x1clanguage\x1d" - assert_equal(s2.splitlines(), hello_mojo) - assert_equal( + s2 = S("hello\vworld\fmojo\x1clanguage\x1d") + _assert_equal(s2.splitlines(), hello_mojo) + _assert_equal( s2.splitlines(keepends=True), L("hello\v", "world\f", "mojo\x1c", "language\x1d"), ) # test \x1c \x1d \x1e - s3 = "hello\x1cworld\x1dmojo\x1elanguage\x1e" - assert_equal(s3.splitlines(), hello_mojo) - assert_equal( + s3 = S("hello\x1cworld\x1dmojo\x1elanguage\x1e") + _assert_equal(s3.splitlines(), hello_mojo) + _assert_equal( s3.splitlines(keepends=True), L("hello\x1c", "world\x1d", "mojo\x1e", "language\x1e"), ) + # test \x85 \u2028 \u2029 + var next_line = String(List[UInt8](0xC2, 0x85, 0)) + """TODO: \\x85""" + var unicode_line_sep = String(List[UInt8](0xE2, 0x80, 0xA8, 0)) + """TODO: \\u2028""" + var unicode_paragraph_sep = String(List[UInt8](0xE2, 0x80, 0xA9, 0)) + """TODO: \\u2029""" + + for i in List(next_line, unicode_line_sep, unicode_paragraph_sep): + u = i[] + item = String("").join("hello", u, "world", u, "mojo", u, "language", u) + s = StringSlice(item) + _assert_equal(s.splitlines(), hello_mojo) + 
items = List("hello" + u, "world" + u, "mojo" + u, "language" + u) + _assert_equal(s.splitlines(keepends=True), items) + def test_float_conversion(): assert_equal(("4.5").__float__(), 4.5) From 01dc2d47316204628365e5922e8c7ae3460b5034 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 17 Dec 2024 22:29:37 -0300 Subject: [PATCH 4/6] fix missing import Signed-off-by: martinvuyk --- stdlib/test/builtin/test_string_literal.mojo | 1 + 1 file changed, 1 insertion(+) diff --git a/stdlib/test/builtin/test_string_literal.mojo b/stdlib/test/builtin/test_string_literal.mojo index 6575a9a699..8c064c49af 100644 --- a/stdlib/test/builtin/test_string_literal.mojo +++ b/stdlib/test/builtin/test_string_literal.mojo @@ -22,6 +22,7 @@ from testing import ( assert_raises, assert_true, ) +from utils import StringSlice def test_add(): From b3f4924c1adef13d5983de38f012f720db9080c6 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 19 Dec 2024 09:31:03 -0300 Subject: [PATCH 5/6] add suggestions by @ConnorGray Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 65 ++++-------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 287fe0857a..01602f9061 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -1047,9 +1047,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( offset += b_len return length != 0 - fn splitlines( - self: StringSlice, keepends: Bool = False - ) -> List[StringSlice[__type_of(self).origin]]: + fn splitlines(self, keepends: Bool = False) -> List[Self]: """Split the string at line boundaries. This corresponds to Python's [universal newlines:]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) @@ -1061,12 +1059,11 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( Returns: A List of Strings containing the input split by line boundaries. 
""" - alias O = __type_of(self).origin # highly performance sensitive code, benchmark before touching alias `\r` = UInt8(ord("\r")) alias `\n` = UInt8(ord("\n")) - output = List[StringSlice[O]](capacity=128) # guessing + output = List[Self](capacity=128) # guessing var ptr = self.unsafe_ptr() var length = self.byte_length() var offset = 0 @@ -1096,7 +1093,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( eol_start += char_len var str_len = eol_start - offset + int(keepends) * eol_length - var s = StringSlice[O](ptr=ptr + offset, length=str_len) + var s = Self(ptr=ptr + offset, length=str_len) output.append(s) offset = eol_start + eol_length @@ -1131,11 +1128,12 @@ fn _to_string_list[ @always_inline fn to_string_list[ - O: ImmutableOrigin, // + mut: Bool, O: Origin[mut], // ](items: List[StringSlice[O]]) -> List[String]: """Create a list of Strings **copying** the existing data. Parameters: + mut: The mutability of the origin. O: The origin of the data. Args: @@ -1156,61 +1154,12 @@ fn to_string_list[ @always_inline fn to_string_list[ - O: MutableOrigin, // -](items: List[StringSlice[O]]) -> List[String]: - """Create a list of Strings **copying** the existing data. - - Parameters: - O: The origin of the data. - - Args: - items: The List of string slices. - - Returns: - The list of created strings. - """ - - fn unsafe_ptr_fn(v: StringSlice[O]) -> UnsafePointer[Byte]: - return v.unsafe_ptr() - - fn len_fn(v: StringSlice[O]) -> Int: - return v.byte_length() - - return _to_string_list[items.T, len_fn, unsafe_ptr_fn](items) - - -@always_inline -fn to_string_list[ - O: ImmutableOrigin, // -](items: List[Span[Byte, O]]) -> List[String]: - """Create a list of Strings **copying** the existing data. - - Parameters: - O: The origin of the data. - - Args: - items: The List of Bytes. - - Returns: - The list of created strings. 
- """ - - fn unsafe_ptr_fn(v: Span[Byte, O]) -> UnsafePointer[Byte]: - return v.unsafe_ptr() - - fn len_fn(v: Span[Byte, O]) -> Int: - return len(v) - - return _to_string_list[items.T, len_fn, unsafe_ptr_fn](items) - - -@always_inline -fn to_string_list[ - O: MutableOrigin, // + mut: Bool, O: Origin[mut], // ](items: List[Span[Byte, O]]) -> List[String]: """Create a list of Strings **copying** the existing data. Parameters: + mut: The mutability of the origin. O: The origin of the data. Args: From 8fc5fd92d112126323c8d4418a02283ff0bc7061 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 19 Dec 2024 16:33:38 -0300 Subject: [PATCH 6/6] add var to _to_string_list implementation Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 01602f9061..e954449222 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -1110,18 +1110,18 @@ fn _to_string_list[ len_fn: fn (T) -> Int, unsafe_ptr_fn: fn (T) -> UnsafePointer[Byte], ](items: List[T]) -> List[String]: - i_len = len(items) - i_ptr = items.unsafe_ptr() - out_ptr = UnsafePointer[String].alloc(i_len) + var i_len = len(items) + var i_ptr = items.unsafe_ptr() + var out_ptr = UnsafePointer[String].alloc(i_len) for i in range(i_len): - og_len = len_fn(i_ptr[i]) - f_len = og_len + 1 # null terminator - p = UnsafePointer[Byte].alloc(f_len) - og_ptr = unsafe_ptr_fn(i_ptr[i]) + var og_len = len_fn(i_ptr[i]) + var f_len = og_len + 1 # null terminator + var p = UnsafePointer[Byte].alloc(f_len) + var og_ptr = unsafe_ptr_fn(i_ptr[i]) memcpy(p, og_ptr, og_len) p[og_len] = 0 # null terminator - buf = String._buffer_type(ptr=p, length=f_len, capacity=f_len) + var buf = String._buffer_type(ptr=p, length=f_len, capacity=f_len) (out_ptr + i).init_pointee_move(String(buf^)) return List[String](ptr=out_ptr, length=i_len, capacity=i_len)