From 28d44968da117435bbdeb697bbd52a4a27c44ba7 Mon Sep 17 00:00:00 2001 From: Kenichi Kamiya Date: Mon, 17 May 2021 22:10:33 +0900 Subject: [PATCH] Implement `ULID.normalize` and `ULID.normalized?` Resolves #150 This is a simple solution for following issues * #78 * #57 * #143 * https://github.com/ulid/spec/pull/57 * https://github.com/ulid/spec/issues/3 --- README.md | 31 +++++++++++- lib/ulid.rb | 21 ++++++++ lib/ulid/crockford_base32.rb | 15 ++++++ sig/ulid.rbs | 27 +++++++++++ test/core/test_ulid_class.rb | 92 ++++++++++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 025920b0..3f516db9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Instead, herein is proposed ULID: - 1.21e+24 unique ULIDs per millisecond - Lexicographically sortable! - Canonically encoded as a 26 character string, as opposed to the 36 character UUID -- Uses [Crockford's base32](https://www.crockford.com/base32.html) for better efficiency and readability (5 bits per character) # See also exists issues in [Note](#note) +- Uses [Crockford's base32](https://www.crockford.com/base32.html) for better efficiency and readability (5 bits per character) - Case insensitive - No special characters (URL safe) - Monotonic sort order (correctly detects and handles the same millisecond) @@ -326,6 +326,34 @@ ULID.sample(10, period: ulid1.to_time..ulid2.to_time) # ULID(2021-04-28 15:05:06.808 UTC: 01F4CG68ZRST94T056KRZ5K9S4)] ``` +### ULID specification ambiguity around orthographical variants of the format + +Unfortunately, the [current ULID spec](https://github.com/ulid/spec/tree/d0c7170df4517939e70129b4d6462cc162f2d5bf#universally-unique-lexicographically-sortable-identifier) leaves room for `orthographical variants of the format`. + +>Uses Crockford's base32 for better efficiency and readability (5 bits per character) + +The original `Crockford's base32` maps `I`, `L` to `1`, `O` to `0`. 
+It also accepts hyphens (`-`) inserted at arbitrary positions. +Whether these patterns are accepted differs between implementations. + +The current parser/validator/matcher aims to cover a `subset of Crockford's base32`. +I have suggested clarifying this in [ulid/spec#57](https://github.com/ulid/spec/pull/57). + +>Case insensitive + +I can understand that this might be considered in actual use cases. +But it is a controversial point, which is being discussed in [ulid/spec#3](https://github.com/ulid/spec/issues/3). + +Be that as it may, this gem provides an API for handling these nasty possibilities. + +`ULID.normalize` and `ULID.normalized?` + +```ruby +ULID.normalize('-olarz3-noekisv4rrff-q6ig5fav--') #=> "01ARZ3N0EK1SV4RRFFQ61G5FAV" +ULID.normalized?('-olarz3-noekisv4rrff-q6ig5fav--') #=> false +ULID.normalized?('01ARZ3N0EK1SV4RRFFQ61G5FAV') #=> true +``` + ### UUIDv4 converter for migration use-cases `ULID.from_uuidv4` and `ULID#to_uuidv4` is the converter. @@ -418,4 +446,3 @@ The results are not something to be proud of. ## Note - Another choices for sortable and randomness IDs, [UUIDv6, UUIDv7, UUIDv8 might be the one. (But they are still in draft state)](https://www.ietf.org/archive/id/draft-peabody-dispatch-new-uuid-format-01.html), I will track them in [ruby-ulid#37](https://github.com/kachick/ruby-ulid/issues/37) -- Current parser/validator/matcher aims to cover `subset of Crockford's base32`. Suggesting it in [ulid/spec#57](https://github.com/ulid/spec/pull/57). 
Be that as it may, I might provide special handler or converter for the exception in [ruby-ulid#57](https://github.com/kachick/ruby-ulid/issues/57) and/or [ruby-ulid#78](https://github.com/kachick/ruby-ulid/issues/78) diff --git a/lib/ulid.rb b/lib/ulid.rb index 552e2fa4..4bd5a47c 100644 --- a/lib/ulid.rb +++ b/lib/ulid.rb @@ -259,6 +259,27 @@ def self.parse(string) from_integer(CrockfordBase32.decode(string)) end + # @param [String, #to_str] string + # @return [String] + # @raise [ParserError] if the given format is not correct for ULID specs, even after ignoring `orthographical variants of the format` + def self.normalize(string) + string = String.try_convert(string) + raise ArgumentError, 'ULID.normalize takes only strings' unless string + + normalized_in_crockford = CrockfordBase32.normalize(string) + # Ensure ULID correctness, because the CrockfordBase32 normalization alone does not guarantee the ULID format + parse(normalized_in_crockford).to_s + end + + # @return [Boolean] `true` only if `object` equals its own normalized form; `false` when it cannot be normalized + def self.normalized?(object) + normalized = normalize(object) + rescue StandardError + false + else + normalized == object + end + # @return [Boolean] def self.valid?(object) string = String.try_convert(object) diff --git a/lib/ulid/crockford_base32.rb b/lib/ulid/crockford_base32.rb index 65f2a053..6057b89c 100644 --- a/lib/ulid/crockford_base32.rb +++ b/lib/ulid/crockford_base32.rb @@ -51,6 +51,14 @@ class SetupError < ScriptError; end CROCKFORD_BASE32_CHAR_BY_N32_CHAR = N32_CHAR_BY_CROCKFORD_BASE32_CHAR.invert.freeze N32_CHAR_PATTERN = /[#{CROCKFORD_BASE32_CHAR_BY_N32_CHAR.keys.join}]/.freeze + VARIANT_BY_STANDARD = { + 'L' => '1', + 'I' => '1', + 'O' => '0', + '-' => '' + }.freeze + VARIANT_PATTERN = /[#{VARIANT_BY_STANDARD.keys.join}]/.freeze + # @api private # @param [String] string # @return [Integer] @@ -66,5 +74,12 @@ def self.encode(integer) n32encoded = integer.to_s(32) n32encoded.upcase.gsub(N32_CHAR_PATTERN, CROCKFORD_BASE32_CHAR_BY_N32_CHAR).rjust(ENCODED_LENGTH, '0') end + + # @api private + 
# @param [String] string + # @return [String] upcased string with `L`/`I` replaced by `1`, `O` by `0`, and `-` removed + def self.normalize(string) + string.upcase.gsub(VARIANT_PATTERN, VARIANT_BY_STANDARD) + end end end diff --git a/sig/ulid.rbs b/sig/ulid.rbs index a933b007..9ff8a688 100644 --- a/sig/ulid.rbs +++ b/sig/ulid.rbs @@ -43,12 +43,17 @@ class ULID < Object CROCKFORD_BASE32_CHAR_PATTERN: Regexp CROCKFORD_BASE32_CHAR_BY_N32_CHAR: Hash[String, String] N32_CHAR_PATTERN: Regexp + VARIANT_BY_STANDARD: Hash[String, String] + VARIANT_PATTERN: Regexp # A pribate API. Should not be used in your code. def self.encode: (Integer integer) -> String # A pribate API. Should not be used in your code. def self.decode: (String string) -> Integer + + # A private API. Should not be used in your code. + def self.normalize: (String string) -> String end class MonotonicGenerator @@ -317,6 +322,28 @@ class ULID < Object | (Integer number, ?period: period) -> Array[self] def self.valid?: (untyped) -> bool + # Returns the normalized string + # + # ```ruby + # ULID.normalize('-olarz3-noekisv4rrff-q6ig5fav--') #=> "01ARZ3N0EK1SV4RRFFQ61G5FAV" + # ULID.normalized?('-olarz3-noekisv4rrff-q6ig5fav--') #=> false + # ULID.normalized?('01ARZ3N0EK1SV4RRFFQ61G5FAV') #=> true + # ``` + # + # See also [ulid/spec#57](https://github.com/ulid/spec/pull/57) and [ulid/spec#3](https://github.com/ulid/spec/issues/3) + def self.normalize: (_ToStr string) -> String + + # Returns `true` if the given object is already a normalized string + # + # ```ruby + # ULID.normalize('-olarz3-noekisv4rrff-q6ig5fav--') #=> "01ARZ3N0EK1SV4RRFFQ61G5FAV" + # ULID.normalized?('-olarz3-noekisv4rrff-q6ig5fav--') #=> false + # ULID.normalized?('01ARZ3N0EK1SV4RRFFQ61G5FAV') #=> true + # ``` + # + # See also [ulid/spec#57](https://github.com/ulid/spec/pull/57) and [ulid/spec#3](https://github.com/ulid/spec/issues/3) + def self.normalized?: (untyped) -> bool + # Returns parsed ULIDs from given String for rough operations. 
# # ```ruby diff --git a/test/core/test_ulid_class.rb b/test/core/test_ulid_class.rb index 81cbcc79..0a8ed399 100644 --- a/test/core/test_ulid_class.rb +++ b/test/core/test_ulid_class.rb @@ -114,6 +114,98 @@ def test_valid? end end + def test_normalize + # This is the core of this feature + assert_equal(ULID.parse('01ARZ3N0EK1SV4RRFFQ61G5FAV'), ULID.parse(ULID.normalize('-OlARZ3-NoEKISV4rRFF-Q6iG5FAV--'))) + assert_equal(ULID.parse('01ARZ3N0EK1SV4RRFFQ61G5FAV').to_s, ULID.normalize('-olarz3-noekisv4rrff-q6ig5fav--')) + + normalized = '01ARZ3NDEKTSV4RRFFQ69G5FAV' + downcased = normalized.downcase + dup_downcased = downcased.dup + + assert(normalized.frozen?) + assert_not_same(normalized, ULID.normalize(normalized)) + + # This behavior is debatable: should it return a non-frozen string? + assert do + ULID.normalize(normalized).frozen? + end + + # Ensure the given string is not modified by the parser + assert_equal(false, downcased.frozen?) + assert_not_same(downcased, ULID.normalize(downcased)) + assert_equal(dup_downcased, downcased) + + assert_equal(normalized, ULID.normalize(downcased)) + assert_instance_of(String, ULID.normalize(downcased)) + + # This encoding handling is debatable: should it return the original encoding? 
+ assert_equal(Encoding::UTF_8, downcased.encoding) + assert_equal(Encoding::US_ASCII, ULID.normalize(downcased).encoding) + + [ + '', + "01ARZ3NDEKTSV4RRFFQ69G5FAV\n", + '01ARZ3NDEKTSV4RRFFQ69G5FAU', + '01ARZ3NDEKTSV4RRFFQ69G5FA', + '80000000000000000000000000' + ].each do |invalid| + err = assert_raises(ULID::ParserError) do + ULID.normalize(invalid) + end + assert_match(/does not match to/, err.message) + end + + ULID.sample(1000).each do |sample| + assert_equal(sample.to_s, ULID.normalize(sample.to_s)) + assert_equal(sample.to_s, ULID.normalize(sample.to_s.downcase)) + end + + assert_raises(ArgumentError) do + ULID.normalize + end + + [nil, 42, normalized.to_sym, BasicObject.new, Object.new, ULID.parse(normalized)].each do |evil| + err = assert_raises(ArgumentError) do + ULID.normalize(evil) + end + assert_equal('ULID.normalize takes only strings', err.message) + end + end + + def test_normalized? + nasty = '-olarz3-noekisv4rrff-q6ig5fav--' + assert_equal(false, ULID.normalized?(nasty)) + assert_equal(true, ULID.normalized?(ULID.normalize(nasty))) + + normalized = '01ARZ3NDEKTSV4RRFFQ69G5FAV' + assert_equal(true, ULID.normalized?(normalized)) + assert_equal(false, ULID.normalized?(normalized.downcase)) + + [ + '', + "01ARZ3NDEKTSV4RRFFQ69G5FAV\n", + '01ARZ3NDEKTSV4RRFFQ69G5FAU', + '01ARZ3NDEKTSV4RRFFQ69G5FA', + '80000000000000000000000000' + ].each do |invalid| + assert_equal(false, ULID.normalized?(invalid)) + end + + ULID.sample(1000).each do |sample| + assert_equal(true, ULID.normalized?(sample.to_s)) + assert_equal(false, ULID.normalized?(sample.to_s.downcase)) + end + + assert_raises(ArgumentError) do + ULID.normalized? 
+ end + + [nil, 42, normalized.to_sym, BasicObject.new, Object.new, ULID.parse(normalized)].each do |evil| + assert_equal(false, ULID.normalized?(evil)) + end + end + def test_range time_has_more_value_than_milliseconds1 = Time.at(946684800, Rational('123456.789')) # 2000-01-01 00:00:00.123456789 UTC time_has_more_value_than_milliseconds2 = Time.at(1620045632, Rational('123456.789')) # 2021-05-03 12:40:32.123456789 UTC