From 11554f76a9dcd798b875c618c6cfd7bcd911c167 Mon Sep 17 00:00:00 2001 From: David Koski Date: Thu, 14 Mar 2024 09:00:04 -0700 Subject: [PATCH 1/2] switch swift-transformers to main, remove some workarounds - swift-transformers is getting a lot of updates and fixes, let's track main for now - remove some workarounds that are no longer needed - https://github.com/huggingface/swift-transformers/issues/63 --- Libraries/LLM/Tokenizer.swift | 41 ------------------- mlx-swift-examples.xcodeproj/project.pbxproj | 4 +- .../xcshareddata/swiftpm/Package.resolved | 4 +- 3 files changed, 4 insertions(+), 45 deletions(-) diff --git a/Libraries/LLM/Tokenizer.swift b/Libraries/LLM/Tokenizer.swift index 0159bec..72d7960 100644 --- a/Libraries/LLM/Tokenizer.swift +++ b/Libraries/LLM/Tokenizer.swift @@ -67,54 +67,13 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok tokenizerConfig = Config(dictionary) } - // workaround: some merges can't be split on space in BPETokenizer - if let tokenizerClass = tokenizerConfig.tokenizerClass?.stringValue { - switch tokenizerClass { - case "T5Tokenizer": - break - default: - tokenizerData = discardUnhandledMerges(tokenizerData: tokenizerData) - } - } - let impl = try PreTrainedTokenizer( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) return Tokenizer(tokenizer: impl, tokenizerConfig: tokenizerConfig) } -public func discardUnhandledMerges(tokenizerData: Config) -> Config { - // see https://github.com/ml-explore/mlx-swift-examples/issues/1 - // and https://github.com/huggingface/swift-transformers/issues/51 - - if let model = tokenizerData.model { - if let merges = model.dictionary["merges"] as? 
[String] { - // discard any merges that can't be split on a space - // (required by BPETokenizer) - let newMerges = - merges - .filter { - $0.split(separator: " ").count == 2 - } - - if newMerges.count != merges.count { - var newModel = model.dictionary - newModel["merges"] = newMerges - - var newTokenizerData = tokenizerData.dictionary - newTokenizerData["model"] = newModel - - return Config(newTokenizerData) - } - } - } - - return tokenizerData -} - /// overrides for TokenizerModel/knownTokenizers let replacementTokenizers = [ - "CodeLlamaTokenizer": "LlamaTokenizer", - "GemmaTokenizer": "PreTrainedTokenizer", "Qwen2Tokenizer": "PreTrainedTokenizer", ] diff --git a/mlx-swift-examples.xcodeproj/project.pbxproj b/mlx-swift-examples.xcodeproj/project.pbxproj index e680b1d..97f0715 100644 --- a/mlx-swift-examples.xcodeproj/project.pbxproj +++ b/mlx-swift-examples.xcodeproj/project.pbxproj @@ -2220,8 +2220,8 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/huggingface/swift-transformers"; requirement = { - kind = upToNextMajorVersion; - minimumVersion = 0.1.2; + branch = main; + kind = branch; }; }; C392736E2B60699100368D5D /* XCRemoteSwiftPackageReference "swift-argument-parser" */ = { diff --git a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 4b4f79e..75af8d9 100644 --- a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/huggingface/swift-transformers", "state" : { - "revision" : "564442fba36b0b694d730a62d0593e5f54043b55", - "version" : "0.1.2" + "branch" : "main", + "revision" : "24605a8c0cc974bec5b94a6752eb687bae77db31" } } ], From 9b8713b50f99536f4df4badb913cdc75990676a5 Mon Sep 17 00:00:00 
2001 From: David Koski Date: Thu, 14 Mar 2024 09:59:51 -0700 Subject: [PATCH 2/2] swift-format --- Libraries/LLM/Tokenizer.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Libraries/LLM/Tokenizer.swift b/Libraries/LLM/Tokenizer.swift index 72d7960..100b5d2 100644 --- a/Libraries/LLM/Tokenizer.swift +++ b/Libraries/LLM/Tokenizer.swift @@ -75,5 +75,5 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok /// overrides for TokenizerModel/knownTokenizers let replacementTokenizers = [ - "Qwen2Tokenizer": "PreTrainedTokenizer", + "Qwen2Tokenizer": "PreTrainedTokenizer" ]