From 50750874614aaf58147a13ee7bac98b166ab8590 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:11:46 -0700
Subject: [PATCH 1/7] Fix command-line arguments being ignored

---
 modules/models_settings.py | 2 +-
 modules/shared.py          | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index ca87b44b3b..bc3ace6f9d 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -96,7 +96,7 @@ def update_model_parameters(state, initial=False):
             gpu_memories.append(value)
             continue
 
-        if initial and vars(shared.args)[element] != vars(shared.args_defaults)[element]:
+        if initial and element in shared.provided_arguments:
             continue
 
         # Setting null defaults
diff --git a/modules/shared.py b/modules/shared.py
index ab1dbe11c8..2c562d2bfb 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -1,4 +1,5 @@
 import argparse
+import sys
 from collections import OrderedDict
 from pathlib import Path
 
@@ -6,7 +7,6 @@
 
 from modules.logging_colors import logger
 
-
 # Model variables
 model = None
 tokenizer = None
@@ -187,6 +187,11 @@ def str2bool(v):
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
+provided_arguments = []
+for arg in sys.argv[1:]:
+    arg = arg.lstrip('-').replace('-', '_')
+    if hasattr(args, arg):
+        provided_arguments.append(arg)
 
 # Deprecation warnings
 for k in ['chat', 'notebook', 'no_stream']:

From 03dc69edc5436b9426238fa626212dcffd9d62a3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:12:19 -0700
Subject: [PATCH 2/7] ExLlama_HF (v1 and v2) prefix matching

---
 modules/exllama_hf.py   | 24 ++++++++++++++++++++----
 modules/exllamav2_hf.py | 25 ++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py
index 9e4701bf71..3245ac8703 100644
--- a/modules/exllama_hf.py
+++ b/modules/exllama_hf.py
@@ -77,17 +77,33 @@ def __call__(self, *args, **kwargs):
             seq = past_key_values + seq
 
         seq_tensor = torch.tensor(seq)
+        reset = True
 
         # Make the forward call
         if labels is None:
-            if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]):
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
+
+                if longest_prefix > 0:
+                    reset = False
+                    ex_cache.current_seq_len = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 1:
+                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
+
+            if reset:
                 ex_cache.current_seq_len = 0
-                self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True, lora=self.lora)
+                if len(seq_tensor) > 1:
+                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
 
-            logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache, lora=self.lora).to(input_ids.device)
+            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, lora=self.lora).to(input_ids.device)
         else:
             ex_cache.current_seq_len = 0
-            logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False, lora=self.lora)
+            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, lora=self.lora)
 
         if is_negative:
             self.past_seq_negative = seq_tensor
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 457942acf8..6542ede9c9 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -81,18 +81,33 @@ def __call__(self, *args, **kwargs):
             seq = past_key_values + seq
 
         seq_tensor = torch.tensor(seq)
+        reset = True
 
         # Make the forward call
         if labels is None:
-            if past_seq is None or not torch.equal(past_seq, seq_tensor[:-1]):
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
+
+                if longest_prefix > 0:
+                    reset = False
+                    ex_cache.current_seq_len = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 1:
+                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True)
+
+            if reset:
                 ex_cache.current_seq_len = 0
-                self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), ex_cache, preprocess_only=True)
+                if len(seq_tensor) > 1:
+                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True)
 
-            logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), ex_cache).to(input_ids.device)
+            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache).to(input_ids.device)
         else:
             ex_cache.current_seq_len = 0
-            # logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache, last_id_only=False)
-            logits = self.ex_model.forward(torch.tensor([seq], dtype=torch.long), ex_cache)
+            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False)
 
         if is_negative:
             self.past_seq_negative = seq_tensor

From e2fddd9584b96ac5fef7d7550e726c6cf359bc60 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:12:34 -0700
Subject: [PATCH 3/7] More robust autoscrolling (attempt)

---
 js/main.js | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/js/main.js b/js/main.js
index c93ae34246..f4e20f8906 100644
--- a/js/main.js
+++ b/js/main.js
@@ -113,7 +113,7 @@ let isScrolled = false;
 
 targetElement.addEventListener('scroll', function() {
     let diff = targetElement.scrollHeight - targetElement.clientHeight;
-    if(Math.abs(targetElement.scrollTop - diff) <= 1 || diff == 0) {
+    if(Math.abs(targetElement.scrollTop - diff) <= 10 || diff == 0) {
         isScrolled = false;
     } else {
         isScrolled = true;
@@ -161,7 +161,7 @@ let notebookScrolled = false;
 
 notebookElement.addEventListener('scroll', function() {
     let diff = notebookElement.scrollHeight - notebookElement.clientHeight;
-    if(Math.abs(notebookElement.scrollTop - diff) <= 1 || diff == 0) {
+    if(Math.abs(notebookElement.scrollTop - diff) <= 10 || diff == 0) {
         notebookScrolled = false;
     } else {
         notebookScrolled = true;
@@ -186,7 +186,7 @@ let defaultScrolled = false;
 
 defaultElement.addEventListener('scroll', function() {
     let diff = defaultElement.scrollHeight - defaultElement.clientHeight;
-    if(Math.abs(defaultElement.scrollTop - diff) <= 1 || diff == 0) {
+    if(Math.abs(defaultElement.scrollTop - diff) <= 10 || diff == 0) {
         defaultScrolled = false;
     } else {
         defaultScrolled = true;

From 13ac55fa1805d7f4b87a43eb04a47d0d8b5ee50d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:13:03 -0700
Subject: [PATCH 4/7] Reorder some functions

---
 modules/exllama.py       | 32 ++++++++++++++++----------------
 modules/exllamav2.py     | 32 ++++++++++++++++----------------
 modules/metadata_gguf.py |  2 +-
 modules/models.py        |  1 -
 4 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/modules/exllama.py b/modules/exllama.py
index 4253e6ca1b..cb92344e7c 100644
--- a/modules/exllama.py
+++ b/modules/exllama.py
@@ -85,6 +85,22 @@ def from_pretrained(self, path_to_model):
         result.generator = generator
         return result, result
 
+    def encode(self, string, **kwargs):
+        return self.tokenizer.encode(string, max_seq_len=self.model.config.max_seq_len, add_bos=True)
+
+    def decode(self, ids, **kwargs):
+        if isinstance(ids, list):
+            ids = torch.tensor([ids])
+        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
+            ids = ids.view(1, -1)
+
+        return self.tokenizer.decode(ids)[0]
+
+    def get_logits(self, token_ids, **kwargs):
+        self.cache.current_seq_len = 0
+        self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True)
+        return self.model.forward(token_ids[:, -1:], self.cache, **kwargs).float().cpu()
+
     def generate_with_streaming(self, prompt, state):
 
         # The cache batch size must be 2 for CFG and 1 otherwise
@@ -200,19 +216,3 @@ def generate(self, prompt, state):
                 pass
 
         return output
-
-    def encode(self, string, **kwargs):
-        return self.tokenizer.encode(string, max_seq_len=self.model.config.max_seq_len, add_bos=True)
-
-    def decode(self, ids, **kwargs):
-        if isinstance(ids, list):
-            ids = torch.tensor([ids])
-        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
-            ids = ids.view(1, -1)
-
-        return self.tokenizer.decode(ids)[0]
-
-    def get_logits(self, token_ids, **kwargs):
-        self.cache.current_seq_len = 0
-        self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True)
-        return self.model.forward(token_ids[:, -1:], self.cache, **kwargs).float().cpu()
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 605a09275e..be5f47e4af 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -62,6 +62,22 @@ def from_pretrained(self, path_to_model):
         result.generator = generator
         return result, result
 
+    def encode(self, string, **kwargs):
+        return self.tokenizer.encode(string, add_bos=True)
+
+    def decode(self, ids, **kwargs):
+        if isinstance(ids, list):
+            ids = torch.tensor([ids])
+        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
+            ids = ids.view(1, -1)
+
+        return self.tokenizer.decode(ids)[0]
+
+    def get_logits(self, token_ids, **kwargs):
+        self.cache.current_seq_len = 0
+        self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True)
+        return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, **kwargs).float().cpu()
+
     def generate_with_streaming(self, prompt, state):
         settings = ExLlamaV2Sampler.Settings()
         settings.temperature = state['temperature']
@@ -114,19 +130,3 @@ def generate(self, prompt, state):
                 pass
 
         return output
-
-    def encode(self, string, **kwargs):
-        return self.tokenizer.encode(string, add_bos=True)
-
-    def decode(self, ids, **kwargs):
-        if isinstance(ids, list):
-            ids = torch.tensor([ids])
-        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
-            ids = ids.view(1, -1)
-
-        return self.tokenizer.decode(ids)[0]
-
-    def get_logits(self, token_ids, **kwargs):
-        self.cache.current_seq_len = 0
-        self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True)
-        return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, **kwargs).float().cpu()
diff --git a/modules/metadata_gguf.py b/modules/metadata_gguf.py
index f633d70c36..f5fa3ce24e 100644
--- a/modules/metadata_gguf.py
+++ b/modules/metadata_gguf.py
@@ -70,7 +70,7 @@ def load_metadata(fname):
     GGUF_VERSION = struct.unpack("
Date: Tue, 19 Sep 2023 13:13:13 -0700
Subject: [PATCH 5/7] Lint

---
 modules/chat.py            | 2 +-
 modules/metadata_gguf.py   | 2 +-
 modules/text_generation.py | 4 ++--
 modules/ui_file_saving.py  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 37fa25f386..e3745d0b9c 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -530,7 +530,7 @@ def load_character_memoized(character, name1, name2, instruct=False):
 
 
 def upload_character(file, img, tavern=False):
-    decoded_file = file if type(file) == str else file.decode('utf-8')
+    decoded_file = file if isinstance(file, str) else file.decode('utf-8')
     try:
         data = json.loads(decoded_file)
     except:
diff --git a/modules/metadata_gguf.py b/modules/metadata_gguf.py
index f5fa3ce24e..0ea41a2a2d 100644
--- a/modules/metadata_gguf.py
+++ b/modules/metadata_gguf.py
@@ -71,7 +71,7 @@ def load_metadata(fname):
     ti_data_count = struct.unpack("
Date: Tue, 19 Sep 2023 13:13:24 -0700
Subject: [PATCH 6/7] Trim model path if using absolute path

---
 server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server.py b/server.py
index ac3bcec1c3..fc99ef7232 100644
--- a/server.py
+++ b/server.py
@@ -208,6 +208,7 @@ def create_interface():
         p = Path(shared.model_name)
         if p.exists():
             model_name = p.parts[-1]
+            shared.model_name = model_name
         else:
             model_name = shared.model_name

From 029da9563f5fc8602dd03724044526f09849b00b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Sep 2023 14:14:40 -0700
Subject: [PATCH 7/7] Avoid redundant function call in llamacpp_hf

---
 modules/llamacpp_hf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 00da92ed10..3cb5df1cc2 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -131,9 +131,10 @@ def __call__(self, *args, **kwargs):
                     longest_prefix = min_length
 
                 if longest_prefix > 0:
-                    self.model.n_tokens = longest_prefix
-                    self.model.eval(seq[longest_prefix:])
                     reset = False
+                    self.model.n_tokens = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 0:
+                        self.model.eval(seq[longest_prefix:])
 
             if reset:
                 self.model.reset()
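
Note on [PATCH 2/7] and [PATCH 7/7]: both reuse the existing cache up to the longest
common prefix between the previously evaluated token sequence and the new one, and
only re-evaluate the tokens after the first mismatch. A minimal standalone sketch of
that comparison in plain PyTorch (the function name and example tensors below are
illustrative only, not part of the repository):

import torch

def longest_common_prefix(past_seq: torch.Tensor, seq: torch.Tensor) -> int:
    # Compare only the overlapping region of the two 1-D token tensors.
    min_length = min(past_seq.shape[0], seq.shape[0])
    # Positions where the cached sequence and the new sequence disagree.
    indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq[:min_length]))
    # First mismatch, or the full overlap if they agree everywhere.
    return indices[0].item() if len(indices) > 0 else min_length

# Example: the cache can be reused up to index 3; only tokens from position 3
# onward need another forward pass.
past = torch.tensor([1, 2, 3, 4, 5])
new = torch.tensor([1, 2, 3, 9, 5, 6])
print(longest_common_prefix(past, new))  # prints 3

In the patches above, this index is what gets assigned to ex_cache.current_seq_len
(exllama_hf.py / exllamav2_hf.py) or self.model.n_tokens (llamacpp_hf.py) before the
next forward call.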