model.py
import torch
from langchain.llms.base import LLM
from llama_index import SimpleDirectoryReader, GPTListIndex, PromptHelper
from llama_index import LLMPredictor, GPTSimpleVectorIndex, ServiceContext
from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig, LlamaTokenizer
from transformers import pipeline
from typing import Optional, List, Mapping, Any


class CustomLLM(LLM):
    async def _acall(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        pass

    model_name = 'author/model_name'
    generation_config = GenerationConfig(
        temperature=0.2,
        top_p=0.85,
        top_k=2,
        num_beams=4,
        bos_token_id=1,
        eos_token_id=2,
        pad_token_id=0,
        max_new_tokens=2500,  # max_length = max_new_tokens + input_sequence
        min_new_tokens=1,     # min_length = min_new_tokens + input_sequence
    )
    model: Optional[LlamaForCausalLM]
    tokenizer: Optional[LlamaTokenizer]
    device = 'cuda'
    # pipeline = pipeline("text-generation",
    #                     model=model,
    #                     tokenizer=tokenizer,
    #                     device=device)

    def __init__(self, mod, token, gen_config, device, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.generation_config = gen_config
        self.model = mod
        self.tokenizer = token
        self.device = device

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        return self.doCall(prompt=prompt, stop=stop)

    def doCall(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        print('# start _call')
        print('prompt: \n', prompt)
        # response = self.pipeline(prompt, max_new_tokens=2500)[0]["generated_text"]
        # print(response[prompt_length:])
        # tokenize the prompt
        print('# break the prompt into tokens')
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        # produce output tokens
        # generate_ids = model.generate(input_ids, max_new_tokens=2500, do_sample=True, top_k=30, top_p=0.85,
        #                               temperature=0.5, repetition_penalty=1., eos_token_id=2, bos_token_id=1,
        #                               pad_token_id=0)
        print('# generate...')
        generate_ids = self.model.generate(input_ids=input_ids,
                                           generation_config=self.generation_config)
        # decode tokens into a string response
        print('# decode tokens into a string response')
        output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True,
                                             clean_up_tokenization_spaces=False)[0]
        print('output: \n' + output)
        # slice off the prompt so only the newly generated text remains
        print('# slice the output, keep only the newly generated text')
        response = output[len(prompt):]
        return response

    def doPipelineCall(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        pass

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"


class Llama7bHFLLM(LLM):
    model_name = "decapoda-research/llama-7b-hf"
    num_output = 256
    pipeline = pipeline("text-generation", model=model_name, device="cuda:0",
                        model_kwargs={"torch_dtype": torch.bfloat16})

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=self.num_output)[0]["generated_text"]
        # only return newly generated tokens
        return response[prompt_length:]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"