-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathllama3_distributed.py
88 lines (76 loc) · 3.28 KB
/
llama3_distributed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# In this example, a user is running a home cluster with 3 shards.
# They are prompting the cluster to generate a response to a question.
# The cluster is given the question, and the user is given the response.
from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
from exo.inference.shard import Shard
from exo.networking.peer_handle import PeerHandle
from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
from typing import List
import asyncio
import argparse
import uuid
# Number of transformer layers for each supported Llama 3 checkpoint,
# keyed by its Hugging Face repo id.
_LLAMA3_LAYER_COUNTS = {
    "mlx-community/Meta-Llama-3-8B-Instruct-4bit": 32,
    "mlx-community/Meta-Llama-3-70B-Instruct-4bit": 80,
}

# One Shard per supported model. start_layer and end_layer are both 0 here;
# only the model id and the total layer count differ between entries.
models = {
    model_id: Shard(
        model_id=model_id,
        start_layer=0,
        end_layer=0,
        n_layers=n_layers,
    )
    for model_id, n_layers in _LLAMA3_LAYER_COUNTS.items()
}

# The model this example prompts, and the tokenizer loaded from its path.
path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
model_path = get_model_path(path_or_hf_repo)
tokenizer_config = {}
tokenizer = load_tokenizer(model_path, tokenizer_config)

# peer1 is deliberately left out to demonstrate that all exo nodes are equal:
# there is no "master" node, and any node can take on any role.
# peer1 = GRPCPeerHandle(
#     "node1",
#     "localhost:8080",
#     DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
# )
peer2 = GRPCPeerHandle(
    "node2",
    "localhost:8081",
    DeviceCapabilities(
        model="placeholder",
        chip="placeholder",
        memory=0,
        flops=DeviceFlops(fp32=0, fp16=0, int8=0),
    ),
)

# Shard description and unique request id used for the single prompt below.
shard = models[path_or_hf_repo]
request_id = str(uuid.uuid4())
async def run_prompt(prompt: str):
    """Send ``prompt`` to ``peer2`` and stream the generated text to stdout.

    The prompt is wrapped with the tokenizer's chat template when one is
    available, sent to the peer under the module-level ``request_id``, and
    the peer is then polled for the inference result until it reports that
    generation is finished. Text decoded since the previous poll is printed
    as it arrives, followed by a token-throughput summary.

    Args:
        prompt: The raw user prompt.
    """
    import time

    # Poll 10 times per second: generation may be faster, but updating the
    # terminal more often than this is not pleasant for the user.
    poll_interval = 0.1

    # Fall back to the tokenizer's default chat template when none is set.
    # getattr is used because a tokenizer may not expose these attributes at
    # all; in that case the prompt is sent unformatted instead of crashing.
    if getattr(tokenizer, "chat_template", None) is None:
        tokenizer.chat_template = getattr(tokenizer, "default_chat_template", None)
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
        messages = [{"role": "user", "content": prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    await peer2.connect()

    try:
        await peer2.send_prompt(shard, prompt, request_id)
    except Exception as e:
        # Best effort: report the failure and still poll for a result below.
        print(e)

    previous_length = 0
    n_tokens = 0
    start_time = time.perf_counter()
    while True:
        try:
            result, is_finished = await peer2.get_inference_result(request_id)
        except Exception:
            # Keep retrying, but wait one poll interval first so a failing
            # peer does not turn this loop into a busy spin.
            await asyncio.sleep(poll_interval)
            continue

        # Print only the text that is new since the previous poll.
        updated_string = tokenizer.decode(result)
        n_tokens = len(result)
        print(updated_string[previous_length:], end="", flush=True)
        previous_length = len(updated_string)

        if is_finished:
            print("\nDone")
            break
        await asyncio.sleep(poll_interval)

    elapsed = time.perf_counter() - start_time
    # Guard the rate against a zero-length interval.
    tokens_per_second = n_tokens / elapsed if elapsed > 0 else 0.0
    print(f"\nDone. Processed {n_tokens} tokens in {elapsed:.2f} seconds ({tokens_per_second:.2f} tokens/second)")
if __name__ == "__main__":
    # Script entry point: read the prompt from the command line and run it.
    parser = argparse.ArgumentParser(description="Run prompt")
    # --prompt is required; without it args.prompt would be None and that
    # None would be passed on to run_prompt as the prompt.
    parser.add_argument("--prompt", type=str, required=True, help="The prompt to run")
    args = parser.parse_args()
    asyncio.run(run_prompt(args.prompt))