forked from microsoft/BitNet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_inference.py
56 lines (51 loc) · 2.42 KB
/
run_inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import sys
import signal
import platform
import argparse
import subprocess
def run_command(command, shell=False):
"""Run a system command and ensure it succeeds."""
try:
subprocess.run(command, shell=shell, check=True)
except subprocess.CalledProcessError as e:
print(f"Error occurred while running command: {e}")
sys.exit(1)
def run_inference():
build_dir = "build"
if platform.system() == "Windows":
main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
if not os.path.exists(main_path):
main_path = os.path.join(build_dir, "bin", "llama-cli")
else:
main_path = os.path.join(build_dir, "bin", "llama-cli")
command = [
f'{main_path}',
'-m', args.model,
'-n', str(args.n_predict),
'-t', str(args.threads),
'-p', args.prompt,
'-ngl', '0',
'-c', str(args.ctx_size),
'--temp', str(args.temperature),
"-b", "1",
]
if args.conversation:
command.append("-cnv")
run_command(command)
def signal_handler(sig, frame):
print("Ctrl+C pressed, exiting...")
sys.exit(0)
if __name__ == "__main__":
signal.signal(signal.SIGINT, signal_handler)
# Usage: python run_inference.py -p "Microsoft Corporation is an American multinational corporation and technology company headquartered in Redmond, Washington."
parser = argparse.ArgumentParser(description='Run inference')
parser.add_argument("-m", "--model", type=str, help="Path to model file", required=False, default="models/bitnet_b1_58-3B/ggml-model-i2_s.gguf")
parser.add_argument("-n", "--n-predict", type=int, help="Number of tokens to predict when generating text", required=False, default=128)
parser.add_argument("-p", "--prompt", type=str, help="Prompt to generate text from", required=True)
parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
parser.add_argument("-c", "--ctx-size", type=int, help="Size of the prompt context", required=False, default=2048)
parser.add_argument("-temp", "--temperature", type=float, help="Temperature, a hyperparameter that controls the randomness of the generated text", required=False, default=0.8)
parser.add_argument("-cnv", "--conversation", action='store_true', help="Whether to enable chat mode or not (for instruct models.)")
args = parser.parse_args()
run_inference()