Skip to content

Latest commit

 

History

History
26 lines (22 loc) · 774 Bytes

File metadata and controls

26 lines (22 loc) · 774 Bytes

Llama 2 inference

# Settings for the text-generation-inference container started below.
model=meta-llama/Llama-2-7b-chat-hf
token=hf_xxx # replace with your Hugging Face token, which must have access to the model repo
num_shard=1
max_input_length=1562
max_total_tokens=2048

# Start the server with all GPUs attached, publishing container port 80 on
# host port 8080. Every expansion is double-quoted so that a value containing
# whitespace or glob characters reaches the container as a single, unmodified
# argument.
docker run --gpus all -ti -p 8080:80 \
  -e MODEL_ID="$model" \
  -e HUGGING_FACE_HUB_TOKEN="$token" \
  -e NUM_SHARD="$num_shard" \
  -e MAX_INPUT_LENGTH="$max_input_length" \
  -e MAX_TOTAL_TOKENS="$max_total_tokens" \
  ghcr.io/huggingface/text-generation-inference:latest

Send a test request

# POST a JSON body (a Llama 2 chat-formatted prompt plus sampling parameters)
# to the /generate endpoint on local port 8080.
curl --request POST \
    --header 'Content-Type: application/json' \
    --data '{"inputs":"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n\nWhat is 10+10? [\/INST]","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \
    127.0.0.1:8080/generate