# Settings handed to the text-generation-inference (TGI) container below as
# the MODEL_ID, NUM_SHARD, MAX_INPUT_LENGTH and MAX_TOTAL_TOKENS environment
# variables.
# NOTE(review): the two length limits are presumably token counts interpreted
# by the TGI launcher -- confirm against the launcher docs for image 1.1.0.
model=HuggingFaceM4/idefics-9b-instruct
num_shard=1
max_input_length=1562
max_total_tokens=2048

# Start the TGI 1.1.0 image with every GPU exposed and container port 80
# published on host port 8080. `-ti` keeps the container attached to this
# terminal, so the command stays in the foreground until the server stops.
# Expansions are quoted so an overridden value can never be word-split or
# glob-expanded into extra docker arguments.
sudo docker run --gpus all -ti -p 8080:80 \
  -e MODEL_ID="$model" \
  -e NUM_SHARD="$num_shard" \
  -e MAX_INPUT_LENGTH="$max_input_length" \
  -e MAX_TOTAL_TOKENS="$max_total_tokens" \
  ghcr.io/huggingface/text-generation-inference:1.1.0
# Send a test request
# POST a JSON body to the /generate endpoint on local port 8080. The prompt
# embeds a markdown image link between <fake_token_around_image> markers,
# followed by a question and an <end_of_utterance> marker; the "parameters"
# object sets temperature, top_p and max_new_tokens for the request.
curl 127.0.0.1:8080/generate \
  --request POST \
  --header 'Content-Type: application/json' \
  --data '{"inputs":"User:<fake_token_around_image>![](https://m.media-amazon.com/images/I/51M87ywnihL._AC_SX679_.jpg)<fake_token_around_image>Can i charge my iphone with this cable?<end_of_utterance>\n","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}'