Mirror of https://github.com/exo-explore/exo.git (synced 2025-06-03 06:30:24 +00:00)
update examples: remove old llama3_distributed, add chatgpt_api
This commit is contained in:
parent 581856897a
commit 5a9f4ba5c1
examples/chatgpt_api.sh (new executable file, 39 lines)
@@ -0,0 +1,39 @@
# exo provides an API that aims to be a drop-in replacement for the ChatGPT API.
# This example shows how you can use the API, first without streaming and then with streaming.
# This works the same in a single-node setup and in a multi-node setup.
# You need to start exo before running this by running `python3 main.py`.

API_ENDPOINT="http://${API_ENDPOINT:-$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1):8000}"
MODEL="llama-3.1-8b"
PROMPT="What is the meaning of exo?"
TEMPERATURE=0.7

echo ""
echo ""
echo "--- Output without streaming:"
echo ""
curl "${API_ENDPOINT}/v1/chat/completions" --silent \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL}"'",
    "messages": [{"role": "user", "content": "'"${PROMPT}"'"}],
    "temperature": '"${TEMPERATURE}"'
  }'

echo ""
echo ""
echo "--- Output with streaming:"
echo ""
curl "${API_ENDPOINT}/v1/chat/completions" --silent \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL}"'",
    "messages": [{"role": "user", "content": "'"${PROMPT}"'"}],
    "temperature": '"${TEMPERATURE}"',
    "stream": true
  }' | while read -r line; do
    if [[ $line == data:* ]]; then
      content=$(echo "$line" | sed 's/^data: //')
      echo "$content" | jq -r '.choices[].delta.content' --unbuffered | tr -d '\n'
    fi
  done
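For reference, the same two requests can also be made from Python against the endpoint the script header describes. This is a minimal sketch, not part of the commit: it assumes the `openai` Python package (v1.x) is installed, that exo's API is listening at `http://localhost:8000`, and that the server ignores the placeholder API key; adjust `base_url` to match `API_ENDPOINT` above.

from openai import OpenAI

# Point the standard OpenAI client at the local exo endpoint (assumed address).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# Non-streaming: the full completion comes back in a single response.
response = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[{"role": "user", "content": "What is the meaning of exo?"}],
    temperature=0.7,
)
print(response.choices[0].message.content)

# Streaming: iterate over the same server-sent chunks the curl | jq loop above parses.
stream = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[{"role": "user", "content": "What is the meaning of exo?"}],
    temperature=0.7,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()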
old llama3_distributed example (deleted file, 81 lines)
@@ -1,81 +0,0 @@
# In this example, a user is running a home cluster with 3 shards.
# They are prompting the cluster to generate a response to a question.
# The cluster is given the question, and the user is given the response.

from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
from exo.inference.shard import Shard
from exo.networking.peer_handle import PeerHandle
from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
from typing import List
import asyncio
import argparse
import uuid

models = {
  "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
  "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80)
}

path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
model_path = get_model_path(path_or_hf_repo)
tokenizer_config = {}
tokenizer = load_tokenizer(model_path, tokenizer_config)

# we intentionally leave out peer1 to demonstrate equality of nodes in exo.
# there is no "master" node in exo, all nodes are equal and can take on any role.
# peer1 = GRPCPeerHandle(
#   "node1",
#   "localhost:8080",
#   DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
# )
peer2 = GRPCPeerHandle("node2", "localhost:8081", DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0)))
shard = models[path_or_hf_repo]
request_id = str(uuid.uuid4())


async def run_prompt(prompt: str):
  if tokenizer.chat_template is None:
    tokenizer.chat_template = tokenizer.default_chat_template
  if (hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None):
    messages = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

  await peer2.connect()

  try:
    await peer2.send_prompt(shard, prompt, request_id)
  except Exception as e:
    print(e)

  import time
  # poll 10 times per second for result (even though generation is faster, any more than this it's not nice for the user)
  previous_length = 0
  n_tokens = 0
  start_time = time.perf_counter()
  while True:
    try:
      result, is_finished = await peer2.get_inference_result(request_id)
    except Exception as e:
      continue
    await asyncio.sleep(0.1)

    # Print the updated string in place
    updated_string = tokenizer.decode(result)
    n_tokens = len(result)
    print(updated_string[previous_length:], end='', flush=True)
    previous_length = len(updated_string)

    if is_finished:
      print("\nDone")
      break
  end_time = time.perf_counter()
  print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")


if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Run prompt")
  parser.add_argument("--prompt", type=str, help="The prompt to run")
  args = parser.parse_args()

  asyncio.run(run_prompt(args.prompt))