Skip to content

Commit

Permalink
Add host traces to high-level profiling (#577)
Browse files Browse the repository at this point in the history
  • Loading branch information
szutenberg committed Jan 8, 2025
1 parent fb0884c commit 9034631
Showing 1 changed file with 69 additions and 4 deletions.
73 changes: 69 additions & 4 deletions vllm/worker/hpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
###############################################################################

import gc
import gzip
import json
import os
import queue
import time
from typing import List, Optional, Set, Tuple, Type

import habana_frameworks.torch as htorch # noqa:F401
Expand Down Expand Up @@ -93,21 +97,82 @@ def __init__(
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
logger.info("Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir)

if os.getenv('VLLM_PROFILER_ENABLED') == 'full':
fn = self.full_trace_handler
with_stack = False
else:
fn = torch.profiler.tensorboard_trace_handler
with_stack = True
self.profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.HPU,
],
with_stack=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True))
with_stack=with_stack,
on_trace_ready=fn(torch_profiler_trace_dir, use_gzip=True))
else:
self.profiler = None

def full_trace_handler(self, dir_name, use_gzip=False):
    """Build an ``on_trace_ready`` callback that merges host-side events.

    The returned handler exports the PyTorch profiler trace to a temporary
    JSON file, appends the high-level host trace events collected by
    ``self.model_runner.profiler``, and writes the merged Chrome trace
    into *dir_name*.

    Args:
        dir_name: Output directory for the merged trace (created if
            missing).
        use_gzip: When True, the merged trace is gzip-compressed and the
            file name gains a ``.gz`` suffix.

    Returns:
        A callable suitable for ``torch.profiler.profile(on_trace_ready=...)``.
    """

    def handler_fn(prof) -> None:
        # exist_ok makes this a no-op when the directory already exists,
        # so no separate isdir() check is needed.
        try:
            os.makedirs(dir_name, exist_ok=True)
        except Exception as e:
            raise RuntimeError("Can't create directory: " +
                               dir_name) from e
        file_name = f"vllm.{time.time_ns()}.pt.trace.json"
        file_path = os.path.join(dir_name, file_name)
        # Export the device trace to an intermediate file, load it, and
        # remove the intermediate - only the merged trace is kept on disk.
        prof.export_chrome_trace(file_path)
        with open(file_path) as f:
            pytorch_trace = json.load(f)
        os.remove(file_path)
        # Chrome-trace timestamps are microseconds; convert the trace's
        # nanosecond base so host events can be rebased onto it.
        base = pytorch_trace['baseTimeNanoseconds'] / 1000
        events = self.model_runner.profiler.profiling_trace_events
        # Drain all queued host-side events. The try body is kept minimal
        # so a malformed event raises instead of silently ending the loop.
        while True:
            try:
                event_str = events.get_nowait()
            except queue.Empty:
                break
            # NOTE(review): assumes each queued entry carries exactly one
            # trailing separator character that must be stripped - confirm
            # against the producer in the high-level profiler.
            event = json.loads(event_str[:-1])
            event['ts'] = event['ts'] - base
            pytorch_trace['traceEvents'].append(event)

        # Metadata event naming the synthetic host-event process ("vLLM")
        # in the trace viewer.
        pytorch_trace['traceEvents'].append({
            "args": {
                "name": "vLLM"
            },
            "name": "process_name",
            "ph": "M",
            "pid": 1,
            "tid": 0,
            "ts": 0.0
        })
        if use_gzip:
            file_path = file_path + ".gz"
            with gzip.open(file_path, 'wt', encoding="ascii") as zipfile:
                json.dump(pytorch_trace, zipfile)
        else:
            with open(file_path, "w") as outfile:
                outfile.write(json.dumps(pytorch_trace))
        logger.info("Saved full profiling to %s", file_path)

    return handler_fn

def start_profile(self):
    """Start profiling: drain stale host events, then start the profiler.

    Any host-side trace events left over from a previous session are
    discarded so the new trace contains only events from this session.
    The drain itself is recorded as an 'internal'/'start_profiler' event.

    Raises:
        RuntimeError: If profiling was not enabled at construction time.
    """
    if self.profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    high_level_profiler = self.model_runner.profiler
    with high_level_profiler.record_event('internal', 'start_profiler'):
        # Discard leftover events from any earlier profiling session.
        # The try body holds only get_nowait() so nothing else can be
        # mistaken for end-of-queue.
        while True:
            try:
                high_level_profiler.profiling_trace_events.get_nowait()
            except queue.Empty:
                break
        # Start the torch profiler exactly once, after the queue is clean.
        self.profiler.start()

def stop_profile(self):
if self.profiler is None:
Expand Down

0 comments on commit 9034631

Please sign in to comment.