# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
import streamlit as st
from pyinstrument import Profiler
from typing import Dict
import pandas as pd


@st.cache_resource(max_entries=1, show_spinner=False)
def init_gpu_memory():
    """
    When CUDA is initialized, it occupies some memory on the GPU. This overhead
    can make it difficult to tell how much memory is actually used by the model
    itself.

    This function initializes CUDA on every visible device and measures that
    overhead, so it can later be subtracted from the total usage.
    """
    if not torch.cuda.is_available():
        return {}
    # Allocate a tiny tensor on each device to force CUDA context creation,
    # then record how much memory the bare context occupies.
    gpu_memory_overhead = {}
    for i in range(torch.cuda.device_count()):
        torch.ones(1).cuda(i)
        free, total = torch.cuda.mem_get_info(i)
        occupied = total - free
        gpu_memory_overhead[i] = occupied
    return gpu_memory_overhead
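
# Example (a minimal sketch, not part of the original module): the returned
# mapping goes from device index to the bytes occupied right after CUDA
# context creation. Subtracting it from a later reading isolates the
# application's own footprint:
#
#     overhead = init_gpu_memory()              # e.g. {0: 429916160}
#     free, total = torch.cuda.mem_get_info(0)
#     model_bytes = (total - free) - overhead[0]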


class SystemMonitor:
    """
    Monitor system resources such as GPU memory and CPU usage. Uses the
    pyinstrument library to profile the code and measure the execution time
    of its different parts.

    Use it as a context manager around the code to be monitored; the reports
    are rendered as Streamlit widgets on exit.
    """

    def __init__(
        self,
        enabled: bool = False,
    ):
        self.enabled = enabled
        self.profiler = Profiler()
        # Per-device CUDA initialization overhead in bytes; filled in __enter__.
        self.overhead: Dict[int, int] = {}

    def __enter__(self):
        if not self.enabled:
            return
        # Snapshot the bare CUDA overhead before profiling starts.
        self.overhead = init_gpu_memory()
        self.profiler.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.enabled:
            return
        self.profiler.__exit__(exc_type, exc_value, traceback)
        self.report_gpu_usage()
        self.report_profiler()
        with st.expander("Session state"):
            st.write(st.session_state)
        return None

    def report_gpu_usage(self):
        if not torch.cuda.is_available():
            return
        data = []
        for i in range(torch.cuda.device_count()):
            free, total = torch.cuda.mem_get_info(i)
            occupied = total - free
            # Split the used memory into the CUDA context overhead and the
            # memory actually occupied by the application.
            data.append({
                "overhead": self.overhead[i],
                "occupied": occupied - self.overhead[i],
                "free": free,
            })
        df = pd.DataFrame(data, columns=["overhead", "occupied", "free"])
        with st.sidebar.expander("System"):
            st.write("GPU memory on server")
            df /= 1024**3  # Convert bytes to GiB
            st.bar_chart(df, width=200, height=200, color=["#fefefe", "#84c9ff", "#fe2b2b"])

    def report_profiler(self):
        # Render pyinstrument's interactive HTML report inside an expander.
        html_code = self.profiler.output_html()
        with st.expander("Profiler", expanded=False):
            st.components.v1.html(html_code, height=1000, scrolling=True)
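
# Usage sketch (hypothetical; this module does not define an entry point).
# A Streamlit page would wrap its body in the monitor so that, when enabled,
# GPU memory charts and a pyinstrument report are rendered on exit:
#
#     monitor = SystemMonitor(enabled=True)
#     with monitor:
#         run_app()  # hypothetical application code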