File size: 6,991 Bytes
14e4843
 
034968f
 
3655a9e
0be51d4
3655a9e
034968f
84f0fa3
034968f
 
84f0fa3
0be51d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb32fa1
 
 
 
 
 
0be51d4
 
 
 
 
 
 
 
 
 
 
 
 
 
14e4843
 
 
 
d6d7ec6
 
 
14e4843
3237d78
 
14e4843
d6d7ec6
14e4843
 
 
 
 
d6d7ec6
 
14e4843
 
 
 
 
 
 
d6d7ec6
14e4843
d6d7ec6
14e4843
 
034968f
 
3655a9e
 
 
 
 
 
 
 
 
 
0be51d4
034968f
 
 
bb32fa1
 
3655a9e
84f0fa3
3655a9e
 
 
 
 
 
 
 
 
bb32fa1
3655a9e
 
 
 
 
17162c6
3655a9e
 
 
 
 
0be51d4
84f0fa3
034968f
 
 
 
84f0fa3
 
034968f
 
 
 
 
 
17162c6
034968f
 
 
 
 
 
 
 
 
 
 
 
 
84f0fa3
034968f
 
 
84f0fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
034968f
0be51d4
 
 
 
 
 
bb32fa1
 
 
 
 
0be51d4
bb32fa1
0be51d4
 
 
 
 
 
 
 
 
 
 
bb32fa1
0be51d4
 
 
 
 
 
 
 
034968f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import pandas as pd
from huggingface_hub import snapshot_download
import subprocess
import re
import os
import GPUtil

# Column-name constants used as dictionary keys for GPU statistics below.
try:
    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
except ImportError:
    # Fallback import path for local debugging, used when the `src.` package
    # prefix is not importable. Only ImportError is caught so that unrelated
    # failures (and KeyboardInterrupt/SystemExit) are not silently swallowed.
    print("local debug: from display.utils")
    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
    
# Peak memory bandwidth per GPU model, looked up by get_peak_bw().
# Keys follow the "<name>-<N>GB" shape that get_gpu_details() returns.
# NOTE(review): values are presumably in GB/s -- confirm against vendor datasheets.
MEM_BW_DICT ={
    "NVIDIA-A100-PCIe-80GB": 1935,
    "NVIDIA-A100-SXM-80GB": 2039,
    "NVIDIA-H100-PCIe-80GB": 2039,
    "NVIDIA-RTX-A5000-24GB": 768
}

# Peak compute throughput, indexed first by precision and then by GPU model;
# looked up by get_peak_flops(). The precision keys are the same strings that
# transfer_precision2bytes() accepts.
# NOTE(review): values are presumably in FLOP/s (the e12 scale suggests TFLOPS
# figures from vendor datasheets) -- confirm the source of each number.
PEAK_FLOPS_DICT = {
    "float32":{
        "NVIDIA-A100-PCIe-80GB": 312e12,
        "NVIDIA-A100-SXM-80GB": 312e12,
        "NVIDIA-H100-PCIe-80GB": 756e12,
        "NVIDIA-RTX-A5000-24GB": 222.2e12
    },
    "float16":{
        "NVIDIA-A100-PCIe-80GB": 624e12,
        "NVIDIA-A100-SXM-80GB": 624e12,
        "NVIDIA-H100-PCIe-80GB": 1513e12,
        "NVIDIA-RTX-A5000-24GB": 444.4e12
    },
    "bfloat16":{
        "NVIDIA-A100-PCIe-80GB": 624e12,
        "NVIDIA-A100-SXM-80GB": 624e12,
        "NVIDIA-H100-PCIe-80GB": 1513e12,
        "NVIDIA-RTX-A5000-24GB": 444.4e12
    },
    "8bit":{
        "NVIDIA-A100-PCIe-80GB": 1248e12,
        "NVIDIA-A100-SXM-80GB": 1248e12,
        "NVIDIA-H100-PCIe-80GB": 3026e12,
        "NVIDIA-RTX-A5000-24GB": 889e12
    },
    "4bit": {
        "NVIDIA-A100-PCIe-80GB": 2496e12,
        "NVIDIA-A100-SXM-80GB": 2496e12,
        "NVIDIA-H100-PCIe-80GB": 6052e12,
        "NVIDIA-RTX-A5000-24GB": 1778e12
    }

}

def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers, max_retries=10, retry_delay=60):
    """Download a repository snapshot, retrying when the download fails.

    Args:
        repo_id: Forwarded unchanged to ``snapshot_download``.
        revision: Forwarded unchanged to ``snapshot_download``.
        local_dir: Forwarded unchanged to ``snapshot_download``.
        repo_type: Forwarded unchanged to ``snapshot_download``.
        max_workers: Forwarded unchanged to ``snapshot_download``.
        max_retries: Maximum number of download attempts (default 10). Values
            below 1 result in no attempt at all.
        retry_delay: Seconds to wait between two attempts (default 60).

    Returns:
        None. The function is best-effort: once every attempt has failed it
        prints a final message and returns without raising.
    """
    # Kept as a function-local import, as in the original implementation.
    import time

    for attempt in range(1, max_retries + 1):
        try:
            snapshot_download(
                repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
            )
            return
        except Exception as e:
            if attempt >= max_retries:
                # Last attempt: do not sleep for nothing, and make the failure visible.
                print(
                    f"Failed to download {repo_id} at {revision} with error: {e}. "
                    f"Giving up after {max_retries} attempts."
                )
                return
            print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
            time.sleep(retry_delay)
    return


def get_dataset_url(row):
    """Render the benchmark name of *row* as an HTML link to its dataset.

    Args:
        row: Mapping-like record providing the "Benchmark" and "Dataset Link"
            entries (e.g. one row of the dataset summary table).

    Returns:
        An ``<a>`` element string, targeting a new tab, whose text is the
        benchmark name and whose ``href`` is the dataset link.
    """
    label = row["Benchmark"]
    href = row["Dataset Link"]
    return f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{label}</a>'


def get_dataset_summary_table(file_path):
    """Load the dataset summary CSV and turn benchmark names into links.

    Args:
        file_path: Location of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        A DataFrame restricted to the "Category", "Benchmark", "Data Split",
        "Data Size" and "Language" columns, where "Benchmark" holds the HTML
        link built by ``get_dataset_url`` for each row.
    """
    summary_columns = ["Category", "Benchmark", "Data Split", "Data Size", "Language"]

    table = pd.read_csv(file_path)
    # Replace the plain benchmark name with an HTML link to its dataset.
    table["Benchmark"] = table.apply(get_dataset_url, axis=1)

    return table[summary_columns]

def _visible_gpu_indices():
    """Return the GPU identifiers to poll, or ``[]`` when none can be determined."""
    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
    if visible_devices is not None:
        candidates = visible_devices.split(',')
    else:
        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True
            )
        except OSError:
            # nvidia-smi is missing or cannot be executed.
            print("Failed to query GPU indices.")
            return []
        if result.returncode != 0:
            print("Failed to query GPU indices.")
            return []
        candidates = result.stdout.splitlines()
    # Drop blank entries (e.g. CUDA_VISIBLE_DEVICES="" or empty command output)
    # so that they are never passed to `nvidia-smi -i`.
    return [entry.strip() for entry in candidates if entry.strip()]


def parse_nvidia_smi():
    """Collect one aggregated sample of GPU statistics from ``nvidia-smi``.

    The GPUs listed in ``CUDA_VISIBLE_DEVICES`` are polled one by one (all GPUs
    reported by ``nvidia-smi`` when the variable is unset) and the
    human-readable output is parsed with regular expressions.

    Returns:
        A list holding a single dict keyed by the ``GPU_*`` constants:
        temperature, power and utilization are averaged over the parsed GPUs,
        memory usage (converted from MiB by dividing by 1024) is summed, and
        the name is ``"<count>x<model>"``. An empty list is returned when no
        GPU can be queried or no statistics line could be parsed.
    """
    gpu_indices = _visible_gpu_indices()
    if not gpu_indices:
        return []

    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')

    gpu_name = ""
    gpu_stats = []
    for index in gpu_indices:
        try:
            result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
        except OSError as e:
            print(f"Failed to run nvidia-smi for GPU {index}: {e}")
            return []
        for line in result.stdout.splitlines():
            name_match = gpu_name_pattern.search(line)
            if name_match:
                # Join the optional "RTX " prefix with the model token.
                gpu_name = ''.join(filter(None, name_match.groups())).strip()
            match = gpu_info_pattern.search(line)
            if match:
                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                gpu_stats.append({
                    GPU_TEMP: temp,
                    GPU_Power: power_usage,
                    GPU_Mem: round(mem_usage / 1024, 2),
                    GPU_Util: gpu_util
                })

    if not gpu_stats:
        # Nothing was parsed: report "no sample" instead of dividing by zero below.
        return []

    gpu_count = len(gpu_stats)
    gpu_stats_total = {
        GPU_TEMP: 0,
        GPU_Power: 0,
        GPU_Mem: 0,
        GPU_Util: 0,
        GPU_Name: f"{gpu_count}x{gpu_name}"
    }
    for gpu_stat in gpu_stats:
        gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
        gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
        gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
        gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
    # Memory stays a sum across GPUs; the other metrics become per-GPU averages.
    gpu_stats_total[GPU_TEMP] /= gpu_count
    gpu_stats_total[GPU_Power] /= gpu_count
    gpu_stats_total[GPU_Util] /= gpu_count
    return [gpu_stats_total]

def monitor_gpus(stop_event, interval, stats_list):
    """Sample GPU statistics repeatedly until *stop_event* is set.

    Args:
        stop_event: Event-like object; the loop ends once ``is_set()`` is true,
            and ``wait(interval)`` is used to pause between samples.
        interval: Pause between two samples, passed to ``stop_event.wait``.
        stats_list: List extended in place with every non-empty sample
            returned by ``parse_nvidia_smi``.
    """
    while True:
        if stop_event.is_set():
            break
        sample = parse_nvidia_smi()
        if sample:
            stats_list.extend(sample)
        # Waiting on the event (rather than sleeping) lets a stop request
        # interrupt the pause.
        stop_event.wait(interval)

def analyze_gpu_stats(stats_list):
    """Summarize a series of GPU samples into a single dict.

    Args:
        stats_list: Sequence of per-sample dicts keyed by the ``GPU_*``
            constants (as produced by ``parse_nvidia_smi``).

    Returns:
        ``None`` when *stats_list* is empty. Otherwise a dict holding the mean
        of every metric except memory and name, the name taken from the first
        sample (when present), and the maximum memory usage over all samples.
    """
    # Nothing to summarize.
    if not stats_list:
        return None

    first_sample = stats_list[0]
    sample_count = len(stats_list)

    # Mean of every numeric metric; memory and name are handled separately.
    summary = {}
    for key in first_sample.keys():
        if key in (GPU_Mem, GPU_Name):
            continue
        summary[key] = sum(sample[key] for sample in stats_list) / sample_count

    # Memory is reported as the peak usage rather than the mean.
    peak_memory = max(sample[GPU_Mem] for sample in stats_list)

    if GPU_Name in first_sample:
        summary[GPU_Name] = first_sample[GPU_Name]
    summary[GPU_Mem] = peak_memory

    return summary

def get_gpu_details():
    """Build a "<model>-<N>GB" identifier for the first GPU reported by GPUtil.

    Spaces in the GPU name become hyphens, any "<digits>GB" token already
    present in the name is removed, and the total memory (``memoryTotal``
    divided by 1024 and rounded) is appended as the size suffix.

    Returns:
        The formatted GPU name string.
    """
    first_gpu = GPUtil.getGPUs()[0]
    model = first_gpu.name.replace(" ", "-")
    capacity = f"{round(first_gpu.memoryTotal / 1024)}GB"

    # Strip a memory-size token from the model name so it is not duplicated
    # by the suffix appended below.
    for token in model.split('-'):
        is_size_token = token.endswith("GB") and token[:-2].isdigit()
        if not is_size_token:
            continue
        model = model.replace(f"-{token}", "").replace(token, "")

    return f"{model}-{capacity}"

def get_peak_bw(gpu_name):
    """Look up the peak memory bandwidth registered for *gpu_name*.

    Args:
        gpu_name: Key of ``MEM_BW_DICT``.

    Returns:
        The value stored in ``MEM_BW_DICT`` for that GPU.

    Raises:
        KeyError: If *gpu_name* has no entry in ``MEM_BW_DICT``.
    """
    peak_bandwidth = MEM_BW_DICT[gpu_name]
    return peak_bandwidth

def get_peak_flops(gpu_name, precision):
    """Look up the peak throughput registered for a GPU at a given precision.

    Args:
        gpu_name: GPU key inside the per-precision table.
        precision: Top-level key of ``PEAK_FLOPS_DICT``.

    Returns:
        The value stored in ``PEAK_FLOPS_DICT`` for that precision and GPU.

    Raises:
        KeyError: If the precision or the GPU name has no entry.
    """
    per_gpu_table = PEAK_FLOPS_DICT[precision]
    return per_gpu_table[gpu_name]

def transfer_precision2bytes(precision):
    """Return the number of bytes used per value for *precision*.

    Args:
        precision: One of "float32", "float16", "bfloat16", "8bit" or "4bit".

    Returns:
        4, 2, 2, 1 or 0.5 respectively.

    Raises:
        ValueError: If *precision* is not one of the supported names.
    """
    # Ordered (name, bytes) pairs, compared with == like the original chain.
    byte_widths = (
        ("float32", 4),
        ("float16", 2),
        ("bfloat16", 2),
        ("8bit", 1),
        ("4bit", 0.5),
    )
    for known_precision, width in byte_widths:
        if precision == known_precision:
            return width
    raise ValueError(f"Unsupported precision: {precision}")

if __name__ == "__main__":
    # Manual smoke check: take one GPU sample and print its summary.
    gpu_sample = parse_nvidia_smi()
    print(analyze_gpu_stats(gpu_sample))