Skip to content

Add batch size gather func #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 174 additions & 20 deletions pytorch/scripts/compile_results_pytorch_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,108 @@

import pandas as pd

# Naming convention
# key: config name
# value: ([version, num_gpus], rename, watt, price)
# version: 0 for pytorch:22.10-py3, 1 for pytorch:24.10-py3 (currently unused; becomes relevant if different pytorch versions change field positions in the logs)
# num_gpus: sometimes the number of GPUs can't be inferred from the config name (e.g. p3.16xlarge) or is missing from the result log, so we ask the user to specify it here.
# rename: human-readable name for the system
# watt: power draw per GPU
# price: price per GPU

# Single-GPU benchmark configurations.
# Each entry maps a result-folder name to ([version, num_gpus], display name,
# watts per GPU, price per GPU) — see the naming-convention comment above.
list_system_single = {
    "LambdaBM_ENG04_1xH200_140GB_SXM_h200_v2": ([1, 1], "LambdaBM ENG04 1xH200 140GB SXM", 700, 39375),
    "LambdaBM_ENG04_HTOff_1xH200_140GB_SXM_h200_v2": ([1, 1], "LambdaBM ENG04 HTOff 1xH200 140GB SXM", 700, 39375),
    "LambdaBM_Radiant_1xGH200_96GB_v2": ([1, 1], "LambdaBM Radiant 1xGH200 96GB", 700, 45000),
    "LambdaOD_1x_1xGH200_80GB_192-222-58-35_v2": ([1, 1], "LambdaOD 1x 1xGH200 80GB", 700, 45000),
    "LambdaOD_1x_1xGH200_96GB_192-222-57-0_v2": ([1, 1], "LambdaOD 1x 1xGH200 96GB", 700, 45000),
    "LambdaOD_1x_1xH100_80GB_SXM5_192-222-52-83_v2": ([1, 1], "LambdaOD 1x 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Test_1xGH200_96GB_192-222-56-184_v2": ([1, 1], "LambdaOD 1x Test 1xGH200 96GB", 700, 45000),
    "LambdaOD_1x_Test_1xGH200_96GB_192-222-56-184_v2_bk": ([1, 1], "LambdaOD 1x Test 1xGH200 96GB bk", 700, 45000),
    "LambdaOD_1x_Texas_1xH100_80GB_PCIe_209-20-158-50_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB PCIe", 350, 30918),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-129_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-154_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-179_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-249_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-60_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-74_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-77_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_1x_Texas_1xH100_80GB_SXM5_192-222-52-92_v2": ([1, 1], "LambdaOD 1x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_1xH100_80GB_SXM5_192-222-52-120_v2": ([1, 1], "LambdaOD 2x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_1xH100_80GB_SXM5_192-222-52-176_v2": ([1, 1], "LambdaOD 2x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_1xH100_80GB_SXM5_192-222-52-211_v2": ([1, 1], "LambdaOD 2x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_1xH100_80GB_SXM5_192-222-52-65_v2": ([1, 1], "LambdaOD 2x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_1xH100_80GB_SXM5_192-222-52-139_v2": ([1, 1], "LambdaOD 4x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_1xH100_80GB_SXM5_192-222-52-178_v2": ([1, 1], "LambdaOD 4x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_1xH100_80GB_SXM5_192-222-54-151_v2": ([1, 1], "LambdaOD 4x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_1xH100_80GB_SXM5_192-222-54-254_v2": ([1, 1], "LambdaOD 4x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_1xH100_80GB_SXM5_192-222-52-149_v2": ([1, 1], "LambdaOD 8x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_1xH100_80GB_SXM5_192-222-52-89_v2": ([1, 1], "LambdaOD 8x Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-102_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-120_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-130_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-143_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-156_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-158_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-159_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-163_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-180_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-184_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-190_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-206_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-207_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-211_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-225_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-44_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-48_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-90_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-98_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_1xH100_80GB_SXM5_192-222-52-99_v2": ([1, 1], "LambdaOD Texas 1xH100 80GB SXM5", 700, 36718.75),
}

# Multi-GPU benchmark configurations.
# Same value layout as list_system_single: ([version, num_gpus], display name,
# watts per GPU, price per GPU); num_gpus here is 2, 4 or 8.
list_system_multiple = {
    "LambdaBM_ENG04_8xH200_140GB_SXM_h200_v2": ([1, 8], "LambdaBM ENG04 8xH200 140GB SXM", 700, 39375),
    "LambdaBM_ENG04_HTOff_8xH200_140GB_SXM_h200_v2": ([1, 8], "LambdaBM ENG04 HTOff 8xH200 140GB SXM", 700, 39375),
    "LambdaOD_2x_Texas_2xH100_80GB_SXM5_192-222-52-120_v2": ([1, 2], "LambdaOD 2x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_2xH100_80GB_SXM5_192-222-52-176_v2": ([1, 2], "LambdaOD 2x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_2xH100_80GB_SXM5_192-222-52-211_v2": ([1, 2], "LambdaOD 2x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_2x_Texas_2xH100_80GB_SXM5_192-222-52-65_v2": ([1, 2], "LambdaOD 2x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_2xH100_80GB_SXM5_192-222-52-139_v2": ([1, 2], "LambdaOD 4x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_2xH100_80GB_SXM5_192-222-52-178_v2": ([1, 2], "LambdaOD 4x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_2xH100_80GB_SXM5_192-222-54-151_v2": ([1, 2], "LambdaOD 4x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_2xH100_80GB_SXM5_192-222-54-254_v2": ([1, 2], "LambdaOD 4x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_4xH100_80GB_SXM5_192-222-52-139_v2": ([1, 4], "LambdaOD 4x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_4xH100_80GB_SXM5_192-222-52-178_v2": ([1, 4], "LambdaOD 4x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_4xH100_80GB_SXM5_192-222-54-151_v2": ([1, 4], "LambdaOD 4x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_4x_Texas_4xH100_80GB_SXM5_192-222-54-254_v2": ([1, 4], "LambdaOD 4x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_2xH100_80GB_SXM5_192-222-52-149_v2": ([1, 2], "LambdaOD 8x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_2xH100_80GB_SXM5_192-222-52-89_v2": ([1, 2], "LambdaOD 8x Texas 2xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_4xH100_80GB_SXM5_192-222-52-149_v2": ([1, 4], "LambdaOD 8x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_4xH100_80GB_SXM5_192-222-52-89_v2": ([1, 4], "LambdaOD 8x Texas 4xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_8xH100_80GB_SXM5_192-222-52-149_v2": ([1, 8], "LambdaOD 8x Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_8x_Texas_8xH100_80GB_SXM5_192-222-52-89_v2": ([1, 8], "LambdaOD 8x Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-102_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-120_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-130_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-143_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-156_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-158_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-159_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-163_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-180_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-184_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-190_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-206_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-207_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-211_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-225_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-44_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-48_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-90_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-98_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
    "LambdaOD_Texas_8xH100_80GB_SXM5_192-222-52-99_v2": ([1, 8], "LambdaOD Texas 8xH100 80GB SXM5", 700, 36718.75),
}

list_test_fp32 = {
"PyTorch_SSD_FP32": ("ssd", "^.*Average images/sec:.*$", -1),
"PyTorch_resnet50_FP32": ("resnet50", "^.*Summary: train.loss.*$", 11),
Expand Down Expand Up @@ -64,17 +166,8 @@
),
}


def find_direct_subfolders(folder_path):
try:
subfolders = [f.name for f in os.scandir(folder_path) if f.is_dir()]
return subfolders
except Exception as e:
return str(e)


def gather_throughput(
list_test, name, system, df, path_result
list_test, list_system, name, system, config_name, df, version, path_result
):
column_name, key, pos = list_test[name]
pattern = re.compile(key)
Expand Down Expand Up @@ -104,10 +197,33 @@ def gather_throughput(

if not flag:
print(system + "/" + name + " " + filename + ": something wrong")
df.at[system, column_name] = int(round(total_throughput / count, 2))
df.at[config_name, column_name] = int(round(total_throughput / count, 2))
else:
df.at[system, column_name] = 0
df.at[config_name, column_name] = 0

df.at[config_name, "num_gpu"] = list_system[system][0][1]
df.at[config_name, "watt"] = list_system[system][2] * int(list_system[system][0][1])
df.at[config_name, "price"] = list_system[system][3] * int(
list_system[system][0][1]
)

def gather_bs(
list_test, list_system, name, system, config_name, df, version, path_result
):
column_name, key, pos = list_test[name]
path = path_result + "/" + system + "/" + name
if os.path.exists(path):
for filename in os.listdir(path):
if filename.endswith(".para"):
with open(os.path.join(path, filename)) as f:
first_line = f.readline()
df.at[config_name, column_name] = int(first_line.split(" ")[1])

df.at[config_name, "num_gpu"] = list_system[system][0][1]
df.at[config_name, "watt"] = list_system[system][2] * int(list_system[system][0][1])
df.at[config_name, "price"] = list_system[system][3] * int(
list_system[system][0][1]
)

def main():
parser = argparse.ArgumentParser(description="Gather benchmark results.")
Expand All @@ -124,6 +240,14 @@ def main():
help="Choose becnhmark precision",
)

parser.add_argument(
"--system",
type=str,
default="all",
choices=["single", "multiple", "all"],
help="Choose system type (single or multiple GPUs)",
)

args = parser.parse_args()

if args.precision == "fp32":
Expand All @@ -134,32 +258,62 @@ def main():
sys.exit(
"Wrong precision: " + args.precision + ", choose between fp32 and fp16"
)


# list_system: all direct sub folders in the results folder
list_system = find_direct_subfolders(args.path)

if args.system == "single":
list_system = list_system_single
elif args.system == "multiple":
list_system = list_system_multiple
else:
list_system = {}
list_system.update(list_system_single)
list_system.update(list_system_multiple)

columns = []
columns.append("num_gpu")
columns.append("watt")
columns.append("price")

for test_name, value in sorted(list_test.items()):
columns.append(list_test[test_name][0])
list_configs = [list_system[key][1] for key in list_system]

df_throughput = pd.DataFrame(index=list_system, columns=columns)
df_throughput = pd.DataFrame(index=list_configs, columns=columns)
df_throughput = df_throughput.sort_index()

df_throughput = df_throughput.fillna(-1.0)

for system in list_system:
df_bs = pd.DataFrame(index=list_configs, columns=columns)

for key in list_system:
version = list_system[key][0][0]
config_name = list_system[key][1]
for test_name, value in sorted(list_test.items()):
gather_throughput(
list_test,
list_system,
test_name,
system,
key,
config_name,
df_throughput,
version,
args.path,
)

gather_bs(
list_test,
list_system,
test_name,
key,
config_name,
df_bs,
version,
args.path,
)

df_throughput.index.name = "name_gpu"
df_throughput.to_csv("pytorch-train-throughput-v2-" + args.precision + ".csv")

df_bs.index.name = "name_gpu"
df_bs.to_csv("pytorch-train-bs-v2-" + args.precision + ".csv")



Expand Down