import gradio as gr

def calculate_training_metrics(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens, 
    num_epochs, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85
):
    """
    Calculates both the training time and cost for LLM training with parallel computing.

    Args:
    - gpu_choice (str): The choice of GPU model
    - precision (str): The precision level for training
    - num_gpus (int): Number of GPUs for parallel computing
    - num_parameters (float): Total number of model parameters (absolute count, not billions)
    - dataset_tokens (float): Total number of tokens in the dataset (absolute count)
    - num_epochs (int): Number of training epochs
    - utilization_rate (float): GPU utilization rate (0 < rate ≤ 1)
    - overhead (float): Overhead multiplier for additional costs
    - cost_per_gpu_hour (float): Cost per GPU hour in dollars

    Returns:
    - tuple: (total_cost, training_days, training_hours)
    """
    
    # Peak tensor-core throughput per GPU in FLOPS (floating-point operations per
    # second), per vendor datasheets; some entries reflect structured-sparsity peaks
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }

    # Get the base GPU throughput, validating the GPU/precision pairing first
    # (V100 only exposes a 'tensor' entry, so e.g. 'bf16' would otherwise KeyError)
    if precision not in gpu_throughputs[gpu_choice]:
        supported = ", ".join(gpu_throughputs[gpu_choice])
        raise gr.Error(f"{gpu_choice} does not support '{precision}'; choose one of: {supported}")
    base_throughput = gpu_throughputs[gpu_choice][precision]
    
    # Calculate effective throughput with multiple GPUs
    # Assuming 90% scaling efficiency for parallel computing
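    # (a flat approximation; real scaling efficiency degrades with GPU count,
    # interconnect topology, and global batch size)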
    parallel_efficiency = 0.9
    effective_throughput = base_throughput * num_gpus * parallel_efficiency
    
    # Calculate total tokens processed (dataset_tokens * epochs)
    total_tokens = dataset_tokens * num_epochs
    
    # Calculate total FLOPs via the standard C ≈ 6ND approximation
    # (~6 FLOPs per parameter per token, covering the forward and backward passes)
    total_flops = 6 * num_parameters * total_tokens
    
    # Calculate raw computation hours needed
    compute_hours = total_flops / (effective_throughput * 3600)
    
    # Adjust for utilization rate and overhead
    actual_hours = (compute_hours / utilization_rate) * overhead
    
    # Calculate days and remaining hours
    training_days = int(actual_hours // 24)
    training_hours = actual_hours % 24
    
    # Calculate total cost (cost per GPU * number of GPUs * hours)
    total_cost = actual_hours * cost_per_gpu_hour * num_gpus
    
    return total_cost, training_days, training_hours
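
# A quick sanity check of the estimator above (hypothetical inputs, not measured
# benchmarks): 70B parameters on 1T tokens for one epoch across 1,024 H100 SXM
# GPUs at bf16 comes out to roughly 5 days 20.7 hours and ~$266.6k with the
# defaults (0.5 utilization, 1.10 overhead, $1.85 per GPU-hour).
def _example_estimate():
    return calculate_training_metrics(
        gpu_choice='H100 SXM', precision='bf16', num_gpus=1024,
        num_parameters=70e9, dataset_tokens=1e12, num_epochs=1,
    )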

def gradio_interface(
    gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
    num_epochs, utilization_rate, overhead, cost_per_gpu_hour
):
    # Convert inputs to appropriate numeric types
    num_parameters = float(num_parameters) * 1e9  # Convert billions to actual number
    dataset_tokens = float(dataset_tokens) * 1e9  # Convert billions to actual number
    num_gpus = int(num_gpus)
    num_epochs = int(num_epochs)
    utilization_rate = float(utilization_rate)
    overhead = float(overhead)
    cost_per_gpu_hour = float(cost_per_gpu_hour)

    # Calculate metrics
    cost, days, hours = calculate_training_metrics(
        gpu_choice, precision, num_gpus, num_parameters, dataset_tokens,
        num_epochs, utilization_rate, overhead, cost_per_gpu_hour
    )
    
    # Format the output message
    time_msg = f"{days} days and {hours:.1f} hours"
    cost_msg = f"{cost:,.2f}$"
    
    return time_msg, cost_msg

# Available GPU models shown in the dropdown
gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]

# Create the Gradio interface
title = "<h2 style='text-align: center;'>LLM Training Time and Cost Calculator</h2>"
description = """
<p style='text-align: center;'>Calculate both the training time and cost for large language models (LLM) with parallel computing support.</p>
<p><strong>Input Parameters:</strong></p>
<ul>
    <li><strong>GPU Selection:</strong> Choose from various GPU models with different compute capabilities</li>
    <li><strong>Number of GPUs:</strong> Specify how many GPUs to use in parallel</li>
    <li><strong>Model Size:</strong> Number of parameters in billions</li>
    <li><strong>Dataset Size:</strong> Number of tokens in your dataset in billions</li>
    <li><strong>Training Epochs:</strong> Number of times to iterate over the dataset</li>
    <li><strong>Utilization Rate:</strong> Expected GPU utilization (typically 0.4-0.7)</li>
    <li><strong>Overhead:</strong> Additional time/cost factor for data loading, checkpointing, etc.</li>
</ul>
<p><strong>Outputs:</strong></p>
<ul>
    <li><strong>Estimated Training Time:</strong> Total days and hours required for training</li>
    <li><strong>Estimated Training Cost:</strong> Total cost in dollars based on GPU hours</li>
</ul>
Modified from <a href="https://huggingface.co/spaces/Heng666/LLM-Training-Cost-Calculator">this Hugging Face Space</a>.
"""

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'),
        gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision ('tensor' applies to V100 only)", value='bf16'),
        gr.Number(label="Number of GPUs", value=1, minimum=1, maximum=1024),
        gr.Number(label="Number of Parameters (billions)", value=70),
        gr.Number(label="Dataset Tokens (billions)", value=1),
        gr.Number(label="Number of Epochs", value=3, minimum=1),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"),
        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead Factor"),
        gr.Number(label="Cost per GPU Hour ($)", value=1.85)
    ],
    outputs=[gr.Textbox(label="Estimated Training Time:"),
             gr.Textbox(label="Estimated Training Cost:")],
    title=title,
    description=description,
    article="<p style='text-align: center;'>Improved with good intentions by ghost.</p>"
)

if __name__ == "__main__":
    iface.launch()
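
# Note: when running locally, iface.launch(share=True) would additionally create a
# temporary public Gradio link; the plain launch() above is enough for Spaces.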