{
  "AWSTemplateFormatVersion": "2010-09-09",
  "Description": "Observability Solution from CloudWatch: NVIDIA GPU on EC2. Version: 1.0.0.",
  "Transform": "AWS::LanguageExtensions",
  "Parameters": {
    "DashboardNameParameter": {
      "Type": "String",
      "Description": "Enter the name of the dashboard. The maximum length is 255 characters, and valid characters are A-Z, a-z, 0-9, \"-\", and \"_\". The dashboard will display metrics from the region where the stack is being created. To easily differentiate this dashboard from similar ones in other regions, we recommend including the region name in the dashboard name.",
      "MinLength": "1",
      "MaxLength": "255",
      "AllowedPattern": "^[a-zA-Z0-9-_]*$",
      "Default": "NvidiaDashboard"
    }
  },
  "Resources": {
    "CloudWatchDashboard": {
      "Type": "AWS::CloudWatch::Dashboard",
      "Properties": {
        "DashboardName": {
          "Ref": "DashboardNameParameter"
        },
        "DashboardBody": {
          "Fn::ToJsonString": {
            "widgets": [
              {
                "height": 1,
                "width": 24,
                "y": 0,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# Utilization"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 1,
                "x": 0,
                "type": "metric",
                "properties": {
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_utilization_gpu', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "view": "timeSeries",
                  "stacked": false,
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Utilization",
                  "period": 60,
                  "yAxis": {
                    "left": {
                      "label": "%",
                      "showUnits": false
                    }
                  },
                  "stat": "Average"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 1,
                "x": 8,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_utilization_memory', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Memory Utilization",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "%",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 8,
                "x": 16,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_memory_total', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Total Memory",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "MB",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 1,
                "width": 24,
                "y": 7,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# Memory"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 8,
                "x": 0,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_memory_used', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Used Memory",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "MB",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 8,
                "x": 8,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_memory_free', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Free Memory",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "MB",
                      "showUnits": false
                    },
                    "right": {
                      "label": ""
                    }
                  }
                }
              },
              {
                "height": 1,
                "width": 24,
                "y": 14,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# Temperature / Power"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 15,
                "x": 0,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_temperature_gpu', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Temperature",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "\u00b0C",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 15,
                "x": 8,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_power_draw', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Power Draw",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "Watts",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 15,
                "x": 16,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_fan_speed', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Fan Speed",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "%",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 1,
                "width": 24,
                "y": 21,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# Encoder"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 22,
                "x": 0,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_encoder_stats_average_latency', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Encoder Average Latency",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "\u03bcs",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 22,
                "x": 8,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_encoder_stats_average_fps', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Encoder Average FPS",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "FPS",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 22,
                "x": 16,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_encoder_stats_session_count', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Encoder Session Count",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "Count",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 1,
                "width": 24,
                "y": 28,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# PCIe"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 29,
                "x": 8,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_pcie_link_width_current', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU PCIe Link Width"
                }
              },
              {
                "height": 6,
                "width": 8,
                "y": 29,
                "x": 0,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_pcie_link_gen_current', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU PCIe Link Generation"
                }
              },
              {
                "height": 1,
                "width": 24,
                "y": 35,
                "x": 0,
                "type": "text",
                "properties": {
                  "markdown": "# Clock"
                }
              },
              {
                "height": 6,
                "width": 6,
                "y": 36,
                "x": 0,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_clocks_current_memory', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Memory Clock",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "showUnits": false,
                      "label": "MHz"
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 6,
                "y": 36,
                "x": 6,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_clocks_current_graphics', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Graphics Clock",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "MHz",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 6,
                "y": 36,
                "x": 12,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_clocks_current_sm', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU SM Clock",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "label": "MHz",
                      "showUnits": false
                    }
                  }
                }
              },
              {
                "height": 6,
                "width": 6,
                "y": 36,
                "x": 18,
                "type": "metric",
                "properties": {
                  "view": "timeSeries",
                  "stacked": false,
                  "metrics": [
                    [
                      {
                        "expression": "SORT(SEARCH('{CWAgent,InstanceId,index,name} MetricName=nvidia_smi_clocks_current_video', 'Average'), MAX, DESC, 10)",
                        "id": "e1",
                        "period": 60
                      }
                    ]
                  ],
                  "region": {
                    "Ref": "AWS::Region"
                  },
                  "title": "GPU Video Clock",
                  "period": 300,
                  "yAxis": {
                    "left": {
                      "showUnits": false,
                      "label": "MHz"
                    }
                  }
                }
              }
            ]
          }
        }
      }
    }
  }
}