Monitoring GPU temperatures with nvidia-smi and Check MK (OMD)

In the previous post on this subject we used code from Technische Universität Kaiserslautern to monitor our GPUs using OMD checkmk (now checkmk raw). With some new RTX2080s installed this broke, as the nvidia-smi check doesn’t report anything for ECC errors (rather than 0, as previous cards did). The solution was to remove the ECC checking completely.

The new scripts are:

On the client system in /usr/lib/check_mk_agent/local/ (or plugins/)

# Check_MK agent plugin: prints one line per GPU in the <<<nvidia_smi>>> section:
#   index name fan_speed gpu_util mem_util temperature power_draw power_limit
# Units (%, C, W) are stripped; values that are missing are printed as N/A.
if command -v nvidia-smi >/dev/null 2>&1; then
   echo '<<<nvidia_smi>>>'

   # Private temp file instead of a fixed name in /tmp, which any local user
   # could pre-create or symlink before the agent writes to it.
   smi_xml=$(mktemp "${TMPDIR:-/tmp}/check_mk_nvidia_smi.XXXXXX") || exit 1
   trap 'rm -f "$smi_xml"' EXIT
   nvidia-smi -q -x > "$smi_xml"

   # Print the text of every element matching nvidia_smi_log/$1, one per line,
   # with blanks removed so that each value stays a single output column.
   smi_field() {
      xml_grep --text_only "nvidia_smi_log/$1" "$smi_xml" | tr -d ' '
   }

   # Anything but a plain number (e.g. nvidia-smi failed) means "no cards".
   cards=$(smi_field attached_gpus)
   case $cards in
      ''|*[!0-9]*) cards=0 ;;
   esac

   # Split the query results on newlines only, then put IFS back.
   saved_ifs=$IFS
   IFS=$'\n'
   names=($(smi_field gpu/product_name))
   fan_speed=($(smi_field gpu/fan_speed))
   gpu_utilization=($(smi_field gpu/utilization/gpu_util))
   mem_utilization=($(smi_field gpu/utilization/memory_util))
   temperature=($(smi_field gpu/temperature/gpu_temp))
   power_draw=($(smi_field gpu/power_readings/power_draw))
   power_limit=($(smi_field gpu/power_readings/power_limit))
   IFS=$saved_ifs

   # C-style loop: "seq 1 $cards" would still run once when cards is empty.
   for (( index = 0; index < cards; index++ )); do
      fan=${fan_speed[$index]/\%/}
      gpu=${gpu_utilization[$index]/\%/}
      mem=${mem_utilization[$index]/\%/}
      temp=${temperature[$index]/C/}
      draw=${power_draw[$index]/W/}
      limit=${power_limit[$index]/W/}
      # Always emit eight columns so the server-side check can index them.
      echo "$index ${names[$index]:-N/A} ${fan:-N/A} ${gpu:-N/A} ${mem:-N/A} ${temp:-N/A} ${draw:-N/A} ${limit:-N/A}"
   done
fi
[/code title="nvidia_smi" lang="python"]

Don't forget to make it executable! You also need xml_grep installed; it ships with the Perl XML::Twig tools (for example the xml-twig-tools package on Debian/Ubuntu).

On the OMD server at <code>/omd/sites/omd_XYZ/local/share/check_mk/checks/</code>


#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2012             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# ails.  You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

#######################################
# Check developed by
#######################################
# Dr. Markus Hillenbrand
# University of Kaiserslautern, Germany
# hillenbr@rhrk.uni-kl.de
#######################################

# the inventory functions

def inventory_nvidia_smi_fan(info):
    """Create one fan service per GPU that reports a fan speed.

    info -- agent section lines; column 0 is the GPU index and column 2
            the fan speed, which is the literal 'N/A' when unavailable

    Returns a list of ("GPU<index>", "", None) tuples.
    """
    return [("GPU" + gpu[0], "", None) for gpu in info if gpu[2] != 'N/A']
def inventory_nvidia_smi_gpuutil(info):
    """Create one utilization service per GPU that reports GPU utilization.

    info -- agent section lines; column 0 is the GPU index and column 3
            the GPU utilization, which is the literal 'N/A' when unavailable

    Returns a list of ("GPU<index>", "", None) tuples.
    """
    return [("GPU" + gpu[0], "", None) for gpu in info if gpu[3] != 'N/A']
def inventory_nvidia_smi_memutil(info):
    """Create one memory service per GPU that reports memory utilization.

    info -- agent section lines; column 0 is the GPU index and column 4
            the memory utilization, which is the literal 'N/A' when
            unavailable

    Returns a list of ("GPU<index>", "", None) tuples.
    """
    return [("GPU" + gpu[0], "", None) for gpu in info if gpu[4] != 'N/A']
def inventory_nvidia_smi_temp(info):
    """Create one temperature service per GPU that reports a temperature.

    info -- agent section lines; column 0 is the GPU index and column 5
            the GPU temperature, which is the literal 'N/A' when unavailable

    Returns a list of ("GPU<index>", "", None) tuples.
    """
    return [("GPU" + gpu[0], "", None) for gpu in info if gpu[5] != 'N/A']
def inventory_nvidia_smi_power(info):
    """Create one power service per GPU that reports both draw and limit.

    info -- agent section lines; column 0 is the GPU index, column 6 the
            power draw and column 7 the power limit; either is the literal
            'N/A' when unavailable

    Returns a list of ("GPU<index>", "", None) tuples.
    """
    return [
        ("GPU" + gpu[0], "", None)
        for gpu in info
        if not (gpu[6] == 'N/A' or gpu[7] == "N/A")
    ]

# the check functions

def check_nvidia_smi_fan(item, params, info):
    """Check the fan speed of one GPU (WARNING above 90 %, CRITICAL above 95 %).

    item   -- service item as created by the inventory, e.g. "GPU0"
    params -- not used; the levels are hard-coded
    info   -- agent section lines; column 0 is the GPU index, column 1 the
              product name and column 2 the fan speed in percent

    Returns (state, text, perfdata), or (3, text) when the GPU is missing
    from the agent output or does not report a numeric fan speed.
    """
    for line in info:
        if "GPU" + line[0] != item:
            continue
        try:
            value = int(line[2])
        except (IndexError, ValueError):
            # The reading can turn into 'N/A' (or the line can be short)
            # after the service was inventorized; report UNKNOWN instead of
            # letting the check crash.
            return (3, "UNKNOWN - %s does not report a fan speed" % item)
        perfdata = [('fan', value, 90, 95, 0, 100)]
        if value > 95:
            state, label = 2, "CRITICAL"
        elif value > 90:
            state, label = 1, "WARNING"
        else:
            state, label = 0, "OK"
        return (state, "%s - %s fan speed is %d%%" % (label, line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_gpuutil(item, params, info):
    """Report the GPU utilization of one GPU; the result is always OK.

    item   -- service item as created by the inventory, e.g. "GPU0"
    params -- not used
    info   -- agent section lines; column 0 is the GPU index, column 1 the
              product name and column 3 the GPU utilization in percent

    Returns (0, text, perfdata), or (3, text) when the GPU is missing from
    the agent output or does not report a numeric utilization.
    """
    for line in info:
        if "GPU" + line[0] != item:
            continue
        try:
            value = int(line[3])
        except (IndexError, ValueError):
            # 'N/A' or a short line: report UNKNOWN instead of crashing.
            return (3, "UNKNOWN - %s does not report a GPU utilization" % item)
        perfdata = [('gpuutil', value, 100, 100, 0, 100)]
        return (0, "OK - %s utilization is %s%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_memutil(item, params, info):
    """Check the memory utilization of one GPU (WARNING >90 %, CRITICAL >95 %).

    item   -- service item as created by the inventory, e.g. "GPU0"
    params -- not used; the levels are hard-coded
    info   -- agent section lines; column 0 is the GPU index, column 1 the
              product name and column 4 the memory utilization in percent

    Returns (state, text, perfdata), or (3, text) when the GPU is missing
    from the agent output or does not report a numeric utilization.
    """
    for line in info:
        if "GPU" + line[0] != item:
            continue
        try:
            value = int(line[4])
        except (IndexError, ValueError):
            # 'N/A' or a short line: report UNKNOWN instead of crashing.
            return (3, "UNKNOWN - %s does not report a memory utilization" % item)
        perfdata = [('memutil', value, 100, 100, 0, 100)]
        if value > 95:
            state, label = 2, "CRITICAL"
        elif value > 90:
            state, label = 1, "WARNING"
        else:
            state, label = 0, "OK"
        return (state, "%s - %s memory utilization is %d%%" % (label, line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_temp(item, params, info):
    """Check the temperature of one GPU (WARNING above 80 C, CRITICAL above 90 C).

    item   -- service item as created by the inventory, e.g. "GPU0"
    params -- not used; the levels are hard-coded
    info   -- agent section lines; column 0 is the GPU index, column 1 the
              product name and column 5 the GPU temperature in degrees C

    Returns (state, text, perfdata), or (3, text) when the GPU is missing
    from the agent output or does not report a numeric temperature.
    """
    for line in info:
        if "GPU" + line[0] != item:
            continue
        try:
            value = int(line[5])
        except (IndexError, ValueError):
            # 'N/A' or a short line: report UNKNOWN instead of crashing.
            return (3, "UNKNOWN - %s does not report a temperature" % item)
        perfdata = [('temp', value, 80, 90, 0, 95)]
        if value > 90:
            state, label = 2, "CRITICAL"
        elif value > 80:
            state, label = 1, "WARNING"
        else:
            state, label = 0, "OK"
        return (state, "%s - %s temperature is %dC" % (label, line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_power(item, params, info):
    """Check the power draw of one GPU as a percentage of its power limit.

    WARNING above 80 % of the limit, CRITICAL above 90 %.

    item   -- service item as created by the inventory, e.g. "GPU0"
    params -- not used; the levels are hard-coded
    info   -- agent section lines; column 0 is the GPU index, column 1 the
              product name, column 6 the power draw and column 7 the power
              limit, both in watts

    Returns (state, text, perfdata), or (3, text) when the GPU is missing
    from the agent output or does not report usable power readings.
    """
    for line in info:
        if "GPU" + line[0] != item:
            continue
        try:
            draw = float(line[6])
            limit = float(line[7])
        except (IndexError, ValueError):
            # 'N/A' or a short line: report UNKNOWN instead of crashing.
            return (3, "UNKNOWN - %s does not report power draw and limit" % item)
        if limit <= 0:
            # A zero limit would raise ZeroDivisionError in the ratio below.
            return (3, "UNKNOWN - %s reports an invalid power limit of %sW" % (item, line[7]))
        value = draw * 100.0 / limit
        perfdata = [('power', draw, limit * 0.8, limit * 0.9, 0, limit)]
        if value > 90:
            state, label = 2, "CRITICAL"
        elif value > 80:
            state, label = 1, "WARNING"
        else:
            state, label = 0, "OK"
        return (state, "%s - %s power utilization is %d%% of %dW" % (label, line[1], value, limit), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

# Declare the sub-checks to Check_MK.
# Each entry is (check function, service description format, flag, inventory
# function). The flag is 1 for every check here; all of them return perfdata.
# NOTE(review): the "%s" in the description is presumably filled with the
# inventory item ("GPU<index>") -- confirm against the Check_MK version in use.

check_info['nvidia_smi.fan']     = (check_nvidia_smi_fan,     "%s fan speed"      , 1, inventory_nvidia_smi_fan)
check_info['nvidia_smi.gpuutil'] = (check_nvidia_smi_gpuutil, "%s utilization"    , 1, inventory_nvidia_smi_gpuutil)
check_info['nvidia_smi.memutil'] = (check_nvidia_smi_memutil, "%s memory"         , 1, inventory_nvidia_smi_memutil)
check_info['nvidia_smi.temp']    = (check_nvidia_smi_temp,    "%s temperature"    , 1, inventory_nvidia_smi_temp)
check_info['nvidia_smi.power']   = (check_nvidia_smi_power,   "%s power"          , 1, inventory_nvidia_smi_power)

To get the pretty indicators put this in /omd/sites/omd_XYZ/share/check_mk/web/plugins/perfometer/

#!/usr/bin/python

def _perfometer_nvidia_smi_bar(perf_data, suffix):
    """Render the two-cell bar shared by all nvidia_smi perfometers.

    perf_data -- list of perfdata tuples; only the first one is used, in the
                 layout (varname, value, unit, warn, crit, min, max)
    suffix    -- text appended to the value in the label, e.g. " %"

    Returns (label, html): the value followed by the suffix, and a one-row
    table with a used (#0f8) and a free (#fff) cell.
    """
    entry = perf_data[0]
    value, maxx = entry[1], entry[6]
    maximum = float(maxx)
    if maximum > 0:
        perc_used = 100 * (float(value) / maximum)
    else:
        # No usable maximum: draw an empty bar rather than divide by zero.
        perc_used = 0.0
    # A reading above the maximum (power draw over the limit, temperature
    # over 95) would otherwise produce a negative width for the free cell.
    perc_used = min(100.0, max(0.0, perc_used))
    perc_free = 100 - perc_used
    return str(value) + suffix, '<table><tr>' \
                               + perfometer_td(perc_used, '#0f8') \
                               + perfometer_td(perc_free, '#fff') \
                               + '</tr></table>'


def perfometer_nvidia_smi_fan(row, check_command, perf_data):
    """Perfometer for the fan speed check; the label is the value in percent."""
    return _perfometer_nvidia_smi_bar(perf_data, " %")


def perfometer_nvidia_smi_gpuutil(row, check_command, perf_data):
    """Perfometer for the GPU utilization check; the label is the value in percent."""
    return _perfometer_nvidia_smi_bar(perf_data, " %")


def perfometer_nvidia_smi_memutil(row, check_command, perf_data):
    """Perfometer for the memory utilization check; the label is the value in percent."""
    return _perfometer_nvidia_smi_bar(perf_data, " %")


def perfometer_nvidia_smi_temp(row, check_command, perf_data):
    """Perfometer for the temperature check; the label is the value followed by " C"."""
    return _perfometer_nvidia_smi_bar(perf_data, " C")


def perfometer_nvidia_smi_power(row, check_command, perf_data):
    """Perfometer for the power check; the label is the power draw followed by " W"."""
    return _perfometer_nvidia_smi_bar(perf_data, " W")

# Register one perfometer function per sub-check. Each key is 'check_mk-'
# followed by the check name used in the check script's check_info entries.
perfometers['check_mk-nvidia_smi.fan']     = perfometer_nvidia_smi_fan
perfometers['check_mk-nvidia_smi.gpuutil'] = perfometer_nvidia_smi_gpuutil
perfometers['check_mk-nvidia_smi.memutil'] = perfometer_nvidia_smi_memutil
perfometers['check_mk-nvidia_smi.temp']    = perfometer_nvidia_smi_temp
perfometers['check_mk-nvidia_smi.power']   = perfometer_nvidia_smi_power

Monitoring GPU temperatures with nvidia-smi and Check MK (OMD)

The Nvidia monitoring setup described at https://elwe.rhrk.uni-kl.de/howto/ worked in Check MK 1.2.8, but fails in 1.4. Getting it to work again required two modifications to the check script /omd/yoursite/local/share/check_mk/checks/nvidia_smi:

Remove the grouping of nvidia_smi.errors1 and 2 (I can live with this as our GTX1070 doesn’t report this anyway).

Remove the unicode degree characters from the temperature output, as this seems to cause the system to choke on the textual output.

I also needed to delete and recreate the host to get it to work properly – possibly because unicode characters were still hanging around in the generated graph definitions or similar.

#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2012             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# ails.  You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

#######################################
# Check developed by
#######################################
# Dr. Markus Hillenbrand
# University of Kaiserslautern, Germany
# hillenbr@rhrk.uni-kl.de
#######################################
# Tweaked by Jamie Scott
# University of Glasgow
# Jamie.Scott@glasgow.ac.uk
#######################################

# the inventory functions
#
# Every function receives the agent section lines and returns one
# ("GPU<index>", "", None) tuple per GPU whose relevant column is not 'N/A'.
# Columns in this version of the agent output, as used by the checks below:
# 0 index, 1 name, 2 fan speed, 3 GPU utilization, 4 memory utilization,
# 5 single bit errors, 6 double bit errors, 7 temperature, 8 power draw,
# 9 power limit.

def inventory_nvidia_smi_fan(info):
    inventory = []
    for line in info:
        if line[2] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_gpuutil(info):
    inventory = []
    for line in info:
        if line[3] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_memutil(info):
    inventory = []
    for line in info:
        if line[4] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_errors1(info):
    inventory = []
    for line in info:
        if line[5] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_errors2(info):
    inventory = []
    for line in info:
        if line[6] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_temp(info):
    inventory = []
    for line in info:
        if line[7] != 'N/A':
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

def inventory_nvidia_smi_power(info):
    inventory = []
    for line in info:
        if line[8] != 'N/A' and line[9] != "N/A":
            inventory.append( ("GPU"+line[0], "", None) )
    return inventory

# the check functions

def check_nvidia_smi_fan(item, params, info):
    """Check the fan speed (column 2) of one GPU: WARNING >90 %, CRITICAL >95 %.

    Returns (state, text, perfdata), or (3, text) when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[2])
            perfdata = [('fan', value, 90, 95, 0, 100 )]
            if value > 95:
                return (2, "CRITICAL - %s fan speed is %d%%" % (line[1], value), perfdata)
            elif value > 90:
                return (1, "WARNING - %s fan speed is %d%%" % (line[1], value), perfdata)
            else:
                return (0, "OK - %s fan speed is %d%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_gpuutil(item, params, info):
    """Report the GPU utilization (column 3) of one GPU; the result is always OK.

    Returns (0, text, perfdata), or (3, text) when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[3])
            perfdata = [('gpuutil', value, 100, 100, 0, 100 )]
            return (0, "OK - %s utilization is %s%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_memutil(item, params, info):
    """Check the memory utilization (column 4) of one GPU: WARNING >90 %, CRITICAL >95 %.

    Returns (state, text, perfdata), or (3, text) when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[4])
            perfdata = [('memutil', value, 100, 100, 0, 100 )]
            if value > 95:
                return (2, "CRITICAL - %s memory utilization is %d%%" % (line[1], value), perfdata)
            elif value > 90:
                return (1, "WARNING - %s memory utilization is %d%%" % (line[1], value), perfdata)
            else:
                return (0, "OK - %s memory utilization is %d%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_errors1(item, params, info):
    """Check the single bit error counter (column 5) of one GPU.

    WARNING above 100, CRITICAL above 500. Returns (state, text) without
    perfdata; state 3 when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[5])
            if value > 500:
                return (2, "CRITICAL - %s single bit error counter is %d" % (line[1], value))
            if value > 100:
                return (1, "WARNING - %s single bit error counter is %d" % (line[1], value))
            else:
                return (0, "OK - %s single bit error counter is %d" % (line[1], value))
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_errors2(item, params, info):
    """Check the double bit error counter (column 6) of one GPU.

    WARNING above 100, CRITICAL above 500. Returns (state, text) without
    perfdata; state 3 when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[6])
            if value > 500:
                return (2, "CRITICAL - %s double bit error counter is %d" % (line[1], value))
            if value > 100:
                return (1, "WARNING - %s double bit error counter is %d" % (line[1], value))
            else:
                return (0, "OK - %s double bit error counter is %d" % (line[1], value))
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_temp(item, params, info):
    """Check the temperature (column 7) of one GPU: WARNING >80 C, CRITICAL >90 C.

    Returns (state, text, perfdata), or (3, text) when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            value = int(line[7])
            perfdata = [('temp', value, 80, 90, 0, 95 )]
            if value > 90:
                return (2, "CRITICAL - %s temperature is %dC" % (line[1], value), perfdata)
            elif value > 80:
                return (1, "WARNING - %s temperature is %dC" % (line[1], value), perfdata)
            else:
                return (0, "OK - %s temperature is %dC" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

def check_nvidia_smi_power(item, params, info):
    """Check the power draw (column 8) of one GPU as a percentage of its limit (column 9).

    WARNING above 80 % of the limit, CRITICAL above 90 %. Returns
    (state, text, perfdata), or (3, text) when no line matches item.
    """
    for line in info:
        if "GPU"+line[0] == item:
            draw = float(line[8])
            limit = float(line[9])
            value = draw * 100.0 / limit
            perfdata = [('power', draw, limit * 0.8, limit * 0.9, 0, limit )]
            if value > 90:
                return (2, "CRITICAL - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
            elif value > 80:
                return (1, "WARNING - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
            else:
                return (0, "OK - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)

# Declare the sub-checks to Check_MK.
# Each entry is (check function, service description format, flag, inventory
# function). The flag is 1 for the checks whose function returns perfdata and
# 0 for the two error-counter checks, which return only a state and a text.

check_info['nvidia_smi.fan'] = (check_nvidia_smi_fan, "%s fan speed" , 1, inventory_nvidia_smi_fan)
check_info['nvidia_smi.gpuutil'] = (check_nvidia_smi_gpuutil, "%s utilization" , 1, inventory_nvidia_smi_gpuutil)
check_info['nvidia_smi.memutil'] = (check_nvidia_smi_memutil, "%s memory" , 1, inventory_nvidia_smi_memutil)
# The two ECC error checks are left unregistered on purpose.
#check_info['nvidia_smi.errors1'] = (check_nvidia_smi_errors1, "%s errors single" , 0, inventory_nvidia_smi_errors1)
#check_info['nvidia_smi.errors2'] = (check_nvidia_smi_errors2, "%s errors double" , 0, inventory_nvidia_smi_errors2)
check_info['nvidia_smi.temp'] = (check_nvidia_smi_temp, "%s temperature" , 1, inventory_nvidia_smi_temp)
check_info['nvidia_smi.power'] = (check_nvidia_smi_power, "%s power" , 1, inventory_nvidia_smi_power)

# Grouping of the error checks, disabled together with the checks themselves.
#checkgroup_of['nvidia_smi.errors1'] = 'hw_errors'
#checkgroup_of['nvidia_smi.errors2'] = 'hw_errors'