$ ssh linux.cscf
ubuntu1404-202:~$ sudo -s
ubuntu1404-202: # ssh salt-rsg-1604
salt-rsg-1604:# salt "our_minion_machine_FQDN" test.ping
- our minion machine
    True

If "test.ping" returns True, skip to the section "To begin temperature logging".
apt-get install openssh-server
$ ssh linux.cscf.uwaterloo.ca
ubuntu1404-202:$ sudo -s
root@ubuntu1404-202:# cd /root
root@ubuntu1404-202:/root# cd .ssh
root@ubuntu1404-202:/root/.ssh# scp id_dsa.pub cscf-adm@our-minion-machine:
root@ubuntu1404-202:/root/.ssh# scp id_ed25519.pub cscf-adm@our-minion-machine:

On the minion machine:
# cd /root/.ssh
# cat /home/cscf-adm/id_dsa.pub >> authorized_keys2
# cat /home/cscf-adm/id_ed25519.pub >> authorized_keys2
dpkg -l | grep curl
apt-get install curl
# curl -L https://bootstrap.saltstack.com -o install_salt.sh # sh install_salt.sh # ls /etc/salt minion minion.d minion_id pki proxy proxy.d # ls -l /etc/salt/minion -rw-r--r-- 1 root root 35305 Oct 4 12:02 /etc/salt/minion # cd /etc/salt /etc/salt# vi minion # head -2 minion master: salt-rsg-1604.cscf.uwaterloo.ca hash_type: sha256
# systemctl restart salt-minion
# /etc/init.d/salt-minion restart
# systemctl status salt-minion
salt-minion.service - The Salt Minion
   Loaded: loaded (/lib/systemd/system/salt-minion.service; enabled; vendor preset: enabled)
   Active: active (running) since Wed 2017-11-15 12:12:13 EST; 1min 42s ago
     Docs: man:salt-minion(1)
           file:///usr/share/doc/salt/html/contents.html
           https://docs.saltstack.com/en/latest/contents.html
 Main PID: 26985 (salt-minion)
   CGroup: /system.slice/salt-minion.service
           ├─26985 /usr/bin/python /usr/bin/salt-minion
           ├─26990 /usr/bin/python /usr/bin/salt-minion
           └─26994 /usr/bin/python /usr/bin/salt-minion

Nov 15 12:12:24 salt-minion[26985]: [ERROR ] The Salt Master has cached the public key for this node, this salt minion will wait for 10 seconds before attempting to re-authenticate

On the master machine:
$ ssh linux.cscf ubuntu1404-202:~$ sudo -s ubuntu1404-202: # ssh salt-rsg-1604 salt-rsg-1604:~# salt-key Accepted Keys: - lists all machines already connected to the master Denied Keys: Unaccepted Keys: - our minion machine will be listed here Rejected Keys:
salt-rsg-1604:# salt-key -a "our_minion_machine_FQDN" The following keys are going to be accepted: Unaccepted Keys: - our minion machine Proceed? [n/Y] Y Key for minionaccepted. salt-rsg-1604:# salt-key Accepted Keys: - our minion machine - lists all machines already connected to the master Denied Keys: Unaccepted Keys: Rejected Keys: salt-rsg-1604:# salt "our_minion_machine_FQDN" test.ping - our minion machine True
Note: Using "our_minion_machine_FQDN" as shown above may not work. In that case, use the short machine name instead and add the FQDN to the /etc/hosts file on salt-rsg-1604.
root@salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts# mkdir -p "our_minion_machine_FQDN"/temperature
root@salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts# echo '99' > "our_minion_machine_FQDN"/temperature/max

Run the next two commands:
salt-rsg-1604:/srv/saltstack/pillar# salt 'our_minion_machine_FQDN' state.apply common.monitoring.temperature state=True
salt-rsg-1604:/srv/saltstack/pillar# salt 'our_minion_machine_FQDN' state.apply common.monitoring.temperature

If the Supermicro minion uses an IPMI temperature sensor named "System" or "Sys", then SALT will automatically set its maximum temperature to 99 degrees.
salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts# mkdir "our_minion_machine_FQDN" salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts# cd "our_minion_machine_FQDN" salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts/"our_minion_machine_FQDN"# mkdir temperature salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts/"our_minion_machine_FQDN"# cd temperature salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts/"our_minion_machine_FQDN"/temperature# echo '45' > max Note 45 is an example temperature. Change it to your desired maximum. salt-rsg-1604:/srv/saltstack/pillar/file_tree/hosts/"our_minion_machine_FQDN"/temperature# cd ../../.. salt-rsg-1604:/srv/saltstack/pillar# cd common/temperature salt-rsg-1604:/srv/saltstack/pillar/common/temperature# ls ambient.sls default.sls pci.sls sys.sls system.sls inlet.sls
If the minion uses an IPMI temperature sensor named "System" or just "Sys" then SALT will automatically set its maximum temperature to 99 degrees.
This suggests that, if only temperature logging is desired, you should not create the /srv/saltstack/pillar/file_tree/hosts/"our_minion_machine_FQDN"/temperature/max file.
apt-get install ipmitool
apt-get install freeipmi-tools
modprobe ipmi_si type=kcs ports=0xCA2 regspacings=1
modprobe ipmi_devintf
modprobe ipmi_msghandler
modprobe ipmi_poweroff
modprobe ipmi_watchdog
Find the sensor output of ipmitool.
ipmitool sdr list
Make sure "ipmitool sdr list" reported some data.
From the output determine the system or ambient temperature of the machine.
ipmitool sdr list CPU1 Temp | 53 degrees C | ok CPU2 Temp | 54 degrees C | ok PCH Temp | 44 degrees C | ok System Temp | 36 degrees C | ok Peripheral Temp | 43 degrees C | ok MB_10G Temp | 58 degrees C | ok ...
salt-rsg-1604:/srv/saltstack/pillar/common/temperature# ls
ambient.sls default.sls pci.sls sys.sls system.sls inlet.sls

If the minion's temperature sensor name found by ipmitool is one of "ambient", "PCI", "Sys", "System", or "Inlet", then proceed to the next step at "Add the minion to the top.sls file".
salt-rsg-1604:/srv/saltstack/pillar/common/temperature# cat pci.sls temperature: sensor: 'PCI'
Add the minion to the top.sls file in salt-rsg-1604 at /srv/saltstack/pillar/
salt-rsg-1604:/srv/saltstack/pillar# cat top.sls
base:
  '*':
    - common
    - salt
    - temperature.default
  'our_minion_machine_FQDN':
    - temperature.pci    <- Note: "pci" is the temperature sensor name. Determine that name from the output of "ipmitool sdr list".
Run this SALT command to enable the temperature logging script.
salt-rsg-1604:/srv/saltstack/pillar/common/temperature# salt 'our_minion_machine_FQDN' state.apply common.monitoring.temperature state=True
salt-rsg-1604:/srv/saltstack/pillar/common/temperature# salt 'our_minion_machine_FQDN' state.apply common.monitoring.temperature
SALT will
*/2 * * * * /root/ipmitool-log-temperature sensor-name

where sensor-name is the name previously determined for the machine's temperature sensor.
/root/ipmitool-log-temperature
#!/bin/bash
# Gordon Boerke 2017-11-21 (RT#672618)
#
# Appends the current reading of one IPMI temperature sensor to
# /var/log/temperature-current-week and keeps four older weekly copies
# (/var/log/temperature-week1 .. temperature-week4, week4 being the newest).
#
# sensor is the ipmitool sensor reporting the room temperature.
# The script is executed as: ipmitool-log-temperature sensor-name
# One must determine the "sensor-name".
# Test first that the name selects the wanted line:
#   ipmitool sdr list | grep "the-sensor-name" | head -n 1
#
# Exit status: 0 on success, 1 if no numeric reading was found,
# 2 if the sensor name is missing.

readonly LOG_DIR=/var/log
readonly WEEK_SECONDS=604800

#######################################
# Print the integer reading of a sensor.
# The reading is taken from the second "|"-separated column of the first
# "ipmitool sdr list" line matching the name, so digits inside the sensor
# name (CPU1, MB_10G) and readings of any width are handled.
# Arguments: $1 - sensor name (grep pattern)
# Outputs:   the reading on stdout; nothing if no line or no number matched
#######################################
read_sensor_temp() {
  local sensor=$1
  ipmitool sdr list \
    | grep -- "$sensor" \
    | head -n 1 \
    | awk -F'|' 'match($2, /-?[0-9]+/) { print substr($2, RSTART, RLENGTH) }'
}

#######################################
# Print a file's modification time in epoch seconds, or 0 if it is missing.
# Arguments: $1 - file path
#######################################
file_mtime() {
  stat -c %Y -- "$1" 2>/dev/null || printf '0\n'
}

#######################################
# Append one reading to the current-week log and rotate the weekly logs
# when temperature-week4 is more than a week older than the new entry.
# A missing temperature-week4 counts as age 0, so the very first run
# rotates immediately and thereby creates the week4 baseline.
# Arguments: $1 - log directory
#            $2 - temperature reading
# Returns:   non-zero if the log line could not be written
#######################################
log_reading() {
  local dir=$1 temp=$2
  local current=$dir/temperature-current-week
  local old_time new_time week

  # Sampled before the append: this is the age baseline for the rotation.
  old_time=$(file_mtime "$dir/temperature-week4")

  printf '%s %s degrees\n' "$(date +'%Y-%m-%d %H:%M')" "$temp" \
    >> "$current" || return 1
  new_time=$(file_mtime "$current")

  if (( new_time - old_time > WEEK_SECONDS )); then
    # Shift week2->week1, week3->week2, week4->week3; skip files that do
    # not exist yet instead of letting cp complain from cron.
    for week in 1 2 3; do
      if [[ -f $dir/temperature-week$((week + 1)) ]]; then
        cp -- "$dir/temperature-week$((week + 1))" "$dir/temperature-week$week"
      fi
    done
    # Only empty the current log once its copy has been written.
    cp -- "$current" "$dir/temperature-week4" && : > "$current"
  fi
}

main() {
  local sensor=${1:-}
  local int_re='^-?[0-9]+$'
  local temp

  if [[ -z $sensor ]]; then
    printf 'Usage: %s sensor-name\n' "${0##*/}" >&2
    return 2
  fi

  temp=$(read_sensor_temp "$sensor")
  if [[ ! $temp =~ $int_re ]]; then
    printf '%s: no numeric reading for sensor "%s"\n' "${0##*/}" "$sensor" >&2
    return 1
  fi

  log_reading "$LOG_DIR" "$temp"
}

main "$@"
After reviewing the temperature logs, determine the normal operating temperature range. From that information, select a maximum system temperature at which to shut down the minion. This is a qualitative decision. The system temperature will follow the room temperature. A high temperature can be set to shut down the system or to alert users of excessive temperature.
SALT will
*/2 * * * * /root/ipmitool-max-temperature sensor-name maximum-temperature-allowed

where sensor-name is the name previously determined for the System or ambient temperature sensor, and maximum-temperature-allowed is the highest temperature allowed before shutdown.
/root/ipmitool-max-temperature
#!/bin/bash
# Gordon Boerke 2017-11-21 (RT#672618)
#
# sensor is the ipmitool sensor reporting the room temperature.
# temp_max is the temperature at which to shutdown the machine.
# warn_temp (temp_max - 5) is the temperature to report a warning of
# excessive heat build-up.
# The script is executed as:
#   ipmitool-max-temperature sensor-name maximum-temperature-allowed
# One must determine the "sensor-name" and a "maximum-temperature-allowed" value.
# Test first that the name selects the wanted line:
#   ipmitool sdr list | grep "the-sensor-name" | head -n 1
#
# "set -e" is deliberately not used: a failing notification or hook must
# not prevent the shutdown that follows it.
#
# Exit status: 1 if no numeric reading was found, 2 on bad arguments.

readonly WARN_MARGIN=5

#######################################
# Print the integer reading of a sensor.
# The reading is taken from the second "|"-separated column of the first
# "ipmitool sdr list" line matching the name, so digits inside the sensor
# name (CPU1, MB_10G) and readings of any width are handled.
# Arguments: $1 - sensor name (grep pattern)
# Outputs:   the reading on stdout; nothing if no line or no number matched
#######################################
read_sensor_temp() {
  local sensor=$1
  ipmitool sdr list \
    | grep -- "$sensor" \
    | head -n 1 \
    | awk -F'|' 'match($2, /-?[0-9]+/) { print substr($2, RSTART, RLENGTH) }'
}

#######################################
# Classify a reading against the maximum.
# Arguments: $1 - reading, $2 - maximum temperature allowed (integers)
# Outputs:   "shutdown" if reading > max,
#            "warn"     if reading > max - WARN_MARGIN,
#            "ok"       otherwise
#######################################
temperature_state() {
  local temp=$1 temp_max=$2
  if (( temp > temp_max )); then
    printf 'shutdown\n'
  elif (( temp > temp_max - WARN_MARGIN )); then
    printf 'warn\n'
  else
    printf 'ok\n'
  fi
}

#######################################
# Send a message to all logged-in users.
# Both output streams of wall are discarded because the script runs from
# cron ("> /dev/null" must come before "2>&1" for stderr to follow it).
# Arguments: $1 - message text
#######################################
broadcast() {
  printf '%s\n' "$1" | wall -n > /dev/null 2>&1
}

#######################################
# Run an optional hook script if the file exists.
# Arguments: $1 - path of the hook script
#######################################
run_hook() {
  [[ -f $1 ]] || return 0
  "$1"
}

main() {
  local sensor=${1:-} temp_max=${2:-}
  local int_re='^-?[0-9]+$'
  local temp

  if [[ -z $sensor || ! $temp_max =~ $int_re ]]; then
    printf 'Usage: %s sensor-name maximum-temperature-allowed\n' "${0##*/}" >&2
    return 2
  fi

  temp=$(read_sensor_temp "$sensor")
  if [[ ! $temp =~ $int_re ]]; then
    # Without a reading no decision can be made; report it and do nothing.
    printf '%s: no numeric reading for sensor "%s"\n' "${0##*/}" "$sensor" >&2
    return 1
  fi

  case "$(temperature_state "$temp" "$temp_max")" in
    shutdown)
      broadcast "System temperature is at $temp which is above the max $temp_max"
      broadcast "Shutting the system down now"
      run_hook /root/external-shutdown
      /sbin/shutdown -h now
      ;;
    warn)
      broadcast "System temperature is at $temp which is close to the max $temp_max"
      broadcast "The machine will shutdown once the temperature exceeds $temp_max"
      run_hook /root/external-shutdown2
      ;;
  esac
}

main "$@"
The external-shutdown script will send a shutdown command via SSH to other machines. Typically these machines cannot be configured with SALT.
The script will be placed in the same location as the ipmitool-max-temperature and ipmitool-log-temperature scripts under /root.
SSH keys must be added to the machines.
/root/external-shutdown
#!/bin/bash
# Gordon Boerke 2017-12-18 (RT#672618)
#
# This script is called from another script, ipmitool-max-temperature.
# This script will send shutdown commands to other machines.
# Other machines to be determined. They must have this machine's SSH key.
#
# Report a machine shutdown first:
#
# echo "Machine xxx.cs.uwaterloo.ca is shutting down now" | wall -n 2>&1 > /dev/null
# ssh xxx.cs.uwaterloo.ca 'shutdown -h now'   # or whatever the shutdown command may be for the system
#
# In the "echo" above include the "| wall -n 2>&1 > /dev/null" as the
# ipmitool-max-temperature script is run from cron.
/root/external-shutdown2
This will be the same as /root/external-shutdown. This second script will run at 5 degrees below the maximum level.
New flag file: /root/ipmitool-email-temperature. Use a value of 0 in the file to represent the machine running at normal temperature. Set the value to 1 when the machine temperature hits or exceeds the warning temperature and an email has been sent.
/root/ipmitool-max-temperature
#!/bin/bash
# Gordon Boerke 2017-11-21 (RT#672618)
#
# sensor is the ipmitool sensor reporting the room temperature.
# temp_max is the temperature at which to shutdown the machine.
# warn_temp (temp_max - 5) is the temperature to report a warning of
# excessive heat build-up.
# The script is executed as:
#   ipmitool-max-temperature sensor-name maximum-temperature-allowed
# One must determine the "sensor-name" and a "maximum-temperature-allowed" value.
# Test first that the name selects the wanted line:
#   ipmitool sdr list | grep "the-sensor-name" | head -n 1
#
# The file /root/ipmitool-email-temperature holds the email flag:
#   0 = normal temperature, 1 = a warning email has been sent.
#
# "set -e" is deliberately not used: a failing notification, email or hook
# must not prevent the shutdown that follows it.
#
# Exit status: 1 if no numeric reading was found, 2 on bad arguments.

readonly WARN_MARGIN=5
readonly EMAIL_FLAG_FILE=/root/ipmitool-email-temperature
readonly EMAIL_DOMAIN=@cs.uwaterloo.ca
readonly EMAIL_TO=gboerke@uwaterloo.ca
readonly SMTP_SERVER=connect.uwaterloo.ca:25

#######################################
# Print the integer reading of a sensor.
# The reading is taken from the second "|"-separated column of the first
# "ipmitool sdr list" line matching the name, so digits inside the sensor
# name (CPU1, MB_10G) and readings of any width are handled.
# Arguments: $1 - sensor name (grep pattern)
# Outputs:   the reading on stdout; nothing if no line or no number matched
#######################################
read_sensor_temp() {
  local sensor=$1
  ipmitool sdr list \
    | grep -- "$sensor" \
    | head -n 1 \
    | awk -F'|' 'match($2, /-?[0-9]+/) { print substr($2, RSTART, RLENGTH) }'
}

#######################################
# Classify a reading against the maximum.
# Arguments: $1 - reading, $2 - maximum temperature allowed (integers)
# Outputs:   "shutdown" if reading > max,
#            "warn"     if reading > max - WARN_MARGIN,
#            "ok"       otherwise
#######################################
temperature_state() {
  local temp=$1 temp_max=$2
  if (( temp > temp_max )); then
    printf 'shutdown\n'
  elif (( temp > temp_max - WARN_MARGIN )); then
    printf 'warn\n'
  else
    printf 'ok\n'
  fi
}

#######################################
# Print the email flag stored on the last line of a flag file.
# A missing, unreadable or empty file, or any value other than 1, is
# reported as 0 so that a fresh machine still sends its first warning.
# Arguments: $1 - flag file path
# Outputs:   "0" or "1"
#######################################
read_email_flag() {
  local flag_file=$1
  local line flag=0
  if [[ -r $flag_file ]]; then
    # "|| [[ -n $line ]]" keeps a last line that has no trailing newline.
    while read -r line || [[ -n $line ]]; do
      flag=$line
    done < "$flag_file"
  fi
  [[ $flag == 1 ]] || flag=0
  printf '%s\n' "$flag"
}

#######################################
# Send a message to all logged-in users.
# Both output streams of wall are discarded because the script runs from
# cron ("> /dev/null" must come before "2>&1" for stderr to follow it).
# Arguments: $1 - message text
#######################################
broadcast() {
  printf '%s\n' "$1" | wall -n > /dev/null 2>&1
}

#######################################
# Run an optional hook script if the file exists.
# Arguments: $1 - path of the hook script
#######################################
run_hook() {
  [[ -f $1 ]] || return 0
  "$1"
}

#######################################
# Send a notification email from <host>@cs.uwaterloo.ca.
# Arguments: $1 - host name, $2 - subject, $3 - message body
# Returns:   exit status of sendemail
#######################################
send_email() {
  local host=$1 subject=$2 body=$3
  /usr/bin/sendemail -u "$subject" -m "$body" \
    -f "${host}${EMAIL_DOMAIN}" -t "$EMAIL_TO" -s "$SMTP_SERVER"
}

main() {
  local sensor=${1:-} temp_max=${2:-}
  local int_re='^-?[0-9]+$'
  local temp host

  if [[ -z $sensor || ! $temp_max =~ $int_re ]]; then
    printf 'Usage: %s sensor-name maximum-temperature-allowed\n' "${0##*/}" >&2
    return 2
  fi

  temp=$(read_sensor_temp "$sensor")
  if [[ ! $temp =~ $int_re ]]; then
    # Without a reading no decision can be made; report it and do nothing.
    printf '%s: no numeric reading for sensor "%s"\n' "${0##*/}" "$sensor" >&2
    return 1
  fi

  host=$(hostname)

  case "$(temperature_state "$temp" "$temp_max")" in
    shutdown)
      broadcast "System temperature is at $temp which is above the max $temp_max"
      broadcast "Shutting the system down now"
      # Have the email flag set back to 0, ready for when the machine is
      # turned back on.
      echo "0" > "$EMAIL_FLAG_FILE"
      # Send the shutdown to other machines.
      run_hook /root/external-shutdown
      /sbin/shutdown -h now
      ;;
    warn)
      broadcast "System temperature is at $temp which is close to the max $temp_max"
      broadcast "The machine will shutdown once the temperature exceeds $temp_max"
      run_hook /root/external-shutdown2
      # Send the warning email once; the flag is set only after a
      # successful send so a failed attempt is retried on the next run.
      if [[ $(read_email_flag "$EMAIL_FLAG_FILE") == 0 ]]; then
        if send_email "$host" "System is overheating" \
          "Warning that $host is reaching its shutdown temperature."; then
          echo "1" > "$EMAIL_FLAG_FILE"
        fi
      fi
      ;;
    ok)
      # Back below the warning range: send the all-clear if a warning had
      # been sent, then reset the flag.
      if [[ $(read_email_flag "$EMAIL_FLAG_FILE") == 1 ]]; then
        if send_email "$host" "System is within acceptable temperature range" \
          "$host is operating below its shutdown temperature."; then
          echo "0" > "$EMAIL_FLAG_FILE"
        fi
      fi
      ;;
  esac
}

main "$@"
apt-get install ipmitool
apt-get install freeipmi-tools
modprobe ipmi_si type=kcs ports=0xCA2 regspacings=1
modprobe ipmi_devintf
modprobe ipmi_msghandler
modprobe ipmi_poweroff
modprobe ipmi_watchdog
ipmitool sdr list

Make sure "ipmitool sdr list" reported some data.
ipmitool sdr list CPU1 Temp | 53 degrees C | ok CPU2 Temp | 54 degrees C | ok PCH Temp | 44 degrees C | ok System Temp | 36 degrees C | ok Peripheral Temp | 43 degrees C | ok MB_10G Temp | 58 degrees C | ok ...
Typically Supermicro motherboards report System temperature, which is the default for our SALT configuration.
Other accepted temperature sensor key words are:
*ambient* for some Sun machines.
*PCI* Area Temp.
*Sys* for some older Supermicro machines.
*Inlet* for Huawei machines.

Choose a sensor whose reading fluctuates with the room temperature.
-- GordBoerke - 2017-12-04