| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217 |
- #!/usr/bin/env bash
- #
- #(c) 2004-present, Facebook, all rights reserved.
- # See the LICENSE file for usage and distribution rights.
- #
# On HUP, INT, QUIT or TERM: report it and stop with an explicit failure
# status, so callers can tell an interrupted run from a clean one.
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

# Script name (used in usage/config output) and the host we are running on.
ME=$(basename -- "$0")
SERVER=$(hostname)

#
# Parameters used. Everything except Dump_Config and OS starts out empty and is
# filled in by option parsing, oscheck and set_defaults_if_noopt_given.
#
Dump_Config=0   # 1 when -d was given: print the configuration and exit
DEBUG=          # non-empty enables verbose()
OS=$(/bin/uname -s)
PID=            # cleared so a PID inherited from the environment cannot
                # satisfy the "PID is mandatory" check
VMEM=           # ps column names per metric, set by oscheck
RSS=
CPU=
VERBOSE=
VAR=            # metric to monitor (-x)
LIMIT=          # threshold for the metric (-l)
ACTION=         # what to do on breach (-a)
N=              # number of monitoring cycles (-n)
WAIT=           # seconds between cycles (-w)
- #
# Pick the ps(1) column name for each metric on the current OS.
# Globals:  OS (read); VMEM, RSS, CPU (written)
# Only Linux is handled for now; any other OS is reported through die().
#
oscheck() {
  if [ "${OS}" = "Linux" ]; then
    VMEM=vsz
    RSS=rss
    CPU=bsdtime
    return
  fi
  die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
}
# Print the arguments on stderr, but only when DEBUG is non-empty (set by -v).
verbose() {
  [ -z "${DEBUG}" ] || echo "$@" >&2
}
# Print the arguments on stderr unconditionally.
warn() {
  >&2 echo "$@"
}
# Report a fatal error on stderr and terminate the script.
# Arguments: the message words, echoed after an "ERROR: " prefix.
# Exits with status 1. A bare `exit` here would reuse the status of the
# preceding echo, i.e. report success for a fatal error.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
# Print the effective configuration of this run on stdout.
# Globals (read): ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N, ACTION
dump_config() {
  printf '%s\n' \
    "$ME running on ${HOSTNAME} at $(date)" \
    "Configuration for this run:" \
    "PID to monitor : ${PID}" \
    "Resource monitored : ${VAR}" \
    "Resource limit : ${LIMIT}" \
    "Check every : ${WAIT} seconds" \
    "No. of times run : ${N}" \
    "What to do : ${ACTION}"
}
# Print the help text and exit.
# Arguments: optional error message, shown on the first line of the output.
# With a message the text goes to stderr and the exit status is 2, so a bad
# invocation is not mistaken for success; without one (plain -h) the text
# goes to stdout and the exit status is 0.
usage() {
  local status=0 fd=1
  if [ "$#" -gt 0 ]; then
    status=2
    fd=2
  fi
  cat <<USAGE >&"${fd}"
$*
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
Monitor a process for set of violations. Options:
-p: PID of process to monitor
-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB
-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:
warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately
-n: number of cycles to monitor. Default is to monitor until PID no longer exists.
-w: wait time per cycle of monitoring. Default is 5 seconds.
-d: print the configuration for this run and exit
-v: verbose messaging
USAGE
  exit "${status}"
}
# Fill in a default for every setting that is still unset or empty after
# option parsing; values that were supplied are left untouched.
# Globals (written): VAR, LIMIT, WAIT, N, ACTION
set_defaults_if_noopt_given() {
  VAR=${VAR:-vsz}
  LIMIT=${LIMIT:-1024000}
  WAIT=${WAIT:-5}
  N=${N:-999999}
  ACTION=${ACTION:-warn}
}
# Insist on a PID unless we were only asked to dump the configuration.
# Globals (read): PID, Dump_Config
# Calls usage "PID is mandatory" when the PID is missing.
validate_options() {
  [ -n "${PID}" ] && return 0
  if [ "${Dump_Config}" -ne 1 ]; then
    usage "PID is mandatory"
  fi
}
###### START

# Convert a metric reading or a user-supplied limit to a plain integer so the
# two can be compared with the shell's integer operators.
#   "M:SS"  (ps bsdtime)  -> total seconds
#   "<n>m"  / "<n>g"      -> <n> * 1024 / <n> * 1024 * 1024 (fractions allowed)
#   anything else         -> returned with blanks removed
# Arguments: $1 - the raw value
# Outputs:   the converted value on stdout; nothing for an empty value
normalize_value() {
  printf '%s\n' "$1" | awk '
    { gsub(/[ \t]/, "") }
    $0 == "" { next }
    /^[0-9]+:[0-9]+$/ { split($0, t, ":"); printf "%.0f\n", t[1] * 60 + t[2]; next }
    /^[0-9.]+[mM]$/ { printf "%.0f\n", substr($0, 1, length($0) - 1) * 1024; next }
    /^[0-9.]+[gG]$/ { printf "%.0f\n", substr($0, 1, length($0) - 1) * 1024 * 1024; next }
    { print }'
}

# Parse the command line. The leading ':' selects getopts' silent mode, in
# which a missing argument is reported as ':' and an unknown option as '?'.
# The option string now declares 'w:' (it used to say 't:', which made the
# documented -w option unusable).
while getopts ":p:x:l:a:n:w:vhd" opt; do
  case "${opt}" in
    d) Dump_Config=1 ;;
    h) usage ;;
    a) ACTION=${OPTARG} ;;
    v) DEBUG=1 ;;
    p) PID=${OPTARG} ;;
    x) VAR=${OPTARG} ;;
    l) LIMIT=${OPTARG} ;;
    w) WAIT=${OPTARG} ;;
    n) N=${OPTARG} ;;
    :) usage "Option -${OPTARG} requires an argument" ;;
    \?) usage "Unknown option -${OPTARG}" ;;
  esac
done

oscheck
set_defaults_if_noopt_given
validate_options

if [ "${Dump_Config}" -eq 1 ]; then
  dump_config
  exit
fi

# Translate the metric names advertised by usage into the ps column names
# chosen by oscheck; any other value is handed to ps unchanged.
case "${VAR}" in
  VMEM) VAR=${VMEM} ;;
  RSS) VAR=${RSS} ;;
  CPU) VAR=${CPU} ;;
esac

# Reject an unknown action up front instead of silently ending the monitor
# loop the first time the limit is breached.
case "${ACTION}" in
  warn | die | kill) ;;
  *) usage "Unknown action '${ACTION}'" ;;
esac

case "${N}" in
  '' | *[!0-9]*) usage "Cycle count (-n) must be a non-negative integer, got '${N}'" ;;
esac

# Bring the limit into the same unit as the ps reading ("100m", "1.5g", "5:04").
raw_limit=${LIMIT}
LIMIT=$(normalize_value "${raw_limit}")
case "${LIMIT}" in
  '' | *[!0-9]*) usage "Cannot interpret limit (-l) '${raw_limit}'" ;;
esac

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

cycles=0
while [ "${cycles}" -lt "${N}" ]; do
  cycles=$((cycles + 1))

  # 'h' suppresses the ps header, so the output is just the value, or
  # nothing at all once the process is gone.
  VAL=$(normalize_value "$(/bin/ps h -p "${PID}" -o "${VAR}")")

  case "${VAL}" in
    '')
      warn "Process $PID ended without incident."
      break
      ;;
    *[!0-9]*)
      die "Value '${VAL}' of '${VAR}' is not numeric; cannot compare it with ${LIMIT}"
      ;;
  esac

  # A memory reading of 0 is treated as "process gone", as before. CPU time
  # is exempt because 0:00 is a legitimate reading for a live process.
  if [ "${VAR}" != "${CPU}" ] && [ "${VAL}" -eq 0 ]; then
    warn "Process $PID ended without incident."
    break
  fi

  if [ "${VAL}" -lt "${LIMIT}" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "${WAIT}"
    continue
  fi

  # Limit reached or exceeded: carry out the configured action.
  case "${ACTION}" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;
    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # Keep monitoring, but wait a cycle first; without the pause this
      # branch spins and floods stderr while the value stays high.
      sleep "${WAIT}"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""
      #should we send email/notify someone? TODO... for now, bail.
      exit 1
      ;;
  esac
done
|