pflag 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #!/usr/bin/env bash
  2. #
  3. #(c) 2004-present, Facebook, all rights reserved.
  4. # See the LICENSE file for usage and distribution rights.
  5. #
  # Abort cleanly on HUP, INT, QUIT and TERM. The message goes to stderr and the
  # exit status is non-zero so callers can tell an interrupted run from a
  # normal one (a bare `exit` here would report the status of `echo`, i.e. 0).
  trap 'echo "Caught exception, dying" >&2; exit 1' 1 2 3 15

  # Name this script was invoked as (used in usage/config output).
  ME=${0##*/}
  # Host we are running on; fall back to `uname -n` where hostname(1) is absent.
  SERVER=$(hostname 2>/dev/null || uname -n)

  #parameters used
  #
  Dump_Config=0 # 1 => print the configuration and exit (-d)
  DEBUG=        # non-empty => verbose() prints its arguments (-v)
  OS=$(uname -s) # looked up via PATH rather than a hard-coded /bin/uname
  VMEM=         # ps column for virtual memory, chosen by oscheck
  RSS=          # ps column for resident memory, chosen by oscheck
  CPU=          # ps column for CPU time, chosen by oscheck
  VERBOSE=
  PID=          # process to monitor (-p); reset like every other setting
  VAR=          # metric to monitor (-x)
  LIMIT=        # threshold for the metric (-l)
  ACTION=       # what to do on a breach (-a)
  N=            # number of cycles (-n)
  WAIT=         # seconds between cycles (-w)
  #
  # oscheck: pick the ps(1) column names for the current platform.
  # Reads:  OS
  # Writes: VMEM, RSS, CPU (only on a supported platform)
  # Only Linux is supported for now; any other value of OS is reported
  # through die.
  #
  oscheck() {
    if [ "${OS}" = "Linux" ]; then
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
    else
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
    fi
  }
  # verbose MESSAGE...
  # Print MESSAGE on stderr, but only when DEBUG is non-empty (set by -v).
  # Always succeeds when DEBUG is empty.
  verbose() {
    [ -z "$DEBUG" ] && return 0
    echo "$@" 1>&2
  }
  # warn MESSAGE...
  # Print MESSAGE on stderr unconditionally.
  warn() {
    1>&2 echo "$@"
  }
  # die MESSAGE...
  # Print "ERROR: " followed by MESSAGE on stderr and terminate the script.
  # Exits with status 1: a bare `exit` would propagate the status of the
  # preceding echo (0), making a fatal error look like success to callers.
  die() {
    echo "ERROR: " "$@" >&2
    exit 1
  }
  # dump_config: print the effective configuration for this run on stdout.
  # Reads: ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N, ACTION.
  dump_config() {
    local stamp
    stamp=$(date)
    printf '%s running on %s at %s\n' "$ME" "${HOSTNAME}" "$stamp"
    printf '%s\n' \
      "Configuration for this run:" \
      "PID to monitor : ${PID}" \
      "Resource monitored : ${VAR}" \
      "Resource limit : ${LIMIT}" \
      "Check every : ${WAIT} seconds" \
      "No. of times run : ${N}" \
      "What to do : ${ACTION}"
  }
  # usage [MESSAGE...]
  # Print the optional MESSAGE followed by the help text, then exit.
  #   - no MESSAGE (plain help request): text on stdout, exit status 0
  #   - with MESSAGE (bad invocation):   text on stderr, exit status 2
  # Reads: ME.
  usage() {
    local rc=0 fd=1
    if [ "$#" -gt 0 ]; then
      # Called because of an error: do not report success to the caller.
      rc=2
      fd=2
    fi
    cat >&"$fd" <<USAGE
$*
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait] [-d] [-v] [-h]
Monitor a process for set of violations. Options:
-p: PID of process to monitor
-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB
-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:
warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the monitored process and exit
die: if usage exceeds threshold, die immediately
-n: number of cycles to monitor. Default is to monitor until PID no longer exists.
-w: wait time per cycle of monitoring. Default is 5 seconds.
-d: dump the configuration for this run and exit
-v: verbose messaging
-h: show this help and exit
USAGE
    exit "$rc"
  }
  # set_defaults_if_noopt_given: give every setting that is still unset or
  # empty its default value; settings that already hold a value are kept.
  #   VAR=vsz  LIMIT=1024000  WAIT=5  N=999999  ACTION=warn
  set_defaults_if_noopt_given() {
    [ -n "${VAR}" ] || VAR=vsz
    [ -n "${LIMIT}" ] || LIMIT=1024000
    [ -n "${WAIT}" ] || WAIT=5
    [ -n "${N}" ] || N=999999
    [ -n "${ACTION}" ] || ACTION=warn
  }
  # validate_options: reject a configuration the monitor cannot run with.
  # Reads: Dump_Config, PID, ACTION, N, WAIT.
  # Every failure is reported through usage with an explanatory message.
  # When only a configuration dump was requested (Dump_Config=1) nothing is
  # checked, so the dump shows whatever was supplied.
  # Empty ACTION/N/WAIT are accepted here; they are meant to be filled in by
  # the defaults.
  validate_options() {
    local number_re='^[0-9]+([.][0-9]+)?$'

    if [ "${Dump_Config:-0}" -eq 1 ]; then
      return 0
    fi

    if [ -z "$PID" ]; then
      usage "PID is mandatory"
    fi
    case "$PID" in
      *[!0-9]*) usage "PID must be a number, got '${PID}'" ;;
    esac

    # An unknown action would otherwise only surface once the limit is hit.
    case "${ACTION:-warn}" in
      warn | die | kill) ;;
      *) usage "Unsupported action '${ACTION}' (expected warn, die or kill)" ;;
    esac

    case "${N:-1}" in
      *[!0-9]*) usage "Number of cycles must be a whole number, got '${N}'" ;;
    esac

    if ! [[ "${WAIT:-5}" =~ $number_re ]]; then
      usage "Wait time must be a number of seconds, got '${WAIT}'"
    fi

    return 0
  }
  ###### START
  # parse_options ARG...
  # Translate the command-line flags into the global settings
  # (Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT, N).
  #   - -w is now part of the option string; it used to be handled in the
  #     case statement but was missing from the getopts spec, so it was
  #     always rejected as an unknown option.
  #   - -t stays accepted and ignored: the previous option string listed it
  #     without ever acting on it.
  #   - a missing argument or an unknown flag is reported by name via usage.
  parse_options() {
    local opt
    OPTIND=1 # allow the function to be called more than once
    while getopts ":p:x:l:a:n:w:t:vhd" opt; do
      case "$opt" in
        d) Dump_Config=1 ;;
        h) usage ;;
        a) ACTION=${OPTARG} ;;
        v) DEBUG=1 ;;
        p) PID=${OPTARG} ;;
        x) VAR=${OPTARG} ;;
        l) LIMIT=${OPTARG} ;;
        w) WAIT=${OPTARG} ;;
        n) N=${OPTARG} ;;
        t) : ;; # legacy flag, value deliberately unused
        :) usage "Option -${OPTARG} requires an argument" ;;
        \?) usage "Unknown option -${OPTARG}" ;;
      esac
    done
  }

  parse_options "$@"
  # Start-up sequence: select the platform's ps columns, fill in defaults for
  # anything not given on the command line, then check the result.
  oscheck
  set_defaults_if_noopt_given
  validate_options

  # With -d we only show the effective configuration and stop.
  [ "$Dump_Config" -eq 1 ] && {
    dump_config
    exit
  }
  # _ps_column METRIC
  # Map a metric name as documented for -x (VMEM, RSS, CPU) to the ps column
  # held in the VMEM/RSS/CPU variables, falling back to the Linux column names
  # when those are empty. Any other name is passed through unchanged so raw
  # ps column names keep working.
  _ps_column() {
    case "$1" in
      VMEM | vmem) printf '%s\n' "${VMEM:-vsz}" ;;
      RSS | rss) printf '%s\n' "${RSS:-rss}" ;;
      CPU | cpu) printf '%s\n' "${CPU:-bsdtime}" ;;
      *) printf '%s\n' "$1" ;;
    esac
  }

  # _to_comparable VALUE
  # Turn a ps reading or a user-supplied limit into a plain integer:
  #   "2048"  -> 2048      "100m" -> 100 * 1024    "1.5g" -> 1.5 * 1024 * 1024
  #   "5:04"  -> 304       (colon-separated fields, each worth 60 of the next)
  # Prints nothing for an empty VALUE; returns 1 for anything else it cannot
  # interpret.
  _to_comparable() {
    awk -v v="$1" 'BEGIN {
      gsub(/[ \t\r\n]/, "", v)
      if (v == "") exit 0
      if (v ~ /^[0-9]+(:[0-9]+)+$/) {
        k = split(v, part, ":")
        total = 0
        for (i = 1; i <= k; i++) total = total * 60 + part[i]
        printf "%.0f\n", total
        exit 0
      }
      if (v !~ /^([0-9]+[.]?[0-9]*|[.][0-9]+)[mMgG]?$/) exit 1
      scale = 1
      if (v ~ /[mM]$/) scale = 1024
      if (v ~ /[gG]$/) scale = 1024 * 1024
      printf "%.0f\n", int((v + 0) * scale)
    }'
  }

  # monitor_process: sample VAR for process PID every WAIT seconds and apply
  # ACTION once the reading reaches LIMIT.
  # Reads: PID, VAR, LIMIT, ACTION, N, WAIT, SERVER.
  #   - stops after N cycles when N is a whole number (previously N was only
  #     printed and never counted)
  #   - sleeps between cycles on the warn path too (previously a breach with
  #     ACTION=warn looped without any pause)
  #   - the process counts as gone only when ps prints nothing; a reading of
  #     0 (e.g. CPU time 0:00) is a valid sample
  #   - warn: complain and keep monitoring
  #     kill: complain, send TERM (QUIT as fallback) and exit
  #     die:  print the process details and exit with status 255
  monitor_process() {
    local column limit reading value cycles=0

    if [ -z "$PID" ]; then
      echo "ERROR:  no PID to monitor" >&2
      return 1
    fi

    column=$(_ps_column "$VAR")
    if ! limit=$(_to_comparable "$LIMIT") || [ -z "$limit" ]; then
      die "Invalid limit '${LIMIT}'"
      return 1
    fi

    verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

    while :; do
      # "column=" asks ps for that single column without a header line.
      reading=$(ps -p "$PID" -o "${column}=")
      reading=${reading//[[:space:]]/}
      if [ -z "$reading" ]; then
        warn "Process $PID ended without incident."
        return 0
      fi
      if ! value=$(_to_comparable "$reading"); then
        die "Cannot interpret '${reading}' reported by ps for ${VAR}"
        return 1
      fi

      if [ "$value" -ge "$limit" ]; then
        case "$ACTION" in
          kill)
            warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${value}; killing PID ${PID}"
            kill "$PID" || kill -3 "$PID"
            exit
            ;;
          warn)
            warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${value}"
            ;;
          die)
            warn "WARNING: dying without killing process ${PID} on ${SERVER}"
            warn "The process details are below: "
            warn "$(ps -p "$PID" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
            warn ""
            #should we send email/notify someone? TODO... for now, bail.
            # 255 is the status the former `exit -1` produced in bash.
            exit 255
            ;;
          *)
            warn "ERROR: unsupported action '${ACTION}', giving up on PID ${PID}"
            exit 1
            ;;
        esac
      else
        echo "Value of '${VAR}' (${value}) is less than ${LIMIT} for PID ${PID}"
      fi

      cycles=$((cycles + 1))
      case "$N" in
        '' | *[!0-9]*) ;; # no usable cycle count: run until the process ends
        *)
          if [ "$cycles" -ge "$N" ]; then
            verbose "Stopping after ${cycles} cycles"
            return 0
          fi
          ;;
      esac
      sleep "$WAIT"
    done
  }

  monitor_process