#! /bin/bash

. /usr/src/m2/framework/bash_functions.sh

# http://trac.kolmisoft.com/trac/ticket/15476
# Based on es_sync_check.sh
#
# Check last es_sync_check_period (default 31) days, if mismatch(s) found, resync only mismatched days
# Check again, if problem still present --> support@kolmisoft.com
# More information: http://doc.kolmisoft.com/display/kolmisoft/ES+Sync+Control+script
#
# Helpers used below but not defined in this file (presumably provided by the
# sourced bash_functions.sh - confirm there): date_time, get_confline,
# get_es_count, get_process_running_time, retrieve_email_settings,
# retrieve_ip_settings, set_database_variables.

# Refuse to start when another copy of this script is already running
running_pids=$(/sbin/pidof -x "$(basename "$0")" -o %PPID)
if [[ -n "$running_pids" ]]; then
    echo "$(date_time) [WARNING] $(basename "$0") script is already running with PID $running_pids"
    exit 0
fi

if ! /bin/ps auxf | grep -F java | grep -Fq elasticsearch; then
    echo "$(date_time) [NOTICE] ES is not runnnig. Exiting."
    exit 0
fi

RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

# Number of mismatched days resync_bad_days() gave up monitoring
GLOBAL_SKIP_FLAG=0

# /tmp/es_curl_resp file is created by elasticsearch resync command
# So file modification date shows when last full resync has been started
if [[ -e /tmp/es_curl_resp ]]; then
    last_resync_date=$(date -r /tmp/es_curl_resp "+%Y-%m-%d %H:%M:%S")
    time_to_check=$(TZ='UTC0' date -d "$last_resync_date UTC +12 hours" +'%Y-%m-%d %H:%M:%S')
    if [[ "$(date_time)" < "$time_to_check" ]]; then
        echo -e "$(date_time) [WARNING] Full resync has been initiated less than 12 hours ago (at ${GREEN}$last_resync_date${NC})."
        echo "$(date_time) [WARNING] Will not check sync status as resync could be still running"
        exit 0
    fi
fi

bad_days="0"
temp_file="/tmp/es_mismatch_days"
log="/var/log/m2/es_sync_control.log"
config="/etc/m2/system.conf"
script_start_time=$(date_time)
# Not read anywhere in this file; kept because sourced helpers may rely on it
es_cluster_name=$(curl -s -XGET "localhost:9200" | jq -r '.cluster_name')
time_start="00:00:00"
time_end="23:59:59"

# Prints the value(s) of every line in $config that matches the given key.
# Spaces are stripped first; the key is used as an awk regex, so any line
# containing it matches.
#   $1 - setting name
es_sync_read_config() {
    sed 's/ //g' "$config" | awk -F"=" -v key="$1" '$0 ~ key {print $2}'
}

# Initialization
es_sync_control=$(es_sync_read_config es_sync_control)
es_sync_send_email=$(es_sync_read_config es_sync_send_email)
es_sync_email=$(es_sync_read_config es_sync_email)
es_sync_check_period=$(es_sync_read_config es_sync_check_period)
es_sync_interfering_conflines_check=$(es_sync_read_config es_sync_interfering_conflines_check)
server_id=$(es_sync_read_config server_id)

if [[ -z "$es_sync_email" ]]; then
    es_sync_email="support@kolmisoft.com"
fi

if [[ -z $es_sync_control ]] || (( es_sync_control != 1 )); then
    echo "$(date_time) [NOTICE] ES sync control script is not enabled. Exiting"
    exit 0
fi

if [[ -z $es_sync_check_period ]] || (( es_sync_check_period < 1 )); then
    es_sync_check_period=31
fi

# The conflines check is on unless it is explicitly set to 0
if [[ -n $es_sync_interfering_conflines_check ]] && (( es_sync_interfering_conflines_check == 0 )); then
    es_sync_interfering_conflines_check=0
else
    es_sync_interfering_conflines_check=1
fi

# Shrinks es_sync_check_period when one of the listed conflines has a
# non-zero value smaller than it; the new period is that value minus one.
# Globals: es_sync_check_period (read/written), min_confline_value (written)
check_interfering_settings() {
    min_confline_value="$es_sync_check_period"
    local min_confline_name=""
    local confline_name confline_value

    for confline_name in Move_to_old_calls_older_than Delete_not_Archived_not_Answered_Calls_older_than; do
        confline_value=$(get_confline "$confline_name")
        if [[ -n $confline_value ]] && ((confline_value != 0)) && ((confline_value < min_confline_value)); then
            min_confline_value="$confline_value"
            min_confline_name="$confline_name"
        fi
    done

    if ((min_confline_value < es_sync_check_period)); then
        echo -e "$(date_time) [WARNING] ${BLUE}es_sync_check_period${NC} set to ${GREEN}$es_sync_check_period${NC}, but ${BLUE}$min_confline_name${NC} is set to ${RED}$min_confline_value${NC}"
        echo -e "$(date_time) [NOTICE] this setting can interfere with ES sync check"
        echo -e "$(date_time) [NOTICE] Setting ${BLUE}es_sync_check_period${NC} to to ${GREEN}$((min_confline_value - 1))${NC}"
        es_sync_check_period=$((min_confline_value - 1))
    fi
}

# Prints the effective settings read from $config
print_settings() {
    echo -e "$(date_time) [NOTICE] ES sync control initial settings:"
    echo -e "${BLUE}es_sync_control:${NC} ${GREEN}$es_sync_control${NC}"
    echo -e "${BLUE}es_sync_send_email:${NC} ${GREEN}$es_sync_send_email${NC}"
    echo -e "${BLUE}es_sync_email:${NC} ${GREEN}$es_sync_email${NC}"
    echo -e "${BLUE}es_sync_check_period:${NC} ${GREEN}$es_sync_check_period${NC}"
    echo -e "${BLUE}es_sync_interfering_conflines_check:${NC} ${GREEN}$es_sync_interfering_conflines_check${NC}"
}

# Sends the "ES sync issues" notification to $es_sync_email via
# /usr/local/m2/sendEmail and reports whether sending succeeded.
# SMTP authentication is used when Email_Login or Email_Password is non-empty.
# The Email_* and *_ip variables are presumably set by
# retrieve_email_settings / retrieve_ip_settings - confirm in the sourced file.
send_email() {
    local email_output sendemail_output
    local auth_display=""
    local -a auth_args=()

    retrieve_email_settings
    retrieve_ip_settings

# HERE documents don't like indentation
read -r -d '' email_output << EOF
Hello,

$0 script detected issues with ES sync.
Please check $log for more information.

Information about sever:
External IP: $external_ip
Main routing IP: $main_server_ip
Client tag: $client_tag
IP from GUI config: $gui_config_ip

---------------
This email was sent automatically by script $0
EOF

    if [[ -n "$Email_Login" || -n "$Email_Password" ]]; then
        auth_args=(-xu "$Email_Login" -xp "$Email_Password")
        auth_display=" -xu \"$Email_Login\" -xp \"$Email_Password\""
    fi

    # NOTE(review): with authentication enabled the line below prints the
    # SMTP password in clear text (unchanged from the original behaviour).
    echo "$(date_time) [NOTICE] Sending Email using command below:"
    echo -e "${BLUE}/usr/local/m2/sendEmail -f \"$Email_from\"${auth_display} -t $es_sync_email -u \"[$external_ip] ES sync issues\" -s \"$Email_Smtp_Server:$Email_port\"${NC} -m \"$email_output\" ${BLUE}-o tls='auto'${NC}"

    # $es_sync_email is left unquoted exactly as before
    sendemail_output=$(/usr/local/m2/sendEmail -f "$Email_from" "${auth_args[@]}" -t $es_sync_email -u "[$external_ip] ES sync issues" -s "$Email_Smtp_Server:$Email_port" -m "$email_output" -o tls='auto' 2>&1)

    if echo "$sendemail_output" | grep -Fq 'was sent successfully!'; then
        echo -e "${GREEN}Email was sent successfully!${NC}"
    else
        echo -e "${RED}Failed to send Email. sendEmail output below:${NC}"
        echo "$sendemail_output"
    fi
}

# Compares per-day call counts in ES and MySQL for the last
# $es_sync_check_period days (today itself is not checked).
# Globals: es_sync_check_period, temp_file, time_start, time_end, DB_* (read)
# Outputs: the number of mismatched days on stdout - and nothing else, because
#          the caller captures stdout; all diagnostics go to stderr.
#          Mismatched days are written to $temp_file as "date db_count es_count".
# Exits 1 (the calling subshell) when the start date or the MySQL count
# cannot be obtained.
check_es_sync() {
    local today date_start date_end date_current es_count mysql_count
    local bad_days="0"
    local last_restart=0

    today=$(date +'%Y-%m-%d')
    if ! date_start=$(date -I -d "$today - $es_sync_check_period days"); then
        echo -e "$(date_time) [WARNING] ${RED}Cannot calculate start date for es_sync_check_period=$es_sync_check_period. Exiting${NC}" >&2
        exit 1
    fi
    date_end="$today"

    rm -f "$temp_file"

    while [ "$date_start" != "$date_end" ]; do
        date_current="$date_start"
        # Advance first, so that `continue` below really skips this day
        date_start=$(date -I -d "$date_current + 1 day")

        if ! es_count=$(get_es_count "$date_current"); then
            echo -e "$(date_time) [WARNING] Failed to get es_count." >&2
            if last_restart=$(get_process_running_time "java") && ((last_restart < 60)); then
                echo -e "$(date_time) [WARNING] ES was restarted less than 60 seconds ago.." >&2
                echo -e "$(date_time) [NOTICE] Sleeping 60 seconds and trying one more time" >&2
                sleep 60
                if ! es_count=$(get_es_count "$date_current"); then
                    echo -e "$(date_time) [WARNING] Failed to get es_count. Skipping this iteration" >&2
                    continue
                fi
            fi
        fi

        # $P_OPT is intentionally unquoted (it may be empty or hold an option)
        if ! mysql_count=$(MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "select count(id) from calls where calldate between '$date_current $time_start' and '$date_current $time_end'"); then
            echo -e "$(date_time) [WARNING] ${RED}Error connecting to MySQL DB. Exiting${NC}" >&2
            echo "$mysql_count" >&2
            exit 1
        fi

        # An empty es_count evaluates to 0 here
        if (( es_count != mysql_count )); then
            printf "%-12s %-9s %-9s\n" "$date_current" "$mysql_count" "$es_count" >> "$temp_file"
            (( bad_days=bad_days+1 ))
        fi
    done

    echo $bad_days
}

# Reads badly synced days and resync them
# Returns:
#   0 - mismatch days was resynced correctly
#   1 - at least one day was not recynced correctly
# Days that stop being monitored (ES count decreasing) are counted in
# GLOBAL_SKIP_FLAG and do not cause a failure. Exits 1 when a full resync
# is started while this function is waiting.
resync_bad_days() {
    local day=""
    local sleep_time=""
    local es_count=""
    local last_es_count=0
    local last_es_count_initialized=0
    local mysql_count=""
    local interrupt_checked=0
    local skip_flag=0
    local max_sleep_time slept_time last_restart fresh_mysql_count

    if [[ ! -e "$temp_file" ]]; then
        return 0
    fi

    while read -r day mysql_count es_count; do
        echo -e "$(date_time) [NOTICE] Executing command: ${BLUE}elasticsearch resync interval \"$day $time_start\" \"$day $time_end\"${NC}"
        elasticsearch resync interval "$day $time_start" "$day $time_end" > /dev/null 2>&1

        # max 5 minutes for 100k calls, but one hour max
        # If at least one day fails to resync, do not try to resync other
        if (( mysql_count < 100000 )); then
            max_sleep_time=300
        else
            max_sleep_time=$(( mysql_count * 300 / 100000 ))
            if (( max_sleep_time > 3600 )); then
                max_sleep_time=3600
            fi
        fi

        sleep_time=20
        interrupt_checked=0
        last_es_count_initialized=0
        last_es_count=0

        for (( slept_time=0; slept_time <= max_sleep_time; slept_time=slept_time + sleep_time )); do
            # A newer /tmp/es_curl_resp means a full resync started after us
            if [[ -e /tmp/es_curl_resp ]]; then
                if [[ $(date -r /tmp/es_curl_resp "+%Y-%m-%d %H:%M:%S") > $script_start_time ]]; then
                    echo -e "$(date_time) [WARNING] Full resync has been initiated durinng ES sync check... Exiting"
                    exit 1
                fi
            fi

            sleep $sleep_time

            if last_restart=$(get_process_running_time "java") && ((last_restart < 60)); then
                echo "$(date_time) [WARNING] ES restart was less than 60 seconds ago. Sleeping for additional 60 seconds"
                sleep 60
            fi

            if ! es_count=$(get_es_count "$day"); then
                echo -e "$(date_time) [WARNING] Failed to get es_count. Skipping this iteration"
                continue
            fi

            if ((last_es_count_initialized == 0)); then
                last_es_count="$es_count"
                last_es_count_initialized=1
            fi

            # Counts match - this day is done
            if (( es_count == mysql_count )); then
                break
            fi

            # Sometimes elasticsearch resync interval gets interrupted, ES logs show such errors:
            # [ERROR][river.jdbc.RiverPipeline ] Future got interrupted
            # and es_count remains 0
            # So check for such case and repeat resync one more time
            # In system there issue was detected after resync'ing second time, error does not appear
            # Related commit from ES side which might also prevent this http://trac.kolmisoft.com/trac/changeset/73972
            if ((es_count == 0 && interrupt_checked == 0)); then
                echo -e "$(date_time) [NOTICE] ${BLUE}$((slept_time + sleep_time))${NC} seconds after resync - es_count: ${RED}$es_count${NC}, mysql_count: ${BLUE}$mysql_count${NC}"
                echo -e "$(date_time) [WARNING] ${BLUE}elasticsearch resync interval \"$day $time_start\" \"$day $time_end\"${NC} ${RED}got interrupted${NC}"
                echo "$(date_time) [NOTICE] Trying one more time"
                echo -e "$(date_time) [NOTICE] Executing command: ${BLUE}elasticsearch resync interval \"$day $time_start\" \"$day $time_end\"${NC}"
                interrupt_checked=1
                elasticsearch resync interval "$day $time_start" "$day $time_end" > /dev/null 2>&1
            else
                if ((es_count < last_es_count)); then
                    fresh_mysql_count=$(MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "select count(id) from calls where calldate between '$day 00:00:00' and '$day 23:59:59';")
                    if ((fresh_mysql_count != mysql_count)); then
                        echo -e "$(date_time) [WARNING] Calls count in DB is different than previously calculated - now: ${BLUE}$fresh_mysql_count${NC} , previously: ${BLUE}$mysql_count${NC}"
                        echo -e "$(date_time) [WARNING] Someone is channging DB table, will not monitor ${BLUE}$day${NC} further"
                    else
                        echo -e "$(date_time) [WARNING] ES count is decreasing while DB stays the same: es_count: $es_count, last_es_count: $last_es_count, will not monitor this ${BLUE}$day${NC} further"
                    fi
                    skip_flag=1
                    break
                fi
                echo -e "$(date_time) [NOTICE] ${BLUE}$((slept_time + sleep_time))${NC} seconds after resync - es_count: ${RED}$es_count${NC}, mysql_count: ${BLUE}$mysql_count${NC}"
            fi

            last_es_count=$es_count
            last_es_count_initialized=1
        done

        if (( skip_flag == 1 )); then
            GLOBAL_SKIP_FLAG=$((GLOBAL_SKIP_FLAG + 1))
            skip_flag=0
            continue
        fi

        if [[ -z "$es_count" ]]; then
            echo -e "$(date_time) [WARNING] Failed to get es_count (we waited for ${BLUE}$slept_time${NC} seconds)."
            return 1
        fi

        if (( es_count != mysql_count )); then
            echo -e "$(date_time) [WARNING] ${RED}$day${NC} is mismatched after resync (we waited for ${BLUE}$slept_time${NC} seconds). es_count: ${RED}$es_count${NC}, mysql_count: ${BLUE}$mysql_count${NC}"
            return 1
        fi

        echo -e "$(date_time) [NOTICE] ${GREEN}$day${NC} has been resynced successfully ${BLUE}es_count:${NC} ${GREEN}$es_count${NC}, ${BLUE}mysql_count${NC}: ${GREEN}$mysql_count${NC}"
    done < "$temp_file"

    return 0
}

# Sends the notification e-mail when es_sync_send_email is 1,
# otherwise only logs that sending is disabled.
sending_email_notice() {
    if (( es_sync_send_email == 1 )); then
        echo "$(date_time) [NOTICE] Sending email for manual check."
        send_email
    else
        echo "$(date_time) [WARNING] Sending email is disabled. Exiting."
    fi
}

# Checks the realtime sync: the _river search for m2_jdbc_river must report
# exactly 2 hits. Otherwise the river is deleted, both ES_SYNC_* conflines
# are reset to the current MAX(id) of calls, the sync is started again and
# the status is checked once more.
check_realtime_sync() {
    local total_hits max_call_id
    local es_sync_max_id_field=""
    local es_sync_last_call_id_field=""

    total_hits=$(/usr/bin/curl -s "localhost:9200/_river/_search?q=_type:m2_jdbc_river&pretty" | /usr/bin/jq '.hits.total')

    if [ "$total_hits" == "2" ]; then
        echo -e "$(date_time) [NOTICE] Realtime elastiscearch sync status ${GREEN}OK${NC}"
        return
    fi

    echo -e "$(date_time) [NOTICE] Realtime elastiscearch sync status ${RED}FAIL${NC}"
    echo "$(date_time) [NOTICE] Trying to fix realtime elasticsearch sync"

    # Delete old river (in case it is still there)
    /usr/bin/curl -XDELETE "localhost:9200/_river/m2_jdbc_river" &> /dev/null
    sleep 5

    # Set correct conflines name (an empty server_id counts as 0)
    if (( server_id > 0 )); then
        es_sync_max_id_field="ES_SYNC_MAX_CALL_ID_$server_id"
        es_sync_last_call_id_field="ES_SYNC_LAST_CALL_ID_$server_id"
    else
        es_sync_max_id_field="ES_SYNC_MAX_CALL_ID"
        es_sync_last_call_id_field="ES_SYNC_LAST_CALL_ID"
    fi

    # Set sync start point to max call id
    max_call_id=$(MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "SELECT IFNULL(MAX(id), 0) FROM calls")
    MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "DELETE FROM conflines WHERE name = '$es_sync_max_id_field'" &> /dev/null
    MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "DELETE FROM conflines WHERE name = '$es_sync_last_call_id_field'" &> /dev/null
    MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "INSERT INTO conflines(value, name) VALUES ($max_call_id, '$es_sync_max_id_field')" &> /dev/null
    MYSQL_PWD="$DB_PASSWORD" /usr/bin/mysql -h "$DB_HOST" -u "$DB_USERNAME" $P_OPT "$DB_NAME" -sNe "INSERT INTO conflines(value, name) VALUES ($max_call_id, '$es_sync_last_call_id_field')" &> /dev/null

    elasticsearch sync start &> /dev/null
    sleep 5

    # Check again if sync status is ok
    total_hits=$(/usr/bin/curl -s "localhost:9200/_river/_search?q=_type:m2_jdbc_river&pretty" | /usr/bin/jq '.hits.total')
    if [ "$total_hits" == "2" ]; then
        echo -e "$(date_time) [NOTICE] Realtime elastiscearch sync ${GREEN}FIXED${NC}"
    else
        echo -e "$(date_time) [ERROR] ${RED}Could not fix realtime elastiscearch sync!${NC}"
    fi
}

# Succeeds when netstat shows java on both port 9200 and port 9201
es_two_instances_running() {
    /bin/netstat -atnp | grep -F 9200 | grep -Fq java \
        && /bin/netstat -atnp | grep -F 9201 | grep -Fq java
}

############################# MAIN ######################

set_database_variables
print_settings

if ((es_sync_interfering_conflines_check == 1)); then
    check_interfering_settings
fi

# Check realtime ES sync
check_realtime_sync

echo -e "$(date_time) [NOTICE] Starting sync check for past ${GREEN}$es_sync_check_period${NC} days"

# Check for two instances running
if es_two_instances_running; then
    echo "$(date_time) [WARNING] Two or more instances of elasticsearch are running"
    echo "$(date_time) [NOTICE] I will try to fix this manually"
    /bin/bash /usr/src/m2/elasticsearch/elasticsearch_restart.sh
    sleep 5
    if es_two_instances_running; then
        echo "$(date_time) [ERROR] Still two or more instances of elasticsearch are running..."
        sending_email_notice
        exit 0
    fi
fi

# Wait until ES has been up for at least 60 seconds. The loop condition uses
# the same "< 60" threshold as the sleep below, so it never spins idle.
last_restart=""
while [[ -z $last_restart || $last_restart -lt 60 ]]; do
    if ! last_restart=$(get_process_running_time "java"); then
        echo -e "$(date_time) [WARNING] Cannot get ES start time. Sleeping for 60 seconds before proceeding..."
        sleep 60
    elif ((last_restart < 60)); then
        echo -e "$(date_time) [WARNING] ES was restarted ${BLUE}$last_restart${NC} seconds ago. Sleeping for 60 seconds before proceeding..."
        sleep 60
    fi
done

# Read both shard counters from one response so they describe the same state
shards_response=$(/usr/bin/curl -s "localhost:9200/${DB_NAME}/calls/_count")
successful_shards=$(/usr/bin/jq '._shards.successful' <<< "$shards_response")
total_shards=$(/usr/bin/jq '._shards.total' <<< "$shards_response")

if [[ -z $successful_shards || -z $total_shards ]]; then
    echo -e "$(date_time) [WARNING] Failed to get shard count. Something wrong with ES"
    sending_email_notice
    exit 0
fi

if (( (successful_shards != total_shards) || successful_shards == 0 )); then
    echo -e "$(date_time) [WARNING] Some shards are missing: successful_shards: ${RED}$successful_shards${NC}, total_shards: ${RED}$total_shards${NC}"
    echo "$(date_time) [NOTICE] Will not check days, as we need full resync to recover"
    sending_email_notice
    exit 0
fi

# check_es_sync runs in a subshell here, so its `exit 1` only ends that
# subshell; propagate the failure instead of treating it as "0 bad days".
if ! bad_days=$(check_es_sync); then
    exit 1
fi

if (( bad_days > 0 )); then
    echo -e "$(date_time) [WARNING] ${RED}$bad_days${NC} mismatched day(s) detected from last ${BLUE}$es_sync_check_period${NC} days:"
    printf "%-12s %-9s %-9s\n" "Date" "db_count" "es_count"
    cat "$temp_file"

    if ! resync_bad_days; then
        echo "$(date_time) [WARNING] There are still mismatch day(s) after resync"
        sending_email_notice
    else
        # GLOBAL_SKIP_FLAG is a counter, so report any non-zero value
        if ((GLOBAL_SKIP_FLAG > 0)); then
            echo -e "$(date_time) [NOTICE] ${BLUE}$GLOBAL_SKIP_FLAG${NC} mismatched day(s) has been skipped due decreasing call count"
        fi
        echo -e "$(date_time) [NOTICE] ${GREEN}$((bad_days-GLOBAL_SKIP_FLAG))${NC} mismatched day(s) have been successfully resynced"
    fi
else
    echo -e "$(date_time) [NOTICE] There are no mismatched days in last ${GREEN}$es_sync_check_period${NC} days"
fi