[icinga-checkins] icinga.org: icinga-core/r1.3: core: fix extraneous alerts for services when host is down (Ton Voon) #1097

git at icinga.org git at icinga.org
Tue Jan 11 17:31:29 CET 2011


Module: icinga-core
Branch: r1.3
Commit: f3acf86abfc933e7243cd0dfb0a97881e375f941
URL:    https://git.icinga.org/?p=icinga-core.git;a=commit;h=f3acf86abfc933e7243cd0dfb0a97881e375f941

Author: Michael Friedrich <michael.friedrich at univie.ac.at>
Date:   Tue Jan 11 17:30:41 2011 +0100

core: fix extraneous alerts for services when host is down (Ton Voon) #1097

fixes #1097

---

 Changelog           |    1 +
 base/checks.c       |    7 +++++-
 t-tap/Makefile.in   |    3 +-
 t-tap/test_checks.c |   61 +++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/Changelog b/Changelog
index d7c5a4c..2b418a9 100644
--- a/Changelog
+++ b/Changelog
@@ -53,6 +53,7 @@ FIXES
 * core: log error reason when failing to open the status file (Andreas Ericsson) #1078
 * core: fix allocate memory once for *GROUPMEMBERS macros (Stephane Lapie) #1076
 * core: protect against poorly behaving thread-libraries (Andreas Ericsson) #1080
+* core: fix extraneous alerts for services when host is down (Ton Voon) #1097
 
 * classic ui: change servicestatus letter color to default black, not grey #946
 * classic ui: fix waste of cpu in status summary (TomTom) #933
diff --git a/base/checks.c b/base/checks.c
index 29a85e9..bff01d8 100644
--- a/base/checks.c
+++ b/base/checks.c
@@ -1548,6 +1548,8 @@ int handle_async_service_check_result(service *temp_service, check_result *queue
 		/* 05/29/2007 NOTE: The host might be in a SOFT problem state due to host check retries/caching.  Not sure if we should take that into account and do something different or not... */
 		if(route_result!=HOST_UP){
 
+			log_debug_info(DEBUGL_CHECKS,2,"Host is not UP, so we mark state changes if appropriate\n");
+
 			/* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... */
 			if(temp_service->last_hard_state!=temp_service->current_state)
 				hard_state_change=TRUE;
@@ -1555,8 +1557,11 @@ int handle_async_service_check_result(service *temp_service, check_result *queue
 			/* update last state change times */
 			if(state_change==TRUE || hard_state_change==TRUE)
 				temp_service->last_state_change=temp_service->last_check;
-			if(hard_state_change==TRUE)
+			if(hard_state_change==TRUE) {
 				temp_service->last_hard_state_change=temp_service->last_check;
+				temp_service->state_type=HARD_STATE;
+				temp_service->last_hard_state=temp_service->current_state;
+			}
 
 			/* put service into a hard state without attempting check retries and don't send out notifications about it */
 			temp_service->host_problem_at_last_check=TRUE;
diff --git a/t-tap/Makefile.in b/t-tap/Makefile.in
index c842022..0e09f7b 100644
--- a/t-tap/Makefile.in
+++ b/t-tap/Makefile.in
@@ -13,7 +13,8 @@ CFLAGS=@CFLAGS@ @DEFS@ -DNSCORE -I../include -I../tap/src
 TAPOBJ=../tap/src/tap.o
 
 TESTS = test_logging test_events test_timeperiods test_icinga_config
-TESTS += test_xsddefault test_checks
+TESTS += test_xsddefault
+TESTS += test_checks
 
 XSD_OBJS = $(SRC_CGI)/statusdata-cgi.o $(SRC_CGI)/xstatusdata-cgi.o
 XSD_OBJS += $(SRC_CGI)/objects-cgi.o $(SRC_CGI)/xobjects-cgi.o
diff --git a/t-tap/test_checks.c b/t-tap/test_checks.c
index a666e50..b0773df 100644
--- a/t-tap/test_checks.c
+++ b/t-tap/test_checks.c
@@ -34,6 +34,27 @@ service *svc1=NULL, *svc2=NULL;
 host *host1=NULL;
 int found_log_rechecking_host_when_service_wobbles=0;
 int found_log_run_async_host_check_3x=0;
+check_result *tmp_check_result;
+/* Allocate tmp_check_result and fill it with a successful, scheduled active service check result */
+void setup_check_result() {
+	struct timeval start_time,finish_time;
+	start_time.tv_sec=1234567890L;
+	start_time.tv_usec=0L;
+	finish_time.tv_sec=1234567891L;
+	finish_time.tv_usec=0L;
+	/* size the allocation by the struct, not the pointer; calloc zeroes every field not assigned below */
+	tmp_check_result=(check_result *)calloc(1, sizeof(*tmp_check_result));
+	tmp_check_result->check_type=SERVICE_CHECK_ACTIVE;
+	tmp_check_result->check_options=0;
+	tmp_check_result->scheduled_check=TRUE;
+	tmp_check_result->reschedule_check=TRUE;
+	tmp_check_result->exited_ok=TRUE;
+	tmp_check_result->return_code=0;
+	tmp_check_result->output=strdup("Fake result");
+	tmp_check_result->latency=0.6969;
+	tmp_check_result->start_time=start_time;
+	tmp_check_result->finish_time=finish_time;
+}
 
 int c=0;
 int update_program_status(int aggregated_dump){
@@ -68,9 +89,16 @@ setup_objects(time_t time) {
 	timed_event *new_event=NULL;
 
 	host1=(host *)calloc(1, sizeof(host));
+	host1->name=strdup("Host1");
+	host1->address=strdup("127.0.0.1");
+	host1->retry_interval=1;
+	host1->check_interval=5;
+	host1->check_options=0;
+	host1->state_type=SOFT_STATE;
 	host1->current_state=HOST_DOWN;
 	host1->has_been_checked=TRUE;
 	host1->last_check=time;
+	host1->next_check=time;
 
 	/* First service is a normal one */
 	svc1=(service *)calloc(1, sizeof(service));
@@ -83,8 +111,14 @@ setup_objects(time_t time) {
 	svc1->current_state=STATE_CRITICAL;
 	svc1->retry_interval=1;
 	svc1->check_interval=5;
+	svc1->current_attempt=1;
+	svc1->max_attempts=4;
 	svc1->last_state_change=0;
 	svc1->last_state_change=0;
+	svc1->last_check=(time_t)1234560000;
+	svc1->host_problem_at_last_check=FALSE;
+	svc1->plugin_output=strdup("Initial state");
+	svc1->last_hard_state_change=(time_t)1111111111;
 
 	/* Second service .... to be configured! */
 	svc2=(service *)calloc(1, sizeof(service));
@@ -101,9 +135,9 @@ setup_objects(time_t time) {
 int
 main (int argc, char **argv){
 	time_t now=0L;
-	check_result *tmp_check_result;
+	int log_service_event_flag;
 
-	plan_tests(4);
+	plan_tests(11);
 
 	time(&now);
 
@@ -144,6 +178,29 @@ main (int argc, char **argv){
 	ok( svc1->no_more_notifications==FALSE, "no_more_notifications reset due to state change" );
 	ok( svc1->current_notification_number==999, "notification number NOT reset" );
 
+	/* Test case:
+		service that transitions from OK to CRITICAL (where its host is set to DOWN) will get set to a hard state
+		even though check attempts = 1 of 4
+	*/
+	setup_objects((time_t) 1234567800L);
+	host1->current_state=HOST_DOWN;
+	svc1->current_state=STATE_OK;
+	svc1->state_type=HARD_STATE;
+	setup_check_result();
+	tmp_check_result->return_code=STATE_CRITICAL;
+	tmp_check_result->output=strdup("CRITICAL failure");
+	log_service_event_flag=0;
+
+	handle_async_service_check_result(svc1, tmp_check_result);
+
+	ok( log_service_event_flag==1, "log_service_event() was called");
+	ok( svc1->last_hard_state_change == (time_t)1234567890, "Got last_hard_state_change time=%lu", svc1->last_hard_state_change);
+	ok( svc1->last_state_change == svc1->last_hard_state_change, "Got same last_state_change" );
+	ok( svc1->last_hard_state == 2, "Should save the last hard state as critical for next time");
+	ok( svc1->host_problem_at_last_check==TRUE, "Got host_problem_at_last_check set to TRUE due to host failure - this needs to be saved otherwise extra alerts raised in subsequent runs");
+	ok( svc1->state_type == HARD_STATE, "This should be a HARD state since the host is in a failure state");
+	ok( svc1->current_attempt==1, "Previous status was OK, so this failure should show current_attempt=1") || diag("Current attempt=%d", svc1->current_attempt);
+
 	return exit_status ();
 }
 





More information about the icinga-checkins mailing list