[icinga-checkins] icinga.org: icinga-core/mfriedrich/core: core: try to avoid a single check being reaped multiple times (Andreas Ericsson) #1652

git at icinga.org git at icinga.org
Thu Jun 16 18:07:20 CEST 2011


Module: icinga-core
Branch: mfriedrich/core
Commit: b944875e7de3ef12974a161a038408d6453fe2cd
URL:    https://git.icinga.org/?p=icinga-core.git;a=commit;h=b944875e7de3ef12974a161a038408d6453fe2cd

Author: Michael Friedrich <michael.friedrich at univie.ac.at>
Date:   Thu Jun 16 18:05:23 2011 +0200

core: try to avoid a single check being reaped multiple times (Andreas Ericsson) #1652

previously, a check completing while hitting the timeout alarm
could have lead into writing to a closed file or doing multiple
close attempts. this patch tries to remove the close window where
it was possible, preventing coredumps on high reaper frequencies too.

refs #1652

---

 Changelog     |    1 +
 base/checks.c |   48 ++++++++++++++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/Changelog b/Changelog
index 2e30828..4c9c5f7 100644
--- a/Changelog
+++ b/Changelog
@@ -22,6 +22,7 @@ FIXES
 * core: fix problem where acknowledgements were getting reset when a hard state change occurred (Ton Voon) #1618
 * core: avoid using global macros when sending notifications (Andreas Ericsson) #1653
 * core: avoid sending notifications to the wrong contact (Andreas Ericsson) #1654
+* core: try to avoid a single check being reaped multiple times (Andreas Ericsson) #1652
 
 * classic ui: fix cross site scripting vulnerability in config.cgi on config expander arguments #1605
 * classic ui: remove sidebar.html inclusion in index.html causing troubles on reload #1632
diff --git a/base/checks.c b/base/checks.c
index c37c049..a1cddd8 100644
--- a/base/checks.c
+++ b/base/checks.c
@@ -491,6 +491,7 @@ int run_async_service_check(service *svc, int check_options, double latency, int
 	double old_latency=0.0;
 	dbuf checkresult_dbuf;
 	int dbuf_chunk=1024;
+	FILE *fp;
 #ifdef USE_EVENT_BROKER
 	int neb_result=OK;
 #endif
@@ -838,8 +839,8 @@ int run_async_service_check(service *svc, int check_options, double latency, int
 			/* become the process group leader */
 			setpgid(0,0);
 
-			/* catch term signals at this process level */
-			signal(SIGTERM,service_check_sighandler);
+			/* exit on term signals at this process level */
+			signal(SIGTERM, SIG_DFL);
 
 			/* catch plugins that don't finish in a timely manner */
 			signal(SIGALRM,service_check_sighandler);
@@ -929,7 +930,8 @@ int run_async_service_check(service *svc, int check_options, double latency, int
 			/* run the plugin check command */
 			pclose_result=run_check(processed_command,&checkresult_dbuf);
 
-			/* reset the alarm */
+			/* reset the alarm and ignore SIGALRM */
+			signal(SIGALRM, SIG_IGN);
 			alarm(0);
 
 			/* get the check finish time */
@@ -955,14 +957,18 @@ int run_async_service_check(service *svc, int check_options, double latency, int
 			/* write check result to file */
 			if(check_result_info.output_file_fp){
 
-				fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
-				fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
-				fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
-				fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
-				fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
+				/* avoid races with signal handling */
+				fp = check_result_info.output_file_fp;
+				check_result_info.output_file_fp = NULL;
+
+				fprintf(fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
+				fprintf(fp,"early_timeout=%d\n",check_result_info.early_timeout);
+				fprintf(fp,"exited_ok=%d\n",check_result_info.exited_ok);
+				fprintf(fp,"return_code=%d\n",check_result_info.return_code);
+				fprintf(fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
 
 				/* close the temp file */
-				fclose(check_result_info.output_file_fp);
+				fclose(fp);
 
 				/* move check result to queue directory */
 				move_check_result_to_queue(check_result_info.output_file);
@@ -3054,6 +3060,7 @@ int run_async_host_check_3x(host *hst, int check_options, double latency, int sc
 	double old_latency=0.0;
 	dbuf checkresult_dbuf;
 	int dbuf_chunk=1024;
+	FILE *fp;
 #ifdef USE_EVENT_BROKER
 	int neb_result=OK;
 #endif
@@ -3264,8 +3271,8 @@ int run_async_host_check_3x(host *hst, int check_options, double latency, int sc
 			/* become the process group leader */
 			setpgid(0,0);
 
-			/* catch term signals at this process level */
-			signal(SIGTERM,host_check_sighandler);
+			/* exit on term signals at this process level */
+			signal(SIGTERM, SIG_DFL);
 
 			/* catch plugins that don't finish in a timely manner */
 			signal(SIGALRM,host_check_sighandler);
@@ -3277,7 +3284,8 @@ int run_async_host_check_3x(host *hst, int check_options, double latency, int sc
 			/* run the plugin check command */
 			pclose_result=run_check(processed_command,&checkresult_dbuf);
 
-			/* reset the alarm */
+			/* reset the alarm and signal handling here */
+			signal(SIGALRM, SIG_IGN);
 			alarm(0);
 
 			/* get the check finish time */
@@ -3303,14 +3311,18 @@ int run_async_host_check_3x(host *hst, int check_options, double latency, int sc
 			/* write check result to file */
 			if(check_result_info.output_file_fp){
 
-				fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
-				fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
-				fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
-				fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
-				fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
+				/* protect against signal races */
+				fp = check_result_info.output_file_fp;
+				check_result_info.output_file_fp = NULL;
+
+				fprintf(fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
+				fprintf(fp,"early_timeout=%d\n",check_result_info.early_timeout);
+				fprintf(fp,"exited_ok=%d\n",check_result_info.exited_ok);
+				fprintf(fp,"return_code=%d\n",check_result_info.return_code);
+				fprintf(fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
 
 				/* close the temp file */
-				fclose(check_result_info.output_file_fp);
+				fclose(fp);
 
 				/* move check result to queue directory */
 				move_check_result_to_queue(check_result_info.output_file);





More information about the icinga-checkins mailing list