diff --git a/configure.ac b/configure.ac
index c3ca9f1730..32fc08f818 100644
--- a/configure.ac
+++ b/configure.ac
@@ -991,7 +991,41 @@ fi
 
-
+dnl
+dnl enable helgrind support
+dnl
+AC_MSG_CHECKING([whether to enable helgrind support])
+helgrinddir=disabled
+AC_ARG_WITH(helgrind,
+  [  --with-helgrind=DIR     Directory that holds the helgrind headers.
+                            On Linux, 'yes' uses /usr/include/valgrind.],
+  [helgrinddir=$withval])
+case "$helgrinddir" in
+  disabled) ;;
+  no)  helgrinddir=disabled ;;
+  yes)
+    case "${PBS_MACH}" in
+      linux) AC_DEFINE([HELGRIND], 1, [Define to enable helgrind support])
+             CFLAGS="$CFLAGS -I/usr/include/valgrind"
+             AC_CHECK_HEADERS([helgrind.h],
+               [CFLAGS="$CFLAGS -DHELGRIND"],
+               [AC_MSG_ERROR([Helgrind headers not found. Try installing valgrind-devel])],
+               []);;
+      *) AC_MSG_ERROR([--with-helgrind=yes is only supported on Linux; pass a directory instead]);;
+    esac ;;
+  *)
+    case "${PBS_MACH}" in
+      linux) AC_DEFINE([HELGRIND], 1, [Define to enable helgrind support])
+             CFLAGS="$CFLAGS -I$helgrinddir"
+             AC_CHECK_HEADERS([helgrind.h],
+               [CFLAGS="$CFLAGS -DHELGRIND"],
+               [AC_MSG_ERROR([Helgrind headers not found. Is $helgrinddir correct?])],
+               [])
+             ;;
+      *) AC_MSG_ERROR([--with-helgrind takes a full path to a directory]);;
+    esac ;;
+esac
+AC_MSG_RESULT([$helgrinddir])
 
 dnl
 dnl add an option to specify a different path where TORQUE should look for HWLOC
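For reference, typical invocations of the new option: `./configure --with-helgrind=yes` picks up the headers from /usr/include/valgrind on Linux, while `./configure --with-helgrind=/opt/valgrind/include` (a hypothetical install prefix) points the check at a custom location. Either way, a successful header check adds -DHELGRIND to CFLAGS, which drives the #ifdef/#ifndef HELGRIND blocks in the C changes below.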
mutex!\n"); + return(PBSE_MUTEX); } return(rc); @@ -178,7 +172,7 @@ int unlock_ss() { if (pthread_mutex_unlock(locks->setup_save) != 0) { - log_err(-1,"mutex_unlock","ALERT: cannot unlock setup_save mutex!\n"); + log_err(-1, "mutex_unlock", "ALERT: cannot unlock setup_save mutex!\n"); return(PBSE_MUTEX); } diff --git a/src/server/array_func.c b/src/server/array_func.c index f67b64adfa..2497e382e2 100644 --- a/src/server/array_func.c +++ b/src/server/array_func.c @@ -1846,7 +1846,13 @@ int remove_array( int rc; char arrayid[PBS_MAXSVRJOBID+1]; +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. + */ +#ifndef HELGRIND if (pthread_mutex_trylock(allarrays.allarrays_mutex)) +#endif { strcpy(arrayid, pa->ai_qs.parent_id); diff --git a/src/server/job_container.c b/src/server/job_container.c index fca7ab2a6c..e501f4ae04 100644 --- a/src/server/job_container.c +++ b/src/server/job_container.c @@ -665,7 +665,13 @@ int get_jobs_index( { int index; +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. + */ +#ifndef HELGRIND if (pthread_mutex_trylock(aj->alljobs_mutex)) +#endif { unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); pthread_mutex_lock(aj->alljobs_mutex); @@ -703,7 +709,13 @@ int has_job( strcpy(jobid, pjob->ji_qs.ji_jobid); +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. + */ +#ifndef HELGRIND if (pthread_mutex_trylock(aj->alljobs_mutex)) +#endif { unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); pthread_mutex_lock(aj->alljobs_mutex); @@ -750,7 +762,13 @@ int remove_job( if (LOGLEVEL >= 10) LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid); +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. + */ +#ifndef HELGRIND if (pthread_mutex_trylock(aj->alljobs_mutex)) +#endif { unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); pthread_mutex_lock(aj->alljobs_mutex); diff --git a/src/server/node_func.c b/src/server/node_func.c index 03dd8d4aae..cd37b28676 100644 --- a/src/server/node_func.c +++ b/src/server/node_func.c @@ -3214,7 +3214,13 @@ int remove_node( { int rc = PBSE_NONE; +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. + */ +#ifndef HELGRIND if (pthread_mutex_trylock(an->allnodes_mutex)) +#endif { unlock_node(pnode, __func__, NULL, LOGLEVEL); pthread_mutex_lock(an->allnodes_mutex); @@ -3241,7 +3247,13 @@ struct pbsnode *next_host( struct pbsnode *pnode; char *name = NULL; +/* + * Acquiring this lock would be a lock order violation, but + * deadlock cannot occur. Have Helgrind ignore this. 
diff --git a/src/server/node_func.c b/src/server/node_func.c
index 03dd8d4aae..cd37b28676 100644
--- a/src/server/node_func.c
+++ b/src/server/node_func.c
@@ -3214,7 +3214,13 @@ int remove_node(
   {
   int rc = PBSE_NONE;
 
+/*
+ * Acquiring this lock would be a lock order violation, but deadlock
+ * cannot occur. Skip the trylock under Helgrind so it is not reported.
+ */
+#ifndef HELGRIND
   if (pthread_mutex_trylock(an->allnodes_mutex))
+#endif
     {
     unlock_node(pnode, __func__, NULL, LOGLEVEL);
     pthread_mutex_lock(an->allnodes_mutex);
@@ -3241,7 +3247,13 @@ struct pbsnode *next_host(
 
   struct pbsnode *pnode;
   char           *name = NULL;
 
+/*
+ * Acquiring this lock would be a lock order violation, but deadlock
+ * cannot occur. Skip the trylock under Helgrind so it is not reported.
+ */
+#ifndef HELGRIND
   if (pthread_mutex_trylock(an->allnodes_mutex))
+#endif
     {
     if (held != NULL)
       {
diff --git a/src/server/pbsd_main.c b/src/server/pbsd_main.c
index ada6eac82f..3c5fc9411c 100644
--- a/src/server/pbsd_main.c
+++ b/src/server/pbsd_main.c
@@ -140,6 +140,9 @@
 #include "ji_mutex.h"
 #include "job_route.h" /* queue_route */
 #include "exiting_jobs.h"
+#ifdef HELGRIND
+#include <helgrind.h>
+#endif
 
 #define TASK_CHECK_INTERVAL    10
 #define HELLO_WAIT_TIME        600
@@ -184,18 +187,18 @@ static void lock_out_ha();
 
 /* external data items */
 
-extern hello_container failures;
-extern int svr_chngNodesfile;
-extern int svr_totnodes;
-extern struct all_jobs alljobs;
-extern int run_change_logs;
-
+extern hello_container  failures;
+extern int              svr_chngNodesfile;
+extern int              svr_totnodes;
+extern struct all_jobs  alljobs;
+extern int              run_change_logs;
+extern time_t           pbs_tcp_timeout;
 extern pthread_mutex_t *poll_job_task_mutex;
-extern int max_poll_job_tasks;
+extern int              max_poll_job_tasks;
 
 /* External Functions */
 
-extern int recov_svr_attr (int);
+extern int  recov_svr_attr (int);
 extern void change_logs_handler(int);
 extern void change_logs();
@@ -1651,6 +1654,17 @@ int main(
   extern char *msg_svrdown;   /* log message */
   extern char *msg_startup1;  /* log message */
 
+#ifdef HELGRIND
+  /* These global variables are written from one or more threads but
+   * read from many. These are technically data races, but they
+   * should be benign, and adding mutexes around every access would
+   * hurt performance. Tell Helgrind to ignore these addresses.
+   */
+  VALGRIND_HG_DISABLE_CHECKING(&LOGLEVEL, sizeof LOGLEVEL);
+  VALGRIND_HG_DISABLE_CHECKING(&pbs_tcp_timeout, sizeof pbs_tcp_timeout);
+  VALGRIND_HG_DISABLE_CHECKING(&last_task_check_time, sizeof last_task_check_time);
+#endif
+
   ProgName = argv[0];
   srand(get_random_number());
   tzset(); /* localtime_r needs this */
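VALGRIND_HG_DISABLE_CHECKING is a standard Helgrind client request from helgrind.h: it excludes a memory range from race checking and compiles to a cheap no-op when the program is not running under Valgrind. A small sketch of the same technique on a hypothetical flag (it assumes the header is on the include path, as the configure check above arranges):

#ifdef HELGRIND
#include <helgrind.h>   /* found via -I/usr/include/valgrind */
#endif

/* A flag written by a controller thread and polled by workers. The
 * race is accepted by design; tell Helgrind not to report it. */
static volatile int keep_running = 1;

int main(void)
  {
#ifdef HELGRIND
  VALGRIND_HG_DISABLE_CHECKING(&keep_running, sizeof keep_running);
#endif
  /* ... start worker threads that poll keep_running ... */
  return 0;
  }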
diff --git a/src/server/queue_func.c b/src/server/queue_func.c
index c3b37f1f84..e4b33432f6 100644
--- a/src/server/queue_func.c
+++ b/src/server/queue_func.c
@@ -247,7 +247,6 @@ pbs_queue *que_alloc(
 
   initialize_all_jobs_array(pq->qu_jobs);
   initialize_all_jobs_array(pq->qu_jobs_array_sum);
   pthread_mutex_init(pq->qu_mutex,NULL);
-  lock_queue(pq, __func__, NULL, LOGLEVEL);
 
   snprintf(pq->qu_qs.qu_name, sizeof(pq->qu_qs.qu_name), "%s", name);
 
@@ -320,7 +319,7 @@ void que_free(
   remove_queue(&svr_queues, pq);
   pq->q_being_recycled = TRUE;
   insert_into_queue_recycler(pq);
-  unlock_queue(pq, "que_free", NULL, LOGLEVEL);
+  unlock_queue(pq, __func__, NULL, LOGLEVEL);
 
   return;
   }  /* END que_free() */
@@ -468,7 +467,12 @@ void free_alljobs_array(
 
 
-
+/*
+ * insert a queue into an all_queues hash
+ *
+ * expects the queue to be unlocked
+ * returns the queue locked
+ */
 int insert_queue(
 
   all_queues *aq,
@@ -490,6 +494,7 @@ int insert_queue(
     rc = PBSE_NONE;
     }
 
+  lock_queue(pque, __func__, NULL, LOGLEVEL);
   pthread_mutex_unlock(aq->allques_mutex);
 
   return(rc);
@@ -509,7 +514,13 @@ int remove_queue(
   int  index;
   char log_buf[1000];
 
+/*
+ * Acquiring this lock would be a lock order violation, but deadlock
+ * cannot occur. Skip the trylock under Helgrind so it is not reported.
+ */
+#ifndef HELGRIND
   if (pthread_mutex_trylock(aq->allques_mutex))
+#endif
     {
     unlock_queue(pque, __func__, NULL, LOGLEVEL);
     pthread_mutex_lock(aq->allques_mutex);
@@ -673,7 +684,13 @@ pbs_queue *lock_queue_with_job_held(
 
   if (pque != NULL)
     {
+/*
+ * Acquiring this lock would be a lock order violation, but deadlock
+ * cannot occur. Skip the trylock under Helgrind so it is not reported.
+ */
+#ifndef HELGRIND
     if (pthread_mutex_trylock(pque->qu_mutex))
+#endif
       {
       /* if fail */
       strcpy(jobid, pjob->ji_qs.ji_jobid);
diff --git a/src/server/queue_recycler.c b/src/server/queue_recycler.c
index 650dd09458..873866299d 100644
--- a/src/server/queue_recycler.c
+++ b/src/server/queue_recycler.c
@@ -100,6 +100,7 @@ int insert_into_queue_recycler(
     enqueue_threadpool_request(remove_some_recycle_queues,NULL);
     }
 
+  unlock_queue(pq, __func__, NULL, LOGLEVEL);
   rc = insert_queue(&q_recycler.queues,pq);
 
   update_queue_recycler_next_id();
diff --git a/src/server/req_select.c b/src/server/req_select.c
index 38cbc7ab28..2f15e9c2d7 100644
--- a/src/server/req_select.c
+++ b/src/server/req_select.c
@@ -304,6 +304,7 @@ int req_selectjobs(
 
   plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_select);
 
+  /* This will either leave pque NULL or return a locked queue */
   rc = build_selist(plist, preq->rq_perm, &selistp, &pque, &bad);
 
   if (rc != 0)
@@ -358,7 +359,7 @@ int req_selectjobs(
     }
 
   if (pque != NULL)
-    unlock_queue(pque, "req_selectjobs", NULL, LOGLEVEL);
+    unlock_queue(pque, __func__, NULL, LOGLEVEL);
 
   return PBSE_NONE;
   }  /* END req_selectjobs() */
@@ -972,8 +973,6 @@ static int build_selist(
 
       if (*pque == (pbs_queue *)0)
         return (PBSE_UNKQUE);
-
-      unlock_queue(*pque, __func__, NULL, LOGLEVEL);
       }
     }
   }
diff --git a/src/server/svr_movejob.c b/src/server/svr_movejob.c
index 19b95685d2..d023d6f6df 100644
--- a/src/server/svr_movejob.c
+++ b/src/server/svr_movejob.c
@@ -867,9 +867,6 @@ int send_job_work(
     return(PBSE_SYSTEM);
     }
 
-  pthread_mutex_lock(connection[con].ch_mutex);
-  pthread_mutex_unlock(connection[con].ch_mutex);
-
   if (attempt_to_queue_job == TRUE)
     {
     if (change_substate_on_attempt_to_queue == TRUE)
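insert_queue (and the insert_task variants below) now follow a hand-off convention: the element goes in unlocked, its mutex is taken while the container mutex is still held, and the caller receives it locked. Taking the element's lock before releasing the container's closes the window in which another thread could find the freshly inserted element and lock it first. A self-contained sketch with hypothetical types (mutexes assumed already initialized):

#include <pthread.h>

typedef struct elem
  {
  pthread_mutex_t  mutex;
  struct elem     *next;
  } elem;

typedef struct container
  {
  pthread_mutex_t  mutex;
  elem            *head;
  } container;

/* Insert e (unlocked on entry); return with e->mutex held. */
void insert_elem(container *c, elem *e)
  {
  pthread_mutex_lock(&c->mutex);
  e->next = c->head;             /* e becomes reachable by other threads */
  c->head = e;
  pthread_mutex_lock(&e->mutex); /* ...but the caller already owns it */
  pthread_mutex_unlock(&c->mutex);
  }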
diff --git a/src/server/svr_task.c b/src/server/svr_task.c
index feda07b8b0..82afb3cd4e 100644
--- a/src/server/svr_task.c
+++ b/src/server/svr_task.c
@@ -115,6 +115,12 @@ extern task_recycler tr;
 
 
 
+/*
+ * for a time-ordered work-task list, find the existing work
+ * task that comes chronologically AFTER the new work task
+ *
+ * The new work task should NOT be locked when calling this
+ */
 struct work_task *find_insertion_point(
 
   all_tasks *at,
@@ -179,7 +185,6 @@ struct work_task *set_task(
     }
 
   pthread_mutex_init(pnew->wt_mutex,NULL);
-  pthread_mutex_lock(pnew->wt_mutex);
 
   if (type == WORK_Timed)
     {
@@ -361,6 +366,8 @@ work_task *next_task(
 
 /*
  * adds a task to the specified array
+ * expects the work task to be unlocked upon entry
+ * returns it locked
  */
 int insert_task(
 
@@ -379,6 +386,7 @@ int insert_task(
     log_err(rc, __func__, "Cannot allocate space to resize the array");
     }
 
+  pthread_mutex_lock(wt->wt_mutex);
   wt->wt_tasklist = at;
   pthread_mutex_unlock(at->alltasks_mutex);
 
@@ -389,10 +397,11 @@ int insert_task(
 
 
 /*
- * NOTE: we currently don't unlock before in the trylock
- * because this is only called in set_task, and before is
- * always a new task not in the structure yet
  * adds a task to the array after another
+ *
+ * expects the 'after' work task to be locked
+ * expects the 'before' work task to be unlocked
+ * returns the 'before' task locked
  */
 int insert_task_before(
 
@@ -405,7 +414,15 @@ int insert_task_before(
   int rc;
   int i;
 
+/*
+ * NOTE: the trylock below introduces a lock order violation.
+ * Since we are inserting a new task, we know nobody else can
+ * be holding it, so deadlock cannot occur. Helgrind cannot see
+ * this, so Helgrind builds always use the "correct" ordering.
+ */
+#ifndef HELGRIND
   if (pthread_mutex_trylock(at->alltasks_mutex))
+#endif
     {
     pthread_mutex_unlock(after->wt_mutex);
     pthread_mutex_lock(at->alltasks_mutex);
@@ -436,6 +453,7 @@ int insert_task_before(
     rc = PBSE_NONE;
     }
 
+  pthread_mutex_lock(before->wt_mutex);
   pthread_mutex_unlock(at->alltasks_mutex);
 
   before->wt_tasklist = at;
@@ -447,6 +465,11 @@ int insert_task_before(
 
 
 
+/*
+ * inserts a task at the beginning of the array
+ * expects the work task to be unlocked
+ * returns it locked
+ */
 int insert_task_first(
 
   all_tasks *at,
@@ -463,6 +486,7 @@ int insert_task_first(
     log_err(rc, __func__, "Cannot allocate space to resize the array");
     }
 
+  pthread_mutex_lock(wt->wt_mutex);
   wt->wt_tasklist = at;
   pthread_mutex_unlock(at->alltasks_mutex);
 
@@ -511,7 +535,15 @@ int remove_task(
   {
   int rc = PBSE_NONE;
 
+/*
+ * NOTE: the trylock below is a lock order violation. If the lock
+ * cannot be taken out of order, the code falls back to "correct"
+ * ordering; a trylock never blocks, so it cannot deadlock. Helgrind
+ * cannot see this, so Helgrind builds always use "correct" ordering.
+ */
+#ifndef HELGRIND
   if (pthread_mutex_trylock(at->alltasks_mutex))
+#endif
     {
     pthread_mutex_unlock(wt->wt_mutex);
     pthread_mutex_lock(at->alltasks_mutex);
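A note for reviewers auditing the trylock sites as a whole: a deadlock between two mutexes requires both threads to block while each holds the lock the other wants. pthread_mutex_trylock never blocks, so the out-of-order attempt can fail but can never deadlock; on failure, every site drops the inner lock and reacquires both in the canonical container-then-element order. Helgrind's lock-order checker records the attempted acquisition order regardless of blocking behavior, which is why HELGRIND builds compile the trylock out and take the canonical path unconditionally.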