Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,41 @@ fi




dnl
dnl enable helgrind support
dnl
dnl Adds --with-helgrind[=DIR].  On Linux, "yes" uses the default header
dnl location /usr/include/valgrind; an explicit DIR is used as-is.  When
dnl enabled, HELGRIND is AC_DEFINEd (and also added to CFLAGS once the
dnl header is confirmed present) so sources can guard Helgrind-specific
dnl client requests and lock-order workarounds.
AC_MSG_CHECKING([whether to enable helgrind support])
helgrinddir=disabled
AC_ARG_WITH(helgrind,
[  --with-helgrind=DIR     Directory that holds the helgrind headers.
                          On Linux, 'yes' uses /usr/include/valgrind.],
[helgrinddir=$withval])
case "$helgrinddir" in
  disabled) ;;
  no) helgrinddir=disabled ;;
  yes)
    case "${PBS_MACH}" in
      linux)
        AC_DEFINE([HELGRIND], 1, [Define to enable helgrind support])
        CFLAGS="$CFLAGS -I/usr/include/valgrind"
        AC_CHECK_HEADERS([helgrind.h],
          [CFLAGS="$CFLAGS -DHELGRIND"],
          [AC_MSG_ERROR([Helgrind headers not found. Try installing valgrind-devel])],
          []);;
      dnl This branch is reached on non-Linux platforms, so the error must
      dnl name the real problem (it previously said "--with-job-create
      dnl takes a full path to a directory", a copy-paste mistake).
      *) AC_MSG_ERROR([--with-helgrind is only supported on Linux]);;
    esac ;;
  *)
    case "${PBS_MACH}" in
      linux)
        AC_DEFINE([HELGRIND], 1, [Define to enable helgrind support])
        CFLAGS="$CFLAGS -I$helgrinddir"
        AC_CHECK_HEADERS([helgrind.h],
          [CFLAGS="$CFLAGS -DHELGRIND"],
          [AC_MSG_ERROR([Helgrind headers not found. Is $helgrinddir correct?])],
          [])
        ;;
      *) AC_MSG_ERROR([--with-helgrind is only supported on Linux]);;
    esac ;;
esac
AC_MSG_RESULT([$helgrinddir])

dnl
dnl add an option to specify a different path where TORQUE should look for HWLOC
Expand Down
58 changes: 26 additions & 32 deletions src/lib/Libutils/u_lock_ctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,15 @@ int lock_startup()
{
int rc = PBSE_NONE;

if (locks == NULL)
if ((locks == NULL) && ((rc = lock_init()) != PBSE_NONE))
{
if ((rc = lock_init()) == PBSE_NONE)
{
if (pthread_mutex_lock(locks->startup) != 0)
{
log_err(-1,"mutex_lock","ALERT: cannot lock startup mutex!\n");
return(PBSE_MUTEX);
}
}
log_err(-1, "mutex_lock", "ALERT: cannot initialize mutexes!\n");
return(PBSE_MUTEX);
}
if (pthread_mutex_lock(locks->startup) != 0)
{
log_err(-1, "mutex_lock", "ALERT: cannot lock startup mutex!\n");
return(PBSE_MUTEX);
}

return(rc);
Expand All @@ -101,7 +100,7 @@ int unlock_startup()
{
if (pthread_mutex_unlock(locks->startup) != 0)
{
log_err(-1,"mutex_unlock","ALERT: cannot unlock startup mutex!\n");
log_err(-1, "mutex_unlock", "ALERT: cannot unlock startup mutex!\n");
return(PBSE_MUTEX);
}

Expand All @@ -116,18 +115,15 @@ int lock_conn_table()
{
int rc = PBSE_NONE;

if (locks == NULL)
if ((locks == NULL) && ((rc = lock_init()) != PBSE_NONE))
{
if ((rc = lock_init()) == PBSE_NONE)
{
lock_init();

if (pthread_mutex_lock(locks->conn_table) != 0)
{
log_err(-1,"mutex_lock","ALERT: cannot lock conn_table mutex!\n");
return(PBSE_MUTEX);
}
}
log_err(-1, "mutex_lock", "ALERT: cannot initialize mutexes!\n");
return(PBSE_MUTEX);
}
if (pthread_mutex_lock(locks->conn_table) != 0)
{
log_err(-1, "mutex_lock", "ALERT: cannot lock conn_table mutex!\n");
return(PBSE_MUTEX);
}

return(rc);
Expand Down Expand Up @@ -155,17 +151,15 @@ int lock_ss()
{
int rc = PBSE_NONE;

if (locks == NULL)
if ((locks == NULL) && ((rc = lock_init()) != PBSE_NONE))
{
if ((rc = lock_init()) == PBSE_NONE)
{

if (pthread_mutex_lock(locks->setup_save) != 0)
{
log_err(-1,"mutex_lock","ALERT: cannot lock setup_save mutex!\n");
return(PBSE_MUTEX);
}
}
log_err(-1, "mutex_lock", "ALERT: cannot initialize mutexes!\n");
return(PBSE_MUTEX);
}
if (pthread_mutex_lock(locks->setup_save) != 0)
{
log_err(-1, "mutex_lock", "ALERT: cannot lock setup_save mutex!\n");
return(PBSE_MUTEX);
}

return(rc);
Expand All @@ -178,7 +172,7 @@ int unlock_ss()
{
if (pthread_mutex_unlock(locks->setup_save) != 0)
{
log_err(-1,"mutex_unlock","ALERT: cannot unlock setup_save mutex!\n");
log_err(-1, "mutex_unlock", "ALERT: cannot unlock setup_save mutex!\n");
return(PBSE_MUTEX);
}

Expand Down
6 changes: 6 additions & 0 deletions src/server/array_func.c
Original file line number Diff line number Diff line change
Expand Up @@ -1846,7 +1846,13 @@ int remove_array(
int rc;
char arrayid[PBS_MAXSVRJOBID+1];

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Matt: I am concerned that when HELGRIND is defined, we don't lock at all. It seems like instead of turning off the line that does locks, we'd want to get the order right. Do you agree?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pthread_mutex_trylock() will return non-zero immediately if it cannot acquire the lock (ie, another thread is already holding it). This could occur if the thread was deadlocked due to lock order violations, or just because another thread was busy using it. Either way, if it can't acquire the lock immediately, it drops the lock it already has. It then acquires them in "correct" order.

The IFNDEF simulates pthread_mutex_trylock() returning non-zero (ie, pretend it couldn't acquire the lock and always enter the conditional) so it forces it to use correct order.

Using the pthread_mutex_trylock() is an "optimization" for the usual case. There's no need to drop a lock and reacquire it unless it would otherwise deadlock. I'm fine if we want the policy to be you ALWAYS have to lock in the correct order, but I think it's fine as-is.

if (pthread_mutex_trylock(allarrays.allarrays_mutex))
#endif
{
strcpy(arrayid, pa->ai_qs.parent_id);

Expand Down
18 changes: 18 additions & 0 deletions src/server/job_container.c
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,13 @@ int get_jobs_index(
{
int index;

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(aj->alljobs_mutex))
#endif
{
unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
pthread_mutex_lock(aj->alljobs_mutex);
Expand Down Expand Up @@ -703,7 +709,13 @@ int has_job(

strcpy(jobid, pjob->ji_qs.ji_jobid);

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(aj->alljobs_mutex))
#endif
{
unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
pthread_mutex_lock(aj->alljobs_mutex);
Expand Down Expand Up @@ -750,7 +762,13 @@ int remove_job(

if (LOGLEVEL >= 10)
LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, pjob->ji_qs.ji_jobid);
/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(aj->alljobs_mutex))
#endif
{
unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);
pthread_mutex_lock(aj->alljobs_mutex);
Expand Down
12 changes: 12 additions & 0 deletions src/server/node_func.c
Original file line number Diff line number Diff line change
Expand Up @@ -3214,7 +3214,13 @@ int remove_node(
{
int rc = PBSE_NONE;

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(an->allnodes_mutex))
#endif
{
unlock_node(pnode, __func__, NULL, LOGLEVEL);
pthread_mutex_lock(an->allnodes_mutex);
Expand All @@ -3241,7 +3247,13 @@ struct pbsnode *next_host(
struct pbsnode *pnode;
char *name = NULL;

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(an->allnodes_mutex))
#endif
{
if (held != NULL)
{
Expand Down
30 changes: 22 additions & 8 deletions src/server/pbsd_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@
#include "ji_mutex.h"
#include "job_route.h" /* queue_route */
#include "exiting_jobs.h"
#ifdef HELGRIND
#include <helgrind.h>
#endif

#define TASK_CHECK_INTERVAL 10
#define HELLO_WAIT_TIME 600
Expand Down Expand Up @@ -184,18 +187,18 @@ static void lock_out_ha();

/* external data items */

extern hello_container failures;
extern int svr_chngNodesfile;
extern int svr_totnodes;
extern struct all_jobs alljobs;
extern int run_change_logs;

extern hello_container failures;
extern int svr_chngNodesfile;
extern int svr_totnodes;
extern struct all_jobs alljobs;
extern int run_change_logs;
extern time_t pbs_tcp_timeout;
extern pthread_mutex_t *poll_job_task_mutex;
extern int max_poll_job_tasks;
extern int max_poll_job_tasks;

/* External Functions */

extern int recov_svr_attr (int);
extern int recov_svr_attr (int);
extern void change_logs_handler(int);
extern void change_logs();

Expand Down Expand Up @@ -1651,6 +1654,17 @@ int main(
extern char *msg_svrdown; /* log message */
extern char *msg_startup1; /* log message */

#ifdef HELGRIND
/* These global variables are written to from one or more threads, but
* read from many threads. This is technically a data race, but they
* should be benign. Adding mutexes around their access would negatively
* impact performance. Tell the Helgrind tool to ignore these.
*/
VALGRIND_HG_DISABLE_CHECKING(&LOGLEVEL, sizeof LOGLEVEL);
VALGRIND_HG_DISABLE_CHECKING(&pbs_tcp_timeout, sizeof pbs_tcp_timeout);
VALGRIND_HG_DISABLE_CHECKING(&last_task_check_time, sizeof last_task_check_time);
#endif

ProgName = argv[0];
srand(get_random_number());
tzset(); /* localtime_r needs this */
Expand Down
23 changes: 20 additions & 3 deletions src/server/queue_func.c
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,6 @@ pbs_queue *que_alloc(
initialize_all_jobs_array(pq->qu_jobs);
initialize_all_jobs_array(pq->qu_jobs_array_sum);
pthread_mutex_init(pq->qu_mutex,NULL);
lock_queue(pq, __func__, NULL, LOGLEVEL);

snprintf(pq->qu_qs.qu_name, sizeof(pq->qu_qs.qu_name), "%s", name);

Expand Down Expand Up @@ -320,7 +319,7 @@ void que_free(
remove_queue(&svr_queues, pq);
pq->q_being_recycled = TRUE;
insert_into_queue_recycler(pq);
unlock_queue(pq, "que_free", NULL, LOGLEVEL);
unlock_queue(pq, __func__, NULL, LOGLEVEL);

return;
} /* END que_free() */
Expand Down Expand Up @@ -468,7 +467,12 @@ void free_alljobs_array(




/*
* insert a queue to an all_queues hash
*
* expects a queue that is unlocked
 * returns with the queue locked
*/
int insert_queue(

all_queues *aq,
Expand All @@ -490,6 +494,7 @@ int insert_queue(
rc = PBSE_NONE;
}

lock_queue(pque, __func__, NULL, LOGLEVEL);
pthread_mutex_unlock(aq->allques_mutex);

return(rc);
Expand All @@ -509,7 +514,13 @@ int remove_queue(
int index;
char log_buf[1000];

/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(aq->allques_mutex))
#endif
{
unlock_queue(pque, __func__, NULL, LOGLEVEL);
pthread_mutex_lock(aq->allques_mutex);
Expand Down Expand Up @@ -673,7 +684,13 @@ pbs_queue *lock_queue_with_job_held(

if (pque != NULL)
{
/*
* Acquiring this lock would be a lock order violation, but
* deadlock cannot occur. Have Helgrind ignore this.
*/
#ifndef HELGRIND
if (pthread_mutex_trylock(pque->qu_mutex))
#endif
{
/* if fail */
strcpy(jobid, pjob->ji_qs.ji_jobid);
Expand Down
1 change: 1 addition & 0 deletions src/server/queue_recycler.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ int insert_into_queue_recycler(
enqueue_threadpool_request(remove_some_recycle_queues,NULL);
}

unlock_queue(pq, __func__, NULL, LOGLEVEL);
rc = insert_queue(&q_recycler.queues,pq);

update_queue_recycler_next_id();
Expand Down
5 changes: 2 additions & 3 deletions src/server/req_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ int req_selectjobs(

plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_select);

/* This will either leave pque NULL or return a locked queue */
rc = build_selist(plist, preq->rq_perm, &selistp, &pque, &bad);

if (rc != 0)
Expand Down Expand Up @@ -358,7 +359,7 @@ int req_selectjobs(
}

if (pque != NULL)
unlock_queue(pque, "req_selectjobs", NULL, LOGLEVEL);
unlock_queue(pque, __func__, NULL, LOGLEVEL);

return PBSE_NONE;
} /* END req_selectjobs() */
Expand Down Expand Up @@ -972,8 +973,6 @@ static int build_selist(

if (*pque == (pbs_queue *)0)
return (PBSE_UNKQUE);

unlock_queue(*pque, __func__, NULL, LOGLEVEL);
}
}
}
Expand Down
3 changes: 0 additions & 3 deletions src/server/svr_movejob.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,9 +867,6 @@ int send_job_work(
return(PBSE_SYSTEM);
}

pthread_mutex_lock(connection[con].ch_mutex);
pthread_mutex_unlock(connection[con].ch_mutex);

if (attempt_to_queue_job == TRUE)
{
if (change_substate_on_attempt_to_queue == TRUE)
Expand Down
Loading