From de454ab87d1467cf3fc182afb2c82a6f2a9cffc2 Mon Sep 17 00:00:00 2001 From: Elizabeth Chan Date: Tue, 22 Jan 2013 15:52:09 -0700 Subject: [PATCH 1/8] Merge from trunk Fix for TRQ-1447: Deadlock in server under load. --- src/server/exiting_jobs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server/exiting_jobs.c b/src/server/exiting_jobs.c index cae600140d..029a988090 100644 --- a/src/server/exiting_jobs.c +++ b/src/server/exiting_jobs.c @@ -214,6 +214,7 @@ int check_exiting_jobs() } else { + pjob_mutex.unlock(); retry_job_exit(jeri); } } From fb0c5694865155a337dcc41efd48789badcf2e94 Mon Sep 17 00:00:00 2001 From: David Beer Date: Tue, 22 Jan 2013 16:47:37 -0700 Subject: [PATCH 2/8] Fix a deadlock introduced by the mutex managing code in job_route() --- src/server/job_route.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/server/job_route.c b/src/server/job_route.c index ade1f31a71..830938663d 100644 --- a/src/server/job_route.c +++ b/src/server/job_route.c @@ -337,19 +337,29 @@ int job_route( time_t time_now = time(NULL); char log_buf[LOCAL_LOG_BUF_SIZE]; - struct pbs_queue *qp = jobp->ji_qhdr; + struct pbs_queue *qp; long retry_time; - - if (qp == NULL) - return(PBSE_QUENOEN); if (LOGLEVEL >= 7) { sprintf(log_buf, "%s", jobp->ji_qs.ji_jobid); - LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); + log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); } + + qp = get_jobs_queue(&jobp); + + if (jobp == NULL) + { + return(PBSE_JOB_RECYCLED); + } + + if (qp == NULL) + { + return(PBSE_BADSTATE); + } + + mutex_mgr qp_mutex = mutex_mgr(qp->qu_mutex, true); - mutex_mgr qp_mutex = mutex_mgr(qp->qu_mutex); /* see if the job is able to be routed */ switch (jobp->ji_qs.ji_state) { From 1c5fb2a80fd4075c1ebfb722dc6f80b1fe692e93 Mon Sep 17 00:00:00 2001 From: David Beer Date: Wed, 23 Jan 2013 10:31:34 -0700 Subject: [PATCH 3/8] Fix a crash in get_jobs_array(), if the job has disappeared, then we need 
to make sure we have a valid array before attempting to unlock it. --- CHANGELOG | 2 ++ src/server/job_func.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f282308b47..65a21c14c3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,8 @@ c - crash b - bug fix e - enhancement f - new feature n - note 4.1.5 b - For cray: make sure that reservations are released when jobs are requeued. TRQ-1572. + c - If the job is no longer valid after attempting to lock the array in get_jobs_array(), + make sure the array is valid before attempting to unlock it. TRQ-1598. 4.1.4 e - When in cray mode, write physmem and availmem in addition to totmem so that diff --git a/src/server/job_func.c b/src/server/job_func.c index f02a5ebf05..b37e95eaec 100644 --- a/src/server/job_func.c +++ b/src/server/job_func.c @@ -2056,8 +2056,12 @@ job_array *get_jobs_array( pjob = svr_find_job(jobid, TRUE); if (pjob == NULL) { - unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); - pa = NULL; + if (pa != NULL) + { + unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); + pa = NULL; + } + *pjob_ptr = NULL; } mutex_mgr job_mutex2(pjob->ji_mutex,true); From 6fcbd5611ee8a15a76af2a1e0f22059a5763659b Mon Sep 17 00:00:00 2001 From: David Beer Date: Wed, 23 Jan 2013 15:44:34 -0700 Subject: [PATCH 4/8] Remove an unnecessary mutex manager. --- src/server/job_func.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/server/job_func.c b/src/server/job_func.c index b37e95eaec..2de440b56c 100644 --- a/src/server/job_func.c +++ b/src/server/job_func.c @@ -2064,8 +2064,7 @@ job_array *get_jobs_array( *pjob_ptr = NULL; } - mutex_mgr job_mutex2(pjob->ji_mutex,true); - job_mutex2.set_lock_on_exit(false); + } return(pa); From 721190ec641af5a4617d304517b2a9cd9355b948 Mon Sep 17 00:00:00 2001 From: David Beer Date: Wed, 23 Jan 2013 15:51:14 -0700 Subject: [PATCH 5/8] Wrap a log_event in {} as was intended originally but not done correctly. 
--- src/server/queue_func.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/server/queue_func.c b/src/server/queue_func.c index 4f0fa4edc4..03a215a623 100644 --- a/src/server/queue_func.c +++ b/src/server/queue_func.c @@ -695,8 +695,10 @@ pbs_queue *lock_queue_with_job_held( else { if (LOGLEVEL >= 10) + { snprintf(log_buf, sizeof(log_buf), "try lock succeeded for queue %s on job %s", pque->qu_qs.qu_name, pjob->ji_qs.ji_jobid); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf); + } } } From 92af29bcf24cd62c837826548eac7edccaa9089f Mon Sep 17 00:00:00 2001 From: Ken Date: Wed, 23 Jan 2013 15:59:39 -0700 Subject: [PATCH 6/8] changed mutex locking function calls to use LOGLEVEL and not 0 --- src/server/display_alps_status.c | 4 ++- src/server/job_container.c | 8 ++--- src/server/job_func.c | 8 ++--- src/server/login_nodes.c | 9 +++--- src/server/node_func.c | 24 +++++++-------- src/server/node_manager.c | 38 +++++++++++------------ src/server/pbsd_main.c | 2 +- src/server/process_alps_status.c | 24 +++++++-------- src/server/process_mom_update.c | 2 +- src/server/queue_func.c | 8 ++--- src/server/receive_mom_communication.c | 2 +- src/server/req_delete.c | 2 +- src/server/req_jobobit.c | 42 +++++++++++++------------- src/server/req_quejob.c | 2 +- src/server/req_runjob.c | 12 ++++---- src/server/req_shutdown.c | 2 +- src/server/req_signal.c | 2 +- src/server/svr_jobfunc.c | 4 +-- 18 files changed, 99 insertions(+), 96 deletions(-) diff --git a/src/server/display_alps_status.c b/src/server/display_alps_status.c index 816edb0f15..dcde8d966d 100644 --- a/src/server/display_alps_status.c +++ b/src/server/display_alps_status.c @@ -5,6 +5,8 @@ #include "list_link.h" #include "../lib/Libutils/u_lock_ctl.h" +extern int LOGLEVEL; + int status_node(struct pbsnode *, struct batch_request *, int *, tlist_head *); @@ -24,7 +26,7 @@ int get_alps_statuses( while ((alps_node = next_host(&(parent->alps_subnodes), &iter, NULL)) != NULL) { rc = status_node(alps_node, 
preq, bad, pstathd); - unlock_node(alps_node, __func__, NULL, 0); + unlock_node(alps_node, __func__, NULL, LOGLEVEL); if (rc != PBSE_NONE) break; diff --git a/src/server/job_container.c b/src/server/job_container.c index 305c838c62..562df6a40a 100644 --- a/src/server/job_container.c +++ b/src/server/job_container.c @@ -478,18 +478,18 @@ job *svr_find_job( { pj = pj->ji_external_clone; - lock_ji_mutex(pj, __func__, NULL, 0); - unlock_ji_mutex(pj->ji_parent_job, __func__, NULL, 0); + lock_ji_mutex(pj, __func__, NULL, LOGLEVEL); + unlock_ji_mutex(pj->ji_parent_job, __func__, NULL, LOGLEVEL); if (pj->ji_being_recycled == TRUE) { - unlock_ji_mutex(pj, __func__, NULL, 0); + unlock_ji_mutex(pj, __func__, NULL, LOGLEVEL); pj = NULL; } } else { - unlock_ji_mutex(pj, __func__, NULL, 0); + unlock_ji_mutex(pj, __func__, NULL, LOGLEVEL); pj = NULL; } } diff --git a/src/server/job_func.c b/src/server/job_func.c index b37e95eaec..e2289f6f0a 100644 --- a/src/server/job_func.c +++ b/src/server/job_func.c @@ -741,13 +741,13 @@ void job_free( if (pj->ji_cray_clone != NULL) { - lock_ji_mutex(pj->ji_cray_clone, __func__, NULL, 0); + lock_ji_mutex(pj->ji_cray_clone, __func__, NULL, LOGLEVEL); job_free(pj->ji_cray_clone, TRUE); } if (pj->ji_external_clone != NULL) { - lock_ji_mutex(pj->ji_external_clone, __func__, NULL, 0); + lock_ji_mutex(pj->ji_external_clone, __func__, NULL, LOGLEVEL); job_free(pj->ji_external_clone, TRUE); } @@ -2303,7 +2303,7 @@ int split_job( change_external_job_name(external); external->ji_parent_job = pjob; pjob->ji_external_clone = external; - unlock_ji_mutex(external, __func__, NULL, 0); + unlock_ji_mutex(external, __func__, NULL, LOGLEVEL); } if (pjob->ji_cray_clone == NULL) @@ -2312,7 +2312,7 @@ int split_job( fix_cray_exec_hosts(cray); cray->ji_parent_job = pjob; pjob->ji_cray_clone = cray; - unlock_ji_mutex(cray, __func__, NULL, 0); + unlock_ji_mutex(cray, __func__, NULL, LOGLEVEL); } return(PBSE_NONE); diff --git a/src/server/login_nodes.c 
b/src/server/login_nodes.c index 57d14ae889..6990def59b 100644 --- a/src/server/login_nodes.c +++ b/src/server/login_nodes.c @@ -84,6 +84,7 @@ #include "login_nodes.h" #include "../lib/Libutils/u_lock_ctl.h" +extern int LOGLEVEL; login_holder logins; @@ -132,7 +133,7 @@ struct pbsnode *check_node( { struct pbsnode *pnode = ln->pnode; - lock_node(pnode, __func__, NULL, 20); + lock_node(pnode, __func__, NULL, LOGLEVEL); if ((hasprop(pnode, needed) == TRUE) && (pnode->nd_nsn - pnode->nd_np_to_be_used >= 1) && @@ -141,7 +142,7 @@ struct pbsnode *check_node( return(pnode); else { - unlock_node(pnode, __func__, NULL, 20); + unlock_node(pnode, __func__, NULL, LOGLEVEL); return(NULL); } } /* END check_node() */ @@ -252,7 +253,7 @@ struct pbsnode *get_next_login_node( if (ln != NULL) { pnode = ln->pnode; - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); if (needed != NULL) { @@ -272,7 +273,7 @@ struct pbsnode *get_next_login_node( if (node_fits == FALSE) { - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); pnode = find_fitting_node(needed); } else diff --git a/src/server/node_func.c b/src/server/node_func.c index bd8e9c6f41..94ff8eb099 100644 --- a/src/server/node_func.c +++ b/src/server/node_func.c @@ -285,7 +285,7 @@ struct pbsnode *find_node_in_allnodes( pnode = (struct pbsnode *)an->ra->slots[index].item; if (pnode != NULL) - lock_node(pnode, __func__, 0, 0); + lock_node(pnode, __func__, 0, LOGLEVEL); } pthread_mutex_unlock(an->allnodes_mutex); @@ -351,17 +351,17 @@ struct pbsnode *find_nodebyname( { if (alps_reporter != NULL) { - lock_node(alps_reporter, __func__, NULL, 0); + lock_node(alps_reporter, __func__, NULL, LOGLEVEL); if ((i = get_value_hash(alps_reporter->alps_subnodes.ht, (void *)nodename)) >= 0) { if ((pnode = (struct pbsnode *)alps_reporter->alps_subnodes.ra->slots[i].item) != NULL) { - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); } } - 
unlock_node(alps_reporter, __func__, NULL, 0); + unlock_node(alps_reporter, __func__, NULL, LOGLEVEL); } } else @@ -2427,7 +2427,7 @@ int setup_nodes(void) np->nd_is_alps_reporter = TRUE; alps_reporter = np; initialize_all_nodes_array(&(np->alps_subnodes)); - unlock_node(np, __func__, NULL, 0); + unlock_node(np, __func__, NULL, LOGLEVEL); } else if (is_alps_starter == TRUE) { @@ -2435,7 +2435,7 @@ int setup_nodes(void) np->nd_is_alps_login = TRUE; add_to_login_holder(np); /* NYI: add to login node list */ - unlock_node(np, __func__, NULL, 0); + unlock_node(np, __func__, NULL, LOGLEVEL); } } @@ -3248,7 +3248,7 @@ static struct pbsnode *get_my_next_alps_node( { struct pbsnode *alps_node = next_host(&(pnode->alps_subnodes), &(iter->alps_index), NULL); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); return(alps_node); } /* END get_my_next_alps_node() */ @@ -3340,7 +3340,7 @@ struct pbsnode *next_node( if (next != NULL) { - lock_node(next, __func__, NULL, 0); + lock_node(next, __func__, NULL, LOGLEVEL); if (next->nd_is_alps_reporter) next = get_my_next_alps_node(iter, next); @@ -3349,7 +3349,7 @@ struct pbsnode *next_node( } else { - unlock_node(current, __func__, NULL, 0); + unlock_node(current, __func__, NULL, LOGLEVEL); iter->alps_index = -1; pthread_mutex_lock(an->allnodes_mutex); @@ -3358,7 +3358,7 @@ struct pbsnode *next_node( if (next != NULL) { - lock_node(next, __func__, NULL, 0); + lock_node(next, __func__, NULL, LOGLEVEL); if (next->nd_is_alps_reporter) next = get_my_next_alps_node(iter, next); @@ -3542,7 +3542,7 @@ struct pbsnode *next_host( if (held != NULL) { name = strdup(held->nd_name); - unlock_node(held, __func__, NULL, 0); + unlock_node(held, __func__, NULL, LOGLEVEL); } pthread_mutex_lock(an->allnodes_mutex); } @@ -3595,7 +3595,7 @@ void *send_hierarchy_threadtask( if (pnode != NULL) { port = pnode->nd_mom_rm_port; - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); if 
(send_hierarchy(hi->name, port) != PBSE_NONE) { diff --git a/src/server/node_manager.c b/src/server/node_manager.c index 9d8aa6ca7c..16c925505d 100644 --- a/src/server/node_manager.c +++ b/src/server/node_manager.c @@ -730,10 +730,10 @@ int kill_job_on_mom( strcpy(preq->rq_ind.rq_signal.rq_jid, jobid); strcpy(preq->rq_ind.rq_signal.rq_signame, "SIGKILL"); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); rc = issue_Drequest(conn, preq); free_br(preq); - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); } } @@ -757,9 +757,9 @@ int job_should_be_on_node( if ((is_job_on_node(pnode, jobid)) == FALSE) { /* must lock the job before the node */ - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); pjob = svr_find_job(jobid, TRUE); - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); if (pjob != NULL) { @@ -837,9 +837,9 @@ int remove_jobs_that_have_disappeared( job *pjob; /* locking priority is job before node */ - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); pjob = svr_find_job(jobid, TRUE); - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); if (pjob == NULL) { @@ -971,7 +971,7 @@ void *sync_node_jobs( /*free_resizable_array(ms_jobs);*/ - unlock_node(np, __func__, NULL, 0); + unlock_node(np, __func__, NULL, LOGLEVEL); return(NULL); } /* END sync_node_jobs() */ @@ -2500,7 +2500,7 @@ int is_compute_node( if ((pnode = find_nodebyname(node_id)) != NULL) { rc = TRUE; - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } if (colon != NULL) @@ -2571,13 +2571,13 @@ int check_for_node_type( (!strcmp(p->name, alps_starter_feature))) continue; - lock_node(reporter, __func__, NULL, 0); + lock_node(reporter, __func__, NULL, LOGLEVEL); pnode = find_node_in_allnodes(&(reporter->alps_subnodes), p->name); - unlock_node(reporter, __func__, 
NULL, 0); + unlock_node(reporter, __func__, NULL, LOGLEVEL); if (pnode != NULL) { - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); if (nt == ND_TYPE_CRAY) { @@ -2597,7 +2597,7 @@ int check_for_node_type( if (pnode->nd_is_alps_login == TRUE) login = TRUE; - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); if (nt == ND_TYPE_EXTERNAL) { @@ -2674,7 +2674,7 @@ int add_login_node_if_needed( if (login->nd_is_alps_login == FALSE) need_to_add_login = TRUE; - unlock_node(login, __func__, NULL, 0); + unlock_node(login, __func__, NULL, LOGLEVEL); } if (need_to_add_login == TRUE) @@ -2701,7 +2701,7 @@ int add_login_node_if_needed( rc = PBSE_NONE; - unlock_node(login, __func__, NULL, 0); + unlock_node(login, __func__, NULL, LOGLEVEL); } if (prop != NULL) @@ -4078,7 +4078,7 @@ int add_to_ms_list( { insert_thing(pnode->nd_ms_jobs, strdup(pjob->ji_qs.ji_jobid)); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } return(PBSE_NONE); @@ -4994,7 +4994,7 @@ void free_nodes( remove_job_from_node(pnode, pjob); remove_job_from_nodes_gpus(pnode, pjob); remove_job_from_nodes_mics(pnode, pjob); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } } @@ -5005,7 +5005,7 @@ void free_nodes( if ((pnode = find_nodebyname(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str)) != NULL) { remove_job_from_node(pnode, pjob); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } } @@ -5036,9 +5036,9 @@ struct pbsnode *get_compute_node( } } - lock_node(ar, __func__, NULL, 0); + lock_node(ar, __func__, NULL, LOGLEVEL); compute_node = create_alps_subnode(ar, node_name); - unlock_node(ar, __func__, NULL, 0); + unlock_node(ar, __func__, NULL, LOGLEVEL); return(compute_node); } /* END get_compute_node() */ diff --git a/src/server/pbsd_main.c b/src/server/pbsd_main.c index 59015ac095..137a56cd7a 100644 --- 
a/src/server/pbsd_main.c +++ b/src/server/pbsd_main.c @@ -1217,7 +1217,7 @@ void *handle_queue_routing_retries( } } } - unlock_queue(pque, __func__, NULL, 0); + unlock_queue(pque, __func__, NULL, LOGLEVEL); } sleep(route_retry_interval); } diff --git a/src/server/process_alps_status.c b/src/server/process_alps_status.c index 25a9e5f02c..584646c0de 100644 --- a/src/server/process_alps_status.c +++ b/src/server/process_alps_status.c @@ -128,7 +128,7 @@ struct pbsnode *find_alpsnode_by_name( pthread_mutex_unlock(parent->alps_subnodes.allnodes_mutex); if (node != NULL) - lock_node(node, __func__, NULL, 0); + lock_node(node, __func__, NULL, LOGLEVEL); return(node); } /* END find_alpsnode_by_name() */ @@ -186,7 +186,7 @@ struct pbsnode *create_alps_subnode( /* add any properties to the subnodes */ copy_properties(subnode, parent); - lock_node(subnode, __func__, NULL, 0); + lock_node(subnode, __func__, NULL, LOGLEVEL); insert_node(&(parent->alps_subnodes), subnode); @@ -237,7 +237,7 @@ void *check_if_orphaned( } /* unlock before the network transaction */ - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); if (handle >= 0) issue_Drequest(handle, preq); @@ -268,7 +268,7 @@ struct pbsnode *determine_node_from_str( (strcmp(node_id, current->nd_name))) { if (current != NULL) - unlock_node(current, __func__, NULL, 0); + unlock_node(current, __func__, NULL, LOGLEVEL); if ((next = find_alpsnode_by_name(parent, node_id)) == NULL) { @@ -488,7 +488,7 @@ int record_reservation( { strcpy(jobid, sub_node->jobs->jobid); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); if ((pjob = svr_find_job(jobid, TRUE)) != NULL) { @@ -500,11 +500,11 @@ int record_reservation( found_job = TRUE; job_mutex.unlock(); - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); break; } else - lock_node(pnode, __func__, NULL, 0); + lock_node(pnode, __func__, NULL, LOGLEVEL); } } @@ -612,12 +612,12 @@ int 
process_alps_status( /* sub-functions will attempt to lock a job, so we must unlock the * reporter node */ - unlock_node(parent, __func__, NULL, 0); + unlock_node(parent, __func__, NULL, LOGLEVEL); process_reservation_id(current, str); current_node_id = strdup(current->nd_name); - unlock_node(current, __func__, NULL, 0); + unlock_node(current, __func__, NULL, LOGLEVEL); /* re-lock the parent */ if ((parent = find_nodebyname(nd_name)) == NULL) @@ -634,7 +634,7 @@ int process_alps_status( if ((current = find_node_in_allnodes(&parent->alps_subnodes, current_node_id)) == NULL) { /* current node disappeared, this shouldn't be possible either */ - unlock_node(parent, __func__, NULL, 0); + unlock_node(parent, __func__, NULL, LOGLEVEL); snprintf(log_buf, sizeof(log_buf), "Current node '%s' disappeared while recording a reservation", current_node_id); log_err(PBSE_UNKNODE, __func__, log_buf); @@ -675,10 +675,10 @@ int process_alps_status( snprintf(node_index_buf, sizeof(node_index_buf), "node_index=%d", node_index++); decode_arst(&temp, NULL, NULL, node_index_buf, 0); save_node_status(current, &temp); - unlock_node(current, __func__, NULL, 0); + unlock_node(current, __func__, NULL, LOGLEVEL); } - unlock_node(parent, __func__, NULL, 0); + unlock_node(parent, __func__, NULL, LOGLEVEL); free_all_keys(rsv_ht); free_hash(rsv_ht); diff --git a/src/server/process_mom_update.c b/src/server/process_mom_update.c index 69243c663e..20a0301bd3 100644 --- a/src/server/process_mom_update.c +++ b/src/server/process_mom_update.c @@ -790,7 +790,7 @@ int process_status_info( if (current != NULL) { save_node_status(current, &temp); - unlock_node(current, __func__, NULL, 0); + unlock_node(current, __func__, NULL, LOGLEVEL); } if ((rc == PBSE_NONE) && diff --git a/src/server/queue_func.c b/src/server/queue_func.c index 4f0fa4edc4..cab231a6e0 100644 --- a/src/server/queue_func.c +++ b/src/server/queue_func.c @@ -617,7 +617,7 @@ int get_parent_dest_queues( unlock_ji_mutex(pjob, __func__, "1", 
LOGLEVEL); - unlock_queue(*parent, __func__, NULL, 0); + unlock_queue(*parent, __func__, NULL, LOGLEVEL); *parent = NULL; @@ -645,8 +645,8 @@ int get_parent_dest_queues( else { /* SUCCESS! */ - lock_queue(pque_parent, __func__, NULL, 0); - lock_queue(pque_dest, __func__, (char *)NULL, 0); + lock_queue(pque_parent, __func__, NULL, LOGLEVEL); + lock_queue(pque_dest, __func__, (char *)NULL,LOGLEVEL); *parent = pque_parent; *dest = pque_dest; @@ -687,7 +687,7 @@ pbs_queue *lock_queue_with_job_held( if ((pjob = svr_find_job(jobid, TRUE)) == NULL) { - unlock_queue(pque, __func__, NULL, 0); + unlock_queue(pque, __func__, NULL, LOGLEVEL); pque = NULL; *pjob_ptr = NULL; } diff --git a/src/server/receive_mom_communication.c b/src/server/receive_mom_communication.c index 1ebd197fdc..e56090dbda 100644 --- a/src/server/receive_mom_communication.c +++ b/src/server/receive_mom_communication.c @@ -151,7 +151,7 @@ int is_reporter_node( if (pnode != NULL) { rc = pnode->nd_is_alps_reporter; - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } return(rc); diff --git a/src/server/req_delete.c b/src/server/req_delete.c index 47cc0e4744..8299f6578e 100644 --- a/src/server/req_delete.c +++ b/src/server/req_delete.c @@ -825,7 +825,7 @@ int handle_single_delete( } else { - unlock_ji_mutex(pjob, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); /* send the asynchronous reply if needed */ if (preq_tmp != NULL) diff --git a/src/server/req_jobobit.c b/src/server/req_jobobit.c index a25c17a7e3..d60a12736c 100644 --- a/src/server/req_jobobit.c +++ b/src/server/req_jobobit.c @@ -673,7 +673,7 @@ int mom_comm( strcpy(jobid, pjob->ji_qs.ji_jobid); - unlock_ji_mutex(pjob, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); handle = svr_connect( pjob->ji_qs.ji_un.ji_exect.ji_momaddr, @@ -1587,8 +1587,8 @@ int handle_complete_subjob( int rc = PBSE_NONE; int complete_parent = FALSE; - unlock_ji_mutex(pjob, __func__, NULL, 0); - 
lock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); + lock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); if (parent_job->ji_being_recycled == FALSE) { @@ -1597,13 +1597,13 @@ int handle_complete_subjob( else other_subjob = parent_job->ji_cray_clone; - lock_ji_mutex(other_subjob, __func__, NULL, 0); + lock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); if ((other_subjob->ji_being_recycled == TRUE) || (other_subjob->ji_qs.ji_state == JOB_STATE_COMPLETE)) complete_parent = TRUE; - unlock_ji_mutex(other_subjob, __func__, NULL, 0); + unlock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); if (complete_parent == TRUE) { @@ -1624,7 +1624,7 @@ int handle_complete_subjob( } } - unlock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); return(rc); } /* END handle_complete_subjob() */ @@ -2683,8 +2683,8 @@ int handle_subjob_exit_status( parent_job = pjob->ji_parent_job; - unlock_ji_mutex(pjob, __func__, NULL, 0); - lock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); + lock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); if (parent_job->ji_qs.ji_un.ji_exect.ji_exitstat == 0) { @@ -2704,8 +2704,8 @@ int handle_subjob_exit_status( add_comment_to_parent(parent_job, cray_exited_nonzero, exit_status); - unlock_ji_mutex(parent_job, __func__, NULL, 0); - lock_ji_mutex(other_subjob, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); + lock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); if (other_subjob->ji_qs.ji_state <= JOB_STATE_RUNNING) { @@ -2713,7 +2713,7 @@ int handle_subjob_exit_status( pnode = find_nodebyname(other_subjob->ji_qs.ji_destin); } - unlock_ji_mutex(other_subjob, __func__, NULL, 0); + unlock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); if (pnode != NULL) { @@ -2723,14 +2723,14 @@ int handle_subjob_exit_status( log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, other_jobid, log_buf); 
kill_job_on_mom(other_jobid, pnode); - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, NULL, LOGLEVEL); } } else - unlock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); } else - unlock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); return(rc); } /* END handle_subjob_exit_status() */ @@ -2831,24 +2831,24 @@ int handle_rerunning_heterogeneous_jobs( if ((rc = rerun_job(pjob, newstate, newsubst, acctbuf)) == PBSE_NONE) { - unlock_ji_mutex(pjob, __func__, NULL, 0); - lock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); + lock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); if (parent_job->ji_external_clone == pjob) other_subjob = parent_job->ji_cray_clone; else other_subjob = parent_job->ji_external_clone; - unlock_ji_mutex(parent_job, __func__, NULL, 0); - lock_ji_mutex(other_subjob, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); + lock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); if ((rc = rerun_job(other_subjob, newstate, newsubst, acctbuf)) == PBSE_NONE) { - unlock_ji_mutex(other_subjob, __func__, NULL, 0); - lock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(other_subjob, __func__, NULL, LOGLEVEL); + lock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); if ((rc = rerun_job(parent_job, newstate, newsubst, acctbuf)) == PBSE_NONE) - unlock_ji_mutex(parent_job, __func__, NULL, 0); + unlock_ji_mutex(parent_job, __func__, NULL, LOGLEVEL); } } diff --git a/src/server/req_quejob.c b/src/server/req_quejob.c index a86c812e91..d44c407f9d 100644 --- a/src/server/req_quejob.c +++ b/src/server/req_quejob.c @@ -1970,7 +1970,7 @@ int set_interactive_job_roaming_policy( pjob->ji_wattr[JOB_ATR_login_prop].at_flags |= ATR_VFLAG_SET; pjob->ji_wattr[JOB_ATR_login_prop].at_val.at_str = submit_node_id; - unlock_node(pnode, __func__, NULL, 0); + unlock_node(pnode, __func__, 
NULL, LOGLEVEL); } else { diff --git a/src/server/req_runjob.c b/src/server/req_runjob.c index 3301624751..678f185d15 100644 --- a/src/server/req_runjob.c +++ b/src/server/req_runjob.c @@ -1271,9 +1271,9 @@ int handle_heterogeneous_job_launch( batch_request *external_preq; batch_request *cray_preq; - unlock_ji_mutex(pjob, __func__, NULL, 0); - lock_ji_mutex(external_clone, __func__, NULL, 0); - lock_ji_mutex(cray_clone, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); + lock_ji_mutex(external_clone, __func__, NULL, LOGLEVEL); + lock_ji_mutex(cray_clone, __func__, NULL, LOGLEVEL); /* clone the batch requests to avoid double frees */ external_preq = duplicate_request(preq); @@ -1297,12 +1297,12 @@ int handle_heterogeneous_job_launch( free_br(cray_preq); if (cray_clone != NULL) - unlock_ji_mutex(cray_clone, __func__, NULL, 0); + unlock_ji_mutex(cray_clone, __func__, NULL, LOGLEVEL); if (external_clone != NULL) - unlock_ji_mutex(external_clone, __func__, NULL, 0); + unlock_ji_mutex(external_clone, __func__, NULL, LOGLEVEL); - lock_ji_mutex(pjob, __func__, NULL, 0); + lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); if (both_running == TRUE) { diff --git a/src/server/req_shutdown.c b/src/server/req_shutdown.c index 1d7254604c..d919a920c5 100644 --- a/src/server/req_shutdown.c +++ b/src/server/req_shutdown.c @@ -154,7 +154,7 @@ void save_queues() while ((pque = next_queue(&svr_queues, &iter)) != NULL) { que_save(pque); - unlock_queue(pque, __func__, NULL, 0); + unlock_queue(pque, __func__, NULL, LOGLEVEL); } } /* END save_queues() */ diff --git a/src/server/req_signal.c b/src/server/req_signal.c index 3df2ab4457..307ee12b8a 100644 --- a/src/server/req_signal.c +++ b/src/server/req_signal.c @@ -281,7 +281,7 @@ int issue_signal( (pjob != NULL)) { strcpy(jobid, pjob->ji_qs.ji_jobid); - unlock_ji_mutex(pjob, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); func(newreq); *pjob_ptr = svr_find_job((char *)jobid, TRUE); diff --git 
a/src/server/svr_jobfunc.c b/src/server/svr_jobfunc.c index 508780aafb..3b2103dc21 100644 --- a/src/server/svr_jobfunc.c +++ b/src/server/svr_jobfunc.c @@ -729,7 +729,7 @@ int svr_dequejob( strcpy(job_id, pjob->ji_qs.ji_jobid); /* this function will lock queues and jobs */ - unlock_ji_mutex(pjob, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); if (parent_queue_mutex_held == TRUE) { @@ -2053,7 +2053,7 @@ static int check_queue_job_limit( int user_jobs = 0; /* count number of jobs user has in queue */ - unlock_ji_mutex(pjob, __func__, NULL, 0); + unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); user_jobs = count_queued_jobs(pque, pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str); From ee252fa51f0808b0e623528e7e52b0ef392bad4e Mon Sep 17 00:00:00 2001 From: Ken Date: Wed, 23 Jan 2013 17:14:57 -0700 Subject: [PATCH 7/8] TRQ-1602. Modified pam_pbssimpleauth.c to remove compile time error when using --with-pam --- src/pam/pam_pbssimpleauth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pam/pam_pbssimpleauth.c b/src/pam/pam_pbssimpleauth.c index b9de21b33a..12ed43c716 100644 --- a/src/pam/pam_pbssimpleauth.c +++ b/src/pam/pam_pbssimpleauth.c @@ -19,6 +19,7 @@ #include "portability.h" #include "list_link.h" #include "pbs_ifl.h" +#include "lib_ifl.h" #include "attribute.h" #include "server_limits.h" #include "pbs_job.h" @@ -54,7 +55,6 @@ #endif - /* --- authentication management functions (only) --- */ PAM_EXTERN @@ -104,7 +104,7 @@ int pam_sm_authenticate(pam_handle_t *pamh, int flags, int argc, } /* get the username and passwd, allow uid 0 */ - retval = pam_get_user(pamh, &username, NULL); + retval = pam_get_user(pamh, (const char **)&username, NULL); #if defined(PAM_CONV_AGAIN) && defined(PAM_INCOMPLETE) if (retval == PAM_CONV_AGAIN) From 38b8d5157b13088bead16923d572a24f0eec8c65 Mon Sep 17 00:00:00 2001 From: David Beer Date: Wed, 23 Jan 2013 17:18:40 -0700 Subject: [PATCH 8/8] Fix a g++ error with --enable-shell-use-argv 
--- src/resmom/start_exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/resmom/start_exec.c b/src/resmom/start_exec.c index 8e8061a150..aac4c0f215 100644 --- a/src/resmom/start_exec.c +++ b/src/resmom/start_exec.c @@ -4147,7 +4147,7 @@ int TMomFinalizeChild( /* Put the script's arguments on the command line (see configure option --enable-shell-use-argv). */ if (TJE->is_interactive == FALSE) { - arg[aindex] = calloc(1, + arg[aindex] = (char *)calloc(1, strlen(path_jobs) + strlen(pjob->ji_qs.ji_fileprefix) + strlen(JOB_SCRIPT_SUFFIX) + 6);