From faf6b19f123b505778ed40f23afbbd826d6c1830 Mon Sep 17 00:00:00 2001 From: Michael Jennings Date: Tue, 11 Dec 2012 17:29:29 -0800 Subject: [PATCH 1/2] Fix "make dist" --- Makefile.am | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.am b/Makefile.am index ca8fdb7f2e..028012209a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,12 +7,12 @@ EXTRA_DIST = acinclude.m4 \ autogen.sh \ CHANGELOG \ configure.ac \ - cov_file_results.pl \ + cov_file_results.py \ Doxyfile \ INSTALL \ INSTALL.GNU \ Makefile.am \ - parse_cov_results.pl \ + parse_cov_results.py \ PBS_License.txt \ README.array_changes \ README.coding_notes \ @@ -23,7 +23,7 @@ EXTRA_DIST = acinclude.m4 \ README.trqauthd \ README.building_40 \ Release_Notes \ - run_report.pl \ + run_report.py \ torque.setup \ torque.spec \ buildutils/config.mk \ From c374b184e351a1969efd84ef9485300e4ec22bae Mon Sep 17 00:00:00 2001 From: Michael Jennings Date: Wed, 12 Dec 2012 12:06:26 -0800 Subject: [PATCH 2/2] Move loop to within queue_route() so that each queue has a single thread for routing its jobs. No mutex is needed this way. This keeps one queue from blocking other queues while preventing multiple queue_route() instances operating on the same queue at the same time. TODO: Must create new queue_route() thread for runtime-added queues. --- src/server/job_route.c | 76 +++++++++++---------- src/server/pbsd_main.c | 149 ++++++++++++++++++----------------------- 2 files changed, 107 insertions(+), 118 deletions(-) diff --git a/src/server/job_route.c b/src/server/job_route.c index 0da21a00e3..d450e280f9 100644 --- a/src/server/job_route.c +++ b/src/server/job_route.c @@ -134,6 +134,8 @@ extern char *msg_err_noqueue; extern int LOGLEVEL; extern pthread_mutex_t *reroute_job_mutex; +int route_retry_interval = 5; /* time in seconds to check routing queues */ + /* * Add an entry to the list of bad destinations for a job. 
* @@ -168,6 +170,7 @@ void add_dest( + /* * Check the job for a match of dest in the list of rejected destinations. * @@ -275,7 +278,7 @@ int default_router( if (is_bad_dest(jobp, destination)) continue; - switch (svr_movejob(jobp, destination, &local_errno, NULL, FALSE)) + switch (svr_movejob(jobp, destination, &local_errno, NULL, TRUE)) { case ROUTE_PERM_FAILURE: /* permanent failure */ @@ -488,7 +491,6 @@ int reroute_job( job_abt(&pjob, msg_routexceed); else if (rc == PBSE_QUENOEN) job_abt(&pjob, msg_err_noqueue); - } return(rc); @@ -530,48 +532,50 @@ void *queue_route( { sprintf(log_buf, "NULL queue name"); log_err(-1, __func__, log_buf); - pthread_exit(0); + return(NULL); } - if (LOGLEVEL>=7) - { - snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "queue name: %s ", queue_name); - log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf); - } - - - pthread_mutex_lock(reroute_job_mutex); - - pque = find_queuebyname(queue_name); - if (pque == NULL) + while (1) { - sprintf(log_buf, "Could not find queue %s", queue_name); - log_err(-1, __func__, log_buf); - free(queue_name); - pthread_mutex_unlock(reroute_job_mutex); - pthread_exit(0); - } + if (LOGLEVEL >= 7) + { + snprintf(log_buf, sizeof(log_buf), "queue name: %s", queue_name); + log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_QUEUE, __func__, log_buf); + } + + pthread_mutex_lock(reroute_job_mutex); + pque = find_queuebyname(queue_name); + if (pque == NULL) + { + sprintf(log_buf, "Could not find queue %s", queue_name); + log_err(-1, __func__, log_buf); + free(queue_name); + pthread_mutex_unlock(reroute_job_mutex); + return(NULL); + } - while ((pjob = next_job(pque->qu_jobs,&iter)) != NULL) - { - /* the second condition says we only want to try if routing - * has been tried once - this is to let req_commit have the - * first crack at routing always */ - unlock_queue(pque, __func__, NULL, LOGLEVEL); - if ((pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now) && - (pjob->ji_qs.ji_un.ji_routet.ji_rteretry != 0)) + while 
((pjob = next_job(pque->qu_jobs,&iter)) != NULL) { - reroute_job(pjob, pque); - unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); + /* the second condition says we only want to try if routing + * has been tried once - this is to let req_commit have the + * first crack at routing always */ + unlock_queue(pque, __func__, (char *)NULL, 0); + if ((pjob->ji_qs.ji_un.ji_routet.ji_rteretry <= time_now - ROUTE_RETRY_TIME) && + (pjob->ji_qs.ji_un.ji_routet.ji_rteretry != 0)) + { + reroute_job(pjob, pque); + unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); + } + else + unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); } - else - unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); - } + unlock_queue(pque, __func__, (char *)NULL, 0); + pthread_mutex_unlock(reroute_job_mutex); + sleep(route_retry_interval); + } free(queue_name); - unlock_queue(pque, __func__, NULL, LOGLEVEL); - pthread_mutex_unlock(reroute_job_mutex); - pthread_exit(0); + return(NULL); } /* END queue_route() */ diff --git a/src/server/pbsd_main.c b/src/server/pbsd_main.c index 3cc9bdb969..73782d707b 100644 --- a/src/server/pbsd_main.c +++ b/src/server/pbsd_main.c @@ -120,7 +120,6 @@ #include "tracking.h" #include "acct.h" #include "sched_cmds.h" -#include "rpp.h" #include "dis.h" #include "dis_init.h" #include "batch_request.h" @@ -258,7 +257,6 @@ int queue_rank = 0; int a_opt_init = -1; int wait_for_moms_hierarchy = FALSE; -int route_retry_interval = 10; /* time in seconds to check routing queues */ /* HA global data items */ long HALockCheckTime = 0; long HALockUpdateTime = 0; @@ -417,7 +415,7 @@ int process_pbs_server_port( if (rc != DIS_SUCCESS) { - log_err(-1, __func__, "Cannot read version - skipping this request.\n"); + log_err(-1, __func__, (char *)"Cannot read version - skipping this request.\n"); rc = PBSE_SOCKET_CLOSE; break; } @@ -806,7 +804,7 @@ void parse_command_line( } else { - log_err(-1, __func__, "unable to determine full server hostname"); + log_err(-1, __func__, (char *)"unable to 
determine full server hostname"); } exit(1); @@ -1116,14 +1114,14 @@ static int start_hot_jobs(void) PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, - "attempting to hot start job"); + (char *)"attempting to hot start job"); svr_startjob(pjob, NULL, NULL, NULL); ct++; } - unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); + unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); } return(ct); @@ -1148,6 +1146,8 @@ void send_any_hellos_needed() } /* END send_any_hellos_needed() */ + + void *handle_queue_routing_retries( void *vp) @@ -1156,46 +1156,26 @@ void *handle_queue_routing_retries( pbs_queue *pque; char *queuename; int iter = -1; - void *status; - pthread_t queue_route_thread_id = -1; - pthread_attr_t queue_route_attr; - - - if ((pthread_attr_init(&queue_route_attr)) != 0) - { - perror("pthread_attr_init failed. handle_queue_routing_retries not started"); - log_err(-1, msg_daemonname, "pthread_attr_init failed. handle_queue_routing_retries not started"); - return(NULL); - } - else if (pthread_attr_setdetachstate(&queue_route_attr, PTHREAD_CREATE_JOINABLE) != 0) - { - perror("pthread_attr_setdetachstate failed. handle_queue_routing_retries not started"); - log_err(-1, msg_daemonname, "pthread_attr_setdetachstate failed. 
handle_queue_routing_retries not started"); - return(NULL); - } - - while(1) + while ((pque = next_queue(&svr_queues, &iter)) != NULL) { - sleep(route_retry_interval); - while ((pque = next_queue(&svr_queues, &iter)) != NULL) + if (pque->qu_qs.qu_type == QTYPE_RoutePush) { - if (pque->qu_qs.qu_type == QTYPE_RoutePush) - { - queuename = strdup(pque->qu_qs.qu_name); /* make sure this gets freed inside queue_route */ - unlock_queue(pque, __func__, NULL, 0); - pthread_create(&queue_route_thread_id, &queue_route_attr, queue_route, queuename); - pthread_join(queue_route_thread_id, &status); - } - else - unlock_queue(pque, __func__, NULL, 0); + queuename = strdup(pque->qu_qs.qu_name); /* make sure this gets freed inside queue_route */ + enqueue_threadpool_request(queue_route, queuename); } + + unlock_queue(pque, __func__, (char *)NULL, 0); } return(NULL); } /* END handle_queue_routing_retries() */ + + + + void *handle_scheduler_contact( void *vp) @@ -1230,21 +1210,21 @@ void start_accept_thread() { pthread_attr_t accept_attr; + accept_thread_id = -1; if ((pthread_attr_init(&accept_attr)) != 0) { perror("pthread_attr_init failed. Could not start accept thread"); - log_err(-1, msg_daemonname,"pthread_attr_init failed. Could not start accept thread"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_init failed. Could not start accept thread"); } else if ((pthread_attr_setdetachstate(&accept_attr, PTHREAD_CREATE_DETACHED) != 0)) { perror("pthread_attr_setdetatchedstate failed. Could not start accept thread"); - log_err(-1, msg_daemonname,"pthread_attr_setdetachedstate failed. Could not start accept thread"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_setdetachedstate failed. 
Could not start accept thread"); } else if ((pthread_create(&accept_thread_id, &accept_attr, start_accept_listener, NULL)) != 0) { - accept_thread_id = -1; perror("could not start listener for pbs_server"); - log_err(-1, msg_daemonname, "Failed to start listener for pbs_server"); + log_err(-1, msg_daemonname, (char *)"Failed to start listener for pbs_server"); } } /* END start_accept_thread() */ @@ -1257,17 +1237,17 @@ void start_routing_retry_thread() if ((pthread_attr_init(&routing_attr)) != 0) { perror("pthread_attr_init failed. Could not start accept thread"); - log_err(-1, msg_daemonname,"pthread_attr_init failed. Could not start handle_queue_routing_retries"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_init failed. Could not start handle_queue_routing_retries"); } else if ((pthread_attr_setdetachstate(&routing_attr, PTHREAD_CREATE_DETACHED) != 0)) { perror("pthread_attr_setdetatchedstate failed. Could not start accept thread"); - log_err(-1, msg_daemonname,"pthread_attr_setdetachedstate failed. Could not start handle_queue_routing_retries"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_setdetachedstate failed. Could not start handle_queue_routing_retries"); } else if ((pthread_create(&route_retry_thread_id, &routing_attr, handle_queue_routing_retries, NULL)) != 0) { perror("could not start listener for pbs_server"); - log_err(-1, msg_daemonname, "Failed to start handle_queue_routing_retries"); + log_err(-1, msg_daemonname, (char *)"Failed to start handle_queue_routing_retries"); } } /* END start_routing_retry_thread() */ @@ -1283,17 +1263,17 @@ void start_exiting_retry_thread() if (pthread_attr_init(&attr) != 0) { perror("pthread_attr_init failed. Could not start exiting retry thread"); - log_err(-1, msg_daemonname,"pthread_attr_init failed. Could not start inspect_exiting_jobs"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_init failed. 
Could not start inspect_exiting_jobs"); } else if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) { perror("pthread_attr_setdetatchedstate failed. Could not start exiting retry thread"); - log_err(-1, msg_daemonname,"pthread_attr_setdetachedstate failed. Could not start inspect_exiting_jobs"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_setdetachedstate failed. Could not start inspect_exiting_jobs"); } else if (pthread_create(&exiting_thread, &attr, inspect_exiting_jobs, NULL) != 0) { perror("could not start exiting job retry thread for pbs_server"); - log_err(-1, msg_daemonname, "Failed to start inspect_exiting_jobs"); + log_err(-1, msg_daemonname, (char *)"Failed to start inspect_exiting_jobs"); } } /* END start_exiting_retry_thread() */ @@ -1368,18 +1348,18 @@ void main_loop(void) /* for large systems, give newly reported nodes more time before being marked down while pbs_moms are initialy reporting in */ - set_task(WORK_Timed, when + svr_totnodes / 12, check_nodes, NULL, FALSE); + set_task(WORK_Timed, when + svr_totnodes / 12, check_nodes, (char *)NULL, FALSE); } else { - set_task(WORK_Timed, when, check_nodes, NULL, FALSE); + set_task(WORK_Timed, when, check_nodes, (char *)NULL, FALSE); } /* Just check the nodes with check_nodes above and don't ping anymore. */ - set_task(WORK_Timed, time_now + 5, check_log, NULL, FALSE); + set_task(WORK_Timed, time_now + 5, check_log, (char *)NULL, FALSE); - set_task(WORK_Timed,time_now + 10,check_acct_log, NULL, FALSE); + set_task(WORK_Timed,time_now + 10,check_acct_log, (char *)NULL, FALSE); /* * Now at last, we are ready to do some batch work. 
The @@ -1553,7 +1533,7 @@ void main_loop(void) if (pjob->ji_modified) job_save(pjob, SAVEJOB_FULL, 0); - unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); + unlock_ji_mutex(pjob, __func__, (char *)"1", LOGLEVEL); } if (svr_chngNodesfile) @@ -1894,7 +1874,7 @@ int main( if (write(lockfds, log_buf, strlen(log_buf)) != (ssize_t)strlen(log_buf)) { - log_err(errno, msg_daemonname, "failed to write pid to lockfile"); + log_err(errno, msg_daemonname, (char *)"failed to write pid to lockfile"); exit(-1); } @@ -1931,7 +1911,7 @@ int main( /* NOTE: env cleared in pbsd_init() */ if (pbsd_init(server_init_type) != PBSE_NONE) { - log_err(-1, msg_daemonname, "pbsd_init failed"); + log_err(-1, msg_daemonname, (char *)"pbsd_init failed"); exit(3); } @@ -1955,7 +1935,7 @@ int main( { perror("pbs_server: unix domain socket"); - log_err(-1, msg_daemonname, "init_network failed unix domain socket"); + log_err(-1, msg_daemonname, (char *)"init_network failed unix domain socket"); exit(3); } @@ -1967,7 +1947,7 @@ int main( if (poll_job_task_mutex == NULL) { perror("pbs_server: failed to initialize poll_job_task_mutex"); - log_err(-1, msg_daemonname, "pbs_server: failed to initialize poll_job_task_mutex"); + log_err(-1, msg_daemonname, (char *)"pbs_server: failed to initialize poll_job_task_mutex"); exit(3); } @@ -2045,7 +2025,7 @@ void check_job_log( if (log_remove_old(path_jobinfo_log, keep_days * SECS_PER_DAY) != 0) { - log_err(-1,"check_job_log","failure occurred when checking for old job logs"); + log_err(-1,"check_job_log",(char *)"failure occurred when checking for old job logs"); } } @@ -2060,7 +2040,7 @@ void check_job_log( PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, msg_daemonname, - "Rolling job log file"); + (char *)"Rolling job log file"); if (roll_depth != -1) { @@ -2069,7 +2049,7 @@ void check_job_log( if ((depth >= INT_MAX) || (depth < 1)) { - log_err(-1, "check_job_log", "job log roll cancelled, logfile depth is out of range"); + log_err(-1, "check_job_log", 
(char *)"job log roll cancelled, logfile depth is out of range"); } else { @@ -2081,7 +2061,7 @@ void check_job_log( free(ptask->wt_mutex); free(ptask); - set_task(WORK_Timed, time_now + PBS_LOG_CHECK_RATE, check_job_log, NULL, FALSE); + set_task(WORK_Timed, time_now + PBS_LOG_CHECK_RATE, check_job_log, (char *)NULL, FALSE); } /* END check_job_log */ @@ -2113,7 +2093,7 @@ void check_log( if (log_remove_old(path_svrlog, keep_days * SECS_PER_DAY) != 0) { - log_err(-1,"check_log","failure occurred when checking for old pbs_server logs"); + log_err(-1,"check_log",(char *)"failure occurred when checking for old pbs_server logs"); } } @@ -2128,13 +2108,13 @@ void check_log( PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, msg_daemonname, - "Rolling log file"); + (char *)"Rolling log file"); get_svr_attr_l(SRV_ATR_LogFileRollDepth, &roll_depth); if ((roll_depth >= INT_MAX) || (roll_depth < 1)) { - log_err(-1, "check_log", "log roll cancelled, logfile depth is out of range"); + log_err(-1, "check_log", (char *)"log roll cancelled, logfile depth is out of range"); } else { @@ -2157,7 +2137,7 @@ void check_log( free(ptask->wt_mutex); free(ptask); - set_task(WORK_Timed, time_now + PBS_LOG_CHECK_RATE, check_log, NULL, FALSE); + set_task(WORK_Timed, time_now + PBS_LOG_CHECK_RATE, check_log, (char *)NULL, FALSE); return; } /* END check_log */ @@ -2192,7 +2172,7 @@ void check_acct_log( free(ptask->wt_mutex); free(ptask); - set_task(WORK_Timed,time_now + PBS_ACCT_CHECK_RATE,check_acct_log,NULL,FALSE); + set_task(WORK_Timed,time_now + PBS_ACCT_CHECK_RATE,check_acct_log,(char *)NULL,FALSE); return; } /* END check_acct_log */ @@ -2357,7 +2337,7 @@ int is_ha_lock_file_valid( if (GoodPermissions == FALSE) { - log_err(-1, __func__, "could not obtain the needed permissions for the lock file"); + log_err(-1, __func__, (char *)"could not obtain the needed permissions for the lock file"); } return(GoodPermissions); @@ -2649,7 +2629,7 @@ int start_update_ha_lock_thread() if (fds < 0) 
{ - log_err(-1, __func__, "Couldn't write the pid to the lockfile\n"); + log_err(-1, __func__, (char *)"Couldn't write the pid to the lockfile\n"); return(FAILURE); } @@ -2657,7 +2637,7 @@ int start_update_ha_lock_thread() snprintf(smallBuf,sizeof(smallBuf),"%ld\n",(long)sid); if (write(fds,smallBuf,strlen(smallBuf)) != (ssize_t)strlen(smallBuf)) { - log_err(-1, __func__, "Couldn't write the pid to the lockfile\n"); + log_err(-1, __func__, (char *)"Couldn't write the pid to the lockfile\n"); close(fds); return(FAILURE); @@ -2666,7 +2646,12 @@ int start_update_ha_lock_thread() /* we don't need an open handle on the lockfile, just correct update times */ close(fds); - pthread_attr_init(&HALockThreadAttr); + if ((rc = pthread_attr_init(&HALockThreadAttr)) != 0) + { + perror("pthread_attr_init failed. Could not start update ha lock thread"); + log_err(-1, msg_daemonname,(char *)"pthread_attr_init failed. Could not start ha lock thread"); + return(FAILURE); + } rc = pthread_create(&HALockThread,&HALockThreadAttr,update_ha_lock_thread,NULL); @@ -2674,7 +2659,7 @@ { /* error creating thread */ - log_err(-1, __func__, "Could not create HA Lock Thread\n"); + log_err(-1, __func__, (char *)"Could not create HA Lock Thread\n"); return(FAILURE); } @@ -2698,7 +2683,7 @@ int mutex_lock( { if (pthread_mutex_lock(Mutex) != 0) { - log_err(-1,"mutex_lock","ALERT: cannot lock mutex!\n"); + log_err(-1,"mutex_lock",(char *)"ALERT: cannot lock mutex!\n"); return(FAILURE); } @@ -2716,7 +2701,7 @@ int mutex_unlock( { if (pthread_mutex_unlock(Mutex) != 0) { - log_err(-1,"mutex_unlock","ALERT: cannot unlock mutex!\n"); + log_err(-1,"mutex_unlock",(char *)"ALERT: cannot unlock mutex!\n"); return(FAILURE); } @@ -2772,7 +2757,7 @@ static void lock_out_ha() { /* try to get a filesystem lock on the "mutex" file */ - while (acquire_file_lock(MutexLockFile,&MutexLockFD,"HA") == FAILURE) + while (acquire_file_lock(MutexLockFile,&MutexLockFD, (char *)"HA") == FAILURE) { 
sprintf(log_buf,"Could not acquire HA flock--trying again in 1 second\n"); @@ -2852,7 +2837,7 @@ static void lock_out_ha() PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, - "high availability file lock obtained"); + (char *)"high availability file lock obtained"); } /* END lock_out_ha() */ @@ -2893,7 +2878,7 @@ static int daemonize_server( if ((pid = fork()) == -1) { - log_err(errno, __func__, "cannot fork into background"); + log_err(errno, __func__, (char *)"cannot fork into background"); return(FAILURE); } @@ -2907,7 +2892,7 @@ static int daemonize_server( PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, - "INFO: parent is exiting"); + (char *)"INFO: parent is exiting"); exit(0); } @@ -2916,7 +2901,7 @@ static int daemonize_server( if ((*sid = setsid()) == -1) { - log_err(errno, __func__, "Could not disconnect from controlling terminal"); + log_err(errno, __func__, (char *)"Could not disconnect from controlling terminal"); return(FAILURE); } @@ -2938,7 +2923,7 @@ static int daemonize_server( if ((pid = fork()) == -1) { - log_err(errno, __func__, "cannot fork into background"); + log_err(errno, __func__, (char *)"cannot fork into background"); return(FAILURE); } @@ -2951,7 +2936,7 @@ static int daemonize_server( PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, - "INFO: parent is exiting"); + (char *)"INFO: parent is exiting"); exit(0); } @@ -2964,7 +2949,7 @@ static int daemonize_server( PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, - "INFO: child process in background"); + (char *)"INFO: child process in background"); return(SUCCESS); } /* END daemonize_server() */ @@ -3084,7 +3069,7 @@ int get_full_path( { char *TokPtr = NULL; - char *Delims = ":;"; /* windows and unix path deliminators */ + char *Delims = (char *)":;"; /* windows and unix path deliminators */ char *PathLocation; char tmpPath[MAX_LINE]; bool_t IsExe = FALSE; @@ -3192,13 +3177,13 @@ int svr_restart() { free(ArgV[0]); - ArgV[0] = calloc(sizeof(char), (strlen(FullCmd) + 1)); + ArgV[0] = 
(char *)calloc(sizeof(char), (strlen(FullCmd) + 1)); if (ArgV[0] == NULL) { /* could not calloc */ - log_err(errno, __func__, "ERROR: cannot allocate memory for full command, cannot restart\n"); + log_err(errno, __func__, (char *)"ERROR: cannot allocate memory for full command, cannot restart\n"); exit(-10); }