Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 12 additions & 19 deletions cmd/traffic_cop/traffic_cop.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "RecordsConfig.h"
#include "ClusterCom.h"
#include "ts/ink_cap.h"
#include "Cop.h"

#include <string>
#include <map>
Expand Down Expand Up @@ -110,11 +111,7 @@ static int source_port = 0;
static int manager_failures = 0;
static int server_failures = 0;
static int server_not_found = 0;

static const int sleep_time = 10; // 10 sec
static int init_sleep_time = sleep_time; // 10 sec
static const int manager_timeout = 3 * 60; // 3 min
static const int server_timeout = 3 * 60; // 3 min
static int init_sleep_time = cop_sleep_time; // 10 sec

// traffic_manager flap detection
#define MANAGER_FLAP_DETECTION 1
Expand All @@ -131,8 +128,6 @@ static ink_hrtime manager_flap_retry_start_time = 0; // first time we attempt
// transient syscall error timeout
#define TRANSIENT_ERROR_WAIT_MS 500

static const int kill_timeout = 1 * 60; // 1 min

static int child_pid = 0;
static int child_status = 0;

Expand Down Expand Up @@ -316,12 +311,10 @@ sig_alarm_warn(int signum)
#endif
{
cop_log_trace("Entering sig_alarm_warn(%d)\n", signum);
cop_log(COP_WARNING, "unable to kill traffic_server for the last"
" %d seconds\n",
kill_timeout);
cop_log(COP_WARNING, "unable to kill traffic_server for the last %d seconds\n", cop_kill_timeout);

// Set us up for another alarm
alarm(kill_timeout);
alarm(cop_kill_timeout);
cop_log_trace("Leaving sig_alarm_warn(%d)\n", signum);
}

Expand Down Expand Up @@ -402,7 +395,7 @@ safe_kill(const char *lockfile_name, const char *pname, bool group)

cop_log_trace("Entering safe_kill(%s, %s, %d)\n", lockfile_name, pname, group);
set_alarm_warn();
alarm(kill_timeout);
alarm(cop_kill_timeout);

if (group == true) {
lockfile.KillGroup(killsig, coresig, pname);
Expand Down Expand Up @@ -1017,7 +1010,7 @@ read_manager_string(const char *variable, char *value, size_t val_len)

snprintf(request, sizeof(request), "read %s\n", variable);

err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
if (err < 0) {
return err;
}
Expand Down Expand Up @@ -1071,7 +1064,7 @@ read_manager_int(const char *variable, int *value)

snprintf(request, sizeof(request), "read %s\n", variable);

err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
if (err < 0) {
return err;
}
Expand Down Expand Up @@ -1236,7 +1229,7 @@ test_server_http_port()
// servers up on the autoconf port.
snprintf(request, sizeof(request), "GET http://127.0.0.1:%d/synthetic.txt HTTP/1.0\r\n\r\n", synthetic_port);

return test_http_port(http_backdoor_port, request, server_timeout * 1000, localhost, localhost);
return test_http_port(http_backdoor_port, request, cop_server_timeout * 1000, localhost, localhost);
}

static int
Expand Down Expand Up @@ -1444,7 +1437,7 @@ check_programs()
// is up, we make sure there is actually a server process
// running. If there is we test it.

alarm(2 * manager_timeout);
alarm(2 * cop_manager_timeout);
err = heartbeat_manager();
alarm(0);

Expand All @@ -1471,7 +1464,7 @@ check_programs()
safe_kill(manager_lockfile, manager_binary, true);
}
} else {
alarm(2 * server_timeout);
alarm(2 * cop_server_timeout);
heartbeat_server();
alarm(0);
}
Expand Down Expand Up @@ -1566,7 +1559,7 @@ check(void *arg)
chown_file_to_admin_user(manager_lockfile);
chown_file_to_admin_user(server_lockfile);

alarm(2 * (sleep_time + manager_timeout * 2 + server_timeout));
alarm(2 * (cop_sleep_time + cop_manager_timeout * 2 + cop_server_timeout));

if (check_no_run() < 0) {
break;
Expand Down Expand Up @@ -1601,7 +1594,7 @@ check(void *arg)
// Pause to catch our breath. (10 seconds).
// Use 'millisleep()' because normal 'sleep()' interferes with
// the SIGALRM signal which we use to heartbeat the cop.
millisleep(sleep_time * 1000);
millisleep(cop_sleep_time * 1000);

// We do this after the first round of checks, since the first "check" will spawn traffic_manager
if (!mgmt_init) {
Expand Down
16 changes: 13 additions & 3 deletions cmd/traffic_manager/MgmtHandlers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "MgmtSocket.h"
#include "NetworkUtilsRemote.h"
#include "MIME.h"
#include "Cop.h"

// INKqa09866
#include "TSControlMain.h"
Expand Down Expand Up @@ -157,14 +158,20 @@ synthetic_thread(void *info)
// Read the request
bufp = buffer;
while (len < strlen(RequestStr)) {
if (read_ready(clientFD, cop_server_timeout * 1000) <= 0) {
mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no request to read()");
goto error;
}
bytes = read(clientFD, buffer, sizeof(buffer));
if (bytes < 0) {
if (0 == bytes) {
mgmt_log(stderr, "[SyntheticHealthServer] EOF on the socket, likely prematurely closed");
goto error;
} else if (bytes < 0) {
if (errno == EINTR || errno == EAGAIN) {
continue;
} else {
mgmt_log(stderr, "[SyntheticHealthServer] Failed to read the request");
goto error;
break;
}
} else {
len += bytes;
Expand All @@ -186,14 +193,17 @@ synthetic_thread(void *info)
// Write it
bufp = buffer;
while (len) {
if (write_ready(clientFD, cop_server_timeout * 1000) <= 0) {
mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no response to write()");
goto error;
}
bytes = write(clientFD, buffer, len);
if (bytes < 0) {
if (errno == EINTR || errno == EAGAIN) {
continue;
} else {
mgmt_log(stderr, "[SyntheticHealthServer] Failed to write the response");
goto error;
break;
}
} else {
len -= bytes;
Expand Down
8 changes: 4 additions & 4 deletions lib/ts/ink_sock.cc
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,12 @@ safe_blocking(int fd)
}

int
write_ready(int fd)
write_ready(int fd, int timeout_msec)
{
struct pollfd p;
p.events = POLLOUT;
p.fd = fd;
int r = poll(&p, 1, 0);
int r = poll(&p, 1, timeout_msec);
if (r <= 0)
return r;
if (p.revents & (POLLERR | POLLNVAL))
Expand All @@ -139,12 +139,12 @@ write_ready(int fd)
}

int
read_ready(int fd)
read_ready(int fd, int timeout_msec)
{
struct pollfd p;
p.events = POLLIN;
p.fd = fd;
int r = poll(&p, 1, 0);
int r = poll(&p, 1, timeout_msec);
if (r <= 0)
return r;
if (p.revents & (POLLERR | POLLNVAL))
Expand Down
4 changes: 2 additions & 2 deletions lib/ts/ink_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ int safe_clr_fl(int fd, int arg);
int safe_blocking(int fd);
int safe_nonblocking(int fd);

int write_ready(int fd);
int read_ready(int fd);
int write_ready(int fd, int timeout_msec = 0);
int read_ready(int fd, int timeout_msec = 0);

char fd_read_char(int fd);
int fd_read_line(int fd, char *s, int len);
Expand Down
27 changes: 27 additions & 0 deletions mgmt/Cop.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/** @file

Timing constants (in seconds) for traffic_cop health checks, shared with traffic_manager.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef MGMT_COP_H
#define MGMT_COP_H

// Timing constants for traffic_cop and the processes it health-checks.
// Every value is expressed in seconds; callers that need milliseconds
// (e.g. poll()-style timeouts) multiply by 1000 at the call site.
//
// The include guard is required: these are definitions, so including this
// header twice in one translation unit would otherwise be a redefinition error.
static const int cop_sleep_time = 10;          // pause between check rounds: 10 sec
static const int cop_manager_timeout = 3 * 60; // traffic_manager request timeout: 3 min
static const int cop_server_timeout = 3 * 60;  // traffic_server request timeout: 3 min
static const int cop_kill_timeout = 1 * 60;    // alarm interval while killing a process: 1 min

#endif // MGMT_COP_H