From 897907b20273a65a69867ecb520c2bb70efb6463 Mon Sep 17 00:00:00 2001 From: Markus Makela Date: Sat, 12 Sep 2015 06:10:26 +0300 Subject: [PATCH] Fix to MXS-352: https://mariadb.atlassian.net/browse/MXS-352 If a service fails, MaxScale will try to start it again later on. --- .../Getting-Started/Configuration-Guide.md | 4 + server/core/config.c | 4 +- server/core/service.c | 153 ++++++++++++------ server/include/service.h | 5 +- 4 files changed, 114 insertions(+), 52 deletions(-) diff --git a/Documentation/Getting-Started/Configuration-Guide.md b/Documentation/Getting-Started/Configuration-Guide.md index c3a608b2d..e48d44293 100644 --- a/Documentation/Getting-Started/Configuration-Guide.md +++ b/Documentation/Getting-Started/Configuration-Guide.md @@ -350,6 +350,10 @@ This parameter takes a boolean value and when enabled, will strip all `\` charac Enabling this feature will transform wildcard grants to individual database grants. This will consume more memory but authentication in MaxScale will be done faster. The parameter takes a boolean value. +#### `retry_on_failure` + +The retry_on_failure parameter controls whether MaxScale will try to restart failed services and accepts a boolean value. This functionality is enabled by default to prevent services being permanently disabled if the starting of the service failed due to a network outage. Disabling the restarting of the failed services will cause them to be permanently disabled if the services can't be started when MaxScale is started. + #### `connection_timeout` The connection_timeout parameter is used to disconnect sessions to MaxScale that have been idle for too long. The session timeouts are disabled by default. To enable them, define the timeout in seconds in the service's configuration section. diff --git a/server/core/config.c b/server/core/config.c index 47d56a818..5c73d6963 100644 --- a/server/core/config.c +++ b/server/core/config.c @@ -622,7 +622,9 @@ process_config_context(CONFIG_CONTEXT *context) } - if (enable_root_user) + serviceSetRetryOnFailure(obj->element, config_get_value(obj->parameters, "retry_on_failure")); + + if (enable_root_user) serviceEnableRootUser( obj->element, config_truth_value(enable_root_user)); diff --git a/server/core/service.c b/server/core/service.c index 51df84c03..7071273ce 100644 --- a/server/core/service.c +++ b/server/core/service.c @@ -64,6 +64,7 @@ #include #include #include +#include /** Defined in log_manager.cc */ extern int lm_enabled_logfiles_bitmask; @@ -99,6 +100,7 @@ static int find_type(typelib_t* tl, const char* needle, int maxlen); static void service_add_qualified_param( SERVICE* svc, CONFIG_PARAMETER* param); +void service_interal_restart(void *data); /** * Allocate a new service for the gateway to support @@ -140,7 +142,8 @@ SERVICE *service; service->routerModule = strdup(router); service->users_from_all = false; service->resources = NULL; - service->localhost_match_wildcard_host = SERVICE_PARAM_UNINIT; + service->localhost_match_wildcard_host = SERVICE_PARAM_UNINIT; + service->retry_start = true; service->ssl_mode = SSL_DISABLED; service->ssl_init_done = false; service->ssl_ca_cert = NULL; @@ -157,6 +160,7 @@ SERVICE *service; return NULL; } service->stats.started = time(0); + service->stats.n_failed_starts = 0; service->state = SERVICE_STATE_ALLOC; spinlock_init(&service->spin); spinlock_init(&service->users_table_spin); @@ -260,6 +264,7 @@ GWPROTOCOL *funcs; { hashtable_free(service->users->data); free(service->users); + service->users = NULL; dcb_close(port->listener); port->listener = NULL; goto retblock; @@ -403,6 +408,50 @@ retblock: return listeners; } +/** + * Start all ports for a service. + * serviceStartAllPorts will try to start all listeners associated with the service. + * If no listeners are started, the starting of ports will be retried after a period of time. + * @param service Service to start + * @return Number of started listeners. This is equal to the number of ports the service + * is listening to. + */ +int serviceStartAllPorts(SERVICE* service) +{ + SERV_PROTOCOL *port = service->ports; + int listeners = 0; + while (!service->svc_do_shutdown && port) + { + listeners += serviceStartPort(service, port); + port = port->next; + } + + if (listeners) + { + service->state = SERVICE_STATE_STARTED; + service->stats.started = time(0); + /** Add the task that monitors session timeouts */ + if (service->conn_timeout > 0) + { + hktask_add("connection_timeout", session_close_timeouts, NULL, 5); + } + } + else if(service->retry_start) + { + /** Service failed to start any ports. Try again later. */ + service->stats.n_failed_starts++; + char taskname[strlen(service->name) + strlen("_start_retry_") + (int)ceil(log10(INT_MAX)) + 1]; + int retry_after = MIN(service->stats.n_failed_starts * 10, SERVICE_MAX_RETRY_INTERVAL); + snprintf(taskname, sizeof (taskname), "%s_start_retry_%d", + service->name, service->stats.n_failed_starts); + hktask_oneshot(taskname, service_interal_restart, + (void*) service, retry_after); + skygw_log_write(LM, "Failed to start service %s, retrying in %d seconds.", + service->name, retry_after); + } + return listeners; +} + /** * Start a service * @@ -417,59 +466,42 @@ retblock: int serviceStart(SERVICE *service) { -SERV_PROTOCOL *port; -int listeners = 0; + int listeners = 0; -if(!check_service_permissions(service)) -{ - skygw_log_write_flush(LE, - "%s: Error: Inadequate user permissions for service. Service not started.", - service->name); - service->state = SERVICE_STATE_FAILED; - return 0; -} - -if(service->ssl_mode != SSL_DISABLED) -{ - if(serviceInitSSL(service) != 0) + if (check_service_permissions(service)) { - LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "%s: SSL initialization failed. Service not started.", - service->name))); - service->state = SERVICE_STATE_FAILED; - return 0; + if (service->ssl_mode == SSL_DISABLED || (service->ssl_mode != SSL_DISABLED && serviceInitSSL(service) != 0)) + { + if ((service->router_instance = service->router->createInstance( + service,service->routerOptions))) + { + listeners += serviceStartAllPorts(service); + } + else + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "%s: Failed to create router instance for service. Service not started.", + service->name))); + service->state = SERVICE_STATE_FAILED; + } + } + else + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "%s: SSL initialization failed. Service not started.", + service->name))); + service->state = SERVICE_STATE_FAILED; + } } -} - if ((service->router_instance = service->router->createInstance(service, - service->routerOptions)) == NULL) - { - LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "%s: Failed to create router instance for service. Service not started.", - service->name))); - service->state = SERVICE_STATE_FAILED; - return 0; - } - - port = service->ports; - while (!service->svc_do_shutdown && port) - { - listeners += serviceStartPort(service, port); - port = port->next; - } - if (listeners) - { - service->state = SERVICE_STATE_STARTED; - service->stats.started = time(0); - } - - /** Add the task that monitors session timeouts */ - if(service->conn_timeout > 0) - { - hktask_add("connection_timeout",session_close_timeouts,NULL,5); - } - - return listeners; + else + { + skygw_log_write_flush(LE, + "%s: Error: Inadequate user permissions for service. Service not started.", + service->name); + service->state = SERVICE_STATE_FAILED; + } + return listeners; } /** @@ -1022,6 +1054,18 @@ serviceSetTimeout(SERVICE *service, int val) return 1; } +/** + * Enable or disable the restarting of the service on failure. + * @param service Service to configure + * @param value A string representation of a boolean value + */ +void serviceSetRetryOnFailure(SERVICE *service, char* value) +{ + if(value) + { + service->retry_start = config_truth_value(value); + } +} /** * Trim whitespace from the from an rear of a string @@ -2055,3 +2099,12 @@ int serviceInitSSL(SERVICE* service) } return 0; } +/** + * Function called by the housekeeper thread to retry starting of a service + * @param data Service to restart + */ +void service_interal_restart(void *data) +{ + SERVICE* service = (SERVICE*)data; + serviceStartAllPorts(service); +} \ No newline at end of file diff --git a/server/include/service.h b/server/include/service.h index 293856e85..7064468df 100644 --- a/server/include/service.h +++ b/server/include/service.h @@ -79,6 +79,7 @@ typedef struct servprotocol { */ typedef struct { time_t started; /**< The time when the service was started */ + int n_failed_starts; /**< Number of times this service has failed to start */ int n_sessions; /**< Number of sessions created on service since start */ int n_current; /**< Current number of sessions */ } SERVICE_STATS; @@ -127,7 +128,7 @@ enum{ }; #define DEFAULT_SSL_CERT_VERIFY_DEPTH 100 /*< The default certificate verification depth */ - +#define SERVICE_MAX_RETRY_INTERVAL 3600 /*< The maximum interval between service start retries */ /** * Parameters that are automatically detected but can also be configured by the * user are initially set to this value. @@ -190,6 +191,7 @@ typedef struct service { char* ssl_key; /*< SSL private key */ char* ssl_ca_cert; /*< SSL CA certificate */ bool ssl_init_done; /*< If SSL has already been initialized for this service */ + bool retry_start; /*< If starting of the service should be retried later */ } SERVICE; @@ -225,6 +227,7 @@ extern int serviceSetSSLVerifyDepth(SERVICE* service, int depth); extern void serviceSetCertificates(SERVICE *service, char* cert,char* key, char* ca_cert); extern int serviceEnableRootUser(SERVICE *, int ); extern int serviceSetTimeout(SERVICE *, int ); +extern void serviceSetRetryOnFailure(SERVICE *service, char* value); extern void serviceWeightBy(SERVICE *, char *); extern char *serviceGetWeightingParameter(SERVICE *); extern int serviceEnableLocalhostMatchWildcardHost(SERVICE *, int);