From b1e232845ecf5e9fbbb148e87f90b7f318ce7d1a Mon Sep 17 00:00:00 2001 From: Jan Vales Date: Sun, 20 Jan 2019 03:31:31 +0100 Subject: [PATCH] a working failover service, with broken /health check url configuration. --- g6t4.yml | 13 ++--- service-failover/Dockerfile | 11 +++++ service-failover/README.md | 5 ++ service-failover/main.py | 82 +++++++++++++++++++++++++++++++ service-failover/requirements.txt | 2 + service-failover/tsprint.py | 12 +++++ service-fallback/Dockerfile | 1 - service-fallback/README.md | 3 -- 8 files changed, 119 insertions(+), 10 deletions(-) create mode 100644 service-failover/Dockerfile create mode 100644 service-failover/README.md create mode 100644 service-failover/main.py create mode 100644 service-failover/requirements.txt create mode 100644 service-failover/tsprint.py delete mode 100644 service-fallback/Dockerfile delete mode 100644 service-fallback/README.md diff --git a/g6t4.yml b/g6t4.yml index 73fe4e8..6786362 100644 --- a/g6t4.yml +++ b/g6t4.yml @@ -8,7 +8,6 @@ services: restart: always depends_on: - service-analysis - - service-fallback - service-reporting - service-twitter @@ -19,11 +18,12 @@ services: - "8081:8081" restart: always - service-fallback: - build: ./service-fallback - container_name: fallback - ports: - - "8082:8082" + service-failover: + build: ./service-failover + container_name: failover + volumes: + - /var/run/docker.sock:/var/run/docker.sock + restart: always service-reporting: build: ./service-reporting @@ -44,5 +44,6 @@ services: container_name: website ports: - "8080:8080" + restart: always depends_on: - camunda diff --git a/service-failover/Dockerfile b/service-failover/Dockerfile new file mode 100644 index 0000000..577c3ef --- /dev/null +++ b/service-failover/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.7-slim +LABEL maintainer="Jan Vales " + +COPY . 
/app/
+
+WORKDIR /app
+RUN ["pip", "install", "-r", "requirements.txt"]
+
+RUN ["useradd", "--no-create-home", "failover"]
+#USER failover:failover
+ENTRYPOINT ["python3.7", "-u", "./main.py"]
diff --git a/service-failover/README.md b/service-failover/README.md
new file mode 100644
index 0000000..89c9422
--- /dev/null
+++ b/service-failover/README.md
@@ -0,0 +1,5 @@
+# Failover service
+
+Periodically checks whether all services are in a healthy state.
+
+If some service fails, this service will restart it.
diff --git a/service-failover/main.py b/service-failover/main.py
new file mode 100644
index 0000000..4942db9
--- /dev/null
+++ b/service-failover/main.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""Failover watchdog: poll each service's health URL and restart it via Docker on failure."""
+
+import docker
+from pprint import pprint
+import os
+import requests
+import sys
+import signal
+import time
+import threading
+
+# thread safe print
+from tsprint import print
+
+
+# Per service: [check url, delay before the first check and after a restart,
+# delay between checks while healthy, docker container name]; delays in seconds.
+services = dict()
+services["analysis"] = ["http://analysis:8081/health", 30, 15, "analysis"]
+services["camunda"] = ["http://camunda:8085/health", 300, 15, "camunda"]
+services["reporting"] = ["http://reporting:8083/health", 30, 15, "reporting"]
+services["twitter"] = ["http://twitter:8084/health", 30, 15, "twitter"]
+services["website"] = ["http://website:8080/health", 30, 15, "website"]
+
+# Docker client; assigned in the __main__ block before any check runs.
+docker_ctl = None
+
+
+def schedule_check(servicename, delay):
+    """Start a named timer thread that runs check_service(servicename) after delay seconds."""
+    th = threading.Timer(delay, check_service, args=[servicename])
+    th.name = 'failover_'+servicename
+    th.start()
+
+
+def check_service(servicename):
+    """Probe one service's health URL and restart its container if the probe fails.
+
+    The next probe is always scheduled: after the regular interval when the
+    service answered 200, after the start-up delay when a reset was attempted.
+    """
+    url, startup_delay, interval = services[servicename][:3]
+    try:
+        r = requests.get(url, timeout=3)
+        if r.status_code != 200:
+            # str(), not int(): str + int raised TypeError and hid the status code.
+            raise Exception("errorcode != 200: "+str(r.status_code))
+    except Exception as exc:
+        # repr(exc) shows the type and the message, not just the exception class.
+        print("- "+servicename+" -- looks bad :( "+repr(exc))
+        try:
+            reset_service(servicename)
+        except Exception as reset_exc:
+            print("! Docker said NO/service restart failed."+repr(reset_exc))
+        schedule_check(servicename, startup_delay)
+    else:
+        print("+ "+servicename+" -- looks good :)")
+        schedule_check(servicename, interval)
+
+
+def reset_service(servicename):
+    """Kill (best effort) and then restart the container of servicename.
+
+    Raises docker.errors.APIError if Docker rejects the restart.
+    """
+    container = services[servicename][3]
+    print(" "+servicename+" -- resetting service")
+    try:
+        docker_ctl.api.kill(container)
+    except docker.errors.APIError as exc:
+        # Docker refuses to kill a container that is not running; the restart
+        # below must still happen, otherwise a crashed service stays down.
+        print(" "+servicename+" -- kill failed, restarting anyway: "+repr(exc))
+    docker_ctl.api.restart(container)
+
+
+if __name__ == "__main__":
+    def signal_handler(signum, frame):
+        print('SIG received. exitting!')
+        # os._exit() ends the process at once, without waiting for timer threads.
+        os._exit(1)
+    signal.signal(signal.SIGINT, signal_handler)
+    # "docker stop" sends SIGTERM, not SIGINT.
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    docker_ctl = docker.from_env()
+    print(docker_ctl.info())
+
+    print()
+    print("####################")
+    print("# Failover service #")
+    print("####################")
+
+    for servicename, service in services.items():
+        print(" "+servicename+" -- postponing initial check for "+str(service[1])+" sec")
+        schedule_check(servicename, service[1])
diff --git a/service-failover/requirements.txt b/service-failover/requirements.txt
new file mode 100644
index 0000000..2f54570
--- /dev/null
+++ b/service-failover/requirements.txt
@@ -0,0 +1,2 @@
+docker
+requests
diff --git a/service-failover/tsprint.py b/service-failover/tsprint.py
new file mode 100644
index 0000000..72839e7
--- /dev/null
+++ b/service-failover/tsprint.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+
+# make print threadsafe.
+from threading import Lock
+
+oldprint = print    # the builtin, saved before the def below shadows the name
+printlock = Lock()  # single lock shared by every caller of the wrapper
+
+def print(*args, **kwargs):
+    """Forward to the builtin print while holding printlock."""
+    with printlock:
+        oldprint(*args, **kwargs)
diff --git a/service-fallback/Dockerfile b/service-fallback/Dockerfile
deleted file mode 100644
index b09b037..0000000
--- a/service-fallback/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-FROM alpine:latest
diff --git a/service-fallback/README.md b/service-fallback/README.md
deleted file mode 100644
index d3a44f8..0000000
--- a/service-fallback/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Fallback service
-
-tbd
\ No newline at end of file
-- 
2.43.0