From b1e232845ecf5e9fbbb148e87f90b7f318ce7d1a Mon Sep 17 00:00:00 2001
From: Jan Vales <jan@jvales.net>
Date: Sun, 20 Jan 2019 03:31:31 +0100
Subject: [PATCH] a working failover service, with broken /health check url
 configuration.

---
 g6t4.yml                          | 13 ++---
 service-failover/Dockerfile       | 11 +++++
 service-failover/README.md        |  5 ++
 service-failover/main.py          | 82 +++++++++++++++++++++++++++++++
 service-failover/requirements.txt |  2 +
 service-failover/tsprint.py       | 12 +++++
 service-fallback/Dockerfile       |  1 -
 service-fallback/README.md        |  3 --
 8 files changed, 119 insertions(+), 10 deletions(-)
 create mode 100644 service-failover/Dockerfile
 create mode 100644 service-failover/README.md
 create mode 100644 service-failover/main.py
 create mode 100644 service-failover/requirements.txt
 create mode 100644 service-failover/tsprint.py
 delete mode 100644 service-fallback/Dockerfile
 delete mode 100644 service-fallback/README.md

diff --git a/g6t4.yml b/g6t4.yml
index 73fe4e8..6786362 100644
--- a/g6t4.yml
+++ b/g6t4.yml
@@ -8,7 +8,6 @@ services:
     restart: always
     depends_on:
       - service-analysis
-      - service-fallback
       - service-reporting
       - service-twitter
 
@@ -19,11 +18,12 @@ services:
       - "8081:8081"
     restart: always
 
-  service-fallback:
-    build: ./service-fallback
-    container_name: fallback
-    ports:
-      - "8082:8082"
+  service-failover:
+    build: ./service-failover
+    container_name: failover
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    restart: always
 
   service-reporting:
     build: ./service-reporting
@@ -44,5 +44,6 @@ services:
     container_name: website
     ports:
       - "8080:8080"
+    restart: always
     depends_on:
       - camunda
diff --git a/service-failover/Dockerfile b/service-failover/Dockerfile
new file mode 100644
index 0000000..577c3ef
--- /dev/null
+++ b/service-failover/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.7-slim
+LABEL maintainer="Jan Vales <jan.vales@tuwien.ac.at>"
+
+COPY . /app/
+
+WORKDIR /app
+RUN ["pip", "install", "-r", "requirements.txt"]
+
+RUN ["useradd", "--no-create-home", "failover"]
+#USER failover:failover
+ENTRYPOINT ["python3.7", "-u", "./main.py"]
diff --git a/service-failover/README.md b/service-failover/README.md
new file mode 100644
index 0000000..89c9422
--- /dev/null
+++ b/service-failover/README.md
@@ -0,0 +1,5 @@
+# Failover service
+
+Periodically checks whether all services are in a healthy state.
+
+If some service fails, this service will restart it.
diff --git a/service-failover/main.py b/service-failover/main.py
new file mode 100644
index 0000000..4942db9
--- /dev/null
+++ b/service-failover/main.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+import docker
+from pprint import pprint
+import os
+import requests
+import sys
+import signal
+import time
+import threading
+
+# thread safe print
+from tsprint import print
+
+
+services = dict()
+# servicename -> [health-check url, initial delay (s), re-check interval (s), docker container name]
+services["analysis"]  = ["http://analysis:8081/health",  30,  15, "analysis"]
+services["camunda"]   = ["http://camunda:8085/health",   300, 15, "camunda"]
+services["reporting"] = ["http://reporting:8083/health", 30,  15, "reporting"]
+services["twitter"]   = ["http://twitter:8084/health",   30,  15, "twitter"]
+services["website"]   = ["http://website:8080/health",   30,  15, "website"]
+
+
+# Docker client handle; assigned in the __main__ block and used by reset_service().
+docker_ctl = None
+
+def check_service(servicename):
+    global services
+#    print(servicename+" -- checking service.")
+    try:
+        r = requests.get(services[servicename][0], timeout=3)
+        if r.status_code != 200:
+            raise Exception("errorcode != 200: "+int(r.status_code))
+        print("+ "+servicename+" -- looks good :)")
+
+        # schedule next check.
+        th = threading.Timer(services[servicename][2], check_service, args=[servicename])
+        th.setName('failover_'+servicename)
+        th.start()
+    except:
+        print("- "+servicename+" -- looks bad :( "+str(sys.exc_info()[0]))
+        try:
+            reset_service(servicename)
+        except:
+            print("! Docker said NO/service restart failed."+str(sys.exc_info()[0]))
+
+        # schedule next check using startup delay.
+        th = threading.Timer(services[servicename][1], check_service, args=[servicename])
+        th.setName('failover_'+servicename)
+        th.start()
+
+
+
+def reset_service(servicename):
+    global services
+    global docker_ctl
+    print("  "+servicename+" -- resetting service")
+    docker_ctl.containers.client.api.kill(services[servicename][3])
+    docker_ctl.containers.client.api.restart(services[servicename][3])
+
+
+
+if __name__ == "__main__":
+    def signal_handler(signal, frame):
+        print('SIG received. exitting!')
+        os._exit(1)
+    signal.signal(signal.SIGINT, signal_handler)
+
+    docker_ctl = docker.from_env()
+    print(docker_ctl.info())
+
+    print()
+    print("####################")
+    print("# Failover service #")
+    print("####################")
+
+    for servicename, service in services.items():
+        print("  "+servicename+" -- postponing initial check for "+str(service[1])+" sec")
+        th = threading.Timer(service[1], check_service, args=[servicename])
+        th.setName('failover_'+servicename)
+        th.start()
diff --git a/service-failover/requirements.txt b/service-failover/requirements.txt
new file mode 100644
index 0000000..2f54570
--- /dev/null
+++ b/service-failover/requirements.txt
@@ -0,0 +1,2 @@
+docker
+requests
diff --git a/service-failover/tsprint.py b/service-failover/tsprint.py
new file mode 100644
index 0000000..72839e7
--- /dev/null
+++ b/service-failover/tsprint.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+
+# make print threadsafe.
+
+from threading import Lock
+
+printlock = Lock()  # held for the duration of every print() call made via this module
+oldprint = print  # reference to the print in scope before the def below rebinds the name
+
+def print(*a, **b):  # arguments are forwarded to oldprint untouched
+	with printlock:
+		oldprint(*a, **b)
diff --git a/service-fallback/Dockerfile b/service-fallback/Dockerfile
deleted file mode 100644
index b09b037..0000000
--- a/service-fallback/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-FROM alpine:latest
diff --git a/service-fallback/README.md b/service-fallback/README.md
deleted file mode 100644
index d3a44f8..0000000
--- a/service-fallback/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Fallback service
-
-tbd
\ No newline at end of file
-- 
2.43.0