finnball pushed to branch finn/bot-refactor at BuildGrid / buildgrid
Commits:
-
aa24a43a
by finn at 2018-08-06T16:33:08Z
8 changed files:
- app/commands/cmd_bot.py
- buildgrid/bot/bot.py
- buildgrid/bot/bot_interface.py
- + buildgrid/bot/bot_session.py
- buildgrid/server/job.py
- buildgrid/server/scheduler.py
- buildgrid/server/worker/bots_interface.py
- tests/integration/bot_interface.py
Changes:
| ... | ... | @@ -30,10 +30,12 @@ import os |
| 30 | 30 |
import random
|
| 31 | 31 |
import subprocess
|
| 32 | 32 |
import tempfile
|
| 33 |
+import time
|
|
| 33 | 34 |
|
| 34 | 35 |
from pathlib import Path, PurePath
|
| 35 | 36 |
|
| 36 |
-from buildgrid.bot import bot
|
|
| 37 |
+from buildgrid.bot import bot, bot_interface
|
|
| 38 |
+from buildgrid.bot.bot_session import BotSession, Device, Worker
|
|
| 37 | 39 |
from buildgrid._exceptions import BotError
|
| 38 | 40 |
|
| 39 | 41 |
from ..cli import pass_context
|
| ... | ... | @@ -45,18 +47,25 @@ from google.protobuf import any_pb2 |
| 45 | 47 |
@click.group(short_help = 'Create a bot client')
|
| 46 | 48 |
@click.option('--continuous', is_flag=True)
|
| 47 | 49 |
@click.option('--parent', default='bgd_test')
|
| 48 |
-@click.option('--number-of-leases', default=1)
|
|
| 49 | 50 |
@click.option('--port', default='50051')
|
| 50 | 51 |
@click.option('--host', default='localhost')
|
| 51 | 52 |
@pass_context
|
| 52 |
-def cli(context, host, port, number_of_leases, parent, continuous):
|
|
| 53 |
+def cli(context, host, port, parent, continuous):
|
|
| 54 |
+ channel = grpc.insecure_channel('{}:{}'.format(host, port))
|
|
| 55 |
+ interface = bot_interface.BotInterface(channel)
|
|
| 56 |
+ |
|
| 53 | 57 |
context.logger = logging.getLogger(__name__)
|
| 54 | 58 |
context.logger.info("Starting on port {}".format(port))
|
| 55 | 59 |
|
| 56 | 60 |
context.continuous = continuous
|
| 57 |
- context.channel = grpc.insecure_channel('{}:{}'.format(host, port))
|
|
| 58 |
- context.number_of_leases = number_of_leases
|
|
| 59 |
- context.parent = parent
|
|
| 61 |
+ |
|
| 62 |
+ worker = Worker()
|
|
| 63 |
+ worker.add_device(Device())
|
|
| 64 |
+ |
|
| 65 |
+ bot_session = BotSession(parent, interface)
|
|
| 66 |
+ bot_session.add_worker(worker)
|
|
| 67 |
+ |
|
| 68 |
+ context.bot_session = bot_session
|
|
| 60 | 69 |
|
| 61 | 70 |
@cli.command('dummy', short_help='Create a dummy bot session')
|
| 62 | 71 |
@pass_context
|
| ... | ... | @@ -65,16 +74,11 @@ def dummy(context): |
| 65 | 74 |
Simple dummy client. Creates a session, accepts leases, does fake work and
|
| 66 | 75 |
updates the server.
|
| 67 | 76 |
"""
|
| 68 |
- |
|
| 69 |
- context.logger.info("Creating a bot session")
|
|
| 70 |
- |
|
| 71 | 77 |
try:
|
| 72 |
- bot.Bot(work=_work_dummy,
|
|
| 73 |
- context=context,
|
|
| 74 |
- channel=context.channel,
|
|
| 75 |
- parent=context.parent,
|
|
| 76 |
- number_of_leases=context.number_of_leases,
|
|
| 77 |
- continuous=context.continuous)
|
|
| 78 |
+ b = bot.Bot(context.bot_session)
|
|
| 79 |
+ b.session(_work_dummy,
|
|
| 80 |
+ context,
|
|
| 81 |
+ context.continuous)
|
|
| 78 | 82 |
|
| 79 | 83 |
except KeyboardInterrupt:
|
| 80 | 84 |
pass
|
| ... | ... | @@ -88,7 +92,7 @@ def dummy(context): |
| 88 | 92 |
@click.option('--port', show_default = True, default=11001)
|
| 89 | 93 |
@click.option('--remote', show_default = True, default='localhost')
|
| 90 | 94 |
@pass_context
|
| 91 |
-def _work_buildbox(context, remote, port, server_cert, client_key, client_cert, local_cas, fuse_dir):
|
|
| 95 |
+def work_buildbox(context, remote, port, server_cert, client_key, client_cert, local_cas, fuse_dir):
|
|
| 92 | 96 |
"""
|
| 93 | 97 |
Uses BuildBox to run commands.
|
| 94 | 98 |
"""
|
| ... | ... | @@ -104,12 +108,14 @@ def _work_buildbox(context, remote, port, server_cert, client_key, client_cert, |
| 104 | 108 |
context.fuse_dir = fuse_dir
|
| 105 | 109 |
|
| 106 | 110 |
try:
|
| 107 |
- bot.Bot(work=_work_buildbox,
|
|
| 108 |
- context=context,
|
|
| 109 |
- channel=context.channel,
|
|
| 110 |
- parent=context.parent,
|
|
| 111 |
- number_of_leases=context.number_of_leases,
|
|
| 112 |
- continuous=context.continuous)
|
|
| 111 |
+ b = bot.Bot(work=_work_buildbox,
|
|
| 112 |
+ bot_session=context.bot_session,
|
|
| 113 |
+ channel=context.channel,
|
|
| 114 |
+ parent=context.parent)
|
|
| 115 |
+ |
|
| 116 |
+ b.session(context.parent,
|
|
| 117 |
+ _work_buildbox,
|
|
| 118 |
+ context)
|
|
| 113 | 119 |
|
| 114 | 120 |
except KeyboardInterrupt:
|
| 115 | 121 |
pass
|
| ... | ... | @@ -23,160 +23,46 @@ Creates a bot session. |
| 23 | 23 |
"""
|
| 24 | 24 |
|
| 25 | 25 |
import asyncio
|
| 26 |
-import inspect
|
|
| 26 |
+import collections
|
|
| 27 | 27 |
import logging
|
| 28 |
-import platform
|
|
| 29 |
-import queue
|
|
| 30 | 28 |
import time
|
| 31 | 29 |
|
| 32 |
-from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
|
|
| 33 |
- |
|
| 34 |
-from . import bot_interface
|
|
| 30 |
+from . import bot_interface, bot_session
|
|
| 31 |
+from .bot_session import BotStatus, LeaseState
|
|
| 35 | 32 |
from .._exceptions import BotError
|
| 36 | 33 |
|
| 37 |
-class Bot(object):
|
|
| 34 |
+class Bot:
|
|
| 38 | 35 |
"""
|
| 39 | 36 |
Creates a local BotSession.
|
| 40 | 37 |
"""
|
| 41 | 38 |
|
| 42 |
- def __init__(self, work, context, channel, parent, number_of_leases, continuous=True):
|
|
| 43 |
- if not inspect.iscoroutinefunction(work):
|
|
| 44 |
- raise BotError("work function must be async")
|
|
| 45 |
- |
|
| 46 |
- print(type(context))
|
|
| 39 |
+ UPDATE_PERIOD = 1
|
|
| 47 | 40 |
|
| 48 |
- self.interface = bot_interface.BotInterface(channel)
|
|
| 41 |
+ def __init__(self, bot_session):
|
|
| 49 | 42 |
self.logger = logging.getLogger(__name__)
|
| 50 | 43 |
|
| 51 |
- self._create_session(parent, number_of_leases)
|
|
| 52 |
- self._work_queue = queue.Queue(maxsize = number_of_leases)
|
|
| 53 |
- |
|
| 54 |
- while continuous:
|
|
| 55 |
- ## TODO: Leases independently finish
|
|
| 56 |
- ## Allow leases to queue finished work independently instead
|
|
| 57 |
- ## of waiting for all to finish
|
|
| 58 |
- futures = [self._do_work(work, context, lease) for lease in self._get_work()]
|
|
| 59 |
- if futures:
|
|
| 60 |
- loop = asyncio.new_event_loop()
|
|
| 61 |
- leases_complete, _ = loop.run_until_complete(asyncio.wait(futures))
|
|
| 62 |
- work_complete = [(lease.result().id, lease.result(),) for lease in leases_complete]
|
|
| 63 |
- self._work_complete(work_complete)
|
|
| 64 |
- loop.close()
|
|
| 65 |
- self._update_bot_session()
|
|
| 66 |
- time.sleep(2)
|
|
| 67 |
- |
|
| 68 |
- @property
|
|
| 69 |
- def bot_session(self):
|
|
| 70 |
- ## Read only, shouldn't have to set any of the variables in here
|
|
| 71 |
- return self._bot_session
|
|
| 72 |
- |
|
| 73 |
- def close_session(self):
|
|
| 74 |
- self.logger.warning("Session closing not yet implemented")
|
|
| 75 |
- |
|
| 76 |
- async def _do_work(self, work, context, lease):
|
|
| 77 |
- """ Work is done here, work function should be asynchronous
|
|
| 78 |
- """
|
|
| 79 |
- self.logger.info("Work found: {}".format(lease.id))
|
|
| 80 |
- lease = await work(context=context, lease=lease)
|
|
| 81 |
- lease.state = bots_pb2.LeaseState.Value('COMPLETED')
|
|
| 82 |
- self.logger.info("Work complete: {}".format(lease.id))
|
|
| 83 |
- return lease
|
|
| 84 |
- |
|
| 85 |
- def _update_bot_session(self):
|
|
| 86 |
- """ Should call the server periodically to inform the server the client
|
|
| 87 |
- has not died.
|
|
| 88 |
- """
|
|
| 89 |
- self.logger.debug("Updating bot session")
|
|
| 90 |
- self._bot_session = self.interface.update_bot_session(self._bot_session)
|
|
| 91 |
- leases_update = ([self._update_lease(lease) for lease in self._bot_session.leases])
|
|
| 92 |
- del self._bot_session.leases[:]
|
|
| 93 |
- self._bot_session.leases.extend(leases_update)
|
|
| 94 |
- |
|
| 95 |
- def _get_work(self):
|
|
| 96 |
- while not self._work_queue.empty():
|
|
| 97 |
- yield self._work_queue.get()
|
|
| 98 |
- |
|
| 99 |
- def _work_complete(self, leases_complete):
|
|
| 100 |
- """ Bot updates itself with any completed work.
|
|
| 101 |
- """
|
|
| 102 |
- # Should really improve this...
|
|
| 103 |
- # Maybe add some call back function sentoff work...
|
|
| 104 |
- leases_active = list(filter(self._lease_active, self._bot_session.leases))
|
|
| 105 |
- leases_not_active = [lease for lease in self._bot_session.leases if not self._lease_active(lease)]
|
|
| 106 |
- del self._bot_session.leases[:]
|
|
| 107 |
- for lease in leases_active:
|
|
| 108 |
- for lease_tuple in leases_complete:
|
|
| 109 |
- if lease.id == lease_tuple[0]:
|
|
| 110 |
- leases_not_active.extend([lease_tuple[1]])
|
|
| 111 |
- self._bot_session.leases.extend(leases_not_active)
|
|
| 112 |
- |
|
| 113 |
- def _update_lease(self, lease):
|
|
| 114 |
- """
|
|
| 115 |
- State machine for any recieved updates to the leases.
|
|
| 116 |
- """
|
|
| 117 |
- if self._lease_pending(lease):
|
|
| 118 |
- lease.state = bots_pb2.LeaseState.Value('ACTIVE')
|
|
| 119 |
- self._work_queue.put(lease)
|
|
| 120 |
- return lease
|
|
| 121 |
- |
|
| 122 |
- else:
|
|
| 123 |
- return lease
|
|
| 124 |
- |
|
| 125 |
- def _create_session(self, parent, number_of_leases):
|
|
| 126 |
- self.logger.debug("Creating bot session")
|
|
| 127 |
- worker = self._create_worker()
|
|
| 128 |
- |
|
| 129 |
- """ Unique bot ID within the farm used to identify this bot
|
|
| 130 |
- Needs to be human readable.
|
|
| 131 |
- All prior sessions with bot_id of same ID are invalidated.
|
|
| 132 |
- If a bot attempts to update an invalid session, it must be rejected and
|
|
| 133 |
- may be put in quarantine.
|
|
| 134 |
- """
|
|
| 135 |
- bot_id = '{}.{}'.format(parent, platform.node())
|
|
| 136 |
- |
|
| 137 |
- leases = [bots_pb2.Lease() for x in range(number_of_leases)]
|
|
| 138 |
- |
|
| 139 |
- bot_session = bots_pb2.BotSession(worker = worker,
|
|
| 140 |
- status = bots_pb2.BotStatus.Value('OK'),
|
|
| 141 |
- leases = leases,
|
|
| 142 |
- bot_id = bot_id)
|
|
| 143 |
- self._bot_session = self.interface.create_bot_session(parent, bot_session)
|
|
| 144 |
- self.logger.info("Name: {}, Id: {}".format(self._bot_session.name,
|
|
| 145 |
- self._bot_session.bot_id))
|
|
| 146 |
- |
|
| 147 |
- def _create_worker(self):
|
|
| 148 |
- devices = self._create_devices()
|
|
| 149 |
- |
|
| 150 |
- # Contains a list of devices and the connections between them.
|
|
| 151 |
- worker = worker_pb2.Worker(devices = devices)
|
|
| 152 |
- |
|
| 153 |
- """ Keys supported:
|
|
| 154 |
- *pool
|
|
| 155 |
- """
|
|
| 156 |
- worker.Property.key = "pool"
|
|
| 157 |
- worker.Property.value = "all"
|
|
| 158 |
- |
|
| 159 |
- return worker
|
|
| 160 |
- |
|
| 161 |
- def _create_devices(self):
|
|
| 162 |
- """ Creates devices available to the worker
|
|
| 163 |
- The first device is know as the Primary Device - the revice which
|
|
| 164 |
- is running a bit and responsible to actually executing commands.
|
|
| 165 |
- All other devices are known as Attatched Devices and must be controlled
|
|
| 166 |
- by the Primary Device.
|
|
| 167 |
- """
|
|
| 44 |
+ self._bot_session = bot_session
|
|
| 168 | 45 |
|
| 169 |
- devices = []
|
|
| 46 |
+ def session(self, work, context, continuous = False):
|
|
| 47 |
+ loop = asyncio.get_event_loop()
|
|
| 170 | 48 |
|
| 171 |
- for i in range(0, 1): # Append one device for now
|
|
| 172 |
- dev = worker_pb2.Device()
|
|
| 49 |
+ self._bot_session.create_bot_session(work, context)
|
|
| 173 | 50 |
|
| 174 |
- devices.append(dev)
|
|
| 51 |
+ try:
|
|
| 52 |
+ task = asyncio.ensure_future(self._update_bot_session())
|
|
| 53 |
+ loop.run_forever()
|
|
| 175 | 54 |
|
| 176 |
- return devices
|
|
| 55 |
+ except KeyboardInterrupt:
|
|
| 56 |
+ pass
|
|
| 177 | 57 |
|
| 178 |
- def _lease_pending(self, lease):
|
|
| 179 |
- return lease.state == bots_pb2.LeaseState.Value('PENDING')
|
|
| 58 |
+ finally:
|
|
| 59 |
+ task.cancel()
|
|
| 60 |
+ loop.close()
|
|
| 180 | 61 |
|
| 181 |
- def _lease_active(self, lease):
|
|
| 182 |
- return lease.state == bots_pb2.LeaseState.Value('ACTIVE')
|
|
| 62 |
+ async def _update_bot_session(self):
|
|
| 63 |
+ while True:
|
|
| 64 |
+ """ Calls the server periodically to inform the server the client
|
|
| 65 |
+ has not died.
|
|
| 66 |
+ """
|
|
| 67 |
+ self._bot_session.update_bot_session()
|
|
| 68 |
+ await asyncio.sleep(self.UPDATE_PERIOD)
|
| ... | ... | @@ -29,7 +29,7 @@ from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, bo |
| 29 | 29 |
|
| 30 | 30 |
from .._exceptions import BotError
|
| 31 | 31 |
|
| 32 |
-class BotInterface(object):
|
|
| 32 |
+class BotInterface:
|
|
| 33 | 33 |
""" Interface handles calls to the server.
|
| 34 | 34 |
"""
|
| 35 | 35 |
|
| ... | ... | @@ -39,22 +39,12 @@ class BotInterface(object): |
| 39 | 39 |
self._stub = bots_pb2_grpc.BotsStub(channel)
|
| 40 | 40 |
|
| 41 | 41 |
def create_bot_session(self, parent, bot_session):
|
| 42 |
- try:
|
|
| 43 |
- request = bots_pb2.CreateBotSessionRequest(parent = parent,
|
|
| 44 |
- bot_session = bot_session)
|
|
| 45 |
- return self._stub.CreateBotSession(request)
|
|
| 46 |
- |
|
| 47 |
- except Exception as e:
|
|
| 48 |
- self.logger.error(e)
|
|
| 49 |
- raise BotError(e)
|
|
| 42 |
+ request = bots_pb2.CreateBotSessionRequest(parent = parent,
|
|
| 43 |
+ bot_session = bot_session)
|
|
| 44 |
+ return self._stub.CreateBotSession(request)
|
|
| 50 | 45 |
|
| 51 | 46 |
def update_bot_session(self, bot_session, update_mask = None):
|
| 52 |
- try:
|
|
| 53 |
- request = bots_pb2.UpdateBotSessionRequest(name = bot_session.name,
|
|
| 54 |
- bot_session = bot_session,
|
|
| 55 |
- update_mask = update_mask)
|
|
| 56 |
- return self._stub.UpdateBotSession(request)
|
|
| 57 |
- |
|
| 58 |
- except Exception as e:
|
|
| 59 |
- self.logger.error(e)
|
|
| 60 |
- raise BotError(e)
|
|
| 47 |
+ request = bots_pb2.UpdateBotSessionRequest(name = bot_session.name,
|
|
| 48 |
+ bot_session = bot_session,
|
|
| 49 |
+ update_mask = update_mask)
|
|
| 50 |
+ return self._stub.UpdateBotSession(request)
|
| 1 |
+# Copyright (C) 2018 Bloomberg LP
|
|
| 2 |
+#
|
|
| 3 |
+# Licensed under the Apache License, Version 2.0 (the "License");
|
|
| 4 |
+# you may not use this file except in compliance with the License.
|
|
| 5 |
+# You may obtain a copy of the License at
|
|
| 6 |
+#
|
|
| 7 |
+# <http://www.apache.org/licenses/LICENSE-2.0>
|
|
| 8 |
+#
|
|
| 9 |
+# Unless required by applicable law or agreed to in writing, software
|
|
| 10 |
+# distributed under the License is distributed on an "AS IS" BASIS,
|
|
| 11 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
| 12 |
+# See the License for the specific language governing permissions and
|
|
| 13 |
+# limitations under the License.
|
|
| 14 |
+ |
|
| 15 |
+"""
|
|
| 16 |
+Bot Session
|
|
| 17 |
+====
|
|
| 18 |
+ |
|
| 19 |
+Allows connections
|
|
| 20 |
+"""
|
|
| 21 |
+import asyncio
|
|
| 22 |
+import logging
|
|
| 23 |
+import platform
|
|
| 24 |
+import uuid
|
|
| 25 |
+ |
|
| 26 |
+from enum import Enum
|
|
| 27 |
+ |
|
| 28 |
+from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
|
|
| 29 |
+ |
|
| 30 |
+class BotStatus(Enum):
|
|
| 31 |
+ BOT_STATUS_UNSPECIFIED = bots_pb2.BotStatus.Value('BOT_STATUS_UNSPECIFIED')
|
|
| 32 |
+ OK = bots_pb2.BotStatus.Value('OK')
|
|
| 33 |
+ UNHEALTHY = bots_pb2.BotStatus.Value('UNHEALTHY');
|
|
| 34 |
+ HOST_REBOOTING = bots_pb2.BotStatus.Value('HOST_REBOOTING')
|
|
| 35 |
+ BOT_TERMINATING = bots_pb2.BotStatus.Value('BOT_TERMINATING')
|
|
| 36 |
+ |
|
| 37 |
+class LeaseState(Enum):
|
|
| 38 |
+ LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
|
|
| 39 |
+ PENDING = bots_pb2.LeaseState.Value('PENDING')
|
|
| 40 |
+ ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
|
|
| 41 |
+ COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
|
|
| 42 |
+ CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
|
|
| 43 |
+ |
|
| 44 |
+ |
|
| 45 |
+class BotSession:
|
|
| 46 |
+ def __init__(self, parent, interface):
|
|
| 47 |
+ """ Unique bot ID within the farm used to identify this bot
|
|
| 48 |
+ Needs to be human readable.
|
|
| 49 |
+ All prior sessions with bot_id of same ID are invalidated.
|
|
| 50 |
+ If a bot attempts to update an invalid session, it must be rejected and
|
|
| 51 |
+ may be put in quarantine.
|
|
| 52 |
+ """
|
|
| 53 |
+ |
|
| 54 |
+ self.logger = logging.getLogger(__name__)
|
|
| 55 |
+ |
|
| 56 |
+ self._bot_id = '{}.{}'.format(parent, platform.node())
|
|
| 57 |
+ self._interface = interface
|
|
| 58 |
+ self._leases = {}
|
|
| 59 |
+ self._name = None
|
|
| 60 |
+ self._parent = parent
|
|
| 61 |
+ self._status = BotStatus.OK.value
|
|
| 62 |
+ self._work = None
|
|
| 63 |
+ self._worker = None
|
|
| 64 |
+ |
|
| 65 |
+ @property
|
|
| 66 |
+ def bot_id(self):
|
|
| 67 |
+ return self._bot_id
|
|
| 68 |
+ |
|
| 69 |
+ def add_worker(self, worker):
|
|
| 70 |
+ self._worker = worker
|
|
| 71 |
+ |
|
| 72 |
+ def create_bot_session(self, work, context=None):
|
|
| 73 |
+ self.logger.debug("Creating bot session")
|
|
| 74 |
+ self._work = work
|
|
| 75 |
+ self._context = context
|
|
| 76 |
+ |
|
| 77 |
+ session = self._interface.create_bot_session(self._parent, self.get_pb2())
|
|
| 78 |
+ self._name = session.name
|
|
| 79 |
+ self.logger.info("Created bot session with name: {}".format(self._name))
|
|
| 80 |
+ |
|
| 81 |
+ def update_bot_session(self):
|
|
| 82 |
+ session = self._interface.update_bot_session(self.get_pb2())
|
|
| 83 |
+ for lease in session.leases:
|
|
| 84 |
+ self._update_lease_from_server(lease)
|
|
| 85 |
+ |
|
| 86 |
+ def get_pb2(self):
|
|
| 87 |
+ leases = list(self._leases.values())
|
|
| 88 |
+ if not leases:
|
|
| 89 |
+ leases = None
|
|
| 90 |
+ |
|
| 91 |
+ return bots_pb2.BotSession(worker=self._worker.get_pb2(),
|
|
| 92 |
+ status=self._status,
|
|
| 93 |
+ leases=leases,
|
|
| 94 |
+ bot_id=self._bot_id,
|
|
| 95 |
+ name = self._name)
|
|
| 96 |
+ |
|
| 97 |
+ def lease_completed(self, lease):
|
|
| 98 |
+ lease.state = LeaseState.COMPLETED.value
|
|
| 99 |
+ self._leases[lease.id] = lease
|
|
| 100 |
+ |
|
| 101 |
+ def _update_lease_from_server(self, lease):
|
|
| 102 |
+ """
|
|
| 103 |
+ State machine for any received updates to the leases.
|
|
| 104 |
+ """
|
|
| 105 |
+ ## TODO: Compare with previous state of lease
|
|
| 106 |
+ lease_bot = self._leases.get(lease.id)
|
|
| 107 |
+ |
|
| 108 |
+ if lease.state == LeaseState.PENDING.value:
|
|
| 109 |
+ lease.state = LeaseState.ACTIVE.value
|
|
| 110 |
+ asyncio.ensure_future(self.create_work(lease))
|
|
| 111 |
+ self._leases[lease.id] = lease
|
|
| 112 |
+ |
|
| 113 |
+ elif lease.state == LeaseState.COMPLETED.value and \
|
|
| 114 |
+ lease_bot.state == LeaseState.COMPLETED.value:
|
|
| 115 |
+ del self._leases[lease.id]
|
|
| 116 |
+ |
|
| 117 |
+ async def create_work(self, lease):
|
|
| 118 |
+ self.logger.debug("Work created: {}".format(lease.id))
|
|
| 119 |
+ lease = await self._work(self._context, lease)
|
|
| 120 |
+ self.logger.debug("Work complete: {}".format(lease.id))
|
|
| 121 |
+ self.lease_completed(lease)
|
|
| 122 |
+ |
|
| 123 |
+class Worker:
|
|
| 124 |
+ def __init__(self, properties=None, configs=None):
|
|
| 125 |
+ self.properties = {}
|
|
| 126 |
+ self._configs = {}
|
|
| 127 |
+ self._devices = []
|
|
| 128 |
+ |
|
| 129 |
+ if properties:
|
|
| 130 |
+ for k, v in properties.items():
|
|
| 131 |
+ if k == 'pool':
|
|
| 132 |
+ self.properties[k] = v
|
|
| 133 |
+ else:
|
|
| 134 |
+ raise KeyError('Key not supported: {}'.format(k))
|
|
| 135 |
+ |
|
| 136 |
+ if configs:
|
|
| 137 |
+ for k, v in configs.items():
|
|
| 138 |
+ if k == 'DockerImage':
|
|
| 139 |
+ self.properties[k] = v
|
|
| 140 |
+ else:
|
|
| 141 |
+ raise KeyError('Key not supported: {}'.format(k))
|
|
| 142 |
+ |
|
| 143 |
+ def add_device(self, device):
|
|
| 144 |
+ self._devices.append(device)
|
|
| 145 |
+ |
|
| 146 |
+ def get_pb2(self):
|
|
| 147 |
+ devices = [device.get_pb2() for device in self._devices]
|
|
| 148 |
+ worker = worker_pb2.Worker(devices=devices)
|
|
| 149 |
+ property_message = worker_pb2.Worker.Property()
|
|
| 150 |
+ for k, v in self.properties.items():
|
|
| 151 |
+ property_message.key = k
|
|
| 152 |
+ property_message.value = v
|
|
| 153 |
+ worker.properties.extend([property_message])
|
|
| 154 |
+ |
|
| 155 |
+ config_message = worker_pb2.Worker.Config()
|
|
| 156 |
+ for k, v in self.properties.items():
|
|
| 157 |
+ property_message.key = k
|
|
| 158 |
+ property_message.value = v
|
|
| 159 |
+ worker.configs.extend([config_message])
|
|
| 160 |
+ |
|
| 161 |
+ return worker
|
|
| 162 |
+ |
|
| 163 |
+class Device:
|
|
| 164 |
+ def __init__(self, properties=None):
|
|
| 165 |
+ """ Creates devices available to the worker
|
|
| 166 |
+ The first device is known as the Primary Device - the device which
|
|
| 167 |
+ is running a bot and responsible for actually executing commands.
|
|
| 168 |
+ All other devices are known as Attached Devices and must be controlled
|
|
| 169 |
+ by the Primary Device.
|
|
| 170 |
+ """
|
|
| 171 |
+ |
|
| 172 |
+ self._name = str(uuid.uuid4())
|
|
| 173 |
+ self._properties = {}
|
|
| 174 |
+ |
|
| 175 |
+ if properties:
|
|
| 176 |
+ for k, v in properties.items():
|
|
| 177 |
+ if k == 'os':
|
|
| 178 |
+ self._properties[k] = v
|
|
| 179 |
+ |
|
| 180 |
+ elif k == 'docker':
|
|
| 181 |
+ if v not in ('True', 'False'):
|
|
| 182 |
+ raise ValueError('Value not supported: {}'.format(v))
|
|
| 183 |
+ self._properties[k] = v
|
|
| 184 |
+ |
|
| 185 |
+ else:
|
|
| 186 |
+ raise KeyError('Key not supported: {}'.format(k))
|
|
| 187 |
+ |
|
| 188 |
+ def get_pb2(self):
|
|
| 189 |
+ device = worker_pb2.Device(handle=self._name)
|
|
| 190 |
+ property_message = worker_pb2.Device.Property()
|
|
| 191 |
+ for k, v in self._properties.items():
|
|
| 192 |
+ property_message.key = k
|
|
| 193 |
+ property_message.value = v
|
|
| 194 |
+ device.properties.extend([property_message])
|
|
| 195 |
+ return device
|
| ... | ... | @@ -57,9 +57,9 @@ class Job(): |
| 57 | 57 |
self.lease = None
|
| 58 | 58 |
self.logger = logging.getLogger(__name__)
|
| 59 | 59 |
self.name = str(uuid.uuid4())
|
| 60 |
+ self.n_tries = 0
|
|
| 60 | 61 |
self.result = None
|
| 61 | 62 |
|
| 62 |
- self._n_tries = 0
|
|
| 63 | 63 |
self._operation = operations_pb2.Operation(name = self.name)
|
| 64 | 64 |
self._operation_update_queues = []
|
| 65 | 65 |
|
| ... | ... | @@ -97,8 +97,7 @@ class Job(): |
| 97 | 97 |
action_digest = self._pack_any(self.action_digest)
|
| 98 | 98 |
|
| 99 | 99 |
lease = bots_pb2.Lease(id = self.name,
|
| 100 |
- payload = action_digest,
|
|
| 101 |
- state = LeaseState.PENDING.value)
|
|
| 100 |
+ payload = action_digest)
|
|
| 102 | 101 |
self.lease = lease
|
| 103 | 102 |
return lease
|
| 104 | 103 |
|
| ... | ... | @@ -50,24 +50,19 @@ class Scheduler(): |
| 50 | 50 |
self.queue.append(job)
|
| 51 | 51 |
|
| 52 | 52 |
def retry_job(self, name):
|
| 53 |
- job = self.jobs[name]
|
|
| 54 |
- |
|
| 55 |
- if job.n_tries >= self.MAX_N_TRIES:
|
|
| 56 |
- # TODO: Decide what to do with these jobs
|
|
| 57 |
- job.update_execute_stage(ExecuteStage.COMPLETED)
|
|
| 58 |
- else:
|
|
| 59 |
- job.update_execute_stage(ExecuteStage.QUEUED)
|
|
| 60 |
- job.n_tries += 1
|
|
| 61 |
- self.queue.appendleft(job)
|
|
| 53 |
+ job = self.jobs.get(name)
|
|
| 62 | 54 |
|
| 63 |
- self.jobs[name] = job
|
|
| 55 |
+ if job is not None:
|
|
| 56 |
+ if job.n_tries >= self.MAX_N_TRIES:
|
|
| 57 |
+ # TODO: Decide what to do with these jobs
|
|
| 58 |
+ job.update_execute_stage(ExecuteStage.COMPLETED)
|
|
| 59 |
+ # TODO: Mark these jobs as done
|
|
| 60 |
+ else:
|
|
| 61 |
+ job.update_execute_stage(ExecuteStage.QUEUED)
|
|
| 62 |
+ job.n_tries += 1
|
|
| 63 |
+ self.queue.appendleft(job)
|
|
| 64 | 64 |
|
| 65 |
- def create_job(self):
|
|
| 66 |
- if len(self.queue) > 0:
|
|
| 67 |
- job = self.queue.popleft()
|
|
| 68 |
- job.update_execute_stage(ExecuteStage.EXECUTING)
|
|
| 69 |
- self.jobs[job.name] = job
|
|
| 70 |
- return job
|
|
| 65 |
+ self.jobs[name] = job
|
|
| 71 | 66 |
|
| 72 | 67 |
def job_complete(self, name, result):
|
| 73 | 68 |
job = self.jobs[name]
|
| ... | ... | @@ -81,48 +76,13 @@ class Scheduler(): |
| 81 | 76 |
response.operations.extend([v.get_operation()])
|
| 82 | 77 |
return response
|
| 83 | 78 |
|
| 84 |
- def update_lease(self, lease):
|
|
| 85 |
- name = lease.id
|
|
| 79 |
+ def update_job_lease_state(self, name, state):
|
|
| 86 | 80 |
job = self.jobs.get(name)
|
| 87 |
- state = lease.state
|
|
| 88 |
- |
|
| 89 |
- if state == LeaseState.LEASE_STATE_UNSPECIFIED.value:
|
|
| 90 |
- create_job = self.create_job()
|
|
| 91 |
- if create_job is None:
|
|
| 92 |
- # No job? Return lease.
|
|
| 93 |
- return lease
|
|
| 94 |
- else:
|
|
| 95 |
- job = create_job
|
|
| 96 |
- job.lease = job.create_lease()
|
|
| 97 |
- |
|
| 98 |
- elif state == LeaseState.PENDING.value:
|
|
| 99 |
- job.lease = lease
|
|
| 100 |
- |
|
| 101 |
- elif state == LeaseState.ACTIVE.value:
|
|
| 102 |
- job.lease = lease
|
|
| 103 |
- |
|
| 104 |
- elif state == LeaseState.COMPLETED.value:
|
|
| 105 |
- self.job_complete(job.name, lease.result)
|
|
| 106 |
- |
|
| 107 |
- create_job = self.create_job()
|
|
| 108 |
- if create_job is None:
|
|
| 109 |
- # Docs say not to use this state though if job has
|
|
| 110 |
- # completed and no more jobs, then use this state to stop
|
|
| 111 |
- # job being processed again
|
|
| 112 |
- job.lease = lease
|
|
| 113 |
- job.lease.state = LeaseState.LEASE_STATE_UNSPECIFIED.value
|
|
| 114 |
- else:
|
|
| 115 |
- job = create_job
|
|
| 116 |
- job.lease = job.create_lease()
|
|
| 117 |
- |
|
| 118 |
- elif state == LeaseState.CANCELLED.value:
|
|
| 119 |
- job.lease = lease
|
|
| 120 |
- |
|
| 121 |
- else:
|
|
| 122 |
- raise Exception("Unknown state: {}".format(state))
|
|
| 123 |
- |
|
| 81 |
+ job.lease.state = state
|
|
| 124 | 82 |
self.jobs[name] = job
|
| 125 |
- return job.lease
|
|
| 83 |
+ |
|
| 84 |
+ def get_job_lease_state(self, name):
|
|
| 85 |
+ return self.jobs[name].lease.state
|
|
| 126 | 86 |
|
| 127 | 87 |
def cancel_session(self, name):
|
| 128 | 88 |
job = self.jobs[name]
|
| ... | ... | @@ -131,6 +91,15 @@ class Scheduler(): |
| 131 | 91 |
state == LeaseState.ACTIVE.value:
|
| 132 | 92 |
self.retry_job(name)
|
| 133 | 93 |
|
| 94 |
+ def create_leases(self):
|
|
| 95 |
+ while len(self.queue) > 0:
|
|
| 96 |
+ job = self.queue.popleft()
|
|
| 97 |
+ job.update_execute_stage(ExecuteStage.EXECUTING)
|
|
| 98 |
+ job.lease = job.create_lease()
|
|
| 99 |
+ job.lease.state = LeaseState.PENDING.value
|
|
| 100 |
+ self.jobs[job.name] = job
|
|
| 101 |
+ yield job.lease
|
|
| 102 |
+ |
|
| 134 | 103 |
def _update_execute_stage(self, job, stage):
|
| 135 | 104 |
job.update_execute_stage(stage)
|
| 136 | 105 |
return job
|
| ... | ... | @@ -35,6 +35,7 @@ class BotsInterface(): |
| 35 | 35 |
self.logger = logging.getLogger(__name__)
|
| 36 | 36 |
|
| 37 | 37 |
self._bot_ids = {}
|
| 38 |
+ self._bot_sessions = {}
|
|
| 38 | 39 |
self._scheduler = scheduler
|
| 39 | 40 |
|
| 40 | 41 |
def create_bot_session(self, parent, bot_session):
|
| ... | ... | @@ -59,6 +60,7 @@ class BotsInterface(): |
| 59 | 60 |
bot_session.name = name
|
| 60 | 61 |
|
| 61 | 62 |
self._bot_ids[name] = bot_id
|
| 63 |
+ self._bot_sessions[name] = bot_session
|
|
| 62 | 64 |
self.logger.info("Created bot session name={} with bot_id={}".format(name, bot_id))
|
| 63 | 65 |
return bot_session
|
| 64 | 66 |
|
| ... | ... | @@ -69,13 +71,61 @@ class BotsInterface(): |
| 69 | 71 |
self.logger.debug("Updating bot session name={}".format(name))
|
| 70 | 72 |
self._check_bot_ids(bot_session.bot_id, name)
|
| 71 | 73 |
|
| 72 |
- leases = [self._scheduler.update_lease(lease) for lease in bot_session.leases]
|
|
| 74 |
+ server_session = self._bot_sessions[name]
|
|
| 75 |
+ |
|
| 76 |
+ leases = [self.check_states(lease) for lease in bot_session.leases]
|
|
| 73 | 77 |
|
| 74 | 78 |
del bot_session.leases[:]
|
| 75 | 79 |
bot_session.leases.extend(leases)
|
| 76 | 80 |
|
| 81 |
+ for lease in self._scheduler.create_leases():
|
|
| 82 |
+ bot_session.leases.extend([lease])
|
|
| 83 |
+ |
|
| 84 |
+ self._bot_sessions[name] = bot_session
|
|
| 77 | 85 |
return bot_session
|
| 78 | 86 |
|
| 87 |
+ def check_states(self, lease_client):
|
|
| 88 |
+ """ Edge detector for states
|
|
| 89 |
+ """
|
|
| 90 |
+ ## TODO: Handle cancelled states
|
|
| 91 |
+ server_state = LeaseState(self._scheduler.get_job_lease_state(lease_client.id))
|
|
| 92 |
+ client_state = LeaseState(lease_client.state)
|
|
| 93 |
+ |
|
| 94 |
+ if server_state == LeaseState.PENDING:
|
|
| 95 |
+ |
|
| 96 |
+ if client_state == LeaseState.ACTIVE:
|
|
| 97 |
+ self._scheduler.update_job_lease_state(lease_client.id, lease_client.state)
|
|
| 98 |
+ elif client_state == LeaseState.COMPLETED:
|
|
| 99 |
+ # TODO: Lease was rejected
|
|
| 100 |
+ raise NotImplementedError("'Not Accepted' is unsupported")
|
|
| 101 |
+ else:
|
|
| 102 |
+ raise OutofSyncError("Server lease: {}. Client lease: {}".format(lease_server, lease_client))
|
|
| 103 |
+ |
|
| 104 |
+ elif server_state == LeaseState.ACTIVE:
|
|
| 105 |
+ |
|
| 106 |
+ if client_state == LeaseState.ACTIVE:
|
|
| 107 |
+ pass
|
|
| 108 |
+ |
|
| 109 |
+ elif client_state == LeaseState.COMPLETED:
|
|
| 110 |
+ self._scheduler.job_complete(lease_client.id, lease_client.result)
|
|
| 111 |
+ self._scheduler.update_job_lease_state(lease_client.id, lease_client.state)
|
|
| 112 |
+ |
|
| 113 |
+ else:
|
|
| 114 |
+ raise OutofSyncError("Server lease: {}. Client lease: {}".format(lease_server, lease_client))
|
|
| 115 |
+ |
|
| 116 |
+ elif server_state == LeaseState.COMPLETED:
|
|
| 117 |
+ raise OutofSyncError("Server lease: {}. Client lease: {}".format(lease_server, lease_client))
|
|
| 118 |
+ |
|
| 119 |
+ elif server_state == LeaseState.CANCELLED:
|
|
| 120 |
+ raise NotImplementedError("Cancelled states not supported yet")
|
|
| 121 |
+ |
|
| 122 |
+ else:
|
|
| 123 |
+ # Should never get here
|
|
| 124 |
+ raise OutofSyncError("State now allowed: {}".format(server_state))
|
|
| 125 |
+ |
|
| 126 |
+ return lease_client
|
|
| 127 |
+ |
|
| 128 |
+ |
|
| 79 | 129 |
def _check_bot_ids(self, bot_id, name = None):
|
| 80 | 130 |
""" Checks the ID and the name of the bot.
|
| 81 | 131 |
"""
|
| ... | ... | @@ -103,7 +153,10 @@ class BotsInterface(): |
| 103 | 153 |
raise InvalidArgumentError("Bot id does not exist: {}".format(name))
|
| 104 | 154 |
|
| 105 | 155 |
self.logger.debug("Attempting to close {} with name: {}".format(bot_id, name))
|
| 106 |
- self._scheduler.retry_job(name)
|
|
| 156 |
+ for lease in self._bot_sessions[name].leases:
|
|
| 157 |
+ if lease.state != LeaseState.COMPLETED.value:
|
|
| 158 |
+ self._scheduler.retry_job(lease.id)
|
|
| 159 |
+ |
|
| 107 | 160 |
self.logger.debug("Closing bot session: {}".format(name))
|
| 108 | 161 |
self._bot_ids.pop(name)
|
| 109 | 162 |
self.logger.info("Closed bot {} with name: {}".format(bot_id, name))
|
| ... | ... | @@ -22,6 +22,8 @@ import mock |
| 22 | 22 |
import pytest
|
| 23 | 23 |
import uuid
|
| 24 | 24 |
|
| 25 |
+from google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
|
|
| 26 |
+ |
|
| 25 | 27 |
from buildgrid.bot import bot, bot_interface
|
| 26 | 28 |
|
| 27 | 29 |
async def _work_dummy(context, lease):
|
| ... | ... | @@ -39,16 +41,9 @@ def context(): |
| 39 | 41 |
# GRPC context
|
| 40 | 42 |
@pytest.fixture
|
| 41 | 43 |
def channel():
|
| 42 |
- yield mock.MagicMock(spec = grpc.insecure_channel(''))
|
|
| 44 |
+ yield mock.MagicMock(spec = grpc.insecure_channel)
|
|
| 43 | 45 |
|
| 44 |
-@pytest.fixture
|
|
| 45 |
-def instance(channel):
|
|
| 46 |
- yield bot.Bot(work=_work_dummy,
|
|
| 47 |
- context=ContextMock(),
|
|
| 48 |
- channel=channel,
|
|
| 49 |
- parent='rach',
|
|
| 50 |
- number_of_leases=1,
|
|
| 51 |
- continuous=False)
|
|
| 52 |
- |
|
| 53 |
-def test_create_job(instance):
|
|
| 54 |
- instance.bot_session()
|
|
| 46 |
+@mock.patch.object(bot.bot_interface, 'bots_pb2', autospec = True)
|
|
| 47 |
+@mock.patch.object(bot.bot_interface, 'bots_pb2_grpc', autospec = True)
|
|
| 48 |
+def test_me(mock_pb2, mock_pb2_grpc, channel, context):
|
|
| 49 |
+ pass
|
