Merge pull request #1074 from dod-ccpo/lock-environments

Implement simple locking system for environments
This commit is contained in:
richard-dds
2019-09-17 15:44:25 -04:00
committed by GitHub
7 changed files with 281 additions and 84 deletions

View File

@@ -1,7 +1,7 @@
from sqlalchemy import text
from sqlalchemy import text, func, or_
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.orm import load_only
from typing import List
from uuid import UUID
from atst.database import db
from atst.models import Environment, Application, Portfolio, TaskOrder, CLIN
@@ -97,44 +97,53 @@ class Environments(object):
@classmethod
def base_provision_query(cls, now):
return (
db.session.query(Environment)
db.session.query(Environment.id)
.join(Application)
.join(Portfolio)
.join(TaskOrder)
.join(CLIN)
.filter(CLIN.start_date <= now)
.filter(CLIN.end_date > now)
# select only these columns
.options(load_only("id", "creator_id"))
.filter(
or_(
Environment.claimed_until == None,
Environment.claimed_until <= func.now(),
)
)
)
@classmethod
def get_environments_pending_creation(cls, now) -> List[Environment]:
def get_environments_pending_creation(cls, now) -> List[UUID]:
"""
Any environment with an active CLIN that doesn't yet have a `cloud_id`.
"""
return cls.base_provision_query(now).filter(Environment.cloud_id == None).all()
results = (
cls.base_provision_query(now).filter(Environment.cloud_id == None).all()
)
return [id_ for id_, in results]
@classmethod
def get_environments_pending_atat_user_creation(cls, now) -> List[Environment]:
def get_environments_pending_atat_user_creation(cls, now) -> List[UUID]:
"""
Any environment with an active CLIN that has a cloud_id but no `root_user_info`.
"""
return (
results = (
cls.base_provision_query(now)
.filter(Environment.cloud_id != None)
.filter(Environment.root_user_info == text("'null'"))
).all()
return [id_ for id_, in results]
@classmethod
def get_environments_pending_baseline_creation(cls, now) -> List[Environment]:
def get_environments_pending_baseline_creation(cls, now) -> List[UUID]:
"""
Any environment with an active CLIN that has a `cloud_id` and `root_user_info`
but no `baseline_info`.
"""
return (
results = (
cls.base_provision_query(now)
.filter(Environment.cloud_id != None)
.filter(Environment.root_user_info != text("'null'"))
.filter(Environment.baseline_info == text("'null'"))
).all()
return [id_ for id_, in results]

View File

@@ -44,3 +44,12 @@ class NoAccessError(Exception):
@property
def message(self):
    """Human-readable description naming the resource whose route was refused."""
    template = "Route for {} cannot be accessed"
    return template.format(self.resource_name)
class ClaimFailedException(Exception):
    """
    Raised when an exclusive claim on a resource could not be acquired.

    Attributes:
        resource: The object the claim was attempted on. Its class name and
            `id` attribute are used to build the error message.
    """

    def __init__(self, resource):
        self.resource = resource
        super().__init__(self.message)

    @property
    def message(self):
        """Human-readable description, exposed like the sibling exceptions' `message`."""
        return (
            f"Could not acquire claim for "
            f"{self.resource.__class__.__name__} {self.resource.id}."
        )

View File

@@ -6,7 +6,7 @@ from atst.queue import celery
from atst.models import EnvironmentJobFailure, EnvironmentRoleJobFailure
from atst.domain.csp.cloud import CloudProviderInterface, GeneralCSPException
from atst.domain.environments import Environments
from atst.domain.users import Users
from atst.models.utils import claim_for_update
class RecordEnvironmentFailure(celery.Task):
@@ -44,58 +44,61 @@ def send_notification_mail(recipients, subject, body):
app.mailer.send(recipients, subject, body)
def do_create_environment(
csp: CloudProviderInterface, environment_id=None, atat_user_id=None
):
def do_create_environment(csp: CloudProviderInterface, environment_id=None):
environment = Environments.get(environment_id)
if environment.cloud_id is not None:
# TODO: Return value for this?
return
with claim_for_update(environment) as environment:
user = Users.get(atat_user_id)
if environment.cloud_id is not None:
# TODO: Return value for this?
return
# we'll need to do some checking in this job for cases where it's retrying
# when a failure occurred after some successful steps
# (e.g. if environment.cloud_id is not None, then we can skip first step)
user = environment.creator
# credentials either from a given user or pulled from config?
# if using global creds, do we need to log what user authorized action?
atat_root_creds = csp.root_creds()
# we'll need to do some checking in this job for cases where it's retrying
# when a failure occurred after some successful steps
# (e.g. if environment.cloud_id is not None, then we can skip first step)
# user is needed because baseline root account in the environment will
# be assigned to the requesting user, open question how to handle duplicate
# email addresses across new environments
csp_environment_id = csp.create_environment(atat_root_creds, user, environment)
environment.cloud_id = csp_environment_id
db.session.add(environment)
db.session.commit()
# credentials either from a given user or pulled from config?
# if using global creds, do we need to log what user authorized action?
atat_root_creds = csp.root_creds()
# user is needed because baseline root account in the environment will
# be assigned to the requesting user, open question how to handle duplicate
# email addresses across new environments
csp_environment_id = csp.create_environment(atat_root_creds, user, environment)
environment.cloud_id = csp_environment_id
db.session.add(environment)
db.session.commit()
def do_create_atat_admin_user(csp: CloudProviderInterface, environment_id=None):
environment = Environments.get(environment_id)
atat_root_creds = csp.root_creds()
atat_remote_root_user = csp.create_atat_admin_user(
atat_root_creds, environment.cloud_id
)
environment.root_user_info = atat_remote_root_user
db.session.add(environment)
db.session.commit()
with claim_for_update(environment) as environment:
atat_root_creds = csp.root_creds()
atat_remote_root_user = csp.create_atat_admin_user(
atat_root_creds, environment.cloud_id
)
environment.root_user_info = atat_remote_root_user
db.session.add(environment)
db.session.commit()
def do_create_environment_baseline(csp: CloudProviderInterface, environment_id=None):
environment = Environments.get(environment_id)
# ASAP switch to use remote root user for provisioning
atat_remote_root_creds = environment.root_user_info["credentials"]
with claim_for_update(environment) as environment:
# ASAP switch to use remote root user for provisioning
atat_remote_root_creds = environment.root_user_info["credentials"]
baseline_info = csp.create_environment_baseline(
atat_remote_root_creds, environment.cloud_id
)
environment.baseline_info = baseline_info
db.session.add(environment)
db.session.commit()
baseline_info = csp.create_environment_baseline(
atat_remote_root_creds, environment.cloud_id
)
environment.baseline_info = baseline_info
db.session.add(environment)
db.session.commit()
def do_work(fn, task, csp, **kwargs):
@@ -106,39 +109,46 @@ def do_work(fn, task, csp, **kwargs):
@celery.task(bind=True)
def create_environment(self, environment_id=None, atat_user_id=None):
do_work(do_create_environment, self, app.csp.cloud, **kwargs)
def create_environment(self, environment_id=None):
do_work(do_create_environment, self, app.csp.cloud, environment_id=environment_id)
@celery.task(bind=True)
def create_atat_admin_user(self, environment_id=None):
do_work(do_create_atat_admin_user, self, app.csp.cloud, **kwargs)
do_work(
do_create_atat_admin_user, self, app.csp.cloud, environment_id=environment_id
)
@celery.task(bind=True)
def create_environment_baseline(self, environment_id=None):
do_work(do_create_environment_baseline, self, app.csp.cloud, **kwargs)
do_work(
do_create_environment_baseline,
self,
app.csp.cloud,
environment_id=environment_id,
)
@celery.task(bind=True)
def dispatch_create_environment(self):
for environment in Environments.get_environments_pending_creation(pendulum.now()):
create_environment.delay(
environment_id=environment.id, atat_user_id=environment.creator_id
)
for environment_id in Environments.get_environments_pending_creation(
pendulum.now()
):
create_environment.delay(environment_id=environment_id)
@celery.task(bind=True)
def dispatch_create_atat_admin_user(self):
for environment in Environments.get_environments_pending_atat_user_creation(
for environment_id in Environments.get_environments_pending_atat_user_creation(
pendulum.now()
):
create_atat_admin_user.delay(environment_id=environment.id)
create_atat_admin_user.delay(environment_id=environment_id)
@celery.task(bind=True)
def dispatch_create_environment_baseline(self):
for environment in Environments.get_environments_pending_baseline_creation(
for environment_id in Environments.get_environments_pending_baseline_creation(
pendulum.now()
):
create_environment_baseline.delay(environment_id=environment.id)
create_environment_baseline.delay(environment_id=environment_id)

View File

@@ -1,4 +1,4 @@
from sqlalchemy import Column, ForeignKey, String
from sqlalchemy import Column, ForeignKey, String, TIMESTAMP
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import JSONB
from enum import Enum
@@ -29,6 +29,8 @@ class Environment(
root_user_info = Column(JSONB)
baseline_info = Column(JSONB)
claimed_until = Column(TIMESTAMP(timezone=True))
job_failures = relationship("EnvironmentJobFailure")
class ProvisioningStatus(Enum):

49
atst/models/utils.py Normal file
View File

@@ -0,0 +1,49 @@
from sqlalchemy import func, sql, Interval, and_, or_
from contextlib import contextmanager
from atst.database import db
from atst.domain.exceptions import ClaimFailedException
@contextmanager
def claim_for_update(resource, minutes=30):
    """
    Hold an exclusive, self-expiring claim on `resource` for the duration of a
    `with` block.

    The expiry is computed from the database's `now()` rather than the
    application server's clock, so servers whose clocks have drifted still
    agree on whether a claim is live.

    Args:
        resource: A SQLAlchemy model instance with `id` and `claimed_until`
            attributes.
        minutes: The maximum amount of time, in minutes, to hold the claim.

    Yields:
        A freshly queried instance of the claimed row.

    Raises:
        ClaimFailedException: If the conditional update matched no row, i.e.
            the row is under an unexpired claim or no row has `resource.id`.

    NOTE(review): this function never commits the session. Whether other
    sessions can observe the claim and its release depends on the caller's
    transaction handling -- confirm against the callers.
    """
    model_cls = resource.__class__

    # Build "<minutes> MINUTES" as an SQL interval and add it to the DB clock.
    hold_duration = func.cast(sql.functions.concat(minutes, " MINUTES"), Interval)
    expires_at = func.now() + hold_duration

    # A row is claimable if it was never claimed or its claim has lapsed.
    is_claimable = or_(
        model_cls.claimed_until.is_(None),
        model_cls.claimed_until <= func.now(),
    )

    # Optimistic claim: a single conditional UPDATE. Zero affected rows means
    # somebody else holds the claim, so give up immediately.
    claimed_count = (
        db.session.query(model_cls)
        .filter(model_cls.id == resource.id, is_claimable)
        .update({"claimed_until": expires_at}, synchronize_session="fetch")
    )
    if claimed_count < 1:
        raise ClaimFailedException(resource)

    # Re-read the row that was just claimed.
    held = db.session.query(model_cls).filter(model_cls.id == resource.id).one()

    try:
        # Hand the claimed row to the body of the `with` block.
        yield held
    finally:
        # Release the claim whether the body succeeded or raised.
        db.session.query(model_cls).filter(
            model_cls.id == resource.id,
            model_cls.claimed_until.isnot(None),
        ).update({"claimed_until": None}, synchronize_session="fetch")