Record job failures with application context.

AT-AT needs to track which user tasks failed and why. To accomplish
this, we:

- Enabled Celery's result backend, which logs task results to a data
  store (a Postgres table, in our case).
  (https://docs.celeryproject.org/en/latest/userguide/tasks.html#result-backends)
- Created tables to track the relationships between the relevant models
  (Environment, EnvironmentRole) and their task failures.
- Added an `on_failure` hook that tasks can use. The hook adds records
  to the job failure tables (see the sketch below).
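
For example, a task opts into failure tracking by declaring one of
these classes as its Celery base class. A minimal sketch (the task name
`do_provision` and its body are hypothetical; the pattern follows the
tests in this commit):

    @celery.task(bind=True, base=RecordEnvironmentFailure)
    def do_provision(self, environment_id=None):
        # Any exception raised here triggers on_failure, which records
        # an EnvironmentJobFailure row pointing back at the environment.
        ...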

Now a resource like an `Environment` has access to its task failures
through the corresponding failure table.
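
Concretely, a sketch of the lookup (assumes a failed task has already
been recorded for this environment):

    failure = environment.job_failures[0]  # EnvironmentJobFailure record
    failure.task_id  # the Celery task ID of the failed job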

Notes:
- It might eventually be useful to add a real foreign key to the Celery
  results table. I did not do that here because it would require
  explicitly storing the Celery results table schema as a migration and
  adding a model for it. As implemented, AT-AT can remain agnostic
  about where the results live.
- Because we store task results indefinitely, it is important to flag
  tasks whose results we do not care about (like `send_mail`) with the
  `ignore_result` kwarg.
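
  As applied in the diff below:

      @celery.task(ignore_result=True)
      def send_mail(recipients, subject, body):
          app.mailer.send(recipients, subject, body)
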
dandds 2019-09-05 11:07:51 -04:00
parent 1cbefb099b
commit 7010bdb09c
11 changed files with 158 additions and 2 deletions


@@ -0,0 +1,42 @@
"""add job failure tables

Revision ID: 30ea1cb20807
Revises: 4a3122ffe898
Create Date: 2019-09-06 06:56:25.685805

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = '30ea1cb20807' # pragma: allowlist secret
down_revision = '4a3122ffe898' # pragma: allowlist secret
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('environment_job_failures',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('task_id', sa.String(), nullable=False),
        sa.Column('environment_id', postgresql.UUID(as_uuid=True), nullable=False),
        sa.ForeignKeyConstraint(['environment_id'], ['environments.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('environment_role_job_failures',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('task_id', sa.String(), nullable=False),
        sa.Column('environment_role_id', postgresql.UUID(as_uuid=True), nullable=False),
        sa.ForeignKeyConstraint(['environment_role_id'], ['environment_roles.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('environment_role_job_failures')
    op.drop_table('environment_job_failures')
    # ### end Alembic commands ###


@@ -169,6 +169,12 @@ def map_config(config):
        "LIMIT_CONCURRENT_SESSIONS": config.getboolean(
            "default", "LIMIT_CONCURRENT_SESSIONS"
        ),
        # Store the celery task results in a database table (celery_taskmeta)
        "CELERY_RESULT_BACKEND": "db+{}".format(config.get("default", "DATABASE_URI")),
        # Do not automatically delete results (by default, Celery will do this
        # with a Beat job once a day)
        "CELERY_RESULT_EXPIRES": 0,
        "CELERY_RESULT_EXTENDED": True,
    }
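
Note: `CELERY_RESULT_EXTENDED` asks the backend to also store extended
task metadata (the task's name, args, and kwargs) with each result,
provided the installed Celery version supports the setting.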


@@ -1,14 +1,36 @@
from flask import current_app as app

from atst.queue import celery
from atst.database import db
from atst.models import EnvironmentJobFailure, EnvironmentRoleJobFailure


class RecordEnvironmentFailure(celery.Task):
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        if "environment_id" in kwargs:
            failure = EnvironmentJobFailure(
                environment_id=kwargs["environment_id"], task_id=task_id
            )
            db.session.add(failure)
            db.session.commit()


class RecordEnvironmentRoleFailure(celery.Task):
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        if "environment_role_id" in kwargs:
            failure = EnvironmentRoleJobFailure(
                environment_role_id=kwargs["environment_role_id"], task_id=task_id
            )
            db.session.add(failure)
            db.session.commit()


@celery.task(ignore_result=True)
def send_mail(recipients, subject, body):
    app.mailer.send(recipients, subject, body)


@celery.task(ignore_result=True)
def send_notification_mail(recipients, subject, body):
    app.logger.info(
        "Sending a notification to these recipients: {}\n\nSubject: {}\n\n{}".format(


@@ -11,6 +11,7 @@ from .audit_event import AuditEvent
from .clin import CLIN, JEDICLINType
from .environment import Environment
from .environment_role import EnvironmentRole, CSPRole
from .job_failure import EnvironmentJobFailure, EnvironmentRoleJobFailure
from .notification_recipient import NotificationRecipient
from .permissions import Permissions
from .permission_set import PermissionSet


@@ -19,6 +19,8 @@ class Environment(
    cloud_id = Column(String)

    job_failures = relationship("EnvironmentJobFailure")

    @property
    def users(self):
        return {r.application_role.user for r in self.roles}


@@ -31,6 +31,8 @@ class EnvironmentRole(
    )
    application_role = relationship("ApplicationRole")

    job_failures = relationship("EnvironmentRoleJobFailure")

    def __repr__(self):
        return "<EnvironmentRole(role='{}', user='{}', environment='{}', id='{}')>".format(
            self.role, self.application_role.user_name, self.environment.name, self.id


@@ -0,0 +1,16 @@
from sqlalchemy import Column, ForeignKey

from atst.models import Base
from atst.models import mixins


class EnvironmentJobFailure(Base, mixins.JobFailureMixin):
    __tablename__ = "environment_job_failures"

    environment_id = Column(ForeignKey("environments.id"), nullable=False)


class EnvironmentRoleJobFailure(Base, mixins.JobFailureMixin):
    __tablename__ = "environment_role_job_failures"

    environment_role_id = Column(ForeignKey("environment_roles.id"), nullable=False)


@@ -3,3 +3,4 @@ from .auditable import AuditableMixin
from .permissions import PermissionsMixin
from .deletable import DeletableMixin
from .invites import InvitesMixin
from .job_failure import JobFailureMixin


@@ -0,0 +1,14 @@
from celery.result import AsyncResult
from sqlalchemy import Column, String, Integer


class JobFailureMixin(object):
    id = Column(Integer(), primary_key=True)
    task_id = Column(String(), nullable=False)

    @property
    def task(self):
        if not hasattr(self, "_task"):
            self._task = AsyncResult(self.task_id)

        return self._task
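
Because `task` wraps the stored task ID in a Celery `AsyncResult`, a
failure record exposes whatever the result backend captured. A sketch
using standard `AsyncResult` attributes:

    failure.task.state      # e.g. "FAILURE"
    failure.task.traceback  # traceback string saved by the result backend
    failure.task.result     # for a failed task, the raised exception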


@@ -320,3 +320,12 @@ def notification_sender(app):
    yield app.notification_sender

    app.notification_sender = real_notification_sender


# This is the only effective means I could find to disable logging. Setting a
# `celery_enable_logging` fixture to return False should work according to the
# docs, but doesn't:
# https://docs.celeryproject.org/en/latest/userguide/testing.html#celery-enable-logging-override-to-enable-logging-in-embedded-workers
@pytest.fixture(scope="function")
def celery_worker_parameters():
    return {"loglevel": "FATAL"}

tests/test_jobs.py

@@ -0,0 +1,41 @@
import pytest

from atst.jobs import RecordEnvironmentFailure, RecordEnvironmentRoleFailure
from tests.factories import EnvironmentFactory, EnvironmentRoleFactory


def test_environment_job_failure(celery_app, celery_worker):
    @celery_app.task(bind=True, base=RecordEnvironmentFailure)
    def _fail_hard(self, environment_id=None):
        raise ValueError("something bad happened")

    environment = EnvironmentFactory.create()
    celery_worker.reload()

    # Use apply instead of delay since we are testing the on_failure hook only
    task = _fail_hard.apply(kwargs={"environment_id": environment.id})

    with pytest.raises(ValueError):
        task.get()

    assert environment.job_failures
    job_failure = environment.job_failures[0]
    assert job_failure.task == task


def test_environment_role_job_failure(celery_app, celery_worker):
    @celery_app.task(bind=True, base=RecordEnvironmentRoleFailure)
    def _fail_hard(self, environment_role_id=None):
        raise ValueError("something bad happened")

    role = EnvironmentRoleFactory.create()
    celery_worker.reload()

    # Use apply instead of delay since we are testing the on_failure hook only
    task = _fail_hard.apply(kwargs={"environment_role_id": role.id})

    with pytest.raises(ValueError):
        task.get()

    assert role.job_failures
    job_failure = role.job_failures[0]
    assert job_failure.task == task