Monitor backups with PagerDuty hook integration (#245).
This commit is contained in:
parent
e76d5ad988
commit
bc02c123e6
9 changed files with 146 additions and 9 deletions
2
NEWS
2
NEWS
|
@ -1,4 +1,6 @@
|
|||
1.5.0
|
||||
* #245: Monitor backups with PagerDuty hook integration. See the documentation for more
|
||||
information: https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook
|
||||
* #255: Add per-action hooks: "before_prune", "after_prune", "before_check", and "after_check".
|
||||
* #274: Add ~/.config/borgmatic.d as another configuration directory default.
|
||||
* #277: Customize Healthchecks log level via borgmatic "--monitoring-verbosity" flag.
|
||||
|
|
|
@ -66,6 +66,7 @@ borgmatic is powered by [Borg Backup](https://www.borgbackup.org/).
|
|||
<a href="https://healthchecks.io/"><img src="docs/static/healthchecks.png" alt="Healthchecks" height="60px" style="margin-bottom:20px;"></a>
|
||||
<a href="https://cronitor.io/"><img src="docs/static/cronitor.png" alt="Cronitor" height="60px" style="margin-bottom:20px;"></a>
|
||||
<a href="https://cronhub.io/"><img src="docs/static/cronhub.png" alt="Cronhub" height="60px" style="margin-bottom:20px;"></a>
|
||||
<a href="https://www.pagerduty.com/"><img src="docs/static/pagerduty.png" alt="PagerDuty" height="60px" style="margin-bottom:20px;"></a>
|
||||
<a href="https://www.rsync.net/cgi-bin/borg.cgi?campaign=borg&adgroup=borgmatic"><img src="docs/static/rsyncnet.png" alt="rsync.net" height="60px" style="margin-bottom:20px;"></a>
|
||||
<a href="https://www.borgbase.com/?utm_source=borgmatic"><img src="docs/static/borgbase.png" alt="BorgBase" height="60px" style="margin-bottom:20px;"></a>
|
||||
|
||||
|
|
|
@ -567,6 +567,15 @@ map:
|
|||
for details.
|
||||
example:
|
||||
https://cronitor.link/d3x0c1
|
||||
pagerduty:
|
||||
type: str
|
||||
desc: |
|
||||
PagerDuty integration key used to notify PagerDuty when a backup errors. Create
|
||||
an account at https://www.pagerduty.com/ if you'd like to use this service. See
|
||||
https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook
|
||||
for details.
|
||||
example:
|
||||
a177cad45bd374409f78906a810a3074
|
||||
cronhub:
|
||||
type: str
|
||||
desc: |
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import logging
|
||||
|
||||
from borgmatic.hooks import cronhub, cronitor, healthchecks, mysql, postgresql
|
||||
from borgmatic.hooks import cronhub, cronitor, healthchecks, mysql, pagerduty, postgresql
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -8,6 +8,7 @@ HOOK_NAME_TO_MODULE = {
|
|||
'healthchecks': healthchecks,
|
||||
'cronitor': cronitor,
|
||||
'cronhub': cronhub,
|
||||
'pagerduty': pagerduty,
|
||||
'postgresql_databases': postgresql,
|
||||
'mysql_databases': mysql,
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from enum import Enum
|
||||
|
||||
MONITOR_HOOK_NAMES = ('healthchecks', 'cronitor', 'cronhub')
|
||||
MONITOR_HOOK_NAMES = ('healthchecks', 'cronitor', 'cronhub', 'pagerduty')
|
||||
|
||||
|
||||
class State(Enum):
|
||||
|
|
62
borgmatic/hooks/pagerduty.py
Normal file
62
borgmatic/hooks/pagerduty.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import platform
|
||||
|
||||
import requests
|
||||
|
||||
from borgmatic.hooks import monitor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# PagerDuty Events API (v2) endpoint that ping_monitor() posts "trigger" events to.
EVENTS_API_URL = 'https://events.pagerduty.com/v2/enqueue'
|
||||
|
||||
|
||||
def ping_monitor(integration_key, config_filename, state, monitoring_log_level, dry_run):
    '''
    If this is an error state, create a PagerDuty event with the given integration key. Use the
    given configuration filename in any log entries. If this is a dry run, then don't actually
    create an event.

    Any state other than failure is ignored, because this hook only reports errors. The monitoring
    log level is accepted but not used by this hook.

    A failure to reach PagerDuty is logged as a warning instead of being raised, so that a
    monitoring problem does not mask the backup error that is being reported.
    '''
    if state != monitor.State.FAIL:
        logger.debug(
            '{}: Ignoring unsupported monitoring {} in PagerDuty hook'.format(
                config_filename, state.name.lower()
            )
        )
        return

    # The label carries its own leading space, so the format string must not add another one.
    dry_run_label = ' (dry run; not actually sending)' if dry_run else ''
    logger.info('{}: Sending failure event to PagerDuty{}'.format(config_filename, dry_run_label))

    if dry_run:
        return

    hostname = platform.node()
    # Take an aware UTC "now" and convert it to the local zone. datetime.utcnow() is deprecated
    # and returns a naive value that then has to be patched with a tzinfo.
    local_timestamp = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
    payload = json.dumps(
        {
            'routing_key': integration_key,
            'event_action': 'trigger',
            'payload': {
                'summary': 'backup failed on {}'.format(hostname),
                'severity': 'error',
                'source': hostname,
                'timestamp': local_timestamp,
                'component': 'borgmatic',
                'group': 'backups',
                'class': 'backup failure',
                'custom_details': {
                    'hostname': hostname,
                    'configuration filename': config_filename,
                    'server time': local_timestamp,
                },
            },
        }
    )
    logger.debug('{}: Using PagerDuty payload: {}'.format(config_filename, payload))

    # Without a timeout, requests waits forever on an unresponsive server and would hang the run.
    request_timeout_seconds = 10

    logging.getLogger('urllib3').setLevel(logging.ERROR)
    try:
        requests.post(
            EVENTS_API_URL, data=payload.encode('utf-8'), timeout=request_timeout_seconds
        )
    except requests.exceptions.RequestException as error:
        logger.warning('{}: PagerDuty error: {}'.format(config_filename, error))
|
|
@ -28,14 +28,15 @@ hooks](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#error-hoo
|
|||
below for how to configure this.
|
||||
4. **borgmatic monitoring hooks**: This feature integrates with monitoring
|
||||
services like [Healthchecks](https://healthchecks.io/),
|
||||
[Cronitor](https://cronitor.io), and [Cronhub](https://cronhub.io), and pings
|
||||
these services whenever borgmatic runs. That way, you'll receive an alert when
|
||||
something goes wrong or the service doesn't hear from borgmatic for a
|
||||
configured interval. See
|
||||
[Healthchecks
|
||||
[Cronitor](https://cronitor.io), [Cronhub](https://cronhub.io), and
|
||||
[PagerDuty](https://www.pagerduty.com/), and pings these services whenever
|
||||
borgmatic runs. That way, you'll receive an alert when something goes wrong or
|
||||
(for certain hooks) the service doesn't hear from borgmatic for a configured
|
||||
interval. See [Healthchecks
|
||||
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook), [Cronitor
|
||||
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), and [Cronhub
|
||||
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook)
|
||||
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), [Cronhub
|
||||
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook), and
|
||||
[PagerDuty hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook)
|
||||
below for how to configure this.
|
||||
3. **Third-party monitoring software**: You can use traditional monitoring
|
||||
software to consume borgmatic JSON output and track when the last
|
||||
|
@ -200,6 +201,32 @@ mechanisms](https://docs.cronhub.io/integrations.html) when backups fail
|
|||
or it doesn't hear from borgmatic for a certain period of time.
|
||||
|
||||
|
||||
## PagerDuty hook
|
||||
|
||||
[PagerDuty](https://www.pagerduty.com/) provides incident monitoring and alerting,
|
||||
and borgmatic has built-in integration with it. Once you create a PagerDuty
|
||||
account and <a
|
||||
href="https://support.pagerduty.com/docs/services-and-integrations">service</a>
|
||||
on their site, all you need to do is configure borgmatic with the unique
|
||||
"Integration Key" for your service. Here's an example:
|
||||
|
||||
|
||||
```yaml
|
||||
hooks:
|
||||
pagerduty: a177cad45bd374409f78906a810a3074
|
||||
```
|
||||
|
||||
With this hook in place, borgmatic creates a PagerDuty event for your service
|
||||
whenever backups fail. Specifically, if an error occurs during a `create`,
|
||||
`prune`, or `check` action, borgmatic sends an event to PagerDuty after the
|
||||
`on_error` hooks run. Note that borgmatic does not contact PagerDuty when a
|
||||
backup starts or ends without error.
|
||||
|
||||
You can configure PagerDuty to notify you by a [variety of
|
||||
mechanisms](https://support.pagerduty.com/docs/notifications) when backups
|
||||
fail.
|
||||
|
||||
|
||||
## Scripting borgmatic
|
||||
|
||||
To consume the output of borgmatic in other software, you can include an
|
||||
|
|
BIN
docs/static/pagerduty.png
vendored
Normal file
BIN
docs/static/pagerduty.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
35
tests/unit/hooks/test_pagerduty.py
Normal file
35
tests/unit/hooks/test_pagerduty.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
from flexmock import flexmock
|
||||
|
||||
from borgmatic.hooks import pagerduty as module
|
||||
|
||||
|
||||
def test_ping_monitor_ignores_start_state():
    '''
    A start state is not a failure, so the hook must not issue any HTTP request.
    '''
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        integration_key='abc123',
        config_filename='config.yaml',
        state=module.monitor.State.START,
        monitoring_log_level=1,
        dry_run=False,
    )
|
||||
|
||||
|
||||
def test_ping_monitor_ignores_finish_state():
    '''
    A finish state is not a failure, so the hook must not issue any HTTP request.
    '''
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        integration_key='abc123',
        config_filename='config.yaml',
        state=module.monitor.State.FINISH,
        monitoring_log_level=1,
        dry_run=False,
    )
|
||||
|
||||
|
||||
def test_ping_monitor_calls_api_for_fail_state():
    '''
    A fail state outside of a dry run must result in exactly one request to PagerDuty.
    '''
    # Without a call-count modifier the expectation is only a stub, and the test would pass even
    # if the hook never called requests.post(). once() makes flexmock verify the call happened.
    flexmock(module.requests).should_receive('post').once()

    module.ping_monitor(
        'abc123', 'config.yaml', module.monitor.State.FAIL, monitoring_log_level=1, dry_run=False
    )
|
||||
|
||||
|
||||
def test_ping_monitor_dry_run_does_not_call_api():
    '''
    Even for a fail state, a dry run must not issue any HTTP request.
    '''
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        integration_key='abc123',
        config_filename='config.yaml',
        state=module.monitor.State.FAIL,
        monitoring_log_level=1,
        dry_run=True,
    )
|
Loading…
Reference in a new issue