# billing: pitt-google-broker-billing

- [x] Export Cloud Billing data to BigQuery
- [x] Pull some data and create figures
  - [ ] annotate figures with the date range the data comes from
  - [x] ~~increase decimal precision for cost annotations showing zero. actual zeros have already been dropped~~ drop costs below some threshold
  - [x] specify that "per million alerts" means the number of ZTF alerts entering our system
  - [ ] rewrite `transforms._shorten_sku()` to use regex instead of trying to manually map every sku; there are too many (see the sketch after this list)
  - [ ] move the Compute Engine PD storage to the mean-cost-per-day figure
  - [ ] calculate the storage cost per number of alerts in storage
- [ ] Write the table joins and pivots directly into the SQL queries
- [ ] Create a figure showing the total cost of the project over time, from the beginning
- Billing project:
  - Name: pitt-google-broker-billing
  - ID: light-cycle-328823
- Billing account ID: <obtain this from https://console.cloud.google.com/billing/projects>
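The `_shorten_sku()` rewrite could strip boilerplate with a small pattern list instead of mapping every sku. A minimal sketch, assuming the function maps one SKU description string to a shortened string (the patterns below are illustrative, not tuned against the real SKUs):

```python
import re

# illustrative patterns only; the real SKU descriptions would drive the final list
_SKU_PATTERNS = [
    (re.compile(r"\s*\(.*\)$"), ""),             # drop parenthetical qualifiers
    (re.compile(r"\s+(in|from|to)\s+.*$"), ""),  # drop region/location suffixes
]

def _shorten_sku(sku: str) -> str:
    """Shorten a SKU description by stripping boilerplate rather than enumerating SKUs."""
    for pattern, repl in _SKU_PATTERNS:
        sku = pattern.sub(repl, sku)
    return sku.strip()
```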

## Setup: Set environment variables

```bash
# project ids, used to create/manage service accounts and to make queries on the tables
export GOOGLE_CLOUD_PROJECT="light-cycle-328823"        # billing project
export GOOGLE_CLOUD_PROJECT2="ardent-cycling-243415"    # production project

# set the following to a local path where the billing service account credentials
# key file will be downloaded to (upon creation) and/or accessed from (queries)
export GOOGLE_APPLICATION_CREDENTIALS="<path/to/credentials/keyfile.json>"
# export GOOGLE_APPLICATION_CREDENTIALS="/Users/troyraen/Documents/broker/Pitt-Google/repo/GCP_auth_key-broker_billing_bigquery.json"
```

If you need to create a service account, set these additional variables, then follow the link below.

```bash
# billing service account that will be used to make queries
SERVICE_ACCOUNT_NAME="myaccount-bigquery"
# SERVICE_ACCOUNT_NAME="tjraen-bigquery"
SERVICE_ACCOUNT="${SERVICE_ACCOUNT_NAME}@${GOOGLE_CLOUD_PROJECT}.iam.gserviceaccount.com"
ROLE="roles/bigquery.resourceViewer"
```

Set up a service account: [service-account.md](../service-account.md)

This service account will be able to query the production dataset/table because those resources are publicly accessible.
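The linked doc has the full walkthrough. For quick reference, the core `gcloud` commands presumably look roughly like this (a sketch using the variables above, not a substitute for the walkthrough):

```bash
# create the service account in the billing project
gcloud iam service-accounts create "$SERVICE_ACCOUNT_NAME" --project="$GOOGLE_CLOUD_PROJECT"

# grant it the BigQuery role set above
gcloud projects add-iam-policy-binding "$GOOGLE_CLOUD_PROJECT" \
    --member="serviceAccount:${SERVICE_ACCOUNT}" \
    --role="$ROLE"

# download a key file to the path in GOOGLE_APPLICATION_CREDENTIALS
gcloud iam service-accounts keys create "$GOOGLE_APPLICATION_CREDENTIALS" \
    --iam-account="$SERVICE_ACCOUNT"
```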

## Setup for queries and plotting

```bash
# navigate to this directory (replace with your path)
cd /Users/troyraen/Documents/broker/Pitt-Google/troy/docs/source/working-notes/troyraen/billing

# activate the billing service account, if it's not already:
gcloud auth activate-service-account \
    --project="$GOOGLE_CLOUD_PROJECT" \
    --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
```

```python
# if pyarrow gives trouble, pin it (run these in the shell):
# pip install --upgrade pyarrow
# pip uninstall pyarrow
# pip install pyarrow==0.17.1
import datetime
import importlib as imp
import os

import pandas as pd
from matplotlib import pyplot as plt

from broker_utils import gcp_utils
from custom_classes import loaded_data
import figures
import queries
import transforms

billing_project_id = os.getenv('GOOGLE_CLOUD_PROJECT', 'light-cycle-328823')
prod_project_id = os.getenv('GOOGLE_CLOUD_PROJECT2', 'ardent-cycling-243415')
```
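`custom_classes.loaded_data` is repo-internal; since it unpacks into three objects below, it is presumably a simple bundle along these lines (an assumed sketch, not the actual class):

```python
from typing import NamedTuple

# assumed shape of custom_classes.loaded_data; the real class may differ
class loaded_data(NamedTuple):
    countdf: pd.DataFrame               # alert counts per date
    litebilldf: pd.DataFrame            # lightened billing rows
    litebill_ispipelinesku: pd.Series   # boolean mask: row is a live-pipeline SKU
```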

## Query tables or load dataframes from files

```python
# load data from file
f = 'billing_20211203.csv'  # all data in the table as of 12/3/2021. big, so not pushing to git.
ldata = queries.load_countdf_litebilldf_from_file(f)        # load data
countdf, litebilldf, litebill_ispipelinesku = ldata         # unpack dataframes for convenience


# ALTERNATELY, run new queries to load data:

# query the billing table
lookback = 180
query, job_config = queries.billing(lookback=lookback)
billdf = gcp_utils.query_bigquery(
    query, job_config=job_config, project_id=billing_project_id
).to_dataframe()

# create a lightened billdf containing only the columns we want to look at
lite_df_cols = ['project_id', 'service', 'sku', 'cost', 'usage_date']
litebilldf = billdf.loc[:, lite_df_cols]
# boolean mask marking rows whose sku belongs to the live pipeline
litebill_ispipelinesku = litebilldf.apply(transforms.is_pipeline_sku, axis=1)

# query the ztf metadata table for alert counts
query, job_config = queries.count_metadata_by_date(lookback=lookback)
countdf = gcp_utils.query_bigquery(
    query, job_config=job_config, project_id=billing_project_id
).to_dataframe()

# create a loaded_data object for convenience
ldata = loaded_data(countdf, litebilldf, litebill_ispipelinesku)
```
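`queries.billing()` assembles the SQL. As a rough sketch (the dataset/table name below is hypothetical, but the column paths are the standard Cloud Billing BigQuery export schema), the query presumably looks something like this; per the to-do list above, the joins and pivots could eventually be written directly into it:

```python
# sketch only: replace the table name with the real billing export table
sketch_query = """
    SELECT
        project.id AS project_id,
        service.description AS service,
        sku.description AS sku,
        cost,
        DATE(usage_start_time) AS usage_date
    FROM `light-cycle-328823.billing.gcp_billing_export_v1_XXXXXX`
    WHERE DATE(usage_start_time) >= DATE_SUB(CURRENT_DATE(), INTERVAL @lookback DAY)
"""
```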

## Look at countdf

```python
countdf = countdf.set_index('publish_date').sort_index()

# look at the data
countdf.plot.bar()
plt.show(block=False)
```

## Live pipeline: Bar chart of average cost per sku per million alerts, colored by service

```python
# keep rows where the project and sku => live pipeline
mylite_df_indexes = litebill_ispipelinesku & (litebilldf["project_id"] == prod_project_id)
mylitebilldf = litebilldf.loc[mylite_df_indexes]

# remove dates where abnormal things happened that affected costs
remove_dates = [
    # ZTF dumped alerts at LSST rates
    datetime.date(2021, 9, 23),
    # AllWISE crossmatch queried BigQuery for every alert
    datetime.date(2021, 9, 26),
    datetime.date(2021, 9, 27),
    datetime.date(2021, 9, 28),
]
mylitebilldf = mylitebilldf.loc[~mylitebilldf.usage_date.isin(remove_dates)]

# keep only dates with num_alerts > 0
mylitebilldf = mylitebilldf.set_index('usage_date').sort_index()
commondates = set(mylitebilldf.index).intersection(set(countdf.index))
indexes_to_keep = countdf.loc[(countdf.num_alerts > 0) & (countdf.index.isin(commondates))].index
mylitebilldf = mylitebilldf.loc[indexes_to_keep]

# sum by sku and date
costdf_bydate = transforms.cost_by_sku(mylitebilldf.reset_index(), how='sum', bydate=True)

# calculate cost per million alerts
cost_per_mil = 'cost_per_million_alerts_ingested'
costdf_bydate.set_index('usage_date', inplace=True)
costdf_bydate[cost_per_mil] = costdf_bydate.cost / countdf.loc[indexes_to_keep].num_alerts * 1e6

# get the average cost per million alerts
costdf = transforms.cost_by_sku(costdf_bydate, cost=cost_per_mil, how='mean')
# keep only significant costs
min_cost_per_mil = 0.10
costdf = costdf.loc[costdf[cost_per_mil] > min_cost_per_mil]

save = 'billing_per_sku_per_million_alerts_ingested.png'
title = f"Mean cost per million alerts ingested under normal conditions (pipeline SKUs, >${min_cost_per_mil:.2f})"
figures.plot_cost_by_sku(costdf, save=save, cost=cost_per_mil, title=title)
```
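`transforms.cost_by_sku` is repo-internal. Judging from how it is called here (group by sku, optionally also by date, and aggregate the cost column with `how`), a presumed sketch is:

```python
# assumed behavior, not the actual transforms.cost_by_sku
def cost_by_sku_sketch(df, cost='cost', how='sum', bydate=False):
    group_cols = ['service', 'sku'] + (['usage_date'] if bydate else [])
    return df.groupby(group_cols)[cost].agg(how).reset_index()
```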
![billing_per_sku_per_million_alerts_ingested.png](billing_per_sku_per_million_alerts_ingested.png)
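`figures.plot_cost_by_sku` produces the chart above. For readers without the repo, a minimal sketch of a chart like it (assuming `costdf` carries `service`, `sku`, and the cost column) might be:

```python
# a minimal sketch, not the actual figures.plot_cost_by_sku implementation
def plot_cost_by_sku_sketch(costdf, cost='cost', title='', save=None):
    fig, ax = plt.subplots(figsize=(13, 8))
    # one bar per sku, colored by the service it belongs to
    for service, group in costdf.groupby('service'):
        ax.bar(group['sku'], group[cost], label=service)
    ax.set_ylabel(cost)
    ax.set_title(title)
    ax.legend()
    ax.tick_params(axis='x', labelrotation=90)
    fig.tight_layout()
    if save is not None:
        fig.savefig(save)
    plt.show(block=False)
```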

## Not live pipeline: Bar chart of average cost per sku per day, colored by service

```python
# keep rows where the sku => ~(live pipeline)
mylitebilldf = litebilldf.loc[~litebill_ispipelinesku]

# sum by sku and date, then take the mean
costdf_bydate = transforms.cost_by_sku(mylitebilldf.reset_index(), how='sum', bydate=True)
costdf = transforms.cost_by_sku(costdf_bydate, how='mean')
cost_per_day = "cost_per_day"
costdf[cost_per_day] = costdf["cost"]

# keep only significant costs
min_cost = 0.01
costdf = costdf.loc[costdf[cost_per_day] > min_cost]

save = 'billing_per_sku_per_day.png'
title = f"Mean cost per day (non-pipeline SKUs, >${min_cost:.2f})"
figures.plot_cost_by_sku(costdf, cost=cost_per_day, save=save, title=title)
```
![billing_per_sku_per_day.png](billing_per_sku_per_day.png)

## Live pipeline, Cloud Run: Bar chart of average cost per sku per million alerts, colored by service

```python
# keep rows where the project and sku => live pipeline, and the service is Cloud Run
mylite_df_indexes = (
    litebill_ispipelinesku
    & (litebilldf["project_id"] == prod_project_id)
    & (litebilldf["service"] == "Cloud Run")
)
mylitebilldf = litebilldf.loc[mylite_df_indexes]

# keep only dates with num_alerts > 0
mylitebilldf = mylitebilldf.set_index('usage_date').sort_index()
commondates = set(mylitebilldf.index).intersection(set(countdf.index))
indexes_to_keep = countdf.loc[(countdf.num_alerts > 0) & (countdf.index.isin(commondates))].index
mylitebilldf = mylitebilldf.loc[indexes_to_keep]

# sum by sku and date
costdf_bydate = transforms.cost_by_sku(mylitebilldf.reset_index(), how='sum', bydate=True)

# calculate cost per million alerts
costdf_bydate.set_index('usage_date', inplace=True)
costdf_bydate['cost_per_million_alerts'] = costdf_bydate.cost / countdf.loc[indexes_to_keep].num_alerts * 1e6

# get the average cost per million alerts
costdf = transforms.cost_by_sku(costdf_bydate, cost='cost_per_million_alerts', how='mean')
# keep only nonzero costs
costdf = costdf.loc[costdf.cost_per_million_alerts > 0]

save = 'billing_per_sku_per_million_alerts_cloud_run.png'
title = "Mean cost per million alerts"
figures.plot_cost_by_sku(costdf, save=save, cost='cost_per_million_alerts', title=title)
```
![billing_per_sku_per_million_alerts_cloud_run.png](billing_per_sku_per_million_alerts_cloud_run.png)

## Stacked (services) bar chart of cost vs date

```python
# sum cost by date and service, then pivot services into columns
mycostdf_bydateservice = litebilldf.groupby(['usage_date', 'service'])['cost'].sum().reset_index()
mycostdf_bydate = mycostdf_bydateservice.pivot(index='usage_date', columns='service', values='cost')
mycostdf_bydate.fillna(0, inplace=True)

# cost bar chart
fig = plt.figure(figsize=(13, 8))
ax = fig.gca()
mycostdf_bydate.plot.bar(stacked=True, ax=ax)

# annotate each bar with the number of alerts received that day
sum_day_cost = mycostdf_bydate.sum(axis=1)  # height of each stacked bar
y_offset = 5  # put space between bar and text
labels = tuple(f"{countdf.num_alerts.get(date, 0)} alerts" for date in mycostdf_bydate.index)
for i, (label, total) in enumerate(zip(labels, sum_day_cost)):
    ax.text(i, total + y_offset, label, ha='center', rotation='vertical')

plt.ylabel("Cost")
plt.savefig('billing_per_day_and_service.png')
plt.show(block=False)
```
![billing_per_day_and_service.png](billing_per_day_and_service.png)

## Bar chart of total cost per sku, colored by service

```python
day = datetime.date(2021, 12, 2)  # used by the single-day selection below, if uncommented
cols = ['service', 'sku', 'cost']
# litebilldf = billdf.loc[(billdf['usage_date'] == day) & (billdf['project_id'] == prod_project_id), cols]
litebilldf = billdf.loc[billdf['project_id'] == prod_project_id, cols]

# sum costs, clean, and sort
costdf = transforms.sum_cost_by_sku(litebilldf)

save = 'billing_per_sku.png'
figures.plot_cost_by_sku(costdf, save=save)
```
![billing_per_sku.png](billing_per_sku.png)

## Bar chart of cost per day, stacked by service

```python
# plot
my_project = prod_project_id
gb = ['usage_date', 'service']
df = billdf.loc[billdf['project_id'] == my_project, gb + ['cost']].groupby(gb)['cost'].sum()
df = df.reset_index().pivot(index='usage_date', columns='service', values='cost')
df = df[df > 1.0].dropna(axis=1, how='all')  # drop services that never exceed $1.00/day

# per_alert was undefined in the original notes; assumed definition: each service's
# total cost divided by the total alert count over the period
per_alert = df.sum() / countdf.num_alerts.sum()

def rename_column(colname):
    return f"{colname} (${per_alert[colname]:.3e}/alert)"

df = df.rename(rename_column, axis=1)
df.plot.bar(stacked=True)
plt.savefig('billing_per_day.png')
plt.show(block=False)
```
![billing_per_day.png](billing_per_day.png)