dissertation.md

Performance figures for dissertation

Setup

from collections import namedtuple
from matplotlib import pyplot as plt
import pandas as pd
import pprint

from broker_utils import gcp_utils

import figures as figs


# Text settings shared by every figure produced below.
_text_style = {
    "text.usetex": True,  # hand text rendering to LaTeX
    "mathtext.fontset": "cm",
    "font.family": "STIXGeneral",
    "font.size": 15,
}
plt.rcParams.update(_text_style)

Query for number of nightly ZTF alerts.

# SQL for BigQuery: one output row per Kafka topic, holding the number of
# distinct `candid` values recorded under that topic.
# (Presumably one topic per night, per the heading above -- confirm.)
query = """
    SELECT
      kafka_topic__alerts, COUNT(DISTINCT candid) as num_alerts
    FROM
      `ardent-cycling-243415.ztf_alerts.metadata`
    GROUP BY kafka_topic__alerts
"""

# Run the query, convert the result to a DataFrame, and keep a copy on disk.
query_result = gcp_utils.query_bigquery(query)
daily_counts_df = query_result.to_dataframe()
daily_counts_df.to_csv("daily_counts.dat")

Query for processing times and make plots.

def load_plotter(qdate: str, read_csv: bool = True, save_csv: bool = False, loaddf=None):
    """Build a ``figs.MetadataPlotter`` for one date's alert metadata.

    The dataframe handed to the plotter comes from the first source that applies:

    1. ``loaddf.df`` when ``loaddf`` is supplied;
    2. the cached file ``figures/{qdate}/{qdate}.dat`` when ``read_csv`` is true;
    3. otherwise no dataframe is passed and the plotter receives only the query
       spec (presumably it then queries BigQuery itself -- confirm in ``figures``).

    Args:
        qdate: Date string used in the query spec and in the output paths
            (callers in this file pass ``"YYYYMMDD"``).
        read_csv: Load the dataframe from the cached data file.
        save_csv: After the plotter is built, write its dataframe to the
            cached data file (without the index).
        loaddf: Optional object exposing a ``.df`` attribute (e.g. a plotter
            built earlier) whose dataframe is reused. Takes precedence over
            ``read_csv``.

    Returns:
        The constructed ``figs.MetadataPlotter``.
    """
    import os  # local import: only needed on the save_csv path

    # set all the options
    survey, testid = "ztf", False
    savefig_format = "png"
    querycols = [
        "candid",
        "kafka_timestamp__alerts",
        "publish_time__alerts",
        "publish_time__BigQuery",
        "publish_time__alert_avros",
        # 'publish_time__AllWISE',
        # 'publish_time__alerts_pure',
        "publish_time__exgalac_trans_cf",
        "publish_time__SuperNNova",
    ]
    savefig_dir = f"figures/{qdate}"
    data_file = f"{savefig_dir}/{qdate}.dat"
    kwargs = {
        "query": {
            "survey": survey,
            "testid": testid,
            "date": qdate,
            "columns": querycols,
        },
        "savefig_dir": savefig_dir,
        "savefig_format": savefig_format,
    }

    # specify where to get the dataframe from
    # Compare against None explicitly: the truthiness of an arbitrary object
    # depends on its __bool__/__len__, so a valid plotter could be skipped.
    if loaddf is not None:
        print("Loading from the supplied df")
        kwargs["df"] = loaddf.df
    elif read_csv:
        print("Loading from file")
        # Every column whose name contains "time" is parsed as a datetime.
        time_columns = [col for col in querycols if "time" in col]
        # NOTE(review): infer_datetime_format is deprecated as of pandas 2.0
        # (format inference is the default there); drop it once the pandas
        # version in use is confirmed.
        kwargs["df"] = pd.read_csv(
            data_file,
            parse_dates=time_columns,
            infer_datetime_format=True,
        )
    else:
        print("Querying BigQuery")

    mplots = figs.MetadataPlotter(**kwargs)

    if save_csv:
        # to_csv does not create missing parent directories.
        os.makedirs(savefig_dir, exist_ok=True)
        mplots.df.to_csv(data_file, index=False)

    return mplots


# Build one plotter per query date, keyed by the date's month/day digits.
mplots = {}
for qdate in ("20220401", "20220429"):  # disabled: "20220404", "20210927"
    # mmdd as the key; the int round-trip drops leading zeros ("0401" -> "401")
    k = str(int(qdate[-4:]))
    # reuse the dataframe of a plotter already stored under this key, if any
    mplots[k] = load_plotter(qdate, loaddf=mplots.get(k))

# make figures
plotcols = ["alerts", "BigQuery", "alert_avros", "SuperNNova"]
clip_first = 0
for mp in mplots.values():
    for c in plotcols:
        for tref in ["Trigger"]:  # 'Kafka' disabled
            print(f"Making {c} plot for {mp.query['date']}, tref={tref}")
            stats = mp.make_paper_plot(c, tref)
            pprint.pprint(stats)
            print()


# mplot = figs.MetadataPlotter(df=mplot.df, query=query, savefig_dir=savefig_dir, savefig_format=savefig_format)


# plot processing times with marginal histograms