Statistics #36

Merged
merged 28 commits on May 7, 2020
Commits (28)
e8b6105
Statistics and pre-treatments representations
scholtzan Apr 22, 2020
67605fe
Define some example statistics
scholtzan Apr 22, 2020
23d8bcc
Represent statistic results
scholtzan Apr 23, 2020
0b0e6cb
Fix output format for statistics
scholtzan Apr 23, 2020
2e0e5ca
Move CI calculation to specific statistics
scholtzan Apr 24, 2020
7bbcfb5
Collect CIs from bootstrap_one_branch
tdsmith Apr 24, 2020
83e5b93
Combine statistics and metrics
scholtzan Apr 24, 2020
ebf15ee
mozanalysis dependency
scholtzan Apr 28, 2020
e913ba1
Make statistics configurable
scholtzan Apr 28, 2020
e1aa46b
Update integration test
scholtzan Apr 28, 2020
0ff962c
Cleanup
scholtzan Apr 28, 2020
7622548
Fix types
scholtzan Apr 28, 2020
93cd622
Minor statistics refactoring
scholtzan Apr 28, 2020
ea657c9
Sort imports
tdsmith Apr 29, 2020
e1b9913
Use abstractmethod decorator
tdsmith Apr 29, 2020
c198dc7
Use compare_branches in BootstrapMean
scholtzan Apr 30, 2020
d283552
Update configuration specification for statistics
scholtzan Apr 30, 2020
1d23ee4
Rename MetricWithTreatment to Summary
scholtzan Apr 30, 2020
09829c0
Refactor pre-treatments
scholtzan Apr 30, 2020
a0c49e2
Convert pre-treatment references to class instances
scholtzan Apr 30, 2020
de88c00
Use BigQueryStorageClient for faster to_dataframe export
scholtzan May 1, 2020
9d59ce9
Move pre-treatments to summary
scholtzan May 1, 2020
6bf5c07
Consider ref_branch_label from experimenter
scholtzan May 1, 2020
471a236
Fix types
scholtzan May 1, 2020
056d936
Tests for pre-treatments
scholtzan May 1, 2020
d0320cb
Write statistics in one write operation
scholtzan May 7, 2020
0f8ec2d
raise NotImplementedError
scholtzan May 7, 2020
0851f1e
Merge branch 'master' into statistics
scholtzan May 7, 2020

108 changes: 81 additions & 27 deletions pensieve/analysis.py
@@ -9,12 +9,14 @@
import google.cloud.bigquery.dataset
import google.cloud.bigquery.job
import google.cloud.bigquery.table
from google.cloud import bigquery
from google.cloud.bigquery_storage_v1beta1 import BigQueryStorageClient
import mozanalysis
from mozanalysis.experiment import TimeLimits
from mozanalysis.utils import add_days

from . import AnalysisPeriod
from . import config
from pensieve.config import AnalysisConfiguration


@attr.s(auto_attribs=True)
@@ -25,7 +27,7 @@ class Analysis:

project: str
dataset: str
config: config.AnalysisConfiguration
config: AnalysisConfiguration

def __attrs_post_init__(self):
self.logger = logging.getLogger(__name__)
@@ -128,6 +130,72 @@ def _publish_view(self, window_period: AnalysisPeriod):
)
self.bigquery.execute(sql)

def _calculate_metrics(
self,
exp: mozanalysis.experiment.Experiment,
time_limits: TimeLimits,
period: AnalysisPeriod,
dry_run: bool,
):
"""
Calculate metrics for a specific experiment.
Returns the BigQuery table that results are written to.
"""

window = len(time_limits.analysis_windows)
last_analysis_window = time_limits.analysis_windows[-1]
# TODO: Add this functionality to TimeLimits.
last_window_limits = attr.evolve(
time_limits,
analysis_windows=[last_analysis_window],
first_date_data_required=add_days(
time_limits.first_enrollment_date, last_analysis_window.start
),
)

res_table_name = self._table_name(period.value, window)

sql = exp.build_query(
{m.metric for m in self.config.metrics[period]},
last_window_limits,
"normandy",
self.config.experiment.enrollment_query,
)

self.logger.info("Executing query for %s (%s)", self.config.experiment.slug, period.value)
self.bigquery.execute(sql, res_table_name)
self._publish_view(period)

return res_table_name

def _calculate_statistics(self, metrics_table: str, period: AnalysisPeriod):
"""
Run statistics on metrics.
"""

metrics_data = self.bigquery.table_to_dataframe(metrics_table)
destination_table = f"{self.project}.{self.dataset}.statistics_{metrics_table}"

results = []

for m in self.config.metrics[period]:
results.append(m.run(metrics_data).to_dict())

job_config = bigquery.LoadJobConfig()
job_config.schema = [
bigquery.SchemaField("metric", "STRING"),
bigquery.SchemaField("statistic", "STRING"),
bigquery.SchemaField("parameter", "NUMERIC"),
bigquery.SchemaField("label", "STRING"),
bigquery.SchemaField("ci_width", "FLOAT64"),
bigquery.SchemaField("point", "FLOAT64"),
bigquery.SchemaField("lower", "FLOAT64"),
bigquery.SchemaField("upper", "FLOAT64"),
]
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

self.bigquery.client.load_table_from_json(results, destination_table, job_config=job_config)

def run(self, current_date: datetime, dry_run: bool):
"""
Run analysis using mozanalysis for a specific experiment.
@@ -155,26 +223,6 @@ def run(self, current_date: datetime, dry_run: bool):
start_date=self.config.experiment.start_date.strftime("%Y-%m-%d"),
)

window = len(time_limits.analysis_windows)
last_analysis_window = time_limits.analysis_windows[-1]
# TODO: Add this functionality to TimeLimits.
last_window_limits = attr.evolve(
time_limits,
analysis_windows=[last_analysis_window],
first_date_data_required=add_days(
time_limits.first_enrollment_date, last_analysis_window.start
),
)

res_table_name = self._table_name(period.value, window)

sql = exp.build_query(
self.config.metrics[period],
last_window_limits,
"normandy",
self.config.experiment.enrollment_query,
)

if dry_run:
self.logger.info(
"Not executing query for %s (%s); dry run",
@@ -183,11 +231,8 @@ def run(self, current_date: datetime, dry_run: bool):
)
return

self.logger.info(
"Executing query for %s (%s)", self.config.experiment.slug, period.value
)
self.bigquery.execute(sql, res_table_name)
self._publish_view(period)
metrics_table = self._calculate_metrics(exp, time_limits, period, dry_run)
self._calculate_statistics(metrics_table, period)
self.logger.info(
"Finished running query for %s (%s)", self.config.experiment.slug, period.value
)
@@ -198,12 +243,21 @@ class BigQueryClient:
project: str
dataset: str
_client: Optional[google.cloud.bigquery.client.Client] = None
_storage_client: Optional[BigQueryStorageClient] = None

@property
def client(self):
self._client = self._client or google.cloud.bigquery.client.Client(self.project)
return self._client

def table_to_dataframe(self, table: str):
"""Return all rows of the specified table as a dataframe."""
self._storage_client = self._storage_client or BigQueryStorageClient()

table_ref = self.client.get_table(f"{self.project}.{self.dataset}.{table}")
rows = self.client.list_rows(table_ref)
return rows.to_dataframe(bqstorage_client=self._storage_client)

def execute(self, query: str, destination_table: Optional[str] = None) -> None:
dataset = google.cloud.bigquery.dataset.DatasetReference.from_string(
self.dataset, default_project=self.project,
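For context, the table_to_dataframe helper added to BigQueryClient above can also be exercised on its own. A minimal sketch, assuming placeholder project, dataset, and table names that are not taken from this PR:

# Hypothetical standalone use of BigQueryClient.table_to_dataframe;
# the project, dataset, and table names are placeholders.
from pensieve.analysis import BigQueryClient

client = BigQueryClient(project="my-gcp-project", dataset="my_dataset")
# Reads all rows via the BigQuery Storage API and returns a pandas DataFrame.
metrics_df = client.table_to_dataframe("my_experiment_slug_week_1")
print(metrics_df.head())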
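The main output of this PR is the set of per-statistic rows that _calculate_statistics writes with load_table_from_json. A minimal sketch of what one such row might look like, based only on the schema declared in the diff above; the metric name, statistic name, and numbers are invented for illustration:

# Invented example row matching the statistics schema declared in
# _calculate_statistics (metric, statistic, parameter, label, ci_width,
# point, lower, upper); names and values are illustrative only.
example_results = [
    {
        "metric": "active_hours",
        "statistic": "bootstrap_mean",
        "parameter": None,   # NUMERIC column; null when the statistic takes no parameter
        "label": "treatment",
        "ci_width": 0.95,
        "point": 2.31,
        "lower": 2.12,
        "upper": 2.49,
    }
]
# Per the diff, rows like these are loaded into
# {project}.{dataset}.statistics_<metrics_table> with WRITE_TRUNCATE,
# replacing any previously written statistics for that table.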