Statistics #36

Merged
merged 28 commits on May 7, 2020
Commits (28)
e8b6105
Statistics and pre-treatments representations
scholtzan Apr 22, 2020
67605fe
Define some example statistics
scholtzan Apr 22, 2020
23d8bcc
Represent statistic results
scholtzan Apr 23, 2020
0b0e6cb
Fix output format for statistics
scholtzan Apr 23, 2020
2e0e5ca
Move CI calculation to specific statistics
scholtzan Apr 24, 2020
7bbcfb5
Collect CIs from bootstrap_one_branch
tdsmith Apr 24, 2020
83e5b93
Combine statistics and metrics
scholtzan Apr 24, 2020
ebf15ee
mozanalysis dependency
scholtzan Apr 28, 2020
e913ba1
Make statistics configurable
scholtzan Apr 28, 2020
e1aa46b
Update integration test
scholtzan Apr 28, 2020
0ff962c
Cleanup
scholtzan Apr 28, 2020
7622548
Fix types
scholtzan Apr 28, 2020
93cd622
Minor statistics refactoring
scholtzan Apr 28, 2020
ea657c9
Sort imports
tdsmith Apr 29, 2020
e1b9913
Use abstractmethod decorator
tdsmith Apr 29, 2020
c198dc7
Use compare_branches in BootstrapMean
scholtzan Apr 30, 2020
d283552
Update configuration specification for statistics
scholtzan Apr 30, 2020
1d23ee4
Rename MetricWithTreatment to Summary
scholtzan Apr 30, 2020
09829c0
Refactor pre-treatments
scholtzan Apr 30, 2020
a0c49e2
Convert pre-treatment references to class instances
scholtzan Apr 30, 2020
de88c00
Use BigQueryStorageClient for faster to_dataframe export
scholtzan May 1, 2020
9d59ce9
Move pre-treatments to summary
scholtzan May 1, 2020
6bf5c07
Consider ref_branch_label from experimenter
scholtzan May 1, 2020
471a236
Fix types
scholtzan May 1, 2020
056d936
Tests for pre-treatments
scholtzan May 1, 2020
d0320cb
Write statistics in one write operation
scholtzan May 7, 2020
0f8ec2d
raise NotImplementedError
scholtzan May 7, 2020
0851f1e
Merge branch 'master' into statistics
scholtzan May 7, 2020

108 changes: 81 additions & 27 deletions pensieve/analysis.py
@@ -9,12 +9,14 @@
import google.cloud.bigquery.dataset
import google.cloud.bigquery.job
import google.cloud.bigquery.table
from google.cloud import bigquery
from google.cloud.bigquery_storage_v1beta1 import BigQueryStorageClient
import mozanalysis
from mozanalysis.experiment import TimeLimits
from mozanalysis.utils import add_days

from . import AnalysisPeriod
from . import config
from pensieve.config import AnalysisConfiguration


@attr.s(auto_attribs=True)
@@ -25,7 +27,7 @@ class Analysis:

project: str
dataset: str
config: config.AnalysisConfiguration
config: AnalysisConfiguration

def __attrs_post_init__(self):
self.logger = logging.getLogger(__name__)
@@ -128,6 +130,72 @@ def _publish_view(self, window_period: AnalysisPeriod):
)
self.bigquery.execute(sql)

def _calculate_metrics(
self,
exp: mozanalysis.experiment.Experiment,
time_limits: TimeLimits,
period: AnalysisPeriod,
dry_run: bool,
):
"""
Calculate metrics for a specific experiment.
Returns the BigQuery table that results are written to.
"""

window = len(time_limits.analysis_windows)
last_analysis_window = time_limits.analysis_windows[-1]
# TODO: Add this functionality to TimeLimits.
last_window_limits = attr.evolve(
time_limits,
analysis_windows=[last_analysis_window],
first_date_data_required=add_days(
time_limits.first_enrollment_date, last_analysis_window.start
),
)

res_table_name = self._table_name(period.value, window)

sql = exp.build_query(
{m.metric for m in self.config.metrics[period]},
last_window_limits,
"normandy",
self.config.experiment.enrollment_query,
)

self.logger.info("Executing query for %s (%s)", self.config.experiment.slug, period.value)
self.bigquery.execute(sql, res_table_name)
self._publish_view(period)

return res_table_name

def _calculate_statistics(self, metrics_table: str, period: AnalysisPeriod):
"""
Run statistics on metrics.
"""

metrics_data = self.bigquery.table_to_dataframe(metrics_table)
destination_table = f"{self.project}.{self.dataset}.statistics_{metrics_table}"

results = []

for m in self.config.metrics[period]:
results.append(m.run(metrics_data).to_dict())

job_config = bigquery.LoadJobConfig()
job_config.schema = [
bigquery.SchemaField("metric", "STRING"),
bigquery.SchemaField("statistic", "STRING"),
bigquery.SchemaField("parameter", "NUMERIC"),
bigquery.SchemaField("label", "STRING"),
bigquery.SchemaField("ci_width", "FLOAT64"),
bigquery.SchemaField("point", "FLOAT64"),
bigquery.SchemaField("lower", "FLOAT64"),
bigquery.SchemaField("upper", "FLOAT64"),
]
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

self.bigquery.client.load_table_from_json(results, destination_table, job_config=job_config)

def run(self, current_date: datetime, dry_run: bool):
"""
Run analysis using mozanalysis for a specific experiment.
@@ -155,26 +223,6 @@ def run(self, current_date: datetime, dry_run: bool):
start_date=self.config.experiment.start_date.strftime("%Y-%m-%d"),
)

window = len(time_limits.analysis_windows)
last_analysis_window = time_limits.analysis_windows[-1]
# TODO: Add this functionality to TimeLimits.
last_window_limits = attr.evolve(
time_limits,
analysis_windows=[last_analysis_window],
first_date_data_required=add_days(
time_limits.first_enrollment_date, last_analysis_window.start
),
)

res_table_name = self._table_name(period.value, window)

sql = exp.build_query(
self.config.metrics[period],
last_window_limits,
"normandy",
self.config.experiment.enrollment_query,
)

if dry_run:
self.logger.info(
"Not executing query for %s (%s); dry run",
@@ -183,11 +231,8 @@ def run(self, current_date: datetime, dry_run: bool):
)
return

self.logger.info(
"Executing query for %s (%s)", self.config.experiment.slug, period.value
)
self.bigquery.execute(sql, res_table_name)
self._publish_view(period)
metrics_table = self._calculate_metrics(exp, time_limits, period, dry_run)
self._calculate_statistics(metrics_table, period)
self.logger.info(
"Finished running query for %s (%s)", self.config.experiment.slug, period.value
)
@@ -198,12 +243,21 @@ class BigQueryClient:
project: str
dataset: str
_client: Optional[google.cloud.bigquery.client.Client] = None
_storage_client: Optional[BigQueryStorageClient] = None

@property
def client(self):
self._client = self._client or google.cloud.bigquery.client.Client(self.project)
return self._client

def table_to_dataframe(self, table: str):
"""Return all rows of the specified table as a dataframe."""
self._storage_client = self._storage_client or BigQueryStorageClient()

table_ref = self.client.get_table(f"{self.project}.{self.dataset}.{table}")
rows = self.client.list_rows(table_ref)
return rows.to_dataframe(bqstorage_client=self._storage_client)

def execute(self, query: str, destination_table: Optional[str] = None) -> None:
dataset = google.cloud.bigquery.dataset.DatasetReference.from_string(
self.dataset, default_project=self.project,
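For context, the table_to_dataframe helper added to BigQueryClient above can also be exercised on its own. A minimal sketch, assuming placeholder project, dataset, and table names that are not taken from this PR:

# Hypothetical standalone use of BigQueryClient.table_to_dataframe;
# the project, dataset, and table names are placeholders.
from pensieve.analysis import BigQueryClient

client = BigQueryClient(project="my-gcp-project", dataset="my_dataset")
# Reads all rows via the BigQuery Storage API and returns a pandas DataFrame.
metrics_df = client.table_to_dataframe("my_experiment_slug_week_1")
print(metrics_df.head())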
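The main output of this PR is the set of per-statistic rows that _calculate_statistics writes with load_table_from_json. A minimal sketch of what one such row might look like, based only on the schema declared in the diff above; the metric name, statistic name, and numbers are invented for illustration:

# Invented example row matching the statistics schema declared in
# _calculate_statistics (metric, statistic, parameter, label, ci_width,
# point, lower, upper); names and values are illustrative only.
example_results = [
    {
        "metric": "active_hours",
        "statistic": "bootstrap_mean",
        "parameter": None,   # NUMERIC column; null when the statistic takes no parameter
        "label": "treatment",
        "ci_width": 0.95,
        "point": 2.31,
        "lower": 2.12,
        "upper": 2.49,
    }
]
# Per the diff, rows like these are loaded into
# {project}.{dataset}.statistics_<metrics_table> with WRITE_TRUNCATE,
# replacing any previously written statistics for that table.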