
Commit bf60f4b

Fix output format for statistics
1 parent a0fe8f0 commit bf60f4b

2 files changed: +82 -35 lines changed


pensieve/analysis.py

Lines changed: 2 additions & 3 deletions

@@ -169,12 +169,11 @@ def _calculate_statistics(self, metrics_table: str):
         """
 
         metrics_data = self.bigquery.table_to_dataframe(metrics_table)
-        biguqery_client = self.bigquery.client
-        destination_table = f"statistics_{metrics_table}"
+        destination_table = f"{self.project}.{self.dataset}.statistics_{metrics_table}"
 
         for statistic in self.STANDARD_STATISTICS:
             statistic.apply(metrics_data).save_to_bigquery(
-                biguqery_client, destination_table, append=True
+                self.bigquery.client, destination_table, append=True
            )
 
     def run(self, current_date: datetime, dry_run: bool):
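
The destination table now carries a full project.dataset.table path instead of a bare name, so the load job can resolve it without a default dataset configured on the client. A minimal sketch of the difference, using hypothetical project and dataset values (the real values come from the analysis configuration):

# Hypothetical values, for illustration only.
project = "my-gcp-project"
dataset = "pensieve_output"
metrics_table = "experiment_slug_week_1"

# Before: a bare table name, resolvable only with a default dataset on the client.
destination_table = f"statistics_{metrics_table}"

# After: a fully qualified table ID that load jobs can target directly.
destination_table = f"{project}.{dataset}.statistics_{metrics_table}"
print(destination_table)  # my-gcp-project.pensieve_output.statistics_experiment_slug_week_1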

pensieve/statistics.py

Lines changed: 80 additions & 32 deletions

@@ -1,10 +1,12 @@
 import attr
 import cattr
-import mozanalysis
+import mozanalysis.bayesian_stats.bayesian_bootstrap as mabsbb
 from google.cloud import bigquery
+from google.api_core.exceptions import NotFound
 from typing import Callable, Any, Dict, List, Tuple, Optional
 from pandas import DataFrame
 import pandas
+import numpy as np
 
 from pensieve.pre_treatment import PreTreatment, RemoveNulls
 
@@ -20,10 +22,41 @@ class StatisticResult:
     statistic: str
     parameter: float
     label: str
-    ci_width: Optional[float]
-    point: Optional[float]
-    lower: Optional[float]
-    upper: Optional[float]
+    ci_width: Optional[float] = 0.0
+    point: Optional[float] = 0.0
+    lower: Optional[float] = 0.0
+    upper: Optional[float] = 0.0
+
+    def with_ci(self, data: DataFrame, t: float, confidence_level: float) -> "StatisticResult":
+        """Calculate the confidence interval and update result."""
+        confidence_margin = 0.5 * (1.0 - confidence_level)
+        confidence_high = (0.0 + confidence_margin) * 100
+        confidence_low = (1.0 - confidence_margin) * 100
+        self.lower = t - np.percentile(data - t, confidence_low)
+        self.upper = t - np.percentile(data - t, confidence_high)
+        self.ci_width = confidence_level
+        self.point = t
+        return self
+
+    def with_point(self, point_value: float) -> "StatisticResult":
+        """Set provided value as point value result for statistic."""
+        self.point = point_value
+        return self
+
+
+@attr.s(auto_attribs=True)
+class StatisticResultCollection:
+    """
+    Represents a set of statistics result data.
+    """
+
+    data: List[StatisticResult] = []
+
+    def append(self, result: StatisticResult):
+        self.data.append(result)
+
+    def merge(self, result_collection: "StatisticResultCollection"):
+        self.data = self.data + result_collection.data
 
     def save_to_bigquery(self, client, destination_table, append=True):
         """Stores the data to a BigQuery table with a defined schema."""
@@ -32,20 +65,25 @@ def save_to_bigquery(self, client, destination_table, append=True):
         job_config.schema = [
             bigquery.SchemaField("metric", "STRING"),
             bigquery.SchemaField("statistic", "STRING"),
-            bigquery.SchemaField("parameter", "FLOAT"),
+            bigquery.SchemaField("parameter", "FLOAT64"),
             bigquery.SchemaField("label", "STRING"),
-            bigquery.SchemaField("ci_width", "FLOAT"),
-            bigquery.SchemaField("point", "FLOAT"),
-            bigquery.SchemaField("lower", "FLOAT"),
-            bigquery.SchemaField("upper", "FLOAT"),
+            bigquery.SchemaField("ci_width", "FLOAT64"),
+            bigquery.SchemaField("point", "FLOAT64"),
+            bigquery.SchemaField("lower", "FLOAT64"),
+            bigquery.SchemaField("upper", "FLOAT64"),
         ]
 
         if append:
             job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND
         else:
             job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
 
-        client.load_table_from_dataframe(self.data, destination_table, job_config=job_config)
+        client.load_table_from_json(self.to_dict()["data"], destination_table)
+
+    def to_dict(self):
+        """Return statistic results as dict."""
+
+        return attr.asdict(self)
 
 
 @attr.s(auto_attribs=True)
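
save_to_bigquery now serializes the attrs result objects to plain dicts via to_dict() and hands them to load_table_from_json, which accepts an iterable of row dicts plus the destination table. A short sketch of what one serialized row looks like, using a made-up result and a local copy of the fields so the snippet is self-contained:

import attr
from typing import Optional


# Local copy of the StatisticResult fields, redefined here so the snippet runs standalone.
@attr.s(auto_attribs=True)
class StatisticResult:
    metric: str
    statistic: str
    parameter: float
    label: str
    ci_width: Optional[float] = 0.0
    point: Optional[float] = 0.0
    lower: Optional[float] = 0.0
    upper: Optional[float] = 0.0


# One JSON-serializable dict per result row, matching the schema fields above.
row = attr.asdict(
    StatisticResult(metric="retention", statistic="quantiles", parameter=0.5, label="treatment")
)
print(row)
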
@@ -65,17 +103,22 @@ class Statistic:
     def name(cls):
         return __name__  # todo: snake case names?
 
-    def apply(self, df: DataFrame) -> "StatisticResult":
+    def apply(self, df: DataFrame) -> "StatisticResultCollection":
         """Run statistic on provided dataframe."""
 
         data = df
         for pre_treatment in self.pre_treatments:
             data = pre_treatment.apply(data)
 
-        results = [self.transformation(df, metric) for metric in self.metrics if metric in df]
-        return StatisticResult(pandas.concat(results))
+        col = StatisticResultCollection([])
+
+        for metric in self.metrics:
+            if metric in df:
+                col.merge(self.transformation(df, metric))
+
+        return col
 
-    def transformation(self, df: DataFrame, metric: str) -> "StatisticResult":
+    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
         raise NotImplementedError("Statistic subclasses must override transformation()")
 
     @classmethod
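
With the new return type, a subclass only implements transformation() and returns a StatisticResultCollection; apply() runs the pre-treatments and merges the per-metric collections. A hypothetical subclass, sketched for illustration only and not part of this commit:

from pandas import DataFrame

from pensieve.statistics import Statistic, StatisticResult, StatisticResultCollection


class BranchMean(Statistic):
    """Hypothetical statistic: the per-branch mean of a metric."""

    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
        results = StatisticResultCollection([])
        for branch, branch_data in df.groupby("branch"):
            results.append(
                StatisticResult(
                    metric=metric, statistic="mean", parameter=0.0, label=branch
                ).with_point(branch_data[metric].mean())
            )
        return results
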
@@ -84,29 +127,34 @@ def from_config(cls, config_dict: Dict[str, Any]):  # todo: plug in config file
         return cls(**config_dict)
 
 
+@attr.s(auto_attribs=True)
 class BootstrapOneBranch(Statistic):
     num_samples: int = 100
     summary_quantiles: Tuple[int] = (0.5)
-    pre_treatments = [RemoveNulls()]
-    branches = []
+    confidence_interval: float = 0.95
+    pre_treatments: List[PreTreatment] = [RemoveNulls()]
+    branches: List[str] = []
+
+    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
+        stats_results = StatisticResultCollection([])
 
-    def transformation(self, df: DataFrame, metric: str):
         results_per_branch = df.groupby("branch")
 
-        data_by_branch = [results_per_branch.get_group(branch) for branch in self.branches]
+        for branch in self.branches:
+            branch_data = results_per_branch.get_group(branch)
+            stats_result = mabsbb.bootstrap_one_branch(
+                branch_data[metric],
+                num_samples=self.num_samples,
+                summary_quantiles=self.summary_quantiles,
+            ).to_dict()
 
-        results = [
-            mozanalysis.bayesian_stats.bayesian_bootstrap(
-                data[metric], self.num_samples, self.summary_quantiles
-            )
-            for data in data_by_branch
-        ]
+            for quantile in self.summary_quantiles:
+                result = StatisticResult(
+                    metric=metric, statistic="quantiles", parameter=quantile, label=branch
+                ).with_ci(
+                    branch_data[metric], stats_result[str(quantile)], self.confidence_interval
+                )
 
-        print(results)
+                stats_results.append(result)
 
-        # return StatisticResult(
-        #     metric=metric,
-        #     statistic=self.name(),
-        #     parameter=0.0  # todo
-        #     ci
-        # )
+        return stats_results
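
The per-branch work above delegates to mozanalysis's Bayesian bootstrap. For orientation only, a numpy sketch of the general bootstrap idea (resample with replacement, summarize the resampled means at the requested quantile) using made-up data; this is not the mozanalysis implementation:

import numpy as np

rng = np.random.default_rng(42)

# Made-up metric values for one branch, for illustration only.
branch_values = rng.normal(loc=10.0, scale=2.0, size=500)

num_samples = 100
summary_quantile = 0.5

# Bootstrap: mean of each resample drawn with replacement.
boot_means = np.array([
    rng.choice(branch_values, size=branch_values.size, replace=True).mean()
    for _ in range(num_samples)
])

# Summary of the bootstrap distribution at the requested quantile,
# analogous to the value BootstrapOneBranch feeds into with_ci().
print(np.quantile(boot_means, summary_quantile))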
