
Commit bf60f4b

Fix output format for statistics
1 parent a0fe8f0 commit bf60f4b

2 files changed: +82 -35 lines changed


pensieve/analysis.py

Lines changed: 2 additions & 3 deletions

@@ -169,12 +169,11 @@ def _calculate_statistics(self, metrics_table: str):
         """
 
         metrics_data = self.bigquery.table_to_dataframe(metrics_table)
-        biguqery_client = self.bigquery.client
-        destination_table = f"statistics_{metrics_table}"
+        destination_table = f"{self.project}.{self.dataset}.statistics_{metrics_table}"
 
         for statistic in self.STANDARD_STATISTICS:
             statistic.apply(metrics_data).save_to_bigquery(
-                biguqery_client, destination_table, append=True
+                self.bigquery.client, destination_table, append=True
            )
 
     def run(self, current_date: datetime, dry_run: bool):
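
The destination table now carries a full project.dataset.table path instead of a bare name, so the load job can resolve it without a default dataset configured on the client. A minimal sketch of the difference, using hypothetical project and dataset values (the real values come from the analysis configuration):

# Hypothetical values, for illustration only.
project = "my-gcp-project"
dataset = "pensieve_output"
metrics_table = "experiment_slug_week_1"

# Before: a bare table name, resolvable only with a default dataset on the client.
destination_table = f"statistics_{metrics_table}"

# After: a fully qualified table ID that load jobs can target directly.
destination_table = f"{project}.{dataset}.statistics_{metrics_table}"
print(destination_table)  # my-gcp-project.pensieve_output.statistics_experiment_slug_week_1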

pensieve/statistics.py

Lines changed: 80 additions & 32 deletions

@@ -1,10 +1,12 @@
 import attr
 import cattr
-import mozanalysis
+import mozanalysis.bayesian_stats.bayesian_bootstrap as mabsbb
 from google.cloud import bigquery
+from google.api_core.exceptions import NotFound
 from typing import Callable, Any, Dict, List, Tuple, Optional
 from pandas import DataFrame
 import pandas
+import numpy as np
 
 from pensieve.pre_treatment import PreTreatment, RemoveNulls
 
@@ -20,10 +22,41 @@ class StatisticResult:
     statistic: str
     parameter: float
     label: str
-    ci_width: Optional[float]
-    point: Optional[float]
-    lower: Optional[float]
-    upper: Optional[float]
+    ci_width: Optional[float] = 0.0
+    point: Optional[float] = 0.0
+    lower: Optional[float] = 0.0
+    upper: Optional[float] = 0.0
+
+    def with_ci(self, data: DataFrame, t: float, confidence_level: float) -> "StatisticResult":
+        """Calculate the confidence interval and update result."""
+        confidence_margin = 0.5 * (1.0 - confidence_level)
+        confidence_high = (0.0 + confidence_margin) * 100
+        confidence_low = (1.0 - confidence_margin) * 100
+        self.lower = t - np.percentile(data - t, confidence_low)
+        self.upper = t - np.percentile(data - t, confidence_high)
+        self.ci_width = confidence_level
+        self.point = t
+        return self
+
+    def with_point(self, point_value: float) -> "StatisticResult":
+        """Set provided value as point value result for statistic."""
+        self.point = point_value
+        return self
+
+
+@attr.s(auto_attribs=True)
+class StatisticResultCollection:
+    """
+    Represents a set of statistics result data.
+    """
+
+    data: List[StatisticResult] = []
+
+    def append(self, result: StatisticResult):
+        self.data.append(result)
+
+    def merge(self, result_collection: "StatisticResultCollection"):
+        self.data = self.data + result_collection.data
 
     def save_to_bigquery(self, client, destination_table, append=True):
         """Stores the data to a BigQuery table with a defined schema."""
@@ -32,20 +65,25 @@ def save_to_bigquery(self, client, destination_table, append=True):
         job_config.schema = [
             bigquery.SchemaField("metric", "STRING"),
             bigquery.SchemaField("statistic", "STRING"),
-            bigquery.SchemaField("parameter", "FLOAT"),
+            bigquery.SchemaField("parameter", "FLOAT64"),
             bigquery.SchemaField("label", "STRING"),
-            bigquery.SchemaField("ci_width", "FLOAT"),
-            bigquery.SchemaField("point", "FLOAT"),
-            bigquery.SchemaField("lower", "FLOAT"),
-            bigquery.SchemaField("upper", "FLOAT"),
+            bigquery.SchemaField("ci_width", "FLOAT64"),
+            bigquery.SchemaField("point", "FLOAT64"),
+            bigquery.SchemaField("lower", "FLOAT64"),
+            bigquery.SchemaField("upper", "FLOAT64"),
         ]
 
         if append:
             job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND
         else:
             job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
 
-        client.load_table_from_dataframe(self.data, destination_table, job_config=job_config)
+        client.load_table_from_json(self.to_dict()["data"], destination_table)
+
+    def to_dict(self):
+        """Return statistic results as dict."""
+
+        return attr.asdict(self)
 
 
 @attr.s(auto_attribs=True)
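
save_to_bigquery now serializes the attrs result objects to plain dicts via to_dict() and hands them to load_table_from_json, which accepts an iterable of row dicts plus the destination table. A short sketch of what one serialized row looks like, using a made-up result and a local copy of the fields so the snippet is self-contained:

import attr
from typing import Optional


# Local copy of the StatisticResult fields, redefined here so the snippet runs standalone.
@attr.s(auto_attribs=True)
class StatisticResult:
    metric: str
    statistic: str
    parameter: float
    label: str
    ci_width: Optional[float] = 0.0
    point: Optional[float] = 0.0
    lower: Optional[float] = 0.0
    upper: Optional[float] = 0.0


# One JSON-serializable dict per result row, matching the schema fields above.
row = attr.asdict(
    StatisticResult(metric="retention", statistic="quantiles", parameter=0.5, label="treatment")
)
print(row)
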
@@ -65,17 +103,22 @@ class Statistic:
     def name(cls):
         return __name__  # todo: snake case names?
 
-    def apply(self, df: DataFrame) -> "StatisticResult":
+    def apply(self, df: DataFrame) -> "StatisticResultCollection":
         """Run statistic on provided dataframe."""
 
         data = df
         for pre_treatment in self.pre_treatments:
             data = pre_treatment.apply(data)
 
-        results = [self.transformation(df, metric) for metric in self.metrics if metric in df]
-        return StatisticResult(pandas.concat(results))
+        col = StatisticResultCollection([])
+
+        for metric in self.metrics:
+            if metric in df:
+                col.merge(self.transformation(df, metric))
+
+        return col
 
-    def transformation(self, df: DataFrame, metric: str) -> "StatisticResult":
+    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
         raise NotImplementedError("Statistic subclasses must override transformation()")
 
     @classmethod
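
With the new return type, a subclass only implements transformation() and returns a StatisticResultCollection; apply() runs the pre-treatments and merges the per-metric collections. A hypothetical subclass, sketched for illustration only and not part of this commit:

from pandas import DataFrame

from pensieve.statistics import Statistic, StatisticResult, StatisticResultCollection


class BranchMean(Statistic):
    """Hypothetical statistic: the per-branch mean of a metric."""

    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
        results = StatisticResultCollection([])
        for branch, branch_data in df.groupby("branch"):
            results.append(
                StatisticResult(
                    metric=metric, statistic="mean", parameter=0.0, label=branch
                ).with_point(branch_data[metric].mean())
            )
        return results
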
@@ -84,29 +127,34 @@ def from_config(cls, config_dict: Dict[str, Any]):  # todo: plug in config file
         return cls(**config_dict)
 
 
+@attr.s(auto_attribs=True)
 class BootstrapOneBranch(Statistic):
     num_samples: int = 100
     summary_quantiles: Tuple[int] = (0.5)
-    pre_treatments = [RemoveNulls()]
-    branches = []
+    confidence_interval: float = 0.95
+    pre_treatments: List[PreTreatment] = [RemoveNulls()]
+    branches: List[str] = []
+
+    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
+        stats_results = StatisticResultCollection([])
 
-    def transformation(self, df: DataFrame, metric: str):
         results_per_branch = df.groupby("branch")
 
-        data_by_branch = [results_per_branch.get_group(branch) for branch in self.branches]
+        for branch in self.branches:
+            branch_data = results_per_branch.get_group(branch)
+            stats_result = mabsbb.bootstrap_one_branch(
+                branch_data[metric],
+                num_samples=self.num_samples,
+                summary_quantiles=self.summary_quantiles,
+            ).to_dict()
 
-        results = [
-            mozanalysis.bayesian_stats.bayesian_bootstrap(
-                data[metric], self.num_samples, self.summary_quantiles
-            )
-            for data in data_by_branch
-        ]
+            for quantile in self.summary_quantiles:
+                result = StatisticResult(
+                    metric=metric, statistic="quantiles", parameter=quantile, label=branch
+                ).with_ci(
+                    branch_data[metric], stats_result[str(quantile)], self.confidence_interval
+                )
 
-        print(results)
+                stats_results.append(result)
 
-        # return StatisticResult(
-        #     metric=metric,
-        #     statistic=self.name(),
-        #     parameter=0.0  # todo
-        #     ci
-        # )
+        return stats_results
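
The per-branch work above delegates to mozanalysis's Bayesian bootstrap. For orientation only, a numpy sketch of the general bootstrap idea (resample with replacement, summarize the resampled means at the requested quantile) using made-up data; this is not the mozanalysis implementation:

import numpy as np

rng = np.random.default_rng(42)

# Made-up metric values for one branch, for illustration only.
branch_values = rng.normal(loc=10.0, scale=2.0, size=500)

num_samples = 100
summary_quantile = 0.5

# Bootstrap: mean of each resample drawn with replacement.
boot_means = np.array([
    rng.choice(branch_values, size=branch_values.size, replace=True).mean()
    for _ in range(num_samples)
])

# Summary of the bootstrap distribution at the requested quantile,
# analogous to the value BootstrapOneBranch feeds into with_ci().
print(np.quantile(boot_means, summary_quantile))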
