1
1
import attr
2
2
import cattr
3
- import mozanalysis
3
+ import mozanalysis . bayesian_stats . bayesian_bootstrap as mabsbb
4
4
from google .cloud import bigquery
5
+ from google .api_core .exceptions import NotFound
5
6
from typing import Callable , Any , Dict , List , Tuple , Optional
6
7
from pandas import DataFrame
7
8
import pandas
9
+ import numpy as np
8
10
9
11
from pensieve .pre_treatment import PreTreatment , RemoveNulls
10
12
@@ -20,10 +22,41 @@ class StatisticResult:
20
22
statistic : str
21
23
parameter : float
22
24
label : str
23
- ci_width : Optional [float ]
24
- point : Optional [float ]
25
- lower : Optional [float ]
26
- upper : Optional [float ]
25
+ ci_width : Optional [float ] = 0.0
26
+ point : Optional [float ] = 0.0
27
+ lower : Optional [float ] = 0.0
28
+ upper : Optional [float ] = 0.0
29
+
30
def with_ci(self, data: DataFrame, t: float, confidence_level: float) -> "StatisticResult":
    """Calculate the confidence interval and update result.

    The bounds are the tail percentiles of ``data - t`` reflected around
    ``t``: the lower bound subtracts the upper-tail percentile and the
    upper bound subtracts the lower-tail percentile.

    Args:
        data: Values the percentiles are computed from.
        t: Point estimate; also stored as ``point``.
        confidence_level: Coverage as a fraction (e.g. ``0.95``); stored
            unchanged as ``ci_width``.

    Returns:
        This instance, so calls can be chained.
    """
    tail = 0.5 * (1.0 - confidence_level)
    lower_tail_pct = tail * 100
    upper_tail_pct = (1.0 - tail) * 100

    # Compute the deviations once and ask for both percentiles in one call.
    deviations = data - t
    upper_tail, lower_tail = np.percentile(deviations, [upper_tail_pct, lower_tail_pct])

    # Reflection: the upper tail of the deviations yields the lower bound.
    self.lower = t - upper_tail
    self.upper = t - lower_tail
    self.ci_width = confidence_level
    self.point = t
    return self
40
+
41
def with_point(self, point_value: float) -> "StatisticResult":
    """Store ``point_value`` as this result's point estimate.

    Returns this instance so the call can be chained.
    """
    setattr(self, "point", point_value)
    return self
45
+
46
+
47
@attr.s(auto_attribs=True)
class StatisticResultCollection:
    """
    Represents a set of statistics result data.
    """

    # attr.Factory builds a fresh list per instance. A bare ``[]`` default is
    # a single list object shared by every instance created without ``data``.
    data: List[StatisticResult] = attr.Factory(list)

    def append(self, result: StatisticResult):
        """Add a single result to this collection (mutates ``data`` in place)."""
        self.data.append(result)

    def merge(self, result_collection: "StatisticResultCollection"):
        """Add every result of ``result_collection`` to this collection.

        ``data`` is rebound to a new list; ``result_collection`` is left
        unmodified.
        """
        self.data = self.data + result_collection.data
27
60
28
61
def save_to_bigquery(self, client, destination_table, append=True):
    """Stores the data to a BigQuery table with a defined schema.

    Args:
        client: BigQuery client used to start the load job.
        destination_table: Table the results are loaded into.
        append: When true, rows are appended to the table; otherwise the
            table contents are replaced.
    """
    # NOTE(review): the construction of job_config is in a part of the file
    # that was not visible during review; a default LoadJobConfig is assumed.
    # Confirm against the full file.
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField("metric", "STRING"),
        bigquery.SchemaField("statistic", "STRING"),
        bigquery.SchemaField("parameter", "FLOAT64"),
        bigquery.SchemaField("label", "STRING"),
        bigquery.SchemaField("ci_width", "FLOAT64"),
        bigquery.SchemaField("point", "FLOAT64"),
        bigquery.SchemaField("lower", "FLOAT64"),
        bigquery.SchemaField("upper", "FLOAT64"),
    ]

    if append:
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND
    else:
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

    # job_config must be handed to the load call; without it the schema and
    # the write disposition configured above have no effect.
    client.load_table_from_json(
        self.to_dict()["data"], destination_table, job_config=job_config
    )
82
+
83
def to_dict(self):
    """Convert this collection into a plain dict via ``attr.asdict``."""
    as_plain = attr.asdict(self)
    return as_plain
49
87
50
88
51
89
@attr .s (auto_attribs = True )
@@ -65,17 +103,22 @@ class Statistic:
65
103
def name(cls):
    """Return the name of the class this is called on."""
    # A bare ``__name__`` resolves to the module-level global (the module's
    # name), which is the same for every statistic; the class's own name is
    # ``cls.__name__``.
    return cls.__name__  # todo: snake case names?
67
105
68
def apply(self, df: DataFrame) -> "StatisticResultCollection":
    """Run statistic on provided dataframe.

    Every pre-treatment in ``self.pre_treatments`` is applied in order, then
    ``transformation`` is run for each metric in ``self.metrics`` that is
    present in the pre-treated frame.

    Args:
        df: Input data; it is passed through the pre-treatments first.

    Returns:
        A collection holding the merged results of all processed metrics.
    """
    data = df
    for pre_treatment in self.pre_treatments:
        data = pre_treatment.apply(data)

    col = StatisticResultCollection([])

    for metric in self.metrics:
        # Work on the pre-treated frame; using the raw ``df`` here would
        # throw away the output of every pre-treatment applied above.
        if metric in data:
            col.merge(self.transformation(data, metric))

    return col
77
120
78
def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
    """Hook for computing the statistic of a single metric.

    This base version always raises; subclasses supply the implementation.
    """
    message = "Statistic subclasses must override transformation()"
    raise NotImplementedError(message)
80
123
81
124
@classmethod
@@ -84,29 +127,34 @@ def from_config(cls, config_dict: Dict[str, Any]): # todo: plug in config file
84
127
return cls (** config_dict )
85
128
86
129
130
@attr.s(auto_attribs=True)
class BootstrapOneBranch(Statistic):
    """Per-branch bootstrap statistic producing one result per summary quantile."""

    num_samples: int = 100
    # A one-element tuple needs the trailing comma; ``(0.5)`` is just the
    # float 0.5 and cannot be iterated over in ``transformation``.
    summary_quantiles: Tuple[float, ...] = (0.5,)
    confidence_interval: float = 0.95
    # Factories give each instance its own list instead of sharing one
    # list object between all instances.
    pre_treatments: List[PreTreatment] = attr.Factory(lambda: [RemoveNulls()])
    branches: List[str] = attr.Factory(list)

    def transformation(self, df: DataFrame, metric: str) -> "StatisticResultCollection":
        """Bootstrap ``metric`` separately for every branch in ``self.branches``.

        Rows are grouped by the ``"branch"`` column. For each configured
        branch the metric column is passed to ``bootstrap_one_branch`` and one
        ``StatisticResult`` is created per entry of ``summary_quantiles``.

        Args:
            df: Data containing a ``"branch"`` column and the metric column.
            metric: Name of the column to compute the statistic for.

        Returns:
            Collection with one result per (branch, quantile) pair.
        """
        stats_results = StatisticResultCollection([])

        results_per_branch = df.groupby("branch")

        for branch in self.branches:
            branch_data = results_per_branch.get_group(branch)
            metric_values = branch_data[metric]
            # NOTE(review): the lookup below assumes the summary's keys are
            # the quantiles rendered with str() -- confirm against mozanalysis.
            stats_result = mabsbb.bootstrap_one_branch(
                metric_values,
                num_samples=self.num_samples,
                summary_quantiles=self.summary_quantiles,
            ).to_dict()

            for quantile in self.summary_quantiles:
                result = StatisticResult(
                    metric=metric, statistic="quantiles", parameter=quantile, label=branch
                ).with_ci(
                    metric_values, stats_result[str(quantile)], self.confidence_interval
                )

                stats_results.append(result)

        return stats_results
0 commit comments