|
import scipy.stats as stats |
|
|
|
def isfloat(text):
    """Return True if *text* can be converted with ``float()``.

    Args:
        text: Candidate value, normally a string token.

    Returns:
        bool: True when ``float(text)`` succeeds, False otherwise. Anything
        ``float()`` accepts counts, including spellings such as ``"1e5"``,
        ``"inf"`` and ``"nan"``.
    """
    try:
        float(text)
    except (TypeError, ValueError):
        # ValueError: malformed string. TypeError: input that is neither a
        # string nor a number (e.g. None) -- treated as "not a float" rather
        # than crashing the caller.
        return False
    return True
|
|
|
def extract_p_val(text):
    """Split a reported p-value span into its value and comparison operator.

    The span is expected to end with the operator and the value as two
    separate whitespace-delimited tokens, e.g. ``"p < 0.05"``.

    Args:
        text: Text of the p-value span.

    Returns:
        tuple: ``(value, operator)`` -- the last and the second-to-last
        token of *text*, both returned as unvalidated strings.

    Raises:
        IndexError: If *text* contains fewer than two tokens.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so "p <  0.05 " does not yield empty tokens.
    tokens = text.split()
    if len(tokens) < 2:
        raise IndexError(
            "expected at least an operator and a value in p-value span: %r" % (text,)
        )
    return tokens[-1], tokens[-2]
|
|
|
|
|
class HypothesisTest:
    """A t- or z-test parsed from a whitespace-tokenised text span.

    Instance attributes set by ``__init__``:
        test_type: ``'t'`` or ``'z'``, chosen from the first token of the text.
        test_stat: The test statistic. Kept as the raw token string when one
            is found in the text, otherwise the default ``0.0``.
        df1: Degrees of freedom (t-tests only). Kept as the raw token string
            when found, otherwise the default ``0``.
        tails: Multiplier applied to the one-sided tail probability in
            ``calculate_p_val`` (defaults to ``1``).
    """

    # Comparison operators accepted by the ``reported_p_val_dir`` setter.
    _VALID_DIRECTIONS = ('<', '>', '=')

    def __init__(self, text):
        """Parse *text* into a test type, statistic and degrees of freedom.

        Args:
            text: Span text such as ``"t ( 23 ) = 2.31"`` or ``"z = 1.96"``;
                numbers must be separated from other symbols by whitespace.

        Raises:
            ValueError: If the first token does not start with ``t`` or ``z``
                (case-insensitive), or if *text* has no tokens at all.
        """
        self.test_type = ''
        self.test_stat = 0.0
        self.df1 = 0
        self.tails = 1
        # Initialised here so the property getters and __str__ work even when
        # no reported p-value has been attached yet.
        self._reported_p_val = None
        self._reported_p_val_dir = None

        parts = text.split()
        head = parts[0].lower() if parts else ''

        if head.startswith('t'):
            self.test_type = 't'
        elif head.startswith('z'):
            self.test_type = 'z'
        else:
            raise ValueError('Failed to parse the test')

        df_found = False
        for part in parts:
            if not isfloat(part):
                continue
            # Only t-tests carry degrees of freedom, and only the first
            # all-digit token is taken as df; any later number (including an
            # integer-valued one such as "3") is the statistic.
            # NOTE(review): assumes df is reported before the statistic, as in
            # "t ( 23 ) = 2.31" -- confirm against the tagger's span format.
            if self.test_type == 't' and part.isdigit() and not df_found:
                self.df1 = part
                df_found = True
            else:
                self.test_stat = part

    @property
    def reported_p_val(self):
        """The p-value reported alongside the test, or None if never set."""
        return self._reported_p_val

    @reported_p_val.setter
    def reported_p_val(self, value):
        # Stored as given; no validation or conversion is applied.
        self._reported_p_val = value

    @property
    def reported_p_val_dir(self):
        """Comparison operator of the reported p-value, or None if never set."""
        return self._reported_p_val_dir

    @reported_p_val_dir.setter
    def reported_p_val_dir(self, dirn):
        if dirn in self._VALID_DIRECTIONS:
            self._reported_p_val_dir = dirn
        else:
            # Invalid operators are reported on stdout and otherwise ignored;
            # the previously stored direction is left untouched.
            print("The direction can be one of <, >, =")

    def calculate_p_val(self):
        """Compute the p-value implied by the parsed statistic.

        Returns:
            ``tails`` times the upper-tail probability of ``|test_stat|``
            under Student's t (with ``df1`` degrees of freedom) or the
            standard normal distribution, depending on ``test_type``.
            None if ``test_type`` is neither ``'t'`` nor ``'z'``.
        """
        if self.test_type not in ('t', 'z'):
            return None

        stat = abs(float(self.test_stat))
        # sf(x) is the survival function 1 - cdf(x), evaluated directly so
        # that very large statistics do not round to a p-value of exactly 0.
        if self.test_type == 't':
            tail_prob = stats.t.sf(stat, df=int(self.df1))
        else:
            tail_prob = stats.norm.sf(stat)
        return self.tails * tail_prob

    @staticmethod
    def get_reported_stat_tests(sentence):
        """Collect the tests labelled in *sentence* with their reported p-values.

        Every ``'ner'`` label whose value is ``'T'`` is parsed as a test; the
        label immediately after it is read as the p-value span.

        Args:
            sentence: Object exposing ``get_labels('ner')``, whose items have
                ``.value`` and ``.data_point.text``.
                NOTE(review): presumably a tagged flair ``Sentence`` -- confirm.

        Returns:
            list: ``HypothesisTest`` instances for the test entities that
            could be parsed together with a following p-value span.
        """
        tests = []
        labeled_entities = sentence.get_labels('ner')

        for idx, entity in enumerate(labeled_entities):
            if entity.value != 'T':
                continue
            try:
                test = HypothesisTest(entity.data_point.text)
                # The p-value is taken from the entity right after the test.
                p_val_span = labeled_entities[idx + 1].data_point.text
                p_val, dirn = extract_p_val(p_val_span)
                test.reported_p_val = p_val
                test.reported_p_val_dir = dirn
            except Exception:
                # Best effort: skip test entities that cannot be parsed or
                # have no following p-value span. Unlike a bare ``except``,
                # this lets KeyboardInterrupt / SystemExit propagate.
                continue
            tests.append(test)

        return tests

    def __str__(self):
        """Return a one-line, pipe-separated summary of the test."""
        return "Test Type : {} | Test Stat : {} | DF : {} | Rep P-val : {}{}".format(
            self.test_type,
            self.test_stat,
            self.df1,
            self._reported_p_val_dir,
            self.reported_p_val,
        )