import scipy.stats as stats


def isfloat(text):
    """Return True if ``text`` can be converted by ``float()``, else False.

    Anything ``float()`` accepts counts as a float here, including
    integer-looking strings such as ``"37"``.
    """
    try:
        float(text)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric input such as None.
        return False
    return True


def extract_p_val(text):
    """Split a reported p-value span into ``(value, direction)``.

    The span is split on single spaces; the last token is returned as the
    value and the token before it as the comparison sign, e.g.
    ``"p < 0.05"`` -> ``("0.05", "<")``. Both are returned as strings.

    Raises:
        IndexError: if ``text`` has fewer than two space-separated tokens.
    """
    parts = text.split(' ')
    return parts[-1], parts[-2]


class HypothesisTest:
    """A t- or z-test parsed from a space-tokenised report string.

    Example input: ``"t-test ( 37 ) = 1.414"``.

    Attributes set by the constructor:
        test_type: ``'t'`` or ``'z'``.
        test_stat: the test statistic, kept as the original string token
            (``0.0`` if none was found).
        df1: the degrees of freedom, kept as the original string token
            (``0`` if none was found).
        tails: multiplier applied to the one-sided tail probability in
            ``calculate_p_val``; defaults to 1.
    """

    def __init__(self, text):
        """Parse ``text`` into test type, statistic and degrees of freedom.

        Raises:
            ValueError: if the first token starts with neither 't' nor 'z'
                (case-insensitive).
        """
        self.test_type = ''
        self.test_stat = 0.0
        self.df1 = 0
        self.tails = 1
        # Initialise the reported p-value fields so the properties and
        # __str__ work even when no p-value was ever attached.
        self._reported_p_val = None
        self._reported_p_val_dir = None

        parts = text.split(' ')
        label = parts[0].lower()
        if label.startswith('t'):
            self.test_type = 't'
        elif label.startswith('z'):
            self.test_type = 'z'
        else:
            raise ValueError('Failed to parse the test')

        # Digit-only tokens are candidate degrees of freedom; any other
        # numeric token is the test statistic (the last one wins).
        integer_tokens = []
        stat_found = False
        for part in parts:
            if not isfloat(part):
                continue
            if part.isdigit():
                integer_tokens.append(part)
            else:
                self.test_stat = part
                stat_found = True

        # An integer-valued statistic ("z = 2", "t ( 37 ) = 2") looks like a
        # df token. When no other statistic was found, treat the trailing
        # integer as the statistic. A lone integer in a t-test is still
        # taken as the degrees of freedom.
        if not stat_found and integer_tokens:
            if self.test_type == 'z' or len(integer_tokens) > 1:
                self.test_stat = integer_tokens.pop()
        if integer_tokens:
            self.df1 = integer_tokens[-1]

    @property
    def reported_p_val(self):
        """The p-value as reported in the text, or None if not set."""
        return self._reported_p_val

    @reported_p_val.setter
    def reported_p_val(self, value):
        # Stored as given; no validation is performed.
        self._reported_p_val = value

    @property
    def reported_p_val_dir(self):
        """The reported comparison sign ('<', '>' or '='), or None if not set."""
        return self._reported_p_val_dir

    @reported_p_val_dir.setter
    def reported_p_val_dir(self, dirn):
        # Invalid directions are reported on stdout and otherwise ignored,
        # leaving the previous value in place.
        if dirn in ['<', '>', '=']:
            self._reported_p_val_dir = dirn
        else:
            print("The direction can be one of <, >, =")

    def calculate_p_val(self):
        """Recompute the p-value from the parsed statistic.

        Returns ``tails`` times the upper-tail probability of
        ``abs(test_stat)`` under Student's t (with ``df1`` degrees of
        freedom) or the standard normal distribution. Returns None if
        ``test_type`` is neither 't' nor 'z'.
        """
        # sf() is the survival function (1 - cdf) evaluated directly, which
        # avoids the result rounding to 0.0 for large statistics.
        if self.test_type == 't':
            stat = abs(float(self.test_stat))
            return self.tails * stats.t.sf(stat, df=int(self.df1))
        if self.test_type == 'z':
            stat = abs(float(self.test_stat))
            return self.tails * stats.norm.sf(stat)
        return None

    @staticmethod
    def get_reported_stat_tests(sentence):
        """Collect the tests labelled in ``sentence`` with their p-values.

        ``sentence`` must provide ``get_labels('ner')`` returning a sequence
        of entities exposing ``.value`` and ``.data_point.text``
        (presumably a flair ``Sentence`` - verify against the caller).
        Entities whose value is ``'T'`` are parsed as tests; the entity
        immediately after each one is assumed to be its p-value span.

        Returns:
            A list of ``HypothesisTest`` objects. Entities that cannot be
            parsed, or that have no following entity, are skipped.
        """
        tests = []
        labeled_entities = sentence.get_labels('ner')
        for idx, entity in enumerate(labeled_entities):
            if entity.value != 'T':
                continue
            try:
                test = HypothesisTest(entity.data_point.text)
                # Assume the closest p-value is the next labelled entity.
                p_val_span = labeled_entities[idx + 1].data_point.text
                p_val, dirn = extract_p_val(p_val_span)
                test.reported_p_val = p_val
                test.reported_p_val_dir = dirn
            except Exception:
                # Best effort: not a parsable test, or no usable p-value
                # span follows it. Skip this entity and keep going.
                continue
            tests.append(test)
        return tests

    def __str__(self):
        direction = self._reported_p_val_dir
        p_val = self._reported_p_val
        if direction is None and p_val is None:
            reported = 'N/A'
        else:
            # Omit whichever half is missing instead of printing "None".
            reported = (
                ('' if direction is None else str(direction))
                + ('' if p_val is None else str(p_val))
            )
        return (
            "Test Type : " + self.test_type + " | "
            + "Test Stat : " + str(self.test_stat) + " | "
            + "DF : " + str(self.df1) + " | "
            + "Rep P-val : " + reported
        )