VinayNR
/

stats-nerd

Token Classification

Model card Files Files and versions Community

stats-nerd / hypothesis.py

VinayNR's picture

Upload hypothesis.py

f5c1d3a almost 2 years ago

history blame contribute delete

3.1 kB

	import scipy.stats as stats

	def isfloat(text):
	    """Return True when ``text`` can be converted by ``float()``.

	    Args:
	        text: Candidate token, normally a string such as ``"37"`` or
	            ``"1.414"``.

	    Returns:
	        bool: True if ``float(text)`` succeeds, False otherwise. Anything
	        ``float()`` accepts counts, including ``"nan"`` and ``"inf"``.
	    """
	    try:
	        float(text)
	    except (TypeError, ValueError):
	        # ValueError: a string that is not a number.
	        # TypeError: None or another object float() cannot take at all.
	        return False
	    return True

	def extract_p_val(text):
	    """Split a reported p-value span into its value and its comparison sign.

	    The span is read as whitespace-separated tokens, e.g. ``"p < 0.05"``:
	    the last token is returned as the value and the token before it as the
	    direction.

	    Args:
	        text: The p-value span as a string.

	    Returns:
	        tuple: ``(p_value, direction)``, both as the original token strings.

	    Raises:
	        IndexError: If the span holds fewer than two tokens.
	    """
	    # split() without an argument collapses runs of whitespace and ignores
	    # leading/trailing blanks, so a doubled or trailing space cannot shift an
	    # empty string into the value or direction slot.
	    tokens = text.split()
	    return tokens[-1], tokens[-2]


	class HypothesisTest:
	    """A reported t- or z-test parsed from a space-separated text span.

	    Attributes:
	        test_type: ``'t'`` or ``'z'``.
	        test_stat: The test statistic, kept as the token text from the span
	            (``0.0`` when none was found).
	        df1: Degrees of freedom, kept as the token text from the span
	            (``0`` when none was found).
	        tails: Multiplier applied to the one-sided tail probability in
	            ``calculate_p_val`` (defaults to 1).
	    """

	    # Class-level defaults so the reported-p-value properties (and __str__)
	    # are readable even when no p-value was attached or a direction was
	    # rejected by the setter.
	    _reported_p_val = None
	    _reported_p_val_dir = None

	    def __init__(self, text):
	        """Parse a test report such as ``"t-test ( 37 ) = 1.414"``.

	        Args:
	            text: The span to parse; its first token must start with ``t``
	                or ``z`` (case-insensitive).

	        Raises:
	            ValueError: If the first token names neither a t- nor a z-test.
	        """
	        # initialization
	        self.test_type = ''
	        self.test_stat = 0.0
	        self.df1 = 0
	        self.tails = 1

	        parts = text.split()
	        head = parts[0].lower() if parts else ''

	        if head.startswith('t'):
	            self.test_type = 't'
	        elif head.startswith('z'):
	            self.test_type = 'z'
	        else:
	            raise ValueError('Failed to parse the test')

	        # Tokens made only of digits are candidate degrees of freedom; any
	        # other numeric token (decimal point, sign, exponent) is the statistic.
	        integer_tokens = []
	        statistic = None
	        for part in parts:
	            if not isfloat(part):
	                continue
	            if part.isdigit():
	                integer_tokens.append(part)
	            else:
	                statistic = part

	        if statistic is None:
	            # The statistic itself may be a whole number ("z = 2",
	            # "t ( 37 ) = 2"). A z-test carries no degrees of freedom, so one
	            # integer is enough; a t-test needs a second integer left over for
	            # the degrees of freedom before the last one is read as the
	            # statistic.
	            required = 1 if self.test_type == 'z' else 2
	            if len(integer_tokens) >= required:
	                statistic = integer_tokens.pop()

	        if statistic is not None:
	            self.test_stat = statistic
	        if integer_tokens:
	            self.df1 = integer_tokens[-1]

	    @property
	    def reported_p_val(self):
	        """The p-value token attached to this test, or None if never set."""
	        return self._reported_p_val

	    @reported_p_val.setter
	    def reported_p_val(self, value):
	        # Stored as given; no validation is applied.
	        self._reported_p_val = value

	    @property
	    def reported_p_val_dir(self):
	        """The comparison sign of the reported p-value, or None if never set."""
	        return self._reported_p_val_dir

	    @reported_p_val_dir.setter
	    def reported_p_val_dir(self, dirn):
	        # Only the three comparison signs are accepted; anything else is
	        # reported on stdout and the previous value is kept.
	        if dirn in ('<', '>', '='):
	            self._reported_p_val_dir = dirn
	        else:
	            print("The direction can be one of <, >, =")

	    def calculate_p_val(self):
	        """Recompute the p-value from the parsed statistic.

	        Returns:
	            ``tails`` times the upper-tail probability of ``|test_stat|``
	            under the t distribution (with ``df1`` degrees of freedom) or the
	            standard normal distribution; None for any other ``test_type``.
	        """
	        # sf() is the survival function 1 - cdf, evaluated directly so that
	        # very small tail probabilities are not rounded away to 0.0.
	        if self.test_type == 't':
	            return self.tails * stats.t.sf(abs(float(self.test_stat)), df=int(self.df1))
	        if self.test_type == 'z':
	            return self.tails * stats.norm.sf(abs(float(self.test_stat)))
	        return None

	    @staticmethod
	    def get_reported_stat_tests(sentence):
	        """Collect the tests reported in ``sentence`` with their p-values.

	        Args:
	            sentence: Object exposing ``get_labels('ner')``; each returned
	                label must provide ``.value`` and ``.data_point.text``
	                (presumably a flair ``Sentence`` -- confirm against caller).

	        Returns:
	            list: One ``HypothesisTest`` per label whose value is ``'T'`` and
	            whose text and following p-value span could both be parsed.
	        """
	        tests = []

	        # group tests with p values
	        labeled_entities = sentence.get_labels('ner')
	        for idx, entity in enumerate(labeled_entities):
	            if entity.value != 'T':
	                continue
	            try:
	                test = HypothesisTest(entity.data_point.text)
	                # The p-value is assumed to be the label right after the test.
	                p_val_span = labeled_entities[idx + 1].data_point.text
	                p_val, dirn = extract_p_val(p_val_span)
	            except Exception:
	                # Best effort: a span that cannot be parsed, or a test with no
	                # following label, is skipped. Unlike a bare ``except`` this
	                # lets KeyboardInterrupt and SystemExit propagate.
	                continue

	            test.reported_p_val = p_val
	            test.reported_p_val_dir = dirn
	            tests.append(test)

	        return tests

	    def __str__(self):
	        """Return a one-line, pipe-separated summary of the test."""
	        fields = (
	            "Test Type : " + self.test_type,
	            "Test Stat : " + str(self.test_stat),
	            "DF : " + str(self.df1),
	            "Rep P-val : " + str(self.reported_p_val_dir) + str(self.reported_p_val),
	        )
	        return " | ".join(fields)