# -*- coding: utf-8 -*-
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

# Character-to-index vocabulary built during training.
with open("chardict.pickle", "rb") as f:
    char_to_ix = pickle.load(f)

# Tag set: N = not part of a token, B = token-begin, I = token-inside.
tag_to_ix = {'N': 0, 'B': 1, 'I': 2}
class LSTMTagger(nn.Module):
    """Character-level bidirectional LSTM tagger."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes character embeddings as inputs and outputs hidden
        # states with dimensionality hidden_dim per direction.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # The linear layer maps from hidden-state space to tag space; its
        # input size is 2 * hidden_dim because the LSTM is bidirectional.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        embeds = self.char_embeddings(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim) for the LSTM.
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
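# A quick shape sanity check (the vocabulary size below is hypothetical,
# chosen only for illustration): a sentence of length T yields a
# (T, tagset_size) matrix of per-character log-probabilities.
#
#     m = LSTMTagger(100, 256, vocab_size=70, tagset_size=3)
#     m(torch.randint(0, 70, (12,))).shape   # torch.Size([12, 3])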
# Load the trained model. The loss function and optimizer below mirror the
# training setup; they are not needed for inference.
model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'
model.load_state_dict(torch.load(model_save_name))
model.eval()
def prepare_sequence(seq, to_ix):
    # Map each character to its vocabulary index; characters unseen during
    # training will raise a KeyError here.
    idxs = [to_ix[ch] for ch in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prob_to_tag(out):
    # Convert the (seq_len, tagset_size) log-probability matrix into a list
    # of tag strings by taking the argmax for each character.
    ix_to_tag = list(tag_to_ix.keys())
    tags = [ix_to_tag[int(row.argmax())] for row in out]
    # Wrap in an outer list: downstream code expects one tag list per sentence.
    return [tags]
def _char_to_token(samplesent, _sentence_tag_list):
    # Group characters into tokens using the predicted tags: 'B' starts a
    # new token, 'I' continues the current one, and 'N' (e.g. whitespace)
    # belongs to no token.
    token_list = []
    token = []
    for ch, ach in zip(_sentence_tag_list[0], samplesent):
        if ch == 'B':
            # A 'B' closes any token in progress and opens a new one.
            if token:
                token_list.append(token)
            token = [ach]
        elif ch == 'I':
            token.append(ach)
    # Close the final token; this also covers sentences whose last
    # characters are tagged 'N'.
    if token:
        token_list.append(token)
    return token_list
def char_unifier(_token_list):
    # Join each token's characters into a single string.
    return [''.join(token) for token in _token_list]
def tokenize(sentence):
    # Run the tagger over a raw sentence and return its tokens as strings.
    inputs = prepare_sequence(sentence, char_to_ix)
    with torch.no_grad():
        out = model(inputs)
    sentence_tag_list = prob_to_tag(out)
    token_char_list = _char_to_token(sentence, sentence_tag_list)
    return char_unifier(token_char_list)
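

if __name__ == "__main__":
    # Minimal usage sketch. The sentence below is a placeholder: every
    # character passed to tokenize() must already appear in char_to_ix,
    # or prepare_sequence() will raise a KeyError.
    print(tokenize("a sample sentence"))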