# Importing libraries
from nltk.corpus import wordnet
import nltk
import pandas as pd
import spacy
import torch
import transformers

device = "cpu"

# Declare the (trained) model that will be used.
# "simple_trained_wsd_pipeline" is loaded from a local directory containing the fine-tuned model.
classifier = transformers.pipeline("zero-shot-classification", model="simple_trained_wsd_pipeline", device=device)

# Part-of-speech tagging (POS tagging)
nlp = spacy.load("en_core_web_sm")
# Importing as a module instead:
# import en_core_web_sm
# nlp = en_core_web_sm.load()
print("successfully loaded models")
def model(passage, level):
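    """Pick the best WordNet definition for every content word in `passage`
    that sits at the requested CEFR level.

    passage: text to analyse; it is split into sentences on full stops.
    level:   "A1", "A2", "B1", "B2", "C1", "C2", or "ALL" for every level.

    Returns a dict mapping "word = sentence (with the word bracketed)" to the
    candidate definitions ranked by the classifier, best first (marked ">> ").
    """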
    # Dependencies (install once):
    # pip install spacy transformers torch nltk
    # python -m spacy download en_core_web_sm

    # Download the WordNet data (cached, so this is a no-op after the first call)
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    # Passing file directories into variables
    # text_input = "./text_input.txt"
    cefr_vocab = "cefr-vocab.csv"

    # Earlier file-based version: append a full stop so the model knows where
    # the last sentence stops, then read the text back in
    # with open(text_input, "a") as file:
    #     file.write(".")
    # with open(text_input, "r") as file:
    #     txt = str(file.readlines()).replace("[", "").replace("'", "").replace("]", "")

    # Earlier interactive version: ask the user for the CEFR level
    # while True:
    #     cefr_level = input("Which CEFR level do you want to test?: ").upper()
    #     if cefr_level in ("A1", "A2", "B1", "B2", "C1", "C2"):
    #         break
    cefr_level = level

    # Append a full stop so the final sentence is always terminated, then
    # split the passage into sentences on full stops
    txt = (passage + ".").split(".")
    # Tag every token; key is "lemma = sentence with the token bracketed", value is the POS tag
    text_dict = {}
    for n in txt:
        n = n.strip()
        if not n:
            continue  # skip the empty fragment after the final full stop
        ex1 = nlp(n)
        for word in ex1:
            # NOTE: str.replace brackets every occurrence of the token in the sentence
            sentence_question_tag = n.replace(word.text, f"[{word.text}]")
            text_dict[f"{word.lemma_} = {sentence_question_tag}"] = word.pos_
    # Collect the tagging results (keep content words only: NOUN, VERB, ADJ, or ADV)
    collector = {}
    for key, value in text_dict.items():
        if value in ("NOUN", "VERB", "ADJ", "ADV"):
            collector[key] = value
    # Build a "headword, pos" -> CEFR level lookup from the reference list
    # (cefr-vocab.csv is expected to have headword, pos, and CEFR columns)
    reference = pd.read_csv(cefr_vocab)
    matching = {}
    for row_idx in range(reference.shape[0]):
        row = reference.iloc[row_idx]
        matching[f"{row.headword}, {row.pos}"] = row.CEFR
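    # Illustrative shape (levels depend on the CSV contents):
    # matching = {"computer, noun": "A1", "compute, verb": "B2", ...}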
    # Lowercase the spaCy tags into the part-of-speech names used by the CEFR list
    pos_names = {"NOUN": "noun", "VERB": "verb", "ADJ": "adjective", "ADV": "adverb"}
    for key1, value1 in collector.items():
        collector[key1] = pos_names[value1]
    # Matching the two datasets together by the word and the pos
    ready2filter = {}
    for key, value in matching.items():
        headword, pos = key.split(", ")
        for key2, value2 in collector.items():
            lemma, sentence = key2.split(" = ", 1)
            if headword == lemma.lower() and pos == value2:
                ready2filter[f"{key} = {sentence}"] = value
    # Keep only the vocab at the CEFR level the caller asked for ("ALL" keeps everything)
    filtered0 = {}
    for key, value in ready2filter.items():
        if cefr_level == "ALL" or value == cefr_level:
            filtered0[key] = value
    # Rearrange the dictionary: "headword, pos = sentence" -> {headword: "pos = sentence"}
    # NOTE: a headword that appears more than once keeps only its last entry
    filtered = {}
    for key, value in filtered0.items():
        new_key, new_value = key.split(", ", 1)
        filtered[new_key] = new_value
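    # Illustrative shape:
    # filtered = {"computer": "noun = [Computer] is good", ...}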
    # Grab the candidate definitions of each vocab from the NLTK WordNet
    # dictionary, restricted to the word's part of speech
    # (in NLTK, pos="a" also matches satellite adjectives)
    wordnet_pos = {"noun": "n", "verb": "v", "adjective": "a", "adverb": "r"}
    def_filtered = {}
    for key3, value3 in filtered.items():
        partofspeech, context = value3.split(" = ", 1)
        syns = wordnet.synsets(key3, pos=wordnet_pos[partofspeech])
        if not syns:
            continue  # the word/pos pair has no WordNet entry
        def_filtered[f"{key3} = {context}"] = [s.definition() for s in syns]
    # Use an Nvidia CUDA device if available (left disabled; the pipeline runs on CPU)
    # if torch.cuda.is_available():
    #     device = 0

    # Pick the best-fitting definition for each entry of def_filtered
    correct_def = {}
    for key4, value4 in def_filtered.items():
        vocab, context = key4.split(" = ", 1)
        # Zero-shot classification: score every candidate definition against the
        # sentence; the hypothesis template names the bracketed target word
        hypothesis_template = "The meaning of [" + vocab + "] is {}."
        output = classifier(context, value4, hypothesis_template=hypothesis_template)
        # output["labels"] comes back sorted by score (best first); keep that
        # order and mark the top definition with ">> "
        correct_def_list = []
        for rank, label in enumerate(output["labels"]):
            correct_def_list.append(f">> {label}" if rank == 0 else label)
        correct_def[key4] = correct_def_list
    return correct_def

# Quick local test
if __name__ == "__main__":
    passage = "Computer is good"
    level = "A1"
    print(model(passage, level))