# Importing libraries
from nltk.corpus import wordnet
import nltk
import pandas as pd
import spacy
import torch
import transformers

device = "cpu"

# Declare the (trained) model that will be used.
# "simple_trained_wsd_pipeline" is loaded from a local directory containing the fine-tuned model.
classifier = transformers.pipeline("zero-shot-classification", model="simple_trained_wsd_pipeline", device=device)

# Part-of-speech tagging (POS tagging)
nlp = spacy.load("en_core_web_sm")
# Importing as a module instead:
# import en_core_web_sm
# nlp = en_core_web_sm.load()
print("successfully loaded models")
def model(passage, level):
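    """Pick the best WordNet definition for every content word in `passage`
    that sits at the requested CEFR level.

    passage: text to analyse; it is split into sentences on full stops.
    level:   "A1", "A2", "B1", "B2", "C1", "C2", or "ALL" for every level.

    Returns a dict mapping "word = sentence (with the word bracketed)" to the
    candidate definitions ranked by the classifier, best first (marked ">> ").
    """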
    # Dependencies (install once):
    # pip install spacy transformers torch nltk
    # python -m spacy download en_core_web_sm

    # Download the WordNet data (cached, so this is a no-op after the first call)
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    # Passing file directories into variables
    # text_input = "./text_input.txt"
    cefr_vocab = "cefr-vocab.csv"

    # Earlier file-based version: append a full stop so the model knows where
    # the last sentence stops, then read the text back in
    # with open(text_input, "a") as file:
    #     file.write(".")
    # with open(text_input, "r") as file:
    #     txt = str(file.readlines()).replace("[", "").replace("'", "").replace("]", "")

    # Earlier interactive version: ask the user for the CEFR level
    # while True:
    #     cefr_level = input("Which CEFR level do you want to test?: ").upper()
    #     if cefr_level in ("A1", "A2", "B1", "B2", "C1", "C2"):
    #         break
    cefr_level = level

    # Append a full stop so the final sentence is always terminated, then
    # split the passage into sentences on full stops
    txt = (passage + ".").split(".")
    # Tag every token; key is "lemma = sentence with the token bracketed", value is the POS tag
    text_dict = {}
    for n in txt:
        n = n.strip()
        if not n:
            continue  # skip the empty fragment after the final full stop
        ex1 = nlp(n)
        for word in ex1:
            # NOTE: str.replace brackets every occurrence of the token in the sentence
            sentence_question_tag = n.replace(word.text, f"[{word.text}]")
            text_dict[f"{word.lemma_} = {sentence_question_tag}"] = word.pos_
    # Collect the tagging results (keep content words only: NOUN, VERB, ADJ, or ADV)
    collector = {}
    for key, value in text_dict.items():
        if value in ("NOUN", "VERB", "ADJ", "ADV"):
            collector[key] = value
    # Build a "headword, pos" -> CEFR level lookup from the reference list
    # (cefr-vocab.csv is expected to have headword, pos, and CEFR columns)
    reference = pd.read_csv(cefr_vocab)
    matching = {}
    for row_idx in range(reference.shape[0]):
        row = reference.iloc[row_idx]
        matching[f"{row.headword}, {row.pos}"] = row.CEFR
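    # Illustrative shape (levels depend on the CSV contents):
    # matching = {"computer, noun": "A1", "compute, verb": "B2", ...}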
    # Lowercase the spaCy tags into the part-of-speech names used by the CEFR list
    pos_names = {"NOUN": "noun", "VERB": "verb", "ADJ": "adjective", "ADV": "adverb"}
    for key1, value1 in collector.items():
        collector[key1] = pos_names[value1]
    # Matching the two datasets together by the word and the pos
    ready2filter = {}
    for key, value in matching.items():
        headword, pos = key.split(", ")
        for key2, value2 in collector.items():
            lemma, sentence = key2.split(" = ", 1)
            if headword == lemma.lower() and pos == value2:
                ready2filter[f"{key} = {sentence}"] = value
    # Keep only the vocab at the CEFR level the caller asked for ("ALL" keeps everything)
    filtered0 = {}
    for key, value in ready2filter.items():
        if cefr_level == "ALL" or value == cefr_level:
            filtered0[key] = value
    # Rearrange the dictionary: "headword, pos = sentence" -> {headword: "pos = sentence"}
    # NOTE: a headword that appears more than once keeps only its last entry
    filtered = {}
    for key, value in filtered0.items():
        new_key, new_value = key.split(", ", 1)
        filtered[new_key] = new_value
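    # Illustrative shape:
    # filtered = {"computer": "noun = [Computer] is good", ...}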
    # Grab the candidate definitions of each vocab from the NLTK WordNet
    # dictionary, restricted to the word's part of speech
    # (in NLTK, pos="a" also matches satellite adjectives)
    wordnet_pos = {"noun": "n", "verb": "v", "adjective": "a", "adverb": "r"}
    def_filtered = {}
    for key3, value3 in filtered.items():
        partofspeech, context = value3.split(" = ", 1)
        syns = wordnet.synsets(key3, pos=wordnet_pos[partofspeech])
        if not syns:
            continue  # the word/pos pair has no WordNet entry
        def_filtered[f"{key3} = {context}"] = [s.definition() for s in syns]
    # Use an Nvidia CUDA device if available (left disabled; the pipeline runs on CPU)
    # if torch.cuda.is_available():
    #     device = 0

    # Pick the best-fitting definition for each entry of def_filtered
    correct_def = {}
    for key4, value4 in def_filtered.items():
        vocab, context = key4.split(" = ", 1)
        # Zero-shot classification: score every candidate definition against the
        # sentence; the hypothesis template names the bracketed target word
        hypothesis_template = "The meaning of [" + vocab + "] is {}."
        output = classifier(context, value4, hypothesis_template=hypothesis_template)
        # output["labels"] comes back sorted by score (best first); keep that
        # order and mark the top definition with ">> "
        correct_def_list = []
        for rank, label in enumerate(output["labels"]):
            correct_def_list.append(f">> {label}" if rank == 0 else label)
        correct_def[key4] = correct_def_list
    return correct_def

# Quick local test
if __name__ == "__main__":
    passage = "Computer is good"
    level = "A1"
    print(model(passage, level))