Aakash Goel committed
Commit fabb80a · Parent: 4f7449b

change readme and improvements
Files changed:
- README.md +1 -1
- code/helper.py.py +155 -0
- code/quiz_gen_new2.py +7 -20
- code/quiz_gen_new3.py +124 -0
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: yellow
 colorTo: green
 sdk: streamlit
 sdk_version: 1.10.0
-app_file: code/
+app_file: code/quiz_gen_new3.py
 pinned: false
 ---
 
code/helper.py.py
ADDED
@@ -0,0 +1,155 @@
+import nltk, pke, string, torch, requests, random
+from nltk.tokenize import sent_tokenize
+from nltk.corpus import stopwords
+from flashtext import KeywordProcessor
+
+def postprocesstext(content):  # capitalize each sentence of the generated text
+    final = ""
+    for sent in sent_tokenize(content):
+        sent = sent.capitalize()
+        final = final + " " + sent
+    return final
+
+def summarizer(text, model, tokenizer):  # T5 beam-search summary of the input text
+    text = text.strip().replace("\n", " ")
+    text = "summarize: " + text
+    # print (text)
+    max_len = 512
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False,
+                                     truncation=True, return_tensors="pt").to(device)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=3,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          min_length=75,
+                          max_length=300)
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+    summary = dec[0]
+    summary = postprocesstext(summary)
+    summary = summary.strip()
+    return summary
+
+def get_nouns_multipartite(content):  # rank noun keyphrases with MultipartiteRank
+    out = []
+    try:
+        extractor = pke.unsupervised.MultipartiteRank()
+        extractor.load_document(input=content)
+        # Candidates must not contain punctuation marks or stopwords.
+        pos = {'PROPN', 'NOUN'}
+        stoplist = list(string.punctuation)
+        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+        stoplist += stopwords.words('english')
+        # extractor.candidate_selection(pos=pos, stoplist=stoplist)
+        extractor.candidate_selection(pos=pos)
+        # Build the Multipartite graph and rank candidates using a random walk;
+        # alpha controls the weight adjustment mechanism, see TopicRank for
+        # threshold/method parameters.
+        extractor.candidate_weighting(alpha=1.1,
+                                      threshold=0.75,
+                                      method='average')
+        keyphrases = extractor.get_n_best(n=15)
+        for val in keyphrases:
+            out.append(val[0])
+    except Exception as e:
+        out = []
+        # traceback.print_exc()
+        print("EXCEPTION: {}".format(e))
+    return out
+
+def filter_overlap_words(l):  # keep keywords that are not substrings of other keywords
+    nl = []
+    for i in range(len(l)):
+        temp_l = l[0:i] + l[i+1:]
+        inside = False
+        for j in temp_l:
+            if l[i] not in j:
+                if l[i] not in nl:
+                    nl.append(l[i])
+                    inside = True
+            else:
+                if inside:
+                    nl.remove(l[i])
+                break
+    return nl
+
+def get_keywords(originaltext, summarytext):  # keywords that survive summarization
+    keywords = get_nouns_multipartite(originaltext)
+    print("keywords unsummarized: ", keywords)
+    keyword_processor = KeywordProcessor()
+    for keyword in keywords:
+        keyword_processor.add_keyword(keyword)
+    keywords_found = keyword_processor.extract_keywords(summarytext)
+    keywords_found = list(set(keywords_found))
+    print("keywords_found in summarized: ", keywords_found)
+    important_keywords = []
+    for keyword in keywords:
+        if keyword in keywords_found:
+            important_keywords.append(keyword)
+    # Keep keywords that don't share a common word.
+    imp_words = filter_overlap_words(important_keywords)
+    return imp_words[:4]
+
+def get_question(context, answer, model, tokenizer):  # T5 question generation
+    text = "context: {} answer: {}".format(context, answer)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False,
+                                     truncation=True, return_tensors="pt").to(device)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=5,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          max_length=72)
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+    Question = dec[0].replace("question:", "")
+    Question = Question.strip()
+    return Question
+
+def get_related_word(word):  # up to four related words from the Datamuse API
+    url = "https://api.datamuse.com/words"
+    querystring = {"ml": word}
+    responses = requests.request("GET", url, params=querystring)
+    related_words = []
+    count = 0
+    responses = responses.json()
+    for res in responses:
+        if count >= 4:
+            break
+        if res["word"] != word and res["word"] != "":
+            related_words.append(res["word"])
+            count += 1
+    return related_words
+
+def get_final_option_list(ans, other_options):  # shuffle the answer in with distractors
+    option1 = ans
+    option2, option3, option4 = "dummy", "dummy", "dummy"
+    try:
+        option2 = other_options[0]
+    except IndexError:
+        pass
+    try:
+        option3 = other_options[1]
+    except IndexError:
+        pass
+    try:
+        option4 = other_options[2]
+    except IndexError:
+        pass
+    final_options = [option1, option2, option3, option4]
+    random.shuffle(final_options)
+    final_options = tuple(final_options)
+    ans_index = 0
+    for i in range(4):
+        if final_options[i] == ans:
+            ans_index = i
+    return final_options, ans_index
+
+def load_raw_text():  # default sample passage shown in the app
+    return " Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet, Musk put out a statement from Tesla that it was concerned about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, To be clear, I strongly believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal. It triggered a downward spiral for Bitcoin value but the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising that Dogecoin is here to stay and another referred to Musk's previous assertion that crypto could become the world's future currency."
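
Taken together, this new module is the whole quiz pipeline: summarize the passage, keep the answer keywords that survive summarization, generate one question per keyword, and shuffle Datamuse distractors around the correct answer. A minimal sketch of how the pieces compose, assuming the module is importable as helper (the app scripts import it that way, so the committed helper.py.py would need to live on the path as helper.py) and using plain t5-base checkpoints as stand-ins for the Space's configured models:

    import nltk
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    from helper import summarizer, get_keywords, get_question, \
        get_related_word, get_final_option_list, load_raw_text

    nltk.download('punkt'); nltk.download('stopwords')

    # Assumption: t5-base stands in for the fine-tuned summary/question models.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    text = load_raw_text()                        # bundled sample passage
    summary = summarizer(text, model, tokenizer)
    for ans in get_keywords(text, summary):       # at most four answer keywords
        ques = get_question(summary, ans, model, tokenizer)
        options, ans_index = get_final_option_list(ans, get_related_word(ans))
        print(ques, options, "correct:", options[ans_index])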
code/quiz_gen_new2.py
CHANGED
@@ -1,16 +1,3 @@
-# !pip install --quiet transformers==4.5.0
-# !pip install --quiet sentencepiece==0.1.95
-# !pip install --quiet git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b
-# pip install git+https://github.com/boudinfl/pke.git
-# !pip install --quiet nltk==3.2.5
-
-
-# pip install git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b
-# pip install spacy==3.1.3
-# pip install textwrap3==0.9.2
-# pip install flashtext==2.7
-
-
 import streamlit as st
 from textwrap3 import wrap
 from flashtext import KeywordProcessor
@@ -18,7 +5,6 @@ import torch, random, nltk, string, traceback, sys, os, requests, datetime
 import numpy as np
 import pandas as pd
 from transformers import T5ForConditionalGeneration,T5Tokenizer
-# st.write("Import pke")
 import pke
 
 def set_seed(seed: int):
@@ -29,7 +15,6 @@ def set_seed(seed: int):
 
 set_seed(42)
 
-
 @st.cache(allow_output_mutation = True)
 def load_model():
     nltk.download('punkt')
@@ -37,13 +22,15 @@ def load_model():
     nltk.download('wordnet')
     nltk.download('stopwords')
     nltk.download('wordnet')
-    nltk.download('omw-1.4')
-
-
+    nltk.download('omw-1.4')
+    summary_mod_name = os.environ.get("summary_mod_name")
+    question_mod_name = os.environ.get("question_mod_name")
+    summary_model = T5ForConditionalGeneration.from_pretrained(summary_mod_name)
+    summary_tokenizer = T5Tokenizer.from_pretrained(summary_mod_name)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     summary_model = summary_model.to(device)
-    question_model = T5ForConditionalGeneration.from_pretrained(
-    question_tokenizer = T5Tokenizer.from_pretrained(
+    question_model = T5ForConditionalGeneration.from_pretrained(question_mod_name)
+    question_tokenizer = T5Tokenizer.from_pretrained(question_mod_name)
     question_model = question_model.to(device)
     return summary_model, summary_tokenizer, question_tokenizer, question_model
 
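
The change replaces hardcoded checkpoint names with environment lookups. Note that os.environ has no getattribute method; the committed lines would raise AttributeError, so they are rendered above with the standard os.environ.get. A sketch of the intended pattern with hypothetical fallbacks (the real model names are supplied by the Space's configuration, not by this commit):

    import os

    # Hypothetical defaults; the actual fine-tuned checkpoints are configured
    # on the Space as environment variables.
    summary_mod_name = os.environ.get("summary_mod_name", "t5-base")
    question_mod_name = os.environ.get("question_mod_name", "t5-base")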
code/quiz_gen_new3.py
ADDED
@@ -0,0 +1,124 @@
+import streamlit as st
+from textwrap3 import wrap
+from flashtext import KeywordProcessor
+import torch, random, nltk, string, traceback, sys, os, requests, datetime
+import numpy as np
+import pandas as pd
+from transformers import T5ForConditionalGeneration,T5Tokenizer
+import pke
+from helper import postprocesstext, summarizer, get_nouns_multipartite, get_keywords,\
+    get_question, get_related_word, get_final_option_list, load_raw_text
+
+
+def set_seed(seed: int):
+    # Seed every RNG in use so generation is reproducible across reruns.
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+set_seed(42)
+
+@st.cache(allow_output_mutation = True)
+def load_model():
+    # Download NLTK data and load the two T5 checkpoints once per session.
+    nltk.download('punkt')
+    nltk.download('brown')
+    nltk.download('wordnet')
+    nltk.download('stopwords')
+    nltk.download('omw-1.4')
+    summary_mod_name = os.environ.get("summary_mod_name")
+    question_mod_name = os.environ.get("question_mod_name")
+    summary_model = T5ForConditionalGeneration.from_pretrained(summary_mod_name)
+    summary_tokenizer = T5Tokenizer.from_pretrained(summary_mod_name)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    summary_model = summary_model.to(device)
+    question_model = T5ForConditionalGeneration.from_pretrained(question_mod_name)
+    question_tokenizer = T5Tokenizer.from_pretrained(question_mod_name)
+    question_model = question_model.to(device)
+    return summary_model, summary_tokenizer, question_tokenizer, question_model
+
+from nltk.corpus import wordnet as wn
+from nltk.tokenize import sent_tokenize
+from nltk.corpus import stopwords
+
+def csv_downloader(df):
+    # Offer the accumulated quiz log as a tab-separated download.
+    res = df.to_csv(index=False, sep="\t").encode('utf-8')
+    st.download_button(
+        label="Download logs data as CSV separated by tab",
+        data=res,
+        file_name='df_quiz_log_file_v1.csv',
+        mime='text/csv')
+
+def load_file():
+    """Load text from an uploaded .txt file."""
+    uploaded_file = st.file_uploader("Upload Files", type=['txt'])
+    if uploaded_file is not None:
+        if uploaded_file.type == "text/plain":
+            raw_text = str(uploaded_file.read(), "utf-8")
+            return raw_text
+
+st.markdown('')
+
+# Loading model
+summary_model, summary_tokenizer, question_tokenizer, question_model = load_model()
+
+# App title and description
+st.title("Exam Assistant")
+st.write("Upload text and get ready to answer autogenerated questions")
+
+st.text("Disclaimer: This app stores user's input for model improvement purposes!")
+
+# Load input text
+default_text = load_raw_text()
+raw_text = st.text_area("Enter text here", default_text, height=400, max_chars=1000000)
+
+# raw_text = load_file()
+start_time = str(datetime.datetime.now())
+if raw_text is not None and raw_text != '':
+
+    # Display text
+    # with st.expander("See text"):
+    #     st.write(raw_text)
+
+    summary_text = summarizer(raw_text, summary_model, summary_tokenizer)
+    ans_list = get_keywords(raw_text, summary_text)
+    questions = []; answers = []
+    option1 = []; option2 = []; option3 = []; option4 = []
+    for idx, ans in enumerate(ans_list):
+        ques = get_question(summary_text, ans, question_model, question_tokenizer)
+        other_options = get_related_word(ans)
+        final_options, ans_index = get_final_option_list(ans, other_options)
+        if ques not in questions:
+            # st.write(final_options)
+            option1.append(final_options[0]); option2.append(final_options[1])
+            option3.append(final_options[2]); option4.append(final_options[3])
+            answers.append(ans)
+            html_str = f"""
+            <div>
+            <p>
+            {idx+1}: <b> {ques} </b>
+            </p>
+            </div>
+            """
+            # Render the four options, highlighting the correct one in green.
+            html_str += f' <p style="color:Green;"><b> {final_options[0]} </b></p> ' if ans_index == 0 else f' <p><b> {final_options[0]} </b></p> '
+            html_str += f' <p style="color:Green;"><b> {final_options[1]} </b></p> ' if ans_index == 1 else f' <p><b> {final_options[1]} </b></p> '
+            html_str += f' <p style="color:Green;"><b> {final_options[2]} </b></p> ' if ans_index == 2 else f' <p><b> {final_options[2]} </b></p> '
+            html_str += f' <p style="color:Green;"><b> {final_options[3]} </b></p> ' if ans_index == 3 else f' <p><b> {final_options[3]} </b></p> '
+            st.markdown(html_str, unsafe_allow_html=True)
+            st.markdown("-----")
+            questions.append(ques)
+    # Append this session's rows to the log file (all columns share len(questions)).
+    output_path = "results/df_quiz_log_file_v1.csv"
+    os.makedirs("results", exist_ok=True)
+    res_df = pd.DataFrame({"TimeStamp": [start_time]*len(questions),
+                           "Input": [str(raw_text)]*len(questions),
+                           "Question": questions, "Option1": option1,
+                           "Option2": option2,
+                           "Option3": option3,
+                           "Option4": option4,
+                           "Correct Answer": answers})
+    res_df.to_csv(output_path, mode='a', index=False, sep="\t", header=not os.path.exists(output_path))
+    st.dataframe(pd.read_csv(output_path, sep="\t").tail(5))
+    csv_downloader(pd.read_csv(output_path, sep="\t"))
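
With app_file pointing at this script, the Space serves it directly; run locally, the equivalent would be streamlit run code/quiz_gen_new3.py with the two model-name environment variables set. The logging step appends each session's rows to a tab-separated file and writes the header only when the file is first created. A standalone sketch of that pattern, reusing the app's results/ path:

    import os
    import pandas as pd

    output_path = "results/df_quiz_log_file_v1.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # One illustrative row; the app writes one row per generated question.
    row = pd.DataFrame({"Question": ["What did Musk tweet about?"],
                        "Correct Answer": ["dogecoin"]})
    # mode='a' accumulates rows across runs; the header is emitted only
    # if the file does not exist yet.
    row.to_csv(output_path, mode='a', index=False, sep="\t",
               header=not os.path.exists(output_path))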