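# Gradio demo: given a paper title and abstract, extract keyphrases with KeyBERT and use
# the Guanaco-7B model (LLaMA-7B base with Tim Dettmers' QLoRA adapters) to generate a
# keyphrase elaboration and a plain-language summary.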
import gradio as gr
from nltk.tokenize import sent_tokenize
import torch
import ujson as json
from transformers import AutoModelForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from peft import PeftModel
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import nltk

nltk.download('punkt')
# Load the Guanaco 7B model (LLaMA-7B base + QLoRA adapters) - takes around 2-3 minutes.
model_name = "decapoda-research/llama-7b-hf"
adapters_name = 'timdettmers/guanaco-7b'

# Run on GPU when available; bfloat16 generation on CPU works but is very slow.
device = "cuda" if torch.cuda.is_available() else "cpu"

m = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16)
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()
m = m.to(device)

tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1
stop_token_ids = [0]

print('Guanaco model loaded into memory.')
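# Both generation paths below wrap the user request in the Guanaco-style chat template
# ("### Human: ... ### Assistant:") used for the adapters' instruction tuning, generate a
# continuation, and keep only the text after "### Assistant: " in the decoded output.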
def keyphraseElaboration(title, abstract, userGivenKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase):
    numKeywordsToExtract = 2
    # If the document has one or fewer abstract sentences, it is too short for keyphrase
    # extraction/elaboration to give a meaningful output.
    tooShort = True
    keyBERTKeywords = []
    if userGivenKeyphrases == "":
        '''
        Process Abstract (strip the word "abstract" at the front and split into sentences)
        '''
        if abstract.lower()[0:9] == "abstract.":
            abstract = abstract[9:]
        elif abstract.lower()[0:8] == "abstract":
            abstract = abstract[8:]
        abstractSentences = sent_tokenize(abstract)
        numAbstractSentences = len(abstractSentences)
        if numAbstractSentences > 1:
            tooShort = False
            numAbstractSentencesKeyphrase = min(numAbstractSentences, numAbstractSentencesKeyphrase)
            doc = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesKeyphrase])}"
            kw_model = KeyBERT(model="all-MiniLM-L6-v2")
            vectorizer = KeyphraseCountVectorizer()
            keywordsOut = kw_model.extract_keywords(doc, stop_words="english", top_n=numKeywordsToExtract, vectorizer=vectorizer, use_mmr=True)
            keyBERTKeywords = [x[0] for x in keywordsOut]
            for entry in keyBERTKeywords:
                print(entry)

    keywordString = ""
    if userGivenKeyphrases != "":
        keywordString = userGivenKeyphrases
    elif not tooShort:
        keywordString = ', '.join(keyBERTKeywords)

    response = ""
    if keywordString != "":
        prompt = "What is the purpose of studying " + keywordString + "? Comment on areas of application."
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant. "
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt").to(m.device)
        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensElaboration)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        # Keep only the assistant's reply, trimmed to the last full sentence.
        index_response = output.find("### Assistant: ") + 15
        end_response = output.rfind('.') + 1
        response = output[index_response:end_response]
    return keywordString, response
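# Builds a plain-language-summary request from the title and the first
# numAbstractSentencesSummary sentences of the abstract, then generates with Guanaco.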
def plainLanguageSummary(title, abstract, maxTokensSummary, numAbstractSentencesSummary):
    '''
    Process Abstract (strip the word "abstract" at the front and split into sentences)
    '''
    if abstract.lower()[0:9] == "abstract.":
        abstract = abstract[9:]
    elif abstract.lower()[0:8] == "abstract":
        abstract = abstract[8:]
    abstractSentences = sent_tokenize(abstract)

    '''
    This is for summarization
    '''
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic? Comment on areas of application:
    """
    numAbstractSentences = len(abstractSentences)
    numAbstractSentencesSummary = min(numAbstractSentences, numAbstractSentencesSummary)
    text = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesSummary])}"

    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant. "
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + text} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt").to(m.device)
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensSummary)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    # Keep only the assistant's reply; skip a leading "Certainly!"/"Certainly," filler
    # and trim to the last full sentence.
    index_response = output.find("### Assistant: ") + 15
    if output[index_response:index_response + 10] in ("Certainly!", "Certainly,"):
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    return response
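# UI: the left column takes the title, abstract, optional user keyphrases, and generation
# parameters; the right column shows the extracted keyphrases, their elaboration, and the
# plain-language summary.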
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Textbox(label="Title")
            abstract = gr.Textbox(label="Abstract")
            userDefinedKeyphrases = gr.Textbox(label="Your keyphrases (Optional - Model will elaborate on these keyphrases without using the title or abstract)")
            keyphraseButton = gr.Button("Generate Keyphrase Elaboration")
            summaryButton = gr.Button("Generate Plain Language Summary")
            with gr.Accordion(label="Parameters", open=False):
                maxTokensElaboration = gr.Slider(
                    label="Maximum Number of Elaboration Tokens",
                    value=500,
                    minimum=0,
                    maximum=2048,
                    step=10,
                    interactive=True,
                    info="Length of Keyphrase Elaboration",
                )
                maxTokensSummary = gr.Slider(
                    label="Maximum Number of Summary Tokens",
                    value=300,
                    minimum=0,
                    maximum=2048,
                    step=10,
                    interactive=True,
                    info="Length of Plain Language Summary",
                )
                numAbstractSentencesKeyphrase = gr.Slider(
                    label="Number of Abstract Sentences to use for Keyphrase Extraction",
                    value=2,
                    minimum=0,
                    maximum=20,
                    step=1,
                    interactive=True,
                    info="Default: use first two sentences of abstract.",
                )
                numAbstractSentencesSummary = gr.Slider(
                    label="Number of Abstract Sentences to use for Plain Language Summary",
                    value=2,
                    minimum=0,
                    maximum=20,
                    step=1,
                    interactive=True,
                    info="Default: use first two sentences of abstract.",
                )
        with gr.Column():
            outputKeyphrase = [gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration")]
            outputSummary = gr.Textbox(label="Plain Language Summary")

    keyphraseButton.click(fn=keyphraseElaboration, inputs=[title, abstract, userDefinedKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase], outputs=outputKeyphrase)
    summaryButton.click(fn=plainLanguageSummary, inputs=[title, abstract, maxTokensSummary, numAbstractSentencesSummary], outputs=outputSummary)

demo.launch(share=True)
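# share=True asks Gradio to expose a temporary public *.gradio.live link in addition to the
# local server; drop it to keep the demo local-only.
#
# Example of calling the pipeline directly, without the UI (hypothetical inputs):
#   keywords, elaboration = keyphraseElaboration(
#       "Topological Insulators", "Abstract. We study ... . Applications include ...", "", 500, 2)
#   summary = plainLanguageSummary(
#       "Topological Insulators", "Abstract. We study ... . Applications include ...", 300, 2)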