# Research_Bot / helper.py — commit c83f530 ("Prompts Updated") by singhvaibhav924
import requests
import numpy as np
import arxiv
from langchain.utilities import ArxivAPIWrapper
import os
from dotenv import load_dotenv
load_dotenv()
HF_API_TOKEN = os.environ.get('HF_API_TOKEN')
HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
def hf_api_call(model_name, payload):
API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
response = requests.post(API_URL, headers=HEADERS, json=payload)
return response.json()
def extract_keywords(abstract):
payload = {"inputs": abstract}
result = hf_api_call(feature_extractor_model_name, payload)
keyphrases = np.unique([item['word'].strip() for item in result])
print(keyphrases)
return keyphrases
def search_papers(keywords, n_papers):
arxiv_agent = ArxivAPIWrapper(top_k_results=n_papers, doc_content_chars_max=None, load_max_docs=n_papers+3)
query = " ".join(keywords)
results = arxiv_agent.get_summaries_as_docs(query)
return results
def re_rank_papers(query_abstract, papers, n_papers):
summaries = {paper.page_content: {"Title": paper.metadata['Title']} for paper in papers}
summ_list = []
payload = {
"inputs": {
"source_sentence": query_abstract,
"sentences": list(summaries.keys())
}
}
result = hf_api_call(ranker_model_name, payload)
for i, key in enumerate(summaries.keys()):
summ_list.append((key, summaries[key]["Title"], result[i]))
print((key, summaries[key]["Title"], result[i]))
summ_list = sorted(summ_list, key=lambda x: x[2], reverse=True)
summaries = {}
for i in range(n_papers) :
summaries[summ_list[i][0]] = {
"Title" : summ_list[i][1],
"score" : summ_list[i][2]
}
return summaries
def format_abstracts_as_references(papers):
cite_text = ""
i = 0
for key in papers.keys() :
citation = f"{i+1}"
cite_text = f"{cite_text}[{citation}]: {key}\n"
i+=1
return cite_text
def format_authors(authors):
formatted_authors = []
for author in authors:
name_parts = author.name.split()
last_name = name_parts[-1]
initials = ''.join([name[0] for name in name_parts[:-1]])
formatted_authors.append(f"{last_name} {initials}")
return ', '.join(formatted_authors)
def to_vancouver_style(entry):
authors = format_authors(entry.authors)
title = entry.title
journal = 'arXiv'
year = entry.published.year
arxiv_id = entry.get_short_id()
return f"{authors}. {title}. {journal}. {year}. arXiv:{arxiv_id}"
def generate_refs(papers) :
client = arxiv.Client()
results = []
for key in papers.keys() :
search = arxiv.Search(
query = papers[key]["Title"],
max_results = 1,
sort_by = arxiv.SortCriterion.Relevance
)
results.append(list(client.results(search))[0])
references = [to_vancouver_style(entry) for entry in results]
ids = [entry.get_short_id() for entry in results]
i = 0
refs = "\n\nReferences:\n"
for reference in references:
refs = f"{refs}[{i+1}] {reference}\n"
i+=1
return refs, ids
def generate_related_work(query_abstract, ranked_papers, base_prompt, sentence_plan, n_words):
data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
complete_prompt = f"{base_prompt}\n```{data}```"
payload = {
"inputs": complete_prompt,
"parameters": {
"max_new_tokens": n_words,
"temperature": 0.01,
"return_full_text": False,
"do_sample": False
}
}
result = hf_api_call(summarizer_model_name, payload)
print(result)
related_work = result[0]['generated_text']
refs, ids = generate_refs(ranked_papers)
related_work += refs
with open("literature review.txt", "w") as f:
f.write(related_work)
return related_work, ids