import os

import arxiv
import numpy as np
import requests
from dotenv import load_dotenv
from langchain.utilities import ArxivAPIWrapper

# Read the Hugging Face Inference API token from the environment (.env file).
load_dotenv()
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}

# Models served via the Hugging Face Inference API:
# - summarizer: instruction-tuned LLM that drafts the related-work section
# - feature extractor: keyphrase-extraction model run on the query abstract
# - ranker: sentence-similarity model used to re-rank retrieved abstracts
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
def hf_api_call(model_name, payload):
    """POST a payload to the Hugging Face Inference API and return the parsed JSON."""
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    response = requests.post(api_url, headers=HEADERS, json=payload)
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
    return response.json()
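# Example call (hypothetical payload; any Inference-API task payload works here):
#   hf_api_call(ranker_model_name,
#               {"inputs": {"source_sentence": "A", "sentences": ["B", "C"]}})
# For a sentence-similarity model this returns one score per candidate sentence.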
def extract_keywords(abstract):
    """Extract de-duplicated keyphrases from the query abstract."""
    payload = {"inputs": abstract}
    # The token-classification endpoint returns one dict per detected phrase.
    result = hf_api_call(feature_extractor_model_name, payload)
    keyphrases = np.unique([item["word"].strip() for item in result])
    return keyphrases
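# Illustrative output for an NLP abstract (actual phrases depend on the model):
#   ['keyphrase extraction' 'language models' 'transformers']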
def search_papers(keywords, n_papers):
    """Query arXiv with the extracted keyphrases and return paper summaries."""
    arxiv_agent = ArxivAPIWrapper(
        top_k_results=n_papers,
        doc_content_chars_max=None,
        load_max_docs=n_papers + 3,
    )
    query = " ".join(keywords)
    return arxiv_agent.get_summaries_as_docs(query)
def re_rank_papers(query_abstract, papers, n_papers):
    """Re-rank retrieved papers by semantic similarity to the query abstract."""
    summaries = {paper.page_content: {"Title": paper.metadata["Title"]} for paper in papers}
    payload = {
        "inputs": {
            "source_sentence": query_abstract,
            "sentences": list(summaries.keys()),
        }
    }
    # The sentence-similarity endpoint returns one score per candidate abstract.
    result = hf_api_call(ranker_model_name, payload)
    summ_list = [
        (key, summaries[key]["Title"], result[i])
        for i, key in enumerate(summaries.keys())
    ]
    summ_list = sorted(summ_list, key=lambda x: x[2], reverse=True)
    # Keep the n_papers most similar abstracts (fewer if the search returned fewer).
    summaries = {}
    for abstract, title, score in summ_list[:n_papers]:
        summaries[abstract] = {"Title": title, "score": score}
    return summaries
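# Shape of the returned mapping (the score is illustrative):
#   {"<abstract text>": {"Title": "<paper title>", "score": 0.83}, ...}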
def format_abstracts_as_references(papers):
    """Number each abstract so the LLM can cite it as [1], [2], ..."""
    cite_text = ""
    for i, key in enumerate(papers.keys()):
        cite_text = f"{cite_text}[{i + 1}]: {key}\n"
    return cite_text
def format_authors(authors):
    """Format arXiv author objects as 'Lastname Initials'."""
    formatted_authors = []
    for author in authors:
        name_parts = author.name.split()
        last_name = name_parts[-1]
        initials = "".join(name[0] for name in name_parts[:-1])
        formatted_authors.append(f"{last_name} {initials}")
    return ", ".join(formatted_authors)
def to_vancouver_style(entry):
    """Build a Vancouver-style reference string for an arxiv.Result entry."""
    authors = format_authors(entry.authors)
    title = entry.title
    journal = "arXiv"
    year = entry.published.year
    arxiv_id = entry.get_short_id()
    return f"{authors}. {title}. {journal}. {year}. arXiv:{arxiv_id}"
def generate_refs(papers):
    """Resolve each ranked paper on arXiv and build a numbered reference list."""
    client = arxiv.Client()
    results = []
    for key in papers.keys():
        # Look the paper up by title and take the single most relevant hit.
        search = arxiv.Search(
            query=papers[key]["Title"],
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        results.append(next(client.results(search)))
    references = [to_vancouver_style(entry) for entry in results]
    ids = [entry.get_short_id() for entry in results]
    refs = "\n\nReferences:\n"
    for i, reference in enumerate(references):
        refs = f"{refs}[{i + 1}] {reference}\n"
    return refs, ids
def generate_related_work(query_abstract, ranked_papers, base_prompt, sentence_plan, n_words):
    """Prompt the LLM with the abstract, numbered references, and a sentence plan."""
    data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
    complete_prompt = f"{base_prompt}\n```{data}```"
    payload = {
        "inputs": complete_prompt,
        "parameters": {
            # Near-greedy decoding keeps the output faithful to the references.
            "max_new_tokens": n_words,
            "temperature": 0.01,
            "return_full_text": False,
            "do_sample": False,
        },
    }
    result = hf_api_call(summarizer_model_name, payload)
    related_work = result[0]["generated_text"]
    # Append the resolved reference list and persist the draft to disk.
    refs, ids = generate_refs(ranked_papers)
    related_work += refs
    with open("literature review.txt", "w") as f:
        f.write(related_work)
    return related_work, ids
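# Minimal end-to-end sketch of the pipeline. The abstract, base prompt,
# sentence plan, and word budget below are illustrative placeholders, not
# values from the app; running it requires a valid HF_API_TOKEN.
if __name__ == "__main__":
    query_abstract = "We study retrieval-augmented generation for scientific writing..."
    base_prompt = (
        "You are a helpful research assistant. Using the abstract and the "
        "numbered references below, write a related-work section that cites "
        "the references as [1], [2], ... following the given sentence plan."
    )
    sentence_plan = "1. Introduce the problem. 2. Summarize prior work. 3. State the gap."

    keywords = extract_keywords(query_abstract)
    papers = search_papers(keywords, n_papers=5)
    ranked = re_rank_papers(query_abstract, papers, n_papers=3)
    related_work, ids = generate_related_work(
        query_abstract, ranked, base_prompt, sentence_plan, n_words=300
    )
    print(related_work)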