shaocongma committed
Commit · d18c569
Parent(s): 7d01fc4

Bug fix.
Files changed:
- app.py +1 -4
- references_generator.py +1 -1
- utils/gpt_interaction.py +5 -2
- utils/references.py +20 -19
app.py CHANGED

@@ -6,7 +6,6 @@ from utils.file_operations import hash_name
 from references_generator import generate_top_k_references
 
 # todo:
-# generation.log sometimes disappears
 # 6. get logs when the procedure is not completed. *
 # 7. own file library; more prompts
 # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
@@ -15,10 +14,8 @@ from references_generator import generate_top_k_references
 # 3. Check API Key GPT-4 Support.
 # 8. Re-build some components using `langchain`
 #    - in `gpt_interation`, use LLM
-# 5. from the provided .bib file, find the cited and citing papers and compute embeddings; sort the whole paper list by cosine distance; keep the top max_refs papers
 # future:
-#
-# 12. Change link to more appealing color # after the website is built;
+# generation.log sometimes disappears (ignore this)
 # 1. Check if there are any duplicated citations
 # 2. Remove potential thebibliography and bibitem in .tex file
 
references_generator.py CHANGED

@@ -32,7 +32,7 @@ def generate_raw_references(title, description="",
     print(f"keywords: {keywords}\n\n")
 
     ref.collect_papers(keywords, tldr=tldr)
-
+    paper_json = ref.to_json()
 
     with open(save_to, "w") as f:
         json.dump(paper_json, f)
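The added line appears to be the bug the commit message refers to: without it, `paper_json` is never assigned before `json.dump(paper_json, f)`, which raises a NameError at runtime. A minimal sketch of the fixed flow (the title and keyword values below are made up for illustration):

import json

from utils.references import References  # repo-local module

ref = References(title="Playing Atari with Deep Reinforcement Learning")
ref.collect_papers({"reinforcement learning": 3}, tldr=True)
paper_json = ref.to_json()  # the assignment this commit adds

with open("paper.json", "w") as f:
    json.dump(paper_json, f)  # previously: NameError, paper_json undefined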
utils/gpt_interaction.py CHANGED

@@ -25,7 +25,7 @@ def get_gpt_responses(systems, prompts, model="gpt-4", temperature=0.4):
 
 
 def get_gpt_responses_test(systems, prompts, model="gpt-4", temperature=0.4, base_url=None, key=None):
-    end_point = r"/v1/
+    end_point = r"/v1/completions"
     if base_url is None:
         base_url = r"https://api.openai.com" + end_point
     if key is None:
@@ -45,10 +45,13 @@ def get_gpt_responses_test(systems, prompts, model="gpt-4", temperature=0.4, bas
         "message": message,
         "temperature": temperature
     }
+    print(data)
     response = requests.post(url, headers=headers, json=data)
+    print(response)
     response = response.json()
     return response['choices'][0]["message"]["content"]
 
 
 if __name__ == "__main__":
-
+    rep = get_gpt_responses_test(None, "Hello!", base_url=r"https://api.openai.com/v1/completions", key="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")
+    print(rep)
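A safer way to exercise the `__main__` test above is to avoid hardcoding the key (sketch only; reading the key from an environment variable is my substitution, not part of the commit):

import os

from utils.gpt_interaction import get_gpt_responses_test  # repo-local module

reply = get_gpt_responses_test(
    None, "Hello!",  # `systems` is passed as None here, mirroring the test
    base_url=r"https://api.openai.com/v1/completions",
    key=os.environ.get("OPENAI_API_KEY"),
)
print(reply)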
utils/references.py CHANGED

@@ -10,13 +10,13 @@
 # A sample prompt: {"paper_id": "paper summary"}
 
 # todo: (1) citations & citedby of provided papers:
-
-
-
-
+#       load the pre-defined papers; use S2 to find all related works
+#       add all citations to `bib_papers`
+#       add all citedby to `bib_papers`
+#       use Semantic Scholar to find their embeddings
 #       (2) separate references:
-
-
+#       divide references into different groups to reduce the token count
+#       for generating different paragraphs of related works, use different sets of references
 
 import requests
 import re
@@ -44,7 +44,7 @@ def remove_newlines(serie):
 
 def search_paper_abstract(title):
     pg = ProxyGenerator()
-    success = pg.FreeProxies()
+    success = pg.FreeProxies()  # pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
     if success:
         try:
             scholarly.use_proxy(pg)
@@ -91,15 +91,17 @@ def load_papers_from_bibtex(bib_file_path):
     return bib_papers
 
 
-
 # `tokenizer`: used to count how many tokens
 tokenizer_name = tiktoken.encoding_for_model('gpt-4')
 tokenizer = tiktoken.get_encoding(tokenizer_name.name)
 
+
 def tiktoken_len(text):
     # evaluate how many tokens for the given text
     tokens = tokenizer.encode(text, disallowed_special=())
     return len(tokens)
+
+
 ######################################################################################################################
 # Semantic Scholar (SS) API
 ######################################################################################################################
@@ -218,6 +220,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
     results = parse_search_results(search_results)
     return results
 
+
 ######################################################################################################################
 # References Class
 ######################################################################################################################
@@ -225,8 +228,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 class References:
     def __init__(self, title, load_papers=None, keyword="customized_refs"):
         if load_papers is not None:
-            self.papers = {}
-            self.papers[keyword] = load_papers_from_bibtex(load_papers)
+            self.papers = {keyword: load_papers_from_bibtex(load_papers)}
         else:
             self.papers = {}
         self.title = title
@@ -259,7 +261,6 @@ class References:
         # for key, counts in keywords_dict.items():
         #     self.papers[key] = _collect_papers_ss(key, counts, tldr)
 
-
    def to_bibtex(self, path_to_bibtex="ref.bib"):
        """
        Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
@@ -267,7 +268,7 @@ class References:
         # todo:
         # use embeddings to evaluate; keep top k relevant references in papers
         # send (title, .bib file) to evaluate embeddings; recieve truncated papers
-        papers = self._get_papers(keyword)
+        papers = self._get_papers(keyword="_all")
 
         # clear the bibtex file
         with open(path_to_bibtex, "w", encoding="utf-8") as file:
@@ -296,7 +297,7 @@ class References:
             file.write("\n\n")
         return paper_ids
 
-    def _get_papers(self, keyword):
+    def _get_papers(self, keyword="_all"):
         if keyword == "_all":
             papers = []
             for k, v in self.papers.items():
@@ -305,7 +306,7 @@ class References:
             papers = self.papers["keyword"]
         return papers
 
-    def to_prompts(self, keyword):
+    def to_prompts(self, keyword="_all", max_tokens=2048):
         # `prompts`:
         #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
         #   this will be used to instruct GPT model to cite the correct bibtex entry.
@@ -319,6 +320,7 @@ class References:
             json.dump(papers_json, f)
 
         try:
+            # Use external API to obtain the most relevant papers
             title = self.title
             client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
             result = client.predict(
@@ -347,7 +349,7 @@ class References:
                 break
         return prompts
 
-    def to_json(self, keyword):
+    def to_json(self, keyword="_all"):
         papers = self._get_papers(keyword)
         papers_json = {}
         for paper in papers:
@@ -355,7 +357,6 @@ class References:
         return papers_json
 
 
-
 if __name__ == "__main__":
     # testing search results
     print("================Testing `ss_search`================")
@@ -375,7 +376,7 @@ if __name__ == "__main__":
     print("================Testing `References.collect_papers`================")
     refs.collect_papers(keywords_dict, tldr=True)
     for k in refs.papers:
-        papers = refs.papers[k]
+        papers = refs.papers[k]  # for each keyword, there is a list of papers
         print("keyword: ", k)
         for paper in papers:
             print(paper["paper_id"])
@@ -384,8 +385,8 @@ if __name__ == "__main__":
     refs.to_bibtex()
 
     print("================Testing `References.to_json`================")
-    papers_json = refs.to_json()
-    with open("papers.json", "w") as text_file:
+    papers_json = refs.to_json()  # this json can be used to find the most relevant papers
+    with open("papers.json", "w", encoding='utf-8') as text_file:
         text_file.write(f"{papers_json}")
 
     print("================Testing `References.to_prompts`================")
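Taken together, most of the edits in this file add keyword="_all" (and max_tokens) defaults so the References methods can be called with no arguments, as the test block does. A condensed usage sketch under that assumption (the title and keyword counts are illustrative, not from the commit):

from utils.references import References  # repo-local module

refs = References(title="Reinforcement Learning")
refs.collect_papers({"reinforcement learning": 3, "q-learning": 2}, tldr=True)

refs.to_bibtex("ref.bib")      # internally calls self._get_papers(keyword="_all")
papers_json = refs.to_json()   # keyword now defaults to "_all"
prompts = refs.to_prompts()    # defaults: keyword="_all", max_tokens=2048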